@tangle-network/agent-eval 0.22.0 → 0.23.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/CHANGELOG.md +156 -0
  2. package/README.md +13 -3
  3. package/dist/benchmarks/index.d.ts +2 -2
  4. package/dist/{chunk-UAND2LOT.js → chunk-7EAUOUQS.js} +4 -247
  5. package/dist/chunk-7EAUOUQS.js.map +1 -0
  6. package/dist/chunk-AXHNWLIX.js +246 -0
  7. package/dist/chunk-AXHNWLIX.js.map +1 -0
  8. package/dist/chunk-EXGR4XEM.js +283 -0
  9. package/dist/chunk-EXGR4XEM.js.map +1 -0
  10. package/dist/chunk-LZKIOBG2.js +2026 -0
  11. package/dist/chunk-LZKIOBG2.js.map +1 -0
  12. package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
  13. package/dist/chunk-QBW3YBTR.js.map +1 -0
  14. package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
  15. package/dist/{chunk-USHQBPMH.js → chunk-VQQSPGSM.js} +7 -283
  16. package/dist/chunk-VQQSPGSM.js.map +1 -0
  17. package/dist/{chunk-4W4NCYM2.js → chunk-XPHOZPOM.js} +4 -2
  18. package/dist/chunk-XPHOZPOM.js.map +1 -0
  19. package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
  20. package/dist/control.d.ts +3 -3
  21. package/dist/control.js +2 -2
  22. package/dist/{optimization-UVDNKaO6.d.ts → eval-campaign-Ds5QljIh.d.ts} +4 -5
  23. package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
  24. package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
  25. package/dist/index-ekBXweiQ.d.ts +1894 -0
  26. package/dist/index.d.ts +18 -154
  27. package/dist/index.js +126 -26
  28. package/dist/index.js.map +1 -1
  29. package/dist/{integrity-K2oVlF57.d.ts → integrity-Cr5YodSY.d.ts} +1 -1
  30. package/dist/openapi.json +1 -1
  31. package/dist/optimization.d.ts +5 -5
  32. package/dist/optimization.js +7 -5
  33. package/dist/reporting.d.ts +294 -4
  34. package/dist/reporting.js +6 -4
  35. package/dist/rl.d.ts +8 -0
  36. package/dist/rl.js +113 -0
  37. package/dist/rl.js.map +1 -0
  38. package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
  39. package/dist/sequential-DgU2mFsE.d.ts +304 -0
  40. package/dist/{summary-report-D4p7RlDu.d.ts → summary-report-Ce1r4EYo.d.ts} +2 -2
  41. package/dist/traces.d.ts +2 -2
  42. package/dist/traces.js +6 -6
  43. package/docs/auto-research-loop-end-to-end.md +186 -0
  44. package/docs/three-package-architecture.md +180 -0
  45. package/package.json +22 -10
  46. package/dist/chunk-4W4NCYM2.js.map +0 -1
  47. package/dist/chunk-UAND2LOT.js.map +0 -1
  48. package/dist/chunk-USHQBPMH.js.map +0 -1
  49. package/dist/chunk-YUFXO3TU.js.map +0 -1
  50. package/dist/reporting-B82RSv9C.d.ts +0 -593
  51. /package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
@@ -0,0 +1,1894 @@
1
+ import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-Ce1r4EYo.js';
2
+ import { a as RunSplitTag, R as RunRecord } from './run-record-DNiOMBrZ.js';
3
+ import { S as Span, T as TraceStore } from './store-u47QaJ9G.js';
4
+ import { i as EvalCampaignResult, E as EvalCampaignOptions, R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult } from './eval-campaign-Ds5QljIh.js';
5
+ import { c as InterimReleaseConfidence, h as RubricPredictiveValidityReport, a as OutcomeStore } from './sequential-DgU2mFsE.js';
6
+
7
+ /**
8
+ * Multi-layer verifier — ordered pipeline of verification layers.
9
+ *
10
+ * Different contract from {@link JudgeRunner} (which runs parallel
11
+ * specs against a sandbox). MultiLayerVerifier is a DAG of layers
12
+ * (install → typecheck → build → lint → serve → semantic → …) with
13
+ * dependency-based skip, per-layer findings, soft-fail semantics, and
14
+ * an aggregated `blendedScore` across all passed layers.
15
+ *
16
+ * Use when you want:
17
+ * - ordered stages where a failing upstream stage skips downstream ones
18
+ * - each stage produces rich `findings` (severity + message + evidence)
19
+ * - a single composite score across stages with per-stage weights
20
+ * - soft-fail stages whose failure doesn't abort the pipeline
21
+ *
22
+ * Use {@link JudgeRunner} when you want:
23
+ * - N independent judges running in parallel against the same artifact
24
+ * - no inter-judge dependencies
25
+ * - boolean `passed` per judge + overall
26
+ *
27
+ * Both primitives compose — JudgeRunner can be invoked as a single
28
+ * layer inside a MultiLayerVerifier if that suits the caller.
29
+ */
30
+ type LayerStatus = 'pass' | 'fail' | 'skipped' | 'error' | 'timeout';
31
+ type Severity = 'critical' | 'major' | 'minor' | 'info';
32
+ interface Finding {
33
+ severity: Severity;
34
+ message: string;
35
+ evidence?: string;
36
+ /** Optional layer name the finding belongs to (set by the verifier if omitted). */
37
+ layer?: string;
38
+ /**
39
+ * Free-form structured payload — used by `multiToolchainLayer` to attach
40
+ * `{ adapter: 'pnpm' }`, by judges to attach evidence pointers, etc.
41
+ * Renderers MAY interrogate; agent-eval primitives never assume shape.
42
+ */
43
+ detail?: Record<string, unknown>;
44
+ }
45
+ interface LayerResult {
46
+ layer: string;
47
+ status: LayerStatus;
48
+ /** 0..1 score, optional — layers that don't produce a numeric score omit. */
49
+ score?: number;
50
+ durationMs: number;
51
+ findings: Finding[];
52
+ /** Short human-readable summary (one line). */
53
+ reason?: string;
54
+ /**
55
+ * Numeric layer-level diagnostics: error counts, warning counts,
56
+ * cyclomatic complexity, total adapter wall-time, etc. Keyed by
57
+ * diagnostic name; null = "diagnostic not applicable / not measured."
58
+ * Renderers that know the keys can display them; ones that don't,
59
+ * ignore. Free-form on purpose — consumers type the value shape in
60
+ * their own namespace. Added in 0.10.
61
+ */
62
+ diagnostics?: Record<string, number | null>;
63
+ /** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
64
+ detail?: Record<string, unknown>;
65
+ }
66
+ interface VerifyContext<Env = unknown> {
67
+ /** Per-run opaque context the caller provides. Layers destructure what they need. */
68
+ env: Env;
69
+ /** Previously-computed results from layers that already ran. */
70
+ prior: Record<string, LayerResult>;
71
+ /** Signal — if aborted, layers MUST bail within reasonable wall. */
72
+ signal: AbortSignal;
73
+ }
74
+ interface Layer<Env = unknown> {
75
+ name: string;
76
+ /** Stages that must have `status: 'pass'` before this layer runs. */
77
+ dependsOn?: string[];
78
+ /**
79
+ * Weight in the composite `blendedScore`. Default 1.0. Layers with weight 0
80
+ * contribute findings but not score.
81
+ */
82
+ weight?: number;
83
+ /**
84
+ * If true, a `fail` status contributes to `blendedScore` (as 0) instead of
85
+ * being dropped — use for layers whose failure is a real signal. Default:
86
+ * fail drops from numerator + denominator, matching VB's existing semantics.
87
+ */
88
+ failContributesToScore?: boolean;
89
+ /** Optional per-layer wall-cap in ms. Honored by the verifier (AbortSignal). */
90
+ capMs?: number;
91
+ run: (ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
92
+ }
93
+ interface VerifyOptions<Env = unknown> {
94
+ env: Env;
95
+ /**
96
+ * Overall wall cap. Default: sum of layer capMs, or Infinity if any layer
97
+ * omits a cap. The verifier short-circuits remaining layers on overall cap.
98
+ */
99
+ overallCapMs?: number;
100
+ /** Called with each layer result as it completes. */
101
+ onLayer?: (result: LayerResult) => void;
102
+ }
103
+ interface VerificationReport {
104
+ layers: LayerResult[];
105
+ passCount: number;
106
+ failCount: number;
107
+ skippedCount: number;
108
+ errorCount: number;
109
+ /** True iff at least one scored layer ran AND every scored layer passed. */
110
+ allPass: boolean;
111
+ /**
112
+ * Weighted mean of `score` across contributing layers. 0 when no layers
113
+ * contributed. See {@link Layer.failContributesToScore} for fail semantics.
114
+ */
115
+ blendedScore: number;
116
+ durationMs: number;
117
+ startedAt: string;
118
+ finishedAt: string;
119
+ }
120
+ /**
121
+ * Grade a semantic-concept-style judge result into a single layer status.
122
+ *
123
+ * Pass when overall score >= threshold AND no critical-severity concept gap.
124
+ * Fail otherwise. Use inside a `Layer.run` when wrapping a concept judge.
125
+ *
126
+ * Generalized from VerticalBench H3 fix: `failingConcepts.length === 0` was
127
+ * too strict — a single concept at 6/10 failed the entire layer despite
128
+ * overall score being >= 0.7. Now we trust the judge's own `severity` field:
129
+ * `critical` findings veto; `major`/`minor` reduce the score but don't veto.
130
+ */
131
+ declare function gradeSemanticStatus(input: {
132
+ score: number;
133
+ findings: Array<{
134
+ severity: Severity;
135
+ present?: boolean;
136
+ score?: number;
137
+ }>;
138
+ available: boolean;
139
+ threshold?: number;
140
+ }): LayerStatus;
141
+ declare class MultiLayerVerifier<Env = unknown> {
142
+ private readonly layers;
143
+ constructor(layers: Layer<Env>[]);
144
+ run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
145
+ }
146
+
147
+ /**
148
+ * Adapters: convert legacy optimization outputs into the canonical
149
+ * `RunRecord[]` artifact that 0.22+ primitives consume.
150
+ *
151
+ * The 0.22 release standardized the campaign artifact: every cell of an
152
+ * eval matrix produces one `RunRecord`. The pre-0.22 optimization
153
+ * primitives (`runMultiShotOptimization`, `runPromptEvolution`) produce
154
+ * `TrialResult[]` with a different shape. This file bridges the two so
155
+ * the new primitives (`replayCache`, `pairedEvalueSequence`,
156
+ * `rubricPredictiveValidity`) compose cleanly with the existing RL stack.
157
+ *
158
+ * The adapters are thin and explicit — every mandatory `RunRecord` field
159
+ * comes from a caller-supplied context (`commitSha`, `model`,
160
+ * `promptHash`, `configHash`) plus the trial's runtime data. Defaults
161
+ * exist for fields the trial doesn't carry (`tokenUsage`, `costUsd`),
162
+ * but the validator still rejects records with bare-alias model strings
163
+ * — the caller is responsible for snapshot-pinning.
164
+ */
165
+
166
+ interface AdapterContext {
167
+ /** Logical experiment id — typically the campaign or sweep identifier. */
168
+ experimentId: string;
169
+ /** Snapshot model id (e.g. `claude-sonnet-4-6@2025-04-15`). */
170
+ model: string;
171
+ /** Git SHA the harness was run from. */
172
+ commitSha: string;
173
+ /** Hash of the effective prompt sent to the model. */
174
+ promptHash: string | ((t: TrialResult) => string);
175
+ /** Hash of the effective config (model, temperature, tools, judges, splits). */
176
+ configHash: string | ((t: TrialResult) => string);
177
+ /** Default split tag. Default `'search'` — optimization sweeps run on the search split. */
178
+ splitTag?: RunSplitTag;
179
+ /** Default cost in USD when the trial doesn't record one. Default `0`. */
180
+ defaultCostUsd?: number;
181
+ }
182
+ /**
183
+ * Convert one `TrialResult` (from `runPromptEvolution` or
184
+ * `runMultiShotOptimization`) into a canonical `RunRecord`.
185
+ *
186
+ * The conversion is **not lossy** — every `TrialResult.metrics` field is
187
+ * carried through to `outcome.raw`, plus a synthetic
188
+ * `raw.cost_unknown = 1` flag when the trial omits cost (so downstream
189
+ * filters can distinguish "free" from "untracked"). This preserves the
190
+ * paper-grade contract: a record without a cost number is unbounded by
191
+ * definition, but we don't drop the record.
192
+ */
193
+ declare function trialToRunRecord(trial: TrialResult, ctx: AdapterContext, opts?: {
194
+ runId?: string;
195
+ experimentIdPerTrial?: (t: TrialResult) => string;
196
+ }): RunRecord;
197
+ /** Convenience: convert an array of `TrialResult` in one go. */
198
+ declare function trialsToRunRecords(trials: TrialResult[], ctx: AdapterContext): RunRecord[];
199
+ /**
200
+ * Convert a `MultiLayerVerifier` `VerificationReport` into a `RunRecord`.
201
+ *
202
+ * The verifier produces per-layer results; we synthesize one canonical
203
+ * record where:
204
+ * - `outcome.searchScore` (or `holdoutScore`) is `report.blendedScore`
205
+ * - `outcome.raw` carries every layer's score keyed `layer.<name>`
206
+ * plus a `layer_<name>_pass` 1/0 indicator
207
+ * - `failureMode` is taken from the first failing layer's `reason`
208
+ * - `wallMs` is `report.durationMs`
209
+ */
210
+ declare function verificationReportToRunRecord(report: VerificationReport, ctx: AdapterContext & {
211
+ candidateId: string;
212
+ scenarioId?: string;
213
+ }, opts?: {
214
+ runId?: string;
215
+ }): RunRecord;
216
+ /**
217
+ * Convert a `VariantAggregate` (per-variant rollup from `prompt-evolution`)
218
+ * into a synthetic `RunRecord` representing the aggregate. Useful when the
219
+ * downstream consumer wants per-variant entries for a `researchReport`
220
+ * rather than per-(variant, scenario, rep) trial entries.
221
+ */
222
+ declare function variantAggregateToRunRecord(agg: VariantAggregate, ctx: AdapterContext, opts?: {
223
+ runId?: string;
224
+ }): RunRecord;
225
+
226
+ /**
227
+ * Verifiable reward channel.
228
+ *
229
+ * For RL on coding / math / theorem-proving / structured-output tasks, the
230
+ * reward signal is *decidable* — a test passes or fails, a proof checks or
231
+ * doesn't, an output validates against a schema or doesn't. These rewards
232
+ * are dramatically more useful for RL training than LLM-judge scores
233
+ * because they don't drift, can't be Goodhart-gamed by the policy in the
234
+ * same way, and don't require a separate calibration loop.
235
+ *
236
+ * The `MultiLayerVerifier` already produces this signal — it just doesn't
237
+ * surface it in a shape that's clean enough for RL training. This module
238
+ * wraps the verifier output so consumers can:
239
+ *
240
+ * 1. Extract a clean `VerifiableReward` from a `VerificationReport`
241
+ * 2. Distinguish *deterministic* rewards (compile, test, schema) from
242
+ * *probabilistic* rewards (judge) so they can be weighted differently
243
+ * in the RL training step
244
+ * 3. Filter `RunRecord[]` to only those with a verifiable reward,
245
+ * producing the clean training set that DeepSeek-R1-style GRPO and
246
+ * AlphaProof-style search both depend on
247
+ *
248
+ * Why this matters: every credible 2025-2026 frontier RL result on coding
249
+ * agents leans on verifiable reward (DeepSeek-R1 GRPO on test pass-rate,
250
+ * o-series RL on math/code, AlphaProof on Lean kernel checking). Mixing
251
+ * judge scores into the reward signal poisons the gradient. This module
252
+ * is the seam.
253
+ */
254
+
255
+ type VerifiableRewardSource = 'compile' | 'test' | 'schema' | 'sandbox' | 'judge' | 'composite';
256
+ interface VerifiableReward {
257
+ /** Scalar in [0, 1]. The RL training signal. */
258
+ value: number;
259
+ /** What produced the reward — different sources have different determinism. */
260
+ source: VerifiableRewardSource;
261
+ /**
262
+ * Determinism class. `'deterministic'` rewards are repeatable byte-for-byte
263
+ * given the same inputs (compile, test, schema validation, sandbox exit code).
264
+ * `'probabilistic'` rewards depend on a stochastic component (LLM judge).
265
+ * Mixing these in the same training batch without separation is a known
266
+ * footgun in production RLHF pipelines.
267
+ */
268
+ determinism: 'deterministic' | 'probabilistic';
269
+ /**
270
+ * Confidence in the reward value. For deterministic sources this is 1.0
271
+ * (the bit either flipped or didn't). For judge sources this is the
272
+ * judge-reported confidence or — when missing — a calibrated prior.
273
+ */
274
+ confidence: number;
275
+ /** The layer / judge id that produced the signal, for provenance. */
276
+ origin: string;
277
+ /**
278
+ * Any per-source breakdown the consumer might want — e.g. `{ tests_passed: 7, tests_total: 10 }`.
279
+ */
280
+ breakdown?: Record<string, number>;
281
+ }
282
+ interface VerifiableRewardExtractionOptions {
283
+ /**
284
+ * Which layers count as deterministic-reward sources. The verifier doesn't
285
+ * tag layers as "this is verifiable"; the caller declares it via this list
286
+ * (or via the layer name → source mapping). Default treats common names
287
+ * (`install`, `typecheck`, `build`, `lint`, `test`, `compile`, `schema`,
288
+ * `sandbox`) as deterministic.
289
+ */
290
+ deterministicLayers?: string[];
291
+ /**
292
+ * Map layer name → reward source. Defaults to a sensible string-match.
293
+ */
294
+ sourceFor?: (layerName: string) => VerifiableRewardSource;
295
+ /**
296
+ * Whether to fall back to a probabilistic (judge) reward when no
297
+ * deterministic layer produced a numeric score. Default `true`. Set to
298
+ * `false` for "deterministic-only" training pipelines that should
299
+ * discard runs without a verifiable signal.
300
+ */
301
+ fallbackToJudge?: boolean;
302
+ /**
303
+ * Default confidence for probabilistic (judge) rewards when the judge
304
+ * doesn't report one. Default `0.7`.
305
+ */
306
+ judgeConfidenceFloor?: number;
307
+ }
308
+ /**
309
+ * Extract a `VerifiableReward` from a `VerificationReport`.
310
+ *
311
+ * Strategy: prefer the deterministic layers (in order: test → compile →
312
+ * schema → sandbox), fall back to the judge layer if `fallbackToJudge` is
313
+ * true, return `null` if no signal qualifies. When multiple deterministic
314
+ * layers contribute, return a `'composite'` source with a weighted blend.
315
+ */
316
+ declare function extractVerifiableReward(report: VerificationReport, opts?: VerifiableRewardExtractionOptions): VerifiableReward | null;
317
+ /**
318
+ * Extract verifiable rewards from `RunRecord[]` produced via the
319
+ * `verificationReportToRunRecord` adapter (which encodes per-layer scores
320
+ * in `outcome.raw['layer.<name>']`). For records that don't carry layer
321
+ * scores, returns `null` for that record.
322
+ *
323
+ * This is the canonical bridge from "campaign-shaped artifacts" to
324
+ * "RL-training-ready reward signals": every record that has a clean
325
+ * verifiable reward becomes a training datum, every record that doesn't
326
+ * gets filtered out (or kept with `'probabilistic'` determinism for
327
+ * separate downstream handling).
328
+ */
329
+ declare function extractVerifiableRewardsFromRecords(runs: RunRecord[], opts?: VerifiableRewardExtractionOptions): Array<{
330
+ runId: string;
331
+ reward: VerifiableReward | null;
332
+ }>;
333
+ /** Filter `RunRecord[]` to those with deterministic verifiable rewards. */
334
+ declare function filterDeterministicallyRewarded(runs: RunRecord[], opts?: VerifiableRewardExtractionOptions): Array<{
335
+ run: RunRecord;
336
+ reward: VerifiableReward;
337
+ }>;
338
+
339
+ /**
340
+ * Preference dataset extraction — bridge from `RunRecord[]` to RL training.
341
+ *
342
+ * Production RLHF / DPO / KTO / SimPO pipelines need preference triples:
343
+ * `(prompt, chosen, rejected)`. The campaign artifact already contains the
344
+ * ingredients — every (variantId, scenarioId, seed) cell is a candidate
345
+ * that ran the same prompt against the same scenario, scored by the same
346
+ * judge — but turning that into a clean preference dataset requires
347
+ * deciding *what counts as a preference*.
348
+ *
349
+ * This module ships three preference-extraction strategies with explicit
350
+ * tradeoffs, plus a unified output type compatible with HuggingFace TRL,
351
+ * Anthropic finetuning JSONL, and OpenAI fine-tuning APIs. The strategies
352
+ * are deliberately not auto-magical — picking the wrong one corrupts the
353
+ * gradient.
354
+ *
355
+ * Strategies:
356
+ *
357
+ * 1. **`paired-by-scenario-and-seed`** — exact-match comparisons. For
358
+ * each scenario × seed pair, compare every (variantA, variantB) on
359
+ * that exact (scenario, seed). Matches scenarios so the comparison
360
+ * isolates variant effects. Highest signal-to-noise; smallest
361
+ * dataset (only matched pairs count).
362
+ *
363
+ * 2. **`paired-by-scenario`** — looser matching. For each scenario,
364
+ * compare every (variantA, variantB) where both have ≥ 1 run on the
365
+ * same scenario. Aggregates across seeds to compute mean scores per
366
+ * (variant, scenario), then forms preferences from the means. More
367
+ * data, lower per-pair signal.
368
+ *
369
+ * 3. **`top-vs-bottom`** — coarsest. Within each scenario, the highest-
370
+ * scoring run is `chosen`, the lowest is `rejected`. Smallest dataset
371
+ * per scenario but biggest score gap per pair. Useful for early
372
+ * bootstrapping when you have few variants.
373
+ *
374
+ * The output `PreferenceTriple` is *agent-eval-canonical* but trivially
375
+ * mappable to TRL's `DPODataset` shape (`prompt`, `chosen`, `rejected`)
376
+ * via the `toTRLFormat` helper.
377
+ */
378
+
379
+ type PreferenceStrategy = 'paired-by-scenario-and-seed' | 'paired-by-scenario' | 'top-vs-bottom';
380
+ interface PreferenceTriple {
381
+ /** The scenario (input) the variants were run against. */
382
+ scenarioId: string;
383
+ /** RunRecord ids on each side, for traceability. */
384
+ chosenRunId: string;
385
+ rejectedRunId: string;
386
+ /** Variant ids — load-bearing for the RL update. */
387
+ chosenVariantId: string;
388
+ rejectedVariantId: string;
389
+ /** The score gap between chosen and rejected. Larger = stronger signal. */
390
+ marginScore: number;
391
+ /**
392
+ * Optional `(chosen_score, rejected_score)` pair for soft-margin DPO
393
+ * variants. Omitted for `top-vs-bottom` runs that don't carry meaningful
394
+ * scalar gaps.
395
+ */
396
+ scores?: {
397
+ chosen: number;
398
+ rejected: number;
399
+ };
400
+ /** Tie-breaker — when multiple seeds match this scenario, the one used. */
401
+ seed?: number;
402
+ /**
403
+ * Free-form metadata propagated from the run records — e.g. original
404
+ * prompt-hash, model, etc. Lets the RL trainer reconstruct the prompt.
405
+ */
406
+ meta: {
407
+ chosenPromptHash: string;
408
+ rejectedPromptHash: string;
409
+ chosenConfigHash: string;
410
+ rejectedConfigHash: string;
411
+ chosenModel: string;
412
+ rejectedModel: string;
413
+ };
414
+ }
415
+ interface ExtractPreferencesOptions {
416
+ strategy?: PreferenceStrategy;
417
+ /**
418
+ * Minimum score gap required to admit a pair. Pairs below this are
419
+ * dropped — they're noise, not signal. Default 0.05 (5% of [0,1]).
420
+ */
421
+ minMargin?: number;
422
+ /**
423
+ * Optional split tag filter — restrict to runs from one split. Default
424
+ * `'holdout'` (the canonical "real" signal).
425
+ */
426
+ splitTag?: RunRecord['splitTag'];
427
+ /**
428
+ * Optional reward extractor that overrides `outcome.holdoutScore` /
429
+ * `outcome.searchScore`. Use to drive preferences off a verifiable
430
+ * reward instead of the headline score.
431
+ */
432
+ rewardOf?: (run: RunRecord) => number | null;
433
+ }
434
+ interface PreferenceExtractionReport {
435
+ pairs: PreferenceTriple[];
436
+ /** Number of (scenario, seed) cells inspected. */
437
+ cellsInspected: number;
438
+ /** Number of pairs filtered by `minMargin`. */
439
+ pairsBelowMargin: number;
440
+ /** Number of cells with only one variant (no comparison possible). */
441
+ cellsSingleton: number;
442
+ /** Strategy used. */
443
+ strategy: PreferenceStrategy;
444
+ }
445
+ /**
446
+ * Convert `RunRecord[]` to preference triples for RL training.
447
+ *
448
+ * Returns a structured report so callers can see how much data was
449
+ * dropped and why (low-margin pairs, singleton cells). For production
450
+ * pipelines, you usually want to:
451
+ *
452
+ * 1. Run a campaign producing 5–10 variants × 50–200 scenarios × 3 seeds
453
+ * 2. Call this with `strategy: 'paired-by-scenario-and-seed'` and a
454
+ * verifiable-reward extractor as `rewardOf`
455
+ * 3. Pass `report.pairs` to `toTRLFormat` and pipe to your DPO trainer
456
+ */
457
+ declare function extractPreferences(runs: RunRecord[], opts?: ExtractPreferencesOptions): PreferenceExtractionReport;
458
+ /**
459
+ * TRL-compatible export. TRL's `DPODataset` is `{ prompt, chosen, rejected }`
460
+ * but the prompt isn't stored on the RunRecord — only its hash. The caller
461
+ * passes a `promptOf(promptHash)` lookup that the TRL trainer can use.
462
+ */
463
+ declare function toTRLFormat(triples: PreferenceTriple[], promptOf: (hash: string) => string): Array<{
464
+ prompt: string;
465
+ chosen: string;
466
+ rejected: string;
467
+ }>;
468
+ /**
469
+ * Anthropic finetuning export — returns id-only rows `{ scenarioId, chosenRunId, rejectedRunId, margin }`,
470
+ * not the final `{ system, user, assistant_chosen, assistant_rejected }` JSONL. Same caveat as TRL: prompt +
471
+ * outputs are content the caller has to map back from the run record / raw event log.
472
+ */
473
+ declare function toAnthropicFormat(triples: PreferenceTriple[]): Array<{
474
+ scenarioId: string;
475
+ chosenRunId: string;
476
+ rejectedRunId: string;
477
+ margin: number;
478
+ }>;
479
+
480
+ /**
481
+ * Off-policy evaluation primitives.
482
+ *
483
+ * Standard inverse-probability-weighted (IPS), self-normalized
484
+ * importance-weighted (SNIPS), and doubly-robust (DR) estimators for the
485
+ * value of a *target* policy given trajectories collected under a
486
+ * *behavior* policy. This is the canonical RL eval task: "we have last
487
+ * week's runs, we changed the policy — how would the new one do without
488
+ * re-running?"
489
+ *
490
+ * The math here is textbook (Dudík, Langford, Li 2011 for DR; Swaminathan
491
+ * & Joachims 2015 for SNIPS) but the *application* to LLM-agent
492
+ * evaluation needs care:
493
+ *
494
+ * - The "policy" is the (prompt, tool config, model snapshot) triple.
495
+ * Two policies have the same probability over an action *iff* their
496
+ * LLM call would emit the same token with the same probability —
497
+ * which is generally unknowable without the model log-probs.
498
+ * - For LLM agents, propensity scores must be supplied by the caller
499
+ * (logged in the trace, recovered from token log-probs, or estimated
500
+ * via a learned propensity model). We do NOT estimate propensity here.
501
+ * - Doubly-robust requires a Q-function (model-based reward predictor).
502
+ * We accept any callable; consumers pass either a tabular average,
503
+ * a regression fit, or a learned reward model.
504
+ *
505
+ * Bias / variance tradeoffs:
506
+ * - IPS: unbiased; high variance for small overlap, infinite variance
507
+ * when target has support outside behavior.
508
+ * - SNIPS: lower variance, slight bias; usually preferred in practice.
509
+ * - DR: doubly-robust — unbiased if either propensity OR Q-function is
510
+ * correct. Lowest practical variance when Q is decent. Use this.
511
+ *
512
+ * Caveat reviewers will raise: in the LLM-agent setting, propensity scores
513
+ * recovered from token log-probs are noisy, the action space is enormous,
514
+ * and overlap is often poor. These estimators are useful but not magic;
515
+ * complement with `replayCampaign` (exact replay where the request hashes
516
+ * match) for high-confidence answers and OPE for the gap.
517
+ */
518
+ interface OffPolicyTrajectory {
519
+ /** Stable id, for traceability through the dataset. */
520
+ runId: string;
521
+ /** Reward observed under the behavior policy (the realized outcome). */
522
+ reward: number;
523
+ /**
524
+ * Behavior-policy probability of the action that was taken. For LLM
525
+ * agents this is typically `exp(sum(token_log_probs))` over the chosen
526
+ * trajectory. Must be in (0, 1].
527
+ */
528
+ behaviorProb: number;
529
+ /**
530
+ * Target-policy probability of the same action. For replay-style
531
+ * counterfactual evaluation this is what the *new* policy would have
532
+ * assigned to the *old* trajectory. Must be in [0, 1].
533
+ */
534
+ targetProb: number;
535
+ /**
536
+ * Optional model-based reward prediction at the same context. Used by
537
+ * `doublyRobust`. Set to `null` for IPS-only evaluation.
538
+ */
539
+ qHat?: number | null;
540
+ }
541
+ interface OffPolicyEstimate {
542
+ /** Estimated value of the target policy. */
543
+ value: number;
544
+ /** Standard error of the estimate. */
545
+ standardError: number;
546
+ /** Effective sample size (Kong 1992). Lower = more reliance on a few high-weight samples. */
547
+ effectiveSampleSize: number;
548
+ /** Number of trajectories used. */
549
+ n: number;
550
+ /**
551
+ * Diagnostic: maximum importance weight observed. Large values (>>10x
552
+ * mean) are a red flag — variance is dominated by a few outliers.
553
+ */
554
+ maxImportanceWeight: number;
555
+ }
556
+ interface OffPolicyOptions {
557
+ /**
558
+ * Cap importance weights at this value (Ionides 2008 truncated IS) to
559
+ * trade unbiasedness for variance reduction. Default `Infinity` (no cap).
560
+ * Set e.g. `10` for stable estimates when the policies are close.
561
+ */
562
+ weightCap?: number;
563
+ /** Reward clipping range. Default `[0, 1]`. */
564
+ rewardClip?: {
565
+ low: number;
566
+ high: number;
567
+ };
568
+ }
569
+ /**
570
+ * Inverse Probability Weighting (Horvitz-Thompson). Unbiased estimator
571
+ * of E[reward under target policy]. Variance scales with the spread of
572
+ * target/behavior ratios.
573
+ */
574
+ declare function inverseProbabilityWeighting(trajectories: OffPolicyTrajectory[], opts?: OffPolicyOptions): OffPolicyEstimate;
575
+ /**
576
+ * Self-Normalized Importance Sampling. Lower variance than vanilla IPS at
577
+ * the cost of small bias (vanishing as N grows). The right default for
578
+ * LLM-agent evaluation where overlap is often poor.
579
+ */
580
+ declare function selfNormalizedImportanceWeighting(trajectories: OffPolicyTrajectory[], opts?: OffPolicyOptions): OffPolicyEstimate;
581
+ /**
582
+ * Doubly-robust off-policy estimator (Dudík, Langford, Li 2011).
583
+ *
584
+ * V_DR = (1/N) * sum_i [ q_hat_i + (target_prob_i / behavior_prob_i) * (r_i - q_hat_i) ]
585
+ *
586
+ * Unbiased if EITHER:
587
+ * - the importance ratios are correct (IPS-style validity), OR
588
+ * - the Q-hat function is correct (model-based validity).
589
+ *
590
+ * In practice both are imperfect, but the residual bias is the *product*
591
+ * of both errors — much smaller than either alone. This is why DR is the
592
+ * default in production OPE pipelines.
593
+ *
594
+ * Expects `qHat` on every trajectory. Entries whose `qHat` is `null`
595
+ * fall back to SNIPS (loud-fallback behavior; the
596
+ * report's `n` reflects the full set but `effectiveSampleSize` accounts
597
+ * for the lost variance reduction).
598
+ */
599
+ declare function doublyRobust(trajectories: OffPolicyTrajectory[], opts?: OffPolicyOptions): OffPolicyEstimate;
600
+ /**
601
+ * Convenience: run all three estimators and return them side-by-side.
602
+ * The recommended diagnostic — agreement across estimators is a much
603
+ * stronger signal than any single one.
604
+ */
605
+ declare function offPolicyEstimateAll(trajectories: OffPolicyTrajectory[], opts?: OffPolicyOptions): {
606
+ ips: OffPolicyEstimate;
607
+ snips: OffPolicyEstimate;
608
+ dr: OffPolicyEstimate;
609
+ };
610
+
611
+ /**
612
+ * Process reward extraction — step-level credit assignment from trace spans.
613
+ *
614
+ * RL on long-horizon agents needs *step-level* rewards, not run-level
615
+ * ones. The classic credit-assignment problem (Sutton & Barto) requires
616
+ * knowing which sub-decisions in a trajectory contributed to the
617
+ * outcome. Modern systems (DeepSeek-R1, OpenAI o-series, Lightman et al.
618
+ * "Let's Verify Step by Step" 2023) train *process reward models* (PRMs)
619
+ * that score every step, then do RL with the PRM as the reward signal.
620
+ *
621
+ * This module extracts `StepReward[]` from trace spans — one per
622
+ * meaningful step — and ships:
623
+ *
624
+ * 1. `extractStepRewards(store, runId, opts)` — span → step-reward
625
+ * conversion using configurable per-span scorers (LLM judge over the
626
+ * span output, deterministic checkers, or a learned PRM).
627
+ * 2. `runwiseStepRewardSummary(stepRewards)` — aggregate the per-step
628
+ * signal into a credit-assignment-aware run-level score.
629
+ * 3. `prmTrainingPairs(stepRewardsByRun, opts)` — produce the
630
+ * `(prefix, suffix_chosen, suffix_rejected)` triples that PRM
631
+ * training pipelines consume.
632
+ *
633
+ * What we ship: the *extraction* and *aggregation* infrastructure plus
634
+ * the data shape PRM training expects. We do NOT ship the actual PRM
635
+ * training (gradient descent over a transformer is out of scope for a
636
+ * TS package). The interface is the contract; downstream consumers wire
637
+ * their preferred trainer.
638
+ *
639
+ * Caveat the panel will land: this is descriptive credit assignment
640
+ * (which steps correlate with outcome), not causal credit assignment
641
+ * (which steps caused outcome). For causal claims you need
642
+ * counterfactual rollouts or a learned dynamics model. Future work; the
643
+ * descriptive version is what production PRM training actually uses.
644
+ */
645
+
646
+ interface StepReward {
647
+ /** Trace span this reward attaches to. */
648
+ spanId: string;
649
+ runId: string;
650
+ /** Index in the trajectory (0-based, in started-at order). */
651
+ stepIndex: number;
652
+ /** Span kind (typically 'tool', 'llm', 'judge'). */
653
+ kind: Span['kind'];
654
+ /** Span name — for the consumer's downstream filtering. */
655
+ name: string;
656
+ /** Step-level reward in [0, 1]. */
657
+ reward: number;
658
+ /**
659
+ * Determinism class. Mirrors the verifiable-reward distinction:
660
+ * deterministic = test/compile/schema check; probabilistic = LLM judge.
661
+ */
662
+ determinism: 'deterministic' | 'probabilistic';
663
+ /** Optional rationale / evidence — the trainer typically discards. */
664
+ rationale?: string;
665
+ /** Optional weight — how much this step contributes to credit assignment. */
666
+ weight?: number;
667
+ }
668
+ interface StepScorer {
669
+ /** Span kinds this scorer applies to. */
670
+ appliesTo: Span['kind'][];
671
+ /** Returns null/undefined to skip the span; otherwise resolves to a `StepReward` shape (without stepIndex/runId/spanId, which are filled in). */
672
+ score(span: Span): Promise<Omit<StepReward, 'spanId' | 'runId' | 'stepIndex'>> | null | undefined;
673
+ }
674
+ interface ExtractStepRewardsOptions {
675
+ /**
676
+ * Ordered list of scorers. Each span runs through scorers in order;
677
+ * the first non-null result wins. If no scorer applies, the span is
678
+ * skipped (not all spans are training-worthy).
679
+ */
680
+ scorers: StepScorer[];
681
+ /** Optional filter — return false to drop the span entirely before scoring. */
682
+ preFilter?: (span: Span) => boolean;
683
+ }
684
+ declare function extractStepRewards(store: TraceStore, runId: string, opts: ExtractStepRewardsOptions): Promise<StepReward[]>;
685
+ interface RunwiseStepSummary {
686
+ runId: string;
687
+ totalSteps: number;
688
+ meanReward: number;
689
+ /** Sum-of-rewards (weighted by `weight ?? 1`). Use as the run-level proxy. */
690
+ sumWeightedReward: number;
691
+ /** Fraction of steps where reward < 0.5 — proxy for "where the policy was wrong." */
692
+ failureFraction: number;
693
+ /** Maximum drop in reward between consecutive steps — diagnoses a step where things went sideways. */
694
+ worstStepDelta: number;
695
+ worstStepIndex: number | null;
696
+ }
697
+ declare function runwiseStepRewardSummary(stepRewards: StepReward[]): RunwiseStepSummary;
698
+ interface PrmTrainingTriple {
699
+ /** Prefix run-id (or composite key) — the trajectory up to step k-1. */
700
+ prefixRunId: string;
701
+ prefixStepIndex: number;
702
+ /** The step that came next on a high-reward trajectory. */
703
+ chosenSpanId: string;
704
+ chosenReward: number;
705
+ /** A step from a divergent low-reward trajectory at the same prefix length. */
706
+ rejectedSpanId: string;
707
+ rejectedReward: number;
708
+ /** The prefix run came from this run; the rejected step came from `rejectedRunId`. */
709
+ rejectedRunId: string;
710
+ marginScore: number;
711
+ }
712
+ /**
713
+ * Build PRM training triples. The shape: pair runs that share an early
714
+ * prefix (same scenario, same first N steps) and diverge later — at the
715
+ * point of divergence, the high-reward run's next step is `chosen`, the
716
+ * low-reward run's next step is `rejected`. This is the canonical PRM
717
+ * training data shape from Lightman et al. and DeepSeek-R1 process
718
+ * supervision.
719
+ *
720
+ * Implementation note: we don't have a way to detect "same prefix" in
721
+ * the general agent setting (token-level prefixes require hashing model
722
+ * outputs). The current heuristic groups by `(scenarioId, prefixSpanName
723
+ * sequence)` — runs are paired when their first K span names match. For
724
+ * production use this should be replaced with a proper trajectory-prefix
725
+ * hash; the heuristic is good enough for early-stage scaffolding.
726
+ */
727
+ declare function prmTrainingPairs(stepRewardsByRun: Map<string, StepReward[]>, opts?: {
728
+ minMargin?: number;
729
+ minPrefixLength?: number;
730
+ }): PrmTrainingTriple[];
731
+
732
+ /**
733
+ * Contamination probe — held-out perturbation tests.
734
+ *
735
+ * The bug class: once a benchmark scenario set is published, models train
736
+ * on it, and your scores become invalid. SWE-Bench-Verified, GPQA, and
737
+ * MMLU-Pro all exist because their predecessors got contaminated within
738
+ * months. The right defense is to keep a held-out *perturbed* version of
739
+ * every scenario — same task, slightly different surface — and check
740
+ * whether scores diverge significantly. Genuine capability transfers; rote
741
+ * memorization doesn't.
742
+ *
743
+ * This module ships the probe contract:
744
+ *
745
+ * 1. A `ScenarioPerturbation` strategy type — function that produces a
746
+ * perturbed scenario from an original.
747
+ * 2. `runContaminationProbe({ originals, perturbed, scoreFn })` — runs
748
+ * both halves and reports per-scenario score divergence + a global
749
+ * contamination verdict via paired Wilcoxon.
750
+ * 3. Several stock perturbations: `renameVariables`, `shuffleOrder`,
751
+ * `paraphrasePrompt`, `injectIrrelevantClause`. Each preserves the
752
+ * task's structural difficulty while breaking surface memorization.
753
+ *
754
+ * The verdict is conservative: if the perturbed-vs-original score
755
+ * difference is statistically significant (BH-adjusted p < 0.05) AND
756
+ * the median drop is > 5 percentage points, we flag *contamination
757
+ * suspected*. False positives are possible (the perturbation might
758
+ * actually be harder); the default is to flag for review, not to
759
+ * autoreject.
760
+ */
761
+ type ScenarioPerturbationKind = 'rename_variables' | 'shuffle_order' | 'paraphrase' | 'inject_irrelevant_clause' | 'custom';
762
+ interface ScenarioPerturbation<S> {
763
+ kind: ScenarioPerturbationKind;
764
+ /** Apply to one scenario, return its perturbed sibling. */
765
+ apply: (scenario: S) => Promise<S> | S;
766
+ /** Optional id — for the report. */
767
+ id?: string;
768
+ }
769
+ interface ContaminationProbeInput<S> {
770
+ /** Identity of every scenario. The probe's `runFingerprint` keys on these. */
771
+ scenarioId: (s: S) => string;
772
+ /** Original scenarios. */
773
+ originals: S[];
774
+ /**
775
+ * Either pre-computed perturbations (one per original, same order) OR a
776
+ * `perturbation` strategy that synthesizes them on the fly.
777
+ */
778
+ perturbed?: S[];
779
+ perturbation?: ScenarioPerturbation<S>;
780
+ /**
781
+ * Run the policy/agent against one scenario and return a scalar score
782
+ * in [0, 1]. The probe doesn't care what the policy is — that's the
783
+ * caller's contract.
784
+ */
785
+ scoreFn: (s: S) => Promise<number>;
786
+ }
787
+ interface ContaminationProbeOptions {
788
+ /** Drop scores below this from the probe; treats partial failures separately. Default 0. */
789
+ scoreFloor?: number;
790
+ /**
791
+ * BH-FDR threshold for declaring contamination on each per-scenario
792
+ * delta. Default 0.05.
793
+ */
794
+ fdr?: number;
795
+ /**
796
+ * Minimum median per-scenario drop to flag global contamination. Default
797
+ * 0.05 (5 percentage points). Smaller drops may be noise.
798
+ */
799
+ minMedianDrop?: number;
800
+ }
801
+ interface ContaminationProbeReport {
802
+ perScenario: Array<{
803
+ scenarioId: string;
804
+ originalScore: number;
805
+ perturbedScore: number;
806
+ delta: number;
807
+ /** Per-scenario q-value (single-test BH for a single scenario). Mainly for display. */
808
+ qValue: number;
809
+ }>;
810
+ /** Wilcoxon paired-test on the deltas. */
811
+ pairedTest: {
812
+ w: number;
813
+ p: number;
814
+ };
815
+ medianDelta: number;
816
+ meanDelta: number;
817
+ contaminationSuspected: boolean;
818
+ reason: string;
819
+ /** Number of scenarios processed. */
820
+ n: number;
821
+ }
822
+ declare function runContaminationProbe<S>(input: ContaminationProbeInput<S>, opts?: ContaminationProbeOptions): Promise<ContaminationProbeReport>;
823
+ /**
824
+ * Identifier-rename perturbation for code/text scenarios. Replaces every
825
+ * occurrence of the listed identifiers with synthesized aliases. Use when
826
+ * the scenario's structural difficulty is independent of variable names
827
+ * (e.g. SWE-Bench-style coding tasks).
828
+ */
829
+ declare function renameVariables<S extends {
830
+ prompt: string;
831
+ }>(identifiers: string[], rename?: (name: string, idx: number) => string): ScenarioPerturbation<S>;
832
+ /**
833
+ * Order-shuffle perturbation. Reshuffles a list-shaped section of the
834
+ * prompt (for QA scenarios that present options A/B/C/D — answer depends
835
+ * on the option labels, not order). Caller provides the section extractor.
836
+ */
837
+ declare function shuffleOrder<S extends {
838
+ prompt: string;
839
+ }>(shuffleSection: (prompt: string, rng: () => number) => string, seed: number): ScenarioPerturbation<S>;
840
+ /**
841
+ * Inject-irrelevant-clause perturbation. Adds a benign sentence that
842
+ * shouldn't change the answer. Tests for "did the model just memorize
843
+ * the input string."
844
+ */
845
+ declare function injectIrrelevantClause<S extends {
846
+ prompt: string;
847
+ }>(clause: string, position?: 'prefix' | 'suffix'): ScenarioPerturbation<S>;
848
+
849
+ /**
850
+ * Bradley-Terry / Elo tournament evaluation.
851
+ *
852
+ * For multi-candidate sweeps, comparing every candidate's score against
853
+ * a fixed comparator wastes information — the comparator becomes a high-
854
+ * variance reference and rank flips between near-tied middle-rank
855
+ * candidates are dominated by noise. Pairwise tournaments fix this:
856
+ * every (i, j) pair contributes a comparison to a Bradley-Terry MLE that
857
+ * estimates each candidate's strength on a unified scale.
858
+ *
859
+ * For online updating (rolling campaigns where new candidates arrive
860
+ * over time), we also ship classical Elo with configurable K-factor.
861
+ *
862
+ * References:
863
+ * - Bradley, R. A., Terry, M. E. (1952). Rank analysis of incomplete
864
+ * block designs. Biometrika, 39(3/4), 324–345.
865
+ * - Hunter, D. R. (2004). MM algorithms for generalized Bradley-Terry
866
+ * models. Annals of Statistics, 32(1), 384–406. (The MLE algorithm
867
+ * used here.)
868
+ * - Elo, A. E. (1978). The Rating of Chess Players, Past and Present.
869
+ *
870
+ * This is a useful primitive because most LLM-eval communities (Chatbot
871
+ * Arena, AlpacaEval, ELO-style ablation) have converged on pairwise
872
+ * tournament eval as the most sample-efficient and most rank-stable
873
+ * method when you have many candidates.
874
+ */
875
+ interface PairwiseOutcome {
876
+ /** Winner candidate id. */
877
+ winner: string;
878
+ /** Loser candidate id. */
879
+ loser: string;
880
+ /**
881
+ * Optional draw flag. When true, both candidates get half-credit
882
+ * (Bradley-Terry handles draws as half-wins for each side).
883
+ */
884
+ draw?: boolean;
885
+ /**
886
+ * Optional weight — useful if some pairwise comparisons are stronger
887
+ * signals than others (e.g. a paired test with a wider score gap is
888
+ * a more confident comparison). Default 1.
889
+ */
890
+ weight?: number;
891
+ }
892
+ interface BradleyTerryRating {
893
+ candidateId: string;
894
+ /** Latent strength θ ≥ 0 from the BT MLE. */
895
+ strength: number;
896
+ /** Log-strength = log(θ) — interpretable on a linear scale. */
897
+ logStrength: number;
898
+ /** Number of pairwise comparisons this candidate appears in. */
899
+ n: number;
900
+ /** Win count (+ 0.5 per draw). */
901
+ wins: number;
902
+ }
903
+ interface BradleyTerryFit {
904
+ ratings: BradleyTerryRating[];
905
+ /** Iterations of the MM algorithm before convergence. */
906
+ iterations: number;
907
+ /** Final maximum |θ_new - θ_old| / θ_old. */
908
+ finalDelta: number;
909
+ converged: boolean;
910
+ }
911
+ /**
912
+ * Bradley-Terry MLE via Hunter's MM algorithm.
913
+ *
914
+ * Iteration: θ_i^new = W_i / Σ_{j ≠ i} [ N_ij / (θ_i + θ_j) ]
915
+ * where W_i = wins by i (+ 0.5 per draw), N_ij = total comparisons.
916
+ *
917
+ * Returns log-strengths normalized so the smallest is 0 (any constant
918
+ * offset is unobservable in BT — only differences are identified).
919
+ */
920
+ declare function fitBradleyTerry(outcomes: PairwiseOutcome[], opts?: {
921
+ tolerance?: number;
922
+ maxIterations?: number;
923
+ smoothing?: number;
924
+ }): BradleyTerryFit;
925
+ /**
926
+ * Online Elo updates. Use when comparisons arrive over time and you want
927
+ * a running rating without re-fitting the full BT MLE on every update.
928
+ *
929
+ * Initialize ratings to `defaultRating` (1500 by default). Each call to
930
+ * `applyEloUpdate` mutates the map in place and returns the deltas so
931
+ * the caller can log per-comparison rating changes.
932
+ */
933
+ interface EloOptions {
934
+ /** Default rating for unseen candidates. Default 1500. */
935
+ defaultRating?: number;
936
+ /** K-factor controls the step size. Default 32 (FIDE-ish). */
937
+ kFactor?: number;
938
+ }
939
+ declare function applyEloUpdate(ratings: Map<string, number>, outcome: PairwiseOutcome, opts?: EloOptions): {
940
+ winnerDelta: number;
941
+ loserDelta: number;
942
+ };
943
+ /**
944
+ * Build pairwise outcomes from the campaign artifact: for every scenario
945
+ * shared by two candidates, the higher-scoring run wins. Useful when you
946
+ * want a tournament view of an existing campaign without an additional
947
+ * pairwise judge call.
948
+ */
949
+ interface BuildPairwiseFromCampaignInput {
950
+ runs: Array<{
951
+ candidateId: string;
952
+ /** Stable identifier for the matching unit (typically scenarioId). */
953
+ matchKey: string;
954
+ score: number;
955
+ }>;
956
+ /**
957
+ * Tied-score margin. Below this, the comparison is a draw. Default 0
958
+ * (no ties).
959
+ */
960
+ drawMargin?: number;
961
+ }
962
+ declare function buildPairwiseFromCampaign(input: BuildPairwiseFromCampaignInput): PairwiseOutcome[];
963
+
964
+ /**
965
+ * Adversarial scenario search.
966
+ *
967
+ * Capability evaluation on a fixed scenario set measures performance on
968
+ * the distribution someone curated. Production failure modes live in the
969
+ * tail — inputs the curator didn't think of, or actively avoided. The
970
+ * adversarial-search primitive actively looks for them: starting from a
971
+ * pool of scenarios where the policy already passes, it mutates them
972
+ * (paraphrase, edge-case substitution, compositional combination) and
973
+ * keeps the mutations that *break* the policy.
974
+ *
975
+ * This is not magic. It's the simplest version of the loop that AdA
976
+ * (Adaptive Agent, DeepMind 2023), POET, and Anthropic's
977
+ * auto-jailbreak rigs all run: hill-climb against a failure indicator,
978
+ * keep the survivors, repeat. We ship the harness; consumers supply the
979
+ * mutation strategies and the failure detector.
980
+ *
981
+ * Why ship this in agent-eval and not as a separate red-team tool: every
982
+ * piece of the standard adversarial loop is already in this package
983
+ * (`runEvalCampaign` for the matrix run, `RawProviderSink` for capture,
984
+ * `assertRunCaptured` for integrity, `pairedEvalueSequence` for stop
985
+ * criteria). The adversarial primitive is just the *scenario-mutation
986
+ * meta-loop* on top of that machinery.
987
+ */
988
+ interface AdversarialScenario<S> {
989
+ /** Stable id — used for deduplication and lineage tracking. */
990
+ id: string;
991
+ /** Generation index — 0 for seeds, 1 for first round of mutations, etc. */
992
+ generation: number;
993
+ /** Lineage — id of the parent scenario this was mutated from, if any. */
994
+ parentId: string | null;
995
+ scenario: S;
996
+ /** Score on the policy under test. Lower = adversarial signal. */
997
+ score: number | null;
998
+ /** Strategy that produced this mutation, for diagnostics. */
999
+ mutationStrategy: string | null;
1000
+ }
1001
+ interface AdversarialMutation<S> {
1002
+ id: string;
1003
+ /**
1004
+ * Mutate one scenario. Return an empty array to skip; otherwise return one or more new
1005
+ * scenarios. The harness deduplicates by `mutateScenarioId(scenario)`.
1006
+ */
1007
+ mutate(parent: S, rng: () => number): Promise<S[]> | S[];
1008
+ }
1009
+ interface AdversarialSearchOptions<S> {
1010
+ /** Initial scenarios — typically those the policy currently passes. */
1011
+ seeds: S[];
1012
+ /** Stable identifier extraction. */
1013
+ mutateScenarioId: (s: S) => string;
1014
+ /** Mutation strategies. */
1015
+ mutations: AdversarialMutation<S>[];
1016
+ /**
1017
+ * Run the policy under test against one scenario, return a scalar score
1018
+ * in [0, 1]. Lower = adversarial signal.
1019
+ */
1020
+ scoreFn: (s: S) => Promise<number>;
1021
+ /**
1022
+ * Threshold below which a scenario counts as a "failure" worth keeping.
1023
+ * Default 0.5.
1024
+ */
1025
+ failureThreshold?: number;
1026
+ /** Number of mutation rounds. Default 3. */
1027
+ rounds?: number;
1028
+ /** Children per parent per round. Default 4. */
1029
+ childrenPerParent?: number;
1030
+ /** Maximum total scenarios examined. Default Infinity. */
1031
+ budget?: number;
1032
+ /** Seed for the deterministic RNG. Default 1. */
1033
+ seed?: number;
1034
+ }
1035
+ interface AdversarialSearchReport<S> {
1036
+ scenarios: AdversarialScenario<S>[];
1037
+ /** Discovered failures sorted by score ascending. */
1038
+ failures: AdversarialScenario<S>[];
1039
+ /** Round-by-round counts. */
1040
+ byGeneration: Array<{
1041
+ generation: number;
1042
+ total: number;
1043
+ failures: number;
1044
+ meanScore: number;
1045
+ }>;
1046
+ /** Total scoreFn invocations consumed. */
1047
+ scoreCalls: number;
1048
+ }
1049
+ declare function adversarialScenarioSearch<S>(opts: AdversarialSearchOptions<S>): Promise<AdversarialSearchReport<S>>;
1050
+
1051
+ /**
1052
+ * Test-time compute scaling curves.
1053
+ *
1054
+ * The test-time-compute frontier paper (Snell et al. 2024) and the
1055
+ * subsequent o1-style scaling work both show that LLM-agent capability
1056
+ * is a function of the compute budget at inference, not just of the
1057
+ * training run. The right way to characterize a candidate is therefore
1058
+ * a *curve* — score at compute budgets {1×, 4×, 16×, …} — not a single
1059
+ * point.
1060
+ *
1061
+ * This module ships:
1062
+ *
1063
+ * 1. The compute-curve harness — `runComputeCurve({ candidateId, budgets, runAtBudget })` —
1064
+ * that evaluates one candidate at a sequence of compute budgets
1065
+ * and returns the (compute, score) curve.
1066
+ * 2. A best-of-N evaluator — `bestOfN({ n, sample, scoreFn })` — the
1067
+ * simplest test-time-compute scaling primitive: sample N
1068
+ * independent rollouts, return the best.
1069
+ * 3. A self-consistency evaluator — `selfConsistency({ n, sample, answerKey })` —
1070
+ * the majority-vote variant of best-of-N for tasks with a small
1071
+ * categorical answer space.
1072
+ * 4. Pareto-frontier extraction over multiple candidates — given
1073
+ * (candidate, compute, score) tuples, return the set of
1074
+ * candidate-compute combinations that aren't dominated.
1075
+ *
1076
+ * Caveat: "compute" here is the caller's notion of a compute unit. For
1077
+ * agent eval that's typically wall-time × parallelism, or token budget,
1078
+ * or LLM-call count. We accept whatever the caller provides; the curve
1079
+ * is on whatever axis they pick.
1080
+ */
1081
+ interface ComputeCurveBudget {
1082
+ /** Identifier — for the report. Common: '1x', '4x', '16x'. */
1083
+ id: string;
1084
+ /** Numeric value on the chosen axis (tokens, calls, USD, ms — caller picks). */
1085
+ cost: number;
1086
+ /** Free-form metadata (the caller can carry per-budget config). */
1087
+ meta?: Record<string, unknown>;
1088
+ }
1089
+ interface ComputeCurvePoint {
1090
+ budgetId: string;
1091
+ cost: number;
1092
+ score: number;
1093
+ /** Number of underlying samples used at this budget. */
1094
+ samples: number;
1095
+ /** Optional spread / variance information. */
1096
+ std?: number;
1097
+ /** Any extra metrics the runner returned. */
1098
+ metrics?: Record<string, number>;
1099
+ }
1100
+ interface ComputeCurve {
1101
+ candidateId: string;
1102
+ points: ComputeCurvePoint[];
1103
+ /** Rough log-linear fit: score ≈ a + b * log(cost). Useful for "how steep is the curve?" */
1104
+ logSlope: number | null;
1105
+ /** Best (highest-score) point on the curve. */
1106
+ best: ComputeCurvePoint;
1107
+ }
1108
+ interface RunComputeCurveOptions {
1109
+ candidateId: string;
1110
+ budgets: ComputeCurveBudget[];
1111
+ /**
1112
+ * Run the candidate at one budget. Returns the realized score plus
1113
+ * optional spread + extra metrics.
1114
+ */
1115
+ runAtBudget: (budget: ComputeCurveBudget) => Promise<{
1116
+ score: number;
1117
+ samples: number;
1118
+ std?: number;
1119
+ metrics?: Record<string, number>;
1120
+ }>;
1121
+ }
1122
+ declare function runComputeCurve(opts: RunComputeCurveOptions): Promise<ComputeCurve>;
1123
+ interface ComputeBestOfNOptions<O> {
1124
+ /** Number of independent samples to draw. */
1125
+ n: number;
1126
+ /** Sampler — produces one rollout. */
1127
+ sample: (sampleIdx: number) => Promise<O>;
1128
+ /** Score one rollout. */
1129
+ scoreFn: (rollout: O) => Promise<number> | number;
1130
+ }
1131
+ interface ComputeBestOfNResult<O> {
1132
+ best: O;
1133
+ bestScore: number;
1134
+ scores: number[];
1135
+ meanScore: number;
1136
+ /** Index of the best rollout, for diagnostics. */
1137
+ bestIndex: number;
1138
+ }
1139
+ /** The simplest test-time scaling primitive. */
1140
+ declare function bestOfN<O>(opts: ComputeBestOfNOptions<O>): Promise<ComputeBestOfNResult<O>>;
1141
+ interface SelfConsistencyOptions<O> {
1142
+ n: number;
1143
+ sample: (sampleIdx: number) => Promise<O>;
1144
+ /** Extract the canonical answer key (string) from a rollout. */
1145
+ answerKey: (rollout: O) => string;
1146
+ }
1147
+ interface SelfConsistencyResult<O> {
1148
+ /** Modal answer (the majority vote). */
1149
+ answer: string;
1150
+ /** Fraction of samples voting for the modal answer in [0, 1]. */
1151
+ agreement: number;
1152
+ /** Histogram of all answers. */
1153
+ histogram: Record<string, number>;
1154
+ /** A representative rollout that voted for the modal answer. */
1155
+ representative: O;
1156
+ /** All rollouts. */
1157
+ rollouts: O[];
1158
+ }
1159
+ /**
1160
+ * Self-consistency / majority-vote test-time scaling. For tasks with a
1161
+ * small categorical answer space (math problems, multiple choice).
1162
+ */
1163
+ declare function selfConsistency<O>(opts: SelfConsistencyOptions<O>): Promise<SelfConsistencyResult<O>>;
1164
+ /**
1165
+ * Pareto frontier over (candidate, compute, score) tuples. A point is on
1166
+ * the frontier iff no other point dominates it in both score (higher
1167
+ * better) and cost (lower better). Returns the frontier sorted ascending
1168
+ * by cost.
1169
+ */
1170
+ interface ParetoPointInput {
1171
+ candidateId: string;
1172
+ budgetId: string;
1173
+ cost: number;
1174
+ score: number;
1175
+ }
1176
+ declare function paretoFrontier(points: ParetoPointInput[]): ParetoPointInput[];
1177
+
1178
+ /**
1179
+ * Adaptive curriculum / active scenario selection.
1180
+ *
1181
+ * Fixed scenario sets waste sample budget on cells the policy already
1182
+ * passes (no information left) and cells the policy never passes (no
1183
+ * gradient available either). Active learning over scenarios fixes this
1184
+ * by allocating the next sample budget to cells where the policy's
1185
+ * outcome is *uncertain* — those carry the most decision-relevant signal.
1186
+ *
1187
+ * This module ships two complementary strategies:
1188
+ *
1189
+ * 1. **Variance-based** — score each (variant, scenario) cell by the
1190
+ * empirical variance of past observations. Allocate next-round budget
1191
+ * proportional to variance. Standard active-learning-by-uncertainty
1192
+ * heuristic; works well when the policy is non-deterministic and
1193
+ * cells differ in observation noise.
1194
+ *
1195
+ * 2. **Bandit-based (Thompson sampling)** — model each (variant,
1196
+ * scenario) cell as a Beta-Bernoulli arm; sample a posterior; pick
1197
+ * cells whose posterior mean is closest to the per-scenario decision
1198
+ * threshold. The right primitive when scenarios are
1199
+ * "pass/fail" rather than continuous, and when promotion gates fire
1200
+ * at a known threshold (e.g., 0.5).
1201
+ *
1202
+ * The output is a *next-round budget allocation* — a list of (variant,
1203
+ * scenario, count) triples. The consumer's matrix runner consumes the
1204
+ * allocation, runs those cells, feeds the new observations back. Loop.
1205
+ *
1206
+ * Out of scope (deliberate): scenario *generation* — that's the
1207
+ * adversarial primitive's job. This module allocates over an existing
1208
+ * scenario pool.
1209
+ */
1210
+
1211
+ interface CellObservation {
1212
+ variantId: string;
1213
+ scenarioId: string;
1214
+ /** Observed score in [0, 1]. */
1215
+ score: number;
1216
+ /** For Bernoulli arms — derive from the score with a threshold if needed. */
1217
+ pass?: boolean;
1218
+ }
1219
+ interface CurriculumAllocation {
1220
+ variantId: string;
1221
+ scenarioId: string;
1222
+ /** How many additional reps to run on this cell. */
1223
+ count: number;
1224
+ /** Strategy-specific reason for the allocation. */
1225
+ reason: string;
1226
+ }
1227
+ interface VarianceCurriculumOptions {
1228
+ /** Total reps to allocate across all cells. */
1229
+ budget: number;
1230
+ /**
1231
+ * Smoothing prior on variance — keeps the allocator from giving up
1232
+ * on a cell with one observation just because its 1-sample variance is
1233
+ * 0. Default 0.05.
1234
+ */
1235
+ variancePrior?: number;
1236
+ /**
1237
+ * Minimum reps per cell — even when the variance estimate is low, give
1238
+ * every cell at least this many. Default 1.
1239
+ */
1240
+ floorPerCell?: number;
1241
+ }
1242
+ /**
1243
+ * Variance-proportional allocation. For each cell, estimate variance from
1244
+ * past observations + a prior, then allocate the budget proportional to
1245
+ * (sqrt(variance) + 1/sqrt(n)) — a classical optimal-allocation rule
1246
+ * (Neyman 1934) that balances "explore noisy cells" with "explore
1247
+ * under-sampled cells."
1248
+ */
1249
+ declare function varianceBasedCurriculum(observations: CellObservation[], candidateCells: Array<{
1250
+ variantId: string;
1251
+ scenarioId: string;
1252
+ }>, opts: VarianceCurriculumOptions): CurriculumAllocation[];
1253
+ interface ThompsonCurriculumOptions {
1254
+ budget: number;
1255
+ /**
1256
+ * The per-scenario decision threshold. Cells whose posterior mean is
1257
+ * closest to this get the most budget — that's where the next observation
1258
+ * has the highest information value for the gate decision. Default 0.5.
1259
+ */
1260
+ decisionThreshold?: number;
1261
+ /** Beta prior parameters. Default α=β=1 (uniform). */
1262
+ priorAlpha?: number;
1263
+ priorBeta?: number;
1264
+ /** Seed the Thompson sampler. Default unset (Math.random). */
1265
+ seed?: number;
1266
+ }
1267
+ /**
1268
+ * Thompson-sampling-style allocation for pass/fail cells. For each cell:
1269
+ *
1270
+ * - Maintain Beta(α + passes, β + failures) posterior on pass-rate
1271
+ * - Allocation weight ∝ exp(-((sampledMean - threshold) / σ)^2):
1272
+ * cells whose sampled posterior straddles the decision boundary get
1273
+ * the most weight; cells already clearly above or below get less.
1274
+ *
1275
+ * This is the right primitive when promotion gates fire at a known
1276
+ * threshold and you want to sharpen the posterior near the boundary.
1277
+ */
1278
+ declare function thompsonCurriculum(observations: CellObservation[], candidateCells: Array<{
1279
+ variantId: string;
1280
+ scenarioId: string;
1281
+ }>, opts: ThompsonCurriculumOptions): CurriculumAllocation[];
1282
+ /** Convenience: extract `CellObservation[]` directly from `RunRecord[]`. */
1283
+ declare function observationsFromRunRecords(runs: RunRecord[], opts?: {
1284
+ passThreshold?: number;
1285
+ useHoldout?: boolean;
1286
+ }): CellObservation[];
1287
+
1288
+ /**
1289
+ * Reward hacking / Goodhart detection.
1290
+ *
1291
+ * Goodhart's Law says: when a measure becomes a target, it ceases to be
1292
+ * a good measure. In RLHF and agentic-RL settings this is the dominant
1293
+ * failure mode — the policy learns to produce outputs that score well on
1294
+ * the proxy reward (judge, rubric, test pass-rate) without producing
1295
+ * the underlying capability the proxy was meant to track.
1296
+ *
1297
+ * Krakovna et al. (2020, "Specification Gaming Examples in AI") and the
1298
+ * subsequent RLHF reward-hacking literature (Skalse et al. 2022, Kim et al.
1299
+ * 2023) converge on a few diagnostic signatures:
1300
+ *
1301
+ * 1. **Reward divergence:** the proxy reward grows while the held-out
1302
+ * ground-truth signal stagnates or drops. Predictive validity over
1303
+ * time captures this.
1304
+ * 2. **Distributional shift in outputs:** after RL, the policy produces
1305
+ * outputs that no longer match the reference distribution — usually
1306
+ * because it found a high-reward attractor that's degenerate (e.g.
1307
+ * one-token responses, repetition, formatting tricks).
1308
+ * 3. **Disagreement between independent rewards:** if you train on
1309
+ * reward A and a held-out independent reward B drops sharply, you're
1310
+ * probably hacking A.
1311
+ * 4. **Calibration drift:** the verifiable / deterministic component of
1312
+ * the reward is stable; the probabilistic / judge component drifts up
1313
+ * while the deterministic component doesn't. The judge is being
1314
+ * gamed.
1315
+ *
1316
+ * This module ships explicit detectors for all four signatures, plus a
1317
+ * combined verdict. The output is diagnostic — actionable signals,
1318
+ * not autoreject — because each signature has known false positives
1319
+ * (e.g., a policy that genuinely improves can show distributional shift).
1320
+ *
1321
+ * Differs from `rubricPredictiveValidity` (which is a *standing* check on
1322
+ * whether rubrics correlate with deployment outcomes) — this is a
1323
+ * *temporal* check on whether the reward-vs-truth gap is *widening over
1324
+ * time during a training run*.
1325
+ */
1326
+
1327
+ type RewardHackingSignal = 'reward_divergence' | 'distribution_shift' | 'reward_disagreement' | 'judge_drift';
1328
+ interface RewardHackingFinding {
1329
+ signal: RewardHackingSignal;
1330
+ /** Severity in [0, 1]. >0.5 = strong signal. */
1331
+ severity: number;
1332
+ message: string;
1333
+ /** Numeric evidence the consumer can render. */
1334
+ detail: Record<string, number>;
1335
+ }
1336
+ interface RewardHackingReport {
1337
+ findings: RewardHackingFinding[];
1338
+ /**
1339
+ * Composite verdict. `'clean'` if every signal severity < 0.3;
1340
+ * `'suspect'` if at least one ≥ 0.3 but none ≥ 0.6; `'gaming'` if any ≥ 0.6.
1341
+ */
1342
+ verdict: 'clean' | 'suspect' | 'gaming';
1343
+ /** Rationale for the verdict, ready to paste into an audit log. */
1344
+ rationale: string[];
1345
+ /** Number of paired (proxy, truth) data points the report saw. */
1346
+ n: number;
1347
+ }
1348
+ interface DetectRewardHackingInput {
1349
+ /**
1350
+ * Run records in chronological order (oldest first). The detector segments
1351
+ * them into prefix/suffix windows to compute "did the gap widen."
1352
+ */
1353
+ runs: RunRecord[];
1354
+ /**
1355
+ * The metric the policy was trained to optimize. Should be present on
1356
+ * `outcome.raw` or `outcome.holdoutScore`. Default reads `outcome.holdoutScore`.
1357
+ */
1358
+ proxyOf?: (run: RunRecord) => number | null;
1359
+ /**
1360
+ * The held-out ground-truth metric. For RL on coding, this is typically
1361
+ * test pass-rate. For RLHF, it's downstream task performance or human
1362
+ * preference. For knowledge tasks, it's an independently-graded score.
1363
+ */
1364
+ truthOf?: (run: RunRecord) => number | null;
1365
+ /**
1366
+ * Independent secondary reward. Used for the `reward_disagreement`
1367
+ * signal. Default uses the verifiable reward extractor (deterministic
1368
+ * sources only).
1369
+ */
1370
+ secondaryRewardOf?: (run: RunRecord) => number | null;
1371
+ /**
1372
+ * Window size — how many of the most recent runs count as the "after"
1373
+ * cohort. Default min(50, half the runs).
1374
+ */
1375
+ windowSize?: number;
1376
+ /**
1377
+ * Severity thresholds for flagging a signal. Defaults: 0.3 (suspect) and 0.6
1378
+ * (gaming).
1379
+ */
1380
+ thresholds?: {
1381
+ suspect?: number;
1382
+ gaming?: number;
1383
+ };
1384
+ /**
1385
+ * Verifiable-reward options used for the secondary-reward fallback.
1386
+ */
1387
+ verifiableRewardOptions?: VerifiableRewardExtractionOptions;
1388
+ }
1389
+ declare function detectRewardHacking(input: DetectRewardHackingInput): RewardHackingReport;
1390
+
1391
+ /**
1392
+ * Sample-efficient adaptation evaluation.
1393
+ *
1394
+ * For foundation-model-based agents, the load-bearing capability isn't
1395
+ * raw end-state performance — it's *how fast the agent reaches that
1396
+ * performance from cold start*. The same model with a worse prompt that
1397
+ * adapts in 5 demonstrations beats the same model with a better prompt
1398
+ * that needs 50. Standard meta-learning eval (Finn et al., MAML, RL² lit)
1399
+ * reports an *adaptation curve*: score after k=0, 1, 2, 4, 8, 16, …
1400
+ * in-context examples or fine-tune steps.
1401
+ *
1402
+ * This module ships:
1403
+ *
1404
+ * 1. `runAdaptationCurve` — given a runner that takes k demonstrations
1405
+ * and returns a score, produce the (k, score) curve.
1406
+ * 2. `compareAdaptationCurves` — paired comparison across two policies.
1407
+ * Returns per-k delta with bootstrap CIs and an "area-under-curve"
1408
+ * summary statistic.
1409
+ * 3. `firstPassK` — for pass/fail evaluation, the minimum k at which
1410
+ * the policy reliably passes (≥ pass-rate threshold over reps).
1411
+ *
1412
+ * Use cases:
1413
+ * - Compare two prompt designs that have similar end-state performance
1414
+ * but different in-context efficiency.
1415
+ * - Decide between fine-tuning and prompting based on adaptation cost.
1416
+ * - Detect when a policy "memorizes" k=0 inputs vs. genuinely adapts.
1417
+ */
1418
+ interface AdaptationRunner<S> {
1419
+ /**
1420
+ * Runs the policy on `scenario` with `k` demonstrations. Returns a
1421
+ * scalar score in [0, 1]. The runner is responsible for any caching;
1422
+ * the harness calls it once per (scenario, k, rep) cell.
1423
+ */
1424
+ run(args: {
1425
+ scenario: S;
1426
+ k: number;
1427
+ rep: number;
1428
+ }): Promise<number>;
1429
+ }
1430
+ interface RunAdaptationCurveOptions<S> {
1431
+ scenarios: S[];
1432
+ /** Number-of-shots to evaluate at. Default `[0, 1, 2, 4, 8, 16]`. */
1433
+ ks?: number[];
1434
+ /** Reps per (scenario, k) cell. Default 3. */
1435
+ reps?: number;
1436
+ runner: AdaptationRunner<S>;
1437
+ /** Pass-rate threshold for `firstPassK` reporting. Default 0.5. */
1438
+ passThreshold?: number;
1439
+ }
1440
+ interface AdaptationPoint {
1441
+ k: number;
1442
+ meanScore: number;
1443
+ passRate: number;
1444
+ std: number;
1445
+ n: number;
1446
+ /** Per-scenario means at this k. */
1447
+ perScenario: Array<{
1448
+ scenarioId: string;
1449
+ meanScore: number;
1450
+ passes: number;
1451
+ total: number;
1452
+ }>;
1453
+ }
1454
+ interface AdaptationCurve {
1455
+ points: AdaptationPoint[];
1456
+ /**
1457
+ * Smallest `k` at which `passRate ≥ passThreshold`. `null` if no `k`
1458
+ * tested reaches it.
1459
+ */
1460
+ firstPassK: number | null;
1461
+ /**
1462
+ * Area under the (k, meanScore) curve, normalized by max-k. A
1463
+ * single-number summary of "how well does this policy adapt from
1464
+ * cold-start to fully-conditioned." Higher = better adapter.
1465
+ */
1466
+ adaptationArea: number;
1467
+ }
1468
+ declare function runAdaptationCurve<S extends {
1469
+ scenarioId?: string;
1470
+ }>(opts: RunAdaptationCurveOptions<S>): Promise<AdaptationCurve>;
1471
+ interface CompareCurvesResult {
1472
+ perK: Array<{
1473
+ k: number;
1474
+ deltaMean: number;
1475
+ aLow: number;
1476
+ aHigh: number;
1477
+ bLow: number;
1478
+ bHigh: number;
1479
+ }>;
1480
+ areaDelta: number;
1481
+ firstPassKDelta: number | null;
1482
+ /** Verdict: 'a_better' | 'b_better' | 'similar'. */
1483
+ verdict: 'a_better' | 'b_better' | 'similar';
1484
+ /** Rationale, ready to render. */
1485
+ rationale: string;
1486
+ }
1487
+ /**
1488
+ * Paired comparison of two adaptation curves. Per-k deltas with 95%
1489
+ * bootstrap CIs (constructed from each curve's `perScenario` per-k means
1490
+ * — the bootstrap unit is the scenario, not the rep).
1491
+ */
1492
+ declare function compareAdaptationCurves(a: AdaptationCurve, b: AdaptationCurve, opts?: {
1493
+ confidence?: number;
1494
+ bootstrapResamples?: number;
1495
+ seed?: number;
1496
+ }): CompareCurvesResult;
1497
+ /** Smallest tested k at which the curve's pass rate reaches `threshold`; `null` when no tested k does. */
1498
+ declare function firstPassK(curve: AdaptationCurve, threshold?: number): number | null;
1499
+
1500
+ /**
1501
+ * Trainer-format exporters.
1502
+ *
1503
+ * agent-eval produces canonical artifacts (`RunRecord[]`, `PreferenceTriple[]`,
1504
+ * `StepReward[]`, `PrmTrainingTriple[]`). RL training pipelines consume
1505
+ * different shapes — Hugging Face TRL, Prime Intellect's prime-rl, OpenAI
1506
+ * fine-tuning, Anthropic finetuning, OpenRLHF, verl. Each has its own
1507
+ * JSONL conventions. Rather than ship N adapters, this module ships the
1508
+ * canonical formats most production pipelines accept and ergonomic helpers
1509
+ * for the rest.
1510
+ *
1511
+ * Shapes:
1512
+ * - **DPO / IPO / KTO** — `{prompt, chosen, rejected}` JSONL. Consumed
1513
+ * by HuggingFace TRL, prime-rl's offline DPO, OpenRLHF.
1514
+ * - **GRPO offline** — `{prompt, completions[], rewards[]}` JSONL.
1515
+ * Consumed by prime-rl GRPO, verl, OpenRLHF.
1516
+ * - **SFT** — `{messages[]}` JSONL with chosen completion as the final
1517
+ * assistant turn. Consumed by HF SFT trainers, OpenAI fine-tuning,
1518
+ * Anthropic finetuning.
1519
+ * - **PRM** — `{prompt, prefix_steps[], chosen_step, rejected_step}` JSONL.
1520
+ * Consumed by Lightman-style PRM trainers and prime-rl's PRM mode.
1521
+ *
1522
+ * Why ship this in agent-eval rather than a separate adapter package: the
1523
+ * canonical artifacts (`RunRecord[]`, `PreferenceTriple[]`, etc.) are
1524
+ * agent-eval's contract; without first-party exporters consumers reverse-
1525
+ * engineer the mapping every release. The exporters codify it.
1526
+ *
1527
+ * The exporters take callbacks for any field that isn't on the canonical
1528
+ * artifact (specifically: prompt + completion text, since the package
1529
+ * stores only their hashes by design — full text is the consumer's
1530
+ * trace store / raw event log).
1531
+ */
1532
+
1533
+ interface DpoLookups {
1534
+ /** Resolve the prompt text for a run (typically from a trace store / raw event sink). */
1535
+ promptOf: (runId: string) => string | Promise<string>;
1536
+ /** Resolve the assistant completion text for a run. */
1537
+ completionOf: (runId: string) => string | Promise<string>;
1538
+ }
1539
+ interface DpoExportRow {
1540
+ prompt: string;
1541
+ chosen: string;
1542
+ rejected: string;
1543
+ /** Carried-through margin. Some KTO / IPO variants use this. */
1544
+ margin?: number;
1545
+ /** Free-form metadata for downstream filtering / sharding. */
1546
+ meta?: Record<string, unknown>;
1547
+ }
1548
+ /**
1549
+ * Convert preference triples to TRL-compatible DPO rows. The shape
1550
+ * `{prompt, chosen, rejected}` is the canonical HuggingFace DPODataset
1551
+ * entry; every major DPO trainer accepts it.
1552
+ */
1553
+ declare function toDpoRows(triples: PreferenceTriple[], lookups: DpoLookups): Promise<DpoExportRow[]>;
1554
+ /** Serialize DPO rows as JSONL. One line per row. */
1555
+ declare function toDpoJsonl(rows: DpoExportRow[]): string;
1556
+ interface GrpoLookups {
1557
+ promptOf: (runId: string) => string | Promise<string>;
1558
+ completionOf: (runId: string) => string | Promise<string>;
1559
+ /** Optional: derive a custom reward from the run. Defaults to score. */
1560
+ rewardOf?: (run: RunRecord) => number | null;
1561
+ }
1562
+ interface GrpoExportRow {
1563
+ prompt: string;
1564
+ completions: string[];
1565
+ rewards: number[];
1566
+ /** runIds in the same order as `completions[]` for traceability. */
1567
+ runIds: string[];
1568
+ meta?: Record<string, unknown>;
1569
+ }
1570
+ /**
1571
+ * Convert RunRecord[] grouped by `(scenarioId)` into GRPO offline rows —
1572
+ * one row per scenario, with one completion per run on that scenario.
1573
+ *
1574
+ * GRPO (Shao et al. 2024 / DeepSeek-R1) trains on relative advantages
1575
+ * within a group of completions for the same prompt; this is the
1576
+ * canonical input format.
1577
+ */
1578
+ declare function toGrpoRows(runs: RunRecord[], lookups: GrpoLookups): Promise<GrpoExportRow[]>;
1579
+ declare function toGrpoJsonl(rows: GrpoExportRow[]): string;
1580
+ interface SftLookups {
1581
+ promptOf: (runId: string) => string | Promise<string>;
1582
+ completionOf: (runId: string) => string | Promise<string>;
1583
+ /** Optional system message. Default omits. */
1584
+ systemOf?: (run: RunRecord) => string | null | undefined;
1585
+ /** Filter — return false to skip the run (e.g., low score, failed cases). */
1586
+ include?: (run: RunRecord) => boolean;
1587
+ }
1588
+ interface SftExportRow {
1589
+ messages: Array<{
1590
+ role: 'system' | 'user' | 'assistant';
1591
+ content: string;
1592
+ }>;
1593
+ meta?: Record<string, unknown>;
1594
+ }
1595
+ /**
1596
+ * Convert RunRecord[] into Hugging Face / OpenAI / Anthropic-style
1597
+ * conversational SFT rows. By default every record becomes one row;
1598
+ * pass `include` to filter (e.g., keep only `score >= 0.8` for
1599
+ * rejection-sampling SFT).
1600
+ */
1601
+ declare function toSftRows(runs: RunRecord[], lookups: SftLookups): Promise<SftExportRow[]>;
1602
+ declare function toSftJsonl(rows: SftExportRow[]): string;
1603
+ interface PrmLookups {
1604
+ /** Resolve the prompt text for a run. */
1605
+ promptOf: (runId: string) => string | Promise<string>;
1606
+ /** Resolve the trajectory step text for a (runId, spanId) pair. */
1607
+ stepTextOf: (runId: string, spanId: string) => string | Promise<string>;
1608
+ /** Optional: sequence of prefix span ids leading up to the divergence. */
1609
+ prefixOf?: (runId: string, prefixStepIndex: number) => string[] | Promise<string[]>;
1610
+ }
1611
+ interface PrmExportRow {
1612
+ prompt: string;
1613
+ /** Span ids for the steps before divergence — caller resolves text via `stepTextOf`. */
1614
+ prefixSpanIds: string[];
1615
+ prefixStepText: string[];
1616
+ chosenStep: string;
1617
+ rejectedStep: string;
1618
+ chosenReward: number;
1619
+ rejectedReward: number;
1620
+ marginScore: number;
1621
+ meta?: Record<string, unknown>;
1622
+ }
1623
+ /**
1624
+ * Convert PRM training triples to JSONL rows. Caller's `stepTextOf`
1625
+ * callback resolves span text from the consumer's trace store.
1626
+ */
1627
+ declare function toPrmRows(triples: PrmTrainingTriple[], lookups: PrmLookups): Promise<PrmExportRow[]>;
1628
+ declare function toPrmJsonl(rows: PrmExportRow[]): string;
1629
+ interface StepRewardJsonlRow {
1630
+ runId: string;
1631
+ spanId: string;
1632
+ stepIndex: number;
1633
+ reward: number;
1634
+ determinism: 'deterministic' | 'probabilistic';
1635
+ weight: number;
1636
+ }
1637
+ declare function stepRewardsToJsonl(stepRewards: StepReward[]): string;
1638
+
1639
+ /**
1640
+ * `runRLCampaign` — the missing top-level orchestrator.
1641
+ *
1642
+ * `runEvalCampaign` runs the matrix and produces `RunRecord[]`. The 0.23
1643
+ * RL primitives consume that artifact in different ways. Until 0.24 they
1644
+ * had to be wired together by hand at every consumer; that defeats the
1645
+ * cohesion the package is supposed to provide.
1646
+ *
1647
+ * `runRLCampaign` wires:
1648
+ * 1. `runEvalCampaign` for the matrix run (capture, integrity, hooks)
1649
+ * 2. `extractVerifiableReward` over each run, separating deterministic
1650
+ * from probabilistic reward sources for the trainer
1651
+ * 3. `extractPreferences` to produce DPO/PPO/KTO triples
1652
+ * 4. `evaluateInterimReleaseConfidence` over paired deltas (anytime-valid)
1653
+ * 5. `rubricPredictiveValidity` against an outcome store, when provided
1654
+ * 6. `detectRewardHacking` as a standing hygiene check
1655
+ * 7. Trainer-format export rows ready for prime-rl / TRL / verl
1656
+ *
1657
+ * The output `RLCampaignResult` is a single, audit-ready artifact: every
1658
+ * stage's output is in there. The consumer's downstream fits in a single
1659
+ * line: pass `result.preferences` to their DPO trainer, `result.trainerRows.grpo`
1660
+ * to GRPO, `result.runs` plus `result.rewardSignals` to a custom RL loop.
1661
+ *
1662
+ * This is what the 0.23 panel critique called the "missing top-level
1663
+ * primitive." Now shipped.
1664
+ */
1665
+
1666
+ interface RunRLCampaignOptions<V> extends EvalCampaignOptions<V> {
1667
+ /** Preference-extraction options. Default uses paired-by-scenario-and-seed with min-margin 0.05. */
1668
+ preferences?: ExtractPreferencesOptions;
1669
+ /** Verifiable-reward extraction options. */
1670
+ verifiableReward?: VerifiableRewardExtractionOptions;
1671
+ /** Outcome store + metric names — when supplied, runs `rubricPredictiveValidity` post-campaign. */
1672
+ outcomeStore?: OutcomeStore;
1673
+ outcomeMetrics?: string[];
1674
+ /** Anytime-valid sequential evaluation options. */
1675
+ sequential?: {
1676
+ alpha?: number;
1677
+ bound?: number;
1678
+ rope?: {
1679
+ low: number;
1680
+ high: number;
1681
+ };
1682
+ };
1683
+ /** Trainer-format export lookups. When provided, the orchestrator builds the corresponding rows. */
1684
+ trainerExport?: {
1685
+ dpo?: DpoLookups;
1686
+ grpo?: GrpoLookups;
1687
+ sft?: SftLookups;
1688
+ };
1689
+ }
1690
+ interface RLCampaignResult<V> {
1691
+ campaign: EvalCampaignResult;
1692
+ /** Per-run verifiable reward (deterministic when available, probabilistic fallback otherwise). */
1693
+ rewardSignals: Array<{
1694
+ runId: string;
1695
+ reward: VerifiableReward | null;
1696
+ }>;
1697
+ /** Preference extraction report. */
1698
+ preferences: PreferenceExtractionReport;
1699
+ /** Anytime-valid interim verdict over the paired deltas (vs comparator). */
1700
+ interimConfidence: InterimReleaseConfidence | null;
1701
+ /** Standing reward-hacking hygiene check. */
1702
+ rewardHacking: RewardHackingReport;
1703
+ /** Predictive validity, when an outcome store was supplied. */
1704
+ predictiveValidity: RubricPredictiveValidityReport | null;
1705
+ /** Trainer-export rows, populated only for the formats the caller requested via `trainerExport`. */
1706
+ trainerRows: {
1707
+ dpo?: DpoExportRow[];
1708
+ grpo?: GrpoExportRow[];
1709
+ sft?: SftExportRow[];
1710
+ };
1711
+ /**
1712
+ * One-line top-level summary the consumer can log.
1713
+ */
1714
+ summary: string;
1715
+ /**
1716
+ * Convenience type-tag — consumers can branch on `result.kind`.
1717
+ */
1718
+ kind: 'agent-eval-rl-campaign';
1719
+ unusedVariant?: V;
1720
+ }
1721
+ declare function runRLCampaign<V>(opts: RunRLCampaignOptions<V>): Promise<RLCampaignResult<V>>;
1722
+
1723
+ /**
1724
+ * `PredictiveValidityResearcher` — concrete `Researcher` implementation
1725
+ * that drives selection from outcome-anchored predictive validity.
1726
+ *
1727
+ * `Researcher` was a placeholder interface plus `NoopResearcher` until
1728
+ * 0.23. The 0.23 panel critique called this out: shipping the interface
1729
+ * without a default implementation that drives the loop is incomplete.
1730
+ *
1731
+ * This researcher answers each method:
1732
+ *
1733
+ * - `inspectFailures(runs)` — synthesizes failure modes from the
1734
+ * bottom-quartile of `RunRecord`s on the configured proxy reward.
1735
+ * - `proposeChange(failures)` — proposes steering changes that target
1736
+ * the rubrics with the lowest predictive validity (decorative ones).
1737
+ * Either reduce their weight in the composite, or recalibrate them.
1738
+ * - `applyChange(changes, baseline)` — merges the proposed steering
1739
+ * into the experiment plan.
1740
+ * - `evaluateChange(plan)` — re-runs the predictive-validity check on
1741
+ * the post-change runs and reports the delta.
1742
+ *
1743
+ * The result is a closed loop: the rubric weights drift toward the ones
1744
+ * that actually predict deployment outcomes, automatically. Pair with
1745
+ * `runRLCampaign` for the full auto-research story.
1746
+ */
1747
+
1748
+ interface PredictiveValidityResearcherOptions {
1749
+ outcomes: OutcomeStore;
1750
+ outcomeMetrics: string[];
1751
+ /** Score threshold below which a run counts as a "failure." Default 0.5. */
1752
+ failureThreshold?: number;
1753
+ /** Spearman bucket below which a rubric is "decorative." Default 0.4. */
1754
+ decorativeThreshold?: number;
1755
+ /** Optional steering-namespace prefix for proposed changes. Default `'rubric_weight'`. */
1756
+ steeringNamespace?: string;
1757
+ /** Override the rubric set the researcher inspects. Default: every numeric `outcome.raw` key seen. */
1758
+ rubrics?: string[];
1759
+ /**
1760
+ * Snapshot stash hook — called with the most recent predictive-validity
1761
+ * report. Useful when a downstream system wants to log rubric drift over
1762
+ * time. Default no-op.
1763
+ */
1764
+ onReport?: (report: RubricPredictiveValidityReport) => void | Promise<void>;
1765
+ }
1766
+ /**
1767
+ * Concrete `Researcher` driven by `rubricPredictiveValidity`. The brain:
1768
+ * rubrics that don't predict deployment outcomes don't earn weight.
1769
+ */
1770
+ declare class PredictiveValidityResearcher implements Researcher {
1771
+ private opts;
1772
+ private lastReport;
1773
+ constructor(opts: PredictiveValidityResearcherOptions);
1774
+ inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
1775
+ proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
1776
+ applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
1777
+ evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
1778
+ /**
1779
+ * Run the predictive-validity check explicitly against a fresh RunRecord
1780
+ * set. Updates the researcher's cached report so subsequent
1781
+ * `proposeChange` calls have evidence to draw from.
1782
+ */
1783
+ runValidityCheck(runs: RunRecord[]): Promise<RubricPredictiveValidityReport>;
1784
+ /**
1785
+ * Force-feed a predictive-validity report into the researcher state —
1786
+ * useful when the consumer ran the report out-of-band and wants the
1787
+ * researcher's later proposals informed by it.
1788
+ */
1789
+ setReport(report: RubricPredictiveValidityReport): void;
1790
+ getLastReport(): RubricPredictiveValidityReport | null;
1791
+ }
1792
+
1793
+ /**
1794
+ * `analyzeOptimizationResult` — unifies the pre-0.22 auto-research stack
1795
+ * (`runPromptEvolution`, `runMultiShotOptimization`, reflective-mutation,
1796
+ * Ax/AxRLM trace analyst) with the 0.23 RL bridge in a single call.
1797
+ *
1798
+ * What this fixes: until 0.23 the optimization stack and the RL bridge
1799
+ * lived in parallel namespaces. The optimization primitives produced
1800
+ * `TrialResult[]`; the RL bridge consumed `RunRecord[]`. Trace-analyst
1801
+ * was decoupled from both. `analyzeOptimizationResult` does the wiring
1802
+ * once so consumers don't have to:
1803
+ *
1804
+ * Optimization (existing primitives) RL bridge (0.23)
1805
+ * ────────────────────────────────── ────────────────
1806
+ * runPromptEvolution → TrialResult[] →
1807
+ * runMultiShotOptimization → MSTrial[] → analyzeOptimizationResult →
1808
+ * reflective-mutation → mutations.jsonl → ↓
1809
+ * │
1810
+ * ↓ (per-generation inputs flow back) │
1811
+ * PredictiveValidityResearcher.proposeChange ←───────────────────── │
1812
+ * │
1813
+ * ↓ │
1814
+ * TraceAnalyst.analyze(progressLog) ←─────────────────────────┘
1815
+ *
1816
+ * The output of this function is the canonical RL artifact set:
1817
+ * `RunRecord[]` (so every other 0.22+ primitive composes), preference
1818
+ * triples, verifiable reward signals, reward-hacking diagnosis,
1819
+ * sequential interim verdict, and (when wired) trace-analyst summary.
1820
+ *
1821
+ * What this primitive does NOT do: it does not modify the optimization
1822
+ * primitives' internals. They keep producing `TrialResult` and emitting
1823
+ * `onProgress` events; this function bridges *after* the sweep completes.
1824
+ * Per-step capture-integrity (raw HTTP events from inside the score
1825
+ * adapter) requires the consumer to wire `RawProviderSink` into their
1826
+ * own `ScoreAdapter` — that's a per-consumer integration point.
1827
+ */
1828
+
1829
+ interface AnalyzeOptimizationResultOptions {
1830
+ /**
1831
+ * The optimization output. Either a `PromptEvolutionResult` or a
1832
+ * `MultiShotOptimizationResult`. The function detects which by
1833
+ * structural typing and produces canonical `RunRecord[]` from either.
1834
+ */
1835
+ result: PromptEvolutionResult | MultiShotOptimizationResult;
1836
+ /** Adapter context — `commitSha`, `model`, `promptHash`, `configHash`. */
1837
+ ctx: AdapterContext;
1838
+ /** Optional comparator candidate id for paired analyses. */
1839
+ comparator?: string;
1840
+ /** Verifiable-reward extraction options. */
1841
+ verifiableReward?: VerifiableRewardExtractionOptions;
1842
+ /** Preference extraction options. */
1843
+ preferences?: ExtractPreferencesOptions;
1844
+ /** Sequential interim-confidence options. */
1845
+ sequential?: {
1846
+ alpha?: number;
1847
+ bound?: number;
1848
+ rope?: {
1849
+ low: number;
1850
+ high: number;
1851
+ };
1852
+ };
1853
+ /** Outcome calibration store + metrics. */
1854
+ outcomes?: {
1855
+ store: OutcomeStore;
1856
+ metrics: string[];
1857
+ };
1858
+ /** Trainer-format export — DPO + GRPO lookups. */
1859
+ trainerExport?: {
1860
+ dpo?: DpoLookups;
1861
+ grpo?: GrpoLookups;
1862
+ };
1863
+ }
1864
+ interface AnalyzeOptimizationResultReport {
1865
+ /** All trials promoted to canonical `RunRecord` shape. */
1866
+ runs: RunRecord[];
1867
+ /** Per-run verifiable reward signal. */
1868
+ rewardSignals: Array<{
1869
+ runId: string;
1870
+ reward: VerifiableReward | null;
1871
+ }>;
1872
+ /** Preference triples ready for DPO/PPO/KTO training. */
1873
+ preferences: PreferenceExtractionReport;
1874
+ /** Anytime-valid sequential verdict, when a comparator is supplied. */
1875
+ interimConfidence: InterimReleaseConfidence | null;
1876
+ /** Standing reward-hacking hygiene check. */
1877
+ rewardHacking: RewardHackingReport;
1878
+ /** Predictive validity, when an outcome store is supplied. */
1879
+ predictiveValidity: RubricPredictiveValidityReport | null;
1880
+ /** Trainer-export rows, populated only for the formats requested. */
1881
+ trainerRows: {
1882
+ dpo?: DpoExportRow[];
1883
+ grpo?: GrpoExportRow[];
1884
+ };
1885
+ /** One-line summary suitable for logs. */
1886
+ summary: string;
1887
+ }
1888
+ /**
1889
+ * Convert an optimization sweep output into a fully-analysed RL artifact
1890
+ * set. Idempotent and read-only with respect to the optimization result.
1891
+ */
1892
+ declare function analyzeOptimizationResult(opts: AnalyzeOptimizationResultOptions): Promise<AnalyzeOptimizationResultReport>;
1893
+
1894
+ export { type RewardHackingFinding as $, type AdaptationCurve as A, type BradleyTerryFit as B, type CellObservation as C, type DetectRewardHackingInput as D, type EloOptions as E, type Finding as F, type GrpoExportRow as G, type GrpoLookups as H, type LayerStatus as I, type OffPolicyOptions as J, type OffPolicyTrajectory as K, type LayerResult as L, MultiLayerVerifier as M, type ParetoPointInput as N, type OffPolicyEstimate as O, type PairwiseOutcome as P, PredictiveValidityResearcher as Q, type PredictiveValidityResearcherOptions as R, type Severity as S, type PreferenceExtractionReport as T, type PreferenceStrategy as U, type VerifyContext as V, type PreferenceTriple as W, type PrmExportRow as X, type PrmLookups as Y, type PrmTrainingTriple as Z, type RLCampaignResult as _, type Layer as a, toTRLFormat as a$, type RewardHackingReport as a0, type RewardHackingSignal as a1, type RunAdaptationCurveOptions as a2, type RunComputeCurveOptions as a3, type RunRLCampaignOptions as a4, type RunwiseStepSummary as a5, type ScenarioPerturbation as a6, type ScenarioPerturbationKind as a7, type SelfConsistencyOptions as a8, type SelfConsistencyResult as a9, fitBradleyTerry as aA, gradeSemanticStatus as aB, injectIrrelevantClause as aC, inverseProbabilityWeighting as aD, observationsFromRunRecords as aE, offPolicyEstimateAll as aF, prmTrainingPairs as aG, renameVariables as aH, runAdaptationCurve as aI, runComputeCurve as aJ, runContaminationProbe as aK, runRLCampaign as aL, runwiseStepRewardSummary as aM, selfConsistency as aN, selfNormalizedImportanceWeighting as aO, shuffleOrder as aP, stepRewardsToJsonl as aQ, thompsonCurriculum as aR, toAnthropicFormat as aS, toDpoJsonl as aT, toDpoRows as aU, toGrpoJsonl as aV, toGrpoRows as aW, toPrmJsonl as aX, toPrmRows as aY, toSftJsonl as aZ, toSftRows as a_, type SftExportRow as aa, type SftLookups as ab, type StepReward as ac, type StepRewardJsonlRow as ad, type StepScorer as ae, type ThompsonCurriculumOptions as af, type 
VarianceCurriculumOptions as ag, type VerifiableReward as ah, type VerifiableRewardExtractionOptions as ai, type VerifiableRewardSource as aj, type VerificationReport as ak, type VerifyOptions as al, adversarialScenarioSearch as am, analyzeOptimizationResult as an, applyEloUpdate as ao, bestOfN as ap, buildPairwiseFromCampaign as aq, compareAdaptationCurves as ar, detectRewardHacking as as, doublyRobust as at, extractPreferences as au, extractStepRewards as av, extractVerifiableReward as aw, extractVerifiableRewardsFromRecords as ax, filterDeterministicallyRewarded as ay, firstPassK as az, type AdaptationPoint as b, trialToRunRecord as b0, trialsToRunRecords as b1, varianceBasedCurriculum as b2, variantAggregateToRunRecord as b3, verificationReportToRunRecord as b4, paretoFrontier as b5, type AdaptationRunner as c, type AdapterContext as d, type AdversarialMutation as e, type AdversarialScenario as f, type AdversarialSearchOptions as g, type AdversarialSearchReport as h, type AnalyzeOptimizationResultOptions as i, type AnalyzeOptimizationResultReport as j, type BradleyTerryRating as k, type BuildPairwiseFromCampaignInput as l, type CompareCurvesResult as m, type ComputeBestOfNOptions as n, type ComputeBestOfNResult as o, type ComputeCurve as p, type ComputeCurveBudget as q, type ComputeCurvePoint as r, type ContaminationProbeInput as s, type ContaminationProbeOptions as t, type ContaminationProbeReport as u, type CurriculumAllocation as v, type DpoExportRow as w, type DpoLookups as x, type ExtractPreferencesOptions as y, type ExtractStepRewardsOptions as z };