@tangle-network/agent-eval 0.21.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +236 -1
- package/README.md +17 -3
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
- package/dist/chunk-4W4NCYM2.js.map +1 -0
- package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
- package/dist/chunk-6M774GY6.js +53 -0
- package/dist/chunk-6M774GY6.js.map +1 -0
- package/dist/chunk-7EAUOUQS.js +495 -0
- package/dist/chunk-7EAUOUQS.js.map +1 -0
- package/dist/chunk-AXHNWLIX.js +246 -0
- package/dist/chunk-AXHNWLIX.js.map +1 -0
- package/dist/chunk-EXGR4XEM.js +283 -0
- package/dist/chunk-EXGR4XEM.js.map +1 -0
- package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
- package/dist/chunk-IOXMGMHQ.js.map +1 -0
- package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
- package/dist/chunk-LZKIOBG2.js +2026 -0
- package/dist/chunk-LZKIOBG2.js.map +1 -0
- package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
- package/dist/chunk-QBW3YBTR.js.map +1 -0
- package/dist/chunk-QUKKGHTZ.js +121 -0
- package/dist/chunk-QUKKGHTZ.js.map +1 -0
- package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
- package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
- package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
- package/dist/{chunk-HRZELXCR.js → chunk-VQQSPGSM.js} +3 -3
- package/dist/cli.js +3 -3
- package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/eval-campaign-Ds5QljIh.d.ts +573 -0
- package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
- package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
- package/dist/index-ekBXweiQ.d.ts +1894 -0
- package/dist/index.d.ts +20 -430
- package/dist/index.js +154 -34
- package/dist/index.js.map +1 -1
- package/dist/integrity-Cr5YodSY.d.ts +210 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +7 -145
- package/dist/optimization.js +12 -3
- package/dist/reporting.d.ts +294 -4
- package/dist/reporting.js +18 -9
- package/dist/rl.d.ts +8 -0
- package/dist/rl.js +113 -0
- package/dist/rl.js.map +1 -0
- package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
- package/dist/sequential-DgU2mFsE.d.ts +304 -0
- package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-Ce1r4EYo.d.ts} +382 -2
- package/dist/traces.d.ts +101 -181
- package/dist/traces.js +19 -8
- package/dist/wire/index.js +3 -3
- package/docs/auto-research-loop-end-to-end.md +186 -0
- package/docs/research-report-methodology.md +19 -4
- package/docs/three-package-architecture.md +180 -0
- package/docs/wire-protocol.md +1 -1
- package/package.json +7 -2
- package/dist/chunk-3IX6QTB7.js.map +0 -1
- package/dist/chunk-KRR4VMH7.js +0 -423
- package/dist/chunk-KRR4VMH7.js.map +0 -1
- package/dist/chunk-WOK2RTWG.js.map +0 -1
- package/dist/chunk-YUFXO3TU.js.map +0 -1
- package/dist/reporting-Da2ihlcM.d.ts +0 -672
- /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
- /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
- /package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
- /package/dist/{chunk-HRZELXCR.js.map → chunk-VQQSPGSM.js.map} +0 -0
|
@@ -0,0 +1,1894 @@
|
|
|
1
|
+
import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-Ce1r4EYo.js';
|
|
2
|
+
import { a as RunSplitTag, R as RunRecord } from './run-record-DNiOMBrZ.js';
|
|
3
|
+
import { S as Span, T as TraceStore } from './store-u47QaJ9G.js';
|
|
4
|
+
import { i as EvalCampaignResult, E as EvalCampaignOptions, R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult } from './eval-campaign-Ds5QljIh.js';
|
|
5
|
+
import { c as InterimReleaseConfidence, h as RubricPredictiveValidityReport, a as OutcomeStore } from './sequential-DgU2mFsE.js';
|
|
6
|
+
|
|
7
|
+
/**
|
|
8
|
+
* Multi-layer verifier — ordered pipeline of verification layers.
|
|
9
|
+
*
|
|
10
|
+
* Different contract from {@link JudgeRunner} (which runs parallel
|
|
11
|
+
* specs against a sandbox). MultiLayerVerifier is a DAG of layers
|
|
12
|
+
* (install → typecheck → build → lint → serve → semantic → …) with
|
|
13
|
+
* dependency-based skip, per-layer findings, soft-fail semantics, and
|
|
14
|
+
* an aggregated `blendedScore` across all passed layers.
|
|
15
|
+
*
|
|
16
|
+
* Use when you want:
|
|
17
|
+
* - ordered stages where a failing upstream stage skips downstream ones
|
|
18
|
+
* - each stage produces rich `findings` (severity + message + evidence)
|
|
19
|
+
* - a single composite score across stages with per-stage weights
|
|
20
|
+
* - soft-fail stages whose failure doesn't abort the pipeline
|
|
21
|
+
*
|
|
22
|
+
* Use {@link JudgeRunner} when you want:
|
|
23
|
+
* - N independent judges running in parallel against the same artifact
|
|
24
|
+
* - no inter-judge dependencies
|
|
25
|
+
* - boolean `passed` per judge + overall
|
|
26
|
+
*
|
|
27
|
+
* Both primitives compose — JudgeRunner can be invoked as a single
|
|
28
|
+
* layer inside a MultiLayerVerifier if that suits the caller.
|
|
29
|
+
*/
|
|
30
|
+
/** Status a verification layer reports in `LayerResult.status`; also the return type of `gradeSemanticStatus`. */
type LayerStatus = 'pass' | 'fail' | 'skipped' | 'error' | 'timeout';
|
|
31
|
+
/** Severity level carried by each `Finding` (and by the findings passed to `gradeSemanticStatus`). */
type Severity = 'critical' | 'major' | 'minor' | 'info';
|
|
32
|
+
interface Finding {
|
|
33
|
+
severity: Severity;
|
|
34
|
+
message: string;
|
|
35
|
+
evidence?: string;
|
|
36
|
+
/** Optional layer name the finding belongs to (set by the verifier if omitted). */
|
|
37
|
+
layer?: string;
|
|
38
|
+
/**
|
|
39
|
+
* Free-form structured payload — used by `multiToolchainLayer` to attach
|
|
40
|
+
* `{ adapter: 'pnpm' }`, by judges to attach evidence pointers, etc.
|
|
41
|
+
* Renderers MAY interrogate; agent-eval primitives never assume shape.
|
|
42
|
+
*/
|
|
43
|
+
detail?: Record<string, unknown>;
|
|
44
|
+
}
|
|
45
|
+
interface LayerResult {
|
|
46
|
+
layer: string;
|
|
47
|
+
status: LayerStatus;
|
|
48
|
+
/** 0..1 score, optional — layers that don't produce a numeric score omit. */
|
|
49
|
+
score?: number;
|
|
50
|
+
durationMs: number;
|
|
51
|
+
findings: Finding[];
|
|
52
|
+
/** Short human-readable summary (one line). */
|
|
53
|
+
reason?: string;
|
|
54
|
+
/**
|
|
55
|
+
* Numeric layer-level diagnostics: error counts, warning counts,
|
|
56
|
+
* cyclomatic complexity, total adapter wall-time, etc. Keyed by
|
|
57
|
+
* diagnostic name; null = "diagnostic not applicable / not measured."
|
|
58
|
+
* Renderers that know the keys can display them; ones that don't,
|
|
59
|
+
* ignore. Free-form on purpose — consumers type the value shape in
|
|
60
|
+
* their own namespace. Added in 0.10.
|
|
61
|
+
*/
|
|
62
|
+
diagnostics?: Record<string, number | null>;
|
|
63
|
+
/** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
|
|
64
|
+
detail?: Record<string, unknown>;
|
|
65
|
+
}
|
|
66
|
+
interface VerifyContext<Env = unknown> {
|
|
67
|
+
/** Per-run opaque context the caller provides. Layers destructure what they need. */
|
|
68
|
+
env: Env;
|
|
69
|
+
/** Previously-computed results from layers that already ran. */
|
|
70
|
+
prior: Record<string, LayerResult>;
|
|
71
|
+
/** Signal — if aborted, layers MUST bail out within a reasonable wall-clock time. */
|
|
72
|
+
signal: AbortSignal;
|
|
73
|
+
}
|
|
74
|
+
interface Layer<Env = unknown> {
|
|
75
|
+
name: string;
|
|
76
|
+
/** Stages that must have `status: 'pass'` before this layer runs. */
|
|
77
|
+
dependsOn?: string[];
|
|
78
|
+
/**
|
|
79
|
+
* Weight in the composite `blendedScore`. Default 1.0. Layers with weight 0
|
|
80
|
+
* contribute findings but not score.
|
|
81
|
+
*/
|
|
82
|
+
weight?: number;
|
|
83
|
+
/**
|
|
84
|
+
* If true, a `fail` status contributes to `blendedScore` (as 0) instead of
|
|
85
|
+
* being dropped — use for layers whose failure is a real signal. Default:
|
|
86
|
+
* fail drops from numerator + denominator, matching VerticalBench's (VB) existing semantics.
|
|
87
|
+
*/
|
|
88
|
+
failContributesToScore?: boolean;
|
|
89
|
+
/** Optional per-layer wall-cap in ms. Honored by the verifier (AbortSignal). */
|
|
90
|
+
capMs?: number;
|
|
91
|
+
run: (ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
|
|
92
|
+
}
|
|
93
|
+
interface VerifyOptions<Env = unknown> {
|
|
94
|
+
env: Env;
|
|
95
|
+
/**
|
|
96
|
+
* Overall wall cap. Default: sum of layer capMs, or Infinity if any layer
|
|
97
|
+
* omits a cap. The verifier short-circuits remaining layers on overall cap.
|
|
98
|
+
*/
|
|
99
|
+
overallCapMs?: number;
|
|
100
|
+
/** Called with each layer result as it completes. */
|
|
101
|
+
onLayer?: (result: LayerResult) => void;
|
|
102
|
+
}
|
|
103
|
+
interface VerificationReport {
|
|
104
|
+
layers: LayerResult[];
|
|
105
|
+
passCount: number;
|
|
106
|
+
failCount: number;
|
|
107
|
+
skippedCount: number;
|
|
108
|
+
errorCount: number;
|
|
109
|
+
/** True iff at least one scored layer ran AND every scored layer passed. */
|
|
110
|
+
allPass: boolean;
|
|
111
|
+
/**
|
|
112
|
+
* Weighted mean of `score` across contributing layers. 0 when no layers
|
|
113
|
+
* contributed. See {@link Layer.failContributesToScore} for fail semantics.
|
|
114
|
+
*/
|
|
115
|
+
blendedScore: number;
|
|
116
|
+
durationMs: number;
|
|
117
|
+
startedAt: string;
|
|
118
|
+
finishedAt: string;
|
|
119
|
+
}
|
|
120
|
+
/**
|
|
121
|
+
* Grade a semantic-concept-style judge result into a single layer status.
|
|
122
|
+
*
|
|
123
|
+
* Pass when overall score >= threshold AND no critical-severity concept gap.
|
|
124
|
+
* Fail otherwise. Use inside a `Layer.run` when wrapping a concept judge.
|
|
125
|
+
*
|
|
126
|
+
* Generalized from VerticalBench H3 fix: `failingConcepts.length === 0` was
|
|
127
|
+
* too strict — a single concept at 6/10 failed the entire layer despite
|
|
128
|
+
* overall score being >= 0.7. Now we trust the judge's own `severity` field:
|
|
129
|
+
* `critical` findings veto; `major`/`minor` reduce the score but don't veto.
|
|
130
|
+
*/
|
|
131
|
+
declare function gradeSemanticStatus(input: {
|
|
132
|
+
score: number;
|
|
133
|
+
findings: Array<{
|
|
134
|
+
severity: Severity;
|
|
135
|
+
present?: boolean;
|
|
136
|
+
score?: number;
|
|
137
|
+
}>;
|
|
138
|
+
available: boolean;
|
|
139
|
+
threshold?: number;
|
|
140
|
+
}): LayerStatus;
|
|
141
|
+
declare class MultiLayerVerifier<Env = unknown> {
|
|
142
|
+
private readonly layers;
|
|
143
|
+
constructor(layers: Layer<Env>[]);
|
|
144
|
+
run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
/**
|
|
148
|
+
* Adapters: convert legacy optimization outputs into the canonical
|
|
149
|
+
* `RunRecord[]` artifact that 0.22+ primitives consume.
|
|
150
|
+
*
|
|
151
|
+
* The 0.22 release standardized the campaign artifact: every cell of an
|
|
152
|
+
* eval matrix produces one `RunRecord`. The pre-0.22 optimization
|
|
153
|
+
* primitives (`runMultiShotOptimization`, `runPromptEvolution`) produce
|
|
154
|
+
* `TrialResult[]` with a different shape. This file bridges the two so
|
|
155
|
+
* the new primitives (`replayCache`, `pairedEvalueSequence`,
|
|
156
|
+
* `rubricPredictiveValidity`) compose cleanly with the existing RL stack.
|
|
157
|
+
*
|
|
158
|
+
* The adapters are thin and explicit — every mandatory `RunRecord` field
|
|
159
|
+
* comes from a caller-supplied context (`commitSha`, `model`,
|
|
160
|
+
* `promptHash`, `configHash`) plus the trial's runtime data. Defaults
|
|
161
|
+
* exist for fields the trial doesn't carry (`tokenUsage`, `costUsd`),
|
|
162
|
+
* but the validator still rejects records with bare-alias model strings
|
|
163
|
+
* — the caller is responsible for snapshot-pinning.
|
|
164
|
+
*/
|
|
165
|
+
|
|
166
|
+
interface AdapterContext {
|
|
167
|
+
/** Logical experiment id — typically the campaign or sweep identifier. */
|
|
168
|
+
experimentId: string;
|
|
169
|
+
/** Snapshot model id (e.g. `claude-sonnet-4-6@2025-04-15`). */
|
|
170
|
+
model: string;
|
|
171
|
+
/** Git SHA the harness was run from. */
|
|
172
|
+
commitSha: string;
|
|
173
|
+
/** Hash of the effective prompt sent to the model. */
|
|
174
|
+
promptHash: string | ((t: TrialResult) => string);
|
|
175
|
+
/** Hash of the effective config (model, temperature, tools, judges, splits). */
|
|
176
|
+
configHash: string | ((t: TrialResult) => string);
|
|
177
|
+
/** Default split tag. Default `'search'` — optimization sweeps run on the search split. */
|
|
178
|
+
splitTag?: RunSplitTag;
|
|
179
|
+
/** Default cost in USD when the trial doesn't record one. Default `0`. */
|
|
180
|
+
defaultCostUsd?: number;
|
|
181
|
+
}
|
|
182
|
+
/**
|
|
183
|
+
* Convert one `TrialResult` (from `runPromptEvolution` or
|
|
184
|
+
* `runMultiShotOptimization`) into a canonical `RunRecord`.
|
|
185
|
+
*
|
|
186
|
+
* The conversion is **not lossy** — every `TrialResult.metrics` field is
|
|
187
|
+
* carried through to `outcome.raw`, plus a synthetic
|
|
188
|
+
* `raw.cost_unknown = 1` flag when the trial omits cost (so downstream
|
|
189
|
+
* filters can distinguish "free" from "untracked"). This preserves the
|
|
190
|
+
* paper-grade contract: a record without a cost number is unbounded by
|
|
191
|
+
* definition, but we don't drop the record.
|
|
192
|
+
*/
|
|
193
|
+
declare function trialToRunRecord(trial: TrialResult, ctx: AdapterContext, opts?: {
|
|
194
|
+
runId?: string;
|
|
195
|
+
experimentIdPerTrial?: (t: TrialResult) => string;
|
|
196
|
+
}): RunRecord;
|
|
197
|
+
/** Convenience: convert an array of `TrialResult` in one go. */
|
|
198
|
+
declare function trialsToRunRecords(trials: TrialResult[], ctx: AdapterContext): RunRecord[];
|
|
199
|
+
/**
|
|
200
|
+
* Convert a `MultiLayerVerifier` `VerificationReport` into a `RunRecord`.
|
|
201
|
+
*
|
|
202
|
+
* The verifier produces per-layer results; we synthesize one canonical
|
|
203
|
+
* record where:
|
|
204
|
+
* - `outcome.searchScore` (or `holdoutScore`) is `report.blendedScore`
|
|
205
|
+
* - `outcome.raw` carries every layer's score keyed `layer.<name>`
|
|
206
|
+
* plus a `layer_<name>_pass` 1/0 indicator
|
|
207
|
+
* - `failureMode` is taken from the first failing layer's `reason`
|
|
208
|
+
* - `wallMs` is `report.durationMs`
|
|
209
|
+
*/
|
|
210
|
+
declare function verificationReportToRunRecord(report: VerificationReport, ctx: AdapterContext & {
|
|
211
|
+
candidateId: string;
|
|
212
|
+
scenarioId?: string;
|
|
213
|
+
}, opts?: {
|
|
214
|
+
runId?: string;
|
|
215
|
+
}): RunRecord;
|
|
216
|
+
/**
|
|
217
|
+
* Convert a `VariantAggregate` (per-variant rollup from `prompt-evolution`)
|
|
218
|
+
* into a synthetic `RunRecord` representing the aggregate. Useful when the
|
|
219
|
+
* downstream consumer wants per-variant entries for a `researchReport`
|
|
220
|
+
* rather than per-(variant, scenario, rep) trial entries.
|
|
221
|
+
*/
|
|
222
|
+
declare function variantAggregateToRunRecord(agg: VariantAggregate, ctx: AdapterContext, opts?: {
|
|
223
|
+
runId?: string;
|
|
224
|
+
}): RunRecord;
|
|
225
|
+
|
|
226
|
+
/**
|
|
227
|
+
* Verifiable reward channel.
|
|
228
|
+
*
|
|
229
|
+
* For RL on coding / math / theorem-proving / structured-output tasks, the
|
|
230
|
+
* reward signal is *decidable* — a test passes or fails, a proof checks or
|
|
231
|
+
* doesn't, an output validates against a schema or doesn't. These rewards
|
|
232
|
+
* are dramatically more useful for RL training than LLM-judge scores
|
|
233
|
+
* because they don't drift, can't be Goodhart-gamed by the policy in the
|
|
234
|
+
* same way, and don't require a separate calibration loop.
|
|
235
|
+
*
|
|
236
|
+
* The `MultiLayerVerifier` already produces this signal — it just doesn't
|
|
237
|
+
* surface it in a shape that's clean enough for RL training. This module
|
|
238
|
+
* wraps the verifier output so consumers can:
|
|
239
|
+
*
|
|
240
|
+
* 1. Extract a clean `VerifiableReward` from a `VerificationReport`
|
|
241
|
+
* 2. Distinguish *deterministic* rewards (compile, test, schema) from
|
|
242
|
+
* *probabilistic* rewards (judge) so they can be weighted differently
|
|
243
|
+
* in the RL training step
|
|
244
|
+
* 3. Filter `RunRecord[]` to only those with a verifiable reward,
|
|
245
|
+
* producing the clean training set that DeepSeek-R1-style GRPO and
|
|
246
|
+
* AlphaProof-style search both depend on
|
|
247
|
+
*
|
|
248
|
+
* Why this matters: every credible 2025-2026 frontier RL result on coding
|
|
249
|
+
* agents leans on verifiable reward (DeepSeek-R1 GRPO on test pass-rate,
|
|
250
|
+
* o-series RL on math/code, AlphaProof on Lean kernel checking). Mixing
|
|
251
|
+
* judge scores into the reward signal poisons the gradient. This module
|
|
252
|
+
* is the seam.
|
|
253
|
+
*/
|
|
254
|
+
|
|
255
|
+
/** Kind of check that produced a reward; used as the type of `VerifiableReward.source`. */
type VerifiableRewardSource = 'compile' | 'test' | 'schema' | 'sandbox' | 'judge' | 'composite';
|
|
256
|
+
interface VerifiableReward {
|
|
257
|
+
/** Scalar in [0, 1]. The RL training signal. */
|
|
258
|
+
value: number;
|
|
259
|
+
/** What produced the reward — different sources have different determinism. */
|
|
260
|
+
source: VerifiableRewardSource;
|
|
261
|
+
/**
|
|
262
|
+
* Determinism class. `'deterministic'` rewards are repeatable byte-for-byte
|
|
263
|
+
* given the same inputs (compile, test, schema validation, sandbox exit code).
|
|
264
|
+
* `'probabilistic'` rewards depend on a stochastic component (LLM judge).
|
|
265
|
+
* Mixing these in the same training batch without separation is a known
|
|
266
|
+
* footgun in production RLHF pipelines.
|
|
267
|
+
*/
|
|
268
|
+
determinism: 'deterministic' | 'probabilistic';
|
|
269
|
+
/**
|
|
270
|
+
* Confidence in the reward value. For deterministic sources this is 1.0
|
|
271
|
+
* (the bit either flipped or didn't). For judge sources this is the
|
|
272
|
+
* judge-reported confidence or — when missing — a calibrated prior.
|
|
273
|
+
*/
|
|
274
|
+
confidence: number;
|
|
275
|
+
/** The layer / judge id that produced the signal, for provenance. */
|
|
276
|
+
origin: string;
|
|
277
|
+
/**
|
|
278
|
+
* Any per-source breakdown the consumer might want — e.g. `{ tests_passed: 7, tests_total: 10 }`.
|
|
279
|
+
*/
|
|
280
|
+
breakdown?: Record<string, number>;
|
|
281
|
+
}
|
|
282
|
+
interface VerifiableRewardExtractionOptions {
|
|
283
|
+
/**
|
|
284
|
+
* Which layers count as deterministic-reward sources. The verifier doesn't
|
|
285
|
+
* tag layers as "this is verifiable"; the caller declares it via this list
|
|
286
|
+
* (or via the layer name → source mapping). Default treats common names
|
|
287
|
+
* (`install`, `typecheck`, `build`, `lint`, `test`, `compile`, `schema`,
|
|
288
|
+
* `sandbox`) as deterministic.
|
|
289
|
+
*/
|
|
290
|
+
deterministicLayers?: string[];
|
|
291
|
+
/**
|
|
292
|
+
* Map layer name → reward source. Defaults to a sensible string-match.
|
|
293
|
+
*/
|
|
294
|
+
sourceFor?: (layerName: string) => VerifiableRewardSource;
|
|
295
|
+
/**
|
|
296
|
+
* Whether to fall back to a probabilistic (judge) reward when no
|
|
297
|
+
* deterministic layer produced a numeric score. Default `true`. Set to
|
|
298
|
+
* `false` for "deterministic-only" training pipelines that should
|
|
299
|
+
* discard runs without a verifiable signal.
|
|
300
|
+
*/
|
|
301
|
+
fallbackToJudge?: boolean;
|
|
302
|
+
/**
|
|
303
|
+
* Default confidence for probabilistic (judge) rewards when the judge
|
|
304
|
+
* doesn't report one. Default `0.7`.
|
|
305
|
+
*/
|
|
306
|
+
judgeConfidenceFloor?: number;
|
|
307
|
+
}
|
|
308
|
+
/**
|
|
309
|
+
* Extract a `VerifiableReward` from a `VerificationReport`.
|
|
310
|
+
*
|
|
311
|
+
* Strategy: prefer the deterministic layers (in order: test → compile →
|
|
312
|
+
* schema → sandbox), fall back to the judge layer if `fallbackToJudge` is
|
|
313
|
+
* true, return `null` if no signal qualifies. When multiple deterministic
|
|
314
|
+
* layers contribute, return a `'composite'` source with a weighted blend.
|
|
315
|
+
*/
|
|
316
|
+
declare function extractVerifiableReward(report: VerificationReport, opts?: VerifiableRewardExtractionOptions): VerifiableReward | null;
|
|
317
|
+
/**
|
|
318
|
+
* Extract verifiable rewards from `RunRecord[]` produced via the
|
|
319
|
+
* `verificationReportToRunRecord` adapter (which encodes per-layer scores
|
|
320
|
+
* in `outcome.raw['layer.<name>']`). For records that don't carry layer
|
|
321
|
+
* scores, returns `null` for that record.
|
|
322
|
+
*
|
|
323
|
+
* This is the canonical bridge from "campaign-shaped artifacts" to
|
|
324
|
+
* "RL-training-ready reward signals": every record that has a clean
|
|
325
|
+
* verifiable reward becomes a training datum, every record that doesn't
|
|
326
|
+
* gets filtered out (or kept with `'probabilistic'` determinism for
|
|
327
|
+
* separate downstream handling).
|
|
328
|
+
*/
|
|
329
|
+
declare function extractVerifiableRewardsFromRecords(runs: RunRecord[], opts?: VerifiableRewardExtractionOptions): Array<{
|
|
330
|
+
runId: string;
|
|
331
|
+
reward: VerifiableReward | null;
|
|
332
|
+
}>;
|
|
333
|
+
/** Filter `RunRecord[]` to those with deterministic verifiable rewards. */
|
|
334
|
+
declare function filterDeterministicallyRewarded(runs: RunRecord[], opts?: VerifiableRewardExtractionOptions): Array<{
|
|
335
|
+
run: RunRecord;
|
|
336
|
+
reward: VerifiableReward;
|
|
337
|
+
}>;
|
|
338
|
+
|
|
339
|
+
/**
|
|
340
|
+
* Preference dataset extraction — bridge from `RunRecord[]` to RL training.
|
|
341
|
+
*
|
|
342
|
+
* Production RLHF / DPO / KTO / SimPO pipelines need preference triples:
|
|
343
|
+
* `(prompt, chosen, rejected)`. The campaign artifact already contains the
|
|
344
|
+
* ingredients — every (variantId, scenarioId, seed) cell is a candidate
|
|
345
|
+
* that ran the same prompt against the same scenario, scored by the same
|
|
346
|
+
* judge — but turning that into a clean preference dataset requires
|
|
347
|
+
* deciding *what counts as a preference*.
|
|
348
|
+
*
|
|
349
|
+
* This module ships three preference-extraction strategies with explicit
|
|
350
|
+
* tradeoffs, plus a unified output type compatible with HuggingFace TRL,
|
|
351
|
+
* Anthropic finetuning JSONL, and OpenAI fine-tuning APIs. The strategies
|
|
352
|
+
* are deliberately not auto-magical — picking the wrong one corrupts the
|
|
353
|
+
* gradient.
|
|
354
|
+
*
|
|
355
|
+
* Strategies:
|
|
356
|
+
*
|
|
357
|
+
* 1. **`paired-by-scenario-and-seed`** — exact-match comparisons. For
|
|
358
|
+
* each scenario × seed pair, compare every (variantA, variantB) on
|
|
359
|
+
* that exact (scenario, seed). Matches scenarios so the comparison
|
|
360
|
+
* isolates variant effects. Highest signal-to-noise; smallest
|
|
361
|
+
* dataset (only matched pairs count).
|
|
362
|
+
*
|
|
363
|
+
* 2. **`paired-by-scenario`** — looser matching. For each scenario,
|
|
364
|
+
* compare every (variantA, variantB) where both have ≥ 1 run on the
|
|
365
|
+
* same scenario. Aggregates across seeds to compute mean scores per
|
|
366
|
+
* (variant, scenario), then forms preferences from the means. More
|
|
367
|
+
* data, lower per-pair signal.
|
|
368
|
+
*
|
|
369
|
+
* 3. **`top-vs-bottom`** — coarsest. Within each scenario, the highest-
|
|
370
|
+
* scoring run is `chosen`, the lowest is `rejected`. Smallest dataset
|
|
371
|
+
* per scenario but biggest score gap per pair. Useful for early
|
|
372
|
+
* bootstrapping when you have few variants.
|
|
373
|
+
*
|
|
374
|
+
* The output `PreferenceTriple` is *agent-eval-canonical* but trivially
|
|
375
|
+
* mappable to TRL's `DPODataset` shape (`prompt`, `chosen`, `rejected`)
|
|
376
|
+
* via the `toTRLFormat` helper.
|
|
377
|
+
*/
|
|
378
|
+
|
|
379
|
+
/** Pairing strategy selectable via `ExtractPreferencesOptions.strategy` and echoed in `PreferenceExtractionReport.strategy`. */
type PreferenceStrategy = 'paired-by-scenario-and-seed' | 'paired-by-scenario' | 'top-vs-bottom';
|
|
380
|
+
interface PreferenceTriple {
|
|
381
|
+
/** The scenario (input) the variants were run against. */
|
|
382
|
+
scenarioId: string;
|
|
383
|
+
/** RunRecord ids on each side, for traceability. */
|
|
384
|
+
chosenRunId: string;
|
|
385
|
+
rejectedRunId: string;
|
|
386
|
+
/** Variant ids — load-bearing for the RL update. */
|
|
387
|
+
chosenVariantId: string;
|
|
388
|
+
rejectedVariantId: string;
|
|
389
|
+
/** The score gap between chosen and rejected. Larger = stronger signal. */
|
|
390
|
+
marginScore: number;
|
|
391
|
+
/**
|
|
392
|
+
* Optional `(chosen_score, rejected_score)` pair for soft-margin DPO
|
|
393
|
+
* variants. Omitted for `top-vs-bottom` runs that don't carry meaningful
|
|
394
|
+
* scalar gaps.
|
|
395
|
+
*/
|
|
396
|
+
scores?: {
|
|
397
|
+
chosen: number;
|
|
398
|
+
rejected: number;
|
|
399
|
+
};
|
|
400
|
+
/** Tie-breaker — when multiple seeds match this scenario, the one used. */
|
|
401
|
+
seed?: number;
|
|
402
|
+
/**
|
|
403
|
+
* Free-form metadata propagated from the run records — e.g. original
|
|
404
|
+
* prompt-hash, model, etc. Lets the RL trainer reconstruct the prompt.
|
|
405
|
+
*/
|
|
406
|
+
meta: {
|
|
407
|
+
chosenPromptHash: string;
|
|
408
|
+
rejectedPromptHash: string;
|
|
409
|
+
chosenConfigHash: string;
|
|
410
|
+
rejectedConfigHash: string;
|
|
411
|
+
chosenModel: string;
|
|
412
|
+
rejectedModel: string;
|
|
413
|
+
};
|
|
414
|
+
}
|
|
415
|
+
interface ExtractPreferencesOptions {
|
|
416
|
+
strategy?: PreferenceStrategy;
|
|
417
|
+
/**
|
|
418
|
+
* Minimum score gap required to admit a pair. Pairs below this are
|
|
419
|
+
* dropped — they're noise, not signal. Default 0.05 (5% of [0,1]).
|
|
420
|
+
*/
|
|
421
|
+
minMargin?: number;
|
|
422
|
+
/**
|
|
423
|
+
* Optional split tag filter — restrict to runs from one split. Default
|
|
424
|
+
* `'holdout'` (the canonical "real" signal).
|
|
425
|
+
*/
|
|
426
|
+
splitTag?: RunRecord['splitTag'];
|
|
427
|
+
/**
|
|
428
|
+
* Optional reward extractor that overrides `outcome.holdoutScore` /
|
|
429
|
+
* `outcome.searchScore`. Use to drive preferences off a verifiable
|
|
430
|
+
* reward instead of the headline score.
|
|
431
|
+
*/
|
|
432
|
+
rewardOf?: (run: RunRecord) => number | null;
|
|
433
|
+
}
|
|
434
|
+
interface PreferenceExtractionReport {
|
|
435
|
+
pairs: PreferenceTriple[];
|
|
436
|
+
/** Number of (scenario, seed) cells inspected. */
|
|
437
|
+
cellsInspected: number;
|
|
438
|
+
/** Number of pairs filtered by `minMargin`. */
|
|
439
|
+
pairsBelowMargin: number;
|
|
440
|
+
/** Number of cells with only one variant (no comparison possible). */
|
|
441
|
+
cellsSingleton: number;
|
|
442
|
+
/** Strategy used. */
|
|
443
|
+
strategy: PreferenceStrategy;
|
|
444
|
+
}
|
|
445
|
+
/**
|
|
446
|
+
* Convert `RunRecord[]` to preference triples for RL training.
|
|
447
|
+
*
|
|
448
|
+
* Returns a structured report so callers can see how much data was
|
|
449
|
+
* dropped and why (low-margin pairs, singleton cells). For production
|
|
450
|
+
* pipelines, you usually want to:
|
|
451
|
+
*
|
|
452
|
+
* 1. Run a campaign producing 5–10 variants × 50–200 scenarios × 3 seeds
|
|
453
|
+
* 2. Call this with `strategy: 'paired-by-scenario-and-seed'` and a
|
|
454
|
+
* verifiable-reward extractor as `rewardOf`
|
|
455
|
+
* 3. Pass `report.pairs` to `toTRLFormat` and pipe to your DPO trainer
|
|
456
|
+
*/
|
|
457
|
+
declare function extractPreferences(runs: RunRecord[], opts?: ExtractPreferencesOptions): PreferenceExtractionReport;
|
|
458
|
+
/**
|
|
459
|
+
* TRL-compatible export. TRL's `DPODataset` is `{ prompt, chosen, rejected }`
|
|
460
|
+
* but the prompt isn't stored on the RunRecord — only its hash. The caller
|
|
461
|
+
* passes a `promptOf(promptHash)` lookup that the TRL trainer can use.
|
|
462
|
+
*/
|
|
463
|
+
declare function toTRLFormat(triples: PreferenceTriple[], promptOf: (hash: string) => string): Array<{
|
|
464
|
+
prompt: string;
|
|
465
|
+
chosen: string;
|
|
466
|
+
rejected: string;
|
|
467
|
+
}>;
|
|
468
|
+
/**
|
|
469
|
+
* Anthropic finetuning JSONL export — `{ system, user, assistant_chosen, assistant_rejected }`
|
|
470
|
+
* shape. Same caveat as TRL: prompt + outputs are content the caller has
|
|
471
|
+
* to map back from the run record / raw event log. The rows returned here carry
* only `scenarioId`, `chosenRunId`, `rejectedRunId` and `margin`.
|
|
472
|
+
*/
|
|
473
|
+
declare function toAnthropicFormat(triples: PreferenceTriple[]): Array<{
|
|
474
|
+
scenarioId: string;
|
|
475
|
+
chosenRunId: string;
|
|
476
|
+
rejectedRunId: string;
|
|
477
|
+
margin: number;
|
|
478
|
+
}>;
|
|
479
|
+
|
|
480
|
+
/**
|
|
481
|
+
* Off-policy evaluation primitives.
|
|
482
|
+
*
|
|
483
|
+
* Standard inverse-probability-weighted (IPS), self-normalized
|
|
484
|
+
* importance-weighted (SNIPS), and doubly-robust (DR) estimators for the
|
|
485
|
+
* value of a *target* policy given trajectories collected under a
|
|
486
|
+
* *behavior* policy. This is the canonical RL eval task: "we have last
|
|
487
|
+
* week's runs, we changed the policy — how would the new one do without
|
|
488
|
+
* re-running?"
|
|
489
|
+
*
|
|
490
|
+
* The math here is textbook (Dudík, Langford, Li 2011 for DR; Swaminathan
|
|
491
|
+
* & Joachims 2015 for SNIPS) but the *application* to LLM-agent
|
|
492
|
+
* evaluation needs care:
|
|
493
|
+
*
|
|
494
|
+
* - The "policy" is the (prompt, tool config, model snapshot) triple.
|
|
495
|
+
* Two policies have the same probability over an action *iff* their
|
|
496
|
+
* LLM call would emit the same token with the same probability —
|
|
497
|
+
* which is generally unknowable without the model log-probs.
|
|
498
|
+
* - For LLM agents, propensity scores must be supplied by the caller
|
|
499
|
+
* (logged in the trace, recovered from token log-probs, or estimated
|
|
500
|
+
* via a learned propensity model). We do NOT estimate propensity here.
|
|
501
|
+
* - Doubly-robust requires a Q-function (model-based reward predictor).
|
|
502
|
+
* We accept any callable; consumers pass either a tabular average,
|
|
503
|
+
* a regression fit, or a learned reward model.
|
|
504
|
+
*
|
|
505
|
+
* Bias / variance tradeoffs:
|
|
506
|
+
* - IPS: unbiased under full overlap; high variance for small overlap, and biased
|
|
507
|
+
* when target has support outside behavior.
|
|
508
|
+
* - SNIPS: lower variance, slight bias; usually preferred in practice.
|
|
509
|
+
* - DR: doubly-robust — unbiased if either propensity OR Q-function is
|
|
510
|
+
* correct. Lowest practical variance when Q is decent. Use this.
|
|
511
|
+
*
|
|
512
|
+
* Caveat the panel will raise: in the LLM-agent setting, propensity scores
|
|
513
|
+
* recovered from token log-probs are noisy, the action space is enormous,
|
|
514
|
+
* and overlap is often poor. These estimators are useful but not magic;
|
|
515
|
+
* complement with `replayCampaign` (exact replay where the request hashes
|
|
516
|
+
* match) for high-confidence answers and OPE for the gap.
|
|
517
|
+
*/
|
|
518
|
+
interface OffPolicyTrajectory {
|
|
519
|
+
/** Stable id, for traceability through the dataset. */
|
|
520
|
+
runId: string;
|
|
521
|
+
/** Reward observed under the behavior policy (the realized outcome). */
|
|
522
|
+
reward: number;
|
|
523
|
+
/**
|
|
524
|
+
* Behavior-policy probability of the action that was taken. For LLM
|
|
525
|
+
* agents this is typically `exp(sum(token_log_probs))` over the chosen
|
|
526
|
+
* trajectory. Must be in (0, 1].
|
|
527
|
+
*/
|
|
528
|
+
behaviorProb: number;
|
|
529
|
+
/**
|
|
530
|
+
* Target-policy probability of the same action. For replay-style
|
|
531
|
+
* counterfactual evaluation this is what the *new* policy would have
|
|
532
|
+
* assigned to the *old* trajectory. Must be in [0, 1].
|
|
533
|
+
*/
|
|
534
|
+
targetProb: number;
|
|
535
|
+
/**
|
|
536
|
+
* Optional model-based reward prediction at the same context. Used by
|
|
537
|
+
* `doublyRobust`. Set to `null` for IPS-only evaluation.
|
|
538
|
+
*/
|
|
539
|
+
qHat?: number | null;
|
|
540
|
+
}
|
|
541
|
+
interface OffPolicyEstimate {
|
|
542
|
+
/** Estimated value of the target policy. */
|
|
543
|
+
value: number;
|
|
544
|
+
/** Standard error of the estimate. */
|
|
545
|
+
standardError: number;
|
|
546
|
+
/** Effective sample size (Kong 1992). Lower = more reliance on a few high-weight samples. */
|
|
547
|
+
effectiveSampleSize: number;
|
|
548
|
+
/** Number of trajectories used. */
|
|
549
|
+
n: number;
|
|
550
|
+
/**
|
|
551
|
+
* Diagnostic: maximum importance weight observed. Large values (>>10x
|
|
552
|
+
* mean) are a red flag — variance is dominated by a few outliers.
|
|
553
|
+
*/
|
|
554
|
+
maxImportanceWeight: number;
|
|
555
|
+
}
|
|
556
|
+
interface OffPolicyOptions {
|
|
557
|
+
/**
|
|
558
|
+
* Cap importance weights at this value (Ionides 2008 truncated IS) to
|
|
559
|
+
* trade unbiasedness for variance reduction. Default `Infinity` (no cap).
|
|
560
|
+
* Set e.g. `10` for stable estimates when the policies are close.
|
|
561
|
+
*/
|
|
562
|
+
weightCap?: number;
|
|
563
|
+
/** Reward clipping range. Default `[0, 1]`. */
|
|
564
|
+
rewardClip?: {
|
|
565
|
+
low: number;
|
|
566
|
+
high: number;
|
|
567
|
+
};
|
|
568
|
+
}
|
|
569
|
+
/**
|
|
570
|
+
* Inverse Probability Weighting (Horvitz-Thompson). Unbiased estimator
|
|
571
|
+
* of E[reward under target policy]. Variance scales with the spread of
|
|
572
|
+
* target/behavior ratios.
|
|
573
|
+
*/
|
|
574
|
+
declare function inverseProbabilityWeighting(trajectories: OffPolicyTrajectory[], opts?: OffPolicyOptions): OffPolicyEstimate;
|
|
575
|
+
/**
|
|
576
|
+
* Self-Normalized Importance Sampling. Lower variance than vanilla IPS at
|
|
577
|
+
* the cost of small bias (vanishing as N grows). The right default for
|
|
578
|
+
* LLM-agent evaluation where overlap is often poor.
|
|
579
|
+
*/
|
|
580
|
+
declare function selfNormalizedImportanceWeighting(trajectories: OffPolicyTrajectory[], opts?: OffPolicyOptions): OffPolicyEstimate;
|
|
581
|
+
/**
|
|
582
|
+
* Doubly-robust off-policy estimator (Dudík, Langford, Li 2011).
|
|
583
|
+
*
|
|
584
|
+
* V_DR = (1/N) * sum_i [ q_hat_i + (target_prob_i / behavior_prob_i) * (r_i - q_hat_i) ]
|
|
585
|
+
*
|
|
586
|
+
* Unbiased if EITHER:
|
|
587
|
+
* - the importance ratios are correct (IPS-style validity), OR
|
|
588
|
+
* - the Q-hat function is correct (model-based validity).
|
|
589
|
+
*
|
|
590
|
+
* In practice both are imperfect, but the residual bias is the *product*
|
|
591
|
+
* of both errors — much smaller than either alone. This is why DR is the
|
|
592
|
+
* default in production OPE pipelines.
|
|
593
|
+
*
|
|
594
|
+
* Expects `qHat` on every trajectory. If any are `null`, the estimator
|
|
595
|
+
* falls back to SNIPS for those entries (loud-fallback behavior; the
|
|
596
|
+
* report's `n` reflects the full set but `effectiveSampleSize` accounts
|
|
597
|
+
* for the lost variance reduction).
|
|
598
|
+
*/
|
|
599
|
+
declare function doublyRobust(trajectories: OffPolicyTrajectory[], opts?: OffPolicyOptions): OffPolicyEstimate;
|
|
600
|
+
/**
|
|
601
|
+
* Convenience: run all three estimators and return them side-by-side.
|
|
602
|
+
* The recommended diagnostic — agreement across estimators is a much
|
|
603
|
+
* stronger signal than any single one.
|
|
604
|
+
*/
|
|
605
|
+
declare function offPolicyEstimateAll(trajectories: OffPolicyTrajectory[], opts?: OffPolicyOptions): {
|
|
606
|
+
ips: OffPolicyEstimate;
|
|
607
|
+
snips: OffPolicyEstimate;
|
|
608
|
+
dr: OffPolicyEstimate;
|
|
609
|
+
};
|
|
610
|
+
|
|
611
|
+
/**
|
|
612
|
+
* Process reward extraction — step-level credit assignment from trace spans.
|
|
613
|
+
*
|
|
614
|
+
* RL on long-horizon agents needs *step-level* rewards, not run-level
|
|
615
|
+
* ones. The classic credit-assignment problem (Sutton & Barto) requires
|
|
616
|
+
* knowing which sub-decisions in a trajectory contributed to the
|
|
617
|
+
* outcome. Modern systems (DeepSeek-R1, OpenAI o-series, Lightman et al.
|
|
618
|
+
* "Let's Verify Step by Step" 2023) train *process reward models* (PRMs)
|
|
619
|
+
* that score every step, then do RL with the PRM as the reward signal.
|
|
620
|
+
*
|
|
621
|
+
* This module extracts `StepReward[]` from trace spans — one per
|
|
622
|
+
* meaningful step — and ships:
|
|
623
|
+
*
|
|
624
|
+
* 1. `extractStepRewards(store, runId, opts)` — span → step-reward
|
|
625
|
+
* conversion using configurable per-span scorers (LLM judge over the
|
|
626
|
+
* span output, deterministic checkers, or a learned PRM).
|
|
627
|
+
* 2. `runwiseStepRewardSummary(stepRewards)` — aggregate the per-step
|
|
628
|
+
* signal into a credit-assignment-aware run-level score.
|
|
629
|
+
* 3. `prmTrainingPairs(stepRewardsByRun, opts)` — produce the
|
|
630
|
+
* `(prefix, suffix_chosen, suffix_rejected)` triples that PRM
|
|
631
|
+
* training pipelines consume.
|
|
632
|
+
*
|
|
633
|
+
* What we ship: the *extraction* and *aggregation* infrastructure plus
|
|
634
|
+
* the data shape PRM training expects. We do NOT ship the actual PRM
|
|
635
|
+
* training (gradient descent over a transformer is out of scope for a
|
|
636
|
+
* TS package). The interface is the contract; downstream consumers wire
|
|
637
|
+
* their preferred trainer.
|
|
638
|
+
*
|
|
639
|
+
* Caveat the panel will land: this is descriptive credit assignment
|
|
640
|
+
* (which steps correlate with outcome), not causal credit assignment
|
|
641
|
+
* (which steps caused outcome). For causal claims you need
|
|
642
|
+
* counterfactual rollouts or a learned dynamics model. Future work; the
|
|
643
|
+
* descriptive version is what production PRM training actually uses.
|
|
644
|
+
*/
|
|
645
|
+
|
|
646
|
+
interface StepReward {
|
|
647
|
+
/** Trace span this reward attaches to. */
|
|
648
|
+
spanId: string;
|
|
649
|
+
runId: string;
|
|
650
|
+
/** Index in the trajectory (0-based, in started-at order). */
|
|
651
|
+
stepIndex: number;
|
|
652
|
+
/** Span kind (typically 'tool', 'llm', 'judge'). */
|
|
653
|
+
kind: Span['kind'];
|
|
654
|
+
/** Span name — for the consumer's downstream filtering. */
|
|
655
|
+
name: string;
|
|
656
|
+
/** Step-level reward in [0, 1]. */
|
|
657
|
+
reward: number;
|
|
658
|
+
/**
|
|
659
|
+
* Determinism class. Mirrors the verifiable-reward distinction:
|
|
660
|
+
* deterministic = test/compile/schema check; probabilistic = LLM judge.
|
|
661
|
+
*/
|
|
662
|
+
determinism: 'deterministic' | 'probabilistic';
|
|
663
|
+
/** Optional rationale / evidence — the trainer typically discards. */
|
|
664
|
+
rationale?: string;
|
|
665
|
+
/** Optional weight — how much this step contributes to credit assignment. */
|
|
666
|
+
weight?: number;
|
|
667
|
+
}
|
|
668
|
+
interface StepScorer {
|
|
669
|
+
/** Span kinds this scorer applies to. */
|
|
670
|
+
appliesTo: Span['kind'][];
|
|
671
|
+
/** Returns null to skip the span; returns a `StepReward` shape (without index/runId/spanId, which are filled in). */
|
|
672
|
+
score(span: Span): Promise<Omit<StepReward, 'spanId' | 'runId' | 'stepIndex'>> | null | undefined;
|
|
673
|
+
}
|
|
674
|
+
interface ExtractStepRewardsOptions {
|
|
675
|
+
/**
|
|
676
|
+
* Ordered list of scorers. Each span runs through scorers in order;
|
|
677
|
+
* the first non-null result wins. If no scorer applies, the span is
|
|
678
|
+
* skipped (not all spans are training-worthy).
|
|
679
|
+
*/
|
|
680
|
+
scorers: StepScorer[];
|
|
681
|
+
/** Optional filter — return false to drop the span entirely before scoring. */
|
|
682
|
+
preFilter?: (span: Span) => boolean;
|
|
683
|
+
}
|
|
684
|
+
declare function extractStepRewards(store: TraceStore, runId: string, opts: ExtractStepRewardsOptions): Promise<StepReward[]>;
|
|
685
|
+
interface RunwiseStepSummary {
|
|
686
|
+
runId: string;
|
|
687
|
+
totalSteps: number;
|
|
688
|
+
meanReward: number;
|
|
689
|
+
/** Sum-of-rewards (weighted by `weight ?? 1`). Use as the run-level proxy. */
|
|
690
|
+
sumWeightedReward: number;
|
|
691
|
+
/** Fraction of steps where reward < 0.5 — proxy for "where the policy was wrong." */
|
|
692
|
+
failureFraction: number;
|
|
693
|
+
/** Maximum drop in reward between consecutive steps — diagnoses a step where things went sideways. */
|
|
694
|
+
worstStepDelta: number;
|
|
695
|
+
worstStepIndex: number | null;
|
|
696
|
+
}
|
|
697
|
+
declare function runwiseStepRewardSummary(stepRewards: StepReward[]): RunwiseStepSummary;
|
|
698
|
+
interface PrmTrainingTriple {
|
|
699
|
+
/** Prefix run-id (or composite key) — the trajectory up to step k-1. */
|
|
700
|
+
prefixRunId: string;
|
|
701
|
+
prefixStepIndex: number;
|
|
702
|
+
/** The step that came next on a high-reward trajectory. */
|
|
703
|
+
chosenSpanId: string;
|
|
704
|
+
chosenReward: number;
|
|
705
|
+
/** A step from a divergent low-reward trajectory at the same prefix length. */
|
|
706
|
+
rejectedSpanId: string;
|
|
707
|
+
rejectedReward: number;
|
|
708
|
+
/** Run the rejected step came from; the prefix itself comes from `prefixRunId`. */
|
|
709
|
+
rejectedRunId: string;
|
|
710
|
+
marginScore: number;
|
|
711
|
+
}
|
|
712
|
+
/**
|
|
713
|
+
* Build PRM training triples. The shape: pair runs that share an early
|
|
714
|
+
* prefix (same scenario, same first N steps) and diverge later — at the
|
|
715
|
+
* point of divergence, the high-reward run's next step is `chosen`, the
|
|
716
|
+
* low-reward run's next step is `rejected`. This is the canonical PRM
|
|
717
|
+
* training data shape from Lightman et al. and DeepSeek-R1 process
|
|
718
|
+
* supervision.
|
|
719
|
+
*
|
|
720
|
+
* Implementation note: we don't have a way to detect "same prefix" in
|
|
721
|
+
* the general agent setting (token-level prefixes require hashing model
|
|
722
|
+
* outputs). The current heuristic groups by `(scenarioId, prefixSpanName
|
|
723
|
+
* sequence)` — runs are paired when their first K span names match. For
|
|
724
|
+
* production use this should be replaced with a proper trajectory-prefix
|
|
725
|
+
* hash; the heuristic is good enough for early-stage scaffolding.
|
|
726
|
+
*/
|
|
727
|
+
declare function prmTrainingPairs(stepRewardsByRun: Map<string, StepReward[]>, opts?: {
|
|
728
|
+
minMargin?: number;
|
|
729
|
+
minPrefixLength?: number;
|
|
730
|
+
}): PrmTrainingTriple[];
|
|
731
|
+
|
|
732
|
+
/**
|
|
733
|
+
* Contamination probe — held-out perturbation tests.
|
|
734
|
+
*
|
|
735
|
+
* The bug class: once a benchmark scenario set is published, models train
|
|
736
|
+
* on it, and your scores become invalid. SWE-Bench-Verified, GPQA, and
|
|
737
|
+
* MMLU-Pro all exist because their predecessors got contaminated within
|
|
738
|
+
* months. The right defense is to keep a held-out *perturbed* version of
|
|
739
|
+
* every scenario — same task, slightly different surface — and check
|
|
740
|
+
* whether scores diverge significantly. Genuine capability transfers; rote
|
|
741
|
+
* memorization doesn't.
|
|
742
|
+
*
|
|
743
|
+
* This module ships the probe contract:
|
|
744
|
+
*
|
|
745
|
+
* 1. A `ScenarioPerturbation` strategy type — function that produces a
|
|
746
|
+
* perturbed scenario from an original.
|
|
747
|
+
* 2. `runContaminationProbe({ originals, perturbed, scoreFn })` — runs
|
|
748
|
+
* both halves and reports per-scenario score divergence + a global
|
|
749
|
+
* contamination verdict via paired Wilcoxon.
|
|
750
|
+
* 3. Several stock perturbations: `renameVariables`, `shuffleOrder`,
|
|
751
|
+
* `paraphrasePrompt`, `injectIrrelevantClause`. Each preserves the
|
|
752
|
+
* task's structural difficulty while breaking surface memorization.
|
|
753
|
+
*
|
|
754
|
+
* The verdict is conservative: if the perturbed-vs-original score
|
|
755
|
+
* difference is statistically significant (BH-adjusted p < 0.05) AND
|
|
756
|
+
* the median drop is > 5 percentage points, we flag *contamination
|
|
757
|
+
* suspected*. False positives are possible (the perturbation might
|
|
758
|
+
* actually be harder); the default is to flag for review, not to
|
|
759
|
+
* autoreject.
|
|
760
|
+
*/
|
|
761
|
+
type ScenarioPerturbationKind = 'rename_variables' | 'shuffle_order' | 'paraphrase' | 'inject_irrelevant_clause' | 'custom';
|
|
762
|
+
interface ScenarioPerturbation<S> {
|
|
763
|
+
kind: ScenarioPerturbationKind;
|
|
764
|
+
/** Apply to one scenario, return its perturbed sibling. */
|
|
765
|
+
apply: (scenario: S) => Promise<S> | S;
|
|
766
|
+
/** Optional id — for the report. */
|
|
767
|
+
id?: string;
|
|
768
|
+
}
|
|
769
|
+
interface ContaminationProbeInput<S> {
|
|
770
|
+
/** Identity of every scenario. The probe's `runFingerprint` keys on these. */
|
|
771
|
+
scenarioId: (s: S) => string;
|
|
772
|
+
/** Original scenarios. */
|
|
773
|
+
originals: S[];
|
|
774
|
+
/**
|
|
775
|
+
* Either pre-computed perturbations (one per original, same order) OR a
|
|
776
|
+
* `perturbation` strategy that synthesizes them on the fly.
|
|
777
|
+
*/
|
|
778
|
+
perturbed?: S[];
|
|
779
|
+
perturbation?: ScenarioPerturbation<S>;
|
|
780
|
+
/**
|
|
781
|
+
* Run the policy/agent against one scenario and return a scalar score
|
|
782
|
+
* in [0, 1]. The probe doesn't care what the policy is — that's the
|
|
783
|
+
* caller's contract.
|
|
784
|
+
*/
|
|
785
|
+
scoreFn: (s: S) => Promise<number>;
|
|
786
|
+
}
|
|
787
|
+
interface ContaminationProbeOptions {
|
|
788
|
+
/** Drop scores below this from the probe; treats partial failures separately. Default 0. */
|
|
789
|
+
scoreFloor?: number;
|
|
790
|
+
/**
|
|
791
|
+
* BH-FDR threshold for declaring contamination on each per-scenario
|
|
792
|
+
* delta. Default 0.05.
|
|
793
|
+
*/
|
|
794
|
+
fdr?: number;
|
|
795
|
+
/**
|
|
796
|
+
* Minimum median per-scenario drop to flag global contamination. Default
|
|
797
|
+
* 0.05 (5 percentage points). Smaller drops may be noise.
|
|
798
|
+
*/
|
|
799
|
+
minMedianDrop?: number;
|
|
800
|
+
}
|
|
801
|
+
interface ContaminationProbeReport {
|
|
802
|
+
perScenario: Array<{
|
|
803
|
+
scenarioId: string;
|
|
804
|
+
originalScore: number;
|
|
805
|
+
perturbedScore: number;
|
|
806
|
+
delta: number;
|
|
807
|
+
/** Per-scenario q-value (single-test BH for a single scenario). Mainly for display. */
|
|
808
|
+
qValue: number;
|
|
809
|
+
}>;
|
|
810
|
+
/** Wilcoxon paired-test on the deltas. */
|
|
811
|
+
pairedTest: {
|
|
812
|
+
w: number;
|
|
813
|
+
p: number;
|
|
814
|
+
};
|
|
815
|
+
medianDelta: number;
|
|
816
|
+
meanDelta: number;
|
|
817
|
+
contaminationSuspected: boolean;
|
|
818
|
+
reason: string;
|
|
819
|
+
/** Number of scenarios processed. */
|
|
820
|
+
n: number;
|
|
821
|
+
}
|
|
822
|
+
declare function runContaminationProbe<S>(input: ContaminationProbeInput<S>, opts?: ContaminationProbeOptions): Promise<ContaminationProbeReport>;
|
|
823
|
+
/**
|
|
824
|
+
* Identifier-rename perturbation for code/text scenarios. Replaces every
|
|
825
|
+
* occurrence of the listed identifiers with synthesized aliases. Use when
|
|
826
|
+
* the scenario's structural difficulty is independent of variable names
|
|
827
|
+
* (e.g. SWE-Bench-style coding tasks).
|
|
828
|
+
*/
|
|
829
|
+
declare function renameVariables<S extends {
|
|
830
|
+
prompt: string;
|
|
831
|
+
}>(identifiers: string[], rename?: (name: string, idx: number) => string): ScenarioPerturbation<S>;
|
|
832
|
+
/**
|
|
833
|
+
* Order-shuffle perturbation. Reshuffles a list-shaped section of the
|
|
834
|
+
* prompt (for QA scenarios that present options A/B/C/D — answer depends
|
|
835
|
+
* on the option labels, not order). Caller provides the section extractor.
|
|
836
|
+
*/
|
|
837
|
+
declare function shuffleOrder<S extends {
|
|
838
|
+
prompt: string;
|
|
839
|
+
}>(shuffleSection: (prompt: string, rng: () => number) => string, seed: number): ScenarioPerturbation<S>;
|
|
840
|
+
/**
|
|
841
|
+
* Inject-irrelevant-clause perturbation. Adds a benign sentence that
|
|
842
|
+
* shouldn't change the answer. Tests for "did the model just memorize
|
|
843
|
+
* the input string."
|
|
844
|
+
*/
|
|
845
|
+
declare function injectIrrelevantClause<S extends {
|
|
846
|
+
prompt: string;
|
|
847
|
+
}>(clause: string, position?: 'prefix' | 'suffix'): ScenarioPerturbation<S>;
|
|
848
|
+
|
|
849
|
+
/**
|
|
850
|
+
* Bradley-Terry / Elo tournament evaluation.
|
|
851
|
+
*
|
|
852
|
+
* For multi-candidate sweeps, comparing every candidate's score against
|
|
853
|
+
* a fixed comparator wastes information — the comparator becomes a high-
|
|
854
|
+
* variance reference and rank flips between near-tied middle-rank
|
|
855
|
+
* candidates are dominated by noise. Pairwise tournaments fix this:
|
|
856
|
+
* every (i, j) pair contributes a comparison to a Bradley-Terry MLE that
|
|
857
|
+
* estimates each candidate's strength on a unified scale.
|
|
858
|
+
*
|
|
859
|
+
* For online updating (rolling campaigns where new candidates arrive
|
|
860
|
+
* over time), we also ship classical Elo with configurable K-factor.
|
|
861
|
+
*
|
|
862
|
+
* References:
|
|
863
|
+
* - Bradley, R. A., Terry, M. E. (1952). Rank analysis of incomplete
|
|
864
|
+
* block designs. Biometrika, 39(3/4), 324–345.
|
|
865
|
+
* - Hunter, D. R. (2004). MM algorithms for generalized Bradley-Terry
|
|
866
|
+
* models. Annals of Statistics, 32(1), 384–406. (The MLE algorithm
|
|
867
|
+
* used here.)
|
|
868
|
+
* - Elo, A. E. (1978). The Rating of Chess Players, Past and Present.
|
|
869
|
+
*
|
|
870
|
+
* This is a useful primitive because most LLM-eval communities (Chatbot
|
|
871
|
+
* Arena, AlpacaEval, ELO-style ablation) have converged on pairwise
|
|
872
|
+
* tournament eval as the most sample-efficient and most rank-stable
|
|
873
|
+
* method when you have many candidates.
|
|
874
|
+
*/
|
|
875
|
+
interface PairwiseOutcome {
|
|
876
|
+
/** Winner candidate id. */
|
|
877
|
+
winner: string;
|
|
878
|
+
/** Loser candidate id. */
|
|
879
|
+
loser: string;
|
|
880
|
+
/**
|
|
881
|
+
* Optional draw flag. When true, both candidates get half-credit
|
|
882
|
+
* (Bradley-Terry handles draws as half-wins for each side).
|
|
883
|
+
*/
|
|
884
|
+
draw?: boolean;
|
|
885
|
+
/**
|
|
886
|
+
* Optional weight — useful if some pairwise comparisons are stronger
|
|
887
|
+
* signals than others (e.g. a paired test with a wider score gap is
|
|
888
|
+
* a more confident comparison). Default 1.
|
|
889
|
+
*/
|
|
890
|
+
weight?: number;
|
|
891
|
+
}
|
|
892
|
+
interface BradleyTerryRating {
|
|
893
|
+
candidateId: string;
|
|
894
|
+
/** Latent strength θ ≥ 0 from the BT MLE. */
|
|
895
|
+
strength: number;
|
|
896
|
+
/** Log-strength = log(θ) — interpretable on a linear scale. */
|
|
897
|
+
logStrength: number;
|
|
898
|
+
/** Number of pairwise comparisons this candidate appears in. */
|
|
899
|
+
n: number;
|
|
900
|
+
/** Win count (+ 0.5 per draw). */
|
|
901
|
+
wins: number;
|
|
902
|
+
}
|
|
903
|
+
interface BradleyTerryFit {
|
|
904
|
+
ratings: BradleyTerryRating[];
|
|
905
|
+
/** Iterations of the MM algorithm before convergence. */
|
|
906
|
+
iterations: number;
|
|
907
|
+
/** Final maximum |θ_new - θ_old| / θ_old. */
|
|
908
|
+
finalDelta: number;
|
|
909
|
+
converged: boolean;
|
|
910
|
+
}
|
|
911
|
+
/**
|
|
912
|
+
* Bradley-Terry MLE via Hunter's MM algorithm.
|
|
913
|
+
*
|
|
914
|
+
* Iteration: θ_i^new = W_i / Σ_{j ≠ i} [ N_ij / (θ_i + θ_j) ]
|
|
915
|
+
* where W_i = wins by i (+ 0.5 per draw), N_ij = total comparisons.
|
|
916
|
+
*
|
|
917
|
+
* Returns log-strengths normalized so the smallest is 0 (any constant
|
|
918
|
+
* offset is unobservable in BT — only differences are identified).
|
|
919
|
+
*/
|
|
920
|
+
declare function fitBradleyTerry(outcomes: PairwiseOutcome[], opts?: {
|
|
921
|
+
tolerance?: number;
|
|
922
|
+
maxIterations?: number;
|
|
923
|
+
smoothing?: number;
|
|
924
|
+
}): BradleyTerryFit;
|
|
925
|
+
/**
|
|
926
|
+
* Online Elo updates. Use when comparisons arrive over time and you want
|
|
927
|
+
* a running rating without re-fitting the full BT MLE on every update.
|
|
928
|
+
*
|
|
929
|
+
* Initialize ratings to `defaultRating` (1500 by default). Each call to
|
|
930
|
+
* `applyEloUpdate` mutates the map in place and returns the deltas so
|
|
931
|
+
* the caller can log per-comparison rating changes.
|
|
932
|
+
*/
|
|
933
|
+
interface EloOptions {
|
|
934
|
+
/** Default rating for unseen candidates. Default 1500. */
|
|
935
|
+
defaultRating?: number;
|
|
936
|
+
/** K-factor controls the step size. Default 32 (FIDE-ish). */
|
|
937
|
+
kFactor?: number;
|
|
938
|
+
}
|
|
939
|
+
declare function applyEloUpdate(ratings: Map<string, number>, outcome: PairwiseOutcome, opts?: EloOptions): {
|
|
940
|
+
winnerDelta: number;
|
|
941
|
+
loserDelta: number;
|
|
942
|
+
};
|
|
943
|
+
/**
|
|
944
|
+
* Build pairwise outcomes from the campaign artifact: for every scenario
|
|
945
|
+
* shared by two candidates, the higher-scoring run wins. Useful when you
|
|
946
|
+
* want a tournament view of an existing campaign without an additional
|
|
947
|
+
* pairwise judge call.
|
|
948
|
+
*/
|
|
949
|
+
interface BuildPairwiseFromCampaignInput {
|
|
950
|
+
runs: Array<{
|
|
951
|
+
candidateId: string;
|
|
952
|
+
/** Stable identifier for the matching unit (typically scenarioId). */
|
|
953
|
+
matchKey: string;
|
|
954
|
+
score: number;
|
|
955
|
+
}>;
|
|
956
|
+
/**
|
|
957
|
+
* Tied-score margin. Below this, the comparison is a draw. Default 0
|
|
958
|
+
* (no ties).
|
|
959
|
+
*/
|
|
960
|
+
drawMargin?: number;
|
|
961
|
+
}
|
|
962
|
+
declare function buildPairwiseFromCampaign(input: BuildPairwiseFromCampaignInput): PairwiseOutcome[];
|
|
963
|
+
|
|
964
|
+
/**
|
|
965
|
+
* Adversarial scenario search.
|
|
966
|
+
*
|
|
967
|
+
* Capability evaluation on a fixed scenario set measures performance on
|
|
968
|
+
* the distribution someone curated. Production failure modes live in the
|
|
969
|
+
* tail — inputs the curator didn't think of, or actively avoided. The
|
|
970
|
+
* adversarial-search primitive actively looks for them: starting from a
|
|
971
|
+
* pool of scenarios where the policy already passes, it mutates them
|
|
972
|
+
* (paraphrase, edge-case substitution, compositional combination) and
|
|
973
|
+
* keeps the mutations that *break* the policy.
|
|
974
|
+
*
|
|
975
|
+
* This is not magic. It's the simplest version of the loop that AdA
|
|
976
|
+
* (Adaptive Agent, DeepMind 2023), POET, and Anthropic's
|
|
977
|
+
* auto-jailbreak rigs all run: hill-climb against a failure indicator,
|
|
978
|
+
* keep the survivors, repeat. We ship the harness; consumers supply the
|
|
979
|
+
* mutation strategies and the failure detector.
|
|
980
|
+
*
|
|
981
|
+
* Why ship this in agent-eval and not as a separate red-team tool: every
|
|
982
|
+
* piece of the standard adversarial loop is already in this package
|
|
983
|
+
* (`runEvalCampaign` for the matrix run, `RawProviderSink` for capture,
|
|
984
|
+
* `assertRunCaptured` for integrity, `pairedEvalueSequence` for stop
|
|
985
|
+
* criteria). The adversarial primitive is just the *scenario-mutation
|
|
986
|
+
* meta-loop* on top of that machinery.
|
|
987
|
+
*/
|
|
988
|
+
interface AdversarialScenario<S> {
|
|
989
|
+
/** Stable id — used for deduplication and lineage tracking. */
|
|
990
|
+
id: string;
|
|
991
|
+
/** Generation index — 0 for seeds, 1 for first round of mutations, etc. */
|
|
992
|
+
generation: number;
|
|
993
|
+
/** Lineage — id of the parent scenario this was mutated from, if any. */
|
|
994
|
+
parentId: string | null;
|
|
995
|
+
scenario: S;
|
|
996
|
+
/** Score on the policy under test. Lower = adversarial signal. */
|
|
997
|
+
score: number | null;
|
|
998
|
+
/** Strategy that produced this mutation, for diagnostics. */
|
|
999
|
+
mutationStrategy: string | null;
|
|
1000
|
+
}
|
|
1001
|
+
interface AdversarialMutation<S> {
|
|
1002
|
+
id: string;
|
|
1003
|
+
/**
|
|
1004
|
+
* Mutate one scenario. Return an empty array to skip; return one or more new
|
|
1005
|
+
* scenarios. The harness deduplicates by `mutateScenarioId(scenario)`.
|
|
1006
|
+
*/
|
|
1007
|
+
mutate(parent: S, rng: () => number): Promise<S[]> | S[];
|
|
1008
|
+
}
|
|
1009
|
+
interface AdversarialSearchOptions<S> {
|
|
1010
|
+
/** Initial scenarios — typically those the policy currently passes. */
|
|
1011
|
+
seeds: S[];
|
|
1012
|
+
/** Stable identifier extraction. */
|
|
1013
|
+
mutateScenarioId: (s: S) => string;
|
|
1014
|
+
/** Mutation strategies. */
|
|
1015
|
+
mutations: AdversarialMutation<S>[];
|
|
1016
|
+
/**
|
|
1017
|
+
* Run the policy under test against one scenario, return a scalar score
|
|
1018
|
+
* in [0, 1]. Lower = adversarial signal.
|
|
1019
|
+
*/
|
|
1020
|
+
scoreFn: (s: S) => Promise<number>;
|
|
1021
|
+
/**
|
|
1022
|
+
* Threshold below which a scenario counts as a "failure" worth keeping.
|
|
1023
|
+
* Default 0.5.
|
|
1024
|
+
*/
|
|
1025
|
+
failureThreshold?: number;
|
|
1026
|
+
/** Number of mutation rounds. Default 3. */
|
|
1027
|
+
rounds?: number;
|
|
1028
|
+
/** Children per parent per round. Default 4. */
|
|
1029
|
+
childrenPerParent?: number;
|
|
1030
|
+
/** Maximum total scenarios examined. Default Infinity. */
|
|
1031
|
+
budget?: number;
|
|
1032
|
+
/** Seed for the deterministic RNG. Default 1. */
|
|
1033
|
+
seed?: number;
|
|
1034
|
+
}
|
|
1035
|
+
interface AdversarialSearchReport<S> {
|
|
1036
|
+
scenarios: AdversarialScenario<S>[];
|
|
1037
|
+
/** Discovered failures sorted by score ascending. */
|
|
1038
|
+
failures: AdversarialScenario<S>[];
|
|
1039
|
+
/** Round-by-round counts. */
|
|
1040
|
+
byGeneration: Array<{
|
|
1041
|
+
generation: number;
|
|
1042
|
+
total: number;
|
|
1043
|
+
failures: number;
|
|
1044
|
+
meanScore: number;
|
|
1045
|
+
}>;
|
|
1046
|
+
/** Total scoreFn invocations consumed. */
|
|
1047
|
+
scoreCalls: number;
|
|
1048
|
+
}
|
|
1049
|
+
declare function adversarialScenarioSearch<S>(opts: AdversarialSearchOptions<S>): Promise<AdversarialSearchReport<S>>;
|
|
1050
|
+
|
|
1051
|
+
/**
|
|
1052
|
+
* Test-time compute scaling curves.
|
|
1053
|
+
*
|
|
1054
|
+
* The test-time-compute frontier paper (Snell et al. 2024) and the
|
|
1055
|
+
* subsequent o1-style scaling work both show that LLM-agent capability
|
|
1056
|
+
* is a function of the compute budget at inference, not just of the
|
|
1057
|
+
* training run. The right way to characterize a candidate is therefore
|
|
1058
|
+
* a *curve* — score at compute budgets {1×, 4×, 16×, …} — not a single
|
|
1059
|
+
* point.
|
|
1060
|
+
*
|
|
1061
|
+
* This module ships:
|
|
1062
|
+
*
|
|
1063
|
+
* 1. The compute-curve harness — `runComputeCurve({ candidateId, budgets, runAtBudget })` —
|
|
1064
|
+
* that evaluates one candidate at a sequence of compute budgets
|
|
1065
|
+
* and returns the (compute, score) curve.
|
|
1066
|
+
* 2. A best-of-N evaluator — `bestOfN({ n, sample, scoreFn })` — the
|
|
1067
|
+
* simplest test-time-compute scaling primitive: sample N
|
|
1068
|
+
* independent rollouts, return the best.
|
|
1069
|
+
* 3. A self-consistency evaluator — `selfConsistency({ n, sample, answerKey })` —
|
|
1070
|
+
* the majority-vote variant of best-of-N for tasks with a small
|
|
1071
|
+
* categorical answer space.
|
|
1072
|
+
* 4. Pareto-frontier extraction over multiple candidates — given
|
|
1073
|
+
* (candidate, compute, score) tuples, return the set of
|
|
1074
|
+
* candidate-compute combinations that aren't dominated.
|
|
1075
|
+
*
|
|
1076
|
+
* Caveat: "compute" here is the caller's notion of a compute unit. For
|
|
1077
|
+
* agent eval that's typically wall-time × parallelism, or token budget,
|
|
1078
|
+
* or LLM-call count. We accept whatever the caller provides; the curve
|
|
1079
|
+
* is on whatever axis they pick.
|
|
1080
|
+
*/
|
|
1081
|
+
interface ComputeCurveBudget {
|
|
1082
|
+
/** Identifier — for the report. Common: '1x', '4x', '16x'. */
|
|
1083
|
+
id: string;
|
|
1084
|
+
/** Numeric value on the chosen axis (tokens, calls, USD, ms — caller picks). */
|
|
1085
|
+
cost: number;
|
|
1086
|
+
/** Free-form metadata (the caller can carry per-budget config). */
|
|
1087
|
+
meta?: Record<string, unknown>;
|
|
1088
|
+
}
|
|
1089
|
+
interface ComputeCurvePoint {
|
|
1090
|
+
budgetId: string;
|
|
1091
|
+
cost: number;
|
|
1092
|
+
score: number;
|
|
1093
|
+
/** Number of underlying samples used at this budget. */
|
|
1094
|
+
samples: number;
|
|
1095
|
+
/** Optional spread / variance information. */
|
|
1096
|
+
std?: number;
|
|
1097
|
+
/** Any extra metrics the runner returned. */
|
|
1098
|
+
metrics?: Record<string, number>;
|
|
1099
|
+
}
|
|
1100
|
+
interface ComputeCurve {
|
|
1101
|
+
candidateId: string;
|
|
1102
|
+
points: ComputeCurvePoint[];
|
|
1103
|
+
/** Rough log-linear fit: score ≈ a + b * log(cost). Useful for "how steep is the curve?" */
|
|
1104
|
+
logSlope: number | null;
|
|
1105
|
+
/** Best (highest-score) point on the curve. */
|
|
1106
|
+
best: ComputeCurvePoint;
|
|
1107
|
+
}
|
|
1108
|
+
interface RunComputeCurveOptions {
|
|
1109
|
+
candidateId: string;
|
|
1110
|
+
budgets: ComputeCurveBudget[];
|
|
1111
|
+
/**
|
|
1112
|
+
* Run the candidate at one budget. Returns the realized score plus
|
|
1113
|
+
* optional spread + extra metrics.
|
|
1114
|
+
*/
|
|
1115
|
+
runAtBudget: (budget: ComputeCurveBudget) => Promise<{
|
|
1116
|
+
score: number;
|
|
1117
|
+
samples: number;
|
|
1118
|
+
std?: number;
|
|
1119
|
+
metrics?: Record<string, number>;
|
|
1120
|
+
}>;
|
|
1121
|
+
}
|
|
1122
|
+
declare function runComputeCurve(opts: RunComputeCurveOptions): Promise<ComputeCurve>;
|
|
1123
|
+
interface ComputeBestOfNOptions<O> {
|
|
1124
|
+
/** Number of independent samples to draw. */
|
|
1125
|
+
n: number;
|
|
1126
|
+
/** Sampler — produces one rollout. */
|
|
1127
|
+
sample: (sampleIdx: number) => Promise<O>;
|
|
1128
|
+
/** Score one rollout. */
|
|
1129
|
+
scoreFn: (rollout: O) => Promise<number> | number;
|
|
1130
|
+
}
|
|
1131
|
+
interface ComputeBestOfNResult<O> {
|
|
1132
|
+
best: O;
|
|
1133
|
+
bestScore: number;
|
|
1134
|
+
scores: number[];
|
|
1135
|
+
meanScore: number;
|
|
1136
|
+
/** Index of the best rollout, for diagnostics. */
|
|
1137
|
+
bestIndex: number;
|
|
1138
|
+
}
|
|
1139
|
+
/** The simplest test-time scaling primitive. */
|
|
1140
|
+
declare function bestOfN<O>(opts: ComputeBestOfNOptions<O>): Promise<ComputeBestOfNResult<O>>;
|
|
1141
|
+
interface SelfConsistencyOptions<O> {
|
|
1142
|
+
n: number;
|
|
1143
|
+
sample: (sampleIdx: number) => Promise<O>;
|
|
1144
|
+
/** Extract the canonical answer key (string) from a rollout. */
|
|
1145
|
+
answerKey: (rollout: O) => string;
|
|
1146
|
+
}
|
|
1147
|
+
interface SelfConsistencyResult<O> {
|
|
1148
|
+
/** Modal answer (the majority vote). */
|
|
1149
|
+
answer: string;
|
|
1150
|
+
/** Fraction of samples voting for the modal answer in [0, 1]. */
|
|
1151
|
+
agreement: number;
|
|
1152
|
+
/** Histogram of all answers. */
|
|
1153
|
+
histogram: Record<string, number>;
|
|
1154
|
+
/** A representative rollout that voted for the modal answer. */
|
|
1155
|
+
representative: O;
|
|
1156
|
+
/** All rollouts. */
|
|
1157
|
+
rollouts: O[];
|
|
1158
|
+
}
|
|
1159
|
+
/**
|
|
1160
|
+
* Self-consistency / majority-vote test-time scaling. For tasks with a
|
|
1161
|
+
* small categorical answer space (math problems, multiple choice).
|
|
1162
|
+
*/
|
|
1163
|
+
declare function selfConsistency<O>(opts: SelfConsistencyOptions<O>): Promise<SelfConsistencyResult<O>>;
|
|
1164
|
+
/**
|
|
1165
|
+
* Pareto frontier over (candidate, compute, score) tuples. A point is on
|
|
1166
|
+
* the frontier iff no other point dominates it in both score (higher
|
|
1167
|
+
* better) and cost (lower better). Returns the frontier sorted ascending
|
|
1168
|
+
* by cost.
|
|
1169
|
+
*/
|
|
1170
|
+
interface ParetoPointInput {
|
|
1171
|
+
candidateId: string;
|
|
1172
|
+
budgetId: string;
|
|
1173
|
+
cost: number;
|
|
1174
|
+
score: number;
|
|
1175
|
+
}
|
|
1176
|
+
declare function paretoFrontier(points: ParetoPointInput[]): ParetoPointInput[];
|
|
1177
|
+
|
|
1178
|
+
/**
|
|
1179
|
+
* Adaptive curriculum / active scenario selection.
|
|
1180
|
+
*
|
|
1181
|
+
* Fixed scenario sets waste sample budget on cells the policy already
|
|
1182
|
+
* passes (no information left) and cells the policy never passes (no
|
|
1183
|
+
* gradient available either). Active learning over scenarios fixes this
|
|
1184
|
+
* by allocating the next sample budget to cells where the policy's
|
|
1185
|
+
* outcome is *uncertain* — those carry the most decision-relevant signal.
|
|
1186
|
+
*
|
|
1187
|
+
* This module ships two complementary strategies:
|
|
1188
|
+
*
|
|
1189
|
+
* 1. **Variance-based** — score each (variant, scenario) cell by the
|
|
1190
|
+
* empirical variance of past observations. Allocate next-round budget
|
|
1191
|
+
* proportional to variance. Standard active-learning-by-uncertainty
|
|
1192
|
+
* heuristic; works well when the policy is non-deterministic and
|
|
1193
|
+
* cells differ in observation noise.
|
|
1194
|
+
*
|
|
1195
|
+
* 2. **Bandit-based (Thompson sampling)** — model each (variant,
|
|
1196
|
+
* scenario) cell as a Beta-Bernoulli arm; sample a posterior; pick
|
|
1197
|
+
* cells whose posterior mean is closest to the per-scenario decision
|
|
1198
|
+
* threshold. The right primitive when scenarios are
|
|
1199
|
+
* "pass/fail" rather than continuous, and when promotion gates fire
|
|
1200
|
+
* at a known threshold (e.g., 0.5).
|
|
1201
|
+
*
|
|
1202
|
+
* The output is a *next-round budget allocation* — a list of (variant,
|
|
1203
|
+
* scenario, count) triples. The consumer's matrix runner consumes the
|
|
1204
|
+
* allocation, runs those cells, feeds the new observations back. Loop.
|
|
1205
|
+
*
|
|
1206
|
+
* Out of scope (deliberate): scenario *generation* — that's the
|
|
1207
|
+
* adversarial primitive's job. This module allocates over an existing
|
|
1208
|
+
* scenario pool.
|
|
1209
|
+
*/
|
|
1210
|
+
|
|
1211
|
+
interface CellObservation {
|
|
1212
|
+
variantId: string;
|
|
1213
|
+
scenarioId: string;
|
|
1214
|
+
/** Observed score in [0, 1]. */
|
|
1215
|
+
score: number;
|
|
1216
|
+
/** For Bernoulli arms — derive from the score with a threshold if needed. */
|
|
1217
|
+
pass?: boolean;
|
|
1218
|
+
}
|
|
1219
|
+
interface CurriculumAllocation {
|
|
1220
|
+
variantId: string;
|
|
1221
|
+
scenarioId: string;
|
|
1222
|
+
/** How many additional reps to run on this cell. */
|
|
1223
|
+
count: number;
|
|
1224
|
+
/** Strategy-specific reason for the allocation. */
|
|
1225
|
+
reason: string;
|
|
1226
|
+
}
|
|
1227
|
+
interface VarianceCurriculumOptions {
|
|
1228
|
+
/** Total reps to allocate across all cells. */
|
|
1229
|
+
budget: number;
|
|
1230
|
+
/**
|
|
1231
|
+
* Smoothing prior on variance — keeps the allocator from giving up
|
|
1232
|
+
* on a cell with one observation just because its 1-sample variance is
|
|
1233
|
+
* 0. Default 0.05.
|
|
1234
|
+
*/
|
|
1235
|
+
variancePrior?: number;
|
|
1236
|
+
/**
|
|
1237
|
+
* Minimum reps per cell — even when the variance estimate is low, give
|
|
1238
|
+
* every cell at least this many. Default 1.
|
|
1239
|
+
*/
|
|
1240
|
+
floorPerCell?: number;
|
|
1241
|
+
}
|
|
1242
|
+
/**
|
|
1243
|
+
* Variance-proportional allocation. For each cell, estimate variance from
|
|
1244
|
+
* past observations + a prior, then allocate the budget proportional to
|
|
1245
|
+
* (sqrt(variance) + 1/sqrt(n)) — Neyman-style (1934) allocation by standard
|
|
1246
|
+
* deviation plus a 1/sqrt(n) term, which balances "explore noisy cells" with "explore
|
|
1247
|
+
* under-sampled cells."
|
|
1248
|
+
*/
|
|
1249
|
+
declare function varianceBasedCurriculum(observations: CellObservation[], candidateCells: Array<{
|
|
1250
|
+
variantId: string;
|
|
1251
|
+
scenarioId: string;
|
|
1252
|
+
}>, opts: VarianceCurriculumOptions): CurriculumAllocation[];
|
|
1253
|
+
interface ThompsonCurriculumOptions {
|
|
1254
|
+
budget: number;
|
|
1255
|
+
/**
|
|
1256
|
+
* The per-scenario decision threshold. Cells whose posterior mean is
|
|
1257
|
+
* closest to this get the most budget — that's where the next observation
|
|
1258
|
+
* has the highest information value for the gate decision. Default 0.5.
|
|
1259
|
+
*/
|
|
1260
|
+
decisionThreshold?: number;
|
|
1261
|
+
/** Beta prior parameters. Default α=β=1 (uniform). */
|
|
1262
|
+
priorAlpha?: number;
|
|
1263
|
+
priorBeta?: number;
|
|
1264
|
+
/** Seed the Thompson sampler. Default unset (Math.random). */
|
|
1265
|
+
seed?: number;
|
|
1266
|
+
}
|
|
1267
|
+
/**
|
|
1268
|
+
* Thompson-sampling-style allocation for pass/fail cells. For each cell:
|
|
1269
|
+
*
|
|
1270
|
+
* - Maintain Beta(α + passes, β + failures) posterior on pass-rate
|
|
1271
|
+
* - Allocation weight ∝ exp(-((sampledMean - threshold) / σ)^2):
|
|
1272
|
+
* cells whose sampled posterior straddles the decision boundary get
|
|
1273
|
+
* the most weight; cells already clearly above or below get less.
|
|
1274
|
+
*
|
|
1275
|
+
* This is the right primitive when promotion gates fire at a known
|
|
1276
|
+
* threshold and you want to sharpen the posterior near the boundary.
|
|
1277
|
+
*/
|
|
1278
|
+
declare function thompsonCurriculum(observations: CellObservation[], candidateCells: Array<{
|
|
1279
|
+
variantId: string;
|
|
1280
|
+
scenarioId: string;
|
|
1281
|
+
}>, opts: ThompsonCurriculumOptions): CurriculumAllocation[];
|
|
1282
|
+
/** Convenience: extract `CellObservation[]` directly from `RunRecord[]`. */
|
|
1283
|
+
declare function observationsFromRunRecords(runs: RunRecord[], opts?: {
|
|
1284
|
+
passThreshold?: number;
|
|
1285
|
+
useHoldout?: boolean;
|
|
1286
|
+
}): CellObservation[];
|
|
1287
|
+
|
|
1288
|
+
/**
|
|
1289
|
+
* Reward hacking / Goodhart detection.
|
|
1290
|
+
*
|
|
1291
|
+
* Goodhart's Law says: when a measure becomes a target, it ceases to be
|
|
1292
|
+
* a good measure. In RLHF and agentic-RL settings this is the dominant
|
|
1293
|
+
* failure mode — the policy learns to produce outputs that score well on
|
|
1294
|
+
* the proxy reward (judge, rubric, test pass-rate) without producing
|
|
1295
|
+
* the underlying capability the proxy was meant to track.
|
|
1296
|
+
*
|
|
1297
|
+
* Krakovna et al. (2020, "Specification Gaming Examples in AI") and the
|
|
1298
|
+
* subsequent RLHF reward-hacking literature (Skalse et al. 2022, Kim et al.
|
|
1299
|
+
* 2023) converge on a few diagnostic signatures:
|
|
1300
|
+
*
|
|
1301
|
+
* 1. **Reward divergence:** the proxy reward grows while the held-out
|
|
1302
|
+
* ground-truth signal stagnates or drops. Predictive validity over
|
|
1303
|
+
* time captures this.
|
|
1304
|
+
* 2. **Distributional shift in outputs:** after RL, the policy produces
|
|
1305
|
+
* outputs that no longer match the reference distribution — usually
|
|
1306
|
+
* because it found a high-reward attractor that's degenerate (e.g.
|
|
1307
|
+
* one-token responses, repetition, formatting tricks).
|
|
1308
|
+
* 3. **Disagreement between independent rewards:** if you train on
|
|
1309
|
+
* reward A and a held-out independent reward B drops sharply, you're
|
|
1310
|
+
* probably hacking A.
|
|
1311
|
+
* 4. **Calibration drift:** the verifiable / deterministic component of
|
|
1312
|
+
* the reward is stable; the probabilistic / judge component drifts up
|
|
1313
|
+
* while the deterministic component doesn't. The judge is being
|
|
1314
|
+
* gamed.
|
|
1315
|
+
*
|
|
1316
|
+
* This module ships explicit detectors for all four signatures, plus a
|
|
1317
|
+
* combined verdict. The output is diagnostic — actionable signals,
|
|
1318
|
+
* not autoreject — because each signature has known false positives
|
|
1319
|
+
* (e.g., a policy that genuinely improves can show distributional shift).
|
|
1320
|
+
*
|
|
1321
|
+
* Differs from `rubricPredictiveValidity` (which is a *standing* check on
|
|
1322
|
+
* whether rubrics correlate with deployment outcomes) — this is a
|
|
1323
|
+
* *temporal* check on whether the reward-vs-truth gap is *widening over
|
|
1324
|
+
* time during a training run*.
|
|
1325
|
+
*/
|
|
1326
|
+
|
|
1327
|
+
type RewardHackingSignal = 'reward_divergence' | 'distribution_shift' | 'reward_disagreement' | 'judge_drift';
|
|
1328
|
+
interface RewardHackingFinding {
|
|
1329
|
+
signal: RewardHackingSignal;
|
|
1330
|
+
/** Severity in [0, 1]. >0.5 = strong signal. */
|
|
1331
|
+
severity: number;
|
|
1332
|
+
message: string;
|
|
1333
|
+
/** Numeric evidence the consumer can render. */
|
|
1334
|
+
detail: Record<string, number>;
|
|
1335
|
+
}
|
|
1336
|
+
interface RewardHackingReport {
|
|
1337
|
+
findings: RewardHackingFinding[];
|
|
1338
|
+
/**
|
|
1339
|
+
* Composite verdict. `'clean'` if every signal severity < 0.3;
|
|
1340
|
+
* `'suspect'` if at least one ≥ 0.3 but none ≥ 0.6; `'gaming'` if any ≥ 0.6.
|
|
1341
|
+
*/
|
|
1342
|
+
verdict: 'clean' | 'suspect' | 'gaming';
|
|
1343
|
+
/** Rationale for the verdict, ready to paste into an audit log. */
|
|
1344
|
+
rationale: string[];
|
|
1345
|
+
/** Number of paired (proxy, truth) data points the report saw. */
|
|
1346
|
+
n: number;
|
|
1347
|
+
}
|
|
1348
|
+
interface DetectRewardHackingInput {
|
|
1349
|
+
/**
|
|
1350
|
+
* Run records in chronological order (oldest first). The detector segments
|
|
1351
|
+
* them into prefix/suffix windows to compute "did the gap widen."
|
|
1352
|
+
*/
|
|
1353
|
+
runs: RunRecord[];
|
|
1354
|
+
/**
|
|
1355
|
+
* The metric the policy was trained to optimize. Should be present on
|
|
1356
|
+
* `outcome.raw` or `outcome.holdoutScore`. Default reads `outcome.holdoutScore`.
|
|
1357
|
+
*/
|
|
1358
|
+
proxyOf?: (run: RunRecord) => number | null;
|
|
1359
|
+
/**
|
|
1360
|
+
* The held-out ground-truth metric. For RL on coding, this is typically
|
|
1361
|
+
* test pass-rate. For RLHF, it's downstream task performance or human
|
|
1362
|
+
* preference. For knowledge tasks, it's an independently-graded score.
|
|
1363
|
+
*/
|
|
1364
|
+
truthOf?: (run: RunRecord) => number | null;
|
|
1365
|
+
/**
|
|
1366
|
+
* Independent secondary reward. Used for the `reward_disagreement`
|
|
1367
|
+
* signal. Default uses the verifiable reward extractor (deterministic
|
|
1368
|
+
* sources only).
|
|
1369
|
+
*/
|
|
1370
|
+
secondaryRewardOf?: (run: RunRecord) => number | null;
|
|
1371
|
+
/**
|
|
1372
|
+
* Window size — how many of the most recent runs count as the "after"
|
|
1373
|
+
* cohort. Default min(50, half the runs).
|
|
1374
|
+
*/
|
|
1375
|
+
windowSize?: number;
|
|
1376
|
+
/**
|
|
1377
|
+
* Severity thresholds for the composite verdict. Defaults: 0.3 (suspect) and 0.6
|
|
1378
|
+
* (gaming).
|
|
1379
|
+
*/
|
|
1380
|
+
thresholds?: {
|
|
1381
|
+
suspect?: number;
|
|
1382
|
+
gaming?: number;
|
|
1383
|
+
};
|
|
1384
|
+
/**
|
|
1385
|
+
* Verifiable-reward options used for the secondary-reward fallback.
|
|
1386
|
+
*/
|
|
1387
|
+
verifiableRewardOptions?: VerifiableRewardExtractionOptions;
|
|
1388
|
+
}
|
|
1389
|
+
declare function detectRewardHacking(input: DetectRewardHackingInput): RewardHackingReport;
|
|
1390
|
+
|
|
1391
|
+
/**
|
|
1392
|
+
* Sample-efficient adaptation evaluation.
|
|
1393
|
+
*
|
|
1394
|
+
* For foundation-model-based agents, the load-bearing capability isn't
|
|
1395
|
+
* raw end-state performance — it's *how fast the agent reaches that
|
|
1396
|
+
* performance from cold start*. The same model with a worse prompt that
|
|
1397
|
+
* adapts in 5 demonstrations beats the same model with a better prompt
|
|
1398
|
+
* that needs 50. Standard meta-learning eval (Finn et al., MAML, RL² lit)
|
|
1399
|
+
* reports an *adaptation curve*: score after k=0, 1, 2, 4, 8, 16, …
|
|
1400
|
+
* in-context examples or fine-tune steps.
|
|
1401
|
+
*
|
|
1402
|
+
* This module ships:
|
|
1403
|
+
*
|
|
1404
|
+
* 1. `runAdaptationCurve` — given a runner that takes k demonstrations
|
|
1405
|
+
* and returns a score, produce the (k, score) curve.
|
|
1406
|
+
* 2. `compareAdaptationCurves` — paired comparison across two policies.
|
|
1407
|
+
* Returns per-k delta with bootstrap CIs and an "area-under-curve"
|
|
1408
|
+
* summary statistic.
|
|
1409
|
+
* 3. `firstPassK` — for pass/fail evaluation, the minimum k at which
|
|
1410
|
+
* the policy reliably passes (≥ pass-rate threshold over reps).
|
|
1411
|
+
*
|
|
1412
|
+
* Use cases:
|
|
1413
|
+
* - Compare two prompt designs that have similar end-state performance
|
|
1414
|
+
* but different in-context efficiency.
|
|
1415
|
+
* - Decide between fine-tuning and prompting based on adaptation cost.
|
|
1416
|
+
* - Detect when a policy "memorizes" k=0 inputs vs. genuinely adapts.
|
|
1417
|
+
*/
|
|
1418
|
+
interface AdaptationRunner<S> {
|
|
1419
|
+
/**
|
|
1420
|
+
* Runs the policy on `scenario` with `k` demonstrations. Returns a
|
|
1421
|
+
* scalar score in [0, 1]. The runner is responsible for any caching;
|
|
1422
|
+
* the harness calls it once per (scenario, k, rep) cell.
|
|
1423
|
+
*/
|
|
1424
|
+
run(args: {
|
|
1425
|
+
scenario: S;
|
|
1426
|
+
k: number;
|
|
1427
|
+
rep: number;
|
|
1428
|
+
}): Promise<number>;
|
|
1429
|
+
}
|
|
1430
|
+
interface RunAdaptationCurveOptions<S> {
|
|
1431
|
+
scenarios: S[];
|
|
1432
|
+
/** Number-of-shots to evaluate at. Default `[0, 1, 2, 4, 8, 16]`. */
|
|
1433
|
+
ks?: number[];
|
|
1434
|
+
/** Reps per (scenario, k) cell. Default 3. */
|
|
1435
|
+
reps?: number;
|
|
1436
|
+
runner: AdaptationRunner<S>;
|
|
1437
|
+
/** Pass-rate threshold for `firstPassK` reporting. Default 0.5. */
|
|
1438
|
+
passThreshold?: number;
|
|
1439
|
+
}
|
|
1440
|
+
interface AdaptationPoint {
|
|
1441
|
+
k: number;
|
|
1442
|
+
meanScore: number;
|
|
1443
|
+
passRate: number;
|
|
1444
|
+
std: number;
|
|
1445
|
+
n: number;
|
|
1446
|
+
/** Per-scenario means at this k. */
|
|
1447
|
+
perScenario: Array<{
|
|
1448
|
+
scenarioId: string;
|
|
1449
|
+
meanScore: number;
|
|
1450
|
+
passes: number;
|
|
1451
|
+
total: number;
|
|
1452
|
+
}>;
|
|
1453
|
+
}
|
|
1454
|
+
interface AdaptationCurve {
|
|
1455
|
+
points: AdaptationPoint[];
|
|
1456
|
+
/**
|
|
1457
|
+
* Smallest `k` at which `passRate ≥ passThreshold`. `null` if no `k`
|
|
1458
|
+
* tested reaches it.
|
|
1459
|
+
*/
|
|
1460
|
+
firstPassK: number | null;
|
|
1461
|
+
/**
|
|
1462
|
+
* Area under the (k, meanScore) curve, normalized by max-k. A
|
|
1463
|
+
* single-number summary of "how well does this policy adapt from
|
|
1464
|
+
* cold-start to fully-conditioned." Higher = better adapter.
|
|
1465
|
+
*/
|
|
1466
|
+
adaptationArea: number;
|
|
1467
|
+
}
|
|
1468
|
+
declare function runAdaptationCurve<S extends {
|
|
1469
|
+
scenarioId?: string;
|
|
1470
|
+
}>(opts: RunAdaptationCurveOptions<S>): Promise<AdaptationCurve>;
|
|
1471
|
+
interface CompareCurvesResult {
|
|
1472
|
+
perK: Array<{
|
|
1473
|
+
k: number;
|
|
1474
|
+
deltaMean: number;
|
|
1475
|
+
aLow: number;
|
|
1476
|
+
aHigh: number;
|
|
1477
|
+
bLow: number;
|
|
1478
|
+
bHigh: number;
|
|
1479
|
+
}>;
|
|
1480
|
+
areaDelta: number;
|
|
1481
|
+
firstPassKDelta: number | null;
|
|
1482
|
+
/** Verdict: 'a_better' | 'b_better' | 'similar'. */
|
|
1483
|
+
verdict: 'a_better' | 'b_better' | 'similar';
|
|
1484
|
+
/** Rationale, ready to render. */
|
|
1485
|
+
rationale: string;
|
|
1486
|
+
}
|
|
1487
|
+
/**
|
|
1488
|
+
* Paired comparison of two adaptation curves. Per-k deltas with 95%
|
|
1489
|
+
* bootstrap CIs (constructed from each curve's `perScenario` per-k means
|
|
1490
|
+
* — the bootstrap unit is the scenario, not the rep).
|
|
1491
|
+
*/
|
|
1492
|
+
declare function compareAdaptationCurves(a: AdaptationCurve, b: AdaptationCurve, opts?: {
|
|
1493
|
+
confidence?: number;
|
|
1494
|
+
bootstrapResamples?: number;
|
|
1495
|
+
seed?: number;
|
|
1496
|
+
}): CompareCurvesResult;
|
|
1497
|
+
/** First k at which the curve's per-scenario pass rate reliably hits the threshold. */
|
|
1498
|
+
declare function firstPassK(curve: AdaptationCurve, threshold?: number): number | null;
|
|
1499
|
+
|
|
1500
|
+
/**
|
|
1501
|
+
* Trainer-format exporters.
|
|
1502
|
+
*
|
|
1503
|
+
* agent-eval produces canonical artifacts (`RunRecord[]`, `PreferenceTriple[]`,
|
|
1504
|
+
* `StepReward[]`, `PrmTrainingTriple[]`). RL training pipelines consume
|
|
1505
|
+
* different shapes — Hugging Face TRL, Prime Intellect's prime-rl, OpenAI
|
|
1506
|
+
* fine-tuning, Anthropic finetuning, OpenRLHF, verl. Each has its own
|
|
1507
|
+
* JSONL conventions. Rather than ship N adapters, this module ships the
|
|
1508
|
+
* canonical formats most production pipelines accept and ergonomic helpers
|
|
1509
|
+
* for the rest.
|
|
1510
|
+
*
|
|
1511
|
+
* Shapes:
|
|
1512
|
+
* - **DPO / IPO / KTO** — `{prompt, chosen, rejected}` JSONL. Consumed
|
|
1513
|
+
* by HuggingFace TRL, prime-rl's offline DPO, OpenRLHF.
|
|
1514
|
+
* - **GRPO offline** — `{prompt, completions[], rewards[]}` JSONL.
|
|
1515
|
+
* Consumed by prime-rl GRPO, verl, OpenRLHF.
|
|
1516
|
+
* - **SFT** — `{messages[]}` JSONL with chosen completion as the final
|
|
1517
|
+
* assistant turn. Consumed by HF SFT trainers, OpenAI fine-tuning,
|
|
1518
|
+
* Anthropic finetuning.
|
|
1519
|
+
* - **PRM** — `{prompt, prefix_steps[], chosen_step, rejected_step}` JSONL.
|
|
1520
|
+
* Consumed by Lightman-style PRM trainers and prime-rl's PRM mode.
|
|
1521
|
+
*
|
|
1522
|
+
* Why ship this in agent-eval rather than a separate adapter package: the
|
|
1523
|
+
* canonical artifacts (`RunRecord[]`, `PreferenceTriple[]`, etc.) are
|
|
1524
|
+
* agent-eval's contract; without first-party exporters consumers reverse-
|
|
1525
|
+
* engineer the mapping every release. The exporters codify it.
|
|
1526
|
+
*
|
|
1527
|
+
* The exporters take callbacks for any field that isn't on the canonical
|
|
1528
|
+
* artifact (specifically: prompt + completion text, since the package
|
|
1529
|
+
* stores only their hashes by design — full text is the consumer's
|
|
1530
|
+
* trace store / raw event log).
|
|
1531
|
+
*/
|
|
1532
|
+
|
|
1533
|
+
interface DpoLookups {
|
|
1534
|
+
/** Resolve the prompt text for a run (typically from a trace store / raw event sink). */
|
|
1535
|
+
promptOf: (runId: string) => string | Promise<string>;
|
|
1536
|
+
/** Resolve the assistant completion text for a run. */
|
|
1537
|
+
completionOf: (runId: string) => string | Promise<string>;
|
|
1538
|
+
}
|
|
1539
|
+
interface DpoExportRow {
|
|
1540
|
+
prompt: string;
|
|
1541
|
+
chosen: string;
|
|
1542
|
+
rejected: string;
|
|
1543
|
+
/** Carried-through margin. Some KTO / IPO variants use this. */
|
|
1544
|
+
margin?: number;
|
|
1545
|
+
/** Free-form metadata for downstream filtering / sharding. */
|
|
1546
|
+
meta?: Record<string, unknown>;
|
|
1547
|
+
}
|
|
1548
|
+
/**
|
|
1549
|
+
* Convert preference triples to TRL-compatible DPO rows. The shape
|
|
1550
|
+
* `{prompt, chosen, rejected}` is the canonical HuggingFace DPODataset
|
|
1551
|
+
* entry; every major DPO trainer accepts it.
|
|
1552
|
+
*/
|
|
1553
|
+
declare function toDpoRows(triples: PreferenceTriple[], lookups: DpoLookups): Promise<DpoExportRow[]>;
|
|
1554
|
+
/** Serialize DPO rows as JSONL. One line per row. */
|
|
1555
|
+
declare function toDpoJsonl(rows: DpoExportRow[]): string;
|
|
1556
|
+
interface GrpoLookups {
|
|
1557
|
+
promptOf: (runId: string) => string | Promise<string>;
|
|
1558
|
+
completionOf: (runId: string) => string | Promise<string>;
|
|
1559
|
+
/** Optional: derive a custom reward from the run. Defaults to score. */
|
|
1560
|
+
rewardOf?: (run: RunRecord) => number | null;
|
|
1561
|
+
}
|
|
1562
|
+
interface GrpoExportRow {
|
|
1563
|
+
prompt: string;
|
|
1564
|
+
completions: string[];
|
|
1565
|
+
rewards: number[];
|
|
1566
|
+
/** runIds in the same order as `completions[]` for traceability. */
|
|
1567
|
+
runIds: string[];
|
|
1568
|
+
meta?: Record<string, unknown>;
|
|
1569
|
+
}
|
|
1570
|
+
/**
|
|
1571
|
+
* Convert RunRecord[] grouped by `(scenarioId)` into GRPO offline rows —
|
|
1572
|
+
* one row per scenario, with one completion per run on that scenario.
|
|
1573
|
+
*
|
|
1574
|
+
* GRPO (Shao et al. 2024 / DeepSeek-R1) trains on relative advantages
|
|
1575
|
+
* within a group of completions for the same prompt; this is the
|
|
1576
|
+
* canonical input format.
|
|
1577
|
+
*/
|
|
1578
|
+
declare function toGrpoRows(runs: RunRecord[], lookups: GrpoLookups): Promise<GrpoExportRow[]>;
|
|
1579
|
+
declare function toGrpoJsonl(rows: GrpoExportRow[]): string;
|
|
1580
|
+
interface SftLookups {
|
|
1581
|
+
promptOf: (runId: string) => string | Promise<string>;
|
|
1582
|
+
completionOf: (runId: string) => string | Promise<string>;
|
|
1583
|
+
/** Optional system message. Default omits. */
|
|
1584
|
+
systemOf?: (run: RunRecord) => string | null | undefined;
|
|
1585
|
+
/** Filter — return false to skip the run (e.g., low score, failed cases). */
|
|
1586
|
+
include?: (run: RunRecord) => boolean;
|
|
1587
|
+
}
|
|
1588
|
+
interface SftExportRow {
|
|
1589
|
+
messages: Array<{
|
|
1590
|
+
role: 'system' | 'user' | 'assistant';
|
|
1591
|
+
content: string;
|
|
1592
|
+
}>;
|
|
1593
|
+
meta?: Record<string, unknown>;
|
|
1594
|
+
}
|
|
1595
|
+
/**
|
|
1596
|
+
* Convert RunRecord[] into Hugging Face / OpenAI / Anthropic-style
|
|
1597
|
+
* conversational SFT rows. By default every record becomes one row;
|
|
1598
|
+
* pass `include` to filter (e.g., keep only `score >= 0.8` for
|
|
1599
|
+
* rejection-sampling SFT).
|
|
1600
|
+
*/
|
|
1601
|
+
declare function toSftRows(runs: RunRecord[], lookups: SftLookups): Promise<SftExportRow[]>;
|
|
1602
|
+
declare function toSftJsonl(rows: SftExportRow[]): string;
|
|
1603
|
+
interface PrmLookups {
|
|
1604
|
+
/** Resolve the prompt text for a run. */
|
|
1605
|
+
promptOf: (runId: string) => string | Promise<string>;
|
|
1606
|
+
/** Resolve the trajectory step text for a (runId, spanId) pair. */
|
|
1607
|
+
stepTextOf: (runId: string, spanId: string) => string | Promise<string>;
|
|
1608
|
+
/** Optional: sequence of prefix span ids leading up to the divergence. */
|
|
1609
|
+
prefixOf?: (runId: string, prefixStepIndex: number) => string[] | Promise<string[]>;
|
|
1610
|
+
}
|
|
1611
|
+
interface PrmExportRow {
|
|
1612
|
+
prompt: string;
|
|
1613
|
+
/** Span ids for the steps before divergence — caller resolves text via `stepTextOf`. */
|
|
1614
|
+
prefixSpanIds: string[];
|
|
1615
|
+
prefixStepText: string[];
|
|
1616
|
+
chosenStep: string;
|
|
1617
|
+
rejectedStep: string;
|
|
1618
|
+
chosenReward: number;
|
|
1619
|
+
rejectedReward: number;
|
|
1620
|
+
marginScore: number;
|
|
1621
|
+
meta?: Record<string, unknown>;
|
|
1622
|
+
}
|
|
1623
|
+
/**
|
|
1624
|
+
* Convert PRM training triples to JSONL rows. Caller's `stepTextOf`
|
|
1625
|
+
* callback resolves span text from the consumer's trace store.
|
|
1626
|
+
*/
|
|
1627
|
+
declare function toPrmRows(triples: PrmTrainingTriple[], lookups: PrmLookups): Promise<PrmExportRow[]>;
|
|
1628
|
+
declare function toPrmJsonl(rows: PrmExportRow[]): string;
|
|
1629
|
+
interface StepRewardJsonlRow {
|
|
1630
|
+
runId: string;
|
|
1631
|
+
spanId: string;
|
|
1632
|
+
stepIndex: number;
|
|
1633
|
+
reward: number;
|
|
1634
|
+
determinism: 'deterministic' | 'probabilistic';
|
|
1635
|
+
weight: number;
|
|
1636
|
+
}
|
|
1637
|
+
declare function stepRewardsToJsonl(stepRewards: StepReward[]): string;
|
|
1638
|
+
|
|
1639
|
+
/**
|
|
1640
|
+
* `runRLCampaign` — the missing top-level orchestrator.
|
|
1641
|
+
*
|
|
1642
|
+
* `runEvalCampaign` runs the matrix and produces `RunRecord[]`. The 0.23
|
|
1643
|
+
* RL primitives consume that artifact in different ways. Until 0.24 they
|
|
1644
|
+
* had to be wired together by hand at every consumer; that defeats the
|
|
1645
|
+
* cohesion the package is supposed to provide.
|
|
1646
|
+
*
|
|
1647
|
+
* `runRLCampaign` wires:
|
|
1648
|
+
* 1. `runEvalCampaign` for the matrix run (capture, integrity, hooks)
|
|
1649
|
+
* 2. `extractVerifiableReward` over each run, separating deterministic
|
|
1650
|
+
* from probabilistic reward sources for the trainer
|
|
1651
|
+
* 3. `extractPreferences` to produce DPO/IPO/KTO triples
|
|
1652
|
+
* 4. `evaluateInterimReleaseConfidence` over paired deltas (anytime-valid)
|
|
1653
|
+
* 5. `rubricPredictiveValidity` against an outcome store, when provided
|
|
1654
|
+
* 6. `detectRewardHacking` as a standing hygiene check
|
|
1655
|
+
* 7. Trainer-format export rows ready for prime-rl / TRL / verl
|
|
1656
|
+
*
|
|
1657
|
+
* The output `RLCampaignResult` is a single, audit-ready artifact: every
|
|
1658
|
+
* stage's output is in there. The consumer's downstream fits in a single
|
|
1659
|
+
* line: pass `result.preferences` to their DPO trainer, `result.trainerRows.grpo`
|
|
1660
|
+
* to GRPO, the runs from `result.campaign` plus `result.rewardSignals` to a custom RL loop.
|
|
1661
|
+
*
|
|
1662
|
+
* This is what the 0.23 panel critique called the "missing top-level
|
|
1663
|
+
* primitive." Now shipped.
|
|
1664
|
+
*/
|
|
1665
|
+
|
|
1666
|
+
interface RunRLCampaignOptions<V> extends EvalCampaignOptions<V> {
|
|
1667
|
+
/** Preference-extraction options. Default uses paired-by-scenario-and-seed with min-margin 0.05. */
|
|
1668
|
+
preferences?: ExtractPreferencesOptions;
|
|
1669
|
+
/** Verifiable-reward extraction options. */
|
|
1670
|
+
verifiableReward?: VerifiableRewardExtractionOptions;
|
|
1671
|
+
/** Outcome store + metric names — when supplied, runs `rubricPredictiveValidity` post-campaign. */
|
|
1672
|
+
outcomeStore?: OutcomeStore;
|
|
1673
|
+
outcomeMetrics?: string[];
|
|
1674
|
+
/** Anytime-valid sequential evaluation options. */
|
|
1675
|
+
sequential?: {
|
|
1676
|
+
alpha?: number;
|
|
1677
|
+
bound?: number;
|
|
1678
|
+
rope?: {
|
|
1679
|
+
low: number;
|
|
1680
|
+
high: number;
|
|
1681
|
+
};
|
|
1682
|
+
};
|
|
1683
|
+
/** Trainer-format export lookups. When provided, the orchestrator builds the corresponding rows. */
|
|
1684
|
+
trainerExport?: {
|
|
1685
|
+
dpo?: DpoLookups;
|
|
1686
|
+
grpo?: GrpoLookups;
|
|
1687
|
+
sft?: SftLookups;
|
|
1688
|
+
};
|
|
1689
|
+
}
|
|
1690
|
+
interface RLCampaignResult<V> {
|
|
1691
|
+
campaign: EvalCampaignResult;
|
|
1692
|
+
/** Per-run verifiable reward (deterministic when available, probabilistic fallback otherwise). */
|
|
1693
|
+
rewardSignals: Array<{
|
|
1694
|
+
runId: string;
|
|
1695
|
+
reward: VerifiableReward | null;
|
|
1696
|
+
}>;
|
|
1697
|
+
/** Preference extraction report. */
|
|
1698
|
+
preferences: PreferenceExtractionReport;
|
|
1699
|
+
/** Anytime-valid interim verdict over the paired deltas (vs comparator). */
|
|
1700
|
+
interimConfidence: InterimReleaseConfidence | null;
|
|
1701
|
+
/** Standing reward-hacking hygiene check. */
|
|
1702
|
+
rewardHacking: RewardHackingReport;
|
|
1703
|
+
/** Predictive validity, when an outcome store was supplied. */
|
|
1704
|
+
predictiveValidity: RubricPredictiveValidityReport | null;
|
|
1705
|
+
/** Trainer-export rows, populated only for the formats the caller requested via `trainerExport`. */
|
|
1706
|
+
trainerRows: {
|
|
1707
|
+
dpo?: DpoExportRow[];
|
|
1708
|
+
grpo?: GrpoExportRow[];
|
|
1709
|
+
sft?: SftExportRow[];
|
|
1710
|
+
};
|
|
1711
|
+
/**
|
|
1712
|
+
* One-line top-level summary the consumer can log.
|
|
1713
|
+
*/
|
|
1714
|
+
summary: string;
|
|
1715
|
+
/**
|
|
1716
|
+
* Convenience type-tag — consumers can branch on `result.kind`.
|
|
1717
|
+
*/
|
|
1718
|
+
kind: 'agent-eval-rl-campaign';
|
|
1719
|
+
unusedVariant?: V;
|
|
1720
|
+
}
|
|
1721
|
+
declare function runRLCampaign<V>(opts: RunRLCampaignOptions<V>): Promise<RLCampaignResult<V>>;
|
|
1722
|
+
|
|
1723
|
+
/**
|
|
1724
|
+
* `PredictiveValidityResearcher` — concrete `Researcher` implementation
|
|
1725
|
+
* that drives selection from outcome-anchored predictive validity.
|
|
1726
|
+
*
|
|
1727
|
+
* `Researcher` was a placeholder interface plus `NoopResearcher` until
|
|
1728
|
+
* 0.23. The 0.23 panel critique called this out: shipping the interface
|
|
1729
|
+
* without a default implementation that drives the loop is incomplete.
|
|
1730
|
+
*
|
|
1731
|
+
* This researcher answers each method:
|
|
1732
|
+
*
|
|
1733
|
+
* - `inspectFailures(runs)` — synthesizes failure modes from the
|
|
1734
|
+
* bottom-quartile of `RunRecord`s on the configured proxy reward.
|
|
1735
|
+
* - `proposeChange(failures)` — proposes steering changes that target
|
|
1736
|
+
* the rubrics with the lowest predictive validity (decorative ones).
|
|
1737
|
+
* Either reduce their weight in the composite, or recalibrate them.
|
|
1738
|
+
* - `applyChange(changes, baseline)` — merges the proposed steering
|
|
1739
|
+
* into the experiment plan.
|
|
1740
|
+
* - `evaluateChange(plan)` — re-runs the predictive-validity check on
|
|
1741
|
+
* the post-change runs and reports the delta.
|
|
1742
|
+
*
|
|
1743
|
+
* The result is a closed loop: the rubric weights drift toward the ones
|
|
1744
|
+
* that actually predict deployment outcomes, automatically. Pair with
|
|
1745
|
+
* `runRLCampaign` for the full auto-research story.
|
|
1746
|
+
*/
|
|
1747
|
+
|
|
1748
|
+
interface PredictiveValidityResearcherOptions {
|
|
1749
|
+
outcomes: OutcomeStore;
|
|
1750
|
+
outcomeMetrics: string[];
|
|
1751
|
+
/** Score threshold below which a run counts as a "failure." Default 0.5. */
|
|
1752
|
+
failureThreshold?: number;
|
|
1753
|
+
/** Spearman bucket below which a rubric is "decorative." Default 0.4. */
|
|
1754
|
+
decorativeThreshold?: number;
|
|
1755
|
+
/** Optional steering-namespace prefix for proposed changes. Default `'rubric_weight'`. */
|
|
1756
|
+
steeringNamespace?: string;
|
|
1757
|
+
/** Override the rubric set the researcher inspects. Default: every numeric `outcome.raw` key seen. */
|
|
1758
|
+
rubrics?: string[];
|
|
1759
|
+
/**
|
|
1760
|
+
* Snapshot stash hook — called with the most recent predictive-validity
|
|
1761
|
+
* report. Useful when a downstream system wants to log rubric drift over
|
|
1762
|
+
* time. Default no-op.
|
|
1763
|
+
*/
|
|
1764
|
+
onReport?: (report: RubricPredictiveValidityReport) => void | Promise<void>;
|
|
1765
|
+
}
|
|
1766
|
+
/**
|
|
1767
|
+
* Concrete `Researcher` driven by `rubricPredictiveValidity`. The brain:
|
|
1768
|
+
* rubrics that don't predict deployment outcomes don't earn weight.
|
|
1769
|
+
*/
|
|
1770
|
+
declare class PredictiveValidityResearcher implements Researcher {
|
|
1771
|
+
private opts;
|
|
1772
|
+
private lastReport;
|
|
1773
|
+
constructor(opts: PredictiveValidityResearcherOptions);
|
|
1774
|
+
inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
|
|
1775
|
+
proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
|
|
1776
|
+
applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
|
|
1777
|
+
evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
|
|
1778
|
+
/**
|
|
1779
|
+
* Run the predictive-validity check explicitly against a fresh RunRecord
|
|
1780
|
+
* set. Updates the researcher's cached report so subsequent
|
|
1781
|
+
* `proposeChange` calls have evidence to draw from.
|
|
1782
|
+
*/
|
|
1783
|
+
runValidityCheck(runs: RunRecord[]): Promise<RubricPredictiveValidityReport>;
|
|
1784
|
+
/**
|
|
1785
|
+
* Force-feed a predictive-validity report into the researcher state —
|
|
1786
|
+
* useful when the consumer ran the report out-of-band and wants the
|
|
1787
|
+
* researcher's later proposals informed by it.
|
|
1788
|
+
*/
|
|
1789
|
+
setReport(report: RubricPredictiveValidityReport): void;
|
|
1790
|
+
getLastReport(): RubricPredictiveValidityReport | null;
|
|
1791
|
+
}
|
|
1792
|
+
|
|
1793
|
+
/**
|
|
1794
|
+
* `analyzeOptimizationResult` — unifies the pre-0.22 auto-research stack
|
|
1795
|
+
* (`runPromptEvolution`, `runMultiShotOptimization`, reflective-mutation,
|
|
1796
|
+
* Ax/AxRLM trace analyst) with the 0.23 RL bridge in a single call.
|
|
1797
|
+
*
|
|
1798
|
+
* What this fixes: until 0.23 the optimization stack and the RL bridge
|
|
1799
|
+
* lived in parallel namespaces. The optimization primitives produced
|
|
1800
|
+
* `TrialResult[]`; the RL bridge consumed `RunRecord[]`. Trace-analyst
|
|
1801
|
+
* was decoupled from both. `analyzeOptimizationResult` does the wiring
|
|
1802
|
+
* once so consumers don't have to:
|
|
1803
|
+
*
|
|
1804
|
+
* Optimization (existing primitives) RL bridge (0.23)
|
|
1805
|
+
* ────────────────────────────────── ────────────────
|
|
1806
|
+
* runPromptEvolution → TrialResult[] →
|
|
1807
|
+
* runMultiShotOptimization → MSTrial[] → analyzeOptimizationResult →
|
|
1808
|
+
* reflective-mutation → mutations.jsonl → ↓
|
|
1809
|
+
* │
|
|
1810
|
+
* ↓ (per-generation inputs flow back) │
|
|
1811
|
+
* PredictiveValidityResearcher.proposeChange ←───────────────────── │
|
|
1812
|
+
* │
|
|
1813
|
+
* ↓ │
|
|
1814
|
+
* TraceAnalyst.analyze(progressLog) ←─────────────────────────┘
|
|
1815
|
+
*
|
|
1816
|
+
* The output of this function is the canonical RL artifact set:
|
|
1817
|
+
* `RunRecord[]` (so every other 0.22+ primitive composes), preference
|
|
1818
|
+
* triples, verifiable reward signals, reward-hacking diagnosis,
|
|
1819
|
+
* sequential interim verdict, predictive validity, trainer-export rows, and (when wired) trace-analyst summary.
|
|
1820
|
+
*
|
|
1821
|
+
* What this primitive does NOT do: it does not modify the optimization
|
|
1822
|
+
* primitives' internals. They keep producing `TrialResult` and emitting
|
|
1823
|
+
* `onProgress` events; this function bridges *after* the sweep completes.
|
|
1824
|
+
* Per-step capture-integrity (raw HTTP events from inside the score
|
|
1825
|
+
* adapter) requires the consumer to wire `RawProviderSink` into their
|
|
1826
|
+
* own `ScoreAdapter` — that's a per-consumer integration point.
|
|
1827
|
+
*/
|
|
1828
|
+
|
|
1829
|
+
interface AnalyzeOptimizationResultOptions {
|
|
1830
|
+
/**
|
|
1831
|
+
* The optimization output. Either a `PromptEvolutionResult` or a
|
|
1832
|
+
* `MultiShotOptimizationResult`. The function detects which by
|
|
1833
|
+
* inspecting its shape at runtime and produces canonical `RunRecord[]` from either.
|
|
1834
|
+
*/
|
|
1835
|
+
result: PromptEvolutionResult | MultiShotOptimizationResult;
|
|
1836
|
+
/** Adapter context — `commitSha`, `model`, `promptHash`, `configHash`. */
|
|
1837
|
+
ctx: AdapterContext;
|
|
1838
|
+
/** Optional comparator candidate id for paired analyses. */
|
|
1839
|
+
comparator?: string;
|
|
1840
|
+
/** Verifiable-reward extraction options. */
|
|
1841
|
+
verifiableReward?: VerifiableRewardExtractionOptions;
|
|
1842
|
+
/** Preference extraction options. */
|
|
1843
|
+
preferences?: ExtractPreferencesOptions;
|
|
1844
|
+
/** Sequential interim-confidence options. */
|
|
1845
|
+
sequential?: {
|
|
1846
|
+
alpha?: number;
|
|
1847
|
+
bound?: number;
|
|
1848
|
+
rope?: {
|
|
1849
|
+
low: number;
|
|
1850
|
+
high: number;
|
|
1851
|
+
};
|
|
1852
|
+
};
|
|
1853
|
+
/** Outcome calibration store + metrics. */
|
|
1854
|
+
outcomes?: {
|
|
1855
|
+
store: OutcomeStore;
|
|
1856
|
+
metrics: string[];
|
|
1857
|
+
};
|
|
1858
|
+
/** Trainer-format export — DPO + GRPO lookups. */
|
|
1859
|
+
trainerExport?: {
|
|
1860
|
+
dpo?: DpoLookups;
|
|
1861
|
+
grpo?: GrpoLookups;
|
|
1862
|
+
};
|
|
1863
|
+
}
|
|
1864
|
+
interface AnalyzeOptimizationResultReport {
|
|
1865
|
+
/** All trials promoted to canonical `RunRecord` shape. */
|
|
1866
|
+
runs: RunRecord[];
|
|
1867
|
+
/** Per-run verifiable reward signal. */
|
|
1868
|
+
rewardSignals: Array<{
|
|
1869
|
+
runId: string;
|
|
1870
|
+
reward: VerifiableReward | null;
|
|
1871
|
+
}>;
|
|
1872
|
+
/** Preference triples ready for DPO/PPO/KTO training. */
|
|
1873
|
+
preferences: PreferenceExtractionReport;
|
|
1874
|
+
/** Anytime-valid sequential verdict, when a comparator is supplied; `null` when none is produced. */
|
|
1875
|
+
interimConfidence: InterimReleaseConfidence | null;
|
|
1876
|
+
/** Standing reward-hacking hygiene check. */
|
|
1877
|
+
rewardHacking: RewardHackingReport;
|
|
1878
|
+
/** Predictive validity, when an outcome store is supplied; `null` when no report is produced. */
|
|
1879
|
+
predictiveValidity: RubricPredictiveValidityReport | null;
|
|
1880
|
+
/** Trainer-export rows, populated only for the formats requested. */
|
|
1881
|
+
trainerRows: {
|
|
1882
|
+
dpo?: DpoExportRow[];
|
|
1883
|
+
grpo?: GrpoExportRow[];
|
|
1884
|
+
};
|
|
1885
|
+
/** One-line summary suitable for logs. */
|
|
1886
|
+
summary: string;
|
|
1887
|
+
}
|
|
1888
|
+
/**
|
|
1889
|
+
* Convert an optimization sweep output into a fully analyzed RL artifact
|
|
1890
|
+
* set. Idempotent and read-only with respect to the optimization result.
|
|
1891
|
+
*/
|
|
1892
|
+
declare function analyzeOptimizationResult(opts: AnalyzeOptimizationResultOptions): Promise<AnalyzeOptimizationResultReport>;
|
|
1893
|
+
|
|
1894
|
+
export { type RewardHackingFinding as $, type AdaptationCurve as A, type BradleyTerryFit as B, type CellObservation as C, type DetectRewardHackingInput as D, type EloOptions as E, type Finding as F, type GrpoExportRow as G, type GrpoLookups as H, type LayerStatus as I, type OffPolicyOptions as J, type OffPolicyTrajectory as K, type LayerResult as L, MultiLayerVerifier as M, type ParetoPointInput as N, type OffPolicyEstimate as O, type PairwiseOutcome as P, PredictiveValidityResearcher as Q, type PredictiveValidityResearcherOptions as R, type Severity as S, type PreferenceExtractionReport as T, type PreferenceStrategy as U, type VerifyContext as V, type PreferenceTriple as W, type PrmExportRow as X, type PrmLookups as Y, type PrmTrainingTriple as Z, type RLCampaignResult as _, type Layer as a, toTRLFormat as a$, type RewardHackingReport as a0, type RewardHackingSignal as a1, type RunAdaptationCurveOptions as a2, type RunComputeCurveOptions as a3, type RunRLCampaignOptions as a4, type RunwiseStepSummary as a5, type ScenarioPerturbation as a6, type ScenarioPerturbationKind as a7, type SelfConsistencyOptions as a8, type SelfConsistencyResult as a9, fitBradleyTerry as aA, gradeSemanticStatus as aB, injectIrrelevantClause as aC, inverseProbabilityWeighting as aD, observationsFromRunRecords as aE, offPolicyEstimateAll as aF, prmTrainingPairs as aG, renameVariables as aH, runAdaptationCurve as aI, runComputeCurve as aJ, runContaminationProbe as aK, runRLCampaign as aL, runwiseStepRewardSummary as aM, selfConsistency as aN, selfNormalizedImportanceWeighting as aO, shuffleOrder as aP, stepRewardsToJsonl as aQ, thompsonCurriculum as aR, toAnthropicFormat as aS, toDpoJsonl as aT, toDpoRows as aU, toGrpoJsonl as aV, toGrpoRows as aW, toPrmJsonl as aX, toPrmRows as aY, toSftJsonl as aZ, toSftRows as a_, type SftExportRow as aa, type SftLookups as ab, type StepReward as ac, type StepRewardJsonlRow as ad, type StepScorer as ae, type ThompsonCurriculumOptions as af, type 
VarianceCurriculumOptions as ag, type VerifiableReward as ah, type VerifiableRewardExtractionOptions as ai, type VerifiableRewardSource as aj, type VerificationReport as ak, type VerifyOptions as al, adversarialScenarioSearch as am, analyzeOptimizationResult as an, applyEloUpdate as ao, bestOfN as ap, buildPairwiseFromCampaign as aq, compareAdaptationCurves as ar, detectRewardHacking as as, doublyRobust as at, extractPreferences as au, extractStepRewards as av, extractVerifiableReward as aw, extractVerifiableRewardsFromRecords as ax, filterDeterministicallyRewarded as ay, firstPassK as az, type AdaptationPoint as b, trialToRunRecord as b0, trialsToRunRecords as b1, varianceBasedCurriculum as b2, variantAggregateToRunRecord as b3, verificationReportToRunRecord as b4, paretoFrontier as b5, type AdaptationRunner as c, type AdapterContext as d, type AdversarialMutation as e, type AdversarialScenario as f, type AdversarialSearchOptions as g, type AdversarialSearchReport as h, type AnalyzeOptimizationResultOptions as i, type AnalyzeOptimizationResultReport as j, type BradleyTerryRating as k, type BuildPairwiseFromCampaignInput as l, type CompareCurvesResult as m, type ComputeBestOfNOptions as n, type ComputeBestOfNResult as o, type ComputeCurve as p, type ComputeCurveBudget as q, type ComputeCurvePoint as r, type ContaminationProbeInput as s, type ContaminationProbeOptions as t, type ContaminationProbeReport as u, type CurriculumAllocation as v, type DpoExportRow as w, type DpoLookups as x, type ExtractPreferencesOptions as y, type ExtractStepRewardsOptions as z };
|