@tangle-network/agent-eval 0.21.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/CHANGELOG.md +236 -1
  2. package/README.md +17 -3
  3. package/dist/benchmarks/index.d.ts +2 -2
  4. package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
  5. package/dist/chunk-4W4NCYM2.js.map +1 -0
  6. package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
  7. package/dist/chunk-6M774GY6.js +53 -0
  8. package/dist/chunk-6M774GY6.js.map +1 -0
  9. package/dist/chunk-7EAUOUQS.js +495 -0
  10. package/dist/chunk-7EAUOUQS.js.map +1 -0
  11. package/dist/chunk-AXHNWLIX.js +246 -0
  12. package/dist/chunk-AXHNWLIX.js.map +1 -0
  13. package/dist/chunk-EXGR4XEM.js +283 -0
  14. package/dist/chunk-EXGR4XEM.js.map +1 -0
  15. package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
  16. package/dist/chunk-IOXMGMHQ.js.map +1 -0
  17. package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
  18. package/dist/chunk-LZKIOBG2.js +2026 -0
  19. package/dist/chunk-LZKIOBG2.js.map +1 -0
  20. package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
  21. package/dist/chunk-QBW3YBTR.js.map +1 -0
  22. package/dist/chunk-QUKKGHTZ.js +121 -0
  23. package/dist/chunk-QUKKGHTZ.js.map +1 -0
  24. package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
  25. package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
  26. package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
  27. package/dist/{chunk-HRZELXCR.js → chunk-VQQSPGSM.js} +3 -3
  28. package/dist/cli.js +3 -3
  29. package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
  30. package/dist/control.d.ts +3 -3
  31. package/dist/control.js +2 -2
  32. package/dist/eval-campaign-Ds5QljIh.d.ts +573 -0
  33. package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
  34. package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
  35. package/dist/index-ekBXweiQ.d.ts +1894 -0
  36. package/dist/index.d.ts +20 -430
  37. package/dist/index.js +154 -34
  38. package/dist/index.js.map +1 -1
  39. package/dist/integrity-Cr5YodSY.d.ts +210 -0
  40. package/dist/openapi.json +1 -1
  41. package/dist/optimization.d.ts +7 -145
  42. package/dist/optimization.js +12 -3
  43. package/dist/reporting.d.ts +294 -4
  44. package/dist/reporting.js +18 -9
  45. package/dist/rl.d.ts +8 -0
  46. package/dist/rl.js +113 -0
  47. package/dist/rl.js.map +1 -0
  48. package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
  49. package/dist/sequential-DgU2mFsE.d.ts +304 -0
  50. package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-Ce1r4EYo.d.ts} +382 -2
  51. package/dist/traces.d.ts +101 -181
  52. package/dist/traces.js +19 -8
  53. package/dist/wire/index.js +3 -3
  54. package/docs/auto-research-loop-end-to-end.md +186 -0
  55. package/docs/research-report-methodology.md +19 -4
  56. package/docs/three-package-architecture.md +180 -0
  57. package/docs/wire-protocol.md +1 -1
  58. package/package.json +7 -2
  59. package/dist/chunk-3IX6QTB7.js.map +0 -1
  60. package/dist/chunk-KRR4VMH7.js +0 -423
  61. package/dist/chunk-KRR4VMH7.js.map +0 -1
  62. package/dist/chunk-WOK2RTWG.js.map +0 -1
  63. package/dist/chunk-YUFXO3TU.js.map +0 -1
  64. package/dist/reporting-Da2ihlcM.d.ts +0 -672
  65. /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
  66. /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
  67. /package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
  68. /package/dist/{chunk-HRZELXCR.js.map → chunk-VQQSPGSM.js.map} +0 -0
@@ -1,5 +1,295 @@
1
- export { B as BootstrapOptions, b as BootstrapResult, G as GainDistributionBin, g as GainDistributionFigureSpec, h as GainDistributionOptions, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, i as PairedBootstrapResult, j as ParetoFigureSpec, k as ParetoPoint, l as RESEARCH_REPORT_HARD_PAIR_FLOOR, m as ReleaseConfidenceAxis, n as ReleaseConfidenceAxisName, o as ReleaseConfidenceInput, p as ReleaseConfidenceIssue, q as ReleaseConfidenceMetrics, a as ReleaseConfidenceScorecard, r as ReleaseConfidenceStatus, R as ReleaseConfidenceThresholds, s as ReleaseTraceEvidence, t as RenderReleaseReportOptions, u as ResearchReport, v as ResearchReportCandidate, w as ResearchReportDecision, x as ResearchReportMethodology, y as ResearchReportOptions, z as ResearchReportRecommendation, S as SummaryTable, A as SummaryTableOptions, C as SummaryTableRow, V as Verdict, E as assertReleaseConfidence, H as bhAdjust, I as bootstrapCi, L as evaluateReleaseConfidence, N as gainHistogram, O as judgeReplayGate, Q as pairedBootstrap, T as pairedWilcoxon, U as paretoChart, W as releaseTraceEvidenceFromMultiShotTrials, X as renderReleaseReport, Y as researchReport, Z as summaryTable } from './reporting-Da2ihlcM.js';
2
- import './dataset-B9qvlm_o.js';
3
- import './multi-shot-optimization-Bvtz294B.js';
4
- import './run-record-CX_jcAyr.js';
1
+ import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B9qvlm_o.js';
2
+ import { C as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-Ce1r4EYo.js';
3
+ export { Y as GainDistributionBin, Z as GainDistributionFigureSpec, _ as GainDistributionOptions, a3 as ParetoFigureSpec, a4 as ParetoPoint, a5 as RESEARCH_REPORT_HARD_PAIR_FLOOR, H as ResearchReport, a6 as ResearchReportCandidate, a7 as ResearchReportDecision, a8 as ResearchReportMethodology, F as ResearchReportOptions, a9 as ResearchReportRecommendation, aa as SummaryTable, ab as SummaryTableOptions, ac as SummaryTableRow, ah as gainHistogram, ai as paretoChart, al as researchReport, an as summaryTable } from './summary-report-Ce1r4EYo.js';
4
+ import { R as RunRecord, a as RunSplitTag } from './run-record-DNiOMBrZ.js';
5
+ export { c as InterimReleaseConfidence, d as InterimReleaseConfidenceInput, P as PairedEvalueOptions, e as PairedEvalueSequence, f as PairedEvalueStep, R as RubricOutcomePair, g as RubricPredictiveValidityInput, h as RubricPredictiveValidityReport, i as RubricRanking, S as SequentialDecision, j as evaluateInterimReleaseConfidence, p as pairedEvalueSequence, r as rubricPredictiveValidity } from './sequential-DgU2mFsE.js';
5
6
  import './store-u47QaJ9G.js';
7
+
8
+ /**
9
+ * Release confidence gate.
10
+ *
11
+ * This is the production-facing composition layer over the lower-level
12
+ * primitives:
13
+ * - Dataset manifests prove corpus/version coverage.
14
+ * - RunRecord rows prove reproducible search/holdout outcomes.
15
+ * - Multi-shot trace evidence carries turn counts and ASI diagnostics.
16
+ * - HeldOutGate decisions remain the paired promotion authority.
17
+ *
18
+ * The gate is intentionally pure and conservative. Missing declared evidence
19
+ * fails closed instead of being treated as a neutral zero.
20
+ */
21
+
22
+ type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail';
23
+ type ReleaseConfidenceAxisName = 'corpus' | 'quality' | 'generalization' | 'diagnostics' | 'efficiency';
24
+ interface ReleaseTraceEvidence {
25
+ scenarioId: string;
26
+ candidateId?: string;
27
+ split?: RunSplitTag;
28
+ score?: number;
29
+ ok?: boolean;
30
+ turnCount?: number;
31
+ costUsd?: number;
32
+ durationMs?: number;
33
+ failureMode?: string;
34
+ asi?: ActionableSideInfo[];
35
+ metadata?: Record<string, unknown>;
36
+ }
37
+ interface ReleaseConfidenceThresholds {
38
+ /** Require a Dataset manifest or explicit scenarios. Default true. */
39
+ requireCorpus?: boolean;
40
+ minScenarioCount?: number;
41
+ minSearchRuns?: number;
42
+ minHoldoutRuns?: number;
43
+ /** Require at least one holdout scenario/run. Default true. */
44
+ requireHoldout?: boolean;
45
+ minPassRate?: number;
46
+ minMeanScore?: number;
47
+ /** Search mean may exceed holdout mean by at most this much. */
48
+ maxOverfitGap?: number;
49
+ maxMeanCostUsd?: number;
50
+ maxP95WallMs?: number;
51
+ /** Low-score/failed rows must carry ASI. Default true. */
52
+ requireAsiForFailures?: boolean;
53
+ /** Score below this is considered a failure for ASI coverage. Default 0.5. */
54
+ failureScoreThreshold?: number;
55
+ }
56
+ interface ReleaseConfidenceInput {
57
+ target: string;
58
+ candidateId?: string;
59
+ baselineId?: string;
60
+ dataset?: DatasetManifest;
61
+ scenarios?: readonly DatasetScenario[];
62
+ runs?: readonly RunRecord[];
63
+ traces?: readonly ReleaseTraceEvidence[];
64
+ gateDecision?: GateDecision | null;
65
+ thresholds?: ReleaseConfidenceThresholds;
66
+ }
67
+ interface ReleaseConfidenceAxis {
68
+ name: ReleaseConfidenceAxisName;
69
+ status: ReleaseConfidenceStatus;
70
+ score: number;
71
+ detail: string;
72
+ }
73
+ interface ReleaseConfidenceIssue {
74
+ axis: ReleaseConfidenceAxisName;
75
+ severity: 'critical' | 'warning';
76
+ code: string;
77
+ detail: string;
78
+ }
79
+ interface ReleaseConfidenceMetrics {
80
+ scenarioCount: number;
81
+ searchRuns: number;
82
+ holdoutRuns: number;
83
+ passRate: number;
84
+ meanScore: number;
85
+ searchMeanScore: number;
86
+ holdoutMeanScore: number;
87
+ overfitGap: number;
88
+ meanCostUsd: number;
89
+ p95WallMs: number;
90
+ failedRows: number;
91
+ failuresWithAsi: number;
92
+ singleShotTraces: number;
93
+ multiShotTraces: number;
94
+ splitCounts: Record<DatasetSplit, number>;
95
+ domainCounts: Record<string, number>;
96
+ failureModeCounts: Record<string, number>;
97
+ responsibleSurfaceCounts: Record<string, number>;
98
+ }
99
+ interface ReleaseConfidenceScorecard {
100
+ target: string;
101
+ candidateId: string | null;
102
+ baselineId: string | null;
103
+ status: ReleaseConfidenceStatus;
104
+ promote: boolean;
105
+ axes: ReleaseConfidenceAxis[];
106
+ issues: ReleaseConfidenceIssue[];
107
+ metrics: ReleaseConfidenceMetrics;
108
+ dataset: DatasetManifest | null;
109
+ gateDecision: GateDecision | null;
110
+ summary: string;
111
+ }
112
+ declare function releaseTraceEvidenceFromMultiShotTrials(trials: readonly MultiShotTrialResult[]): ReleaseTraceEvidence[];
113
+ declare function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
114
+ declare function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
115
+
116
+ /**
117
+ * Paper-grade paired statistics for held-out promotion gates.
118
+ *
119
+ * The promotion gate (`HeldOutGate`) needs three things:
120
+ *
121
+ * 1. A bootstrap confidence interval on the per-item paired delta
122
+ * (`pairedBootstrap`). Median delta is the headline number; the
123
+ * CI lower bound is what the gate checks against `pairedDeltaThreshold`.
124
+ * 2. A non-parametric significance test on the paired deltas
125
+ * (`pairedWilcoxon` — re-export of `wilcoxonSignedRank` under the
126
+ * paper-style name).
127
+ * 3. False-discovery-rate correction across simultaneously-tested
128
+ * candidate variants (`bhAdjust` — re-export of `benjaminiHochberg`).
129
+ *
130
+ * Why a separate file: every existing primitive lives in `statistics.ts`
131
+ * (general) or `power-analysis.ts` (correction). Paired-bootstrap is
132
+ * paired-only, paper-grade, and load-bearing for the promotion gate.
133
+ * Putting it next to `statistics.ts` would require editing that file;
134
+ * the brief forbids that. New file, new exports, no surface change.
135
+ */
136
+ interface PairedBootstrapResult {
137
+ /** Number of paired observations (unequal-length inputs are rejected, never truncated). */
138
+ n: number;
139
+ /** Median of paired deltas (after − before). */
140
+ median: number;
141
+ /** Mean of paired deltas. */
142
+ mean: number;
143
+ /** Lower bound of the bootstrap CI on the chosen statistic (median delta by default). */
144
+ low: number;
145
+ /** Upper bound of the bootstrap CI on the chosen statistic (median delta by default). */
146
+ high: number;
147
+ /** Confidence level used (e.g. 0.95). */
148
+ confidence: number;
149
+ /** Number of bootstrap resamples used. */
150
+ resamples: number;
151
+ }
152
+ interface PairedBootstrapOptions {
153
+ /** Confidence level. Default 0.95. */
154
+ confidence?: number;
155
+ /** Bootstrap resample count. Default 2000. */
156
+ resamples?: number;
157
+ /** Statistic to bootstrap. Default 'median'. */
158
+ statistic?: 'median' | 'mean';
159
+ /** Deterministic seed. If omitted, uses Math.random(). */
160
+ seed?: number;
161
+ }
162
+ /**
163
+ * Paired bootstrap on (after - before) deltas. Returns a CI on the
164
+ * chosen statistic (median by default). Pairs are resampled with
165
+ * replacement. The lower bound is what the promotion gate checks: if
166
+ * `low > pairedDeltaThreshold`, the gain is real at the chosen
167
+ * confidence level.
168
+ *
169
+ * Throws on unequal sample sizes — caller must align pairs upstream.
170
+ */
171
+ declare function pairedBootstrap(before: number[], after: number[], opts?: PairedBootstrapOptions): PairedBootstrapResult;
172
+ /**
173
+ * Paper-style alias for `wilcoxonSignedRank`. The signed-rank test on
174
+ * paired deltas is the standard non-parametric significance test for
175
+ * "candidate beats baseline on matched items." Use alongside the
176
+ * bootstrap CI: bootstrap gives effect size, Wilcoxon gives p.
177
+ */
178
+ declare function pairedWilcoxon(before: number[], after: number[]): {
179
+ w: number;
180
+ p: number;
181
+ };
182
+ /**
183
+ * Paper-style alias for `benjaminiHochberg`. Use to correct p-values
184
+ * across multiple candidate-vs-baseline comparisons run in the same
185
+ * promotion sweep. Returns BH-adjusted q-values and significance at
186
+ * the requested FDR (default 0.05).
187
+ */
188
+ declare function bhAdjust(pValues: number[], fdr?: number): {
189
+ qValues: number[];
190
+ significant: boolean[];
191
+ };
192
+
193
+ interface RenderReleaseReportOptions {
194
+ title?: string;
195
+ runs?: readonly RunRecord[];
196
+ comparator?: string;
197
+ traceAnalystFindings?: readonly string[];
198
+ nextActions?: readonly string[];
199
+ }
200
+ declare function renderReleaseReport(scorecard: ReleaseConfidenceScorecard, options?: RenderReleaseReportOptions): string;
201
+
202
+ /**
203
+ * Bootstrap-CI promotion gate.
204
+ *
205
+ * In any iterative-improvement loop (GEPA, prompt evolution, dataset
206
+ * curation), the question is "did this generation actually improve, or are
207
+ * we celebrating noise?". With small N and noisy outcomes, point-estimate
208
+ * deltas lie. Bootstrap confidence intervals tell the operator whether the
209
+ * delta is real before code or prompts get promoted.
210
+ *
211
+ * This module is pure functions — no I/O, no model calls. Easy to unit-test
212
+ * and to compose into any verdict gate.
213
+ *
214
+ * Default gate:
215
+ * - Bootstrap mean baseline vs candidate (1k resamples).
216
+ * - Compute the delta distribution; pass if the lower CI bound > 0.
217
+ * - Tunable confidence (default 95%) and resample count.
218
+ *
219
+ * Verdict semantics intentionally match the existing `experiments.jsonl`
220
+ * vocabulary:
221
+ * - ADVANCE: candidate's CI lower bound > baseline mean (real win)
222
+ * - KEEP: overlap, but candidate point estimate >= baseline (neutral)
223
+ * - REVERT: candidate's CI upper bound < baseline mean (real regression)
224
+ * - INCONCLUSIVE: not enough samples or CI straddles zero with no signal
225
+ */
226
+ type Verdict = 'ADVANCE' | 'KEEP' | 'REVERT' | 'INCONCLUSIVE';
227
+ interface BootstrapResult {
228
+ baselineMean: number;
229
+ candidateMean: number;
230
+ /** candidateMean - baselineMean, point estimate. */
231
+ delta: number;
232
+ /** Lower bound of the (1 - alpha) CI on the delta. */
233
+ ciLower: number;
234
+ /** Upper bound of the (1 - alpha) CI on the delta. */
235
+ ciUpper: number;
236
+ /** Number of bootstrap resamples used. */
237
+ iterations: number;
238
+ alpha: number;
239
+ verdict: Verdict;
240
+ }
241
+ interface BootstrapOptions {
242
+ /** Significance level alpha (default 0.05 → 95% CI). */
243
+ alpha?: number;
244
+ /** Number of resamples (default 1000). */
245
+ iterations?: number;
246
+ /**
247
+ * Minimum total samples (baseline + candidate) below which we always
248
+ * return INCONCLUSIVE — bootstrap with too few samples is meaningless.
249
+ * Default 6 (combined).
250
+ */
251
+ minTotalSamples?: number;
252
+ /** RNG seed for reproducibility. If omitted, uses Math.random(). */
253
+ seed?: number;
254
+ }
255
+ /**
256
+ * Compute the bootstrap CI on (candidateMean - baselineMean) and a verdict.
257
+ *
258
+ * Uses simple percentile bootstrap on the difference of resampled means.
259
+ * That's the standard non-parametric primitive — no distributional
260
+ * assumptions, robust to skew, easy to reason about.
261
+ */
262
+ declare function bootstrapCi(baseline: number[], candidate: number[], options?: BootstrapOptions): BootstrapResult;
263
+ /**
264
+ * Judge-replay promotion gate.
265
+ *
266
+ * The cheap inner-loop judge that drives an evolution run is by definition
267
+ * fast and noisy. When you're about to promote a winning variant to the
268
+ * canonical default, you want a STRONGER judge (a more expensive model, a
269
+ * human grader, a separately-trained reward model) to confirm the win
270
+ * generalises beyond the inner loop.
271
+ *
272
+ * This helper takes raw winner + baseline outputs, scores both through the
273
+ * stronger judge, and applies `bootstrapCi`. ADVANCE means the stronger
274
+ * judge agrees the winner is real with the configured confidence. Doesn't
275
+ * matter what shape your "output" is — pass a string, an object, anything
276
+ * the judge can read.
277
+ */
278
+ interface JudgeReplayGateArgs<TOutput> {
279
+ baselineOutputs: TOutput[];
280
+ candidateOutputs: TOutput[];
281
+ /** Stronger judge — async to allow LLM calls. Return a 0..N scalar score. */
282
+ judge: (output: TOutput) => Promise<number> | number;
283
+ alpha?: number;
284
+ iterations?: number;
285
+ /** RNG seed for reproducibility. */
286
+ seed?: number;
287
+ /** Maximum concurrent judge calls. Default 4. */
288
+ judgeConcurrency?: number;
289
+ }
290
+ declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): Promise<BootstrapResult & {
291
+ baselineSamples: number;
292
+ candidateSamples: number;
293
+ }>;
294
+
295
+ export { type BootstrapOptions, type BootstrapResult, type JudgeReplayGateArgs, type PairedBootstrapOptions, type PairedBootstrapResult, type ReleaseConfidenceAxis, type ReleaseConfidenceAxisName, type ReleaseConfidenceInput, type ReleaseConfidenceIssue, type ReleaseConfidenceMetrics, type ReleaseConfidenceScorecard, type ReleaseConfidenceStatus, type ReleaseConfidenceThresholds, type ReleaseTraceEvidence, type RenderReleaseReportOptions, type Verdict, assertReleaseConfidence, bhAdjust, bootstrapCi, evaluateReleaseConfidence, judgeReplayGate, pairedBootstrap, pairedWilcoxon, releaseTraceEvidenceFromMultiShotTrials, renderReleaseReport };
package/dist/reporting.js CHANGED
@@ -1,36 +1,45 @@
1
1
  import {
2
- RESEARCH_REPORT_HARD_PAIR_FLOOR,
3
2
  assertReleaseConfidence,
4
3
  bootstrapCi,
5
4
  evaluateReleaseConfidence,
6
- gainHistogram,
7
5
  judgeReplayGate,
8
- paretoChart,
9
6
  releaseTraceEvidenceFromMultiShotTrials,
10
- renderReleaseReport,
11
- researchReport,
12
- summaryTable
13
- } from "./chunk-3IX6QTB7.js";
7
+ renderReleaseReport
8
+ } from "./chunk-7EAUOUQS.js";
9
+ import {
10
+ evaluateInterimReleaseConfidence,
11
+ pairedEvalueSequence,
12
+ rubricPredictiveValidity
13
+ } from "./chunk-AXHNWLIX.js";
14
14
  import {
15
+ RESEARCH_REPORT_HARD_PAIR_FLOOR,
15
16
  bhAdjust,
17
+ gainHistogram,
16
18
  pairedBootstrap,
17
- pairedWilcoxon
18
- } from "./chunk-KRR4VMH7.js";
19
+ pairedWilcoxon,
20
+ paretoChart,
21
+ researchReport,
22
+ summaryTable
23
+ } from "./chunk-IOXMGMHQ.js";
24
+ import "./chunk-6M774GY6.js";
19
25
  import "./chunk-PZ5AY32C.js";
20
26
  export {
21
27
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
22
28
  assertReleaseConfidence,
23
29
  bhAdjust,
24
30
  bootstrapCi,
31
+ evaluateInterimReleaseConfidence,
25
32
  evaluateReleaseConfidence,
26
33
  gainHistogram,
27
34
  judgeReplayGate,
28
35
  pairedBootstrap,
36
+ pairedEvalueSequence,
29
37
  pairedWilcoxon,
30
38
  paretoChart,
31
39
  releaseTraceEvidenceFromMultiShotTrials,
32
40
  renderReleaseReport,
33
41
  researchReport,
42
+ rubricPredictiveValidity,
34
43
  summaryTable
35
44
  };
36
45
  //# sourceMappingURL=reporting.js.map
package/dist/rl.d.ts ADDED
@@ -0,0 +1,8 @@
1
+ export { A as AdaptationCurve, b as AdaptationPoint, c as AdaptationRunner, d as AdapterContext, e as AdversarialMutation, f as AdversarialScenario, g as AdversarialSearchOptions, h as AdversarialSearchReport, i as AnalyzeOptimizationResultOptions, j as AnalyzeOptimizationResultReport, B as BradleyTerryFit, k as BradleyTerryRating, l as BuildPairwiseFromCampaignInput, C as CellObservation, m as CompareCurvesResult, n as ComputeBestOfNOptions, o as ComputeBestOfNResult, p as ComputeCurve, q as ComputeCurveBudget, r as ComputeCurvePoint, s as ContaminationProbeInput, t as ContaminationProbeOptions, u as ContaminationProbeReport, v as CurriculumAllocation, D as DetectRewardHackingInput, w as DpoExportRow, x as DpoLookups, E as EloOptions, y as ExtractPreferencesOptions, z as ExtractStepRewardsOptions, G as GrpoExportRow, H as GrpoLookups, O as OffPolicyEstimate, J as OffPolicyOptions, K as OffPolicyTrajectory, P as PairwiseOutcome, N as ParetoPointInput, Q as PredictiveValidityResearcher, R as PredictiveValidityResearcherOptions, T as PreferenceExtractionReport, U as PreferenceStrategy, W as PreferenceTriple, X as PrmExportRow, Y as PrmLookups, Z as PrmTrainingTriple, _ as RLCampaignResult, $ as RewardHackingFinding, a0 as RewardHackingReport, a1 as RewardHackingSignal, a2 as RunAdaptationCurveOptions, a3 as RunComputeCurveOptions, a4 as RunRLCampaignOptions, a5 as RunwiseStepSummary, a6 as ScenarioPerturbation, a7 as ScenarioPerturbationKind, a8 as SelfConsistencyOptions, a9 as SelfConsistencyResult, aa as SftExportRow, ab as SftLookups, ac as StepReward, ad as StepRewardJsonlRow, ae as StepScorer, af as ThompsonCurriculumOptions, ag as VarianceCurriculumOptions, ah as VerifiableReward, ai as VerifiableRewardExtractionOptions, aj as VerifiableRewardSource, am as adversarialScenarioSearch, an as analyzeOptimizationResult, ao as applyEloUpdate, ap as bestOfN, aq as buildPairwiseFromCampaign, ar as compareAdaptationCurves, as as detectRewardHacking, at as 
doublyRobust, au as extractPreferences, av as extractStepRewards, aw as extractVerifiableReward, ax as extractVerifiableRewardsFromRecords, ay as filterDeterministicallyRewarded, az as firstPassK, aA as fitBradleyTerry, aC as injectIrrelevantClause, aD as inverseProbabilityWeighting, aE as observationsFromRunRecords, aF as offPolicyEstimateAll, b5 as paretoFrontier, aG as prmTrainingPairs, aH as renameVariables, aI as runAdaptationCurve, aJ as runComputeCurve, aK as runContaminationProbe, aL as runRLCampaign, aM as runwiseStepRewardSummary, aN as selfConsistency, aO as selfNormalizedImportanceWeighting, aP as shuffleOrder, aQ as stepRewardsToJsonl, aR as thompsonCurriculum, aS as toAnthropicFormat, aT as toDpoJsonl, aU as toDpoRows, aV as toGrpoJsonl, aW as toGrpoRows, aX as toPrmJsonl, aY as toPrmRows, aZ as toSftJsonl, a_ as toSftRows, a$ as toTRLFormat, b0 as trialToRunRecord, b1 as trialsToRunRecords, b2 as varianceBasedCurriculum, b3 as variantAggregateToRunRecord, b4 as verificationReportToRunRecord } from './index-ekBXweiQ.js';
2
+ export { r as runEvalCampaign } from './eval-campaign-Ds5QljIh.js';
3
+ import './summary-report-Ce1r4EYo.js';
4
+ import './run-record-DNiOMBrZ.js';
5
+ import './store-u47QaJ9G.js';
6
+ import './sequential-DgU2mFsE.js';
7
+ import './integrity-Cr5YodSY.js';
8
+ import './emitter-B2XqDKFU.js';
package/dist/rl.js ADDED
@@ -0,0 +1,113 @@
1
+ import {
2
+ PredictiveValidityResearcher,
3
+ adversarialScenarioSearch,
4
+ analyzeOptimizationResult,
5
+ applyEloUpdate,
6
+ bestOfN,
7
+ buildPairwiseFromCampaign,
8
+ compareAdaptationCurves,
9
+ detectRewardHacking,
10
+ doublyRobust,
11
+ extractPreferences,
12
+ extractStepRewards,
13
+ extractVerifiableReward,
14
+ extractVerifiableRewardsFromRecords,
15
+ filterDeterministicallyRewarded,
16
+ firstPassK,
17
+ fitBradleyTerry,
18
+ injectIrrelevantClause,
19
+ inverseProbabilityWeighting,
20
+ observationsFromRunRecords,
21
+ offPolicyEstimateAll,
22
+ paretoFrontier,
23
+ prmTrainingPairs,
24
+ renameVariables,
25
+ runAdaptationCurve,
26
+ runComputeCurve,
27
+ runContaminationProbe,
28
+ runRLCampaign,
29
+ runwiseStepRewardSummary,
30
+ selfConsistency,
31
+ selfNormalizedImportanceWeighting,
32
+ shuffleOrder,
33
+ stepRewardsToJsonl,
34
+ thompsonCurriculum,
35
+ toAnthropicFormat,
36
+ toDpoJsonl,
37
+ toDpoRows,
38
+ toGrpoJsonl,
39
+ toGrpoRows,
40
+ toPrmJsonl,
41
+ toPrmRows,
42
+ toSftJsonl,
43
+ toSftRows,
44
+ toTRLFormat,
45
+ trialToRunRecord,
46
+ trialsToRunRecords,
47
+ varianceBasedCurriculum,
48
+ variantAggregateToRunRecord,
49
+ verificationReportToRunRecord
50
+ } from "./chunk-LZKIOBG2.js";
51
+ import {
52
+ runEvalCampaign
53
+ } from "./chunk-EXGR4XEM.js";
54
+ import "./chunk-KAO3Q65R.js";
55
+ import "./chunk-AXHNWLIX.js";
56
+ import "./chunk-IOXMGMHQ.js";
57
+ import "./chunk-QUKKGHTZ.js";
58
+ import "./chunk-SQQLHODJ.js";
59
+ import "./chunk-5IIQKMD5.js";
60
+ import "./chunk-6M774GY6.js";
61
+ import "./chunk-PZ5AY32C.js";
62
+ export {
63
+ PredictiveValidityResearcher,
64
+ adversarialScenarioSearch,
65
+ analyzeOptimizationResult,
66
+ applyEloUpdate,
67
+ bestOfN,
68
+ buildPairwiseFromCampaign,
69
+ compareAdaptationCurves,
70
+ detectRewardHacking,
71
+ doublyRobust,
72
+ extractPreferences,
73
+ extractStepRewards,
74
+ extractVerifiableReward,
75
+ extractVerifiableRewardsFromRecords,
76
+ filterDeterministicallyRewarded,
77
+ firstPassK,
78
+ fitBradleyTerry,
79
+ injectIrrelevantClause,
80
+ inverseProbabilityWeighting,
81
+ observationsFromRunRecords,
82
+ offPolicyEstimateAll,
83
+ paretoFrontier,
84
+ prmTrainingPairs,
85
+ renameVariables,
86
+ runAdaptationCurve,
87
+ runComputeCurve,
88
+ runContaminationProbe,
89
+ runEvalCampaign,
90
+ runRLCampaign,
91
+ runwiseStepRewardSummary,
92
+ selfConsistency,
93
+ selfNormalizedImportanceWeighting,
94
+ shuffleOrder,
95
+ stepRewardsToJsonl,
96
+ thompsonCurriculum,
97
+ toAnthropicFormat,
98
+ toDpoJsonl,
99
+ toDpoRows,
100
+ toGrpoJsonl,
101
+ toGrpoRows,
102
+ toPrmJsonl,
103
+ toPrmRows,
104
+ toSftJsonl,
105
+ toSftRows,
106
+ toTRLFormat,
107
+ trialToRunRecord,
108
+ trialsToRunRecords,
109
+ varianceBasedCurriculum,
110
+ variantAggregateToRunRecord,
111
+ verificationReportToRunRecord
112
+ };
113
+ //# sourceMappingURL=rl.js.map
package/dist/rl.js.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
@@ -107,6 +107,15 @@ interface RunRecord {
107
107
  failureMode?: string;
108
108
  /** Which split this run was drawn from. */
109
109
  splitTag: RunSplitTag;
110
+ /**
111
+ * Stable scenario identifier the run was scored against. Optional for
112
+ * backwards compatibility, but **strongly recommended**: every primitive
113
+ * that pairs runs by scenario (preferences, paired stats, BT tournament)
114
+ * keys on this. The campaign artifact populates it canonically; legacy
115
+ * runs without it fall back to inference from `outcome.raw.scenario_id`
116
+ * or `experimentId`.
117
+ */
118
+ scenarioId?: string;
110
119
  }
111
120
  declare class RunRecordValidationError extends Error {
112
121
  readonly path: string;
@@ -131,4 +140,4 @@ declare function parseRunRecordSafe(input: unknown): {
131
140
  /** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate. */
132
141
  declare function roundTripRunRecord(record: RunRecord): RunRecord;
133
142
 
134
- export { type RunSplitTag as R, type RunRecord as a, type RunJudgeMetadata as b, type RunOutcome as c, RunRecordValidationError as d, type RunTokenUsage as e, isRunRecord as i, parseRunRecordSafe as p, roundTripRunRecord as r, validateRunRecord as v };
143
+ export { type RunRecord as R, type RunSplitTag as a, type RunTokenUsage as b, type RunJudgeMetadata as c, type RunOutcome as d, RunRecordValidationError as e, isRunRecord as i, parseRunRecordSafe as p, roundTripRunRecord as r, validateRunRecord as v };