@tangle-network/agent-eval 0.40.5 → 0.42.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/dist/campaign/index.d.ts +48 -355
  2. package/dist/campaign/index.js +106 -6
  3. package/dist/campaign/index.js.map +1 -1
  4. package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
  5. package/dist/chunk-H4TOS272.js.map +1 -0
  6. package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
  7. package/dist/chunk-KQ26DYTQ.js.map +1 -0
  8. package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
  9. package/dist/chunk-MNL6LXGQ.js.map +1 -0
  10. package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
  11. package/dist/chunk-N4SBKEPJ.js.map +1 -0
  12. package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
  13. package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
  14. package/dist/control.d.ts +2 -2
  15. package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
  16. package/dist/index.d.ts +227 -687
  17. package/dist/index.js +753 -1237
  18. package/dist/index.js.map +1 -1
  19. package/dist/integrity-CTDhR1Sg.d.ts +81 -0
  20. package/dist/llm-client-BXVRUZyX.d.ts +234 -0
  21. package/dist/openapi.json +1 -1
  22. package/dist/pipelines/index.js +67 -3
  23. package/dist/pipelines/index.js.map +1 -1
  24. package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
  25. package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
  26. package/dist/reporting.d.ts +2 -3
  27. package/dist/reporting.js +4 -8
  28. package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
  29. package/dist/rl.d.ts +103 -221
  30. package/dist/rl.js +44 -199
  31. package/dist/rl.js.map +1 -1
  32. package/dist/sequential-DdV5ShjT.d.ts +561 -0
  33. package/dist/traces.d.ts +3 -2
  34. package/dist/traces.js +5 -5
  35. package/dist/types-BLbRTxoc.d.ts +367 -0
  36. package/dist/wire/index.d.ts +1 -1
  37. package/package.json +1 -6
  38. package/dist/chunk-5U2DOJU4.js.map +0 -1
  39. package/dist/chunk-AU2JLNSZ.js.map +0 -1
  40. package/dist/chunk-DMW5VENN.js +0 -1412
  41. package/dist/chunk-DMW5VENN.js.map +0 -1
  42. package/dist/chunk-EGIPWXHL.js.map +0 -1
  43. package/dist/chunk-MAZ26DC7.js +0 -99
  44. package/dist/chunk-MAZ26DC7.js.map +0 -1
  45. package/dist/chunk-NKLGKF2Q.js.map +0 -1
  46. package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
  47. package/dist/optimization.d.ts +0 -11
  48. package/dist/optimization.js +0 -71
  49. package/dist/optimization.js.map +0 -1
  50. package/dist/sequential-5iSVfzl2.d.ts +0 -139
  51. package/dist/summary-report-DuZXOk7K.d.ts +0 -917
  52. package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
package/dist/rl.d.ts CHANGED
@@ -1,16 +1,17 @@
1
1
  import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
2
- import { V as VerificationReport } from './multi-layer-verifier-BNi4-8lR.js';
3
- import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-DuZXOk7K.js';
2
+ import { C as CampaignResult } from './types-BLbRTxoc.js';
3
+ import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CoJMs2Iz.js';
4
+ export { r as runEvalCampaign } from './researcher-CoJMs2Iz.js';
5
+ import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
4
6
  import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
5
7
  import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-ByZEC3BX.js';
6
- import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
7
- import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
8
- import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-DeZ_EArp.js';
9
- export { r as runEvalCampaign } from './researcher-DeZ_EArp.js';
8
+ import { I as InterimReleaseConfidence } from './sequential-DdV5ShjT.js';
10
9
  import './errors-mje_cKOs.js';
11
- import './failure-cluster-Cw65_5FY.js';
12
- import './integrity-DYR5gWlb.js';
10
+ import './llm-client-BXVRUZyX.js';
11
+ import './raw-provider-sink-C46HDghv.js';
13
12
  import './emitter-DP_cSSiw.js';
13
+ import './integrity-CTDhR1Sg.js';
14
+ import './failure-cluster-Cw65_5FY.js';
14
15
 
15
16
  /**
16
17
  * Test-time compute scaling curves.
@@ -529,17 +530,17 @@ declare function toAnthropicFormat(triples: PreferenceTriple[]): Array<{
529
530
  }>;
530
531
 
531
532
  /**
532
- * Adapters: convert `TrialResult[]` (from `runMultiShotOptimization`,
533
- * `runPromptEvolution`) into the canonical `RunRecord[]` artifact that
534
- * `replayCache`, `pairedEvalueSequence`, and `rubricPredictiveValidity`
535
- * consume.
536
- *
537
- * Adapters are thin and explicit — every mandatory `RunRecord` field
538
- * comes from a caller-supplied context (`commitSha`, `model`,
539
- * `promptHash`, `configHash`) plus the trial's runtime data. Defaults
540
- * exist for fields the trial doesn't carry (`tokenUsage`, `costUsd`),
541
- * but the validator still rejects records with bare-alias model strings
542
- * — the caller is responsible for snapshot-pinning.
533
+ * Adapters: convert measurement outputs into the canonical `RunRecord[]`
534
+ * artifact that `replayCache`, `pairedEvalueSequence`, and
535
+ * `rubricPredictiveValidity` consume. Two sources:
536
+ * - `campaignToRunRecords` — the campaign substrate's per-cell results
537
+ * (the modern path: `runCampaign` / `runImprovementLoop` → records).
538
+ * - `verificationReportToRunRecord` — a `MultiLayerVerifier` report.
539
+ *
540
+ * Adapters are thin and explicit — every mandatory `RunRecord` field comes
541
+ * from a caller-supplied context (`commitSha`, `model`, `promptHash`,
542
+ * `configHash`) plus the cell's runtime data. The validator still rejects
543
+ * bare-alias model strings — the caller snapshot-pins.
543
544
  */
544
545
 
545
546
  interface AdapterContext {
@@ -550,41 +551,30 @@ interface AdapterContext {
550
551
  /** Git SHA the harness was run from. */
551
552
  commitSha: string;
552
553
  /** Hash of the effective prompt sent to the model. */
553
- promptHash: string | ((t: TrialResult) => string);
554
+ promptHash: string;
554
555
  /** Hash of the effective config (model, temperature, tools, judges, splits). */
555
- configHash: string | ((t: TrialResult) => string);
556
- /** Default split tag. Default `'search'` — optimization sweeps run on the search split. */
556
+ configHash: string;
557
+ /** Default split tag. Default `'search'`. */
557
558
  splitTag?: RunSplitTag;
558
- /** Default cost in USD when the trial doesn't record one. Default `0`. */
559
+ /** Default cost in USD when the source doesn't record one. Default `0`. */
559
560
  defaultCostUsd?: number;
560
561
  }
561
562
  /**
562
- * Convert one `TrialResult` (from `runPromptEvolution` or
563
- * `runMultiShotOptimization`) into a canonical `RunRecord`.
564
- *
565
- * The conversion is **not lossy** — every `TrialResult.metrics` field is
566
- * carried through to `outcome.raw`, plus a synthetic
567
- * `raw.cost_unknown = 1` flag when the trial omits cost (so downstream
568
- * filters can distinguish "free" from "untracked"). This preserves the
569
- * paper-grade contract: a record without a cost number is unbounded by
570
- * definition, but we don't drop the record.
563
+ * Convert a `CampaignResult` into canonical `RunRecord[]` — one record per
564
+ * scored cell. The cell's mean judge composite becomes the split score; every
565
+ * judge dimension is carried through to `outcome.raw`. A cell that errored
566
+ * becomes a record with `failureMode: 'cell_error'` (kept, not dropped — an
567
+ * unscored cell is signal). `candidateId` identifies the measured surface
568
+ * (defaults to the campaign manifest hash).
571
569
  */
572
- declare function trialToRunRecord(trial: TrialResult, ctx: AdapterContext, opts?: {
573
- runId?: string;
574
- experimentIdPerTrial?: (t: TrialResult) => string;
575
- }): RunRecord;
576
- /** Convenience: convert an array of `TrialResult` in one go. */
577
- declare function trialsToRunRecords(trials: TrialResult[], ctx: AdapterContext): RunRecord[];
570
+ declare function campaignToRunRecords(campaign: CampaignResult, ctx: AdapterContext & {
571
+ candidateId?: string;
572
+ }): RunRecord[];
578
573
  /**
579
574
  * Convert a `MultiLayerVerifier` `VerificationReport` into a `RunRecord`.
580
- *
581
- * The verifier produces per-layer results; we synthesize one canonical
582
- * record where:
583
- * - `outcome.searchScore` (or `holdoutScore`) is `report.blendedScore`
584
- * - `outcome.raw` carries every layer's score keyed `layer.<name>`
585
- * plus a `layer_<name>_pass` 1/0 indicator
586
- * - `failureMode` is taken from the first failing layer's `reason`
587
- * - `wallMs` is `report.durationMs`
575
+ * `outcome.searchScore` (or `holdoutScore`) is `report.blendedScore`;
576
+ * `outcome.raw` carries every layer's score + a pass indicator; `failureMode`
577
+ * is the first failing layer's reason.
588
578
  */
589
579
  declare function verificationReportToRunRecord(report: VerificationReport, ctx: AdapterContext & {
590
580
  candidateId: string;
@@ -592,15 +582,6 @@ declare function verificationReportToRunRecord(report: VerificationReport, ctx:
592
582
  }, opts?: {
593
583
  runId?: string;
594
584
  }): RunRecord;
595
- /**
596
- * Convert a `VariantAggregate` (per-variant rollup from `prompt-evolution`)
597
- * into a synthetic `RunRecord` representing the aggregate. Useful when the
598
- * downstream consumer wants per-variant entries for a `researchReport`
599
- * rather than per-(variant, scenario, rep) trial entries.
600
- */
601
- declare function variantAggregateToRunRecord(agg: VariantAggregate, ctx: AdapterContext, opts?: {
602
- runId?: string;
603
- }): RunRecord;
604
585
 
605
586
  /**
606
587
  * Bradley-Terry / Elo tournament evaluation.
@@ -1396,6 +1377,72 @@ interface StepRewardJsonlRow {
1396
1377
  }
1397
1378
  declare function stepRewardsToJsonl(stepRewards: StepReward[]): string;
1398
1379
 
1380
+ /**
1381
+ * `PredictiveValidityResearcher` — concrete `Researcher` implementation
1382
+ * that drives selection from outcome-anchored predictive validity.
1383
+ *
1384
+ * Each method:
1385
+ *
1386
+ * - `inspectFailures(runs)` — synthesizes failure modes from the
1387
+ * bottom-quartile of `RunRecord`s on the configured proxy reward.
1388
+ * - `proposeChange(failures)` — proposes steering changes that target
1389
+ * the rubrics with the lowest predictive validity (decorative ones).
1390
+ * Either reduce their weight in the composite, or recalibrate them.
1391
+ * - `applyChange(changes, baseline)` — merges the proposed steering
1392
+ * into the experiment plan.
1393
+ * - `evaluateChange(plan)` — re-runs the predictive-validity check on
1394
+ * the post-change runs and reports the delta.
1395
+ *
1396
+ * The result is a closed loop: the rubric weights drift toward the ones
1397
+ * that actually predict deployment outcomes, automatically. Pair with
1398
+ * `runRLCampaign` for the full auto-research story.
1399
+ */
1400
+
1401
+ interface PredictiveValidityResearcherOptions {
1402
+ outcomes: OutcomeStore;
1403
+ outcomeMetrics: string[];
1404
+ /** Score threshold below which a run counts as a "failure." Default 0.5. */
1405
+ failureThreshold?: number;
1406
+ /** Spearman bucket below which a rubric is "decorative." Default 0.4. */
1407
+ decorativeThreshold?: number;
1408
+ /** Optional steering-namespace prefix for proposed changes. Default `'rubric_weight'`. */
1409
+ steeringNamespace?: string;
1410
+ /** Override the rubric set the researcher inspects. Default: every numeric `outcome.raw` key seen. */
1411
+ rubrics?: string[];
1412
+ /**
1413
+ * Snapshot stash hook — called with the most recent predictive-validity
1414
+ * report. Useful when a downstream system wants to log rubric drift over
1415
+ * time. Default no-op.
1416
+ */
1417
+ onReport?: (report: RubricPredictiveValidityReport) => void | Promise<void>;
1418
+ }
1419
+ /**
1420
+ * Concrete `Researcher` driven by `rubricPredictiveValidity`. The brain:
1421
+ * rubrics that don't predict deployment outcomes don't earn weight.
1422
+ */
1423
+ declare class PredictiveValidityResearcher implements Researcher {
1424
+ private opts;
1425
+ private lastReport;
1426
+ constructor(opts: PredictiveValidityResearcherOptions);
1427
+ inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
1428
+ proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
1429
+ applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
1430
+ evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
1431
+ /**
1432
+ * Run the predictive-validity check explicitly against a fresh RunRecord
1433
+ * set. Updates the researcher's cached report so subsequent
1434
+ * `proposeChange` calls have evidence to draw from.
1435
+ */
1436
+ runValidityCheck(runs: RunRecord[]): Promise<RubricPredictiveValidityReport>;
1437
+ /**
1438
+ * Force-feed a predictive-validity report into the researcher state —
1439
+ * useful when the consumer ran the report out-of-band and wants the
1440
+ * researcher's later proposals informed by it.
1441
+ */
1442
+ setReport(report: RubricPredictiveValidityReport): void;
1443
+ getLastReport(): RubricPredictiveValidityReport | null;
1444
+ }
1445
+
1399
1446
  /**
1400
1447
  * Reward hacking / Goodhart detection.
1401
1448
  *
@@ -1499,171 +1546,6 @@ interface DetectRewardHackingInput {
1499
1546
  }
1500
1547
  declare function detectRewardHacking(input: DetectRewardHackingInput): RewardHackingReport;
1501
1548
 
1502
- /**
1503
- * `analyzeOptimizationResult` — unifies the auto-research stack
1504
- * (`runPromptEvolution`, `runMultiShotOptimization`, reflective-mutation,
1505
- * Ax/AxRLM trace analyst) with the RL bridge in a single call.
1506
- *
1507
- * The optimization primitives produce `TrialResult[]`; the RL bridge
1508
- * consumes `RunRecord[]`. Trace-analyst is independent of both. This
1509
- * function does the wiring once so consumers don't have to:
1510
- *
1511
- * Optimization (existing primitives) RL bridge
1512
- * ────────────────────────────────── ────────
1513
- * runPromptEvolution → TrialResult[] →
1514
- * runMultiShotOptimization → MSTrial[] → analyzeOptimizationResult →
1515
- * reflective-mutation → mutations.jsonl → ↓
1516
- * │
1517
- * ↓ (per-generation inputs flow back) │
1518
- * PredictiveValidityResearcher.proposeChange ←───────────────────── │
1519
- * │
1520
- * ↓ │
1521
- * TraceAnalyst.analyze(progressLog) ←─────────────────────────┘
1522
- *
1523
- * The output is the canonical RL artifact set: `RunRecord[]` (so every
1524
- * other RL primitive composes), preference triples, verifiable reward
1525
- * signals, reward-hacking diagnosis, sequential interim verdict, and
1526
- * (when wired) trace-analyst summary.
1527
- *
1528
- * What this primitive does NOT do: it does not modify the optimization
1529
- * primitives' internals. They keep producing `TrialResult` and emitting
1530
- * `onProgress` events; this function bridges *after* the sweep completes.
1531
- * Per-step capture-integrity (raw HTTP events from inside the score
1532
- * adapter) requires the consumer to wire `RawProviderSink` into their
1533
- * own `ScoreAdapter` — that's a per-consumer integration point.
1534
- */
1535
-
1536
- interface AnalyzeOptimizationResultOptions {
1537
- /**
1538
- * The optimization output. Either a `PromptEvolutionResult` or a
1539
- * `MultiShotOptimizationResult`. The function detects which by
1540
- * structural typing and produces canonical `RunRecord[]` from either.
1541
- */
1542
- result: PromptEvolutionResult | MultiShotOptimizationResult;
1543
- /** Adapter context — `commitSha`, `model`, `promptHash`, `configHash`. */
1544
- ctx: AdapterContext;
1545
- /** Optional comparator candidate id for paired analyses. */
1546
- comparator?: string;
1547
- /** Verifiable-reward extraction options. */
1548
- verifiableReward?: VerifiableRewardExtractionOptions;
1549
- /** Preference extraction options. */
1550
- preferences?: ExtractPreferencesOptions;
1551
- /** Sequential interim-confidence options. */
1552
- sequential?: {
1553
- alpha?: number;
1554
- bound?: number;
1555
- rope?: {
1556
- low: number;
1557
- high: number;
1558
- };
1559
- };
1560
- /** Outcome calibration store + metrics. */
1561
- outcomes?: {
1562
- store: OutcomeStore;
1563
- metrics: string[];
1564
- };
1565
- /** Trainer-format export — DPO + GRPO lookups. */
1566
- trainerExport?: {
1567
- dpo?: DpoLookups;
1568
- grpo?: GrpoLookups;
1569
- };
1570
- }
1571
- interface AnalyzeOptimizationResultReport {
1572
- /** All trials promoted to canonical `RunRecord` shape. */
1573
- runs: RunRecord[];
1574
- /** Per-run verifiable reward signal. */
1575
- rewardSignals: Array<{
1576
- runId: string;
1577
- reward: VerifiableReward | null;
1578
- }>;
1579
- /** Preference triples ready for DPO/PPO/KTO training. */
1580
- preferences: PreferenceExtractionReport;
1581
- /** Anytime-valid sequential verdict, when a comparator is supplied. */
1582
- interimConfidence: InterimReleaseConfidence | null;
1583
- /** Standing reward-hacking hygiene check. */
1584
- rewardHacking: RewardHackingReport;
1585
- /** Predictive validity, when an outcome store is supplied. */
1586
- predictiveValidity: RubricPredictiveValidityReport | null;
1587
- /** Trainer-export rows, populated only for the formats requested. */
1588
- trainerRows: {
1589
- dpo?: DpoExportRow[];
1590
- grpo?: GrpoExportRow[];
1591
- };
1592
- /** One-line summary suitable for logs. */
1593
- summary: string;
1594
- }
1595
- /**
1596
- * Convert an optimization sweep output into a fully-analysed RL artifact
1597
- * set. Idempotent and read-only with respect to the optimization result.
1598
- */
1599
- declare function analyzeOptimizationResult(opts: AnalyzeOptimizationResultOptions): Promise<AnalyzeOptimizationResultReport>;
1600
-
1601
- /**
1602
- * `PredictiveValidityResearcher` — concrete `Researcher` implementation
1603
- * that drives selection from outcome-anchored predictive validity.
1604
- *
1605
- * Each method:
1606
- *
1607
- * - `inspectFailures(runs)` — synthesizes failure modes from the
1608
- * bottom-quartile of `RunRecord`s on the configured proxy reward.
1609
- * - `proposeChange(failures)` — proposes steering changes that target
1610
- * the rubrics with the lowest predictive validity (decorative ones).
1611
- * Either reduce their weight in the composite, or recalibrate them.
1612
- * - `applyChange(changes, baseline)` — merges the proposed steering
1613
- * into the experiment plan.
1614
- * - `evaluateChange(plan)` — re-runs the predictive-validity check on
1615
- * the post-change runs and reports the delta.
1616
- *
1617
- * The result is a closed loop: the rubric weights drift toward the ones
1618
- * that actually predict deployment outcomes, automatically. Pair with
1619
- * `runRLCampaign` for the full auto-research story.
1620
- */
1621
-
1622
- interface PredictiveValidityResearcherOptions {
1623
- outcomes: OutcomeStore;
1624
- outcomeMetrics: string[];
1625
- /** Score threshold below which a run counts as a "failure." Default 0.5. */
1626
- failureThreshold?: number;
1627
- /** Spearman bucket below which a rubric is "decorative." Default 0.4. */
1628
- decorativeThreshold?: number;
1629
- /** Optional steering-namespace prefix for proposed changes. Default `'rubric_weight'`. */
1630
- steeringNamespace?: string;
1631
- /** Override the rubric set the researcher inspects. Default: every numeric `outcome.raw` key seen. */
1632
- rubrics?: string[];
1633
- /**
1634
- * Snapshot stash hook — called with the most recent predictive-validity
1635
- * report. Useful when a downstream system wants to log rubric drift over
1636
- * time. Default no-op.
1637
- */
1638
- onReport?: (report: RubricPredictiveValidityReport) => void | Promise<void>;
1639
- }
1640
- /**
1641
- * Concrete `Researcher` driven by `rubricPredictiveValidity`. The brain:
1642
- * rubrics that don't predict deployment outcomes don't earn weight.
1643
- */
1644
- declare class PredictiveValidityResearcher implements Researcher {
1645
- private opts;
1646
- private lastReport;
1647
- constructor(opts: PredictiveValidityResearcherOptions);
1648
- inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
1649
- proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
1650
- applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
1651
- evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
1652
- /**
1653
- * Run the predictive-validity check explicitly against a fresh RunRecord
1654
- * set. Updates the researcher's cached report so subsequent
1655
- * `proposeChange` calls have evidence to draw from.
1656
- */
1657
- runValidityCheck(runs: RunRecord[]): Promise<RubricPredictiveValidityReport>;
1658
- /**
1659
- * Force-feed a predictive-validity report into the researcher state —
1660
- * useful when the consumer ran the report out-of-band and wants the
1661
- * researcher's later proposals informed by it.
1662
- */
1663
- setReport(report: RubricPredictiveValidityReport): void;
1664
- getLastReport(): RubricPredictiveValidityReport | null;
1665
- }
1666
-
1667
1549
  /**
1668
1550
  * `runRLCampaign` — top-level orchestrator that runs the matrix and
1669
1551
  * produces every RL-ready artifact in one call.
@@ -1741,4 +1623,4 @@ interface RLCampaignResult<V> {
1741
1623
  }
1742
1624
  declare function runRLCampaign<V>(opts: RunRLCampaignOptions<V>): Promise<RLCampaignResult<V>>;
1743
1625
 
1744
- export { type AdaptationCurve, type AdaptationPoint, type AdaptationRunner, type AdapterContext, type AdversarialMutation, type AdversarialScenario, type AdversarialSearchOptions, type AdversarialSearchReport, type AnalyzeOptimizationResultOptions, type AnalyzeOptimizationResultReport, type BradleyTerryFit, type BradleyTerryRating, type BuildPairwiseFromCampaignInput, type CellObservation, type CompareCurvesResult, type ComputeBestOfNOptions, type ComputeBestOfNResult, type ComputeCurve, type ComputeCurveBudget, type ComputeCurvePoint, type ContaminationProbeInput, type ContaminationProbeOptions, type ContaminationProbeReport, type CurriculumAllocation, type DetectRewardHackingInput, type DpoExportRow, type DpoLookups, type EloOptions, type ExtractPreferencesOptions, type ExtractStepRewardsOptions, type GrpoExportRow, type GrpoLookups, type OffPolicyEstimate, type OffPolicyOptions, type OffPolicyTrajectory, type PairwiseOutcome, type ParetoPointInput, PredictiveValidityResearcher, type PredictiveValidityResearcherOptions, type PreferenceExtractionReport, type PreferenceStrategy, type PreferenceTriple, type PrmExportRow, type PrmLookups, type PrmTrainingTriple, type RLCampaignResult, type RewardHackingFinding, type RewardHackingReport, type RewardHackingSignal, type RunAdaptationCurveOptions, type RunComputeCurveOptions, type RunRLCampaignOptions, type RunwiseStepSummary, type ScenarioPerturbation, type ScenarioPerturbationKind, type SelfConsistencyOptions, type SelfConsistencyResult, type SftExportRow, type SftLookups, type StepReward, type StepRewardJsonlRow, type StepScorer, type ThompsonCurriculumOptions, type VarianceCurriculumOptions, type VerifiableReward, type VerifiableRewardExtractionOptions, type VerifiableRewardSource, adversarialScenarioSearch, analyzeOptimizationResult, applyEloUpdate, bestOfN, buildPairwiseFromCampaign, compareAdaptationCurves, detectRewardHacking, doublyRobust, extractPreferences, extractStepRewards, extractVerifiableReward, 
extractVerifiableRewardsFromRecords, filterDeterministicallyRewarded, firstPassK, fitBradleyTerry, injectIrrelevantClause, inverseProbabilityWeighting, observationsFromRunRecords, offPolicyEstimateAll, paretoFrontier, prmTrainingPairs, renameVariables, runAdaptationCurve, runComputeCurve, runContaminationProbe, runRLCampaign, runwiseStepRewardSummary, selfConsistency, selfNormalizedImportanceWeighting, shuffleOrder, stepRewardsToJsonl, thompsonCurriculum, toAnthropicFormat, toDpoJsonl, toDpoRows, toGrpoJsonl, toGrpoRows, toPrmJsonl, toPrmRows, toSftJsonl, toSftRows, toTRLFormat, trialToRunRecord, trialsToRunRecords, varianceBasedCurriculum, variantAggregateToRunRecord, verificationReportToRunRecord };
1626
+ export { type AdaptationCurve, type AdaptationPoint, type AdaptationRunner, type AdapterContext, type AdversarialMutation, type AdversarialScenario, type AdversarialSearchOptions, type AdversarialSearchReport, type BradleyTerryFit, type BradleyTerryRating, type BuildPairwiseFromCampaignInput, type CellObservation, type CompareCurvesResult, type ComputeBestOfNOptions, type ComputeBestOfNResult, type ComputeCurve, type ComputeCurveBudget, type ComputeCurvePoint, type ContaminationProbeInput, type ContaminationProbeOptions, type ContaminationProbeReport, type CurriculumAllocation, type DetectRewardHackingInput, type DpoExportRow, type DpoLookups, type EloOptions, type ExtractPreferencesOptions, type ExtractStepRewardsOptions, type GrpoExportRow, type GrpoLookups, type OffPolicyEstimate, type OffPolicyOptions, type OffPolicyTrajectory, type PairwiseOutcome, type ParetoPointInput, PredictiveValidityResearcher, type PredictiveValidityResearcherOptions, type PreferenceExtractionReport, type PreferenceStrategy, type PreferenceTriple, type PrmExportRow, type PrmLookups, type PrmTrainingTriple, type RLCampaignResult, type RewardHackingFinding, type RewardHackingReport, type RewardHackingSignal, type RunAdaptationCurveOptions, type RunComputeCurveOptions, type RunRLCampaignOptions, type RunwiseStepSummary, type ScenarioPerturbation, type ScenarioPerturbationKind, type SelfConsistencyOptions, type SelfConsistencyResult, type SftExportRow, type SftLookups, type StepReward, type StepRewardJsonlRow, type StepScorer, type ThompsonCurriculumOptions, type VarianceCurriculumOptions, type VerifiableReward, type VerifiableRewardExtractionOptions, type VerifiableRewardSource, adversarialScenarioSearch, applyEloUpdate, bestOfN, buildPairwiseFromCampaign, campaignToRunRecords, compareAdaptationCurves, detectRewardHacking, doublyRobust, extractPreferences, extractStepRewards, extractVerifiableReward, extractVerifiableRewardsFromRecords, filterDeterministicallyRewarded, firstPassK, 
fitBradleyTerry, injectIrrelevantClause, inverseProbabilityWeighting, observationsFromRunRecords, offPolicyEstimateAll, paretoFrontier, prmTrainingPairs, renameVariables, runAdaptationCurve, runComputeCurve, runContaminationProbe, runRLCampaign, runwiseStepRewardSummary, selfConsistency, selfNormalizedImportanceWeighting, shuffleOrder, stepRewardsToJsonl, thompsonCurriculum, toAnthropicFormat, toDpoJsonl, toDpoRows, toGrpoJsonl, toGrpoRows, toPrmJsonl, toPrmRows, toSftJsonl, toSftRows, toTRLFormat, varianceBasedCurriculum, verificationReportToRunRecord };