npm - @tangle-network/agent-eval - Versions diffs - 0.40.5 → 0.42.0 - Mend

@tangle-network/agent-eval 0.40.5 → 0.42.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52)

package/dist/campaign/index.d.ts +48 -355
package/dist/campaign/index.js +106 -6
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
package/dist/chunk-H4TOS272.js.map +1 -0
package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
package/dist/chunk-KQ26DYTQ.js.map +1 -0
package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
package/dist/chunk-MNL6LXGQ.js.map +1 -0
package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
package/dist/chunk-N4SBKEPJ.js.map +1 -0
package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
package/dist/index.d.ts +227 -687
package/dist/index.js +753 -1237
package/dist/index.js.map +1 -1
package/dist/integrity-CTDhR1Sg.d.ts +81 -0
package/dist/llm-client-BXVRUZyX.d.ts +234 -0
package/dist/openapi.json +1 -1
package/dist/pipelines/index.js +67 -3
package/dist/pipelines/index.js.map +1 -1
package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
package/dist/reporting.d.ts +2 -3
package/dist/reporting.js +4 -8
package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
package/dist/rl.d.ts +103 -221
package/dist/rl.js +44 -199
package/dist/rl.js.map +1 -1
package/dist/sequential-DdV5ShjT.d.ts +561 -0
package/dist/traces.d.ts +3 -2
package/dist/traces.js +5 -5
package/dist/types-BLbRTxoc.d.ts +367 -0
package/dist/wire/index.d.ts +1 -1
package/package.json +1 -6
package/dist/chunk-5U2DOJU4.js.map +0 -1
package/dist/chunk-AU2JLNSZ.js.map +0 -1
package/dist/chunk-DMW5VENN.js +0 -1412
package/dist/chunk-DMW5VENN.js.map +0 -1
package/dist/chunk-EGIPWXHL.js.map +0 -1
package/dist/chunk-MAZ26DC7.js +0 -99
package/dist/chunk-MAZ26DC7.js.map +0 -1
package/dist/chunk-NKLGKF2Q.js.map +0 -1
package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
package/dist/optimization.d.ts +0 -11
package/dist/optimization.js +0 -71
package/dist/optimization.js.map +0 -1
package/dist/sequential-5iSVfzl2.d.ts +0 -139
package/dist/summary-report-DuZXOk7K.d.ts +0 -917
package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0

package/dist/rl.d.ts CHANGED

@@ -1,16 +1,17 @@
 import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
-import { V as VerificationReport } from './multi-layer-verifier-BNi4-8lR.js';
-import { t as TrialResult, V as VariantAggregate, q as PromptEvolutionResult, e as MultiShotOptimizationResult } from './summary-report-DuZXOk7K.js';
+import { C as CampaignResult } from './types-BLbRTxoc.js';
+import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CoJMs2Iz.js';
+export { r as runEvalCampaign } from './researcher-CoJMs2Iz.js';
+import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
 import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
 import { b as RubricPredictiveValidityReport } from './rubric-predictive-validity-ByZEC3BX.js';
-import { I as InterimReleaseConfidence } from './sequential-5iSVfzl2.js';
-import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
-import { R as Researcher, l as FailureMode, S as SteeringChange, j as ExperimentPlan, k as ExperimentResult, i as EvalCampaignResult, E as EvalCampaignOptions } from './researcher-DeZ_EArp.js';
-export { r as runEvalCampaign } from './researcher-DeZ_EArp.js';
+import { I as InterimReleaseConfidence } from './sequential-DdV5ShjT.js';
 import './errors-mje_cKOs.js';
-import './failure-cluster-Cw65_5FY.js';
-import './integrity-DYR5gWlb.js';
+import './llm-client-BXVRUZyX.js';
+import './raw-provider-sink-C46HDghv.js';
 import './emitter-DP_cSSiw.js';
+import './integrity-CTDhR1Sg.js';
+import './failure-cluster-Cw65_5FY.js';
 /**
  * Test-time compute scaling curves.
@@ -529,17 +530,17 @@ declare function toAnthropicFormat(triples: PreferenceTriple[]): Array<{
 }>;
 /**
- * Adapters: convert `TrialResult[]` (from `runMultiShotOptimization`,
- * `runPromptEvolution`) into the canonical `RunRecord[]` artifact that
- * `replayCache`, `pairedEvalueSequence`, and `rubricPredictiveValidity`
- * consume.
- *
- * Adapters are thin and explicit — every mandatory `RunRecord` field
- * comes from a caller-supplied context (`commitSha`, `model`,
- * `promptHash`, `configHash`) plus the trial's runtime data. Defaults
- * exist for fields the trial doesn't carry (`tokenUsage`, `costUsd`),
- * but the validator still rejects records with bare-alias model strings
- * — the caller is responsible for snapshot-pinning.
+ * Adapters: convert measurement outputs into the canonical `RunRecord[]`
+ * artifact that `replayCache`, `pairedEvalueSequence`, and
+ * `rubricPredictiveValidity` consume. Two sources:
+ *   - `campaignToRunRecords` — the campaign substrate's per-cell results
+ *     (the modern path: `runCampaign` / `runImprovementLoop` → records).
+ *   - `verificationReportToRunRecord` — a `MultiLayerVerifier` report.
+ *
+ * Adapters are thin and explicit — every mandatory `RunRecord` field comes
+ * from a caller-supplied context (`commitSha`, `model`, `promptHash`,
+ * `configHash`) plus the cell's runtime data. The validator still rejects
+ * bare-alias model strings — the caller snapshot-pins.
  */
 interface AdapterContext {
@@ -550,41 +551,30 @@ interface AdapterContext {
     /** Git SHA the harness was run from. */
     commitSha: string;
     /** Hash of the effective prompt sent to the model. */
-    promptHash: string | ((t: TrialResult) => string);
+    promptHash: string;
     /** Hash of the effective config (model, temperature, tools, judges, splits). */
-    configHash: string | ((t: TrialResult) => string);
-    /** Default split tag. Default `'search'` — optimization sweeps run on the search split. */
+    configHash: string;
+    /** Default split tag. Default `'search'`. */
     splitTag?: RunSplitTag;
-    /** Default cost in USD when the trial doesn't record one. Default `0`. */
+    /** Default cost in USD when the source doesn't record one. Default `0`. */
     defaultCostUsd?: number;
 }
 /**
- * Convert one `TrialResult` (from `runPromptEvolution` or
- * `runMultiShotOptimization`) into a canonical `RunRecord`.
- *
- * The conversion is **not lossy** — every `TrialResult.metrics` field is
- * carried through to `outcome.raw`, plus a synthetic
- * `raw.cost_unknown = 1` flag when the trial omits cost (so downstream
- * filters can distinguish "free" from "untracked"). This preserves the
- * paper-grade contract: a record without a cost number is unbounded by
- * definition, but we don't drop the record.
+ * Convert a `CampaignResult` into canonical `RunRecord[]` — one record per
+ * scored cell. The cell's mean judge composite becomes the split score; every
+ * judge dimension is carried through to `outcome.raw`. A cell that errored
+ * becomes a record with `failureMode: 'cell_error'` (kept, not dropped — an
+ * unscored cell is signal). `candidateId` identifies the measured surface
+ * (defaults to the campaign manifest hash).
  */
-declare function trialToRunRecord(trial: TrialResult, ctx: AdapterContext, opts?: {
-    runId?: string;
-    experimentIdPerTrial?: (t: TrialResult) => string;
-}): RunRecord;
-/** Convenience: convert an array of `TrialResult` in one go. */
-declare function trialsToRunRecords(trials: TrialResult[], ctx: AdapterContext): RunRecord[];
+declare function campaignToRunRecords(campaign: CampaignResult, ctx: AdapterContext & {
+    candidateId?: string;
+}): RunRecord[];
 /**
  * Convert a `MultiLayerVerifier` `VerificationReport` into a `RunRecord`.
- *
- * The verifier produces per-layer results; we synthesize one canonical
- * record where:
- *   - `outcome.searchScore` (or `holdoutScore`) is `report.blendedScore`
- *   - `outcome.raw` carries every layer's score keyed `layer.<name>`
- *     plus a `layer_<name>_pass` 1/0 indicator
- *   - `failureMode` is taken from the first failing layer's `reason`
- *   - `wallMs` is `report.durationMs`
+ * `outcome.searchScore` (or `holdoutScore`) is `report.blendedScore`;
+ * `outcome.raw` carries every layer's score + a pass indicator; `failureMode`
+ * is the first failing layer's reason.
  */
 declare function verificationReportToRunRecord(report: VerificationReport, ctx: AdapterContext & {
     candidateId: string;
@@ -592,15 +582,6 @@ declare function verificationReportToRunRecord(report: VerificationReport, ctx:
 }, opts?: {
     runId?: string;
 }): RunRecord;
-/**
- * Convert a `VariantAggregate` (per-variant rollup from `prompt-evolution`)
- * into a synthetic `RunRecord` representing the aggregate. Useful when the
- * downstream consumer wants per-variant entries for a `researchReport`
- * rather than per-(variant, scenario, rep) trial entries.
- */
-declare function variantAggregateToRunRecord(agg: VariantAggregate, ctx: AdapterContext, opts?: {
-    runId?: string;
-}): RunRecord;
 /**
  * Bradley-Terry / Elo tournament evaluation.
@@ -1396,6 +1377,72 @@ interface StepRewardJsonlRow {
 }
 declare function stepRewardsToJsonl(stepRewards: StepReward[]): string;
+/**
+ * `PredictiveValidityResearcher` — concrete `Researcher` implementation
+ * that drives selection from outcome-anchored predictive validity.
+ *
+ * Each method:
+ *
+ *   - `inspectFailures(runs)` — synthesizes failure modes from the
+ *     bottom-quartile of `RunRecord`s on the configured proxy reward.
+ *   - `proposeChange(failures)` — proposes steering changes that target
+ *     the rubrics with the lowest predictive validity (decorative ones).
+ *     Either reduce their weight in the composite, or recalibrate them.
+ *   - `applyChange(changes, baseline)` — merges the proposed steering
+ *     into the experiment plan.
+ *   - `evaluateChange(plan)` — re-runs the predictive-validity check on
+ *     the post-change runs and reports the delta.
+ *
+ * The result is a closed loop: the rubric weights drift toward the ones
+ * that actually predict deployment outcomes, automatically. Pair with
+ * `runRLCampaign` for the full auto-research story.
+ */
+interface PredictiveValidityResearcherOptions {
+    outcomes: OutcomeStore;
+    outcomeMetrics: string[];
+    /** Score threshold below which a run counts as a "failure." Default 0.5. */
+    failureThreshold?: number;
+    /** Spearman bucket below which a rubric is "decorative." Default 0.4. */
+    decorativeThreshold?: number;
+    /** Optional steering-namespace prefix for proposed changes. Default `'rubric_weight'`. */
+    steeringNamespace?: string;
+    /** Override the rubric set the researcher inspects. Default: every numeric `outcome.raw` key seen. */
+    rubrics?: string[];
+    /**
+     * Snapshot stash hook — called with the most recent predictive-validity
+     * report. Useful when a downstream system wants to log rubric drift over
+     * time. Default no-op.
+     */
+    onReport?: (report: RubricPredictiveValidityReport) => void | Promise<void>;
+}
+/**
+ * Concrete `Researcher` driven by `rubricPredictiveValidity`. The brain:
+ * rubrics that don't predict deployment outcomes don't earn weight.
+ */
+declare class PredictiveValidityResearcher implements Researcher {
+    private opts;
+    private lastReport;
+    constructor(opts: PredictiveValidityResearcherOptions);
+    inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
+    proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
+    applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
+    evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
+    /**
+     * Run the predictive-validity check explicitly against a fresh RunRecord
+     * set. Updates the researcher's cached report so subsequent
+     * `proposeChange` calls have evidence to draw from.
+     */
+    runValidityCheck(runs: RunRecord[]): Promise<RubricPredictiveValidityReport>;
+    /**
+     * Force-feed a predictive-validity report into the researcher state —
+     * useful when the consumer ran the report out-of-band and wants the
+     * researcher's later proposals informed by it.
+     */
+    setReport(report: RubricPredictiveValidityReport): void;
+    getLastReport(): RubricPredictiveValidityReport | null;
+}
 /**
  * Reward hacking / Goodhart detection.
  *
@@ -1499,171 +1546,6 @@ interface DetectRewardHackingInput {
 }
 declare function detectRewardHacking(input: DetectRewardHackingInput): RewardHackingReport;
-/**
- * `analyzeOptimizationResult` — unifies the auto-research stack
- * (`runPromptEvolution`, `runMultiShotOptimization`, reflective-mutation,
- * Ax/AxRLM trace analyst) with the RL bridge in a single call.
- *
- * The optimization primitives produce `TrialResult[]`; the RL bridge
- * consumes `RunRecord[]`. Trace-analyst is independent of both. This
- * function does the wiring once so consumers don't have to:
- *
- *    Optimization (existing primitives)           RL bridge
- *    ──────────────────────────────────           ────────
- *    runPromptEvolution → TrialResult[]    →
- *    runMultiShotOptimization → MSTrial[]  → analyzeOptimizationResult →
- *    reflective-mutation → mutations.jsonl →                             ↓
- *                                                                        │
- *    ↓ (per-generation inputs flow back)                                 │
- *    PredictiveValidityResearcher.proposeChange  ←─────────────────────  │
- *                                                                        │
- *    ↓                                                                   │
- *    TraceAnalyst.analyze(progressLog)         ←─────────────────────────┘
- *
- * The output is the canonical RL artifact set: `RunRecord[]` (so every
- * other RL primitive composes), preference triples, verifiable reward
- * signals, reward-hacking diagnosis, sequential interim verdict, and
- * (when wired) trace-analyst summary.
- *
- * What this primitive does NOT do: it does not modify the optimization
- * primitives' internals. They keep producing `TrialResult` and emitting
- * `onProgress` events; this function bridges *after* the sweep completes.
- * Per-step capture-integrity (raw HTTP events from inside the score
- * adapter) requires the consumer to wire `RawProviderSink` into their
- * own `ScoreAdapter` — that's a per-consumer integration point.
- */
-interface AnalyzeOptimizationResultOptions {
-    /**
-     * The optimization output. Either a `PromptEvolutionResult` or a
-     * `MultiShotOptimizationResult`. The function detects which by
-     * structural typing and produces canonical `RunRecord[]` from either.
-     */
-    result: PromptEvolutionResult | MultiShotOptimizationResult;
-    /** Adapter context — `commitSha`, `model`, `promptHash`, `configHash`. */
-    ctx: AdapterContext;
-    /** Optional comparator candidate id for paired analyses. */
-    comparator?: string;
-    /** Verifiable-reward extraction options. */
-    verifiableReward?: VerifiableRewardExtractionOptions;
-    /** Preference extraction options. */
-    preferences?: ExtractPreferencesOptions;
-    /** Sequential interim-confidence options. */
-    sequential?: {
-        alpha?: number;
-        bound?: number;
-        rope?: {
-            low: number;
-            high: number;
-        };
-    };
-    /** Outcome calibration store + metrics. */
-    outcomes?: {
-        store: OutcomeStore;
-        metrics: string[];
-    };
-    /** Trainer-format export — DPO + GRPO lookups. */
-    trainerExport?: {
-        dpo?: DpoLookups;
-        grpo?: GrpoLookups;
-    };
-}
-interface AnalyzeOptimizationResultReport {
-    /** All trials promoted to canonical `RunRecord` shape. */
-    runs: RunRecord[];
-    /** Per-run verifiable reward signal. */
-    rewardSignals: Array<{
-        runId: string;
-        reward: VerifiableReward | null;
-    }>;
-    /** Preference triples ready for DPO/PPO/KTO training. */
-    preferences: PreferenceExtractionReport;
-    /** Anytime-valid sequential verdict, when a comparator is supplied. */
-    interimConfidence: InterimReleaseConfidence | null;
-    /** Standing reward-hacking hygiene check. */
-    rewardHacking: RewardHackingReport;
-    /** Predictive validity, when an outcome store is supplied. */
-    predictiveValidity: RubricPredictiveValidityReport | null;
-    /** Trainer-export rows, populated only for the formats requested. */
-    trainerRows: {
-        dpo?: DpoExportRow[];
-        grpo?: GrpoExportRow[];
-    };
-    /** One-line summary suitable for logs. */
-    summary: string;
-}
-/**
- * Convert an optimization sweep output into a fully-analysed RL artifact
- * set. Idempotent and read-only with respect to the optimization result.
- */
-declare function analyzeOptimizationResult(opts: AnalyzeOptimizationResultOptions): Promise<AnalyzeOptimizationResultReport>;
-/**
- * `PredictiveValidityResearcher` — concrete `Researcher` implementation
- * that drives selection from outcome-anchored predictive validity.
- *
- * Each method:
- *
- *   - `inspectFailures(runs)` — synthesizes failure modes from the
- *     bottom-quartile of `RunRecord`s on the configured proxy reward.
- *   - `proposeChange(failures)` — proposes steering changes that target
- *     the rubrics with the lowest predictive validity (decorative ones).
- *     Either reduce their weight in the composite, or recalibrate them.
- *   - `applyChange(changes, baseline)` — merges the proposed steering
- *     into the experiment plan.
- *   - `evaluateChange(plan)` — re-runs the predictive-validity check on
- *     the post-change runs and reports the delta.
- *
- * The result is a closed loop: the rubric weights drift toward the ones
- * that actually predict deployment outcomes, automatically. Pair with
- * `runRLCampaign` for the full auto-research story.
- */
-interface PredictiveValidityResearcherOptions {
-    outcomes: OutcomeStore;
-    outcomeMetrics: string[];
-    /** Score threshold below which a run counts as a "failure." Default 0.5. */
-    failureThreshold?: number;
-    /** Spearman bucket below which a rubric is "decorative." Default 0.4. */
-    decorativeThreshold?: number;
-    /** Optional steering-namespace prefix for proposed changes. Default `'rubric_weight'`. */
-    steeringNamespace?: string;
-    /** Override the rubric set the researcher inspects. Default: every numeric `outcome.raw` key seen. */
-    rubrics?: string[];
-    /**
-     * Snapshot stash hook — called with the most recent predictive-validity
-     * report. Useful when a downstream system wants to log rubric drift over
-     * time. Default no-op.
-     */
-    onReport?: (report: RubricPredictiveValidityReport) => void | Promise<void>;
-}
-/**
- * Concrete `Researcher` driven by `rubricPredictiveValidity`. The brain:
- * rubrics that don't predict deployment outcomes don't earn weight.
- */
-declare class PredictiveValidityResearcher implements Researcher {
-    private opts;
-    private lastReport;
-    constructor(opts: PredictiveValidityResearcherOptions);
-    inspectFailures(runs: RunRecord[]): Promise<FailureMode[]>;
-    proposeChange(failures: FailureMode[]): Promise<SteeringChange[]>;
-    applyChange(changes: SteeringChange[], baseline: ExperimentPlan): Promise<ExperimentPlan>;
-    evaluateChange(plan: ExperimentPlan): Promise<ExperimentResult>;
-    /**
-     * Run the predictive-validity check explicitly against a fresh RunRecord
-     * set. Updates the researcher's cached report so subsequent
-     * `proposeChange` calls have evidence to draw from.
-     */
-    runValidityCheck(runs: RunRecord[]): Promise<RubricPredictiveValidityReport>;
-    /**
-     * Force-feed a predictive-validity report into the researcher state —
-     * useful when the consumer ran the report out-of-band and wants the
-     * researcher's later proposals informed by it.
-     */
-    setReport(report: RubricPredictiveValidityReport): void;
-    getLastReport(): RubricPredictiveValidityReport | null;
-}
 /**
  * `runRLCampaign` — top-level orchestrator that runs the matrix and
  * produces every RL-ready artifact in one call.
@@ -1741,4 +1623,4 @@ interface RLCampaignResult<V> {
 }
 declare function runRLCampaign<V>(opts: RunRLCampaignOptions<V>): Promise<RLCampaignResult<V>>;
-export { type AdaptationCurve, type AdaptationPoint, type AdaptationRunner, type AdapterContext, type AdversarialMutation, type AdversarialScenario, type AdversarialSearchOptions, type AdversarialSearchReport, type AnalyzeOptimizationResultOptions, type AnalyzeOptimizationResultReport, type BradleyTerryFit, type BradleyTerryRating, type BuildPairwiseFromCampaignInput, type CellObservation, type CompareCurvesResult, type ComputeBestOfNOptions, type ComputeBestOfNResult, type ComputeCurve, type ComputeCurveBudget, type ComputeCurvePoint, type ContaminationProbeInput, type ContaminationProbeOptions, type ContaminationProbeReport, type CurriculumAllocation, type DetectRewardHackingInput, type DpoExportRow, type DpoLookups, type EloOptions, type ExtractPreferencesOptions, type ExtractStepRewardsOptions, type GrpoExportRow, type GrpoLookups, type OffPolicyEstimate, type OffPolicyOptions, type OffPolicyTrajectory, type PairwiseOutcome, type ParetoPointInput, PredictiveValidityResearcher, type PredictiveValidityResearcherOptions, type PreferenceExtractionReport, type PreferenceStrategy, type PreferenceTriple, type PrmExportRow, type PrmLookups, type PrmTrainingTriple, type RLCampaignResult, type RewardHackingFinding, type RewardHackingReport, type RewardHackingSignal, type RunAdaptationCurveOptions, type RunComputeCurveOptions, type RunRLCampaignOptions, type RunwiseStepSummary, type ScenarioPerturbation, type ScenarioPerturbationKind, type SelfConsistencyOptions, type SelfConsistencyResult, type SftExportRow, type SftLookups, type StepReward, type StepRewardJsonlRow, type StepScorer, type ThompsonCurriculumOptions, type VarianceCurriculumOptions, type VerifiableReward, type VerifiableRewardExtractionOptions, type VerifiableRewardSource, adversarialScenarioSearch, analyzeOptimizationResult, applyEloUpdate, bestOfN, buildPairwiseFromCampaign, compareAdaptationCurves, detectRewardHacking, doublyRobust, extractPreferences, extractStepRewards, extractVerifiableReward, 
extractVerifiableRewardsFromRecords, filterDeterministicallyRewarded, firstPassK, fitBradleyTerry, injectIrrelevantClause, inverseProbabilityWeighting, observationsFromRunRecords, offPolicyEstimateAll, paretoFrontier, prmTrainingPairs, renameVariables, runAdaptationCurve, runComputeCurve, runContaminationProbe, runRLCampaign, runwiseStepRewardSummary, selfConsistency, selfNormalizedImportanceWeighting, shuffleOrder, stepRewardsToJsonl, thompsonCurriculum, toAnthropicFormat, toDpoJsonl, toDpoRows, toGrpoJsonl, toGrpoRows, toPrmJsonl, toPrmRows, toSftJsonl, toSftRows, toTRLFormat, trialToRunRecord, trialsToRunRecords, varianceBasedCurriculum, variantAggregateToRunRecord, verificationReportToRunRecord };
+export { type AdaptationCurve, type AdaptationPoint, type AdaptationRunner, type AdapterContext, type AdversarialMutation, type AdversarialScenario, type AdversarialSearchOptions, type AdversarialSearchReport, type BradleyTerryFit, type BradleyTerryRating, type BuildPairwiseFromCampaignInput, type CellObservation, type CompareCurvesResult, type ComputeBestOfNOptions, type ComputeBestOfNResult, type ComputeCurve, type ComputeCurveBudget, type ComputeCurvePoint, type ContaminationProbeInput, type ContaminationProbeOptions, type ContaminationProbeReport, type CurriculumAllocation, type DetectRewardHackingInput, type DpoExportRow, type DpoLookups, type EloOptions, type ExtractPreferencesOptions, type ExtractStepRewardsOptions, type GrpoExportRow, type GrpoLookups, type OffPolicyEstimate, type OffPolicyOptions, type OffPolicyTrajectory, type PairwiseOutcome, type ParetoPointInput, PredictiveValidityResearcher, type PredictiveValidityResearcherOptions, type PreferenceExtractionReport, type PreferenceStrategy, type PreferenceTriple, type PrmExportRow, type PrmLookups, type PrmTrainingTriple, type RLCampaignResult, type RewardHackingFinding, type RewardHackingReport, type RewardHackingSignal, type RunAdaptationCurveOptions, type RunComputeCurveOptions, type RunRLCampaignOptions, type RunwiseStepSummary, type ScenarioPerturbation, type ScenarioPerturbationKind, type SelfConsistencyOptions, type SelfConsistencyResult, type SftExportRow, type SftLookups, type StepReward, type StepRewardJsonlRow, type StepScorer, type ThompsonCurriculumOptions, type VarianceCurriculumOptions, type VerifiableReward, type VerifiableRewardExtractionOptions, type VerifiableRewardSource, adversarialScenarioSearch, applyEloUpdate, bestOfN, buildPairwiseFromCampaign, campaignToRunRecords, compareAdaptationCurves, detectRewardHacking, doublyRobust, extractPreferences, extractStepRewards, extractVerifiableReward, extractVerifiableRewardsFromRecords, filterDeterministicallyRewarded, firstPassK, 
fitBradleyTerry, injectIrrelevantClause, inverseProbabilityWeighting, observationsFromRunRecords, offPolicyEstimateAll, paretoFrontier, prmTrainingPairs, renameVariables, runAdaptationCurve, runComputeCurve, runContaminationProbe, runRLCampaign, runwiseStepRewardSummary, selfConsistency, selfNormalizedImportanceWeighting, shuffleOrder, stepRewardsToJsonl, thompsonCurriculum, toAnthropicFormat, toDpoJsonl, toDpoRows, toGrpoJsonl, toGrpoRows, toPrmJsonl, toPrmRows, toSftJsonl, toSftRows, toTRLFormat, varianceBasedCurriculum, verificationReportToRunRecord };