npm - @tangle-network/agent-eval - Versions diffs - 0.22.0 → 0.23.0 - Mend

@tangle-network/agent-eval 0.22.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

package/CHANGELOG.md +134 -0
package/README.md +13 -3
package/dist/benchmarks/index.d.ts +2 -2
package/dist/{chunk-UAND2LOT.js → chunk-7EAUOUQS.js} +4 -247
package/dist/chunk-7EAUOUQS.js.map +1 -0
package/dist/chunk-AXHNWLIX.js +246 -0
package/dist/chunk-AXHNWLIX.js.map +1 -0
package/dist/chunk-EXGR4XEM.js +283 -0
package/dist/chunk-EXGR4XEM.js.map +1 -0
package/dist/chunk-LZKIOBG2.js +2026 -0
package/dist/chunk-LZKIOBG2.js.map +1 -0
package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
package/dist/chunk-QBW3YBTR.js.map +1 -0
package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
package/dist/{chunk-USHQBPMH.js → chunk-VQQSPGSM.js} +7 -283
package/dist/chunk-VQQSPGSM.js.map +1 -0
package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
package/dist/control.d.ts +3 -3
package/dist/control.js +2 -2
package/dist/{optimization-UVDNKaO6.d.ts → eval-campaign-Ds5QljIh.d.ts} +4 -5
package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
package/dist/index-ekBXweiQ.d.ts +1894 -0
package/dist/index.d.ts +18 -154
package/dist/index.js +125 -25
package/dist/index.js.map +1 -1
package/dist/{integrity-K2oVlF57.d.ts → integrity-Cr5YodSY.d.ts} +1 -1
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +5 -5
package/dist/optimization.js +7 -5
package/dist/reporting.d.ts +294 -4
package/dist/reporting.js +6 -4
package/dist/rl.d.ts +8 -0
package/dist/rl.js +113 -0
package/dist/rl.js.map +1 -0
package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
package/dist/sequential-DgU2mFsE.d.ts +304 -0
package/dist/{summary-report-D4p7RlDu.d.ts → summary-report-Ce1r4EYo.d.ts} +2 -2
package/dist/traces.d.ts +2 -2
package/dist/traces.js +5 -5
package/docs/auto-research-loop-end-to-end.md +186 -0
package/docs/three-package-architecture.md +180 -0
package/package.json +6 -1
package/dist/chunk-UAND2LOT.js.map +0 -1
package/dist/chunk-USHQBPMH.js.map +0 -1
package/dist/chunk-YUFXO3TU.js.map +0 -1
package/dist/reporting-B82RSv9C.d.ts +0 -593
/package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0

package/dist/index.d.ts CHANGED Viewed

@@ -1,25 +1,29 @@
 import { TCloud } from '@tangle-network/tcloud';
-import { R as ReleaseConfidenceThresholds, a as ReleaseConfidenceScorecard, O as OutcomeFilter, b as OutcomeStore } from './reporting-B82RSv9C.js';
-export { B as BootstrapOptions, c as BootstrapResult, D as DeploymentOutcome, F as FileSystemOutcomeStore, d as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, e as InterimReleaseConfidence, f as InterimReleaseConfidenceInput, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, g as PairedBootstrapResult, h as PairedEvalueOptions, i as PairedEvalueSequence, j as PairedEvalueStep, k as ReleaseConfidenceAxis, l as ReleaseConfidenceAxisName, m as ReleaseConfidenceInput, n as ReleaseConfidenceIssue, o as ReleaseConfidenceMetrics, p as ReleaseConfidenceStatus, q as ReleaseTraceEvidence, r as RenderReleaseReportOptions, s as RubricOutcomePair, t as RubricPredictiveValidityInput, u as RubricPredictiveValidityReport, v as RubricRanking, S as SequentialDecision, V as Verdict, w as assertReleaseConfidence, x as bhAdjust, y as bootstrapCi, z as evaluateInterimReleaseConfidence, A as evaluateReleaseConfidence, C as judgeReplayGate, E as pairedBootstrap, G as pairedEvalueSequence, H as pairedWilcoxon, K as releaseTraceEvidenceFromMultiShotTrials, L as renderReleaseReport, M as rubricPredictiveValidity } from './reporting-B82RSv9C.js';
-import { F as FeedbackLabel, a as FeedbackTrajectoryStore, b as FeedbackTrajectory, C as ControlSeverity, c as ControlEvalResult } from './feedback-trajectory-CB0A32o3.js';
-export { d as ControlActionFailureMode, e as ControlActionOutcome, f as ControlBudget, g as ControlContext, h as ControlDecision, i as ControlRunResult, j as ControlRuntimeConfig, k as ControlRuntimeError, l as ControlStep, m as ControlStopPolicies, n as FeedbackArtifactType, o as FeedbackAttempt, p as FeedbackLabelKind, q as FeedbackLabelSource, r as FeedbackOptimizerRow, s as FeedbackOutcome, t as FeedbackReplayAdapter, u as FeedbackReplayResult, v as FeedbackSeverity, w as FeedbackSplitPolicy, x as FeedbackTask, y as FeedbackTrajectoryFilter, z as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, A as ProposedSideEffect, S as StopDecision, B as allCriticalPassed, D as assignFeedbackSplit, E as controlRunToFeedbackTrajectory, G as createFeedbackTrajectory, H as feedbackTrajectoriesToDatasetScenarios, J as feedbackTrajectoriesToOptimizerRows, K as feedbackTrajectoryToDatasetScenario, L as feedbackTrajectoryToOptimizerRow, M as objectiveEval, N as parseFeedbackTrajectoriesJsonl, O as renderPreferenceMemoryMarkdown, Q as replayFeedbackTrajectories, R as replayFeedbackTrajectory, T as runAgentControlLoop, U as serializeFeedbackTrajectoriesJsonl, V as stopOnNoProgress, W as stopOnRepeatedAction, X as subjectiveEval, Y as summarizePreferenceMemory, Z as withAssignedFeedbackSplit } from './feedback-trajectory-CB0A32o3.js';
-export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-cxwMOAsy.js';
+import { ReleaseConfidenceThresholds, ReleaseConfidenceScorecard } from './reporting.js';
+export { BootstrapOptions, BootstrapResult, JudgeReplayGateArgs, PairedBootstrapOptions, PairedBootstrapResult, ReleaseConfidenceAxis, ReleaseConfidenceAxisName, ReleaseConfidenceInput, ReleaseConfidenceIssue, ReleaseConfidenceMetrics, ReleaseConfidenceStatus, ReleaseTraceEvidence, RenderReleaseReportOptions, Verdict, assertReleaseConfidence, bhAdjust, bootstrapCi, evaluateReleaseConfidence, judgeReplayGate, pairedBootstrap, pairedWilcoxon, releaseTraceEvidenceFromMultiShotTrials, renderReleaseReport } from './reporting.js';
+import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory, E as ControlSeverity, G as ControlEvalResult } from './feedback-trajectory-c43WGtTX.js';
+export { H as ControlActionFailureMode, J as ControlActionOutcome, K as ControlBudget, L as ControlContext, M as ControlDecision, N as ControlRunResult, O as ControlRuntimeConfig, Q as ControlRuntimeError, R as ControlStep, S as ControlStopPolicies, F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, T as StopDecision, U as allCriticalPassed, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, V as objectiveEval, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, W as runAgentControlLoop, B as serializeFeedbackTrajectoriesJsonl, X as stopOnNoProgress, Y as stopOnRepeatedAction, Z as subjectiveEval, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-c43WGtTX.js';
+export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-DvkH87qJ.js';
 import { T as TraceEmitter } from './emitter-B2XqDKFU.js';
 export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-B2XqDKFU.js';
-import { A as ActionableSideInfo, O as Objective, P as ParetoResult, T as TrialCache, a as TrialResult, E as EvolvableVariant, M as MutateAdapter, V as VariantAggregate } from './summary-report-D4p7RlDu.js';
-export { b as AsiSeverity, D as DEFAULT_FAILURE_RULES, c as DEFAULT_MUTATION_PRIMITIVES, d as Direction, F as FailureClassification, e as FailureCluster, f as FailureClusterReport, g as FailureContext, h as FailureRule, G as GainDistributionBin, i as GainDistributionFigureSpec, j as GainDistributionOptions, k as GateDecision, l as GateEvidence, m as GenerationReport, H as HeldOutGate, n as HeldOutGateConfig, o as HeldOutGateRejectionCode, I as InMemoryTrialCache, p as MultiShotGateConfig, q as MultiShotGateResult, r as MultiShotMutateAdapter, s as MultiShotOptimizationConfig, t as MultiShotOptimizationResult, u as MultiShotRun, v as MultiShotRunInput, w as MultiShotRunner, x as MultiShotScore, y as MultiShotScorer, z as MultiShotSplit, B as MultiShotTrace, C as MultiShotTrialResult, J as MultiShotVariant, K as ParetoFigureSpec, L as ParetoPoint, N as PromptEvolutionConfig, Q as PromptEvolutionEvent, R as PromptEvolutionResult, S as RESEARCH_REPORT_HARD_PAIR_FLOOR, U as ReflectionContext, W as ReflectionProposal, X as ResearchReport, Y as ResearchReportCandidate, Z as ResearchReportDecision, _ as ResearchReportMethodology, $ as ResearchReportOptions, a0 as ResearchReportRecommendation, a1 as ScenarioAggregate, a2 as ScoreAdapter, a3 as SummaryTable, a4 as SummaryTableOptions, a5 as SummaryTableRow, a6 as TrialTrace, a7 as buildReflectionPrompt, a8 as classifyFailure, a9 as crowdingDistance, aa as defaultMultiShotObjectives, ab as dominates, ac as failureClusterView, ad as gainHistogram, ae as paretoChart, af as paretoFrontier, ag as paretoFrontierWithCrowding, ah as parseReflectionResponse, ai as researchReport, aj as runMultiShotOptimization, ak as runPromptEvolution, al as scalarScore, am as summaryTable, an as trialTraceFromMultiShotTrial } from './summary-report-D4p7RlDu.js';
+import { A as ActionableSideInfo, O as Objective, J as ParetoResult, T as TrialCache, t as TrialResult, E as EvolvableVariant, o as MutateAdapter, V as VariantAggregate } from './summary-report-Ce1r4EYo.js';
+export { a as AsiSeverity, K as DEFAULT_FAILURE_RULES, D as DEFAULT_MUTATION_PRIMITIVES, L as Direction, N as FailureClassification, Q as FailureCluster, U as FailureClusterReport, W as FailureContext, X as FailureRule, Y as GainDistributionBin, Z as GainDistributionFigureSpec, _ as GainDistributionOptions, C as GateDecision, $ as GateEvidence, G as GenerationReport, a0 as HeldOutGate, a1 as HeldOutGateConfig, a2 as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, a3 as ParetoFigureSpec, a4 as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, a5 as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, H as ResearchReport, a6 as ResearchReportCandidate, a7 as ResearchReportDecision, a8 as ResearchReportMethodology, F as ResearchReportOptions, a9 as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, aa as SummaryTable, ab as SummaryTableOptions, ac as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ad as classifyFailure, ae as crowdingDistance, w as defaultMultiShotObjectives, af as dominates, ag as failureClusterView, ah as gainHistogram, ai as paretoChart, aj as paretoFrontier, ak as paretoFrontierWithCrowding, x as parseReflectionResponse, al as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, am as scalarScore, an as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-Ce1r4EYo.js';
 import { a as Run$1, S as Span, f as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, F as FailureClass, g as BudgetSpec, c as ToolSpan, h as RunFilter, L as LlmSpan, J as JudgeSpan } from './store-u47QaJ9G.js';
 export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, M as Message, d as RetrievalSpan, m as RunLayer, n as RunStatus, e as SandboxSpan, o as SpanBase, p as SpanFilter, b as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-u47QaJ9G.js';
 import { llmSpans } from './traces.js';
 export { AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalysisStore, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, aggregateLlm, analyzeTraces, argHash, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, groupBy, inferDomainKeywords, iterateRawCalls, judgeSpans, planTraceInsightQuestions, redactString, redactValue, runFailureClass, runsForScenario, scoreTraceInsightReadiness, tokenizeDomainWords, toolSpans, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
-export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, R as RawProviderDirection, c as RawProviderEvent, d as RawProviderSink, e as RawProviderSinkFilter, f as RunIntegrityError, g as RunIntegrityExpectations, h as RunIntegrityIssue, i as RunIntegrityIssueCode, j as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-K2oVlF57.js';
+export { F as FileSystemRawProviderSink, c as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, d as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, e as RawProviderDirection, f as RawProviderEvent, R as RawProviderSink, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-Cr5YodSY.js';
 import { a as DatasetScenario, b as Dataset, c as DatasetManifest } from './dataset-B9qvlm_o.js';
 export { d as DatasetDifficulty, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-B9qvlm_o.js';
-import { L as LlmClientOptions } from './optimization-UVDNKaO6.js';
-export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, m as LlmCallError, n as LlmCallRequest, o as LlmCallResult, p as LlmClient, q as LlmMessage, r as LlmRouteAssertionError, s as LlmRouteRequirements, t as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, u as assertLlmRoute, v as callLlm, w as callLlmJson, x as probeLlm, y as runEvalCampaign, z as stripFencedJson } from './optimization-UVDNKaO6.js';
-import { a as RunRecord } from './run-record-CX_jcAyr.js';
-export { b as RunJudgeMetadata, c as RunOutcome, d as RunRecordValidationError, R as RunSplitTag, e as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-CX_jcAyr.js';
-export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-c5saLbKD.js';
+import { O as OutcomeFilter, a as OutcomeStore } from './sequential-DgU2mFsE.js';
+export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, c as InterimReleaseConfidence, d as InterimReleaseConfidenceInput, P as PairedEvalueOptions, e as PairedEvalueSequence, f as PairedEvalueStep, R as RubricOutcomePair, g as RubricPredictiveValidityInput, h as RubricPredictiveValidityReport, i as RubricRanking, S as SequentialDecision, j as evaluateInterimReleaseConfidence, p as pairedEvalueSequence, r as rubricPredictiveValidity } from './sequential-DgU2mFsE.js';
+import { L as LlmClientOptions } from './eval-campaign-Ds5QljIh.js';
+export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, m as LlmCallError, n as LlmCallRequest, o as LlmCallResult, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './eval-campaign-Ds5QljIh.js';
+import { L as LayerResult, V as VerifyContext, a as Layer, S as Severity } from './index-ekBXweiQ.js';
+export { A as AdaptationCurve, b as AdaptationPoint, c as AdaptationRunner, d as AdapterContext, e as AdversarialMutation, f as AdversarialScenario, g as AdversarialSearchOptions, h as AdversarialSearchReport, i as AnalyzeOptimizationResultOptions, j as AnalyzeOptimizationResultReport, B as BradleyTerryFit, k as BradleyTerryRating, l as BuildPairwiseFromCampaignInput, C as CellObservation, m as CompareCurvesResult, n as ComputeBestOfNOptions, o as ComputeBestOfNResult, p as ComputeCurve, q as ComputeCurveBudget, r as ComputeCurvePoint, s as ContaminationProbeInput, t as ContaminationProbeOptions, u as ContaminationProbeReport, v as CurriculumAllocation, D as DetectRewardHackingInput, w as DpoExportRow, x as DpoLookups, E as EloOptions, y as ExtractPreferencesOptions, z as ExtractStepRewardsOptions, F as Finding, G as GrpoExportRow, H as GrpoLookups, I as LayerStatus, M as MultiLayerVerifier, O as OffPolicyEstimate, J as OffPolicyOptions, K as OffPolicyTrajectory, P as PairwiseOutcome, N as ParetoPointInput, Q as PredictiveValidityResearcher, R as PredictiveValidityResearcherOptions, T as PreferenceExtractionReport, U as PreferenceStrategy, W as PreferenceTriple, X as PrmExportRow, Y as PrmLookups, Z as PrmTrainingTriple, _ as RLCampaignResult, $ as RewardHackingFinding, a0 as RewardHackingReport, a1 as RewardHackingSignal, a2 as RunAdaptationCurveOptions, a3 as RunComputeCurveOptions, a4 as RunRLCampaignOptions, a5 as RunwiseStepSummary, a6 as ScenarioPerturbation, a7 as ScenarioPerturbationKind, a8 as SelfConsistencyOptions, a9 as SelfConsistencyResult, aa as SftExportRow, ab as SftLookups, ac as StepReward, ad as StepRewardJsonlRow, ae as StepScorer, af as ThompsonCurriculumOptions, ag as VarianceCurriculumOptions, ah as VerifiableReward, ai as VerifiableRewardExtractionOptions, aj as VerifiableRewardSource, ak as VerificationReport, al as VerifyOptions, am as adversarialScenarioSearch, an as analyzeOptimizationResult, ao as applyEloUpdate, ap as bestOfN, aq as buildPairwiseFromCampaign, ar as compareAdaptationCurves, as as detectRewardHacking, at as doublyRobust, au as extractPreferences, av as extractStepRewards, aw as extractVerifiableReward, ax as extractVerifiableRewardsFromRecords, ay as filterDeterministicallyRewarded, az as firstPassK, aA as fitBradleyTerry, aB as gradeSemanticStatus, aC as injectIrrelevantClause, aD as inverseProbabilityWeighting, aE as observationsFromRunRecords, aF as offPolicyEstimateAll, aG as prmTrainingPairs, aH as renameVariables, aI as runAdaptationCurve, aJ as runComputeCurve, aK as runContaminationProbe, aL as runRLCampaign, aM as runwiseStepRewardSummary, aN as selfConsistency, aO as selfNormalizedImportanceWeighting, aP as shuffleOrder, aQ as stepRewardsToJsonl, aR as thompsonCurriculum, aS as toAnthropicFormat, aT as toDpoJsonl, aU as toDpoRows, aV as toGrpoJsonl, aW as toGrpoRows, aX as toPrmJsonl, aY as toPrmRows, aZ as toSftJsonl, a_ as toSftRows, a$ as toTRLFormat, b0 as trialToRunRecord, b1 as trialsToRunRecords, b2 as varianceBasedCurriculum, b3 as variantAggregateToRunRecord, b4 as verificationReportToRunRecord } from './index-ekBXweiQ.js';
+import { R as RunRecord } from './run-record-DNiOMBrZ.js';
+export { c as RunJudgeMetadata, d as RunOutcome, e as RunRecordValidationError, a as RunSplitTag, b as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-DNiOMBrZ.js';
+export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-DDTlbHEK.js';
 import '@ax-llm/ax';
 interface Scenario {
@@ -4226,146 +4230,6 @@ interface UseCaseSignals {
 declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
 declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
-/**
- * Multi-layer verifier — ordered pipeline of verification layers.
- *
- * Different contract from {@link JudgeRunner} (which runs parallel
- * specs against a sandbox). MultiLayerVerifier is a DAG of layers
- * (install → typecheck → build → lint → serve → semantic → …) with
- * dependency-based skip, per-layer findings, soft-fail semantics, and
- * an aggregated `blendedScore` across all passed layers.
- *
- * Use when you want:
- *   - ordered stages where a failing upstream stage skips downstream ones
- *   - each stage produces rich `findings` (severity + message + evidence)
- *   - a single composite score across stages with per-stage weights
- *   - soft-fail stages whose failure doesn't abort the pipeline
- *
- * Use {@link JudgeRunner} when you want:
- *   - N independent judges running in parallel against the same artifact
- *   - no inter-judge dependencies
- *   - boolean `passed` per judge + overall
- *
- * Both primitives compose — JudgeRunner can be invoked as a single
- * layer inside a MultiLayerVerifier if that suits the caller.
- */
-type LayerStatus = 'pass' | 'fail' | 'skipped' | 'error' | 'timeout';
-type Severity = 'critical' | 'major' | 'minor' | 'info';
-interface Finding {
-    severity: Severity;
-    message: string;
-    evidence?: string;
-    /** Optional layer name the finding belongs to (set by the verifier if omitted). */
-    layer?: string;
-    /**
-     * Free-form structured payload — used by `multiToolchainLayer` to attach
-     * `{ adapter: 'pnpm' }`, by judges to attach evidence pointers, etc.
-     * Renderers MAY interrogate; agent-eval primitives never assume shape.
-     */
-    detail?: Record<string, unknown>;
-}
-interface LayerResult {
-    layer: string;
-    status: LayerStatus;
-    /** 0..1 score, optional — layers that don't produce a numeric score omit. */
-    score?: number;
-    durationMs: number;
-    findings: Finding[];
-    /** Short human-readable summary (one line). */
-    reason?: string;
-    /**
-     * Numeric layer-level diagnostics: error counts, warning counts,
-     * cyclomatic complexity, total adapter wall-time, etc. Keyed by
-     * diagnostic name; null = "diagnostic not applicable / not measured."
-     * Renderers that know the keys can display them; ones that don't,
-     * ignore. Free-form on purpose — consumers type the value shape in
-     * their own namespace. Added in 0.10.
-     */
-    diagnostics?: Record<string, number | null>;
-    /** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
-    detail?: Record<string, unknown>;
-}
-interface VerifyContext<Env = unknown> {
-    /** Per-run opaque context the caller provides. Layers destructure what they need. */
-    env: Env;
-    /** Previously-computed results from layers that already ran. */
-    prior: Record<string, LayerResult>;
-    /** Signal — if aborted, layers MUST bail within reasonable wall. */
-    signal: AbortSignal;
-}
-interface Layer<Env = unknown> {
-    name: string;
-    /** Stages that must have `status: 'pass'` before this layer runs. */
-    dependsOn?: string[];
-    /**
-     * Weight in the composite `blendedScore`. Default 1.0. Layers with weight 0
-     * contribute findings but not score.
-     */
-    weight?: number;
-    /**
-     * If true, a `fail` status contributes to `blendedScore` (as 0) instead of
-     * being dropped — use for layers whose failure is a real signal. Default:
-     * fail drops from numerator + denominator, matching VB's existing semantics.
-     */
-    failContributesToScore?: boolean;
-    /** Optional per-layer wall-cap in ms. Honored by the verifier (AbortSignal). */
-    capMs?: number;
-    run: (ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
-}
-interface VerifyOptions<Env = unknown> {
-    env: Env;
-    /**
-     * Overall wall cap. Default: sum of layer capMs, or Infinity if any layer
-     * omits a cap. The verifier short-circuits remaining layers on overall cap.
-     */
-    overallCapMs?: number;
-    /** Called with each layer result as it completes. */
-    onLayer?: (result: LayerResult) => void;
-}
-interface VerificationReport {
-    layers: LayerResult[];
-    passCount: number;
-    failCount: number;
-    skippedCount: number;
-    errorCount: number;
-    /** True iff at least one scored layer ran AND every scored layer passed. */
-    allPass: boolean;
-    /**
-     * Weighted mean of `score` across contributing layers. 0 when no layers
-     * contributed. See {@link Layer.failContributesToScore} for fail semantics.
-     */
-    blendedScore: number;
-    durationMs: number;
-    startedAt: string;
-    finishedAt: string;
-}
-/**
- * Grade a semantic-concept-style judge result into a single layer status.
- *
- * Pass when overall score >= threshold AND no critical-severity concept gap.
- * Fail otherwise. Use inside a `Layer.run` when wrapping a concept judge.
- *
- * Generalized from VerticalBench H3 fix: `failingConcepts.length === 0` was
- * too strict — a single concept at 6/10 failed the entire layer despite
- * overall score being >= 0.7. Now we trust the judge's own `severity` field:
- * `critical` findings veto; `major`/`minor` reduce the score but don't veto.
- */
-declare function gradeSemanticStatus(input: {
-    score: number;
-    findings: Array<{
-        severity: Severity;
-        present?: boolean;
-        score?: number;
-    }>;
-    available: boolean;
-    threshold?: number;
-}): LayerStatus;
-declare class MultiLayerVerifier<Env = unknown> {
-    private readonly layers;
-    constructor(layers: Layer<Env>[]);
-    run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
-}
 /**
  * CommandRunner — abstract subprocess execution surface.
  *
@@ -6038,4 +5902,4 @@ interface OrthogonalityResult {
 }
 declare function passOrthogonality<T>(input: OrthogonalityInput<T>): OrthogonalityResult;
-export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, BudgetLedgerEntry, BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ControlSeverity, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, type DataAcquisitionPlan, Dataset, DatasetManifest, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DirEntry, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FailureClass, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OutcomeFilter, type OutcomePair, OutcomeStore, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegressionOptions, type RegressionSpec, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, TraceEvent, TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerbosityBiasResult, type VerificationReport, type VerifyContext, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateRunScore, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, blockingKnowledgeEval, bonferroni, budgetBreachView, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportTrainingData, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, gradeSemanticStatus, hashContent, hashJson, htmlContainsElement, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, iqr, isPrmVerdict, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeAgreementView, keyPreserved, knowledgeReadinessTracePayload, linterJudge, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSuccessRubric, toolWasteView, typoMutator, urlContains, userQuestionsForKnowledgeGaps, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };
+export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, BudgetLedgerEntry, BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ControlSeverity, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, type DataAcquisitionPlan, Dataset, DatasetManifest, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DirEntry, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FailureClass, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, type LayerCorrelation, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OutcomeFilter, type OutcomePair, OutcomeStore, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegressionOptions, type RegressionSpec, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, TraceEvent, TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerbosityBiasResult, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateRunScore, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, blockingKnowledgeEval, bonferroni, budgetBreachView, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportTrainingData, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, iqr, isPrmVerdict, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, judgeAgreementView, keyPreserved, knowledgeReadinessTracePayload, linterJudge, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSuccessRubric, toolWasteView, typoMutator, urlContains, userQuestionsForKnowledgeGaps, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };

package/dist/index.js CHANGED Viewed

@@ -19,7 +19,7 @@ import {
   stopOnNoProgress,
   stopOnRepeatedAction,
   subjectiveEval
-} from "./chunk-ARZ6BEV6.js";
+} from "./chunk-V5QSWN7L.js";
 import {
   CallbackResearcher,
   DEFAULT_MUTATION_PRIMITIVES,
@@ -46,7 +46,6 @@ import {
   renderPreferenceMemoryMarkdown,
   replayFeedbackTrajectories,
   replayFeedbackTrajectory,
-  runEvalCampaign,
   runMultiShotOptimization,
   runPromptEvolution,
   scalarScore,
@@ -54,25 +53,89 @@ import {
   summarizePreferenceMemory,
   trialTraceFromMultiShotTrial,
   withAssignedFeedbackSplit
-} from "./chunk-USHQBPMH.js";
+} from "./chunk-VQQSPGSM.js";
 import {
   RunRecordValidationError,
   isRunRecord,
   parseRunRecordSafe,
   roundTripRunRecord,
   validateRunRecord
-} from "./chunk-YUFXO3TU.js";
+} from "./chunk-QBW3YBTR.js";
 import {
   assertReleaseConfidence,
   bootstrapCi,
-  evaluateInterimReleaseConfidence,
   evaluateReleaseConfidence,
   judgeReplayGate,
-  pairedEvalueSequence,
   releaseTraceEvidenceFromMultiShotTrials,
-  renderReleaseReport,
+  renderReleaseReport
+} from "./chunk-7EAUOUQS.js";
+import {
+  PredictiveValidityResearcher,
+  adversarialScenarioSearch,
+  analyzeOptimizationResult,
+  applyEloUpdate,
+  bestOfN,
+  buildPairwiseFromCampaign,
+  compareAdaptationCurves,
+  detectRewardHacking,
+  doublyRobust,
+  extractPreferences,
+  extractStepRewards,
+  extractVerifiableReward,
+  extractVerifiableRewardsFromRecords,
+  filterDeterministicallyRewarded,
+  firstPassK,
+  fitBradleyTerry,
+  injectIrrelevantClause,
+  inverseProbabilityWeighting,
+  observationsFromRunRecords,
+  offPolicyEstimateAll,
+  prmTrainingPairs,
+  renameVariables,
+  runAdaptationCurve,
+  runComputeCurve,
+  runContaminationProbe,
+  runRLCampaign,
+  runwiseStepRewardSummary,
+  selfConsistency,
+  selfNormalizedImportanceWeighting,
+  shuffleOrder,
+  stepRewardsToJsonl,
+  thompsonCurriculum,
+  toAnthropicFormat,
+  toDpoJsonl,
+  toDpoRows,
+  toGrpoJsonl,
+  toGrpoRows,
+  toPrmJsonl,
+  toPrmRows,
+  toSftJsonl,
+  toSftRows,
+  toTRLFormat,
+  trialToRunRecord,
+  trialsToRunRecords,
+  varianceBasedCurriculum,
+  variantAggregateToRunRecord,
+  verificationReportToRunRecord
+} from "./chunk-LZKIOBG2.js";
+import {
+  runEvalCampaign
+} from "./chunk-EXGR4XEM.js";
+import {
+  LlmCallError,
+  LlmClient,
+  LlmRouteAssertionError,
+  assertLlmRoute,
+  callLlm,
+  callLlmJson,
+  probeLlm,
+  stripFencedJson
+} from "./chunk-KAO3Q65R.js";
+import {
+  evaluateInterimReleaseConfidence,
+  pairedEvalueSequence,
   rubricPredictiveValidity
-} from "./chunk-UAND2LOT.js";
+} from "./chunk-AXHNWLIX.js";
 import {
   RESEARCH_REPORT_HARD_PAIR_FLOOR,
   benjaminiHochberg,
@@ -151,6 +214,13 @@ import {
   assertRunCaptured,
   throwIfRunIncomplete
 } from "./chunk-QUKKGHTZ.js";
+import {
+  FileSystemRawProviderSink,
+  InMemoryRawProviderSink,
+  NoopRawProviderSink,
+  defaultProviderRedactor,
+  providerFromBaseUrl
+} from "./chunk-SQQLHODJ.js";
 import {
   TraceEmitter,
   llmSpanFromProvider
@@ -162,23 +232,6 @@ import {
   signManifest,
   verifyManifest
 } from "./chunk-6M774GY6.js";
-import {
-  LlmCallError,
-  LlmClient,
-  LlmRouteAssertionError,
-  assertLlmRoute,
-  callLlm,
-  callLlmJson,
-  probeLlm,
-  stripFencedJson
-} from "./chunk-KAO3Q65R.js";
-import {
-  FileSystemRawProviderSink,
-  InMemoryRawProviderSink,
-  NoopRawProviderSink,
-  defaultProviderRedactor,
-  providerFromBaseUrl
-} from "./chunk-SQQLHODJ.js";
 import "./chunk-PZ5AY32C.js";
 // src/client.ts
@@ -10498,6 +10551,7 @@ export {
   OTEL_AGENT_EVAL_SCOPE,
   OtlpFileTraceStore,
   PairwiseSteeringOptimizer,
+  PredictiveValidityResearcher,
   PrmGrader,
   ProductClient,
   ProjectRegistry,
@@ -10527,12 +10581,15 @@ export {
   UNIVERSAL_FINDERS,
   acquisitionPlansForKnowledgeGaps,
   adversarialJudge,
+  adversarialScenarioSearch,
   aggregateLlm,
   aggregateRunScore,
   allCriticalPassed,
   analyzeAntiSlop,
+  analyzeOptimizationResult,
   analyzeSeries,
   analyzeTraces,
+  applyEloUpdate,
   argHash,
   assertLlmRoute,
   assertReleaseConfidence,
@@ -10542,12 +10599,14 @@ export {
   deterministicSplit as benchmarkDeterministicSplit,
   benchmarks_exports as benchmarks,
   benjaminiHochberg,
+  bestOfN,
   bhAdjust,
   bisect,
   blockingKnowledgeEval,
   bonferroni,
   bootstrapCi,
   budgetBreachView,
+  buildPairwiseFromCampaign,
   buildReflectionPrompt,
   buildReviewerPrompt,
   buildTraceAnalystTools,
@@ -10573,6 +10632,7 @@ export {
   coherenceJudge,
   collectionPreserved,
   commitBisect,
+  compareAdaptationCurves,
   compareReferenceReplay,
   compareToBaseline,
   compilerJudge,
@@ -10609,9 +10669,11 @@ export {
   defaultTraceInsightPanel,
   deployGateLayer,
   describeTraceInsightScope,
+  detectRewardHacking,
   distillPlaybook,
   domainEvidencePattern,
   dominates,
+  doublyRobust,
   estimateCost,
   estimateTokens,
   euAiActReport,
@@ -10628,6 +10690,10 @@ export {
   exportTrainingData,
   extractAssetUrls,
   extractErrorCount,
+  extractPreferences,
+  extractStepRewards,
+  extractVerifiableReward,
+  extractVerifiableRewardsFromRecords,
   failureClusterView,
   feedbackTrajectoriesToDatasetScenarios,
   feedbackTrajectoriesToOptimizerRows,
@@ -10635,12 +10701,15 @@ export {
   feedbackTrajectoryToOptimizerRow,
   fileContains,
   fileExists,
+  filterDeterministicallyRewarded,
   findAutoMatchNoExpectation,
   findConstructorCwdDropped,
   findFallbackToPass,
   findLiteralTruePass,
   findSkipCountsAsPass,
   firstDivergenceView,
+  firstPassK,
+  fitBradleyTerry,
   flowLayer,
   formatBenchmarkReport,
   formatDriverReport,
@@ -10656,12 +10725,14 @@ export {
   inMemoryReferenceReplayStore,
   inMemoryReviewStore,
   inferDomainKeywords,
+  injectIrrelevantClause,
   integrationAsi,
   integrationGateEvals,
   integrationInvokeFailedPayload,
   integrationManifestResolvedPayload,
   integrationManifestValidatedPayload,
   interRaterReliability,
+  inverseProbabilityWeighting,
   iqr,
   isJudgeSpan,
   isLlmSpan,
@@ -10697,6 +10768,8 @@ export {
   normalizeScores,
   notBlocked,
   objectiveEval,
+  observationsFromRunRecords,
+  offPolicyEstimateAll,
   outputLengthRubric,
   pairedBootstrap,
   pairedEvalueSequence,
@@ -10719,6 +10792,7 @@ export {
   printDriverSummary,
   prmBestOfN,
   prmEnsembleBestOfN,
+  prmTrainingPairs,
   probeLlm,
   promptBisect,
   proposeSynthesisTargets,
@@ -10734,6 +10808,7 @@ export {
   regexMatches,
   regressionView,
   releaseTraceEvidenceFromMultiShotTrials,
+  renameVariables,
   renderMarkdown,
   renderMarkdownReport,
   renderPlaybookMarkdown,
@@ -10752,10 +10827,13 @@ export {
   rowCount,
   rowWhere,
   rubricPredictiveValidity,
+  runAdaptationCurve,
   runAgentControlLoop,
   runAssertions,
   runBehavioralCanaries,
   runCanaries,
+  runComputeCurve,
+  runContaminationProbe,
   runCounterfactual,
   runE2EWorkflow,
   runEvalCampaign,
@@ -10771,11 +10849,13 @@ export {
   runPromptEvolution,
   runProposeReview,
   runProposeReviewAsControlLoop,
+  runRLCampaign,
   runReferenceReplay,
   runSelfPlay,
   runSemanticConceptJudge,
   runTestGradedScenario,
   runsForScenario,
+  runwiseStepRewardSummary,
   scalarScore,
   scanForMuffledGates,
   scoreAllProjects,
@@ -10788,12 +10868,16 @@ export {
   scoreTraceInsightReadiness,
   securityJudge,
   selectHarnessVariant,
+  selfConsistency,
+  selfNormalizedImportanceWeighting,
   selfPreference,
   sentenceReorderMutator,
   serializeFeedbackTrajectoriesJsonl,
+  shuffleOrder,
   signManifest,
   soc2Report,
   statusAdvanced,
+  stepRewardsToJsonl,
   stopOnNoProgress,
   stopOnRepeatedAction,
   stripFencedJson,
@@ -10805,10 +10889,21 @@ export {
   summaryTable,
   testJudge,
   textInSnapshot,
+  thompsonCurriculum,
   throwIfRunIncomplete,
+  toAnthropicFormat,
+  toDpoJsonl,
+  toDpoRows,
+  toGrpoJsonl,
+  toGrpoRows,
   toLangfuseEnvelope,
   toNdjson,
+  toPrmJsonl,
+  toPrmRows,
   toPrometheusText,
+  toSftJsonl,
+  toSftRows,
+  toTRLFormat,
   tokenizeDomainWords,
   toolIntentAlignmentRubric,
   toolNamesForRun,
@@ -10818,12 +10913,17 @@ export {
   toolWasteView,
   traceAnalystFunctionGroup,
   traceAnalystOnRunComplete,
+  trialToRunRecord,
   trialTraceFromMultiShotTrial,
+  trialsToRunRecords,
   typoMutator,
   urlContains,
   userQuestionsForKnowledgeGaps,
   validateRunRecord,
+  varianceBasedCurriculum,
+  variantAggregateToRunRecord,
   verbosityBias,
+  verificationReportToRunRecord,
   verifyManifest,
   visualDiff,
   viteDeployRunner,