npm - @tangle-network/agent-eval - Versions diffs - 0.27.0 → 0.27.2 - Mend

@tangle-network/agent-eval 0.27.0 → 0.27.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (80)

package/CHANGELOG.md +72 -0
package/README.md +4 -5
package/dist/builder-eval/index.js +1 -1
package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
package/dist/{chunk-K2TPS5LB.js → chunk-4U4BKCXK.js} +2 -2
package/dist/chunk-4U4BKCXK.js.map +1 -0
package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
package/dist/chunk-5AKPEK5L.js.map +1 -0
package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
package/dist/chunk-K33INZHH.js.map +1 -0
package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
package/dist/chunk-MAZ26DC7.js.map +1 -0
package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
package/dist/chunk-NCRFYPS3.js.map +1 -0
package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
package/dist/chunk-QHF6EQKK.js.map +1 -0
package/dist/chunk-R5UQJNKC.js +722 -0
package/dist/chunk-R5UQJNKC.js.map +1 -0
package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
package/dist/chunk-RUI6SIHY.js.map +1 -0
package/dist/{chunk-WHZMVFUV.js → chunk-SZSBQUIJ.js} +2 -2
package/dist/chunk-SZSBQUIJ.js.map +1 -0
package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
package/dist/chunk-VSMTAMNK.js.map +1 -0
package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
package/dist/chunk-XFZCM5Z3.js.map +1 -0
package/dist/cli.js +1 -1
package/dist/{control-CBShYYA6.d.ts → control-BT4qnXiS.d.ts} +2 -2
package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BZ_lVLYW.d.ts} +1 -0
package/dist/control.d.ts +3 -3
package/dist/control.js +2 -2
package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -2
package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-D1aGKusy.d.ts} +1 -1
package/dist/governance/index.d.ts +1 -1
package/dist/{index-D3iBCjdF.d.ts → index-BhLlu-qO.d.ts} +1 -1
package/dist/index.d.ts +157 -167
package/dist/index.js +25 -335
package/dist/index.js.map +1 -1
package/dist/knowledge/index.d.ts +1 -1
package/dist/knowledge/index.js +2 -2
package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-U-c8ge1k.d.ts} +1 -1
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +5 -5
package/dist/optimization.js +5 -5
package/dist/pipelines/index.d.ts +1 -1
package/dist/pipelines/index.js +2 -2
package/dist/{release-report-wfUySN5F.d.ts → release-report-CCQqnK46.d.ts} +1 -1
package/dist/{replay-BL96gCEP.d.ts → replay-D7z0J43-.d.ts} +4 -5
package/dist/reporting.d.ts +4 -4
package/dist/reporting.js +5 -5
package/dist/{researcher-bGkI7vCl.d.ts → researcher-G81CWc0q.d.ts} +9 -10
package/dist/rl.d.ts +26 -44
package/dist/rl.js +5 -5
package/dist/rl.js.map +1 -1
package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
package/dist/{summary-report-DZVXOCK_.d.ts → summary-report-Dl4akLKX.d.ts} +5 -5
package/dist/traces.d.ts +1 -1
package/dist/traces.js +2 -2
package/dist/wire/index.d.ts +2 -2
package/dist/wire/index.js +1 -1
package/docs/research-report-methodology.md +4 -4
package/docs/three-package-architecture.md +12 -24
package/package.json +1 -1
package/dist/chunk-2A5XJB43.js.map +0 -1
package/dist/chunk-4F5DQN55.js.map +0 -1
package/dist/chunk-5LBB5B3Z.js.map +0 -1
package/dist/chunk-I4MBDTY5.js +0 -272
package/dist/chunk-I4MBDTY5.js.map +0 -1
package/dist/chunk-JLZQWFV3.js.map +0 -1
package/dist/chunk-K2TPS5LB.js.map +0 -1
package/dist/chunk-LSH4MMOZ.js.map +0 -1
package/dist/chunk-NU65VQ7M.js.map +0 -1
package/dist/chunk-OWLAAMME.js.map +0 -1
package/dist/chunk-SESZDQPX.js.map +0 -1
package/dist/chunk-WHZMVFUV.js.map +0 -1
package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0

package/dist/index.d.ts CHANGED

@@ -1,28 +1,30 @@
-export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-CBShYYA6.js';
+export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-BT4qnXiS.js';
 import { TCloud } from '@tangle-network/tcloud';
-import { C as ControlEvalResult } from './control-runtime-BuJHoLg0.js';
-export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BuJHoLg0.js';
+import { C as ControlEvalResult } from './control-runtime-BZ_lVLYW.js';
+export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
 import { A as AgentEvalError } from './errors-BZ9sTdz7.js';
 export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-BZ9sTdz7.js';
-import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-DfFdrraJ.js';
-export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-DfFdrraJ.js';
-import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-DZVXOCK_.js';
-export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-DZVXOCK_.js';
+import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-D1aGKusy.js';
+export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-D1aGKusy.js';
+import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-Dl4akLKX.js';
+export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-Dl4akLKX.js';
 export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
-import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-wfUySN5F.js';
-export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-wfUySN5F.js';
-import { a as FailureCluster } from './failure-cluster-C2EGSDiT.js';
-export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-C2EGSDiT.js';
+import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-CCQqnK46.js';
+export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-CCQqnK46.js';
+import { a as FailureCluster } from './failure-cluster-Cw65_5FY.js';
+export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-Cw65_5FY.js';
 import { a as RunSplitTag, R as RunRecord } from './run-record-CqzahIbx.js';
 export { c as RunJudgeMetadata, d as RunOutcome, e as RunRecordValidationError, b as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-CqzahIbx.js';
 import { T as TraceStore, R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, g as BudgetSpec, h as RunFilter, L as LlmSpan } from './store-Db2Bv8Cf.js';
 export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, M as Message, e as RetrievalSpan, m as RunLayer, n as RunStatus, f as SandboxSpan, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
+import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './index-BhLlu-qO.js';
+export { b as CalibrationResult, c as CandidateScore, d as ContinuousCalibrationResult, D as DEFAULT_RED_TEAM_CORPUS, E as EuRiskClass, G as GoldenItem, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, P as PositionalBiasResult, R as RedTeamCase, h as RedTeamCategory, i as RedTeamFinding, j as RedTeamPayload, k as RedTeamReport, S as SelfPreferenceResult, U as UseCaseSignals, V as VerbosityBiasResult, l as calibrateJudge, m as calibrateJudgeContinuous, n as classifyEuAiRisk, o as continuousAgreement, p as euAiActReport, q as nistAiRmfReport, r as positionalBias, s as redTeamDataset, t as redTeamReport, u as renderMarkdown, v as scoreRedTeamOutput, w as selfPreference, x as soc2Report, y as summarize, z as toolNamesForRun, A as verbosityBias } from './index-BhLlu-qO.js';
 import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-B2kWEdh9.js';
 export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-B2kWEdh9.js';
 import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
 export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DP_cSSiw.js';
 export { F as FileSystemRawProviderSink, d as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, e as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, f as RawProviderDirection, c as RawProviderEvent, R as RawProviderSink, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-DK2EBVZC.js';
-export { D as DEFAULT_REDACTION_RULES, O as OTEL_AGENT_EVAL_SCOPE, a as OtlpExport, b as OtlpResourceSpans, c as OtlpSpan, R as REDACTION_VERSION, d as RedactionReport, e as RedactionRule, f as ReplayCache, g as ReplayCacheEntry, h as ReplayCacheMissError, i as ReplayCacheStats, j as ReplayFetchOptions, k as createReplayFetch, l as exportRunAsOtlp, m as iterateRawCalls, r as redactString, n as redactValue } from './replay-BL96gCEP.js';
+export { D as DEFAULT_REDACTION_RULES, O as OTEL_AGENT_EVAL_SCOPE, a as OtlpExport, b as OtlpResourceSpans, c as OtlpSpan, R as REDACTION_VERSION, d as RedactionReport, e as RedactionRule, f as ReplayCache, g as ReplayCacheEntry, h as ReplayCacheMissError, i as ReplayCacheStats, j as ReplayFetchOptions, k as createReplayFetch, l as exportRunAsOtlp, m as iterateRawCalls, r as redactString, n as redactValue } from './replay-D7z0J43-.js';
 export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
 import { a as BaselineReport } from './baseline-4R5deP0N.js';
 export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-4R5deP0N.js';
@@ -30,14 +32,13 @@ import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
 export { b as buildTrajectory } from './trajectory-CnoBo-JY.js';
 import { a as DatasetScenario, c as Dataset } from './dataset-CiK_3LDr.js';
 export { d as DatasetDifficulty, b as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-CiK_3LDr.js';
-export { C as CalibrationResult, a as CandidateScore, b as ContinuousAgreement, c as ContinuousAgreementOptions, d as ContinuousCalibrationResult, D as DEFAULT_RED_TEAM_CORPUS, E as EuRiskClass, G as GoldenItem, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, P as PositionalBiasResult, R as RedTeamCase, h as RedTeamCategory, i as RedTeamFinding, j as RedTeamPayload, k as RedTeamReport, S as SelfPreferenceResult, U as UseCaseSignals, V as VerbosityBiasResult, l as calibrateJudge, m as calibrateJudgeContinuous, n as classifyEuAiRisk, o as continuousAgreement, p as euAiActReport, q as nistAiRmfReport, r as positionalBias, s as redTeamDataset, t as redTeamReport, u as renderMarkdown, v as scoreRedTeamOutput, w as selfPreference, x as soc2Report, y as summarize, z as toolNamesForRun, A as verbosityBias } from './index-D3iBCjdF.js';
 import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
-import { L as Layer, S as Severity, a as LayerResult, b as VerifyContext } from './multi-layer-verifier-LkP3LVKj.js';
-export { F as Finding, c as LayerStatus, M as MultiLayerVerifier, V as VerificationReport, d as VerifyOptions, g as gradeSemanticStatus } from './multi-layer-verifier-LkP3LVKj.js';
-import { L as LlmClientOptions } from './researcher-bGkI7vCl.js';
-export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, m as LlmCallError, n as LlmCallRequest, o as LlmCallResult, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './researcher-bGkI7vCl.js';
+import { L as Layer, S as Severity, a as LayerResult, b as VerifyContext } from './multi-layer-verifier-U-c8ge1k.js';
+export { F as Finding, c as LayerStatus, M as MultiLayerVerifier, V as VerificationReport, d as VerifyOptions, g as gradeSemanticStatus } from './multi-layer-verifier-U-c8ge1k.js';
+import { L as LlmClientOptions } from './researcher-G81CWc0q.js';
+export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, m as LlmCallError, n as LlmCallRequest, o as LlmCallResult, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './researcher-G81CWc0q.js';
 export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index--fVrWDiR.js';
-export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-Dgz1n51-.js';
+export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
 import './outcome-store-D6KWmYvj.js';
 /**
@@ -64,8 +65,7 @@ import './outcome-store-D6KWmYvj.js';
  * Both implement the small `AutoPrClient` interface, so tests substitute
  * a fake without spinning a process or network.
  *
- * @experimental — added in 0.25.0. Surface may evolve as consumers wire
- * it into CI workflows.
+ * @experimental — surface may evolve as consumers wire it into CI workflows.
  */
 interface FileChange {
     /** Repo-relative path. Forward slashes; no `..`. */
@@ -747,8 +747,7 @@ declare class MetricsCollector {
  * primitive is idempotent + replayable: re-running with the same
  * `runId` will produce the same plan.
  *
- * @experimental — added in 0.25.0. Surface may evolve as the 5 product
- * agents wire it in.
+ * @experimental — surface may evolve as product agents wire it in.
  */
 interface FailureClusterConfig {
@@ -998,6 +997,78 @@ declare function wilcoxonSignedRank(before: number[], after: number[]): {
  * Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
  */
 declare function cohensD(a: number[], b: number[]): number;
+interface CorpusScoreRecord {
+    /** Stable identifier for the rated item (scenario, span, turn, …). */
+    itemId: string;
+    /** Identifier for the judge that produced this score. */
+    judgeName: string;
+    /** Dimension name (matches `JudgeScore.dimension`). */
+    dimension: string;
+    /** Numeric score; must be finite. */
+    score: number;
+}
+interface CorpusAgreementPerDimension extends ContinuousAgreement {
+    dimension: string;
+    /** Item IDs that contributed to this dimension's matrix (every judge scored them). */
+    itemIds: string[];
+    /** Judge IDs that contributed to this dimension's matrix. */
+    judgeIds: string[];
+}
+interface CorpusAgreementReport {
+    /** Per-dimension ICC(2,1) + κ_w + Pearson + Spearman + bootstrap CIs. */
+    perDimension: CorpusAgreementPerDimension[];
+    /** Mean ICC across dimensions (NaN if no dimension yielded a finite ICC). */
+    overallIcc: number;
+    /** Mean weighted κ across dimensions (NaN if none finite). */
+    overallWeightedKappa: number;
+    /** Dimensions evaluated (sorted). */
+    dimensions: string[];
+    /** Judges seen across the corpus (sorted). */
+    judgeIds: string[];
+}
+interface CorpusAgreementOptions extends ContinuousAgreementOptions {
+    /**
+     * Restrict the audit to these dimensions. Default = every dimension
+     * that appears in the input. A dimension named here but absent from
+     * the input throws — silent omission would corrupt the overall metric.
+     */
+    dimensions?: string[];
+    /**
+     * Restrict the audit to these judges. Default = every judge that
+     * appears in the input. A judge named here but absent from a
+     * dimension throws (see "fail loud" below).
+     */
+    judges?: string[];
+}
+/**
+ * Corpus-wide inter-rater agreement across N items × M judges × D dimensions.
+ *
+ * For each dimension, builds the [n_items][n_judges] matrix of scores
+ * (keeping only items every judge rated on that dimension), then runs
+ * `continuousAgreement` to get ICC(2,1), κ_w, Pearson, Spearman, and
+ * bootstrap CIs. Reports a pooled mean across dimensions as a single
+ * "is this judge panel reliable on this corpus?" number.
+ *
+ * Fail-loud contract:
+ *   - Empty input throws.
+ *   - Fewer than 2 judges or fewer than 2 items per dimension throws.
+ *   - A judge present in some dimensions but with zero scored items on
+ *     another dimension throws (would silently shrink the matrix).
+ *   - Duplicate (itemId, judgeName, dimension) records throw.
+ */
+declare function corpusInterRaterAgreement(records: CorpusScoreRecord[], opts?: CorpusAgreementOptions): CorpusAgreementReport;
+/**
+ * Convenience adapter for `JudgeScore[]` data keyed externally by item.
+ *
+ * Use when you have per-item arrays of `JudgeScore[]` (e.g. one
+ * `ScenarioResult.judgeScores` per scenario) and want corpus-wide
+ * agreement without manually flattening. `itemId` must be unique per
+ * row of `itemsScores`.
+ */
+declare function corpusInterRaterAgreementFromJudgeScores(itemsScores: Array<{
+    itemId: string;
+    scores: JudgeScore[];
+}>, opts?: CorpusAgreementOptions): CorpusAgreementReport;
 /**
  * Anti-slop quality judge.
@@ -2623,9 +2694,9 @@ declare function paraphraseRobustnessScenarios(args: ParaphraseRobustnessScenari
  *      `bonferroni(pValues, alpha)` correct for multiple pairwise tests
  *      so pairwise variant comparisons stay statistically honest.
  *
- * Fixes the correctness bug in 0.2's pairwise optimizer which applied
- * alpha directly across n*(n-1)/2 pairwise tests without correction —
- * dramatically inflating false-positive rate when variants ≥ 3.
+ * Applying alpha directly across n*(n-1)/2 pairwise tests without
+ * correction inflates the false-positive rate when variants ≥ 3 — the
+ * BH and Bonferroni helpers prevent that.
  */
 /**
  * Required N per arm for a two-sample comparison at target effect size,
@@ -2940,12 +3011,9 @@ interface HypothesisManifest {
  * Identifier for the hashing scheme used to produce `contentHash`.
  *
  * `'sha256-content'` — sha256 hex over the canonicalized manifest with
- * the `contentHash` and `algo` fields stripped. This is what
- * `signManifest` produces today.
- *
- * Held as a string union so future schemes can be added without
- * breaking parsers; legacy SignedManifest values written before this
- * field existed will deserialize cleanly because the field is optional.
+ * the `contentHash` and `algo` fields stripped. Held as a string union
+ * so future schemes can be added without breaking parsers; SignedManifest
+ * values without `algo` deserialize cleanly because the field is optional.
  */
 type SignedManifestAlgo = 'sha256-content';
 interface SignedManifest extends HypothesisManifest {
@@ -2954,10 +3022,10 @@ interface SignedManifest extends HypothesisManifest {
     /**
      * Algorithm string describing how `contentHash` was produced.
      *
-     * Optional on the type so legacy serialized manifests (pre-`algo`)
-     * still parse, but ALWAYS populated by {@link signManifest}.
-     * Consumers that want to enforce a known algorithm should reject
-     * manifests where this field is missing or unrecognized.
+     * Optional on the type so serialized manifests without it still parse,
+     * but ALWAYS populated by {@link signManifest}. Consumers that want to
+     * enforce a known algorithm should reject manifests where this field
+     * is missing or unrecognized.
      */
     algo?: SignedManifestAlgo;
 }
@@ -2996,10 +3064,9 @@ declare function canonicalize(v: unknown): unknown;
  *   - encoder choice (UTF-8 via TextEncoder, fixed)
  *   - runtime (uses the Web Crypto subtle digest, present in Node ≥18 and browsers)
  *
- * Naming note: `hashJson` rather than `hashContent` because `hashContent` is
- * already taken in `prompt-registry.ts` for the truncated 12-char prompt-id
- * helper, which has different semantics (string input, short return). Both
- * coexist; `hashJson` is the right name when you mean "canonicalize then hash."
+ * Named `hashJson` to disambiguate from `prompt-registry.ts`'s `hashContent`,
+ * which takes a string input and returns a truncated 12-char prompt id.
+ * Use `hashJson` when you mean "canonicalize then hash."
  *
  * @example
  *   const hash = await hashJson({ id: '1', kind: 'spec' })
@@ -3012,17 +3079,15 @@ declare function hashJson<T>(obj: T): Promise<string>;
  * The hash covers the canonicalized manifest with the `contentHash`
  * and `algo` fields stripped; this lets verifiers re-sign the rest and
  * compare. Returned manifest always carries `algo: 'sha256-content'`
- * so downstream consumers can identify the scheme; legacy serialized
- * manifests without `algo` still verify because it is stripped before
- * hashing on both sides.
+ * so downstream consumers can identify the scheme; manifests without
+ * `algo` still verify because that field is stripped before hashing on both sides.
  */
 declare function signManifest(m: HypothesisManifest): Promise<SignedManifest>;
 /**
  * Verify that a signed manifest has not been tampered with.
  *
- * Strips `contentHash` and `algo` before re-signing so legacy manifests
- * (written before `algo` was emitted) verify identically to current
- * ones.
+ * Strips `contentHash` and `algo` before re-signing so manifests without
+ * `algo` verify identically to ones that carry it.
  */
 declare function verifyManifest(m: SignedManifest): Promise<boolean>;
 /**
@@ -3334,9 +3399,8 @@ declare const localCommandRunner: CommandRunner;
  *   - artifact dir contains an entry point (index.html for static SPAs,
  *     equivalent per framework family)
  *
- * Shipped in 0.11 with the canonical `vite` runner. Future generations
- * add wrangler-deploy --dry-run, next-build, etc — each as another
- * runner factory.
+ * Ships with a canonical `vite` runner. Additional runners
+ * (wrangler-deploy --dry-run, next-build, etc.) plug in as factories.
  */
 type DeployFamily = 'frontend-static' | 'nextjs' | 'remix' | 'fullstack-ts';
@@ -3508,10 +3572,8 @@ declare function extractErrorCount(text: string, opts?: ExtractOptions): Extract
  *   - test: in-memory mock that returns canned step outcomes
  *   - future: Playwright, Puppeteer, custom scrapers
  *
- * Shipped in 0.11 alongside {@link runIntentMatchJudge} — together they
- * close the "the agent shipped the wrong app and we didn't catch it"
- * blind spot. Intent-match catches "wrong app entirely"; flow catches
- * "right app but the buttons don't work."
+ * Paired with {@link runIntentMatchJudge}: intent-match catches "wrong
+ * app entirely"; flow-layer catches "right app but the buttons don't work."
  */
 type FlowAction = 'navigate' | 'click' | 'fill' | 'expect-text' | 'expect-element' | 'expect-url' | 'wait';
@@ -3607,10 +3669,6 @@ declare function flowLayer<Env extends FlowLayerEnv = FlowLayerEnv>(input: FlowL
  *
  * Soft-fails on LLM/JSON error (`available: false`) so callers can
  * treat failure as "judge skipped."
- *
- * Added in 0.11 to replace the lying `completenessScore: 1` field that
- * VerticalBench shipped pre-Gen-48 — that field was keyword-driven and
- * fired true on builds with zero spec concepts implemented.
  */
 declare const INTENT_MATCH_JUDGE_VERSION = "intent-match-judge-v1-2026-04-24";
@@ -4142,7 +4200,7 @@ declare function createDefaultReviewer(options: CreateDefaultReviewerOptions): (
  */
 /**
- * Implementation complexity class for weighted scoring (added 0.11).
+ * Implementation complexity class for weighted scoring.
  *
  * - `render` (default): the concept is a UI surface that displays static
  *   data — render a list, show a counter, lay out a button. Single-file
@@ -4212,11 +4270,10 @@ interface SemanticConceptJudgeResult {
     error?: string;
 }
 /**
- * Score-aggregation strategy. Default `mean` (legacy behavior — 0.10
- * and earlier always averaged 0-10 scores). `complexity` applies the
- * default weight table (render=1, integrate=2, compute=2.5) unless a
- * concept has an explicit `weight`. `explicit` honors only `weight`
- * (defaulting to 1 for unspecified).
+ * Score-aggregation strategy. `mean` averages 0-10 scores uniformly.
+ * `complexity` applies the default weight table (render=1, integrate=2,
+ * compute=2.5) unless a concept has an explicit `weight`. `explicit`
+ * honors only `weight` (defaulting to 1 for unspecified).
  */
 type ConceptWeightStrategy = 'mean' | 'complexity' | 'explicit';
 declare const DEFAULT_COMPLEXITY_WEIGHTS: Record<ConceptComplexity, number>;
@@ -4234,9 +4291,9 @@ interface SemanticConceptJudgeOptions {
     /** LlmClient config (baseUrl, apiKey, authHeader, …). */
     llm?: LlmClientOptions;
     /**
-     * Score aggregation strategy. Default `mean` for backward compatibility
-     * with 0.10 and earlier callers. Cross-vertical comparisons should use
-     * `complexity` to neutralize the integrate-vs-render asymmetry.
+     * Score aggregation strategy. Default `mean` — uniform average across
+     * concepts. Cross-vertical comparisons should use `complexity` to
+     * neutralize the integrate-vs-render asymmetry.
      */
     weightConcepts?: ConceptWeightStrategy;
     /** Override the default complexity → weight table. */
@@ -4458,9 +4515,8 @@ interface LineageNode {
 }
 /**
  * `kindOf` decides whether a variant is a seed (no parent), code mutation,
- * or prompt mutation. Default looks at `variant.payload.codeMutation` —
- * that field is part of the audit-bench convention but cheap enough to
- * accept any payload that mirrors it. Override by passing your own.
+ * or prompt mutation. Default looks at `variant.payload.codeMutation` and
+ * accepts any payload that exposes that field; override by passing your own.
  */
 type LineageKindResolver<P> = (variant: EvolvableVariant<P>) => LineageKind;
 /**
@@ -4707,9 +4763,8 @@ declare function createSandboxCodeMutator<T, P>(opts: CreateSandboxCodeMutatorOp
  *                  Δ improvement (auto-detect when prompt evolution has
  *                  hit a structural ceiling).
  *
- * Naming is generic: the original audit-bench version called the channels
- * "prompt" and "code" — those are the canonical use cases, but the
- * primitive doesn't care what each mutator actually does.
+ * Naming is generic — the canonical use cases are "prompt" and "code"
+ * channels, but the primitive doesn't care what each mutator actually does.
  */
 type CompositePolicy = 'primary-only' | 'secondary-only' | 'alternate' | 'plateau';
@@ -4754,25 +4809,15 @@ declare class Mutex {
 }
 /**
- * Persona discovery — replaces every consumer's hardcoded TRAINING_PERSONA_FILES.
- *
- * Today's failure mode: each product agent (legal/gtm/tax/creative) defines
- * a TRAINING_PERSONA_FILES const with 5 hardcoded filenames. When the 2yr
- * rewrite added 10+ new personas, those personas existed on disk but the
- * evolve runner never loaded them — the new rubric dims (audit_defendability,
- * intake_discipline, etc) got no training signal. The personas were
- * cosmetic, the rewrites partially uninformed.
- *
- * `discoverPersonas` walks a personas directory and returns every persona
- * file matching the convention. Consumers can filter by include/exclude
- * patterns. Default behavior — discover everything — eliminates the
- * "forgot to add the new persona to the list" failure mode.
+ * Walk a personas directory and return every file matching the convention
+ * `NN-slug.{yaml,yml,json,md}`. Sorted by filename so the numeric prefix
+ * gives stable persona ordering for reproducibility. Consumers filter
+ * through `include` / `exclude`.
  */
 interface DiscoverPersonasOptions {
     /**
      * Regex applied to filenames. Files that don't match are skipped.
-     * Default: `^[0-9]{2}-.+\.(yaml|yml|json|md)$` (the prevailing convention
-     * across legal/gtm/tax/creative: `NN-slug.yaml`).
+     * Default: `^[0-9]{2}-.+\.(yaml|yml|json|md)$`.
      */
     pattern?: RegExp;
     /**
@@ -4782,14 +4827,10 @@ interface DiscoverPersonasOptions {
     exclude?: readonly string[];
     /**
      * If set, return only personas whose basename contains one of these
-     * substrings (post-pattern filter). Used by the CLI's `--personas a,b,c`
-     * flag — consumers pass through.
+     * substrings (post-pattern filter).
      */
     include?: readonly string[];
-    /**
-     * Recurse into subdirectories. Default false (legal/gtm/tax/creative all
-     * store personas flat).
-     */
+    /** Recurse into subdirectories. Default false. */
     recursive?: boolean;
 }
 interface DiscoveredPersona {
@@ -4800,14 +4841,6 @@ interface DiscoveredPersona {
     /** Filename without extension — the conventional persona id. */
     id: string;
 }
-/**
- * Walk `dir` and return every persona file matching the convention. Async
- * because the consumer almost always wants this to be I/O-driven (so a new
- * persona added on disk is picked up without a code change).
- *
- * Sorted by filename (which gives stable persona id order via the `NN-`
- * numeric prefix convention) for reproducibility.
- */
 declare function discoverPersonas(dir: string, opts?: DiscoverPersonasOptions): Promise<DiscoveredPersona[]>;
 /**
@@ -4914,43 +4947,17 @@ declare class JsonlTrialCache implements TrialCache {
 }
 /**
- * Judge-retry wrapper.
- *
- * Today's failure mode: a judge LLM call aborts mid-stream (connection
- * dropped, model timed out, schema rejected) → consumer's try/catch swallows
- * the error and returns `score: 0`. The eval composite then weights that
- * zero into the mean, silently corrupting the score. Today's tax/gtm evals
- * had `judge=0` across every trial — the prompt rewrites couldn't be
- * evaluated honestly because the measurement instrument was broken.
+ * Wrap a single judge LLM call with retry, optional fallback-model
+ * rotation, exponential backoff, and a typed `JudgeRetryOutcome`. Callers
+ * MUST inspect `succeeded` before using `value`; on failure the library
+ * returns `value: null` rather than substituting a default, so a judge
+ * abort cannot silently corrupt a downstream composite.
  *
- * `withJudgeRetry` is the substrate fix. It wraps a single judge invocation
- * with:
- *
- *   1. N retry attempts on transient failures (abort, timeout, network).
- *   2. Optional fallback-model rotation — try the next model in the list
- *      if the primary keeps aborting (a verbose new prompt may stream-abort
- *      on claude-code/sonnet but succeed on kimi-code/k2p6).
- *   3. Exponential backoff between attempts.
- *   4. A typed outcome `{ succeeded, attempts, value, error }` that callers
- *      MUST decide what to do with. No silent zero.
- *
- * The reporting contract: callers ship `TrialResult.judgeSucceeded = succeeded`
- * and `TrialResult.judgeAttempts = attempts`. `aggregateTrials({mode: 'exclude-failed'})`
- * then skips failed-judge trials when computing composites.
- *
- * The library does NOT decide what score to record on failure — that's the
- * caller's product choice. Today's product agents (legal/gtm/tax/creative)
- * should set `score: NaN` + `judgeSucceeded: false` + `error: ...` so the
- * aggregator's exclude-failed mode drops the trial. Defaulting to 0 is what
- * caused today's data corruption.
- */
-/**
- * Retry policy for judge LLM calls.
- *
- * Defaults are tuned for the verbose post-2yr-rewrite prompts that exceed
- * the 60s `callLlm` default and abort on streaming. Pick a different timeout
- * for cheap-and-quick judges (e.g., 30s) or longer for thinking models.
+ * Reporting contract: callers ship `TrialResult.judgeSucceeded = succeeded`
+ * and `TrialResult.judgeAttempts = attempts` so `aggregateTrialsByMode`
+ * with `mode: 'exclude-failed'` drops the trial.
  */
+/** Retry policy for judge LLM calls. */
 interface JudgeRetryPolicy {
     /** Max attempts per model. Default 3 (one initial + two retries). */
     maxAttempts?: number;
@@ -5003,8 +5010,8 @@ interface JudgeRetryOutcome<T> {
  * to their underlying fetch/SDK call so the abort actually fires.
  *
  * Returns a typed outcome — callers MUST inspect `succeeded` before using
- * `value`. The library refuses to default to a silent zero score because that
- * is exactly what caused today's eval data corruption.
+ * `value`. The library refuses to default to a silent zero score because a
+ * synthetic zero is indistinguishable from a real low score downstream.
  */
 declare function withJudgeRetry<T>(judgeFn: (model: string, signal: AbortSignal) => Promise<T>, policy?: JudgeRetryPolicy): Promise<JudgeRetryOutcome<T>>;
@@ -5070,42 +5077,30 @@ declare function referenceReplayRunsToSteeringRows<Input = unknown>(runs: Refere
 declare function referenceReplayScenarioToRunScore(scenarioScore: ReferenceReplayScenarioScore, durationMs?: number): RunScore;
 /**
- * Trial-aggregator modes.
- *
- * The prompt-evolution loop's internal `aggregateTrials` defaulted to
- * including every non-`error` trial in the mean — which corrupted the mean
- * when a trial had `score: 0` because the judge silently aborted (the
- * caller's try/catch swallowed the abort and returned zero). Today's
- * tax/gtm evals show this: every trial scored judge=0 because the judge
- * aborted, and the composite then reflected `structural * 0.3 + slop * 0.1`
- * instead of the intended `judge * 0.6 + structural * 0.3 + slop * 0.1`.
+ * Aggregate trials with explicit handling of judge failure. Three modes:
  *
- * `aggregateTrialsByMode` is the substrate fix. Consumers can choose:
+ *   - `strict-fail` — any `judgeSucceeded === false` trial fails the whole
+ *     aggregate. Use for production gates: one corrupt trial halts the gate.
  *
- *   - `strict-fail` — any trial with `judgeSucceeded === false` fails the
- *     whole aggregate. Right for production-gate runs where one corrupted
- *     trial means "we don't know if the prompt is good, halt the gate."
+ *   - `exclude-failed` — drop `judgeSucceeded === false` trials from the
+ *     mean; report `excludedFailedTrials` separately. Default for new code.
  *
- *   - `exclude-failed` — drop trials with `judgeSucceeded === false` from
- *     the mean; report `failedTrials` separately. Right for research /
- *     comparison runs where you want to use the signal that DID land.
- *     Default for new code.
+ *   - `zero-fill` — failed trials count as `score: 0` in the mean. Intended
+ *     only for adapters that don't yet set `judgeSucceeded`.
  *
- *   - `zero-fill` — legacy behavior: failed trials count as score=0 in
- *     the mean. Default ONLY for backwards-compat with adapters that
- *     don't yet set `judgeSucceeded`. Migrate off this — it's the source
- *     of today's data corruption.
+ * Hard-errored trials (`t.error` set) are always excluded — those are
+ * infrastructure failures, not eval signal.
  */
 type AggregatorMode = 'strict-fail' | 'exclude-failed' | 'zero-fill';
 interface TrialAggregate {
     /** Mean score over the trials counted by the chosen mode. */
     meanScore: number;
-    /** Mean cost (legacy, kept for compatibility). */
+    /** Mean cost across counted trials. */
     meanCost: number;
-    /** Mean wall time (legacy). */
+    /** Mean wall time across counted trials. */
     meanDurationMs: number;
-    /** ok-rate (legacy). */
+    /** Fraction of counted trials with `ok === true`. */
     okRate: number;
     /** Trials counted in the mean (mode-dependent). */
     countedTrials: number;
@@ -5125,13 +5120,8 @@ interface TrialAggregate {
         firstError?: string;
     };
 }
-/**
- * Aggregate trials with explicit failed-judge handling. Returns counts for
- * counted + excluded so callers can surface "the score is based on 7 of 10
- * trials; 3 judges failed" instead of silently weighting zero.
- */
 declare function aggregateTrialsByMode(trials: TrialResult[], opts: {
     mode: AggregatorMode;
 }): TrialAggregate;
-export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AggregatorMode, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, DatasetScenario, type DeployFamily, type 
DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DirEntry, type DiscoverPersonasOptions, type DiscoveredPersona, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeConfig, type 
JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, type JudgeRubric, JudgeRunner, type JudgeScore, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, ProductClient, type ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type 
ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type 
SynthesisTarget, type TestResult, type ThresholdContract, TokenCounter, type TokenSpec, TraceEmitter, TraceEvent, TraceStore, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, buildReviewerPrompt, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, confidenceInterval, containsAll, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, 
inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, linterJudge, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedMean, weightedRecall, whitespaceCollapseMutator, wilcoxonSignedRank, withJudgeRetry, wranglerDeployRunner };
+export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AggregatorMode, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, ContinuousAgreement, ContinuousAgreementOptions, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CorpusAgreementOptions, type CorpusAgreementPerDimension, type CorpusAgreementReport, type CorpusScoreRecord, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, 
DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DirEntry, type DiscoverPersonasOptions, type DiscoveredPersona, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type 
IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, type JudgeRubric, JudgeRunner, type JudgeScore, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, ProductClient, type ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, 
type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type 
SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TestResult, type ThresholdContract, TokenCounter, type TokenSpec, TraceEmitter, TraceEvent, TraceStore, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, buildReviewerPrompt, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, confidenceInterval, containsAll, corpusInterRaterAgreement, corpusInterRaterAgreementFromJudgeScores, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, 
findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, linterJudge, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedMean, weightedRecall, whitespaceCollapseMutator, wilcoxonSignedRank, withJudgeRetry, wranglerDeployRunner };