npm - @tangle-network/agent-eval - Versions diffs - 0.33.0 → 0.34.0 - Mend

@tangle-network/agent-eval 0.33.0 → 0.34.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

package/CHANGELOG.md +33 -0
package/dist/benchmarks/index.d.ts +2 -2
package/dist/chunk-DCZXFOQN.js +489 -0
package/dist/chunk-DCZXFOQN.js.map +1 -0
package/dist/{chunk-B73G44OH.js → chunk-FT3IAMQR.js} +5 -5
package/dist/chunk-FT3IAMQR.js.map +1 -0
package/dist/{chunk-GVQT44CS.js → chunk-KE7TDJUO.js} +2 -2
package/dist/{chunk-4L3WJXQJ.js → chunk-KHZRNY3F.js} +163 -2
package/dist/{chunk-4L3WJXQJ.js.map → chunk-KHZRNY3F.js.map} +1 -1
package/dist/{chunk-WGXZAQLR.js → chunk-LGAPK7NA.js} +2 -2
package/dist/{chunk-DTEJNZYK.js → chunk-SQYRO3BT.js} +47 -4
package/dist/chunk-SQYRO3BT.js.map +1 -0
package/dist/{chunk-CXJOVDJR.js → chunk-TQL7BAOY.js} +5 -175
package/dist/chunk-TQL7BAOY.js.map +1 -0
package/dist/{chunk-M6RZ5LJN.js → chunk-VXNVVBZO.js} +34 -5
package/dist/chunk-VXNVVBZO.js.map +1 -0
package/dist/{chunk-S4Y5VXMS.js → chunk-WRGHMGWT.js} +2 -2
package/dist/{chunk-SMSGXM74.js → chunk-YU3G6I7F.js} +2 -2
package/dist/cli.js +2 -2
package/dist/{control-p2ns7elI.d.ts → control-C3k02SCP.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/control.js +3 -2
package/dist/governance/index.d.ts +2 -1
package/dist/{index-DPILdKbP.d.ts → index-CN2agEaO.d.ts} +2 -142
package/dist/{index-BTqhGHJT.d.ts → index-ClMxVqe_.d.ts} +1 -1
package/dist/index.d.ts +278 -486
package/dist/index.js +522 -134
package/dist/index.js.map +1 -1
package/dist/judge-calibration-DilmB3Ml.d.ts +142 -0
package/dist/meta-eval/index.d.ts +2 -2
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +3 -3
package/dist/optimization.js +6 -6
package/dist/pipelines/index.js +2 -2
package/dist/release-report-ChfmCmLi.d.ts +713 -0
package/dist/reporting.d.ts +6 -4
package/dist/reporting.js +10 -9
package/dist/{researcher-BRHa5Jxo.d.ts → researcher-CfnL3HEb.d.ts} +34 -3
package/dist/rl.d.ts +5 -5
package/dist/rl.js +6 -6
package/dist/rl.js.map +1 -1
package/dist/{rubric-predictive-validity-CMHypZ_M.d.ts → rubric-predictive-validity-BvaNwfBE.d.ts} +1 -1
package/dist/{run-record-BfX5y68A.d.ts → run-record-YinVdFwu.d.ts} +78 -2
package/dist/{summary-report-D7AQS7eB.d.ts → summary-report-BPJVzIeW.d.ts} +2 -2
package/dist/wire/index.js +2 -2
package/docs/product-eval-adoption.md +18 -0
package/package.json +12 -22
package/dist/chunk-B73G44OH.js.map +0 -1
package/dist/chunk-CXJOVDJR.js.map +0 -1
package/dist/chunk-DTEJNZYK.js.map +0 -1
package/dist/chunk-M6RZ5LJN.js.map +0 -1
package/dist/chunk-ZN2CMQIW.js +0 -208
package/dist/chunk-ZN2CMQIW.js.map +0 -1
package/dist/release-report-DLWbBPtH.d.ts +0 -292
/package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} +0 -0
/package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} +0 -0
/package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} +0 -0
/package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} +0 -0

package/dist/index.d.ts CHANGED Viewed

@@ -1,16 +1,18 @@
-export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-p2ns7elI.js';
+export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-C3k02SCP.js';
+import { R as RunRecord, a as RunSplitTag } from './run-record-YinVdFwu.js';
+export { A as AgentProfileCell, d as AgentProfileCellInput, e as AgentProfileCellSchemaVersion, f as AgentProfileCellValidationError, g as AgentProfileDimensionValue, h as AgentProfileHarness, i as AgentProfileJson, j as AgentProfileSource, k as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, l as RunOutcome, m as RunRecordValidationError, b as RunTokenUsage, n as agentProfileCellHashMaterial, o as agentProfileCellKey, p as assertRunAgentProfileCell, q as buildAgentProfileCell, r as groupRunsByAgentProfileCell, s as isRunRecord, t as parseRunRecordSafe, u as requireAgentProfileCell, v as roundTripRunRecord, w as validateAgentProfileCell, x as validateRunRecord, y as verifyAgentProfileCell } from './run-record-YinVdFwu.js';
 import { AxAIService, AxFunction } from '@ax-llm/ax';
 import { S as Severity, M as MultiLayerVerifier, a as VerifyOptions, L as Layer, b as LayerResult, c as VerifyContext } from './multi-layer-verifier-BNi4-8lR.js';
 export { F as Finding, d as LayerStatus, V as VerificationReport, g as gradeSemanticStatus } from './multi-layer-verifier-BNi4-8lR.js';
 import { R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, g as BudgetSpec, h as RunFilter, L as LlmSpan } from './store-Db2Bv8Cf.js';
 export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, M as Message, e as RetrievalSpan, m as RunLayer, n as RunStatus, f as SandboxSpan, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
-import { L as LlmClientOptions, m as LlmCallRequest, n as LlmCallResult } from './researcher-BRHa5Jxo.js';
-export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, o as LlmCallError, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './researcher-BRHa5Jxo.js';
+import { L as LlmClientOptions, m as LlmCallRequest, n as LlmCallResult } from './researcher-CfnL3HEb.js';
+export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, o as LlmCallError, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as backoffMs, x as callLlm, y as callLlmJson, z as isTransientLlmError, A as probeLlm, r as runEvalCampaign, B as stripFencedJson } from './researcher-CfnL3HEb.js';
 import { TraceAnalysisStore, AnalyzeTracesOptions } from './traces.js';
 export { AnalyzeTracesInput, AnalyzeTracesResult, AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
+import { t as JudgeInput, u as JudgeFn, v as BenchmarkRunnerConfig, S as Scenario, x as BenchmarkReport, y as ProductClientConfig, C as CheckResult, T as TestResult, z as PersonaConfig, D as DriverResult, A as DriverState, E as CollectedArtifacts, F as ScenarioResult, i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard, G as TurnMetrics, H as ScenarioFile, I as CompletionCriterion } from './release-report-ChfmCmLi.js';
+export { K as ArtifactCheck, L as ArtifactResult, B as BootstrapOptions, a as BootstrapResult, M as CorpusAgreementOptions, N as CorpusAgreementPerDimension, O as CorpusAgreementReport, Q as CorpusScoreRecord, U as EvalResult, W as FeedbackPattern, X as JudgeConfig, J as JudgeReplayGateArgs, Y as JudgeRubric, Z as JudgeScore, P as PairedBootstrapOptions, b as PairedBootstrapResult, _ as PersonaRigor, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, $ as RouteMap, a0 as RubricDimension, a1 as Turn, a2 as TurnResult, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, a3 as bonferroni, n as bootstrapCi, a4 as cohensD, a5 as confidenceInterval, a6 as corpusInterRaterAgreement, a7 as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, a8 as interRaterReliability, p as judgeReplayGate, a9 as mannWhitneyU, aa as normalizeScores, q as pairedBootstrap, ab as pairedMde, ac as pairedTTest, ad as partialCredit, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, ae as requiredSampleSize, af as weightedMean, w as wilcoxonSignedRank } from './release-report-ChfmCmLi.js';
 import { TCloud } from '@tangle-network/tcloud';
-import { R as RunRecord, a as RunSplitTag } from './run-record-BfX5y68A.js';
-export { J as JudgeScoresRecord, c as RunJudgeMetadata, d as RunOutcome, e as RunRecordValidationError, b as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-BfX5y68A.js';
 import { z } from 'zod';
 import { C as ControlEvalResult } from './control-runtime-BZ_lVLYW.js';
 export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
@@ -18,15 +20,11 @@ import { A as AgentEvalError } from './errors-mje_cKOs.js';
 export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-mje_cKOs.js';
 import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-iATEAHmc.js';
 export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-iATEAHmc.js';
-import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-D7AQS7eB.js';
-export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-D7AQS7eB.js';
+import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-BPJVzIeW.js';
+export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-BPJVzIeW.js';
 export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
-import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-DLWbBPtH.js';
-export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-DLWbBPtH.js';
 import { a as FailureCluster } from './failure-cluster-Cw65_5FY.js';
 export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-Cw65_5FY.js';
-import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './index-DPILdKbP.js';
-export { b as CalibrationResult, c as CandidateScore, d as ContinuousCalibrationResult, D as DEFAULT_RED_TEAM_CORPUS, E as EuRiskClass, G as GoldenItem, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, P as PositionalBiasResult, R as RedTeamCase, h as RedTeamCategory, i as RedTeamFinding, j as RedTeamPayload, k as RedTeamReport, S as SelfPreferenceResult, U as UseCaseSignals, V as VerbosityBiasResult, l as calibrateJudge, m as calibrateJudgeContinuous, n as classifyEuAiRisk, o as continuousAgreement, p as euAiActReport, q as nistAiRmfReport, r as positionalBias, s as redTeamDataset, t as redTeamReport, u as renderMarkdown, v as scoreRedTeamOutput, w as selfPreference, x as soc2Report, y as summarize, z as toolNamesForRun, A as verbosityBias } from './index-DPILdKbP.js';
 import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-B2kWEdh9.js';
 export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-B2kWEdh9.js';
 import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
@@ -39,8 +37,10 @@ import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
 export { b as buildTrajectory } from './trajectory-CnoBo-JY.js';
 import { a as DatasetScenario, c as Dataset } from './dataset-ueRVTUoY.js';
 export { d as DatasetDifficulty, b as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-ueRVTUoY.js';
+export { b as CalibrationResult, c as CandidateScore, a as ContinuousAgreement, C as ContinuousAgreementOptions, d as ContinuousCalibrationResult, G as GoldenItem, P as PositionalBiasResult, S as SelfPreferenceResult, V as VerbosityBiasResult, e as calibrateJudge, f as calibrateJudgeContinuous, g as continuousAgreement, p as positionalBias, s as selfPreference, v as verbosityBias } from './judge-calibration-DilmB3Ml.js';
+export { D as DEFAULT_RED_TEAM_CORPUS, E as EuRiskClass, G as GovernanceContext, a as GovernanceFinding, b as GovernanceReport, R as RedTeamCase, c as RedTeamCategory, d as RedTeamFinding, e as RedTeamPayload, f as RedTeamReport, U as UseCaseSignals, g as classifyEuAiRisk, h as euAiActReport, n as nistAiRmfReport, r as redTeamDataset, i as redTeamReport, j as renderMarkdown, s as scoreRedTeamOutput, k as soc2Report, l as summarize, t as toolNamesForRun } from './index-CN2agEaO.js';
 import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
-export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-BTqhGHJT.js';
+export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-ClMxVqe_.js';
 export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
 import './outcome-store-D6KWmYvj.js';
@@ -230,299 +230,6 @@ declare function runSemanticConceptJudge(input: SemanticConceptJudgeInput, optio
  */
 declare function createSemanticConceptJudge(options?: SemanticConceptJudgeOptions): (input: SemanticConceptJudgeInput) => Promise<SemanticConceptJudgeResult>;
-interface Scenario {
-    id: string;
-    persona: string;
-    label: string;
-    thesis: string;
-    dimensions: string[];
-    turns: Turn[];
-    artifactChecks: ArtifactCheck[];
-    systemPromptAppend?: string;
-}
-interface Turn {
-    user: string;
-    expectedBehaviors: string[];
-    adversarial?: boolean;
-    feedbackType?: 'correction' | 'rejection' | 'vague' | 'contradictory' | 'escalation';
-}
-interface ArtifactCheck {
-    type: 'vault_file_exists' | 'vault_file_contains' | 'block_extracted' | 'code_valid' | 'generation_produced' | 'tool_created' | string;
-    target: string;
-    contains?: string;
-    minCount?: number;
-    description: string;
-}
-interface JudgeConfig {
-    model: string;
-    temperature: number;
-    rubric: JudgeRubric;
-}
-interface JudgeRubric {
-    name: string;
-    description: string;
-    dimensions: RubricDimension[];
-}
-interface RubricDimension {
-    name: string;
-    description: string;
-    anchor_low: string;
-    anchor_high: string;
-    weight: number;
-}
-interface ScenarioResult {
-    scenarioId: string;
-    persona: string;
-    turns: TurnResult[];
-    artifactResults: ArtifactResult[];
-    judgeScores: JudgeScore[];
-    judgeErrors: number;
-    overallScore: number;
-    totalDurationMs: number;
-    artifacts: CollectedArtifacts;
-}
-interface TurnResult {
-    turnIndex: number;
-    userMessage: string;
-    agentResponse: string;
-    durationMs: number;
-    blocksExtracted: {
-        type: string;
-        title: string;
-    }[];
-    containsCode: boolean;
-    containsToolCall: boolean;
-}
-interface ArtifactResult {
-    check: ArtifactCheck;
-    passed: boolean;
-    detail?: string;
-}
-interface JudgeScore {
-    judgeName: string;
-    dimension: string;
-    score: number;
-    reasoning: string;
-    evidence?: string;
-}
-interface CollectedArtifacts {
-    vaultFiles: {
-        path: string;
-        content: string;
-    }[];
-    blocksExtracted: {
-        type: string;
-        fields: Record<string, string>;
-    }[];
-    codeBlocks: {
-        language: string;
-        code: string;
-    }[];
-    toolCalls: string[];
-}
-interface BenchmarkReport {
-    timestamp: string;
-    generation: number;
-    promptVersion: string;
-    scenarioCount: number;
-    results: ScenarioResult[];
-    summary: {
-        overallAvg: number;
-        byPersona: Record<string, {
-            avg: number;
-            passed: number;
-            total: number;
-        }>;
-        byDimension: Record<string, {
-            avg: number;
-            scores: number[];
-        }>;
-        weakest: {
-            scenario: string;
-            score: number;
-            reason: string;
-        }[];
-        strongest: {
-            scenario: string;
-            score: number;
-            reason: string;
-        }[];
-    };
-}
-interface RouteMap {
-    signup?: string;
-    login?: string;
-    workspaces?: string;
-    threads?: string;
-    chat?: string;
-    tasks?: string;
-    events?: string;
-    approvals?: string;
-    vault?: string;
-    generations?: string;
-    [key: string]: string | undefined;
-}
-interface ProductClientConfig {
-    baseUrl: string;
-    routes: RouteMap;
-}
-interface ScenarioFile {
-    id: string;
-    category: string;
-    persona: string;
-    label: string;
-    thesis: string;
-    isControl?: boolean;
-    rubric?: {
-        dimensions: {
-            name: string;
-            description: string;
-            weight: number;
-        }[];
-    };
-    turns: Turn[];
-    artifactChecks: ArtifactCheck[];
-}
-interface CompletionCriterion {
-    name: string;
-    check: (state: DriverState) => boolean;
-    progress?: (state: DriverState) => number;
-}
-interface FeedbackPattern {
-    trigger: string;
-    response: string;
-}
-/**
- * How hard the simulated user pushes back. The driver LLM scales its tone
- * and follow-up aggression to this:
- *   cooperative — forgiving early adopter; accepts reasonable answers.
- *   demanding   — experienced professional; rejects vague or hedged answers.
- *   relentless  — senior partner reviewing for a client who will litigate;
- *                 interrogates every claim, accepts nothing undefended.
- */
-type PersonaRigor = 'cooperative' | 'demanding' | 'relentless';
-interface PersonaConfig {
-    id: string;
-    role: string;
-    goal: string;
-    completionCriteria: CompletionCriterion[];
-    feedbackPatterns?: FeedbackPattern[];
-    maxTurns: number;
-    driverModel?: string;
-    /** How adversarial the simulated user is. Defaults to 'demanding'. */
-    rigor?: PersonaRigor;
-    /**
-     * Domain expertise the simulated user holds — quoted into the driver
-     * prompt so it challenges the agent with authority instead of vague
-     * dissatisfaction. e.g. "a 15-year M&A partner who knows GAAP
-     * working-capital mechanics cold".
-     */
-    expertise?: string;
-    /**
-     * Substantive issues a senior professional in this role would
-     * interrogate — traps the scenario hides, claims that must be defended.
-     * The driver probes these without revealing them verbatim; the agent
-     * must surface them on its own.
-     */
-    pressurePoints?: string[];
-    /**
-     * Curveballs the driver may inject once the agent is coasting — changed
-     * facts, a hostile counterparty position, a new constraint. Forces the
-     * agent to re-derive rather than recite.
-     */
-    curveballs?: string[];
-}
-interface DriverState {
-    tasks: number;
-    events: number;
-    proposals: {
-        pending: number;
-        approved: number;
-        rejected: number;
-    };
-    vaultFiles: string[];
-    codeBlocks: number;
-    generations: number;
-}
-interface TurnMetrics {
-    turn: number;
-    timestamp: string;
-    tasks: number;
-    events: number;
-    proposals: {
-        pending: number;
-        approved: number;
-        rejected: number;
-    };
-    vaultFiles: number;
-    responseLatencyMs: number;
-    responseChars: number;
-    codeBlocksProduced: number;
-    blocksExtracted: number;
-    qualityScore?: number;
-    inputTokens: number;
-    outputTokens: number;
-    estimatedCostUsd: number;
-    totalCostUsd: number;
-    completionPercent: number;
-}
-interface DriverResult {
-    personaId: string;
-    /** True when the simulated user professionally signed off (driver said DONE). */
-    completed: boolean;
-    /** Turn at which the simulated user signed off, or null if it never did. */
-    turnsToCompletion: number | null;
-    /**
-     * Turn at which nominal completionCriteria were first all met, or null.
-     * Distinct from turnsToCompletion: criteria can be met while the
-     * simulated professional is still unsatisfied with the work's rigor.
-     */
-    criteriaMetAtTurn: number | null;
-    totalTurns: number;
-    metrics: TurnMetrics[];
-    finalState: DriverState;
-    convergenceCurve: number[];
-    totalCostUsd: number;
-    finalQualityScore: number | null;
-}
-interface BenchmarkRunnerConfig {
-    scenarios: Scenario[];
-    judges: JudgeFn[];
-    systemPrompt: string;
-    model?: string;
-    judgeModel?: string;
-    passThreshold?: number;
-    generation?: number;
-    promptVersion?: string;
-}
-interface JudgeInput {
-    scenario: Scenario;
-    turns: TurnResult[];
-    artifacts: CollectedArtifacts;
-}
-type JudgeFn = (tc: TCloud, input: JudgeInput) => Promise<JudgeScore[]>;
-interface TestResult {
-    name: string;
-    passed: boolean;
-    duration: number;
-    detail?: string;
-    checks: CheckResult[];
-}
-interface CheckResult {
-    name: string;
-    passed: boolean;
-    expected: string;
-    actual: string;
-}
-interface EvalResult {
-    scenario: string;
-    status: 'pass' | 'fail' | 'skip';
-    duration: number;
-    detail?: string;
-    artifact?: string;
-}
 /**
  * ChatClient — the single LLM abstraction analysts call.
  *
@@ -2093,6 +1800,94 @@ declare class MetricsCollector {
     getConvergenceCurve(): number[];
 }
+type PrReviewSource = 'drew' | 'donovan' | 'shady' | 'codex' | 'claude-code' | 'gpt-5.5-high' | 'claude-opus-4.7-high' | 'kimi' | 'opencode' | (string & {});
+type PrReviewSeverity = 'critical' | 'high' | 'medium' | 'low' | 'nit';
+type PrReviewOutcome = 'accepted' | 'fixed' | 'rejected' | 'duplicate' | 'noise' | 'unknown';
+interface PrReviewComment {
+    id: string;
+    source: PrReviewSource;
+    body: string;
+    model?: string;
+    author?: string;
+    path?: string;
+    line?: number;
+    severity?: PrReviewSeverity;
+    outcome?: PrReviewOutcome;
+    createdAt?: string;
+    metadata?: Record<string, unknown>;
+}
+interface PrReviewReferenceFinding {
+    id: string;
+    title: string;
+    severity: PrReviewSeverity;
+    path?: string;
+    line?: number;
+    /**
+     * Stable terms that should appear in a useful finding. Keep these
+     * factual: API names, invariant names, table names, error classes.
+     */
+    keywords?: string[];
+    fixedByCommit?: string;
+    sourceCommentIds?: string[];
+    metadata?: Record<string, unknown>;
+}
+interface PrReviewAuditCase {
+    id: string;
+    repo: string;
+    prNumber?: number;
+    baseSha?: string;
+    headSha?: string;
+    title?: string;
+    diff?: string;
+    split?: 'train' | 'validation' | 'test' | 'holdout' | (string & {});
+    comments: PrReviewComment[];
+    referenceFindings: PrReviewReferenceFinding[];
+    metadata?: Record<string, unknown>;
+}
+interface PrReviewScoreWeights {
+    recall: number;
+    precision: number;
+    actionability: number;
+    severityCalibration: number;
+    lowNoise: number;
+}
+interface PrReviewMatchedFinding {
+    referenceId: string;
+    commentId: string;
+    score: number;
+}
+interface PrReviewScore {
+    caseId: string;
+    source: PrReviewSource;
+    commentCount: number;
+    referenceCount: number;
+    matchedFindings: PrReviewMatchedFinding[];
+    recall: number;
+    precision: number;
+    actionability: number;
+    severityCalibration: number;
+    lowNoise: number;
+    aggregate: number;
+    notes: string[];
+}
+interface PrReviewBenchmarkSummary {
+    source: PrReviewSource;
+    caseCount: number;
+    commentCount: number;
+    aggregateMean: number;
+    recallMean: number;
+    precisionMean: number;
+    actionabilityMean: number;
+    severityCalibrationMean: number;
+    lowNoiseMean: number;
+}
+declare const DEFAULT_PR_REVIEW_SCORE_WEIGHTS: PrReviewScoreWeights;
+declare function commentsForSource(auditCase: PrReviewAuditCase, source: PrReviewSource): PrReviewComment[];
+declare function scorePrReviewSource(auditCase: PrReviewAuditCase, source: PrReviewSource, weights?: Partial<PrReviewScoreWeights>): PrReviewScore;
+declare function scorePrReviewComments(auditCase: PrReviewAuditCase, comments: PrReviewComment[], source: PrReviewSource, weights?: Partial<PrReviewScoreWeights>): PrReviewScore;
+declare function summarizePrReviewBenchmark(scores: PrReviewScore[]): PrReviewBenchmarkSummary[];
+declare function aggregatePrReviewScore(dimensions: Pick<PrReviewScore, 'recall' | 'precision' | 'actionability' | 'severityCalibration' | 'lowNoise'>, weights?: Partial<PrReviewScoreWeights>): number;
 /**
  * ProductionLoop — the substrate that closes eval → prod → eval.
  *
@@ -2310,139 +2105,6 @@ declare function formatDriverReport(results: DriverResult[]): string;
 /** Print a compact summary to console */
 declare function printDriverSummary(results: DriverResult[]): void;
-/**
- * Normalize scores so all dimensions follow "higher = better".
- * Inverted dimensions (hallucination, false_confidence, worst_failure)
- * already use inverted scoring in the prompt (10 = no hallucination),
- * but this function ensures consistency if raw scores leak through.
- */
-declare function normalizeScores(scores: JudgeScore[]): JudgeScore[];
-/** Weighted mean — falls back to uniform weights when omitted */
-declare function weightedMean(scores: {
-    score: number;
-    weight?: number;
-}[]): number;
-/** Bootstrap confidence interval */
-declare function confidenceInterval(scores: number[], confidence?: number): {
-    mean: number;
-    lower: number;
-    upper: number;
-};
-/**
- * Inter-rater reliability — simplified Krippendorff's alpha.
- *
- * Each inner array is one judge's scores for all items.
- * All arrays must have the same length (same items scored).
- */
-declare function interRaterReliability(judgeScores: JudgeScore[][]): number;
-/**
- * Mann-Whitney U test for comparing two independent groups.
- * Returns U statistic and approximate p-value (normal approximation).
- */
-declare function mannWhitneyU(a: number[], b: number[]): {
-    u: number;
-    p: number;
-};
-/** Partial credit: returns 0-1 ratio of current toward target */
-declare function partialCredit(current: number, target: number): number;
-/**
- * Paired t-test — before/after measurements on the SAME items.
- * Pairing removes inter-item variance, giving tighter significance than
- * an unpaired test when comparing prompt v1 vs prompt v2 on identical
- * scenarios.
- */
-declare function pairedTTest(before: number[], after: number[]): {
-    t: number;
-    df: number;
-    p: number;
-};
-/**
- * Wilcoxon signed-rank test — paired non-parametric alternative.
- * Use when the differences aren't normally distributed.
- */
-declare function wilcoxonSignedRank(before: number[], after: number[]): {
-    w: number;
-    p: number;
-};
-/**
- * Cohen's d — standardized effect size for two independent groups.
- * Positive d means group b has higher mean than group a.
- * Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
- */
-declare function cohensD(a: number[], b: number[]): number;
-interface CorpusScoreRecord {
-    /** Stable identifier for the rated item (scenario, span, turn, …). */
-    itemId: string;
-    /** Identifier for the judge that produced this score. */
-    judgeName: string;
-    /** Dimension name (matches `JudgeScore.dimension`). */
-    dimension: string;
-    /** Numeric score; must be finite. */
-    score: number;
-}
-interface CorpusAgreementPerDimension extends ContinuousAgreement {
-    dimension: string;
-    /** Item IDs that contributed to this dimension's matrix (every judge scored them). */
-    itemIds: string[];
-    /** Judge IDs that contributed to this dimension's matrix. */
-    judgeIds: string[];
-}
-interface CorpusAgreementReport {
-    /** Per-dimension ICC(2,1) + κ_w + Pearson + Spearman + bootstrap CIs. */
-    perDimension: CorpusAgreementPerDimension[];
-    /** Mean ICC across dimensions (NaN if no dimension yielded a finite ICC). */
-    overallIcc: number;
-    /** Mean weighted κ across dimensions (NaN if none finite). */
-    overallWeightedKappa: number;
-    /** Dimensions evaluated (sorted). */
-    dimensions: string[];
-    /** Judges seen across the corpus (sorted). */
-    judgeIds: string[];
-}
-interface CorpusAgreementOptions extends ContinuousAgreementOptions {
-    /**
-     * Restrict the audit to these dimensions. Default = every dimension
-     * that appears in the input. A dimension named here but absent from
-     * the input throws — silent omission would corrupt the overall metric.
-     */
-    dimensions?: string[];
-    /**
-     * Restrict the audit to these judges. Default = every judge that
-     * appears in the input. A judge named here but absent from a
-     * dimension throws (see "fail loud" below).
-     */
-    judges?: string[];
-}
-/**
- * Corpus-wide inter-rater agreement across N items × M judges × D dimensions.
- *
- * For each dimension, builds the [n_items][n_judges] matrix of scores
- * (keeping only items every judge rated on that dimension), then runs
- * `continuousAgreement` to get ICC(2,1), κ_w, Pearson, Spearman, and
- * bootstrap CIs. Reports a pooled mean across dimensions as a single
- * "is this judge panel reliable on this corpus?" number.
- *
- * Fail-loud contract:
- *   - Empty input throws.
- *   - Fewer than 2 judges or fewer than 2 items per dimension throws.
- *   - A judge present in some dimensions but with zero scored items on
- *     another dimension throws (would silently shrink the matrix).
- *   - Duplicate (itemId, judgeName, dimension) records throw.
- */
-declare function corpusInterRaterAgreement(records: CorpusScoreRecord[], opts?: CorpusAgreementOptions): CorpusAgreementReport;
-/**
- * Convenience adapter for `JudgeScore[]` data keyed externally by item.
- *
- * Use when you have per-item arrays of `JudgeScore[]` (e.g. one
- * `ScenarioResult.judgeScores` per scenario) and want corpus-wide
- * agreement without manually flattening. `itemId` must be unique per
- * row of `itemsScores`.
- */
-declare function corpusInterRaterAgreementFromJudgeScores(itemsScores: Array<{
-    itemId: string;
-    scores: JudgeScore[];
-}>, opts?: CorpusAgreementOptions): CorpusAgreementReport;
 /**
  * Anti-slop quality judge.
  *
@@ -3455,6 +3117,46 @@ declare class BudgetGuard {
     get state(): Record<keyof BudgetSpec, number>;
 }
+/**
+ * @stable
+ *
+ * AgentProfile — the eval harness's unit of variation.
+ *
+ * A profile pins everything that changes agent behaviour for a benchmark
+ * cell: the model, the active skills, the prompt version, the available
+ * tools. Vary the profile — swap a model, add a skill — and re-run the suite
+ * to benchmark the change. The scorecard keys a cell on
+ * `(scenarioId, profileHash)`, so the model is not a separate axis: it lives
+ * inside the profile, and two profiles with the same model but different
+ * skills are different cells.
+ *
+ * `agentProfileHash` is the profile's behaviour identity. Two profiles with
+ * the same behaviour-bearing fields share a hash (and a scorecard cell);
+ * reordering `skills` or `tools` does not change it; the human-facing `id`
+ * label does not affect it.
+ */
+interface AgentProfile {
+    /** Human-facing label, e.g. `sonnet-legal-skills-v3`. Not part of the hash. */
+    id: string;
+    /** Model snapshot id this profile pins, e.g. `claude-sonnet-4-6@2025-04-15`. */
+    model: string;
+    /** Skill ids/versions active in this profile — the primary behaviour lever. */
+    skills?: string[];
+    /** Prompt version identifier. */
+    promptVersion?: string;
+    /** Tool ids available to the agent. */
+    tools?: string[];
+    /** Any other behaviour-bearing knobs that should fingerprint into the hash. */
+    metadata?: Record<string, string | number | boolean>;
+}
+/**
+ * Deterministic behaviour identity of a profile — a sha256 over the
+ * behaviour-bearing fields. `skills` and `tools` are order-insensitive; the
+ * `id` label is excluded. Throws on a profile with no `model` — an unkeyable
+ * profile must fail loud rather than collapse into a blank-model cell.
+ */
+declare function agentProfileHash(profile: AgentProfile): string;
 /**
  * Cost tracker — token + USD accounting per scenario and per run.
  *
@@ -3688,6 +3390,138 @@ interface OracleReport {
 /** Run all oracles against one observation and aggregate. */
 declare function evaluateOracles(obs: OracleObservation, oracles: Oracle[]): OracleReport;
+/**
+ * @stable
+ *
+ * Eval scorecard — the persistent (persona × profile) score timeline.
+ *
+ * Every benchmark run folds into per-cell entries; a cell is
+ * `(scenarioId, profileHash)` and its timeline carries one entry per commit.
+ * The scorecard answers the question a single run cannot: did THIS change
+ * regress persona P on profile F, even while the aggregate improved?
+ *
+ * Storage is an append-only JSONL log — one line per (cell, commit). Appends
+ * never read-modify-write, so concurrent campaign runs cannot clobber each
+ * other; `loadScorecard` folds the log into the queryable `Scorecard`, and a
+ * malformed line never breaks the read. `diffScorecard` compares the latest
+ * entry of each cell against its predecessor with Cohen's d + Welch's t-test.
+ */
+/** One commit's measurement of one (scenario, profile) cell. */
+interface ScorecardEntry {
+    commitSha: string;
+    /** ISO timestamp the entry was recorded. */
+    timestamp: string;
+    /** Per-seed (or per-rep) scores for this cell at this commit. */
+    scores: number[];
+    /** Median of `scores` — the cell's headline score for the commit. */
+    composite: number;
+    /** Per-dimension means, when the runs carried a judge breakdown. */
+    perDimension?: Record<string, number>;
+    /** RunRecord ids folded into this entry — provenance. */
+    runIds: string[];
+}
+/** A (scenario, profile) cell and its commit-ordered score timeline. */
+interface ScorecardCell {
+    scenarioId: string;
+    profileHash: string;
+    /** Model id — denormalised from the profile for readable filtering. */
+    model: string;
+    timeline: ScorecardEntry[];
+}
+/** The folded scorecard: every cell, plus the profile definitions by hash. */
+interface Scorecard {
+    cells: ScorecardCell[];
+    /** Profile definitions seen — keeps the scorecard self-describing. */
+    profiles: Record<string, AgentProfile>;
+}
+/** One append-only log line — a single cell's entry for a single commit. */
+interface ScorecardLogLine {
+    scenarioId: string;
+    profileHash: string;
+    model: string;
+    profile: AgentProfile;
+    entry: ScorecardEntry;
+}
+interface RecordRunsOptions {
+    /** The profile that produced these runs — keys the cell. */
+    profile: AgentProfile;
+    commitSha: string;
+    /** Defaults to `new Date().toISOString()`. */
+    timestamp?: string;
+}
+/**
+ * Fold a benchmark's `RunRecord[]` into per-cell scorecard log lines — one
+ * line per scenario the runs cover. All runs are attributed to the single
+ * `profile` in `opts` (the harness ran them under it); the cell key is
+ * `(scenarioId, agentProfileHash(profile))`.
+ */
+declare function recordRuns(runs: RunRecord[], opts: RecordRunsOptions): ScorecardLogLine[];
+/** Append cell entries to the JSONL scorecard log. Creates the file/dir. */
+declare function appendScorecard(logPath: string, lines: ScorecardLogLine[]): void;
+/** Record runs and append them to the log in one call. Returns the lines. */
+declare function recordRunsToScorecard(logPath: string, runs: RunRecord[], opts: RecordRunsOptions): ScorecardLogLine[];
+/**
+ * Fold the JSONL log into a queryable `Scorecard`. A missing file yields an
+ * empty scorecard; a malformed line is skipped — a corrupt append never
+ * breaks the read. Each cell's timeline is sorted chronologically.
+ */
+declare function loadScorecard(logPath: string): Scorecard;
+type CellVerdict = 'improved' | 'regressed' | 'flat' | 'new';
+interface ScorecardCellDiff {
+    scenarioId: string;
+    profileHash: string;
+    model: string;
+    verdict: CellVerdict;
+    /** Composite of the latest entry. */
+    current: number;
+    /** Composite of the comparison entry — `null` when `verdict === 'new'`. */
+    baseline: number | null;
+    /** `current − baseline` — `null` when new. */
+    delta: number | null;
+    /** Cohen's d of current vs baseline samples — `null` when new or n < 2. */
+    cohensD: number | null;
+    /** Welch's t-test p-value — `null` when new or n < 2. */
+    pValue: number | null;
+    currentCommit: string;
+    baselineCommit: string | null;
+}
+interface ScorecardDiff {
+    cells: ScorecardCellDiff[];
+    summary: {
+        improved: number;
+        regressed: number;
+        flat: number;
+        new: number;
+    };
+}
+interface DiffScorecardOptions {
+    /** Compare each cell against this commit instead of its immediate predecessor. */
+    baselineCommit?: string;
+    /** |Cohen's d| at/above which a move counts as real. Default 0.5. */
+    minEffect?: number;
+    /** p-value at/below which a move is significant. Default 0.05. */
+    maxP?: number;
+    /**
+     * |delta| at/above which a move counts when statistics are unavailable
+     * (a cell with fewer than 2 samples on either side). Default 0.05.
+     */
+    minDelta?: number;
+}
+/**
+ * Compare the latest entry of every cell against its predecessor (or against
+ * `baselineCommit`) and classify the move: `improved`/`regressed` only when it
+ * clears both the effect-size and significance gates (or the `minDelta` gate
+ * when either side has < 2 samples), else `flat`. No prior entry means `new`.
+ */
+declare function diffScorecard(scorecard: Scorecard, opts?: DiffScorecardOptions): ScorecardDiff;
+/**
+ * Render a scorecard diff as a human-readable report — the block a feature
+ * PR prints. Regressions are listed first; flat cells are summarised, not
+ * enumerated.
+ */
+declare function formatScorecardDiff(diff: ScorecardDiff): string;
 /**
  * Series convergence — detects whether a sequence of scalar measurements
  * is stabilizing, drifting, or noisy.
@@ -4180,49 +4014,6 @@ interface ParaphraseRobustnessScenarioResult {
  */
 declare function paraphraseRobustnessScenarios(args: ParaphraseRobustnessScenarioInput): Promise<ParaphraseRobustnessScenarioResult>;
-/**
- * Power analysis + multiple-comparison correction.
- *
- * Two jobs:
- *   1. Before running: `requiredSampleSize({ effect, alpha, power })`
- *      returns the N per arm needed to detect a given effect size.
- *   2. After running: `benjaminiHochberg(pValues, fdr)` and
- *      `bonferroni(pValues, alpha)` correct for multiple pairwise tests
- *      so pairwise variant comparisons stay statistically honest.
- *
- * Applying alpha directly across n*(n-1)/2 pairwise tests without
- * correction inflates the false-positive rate when variants ≥ 3 — the
- * BH and Bonferroni helpers prevent that.
- */
-/**
- * Required N per arm for a two-sample comparison at target effect size,
- * alpha, and power. Uses the normal-approximation formula:
- *
- *   n = 2 * ( (z_{1-α/2} + z_{1-β}) / d )^2
- *
- * where d is Cohen's d. Returns Infinity for effect ≤ 0.
- */
-declare function requiredSampleSize(opts: {
-    effect: number;
-    alpha?: number;
-    power?: number;
-    twoSided?: boolean;
-}): number;
-/** Bonferroni adjustment: multiply every p-value by the number of tests, clamp at 1. */
-declare function bonferroni(pValues: number[], alpha?: number): {
-    adjusted: number[];
-    significant: boolean[];
-};
-/**
- * Benjamini–Hochberg false discovery rate. Returns adjusted q-values and
- * significance at the target FDR. Properly handles ties and preserves
- * monotonicity of q-values.
- */
-declare function benjaminiHochberg(pValues: number[], fdr?: number): {
-    qValues: number[];
-    significant: boolean[];
-};
 /**
  * Visual diff — pixel-delta scoring for UI / visual outputs.
  *
@@ -6337,10 +6128,11 @@ interface JudgeRetryPolicy {
     /** Exponential backoff function, default `attempt → min(500 * 2^attempt, 16_000)`. */
     backoffMs?: (attempt: number) => number;
     /**
-     * Predicate deciding whether an error should trigger a retry. Default
-     * retries on: AbortError, TimeoutError, `fetch failed`, `ECONNRESET`,
-     * `[This operation was aborted]`, and any LlmCallError with status in
-     * {429, 502, 503, 504}. JSON-parse errors are NOT retriable (the model
+     * Predicate deciding whether an error should trigger a retry. Defaults to
+     * `isTransientLlmError` — the package-wide classifier shared with
+     * `callLlm` — which retries aborts/timeouts, network faults, HTTP/2
+     * transport faults, and any `LlmCallError` with status in {429,502,503,504}.
+     * JSON-parse and schema-rejection errors are NOT retriable (the model
      * needs prompt adjustment, not another shot).
      */
     isRetryable?: (err: unknown) => boolean;
@@ -6486,4 +6278,4 @@ declare function aggregateTrialsByMode(trials: TrialResult[], opts: {
     mode: AggregatorMode;
 }): TrialAggregate;
-export { ANALYST_SEVERITIES, ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AggregatorMode, type AlignmentOp, type Analyst, type AnalystContext, type AnalystCost, type AnalystFinding, type AnalystHooks, type AnalystInputKind, AnalystRegistry, type AnalystRegistryOptions, type AnalystRequirements, type AnalystRunEvent, type AnalystRunInputs, type AnalystRunResult, type AnalystRunSummary, type AnalystSeverity, AnalyzeTracesOptions, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactResult, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, type BudgetPolicy, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type ChatCallOpts, type ChatClient, type ChatRequest, type ChatResponse, type ChatTransport, type CheckResult, type CliBridgeTransportOpts, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, ContinuousAgreement, ContinuousAgreementOptions, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CorpusAgreementOptions, type CorpusAgreementPerDimension, 
type CorpusAgreementReport, type CorpusScoreRecord, type CorrectnessChecker, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateChatClientOpts, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DirEntry, type DirectProviderTransportOpts, type DiscoverPersonasOptions, type DiscoveredPersona, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvalResult, type EvidenceRef, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, 
type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAdapterOpts, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, type JudgeRubric, JudgeRunner, type JudgeScore, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallRequest, LlmCallResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type 
MeasurementPolicy, type MergeOptions, MetricsCollector, type MockTransportOpts, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersistedFinding, type PersonaConfig, type PersonaRigor, type Playbook, type PlaybookEntry, type PoolSlot, type ProducedProposal, type ProducedState, ProductClient, type ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type 
ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegistryRunOpts, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RouterTransportOpts, type RubricDimension, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxSdkTransportOpts, type SatisfiedBy, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, type TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type 
TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, assertRealBackend, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, buildDriverSystemPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, computeFindingId, confidenceInterval, containsAll, corpusInterRaterAgreement, corpusInterRaterAgreementFromJudgeScores, createAntiSlopJudge, createChatClient, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, 
expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorerFromGrader, localCommandRunner, lowercaseMutator, makeFinding, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, 
summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedMean, weightedRecall, whitespaceCollapseMutator, wilcoxonSignedRank, withJudgeRetry, wranglerDeployRunner };
+export { ANALYST_SEVERITIES, ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AgentProfile, type AggregatorMode, type AlignmentOp, type Analyst, type AnalystContext, type AnalystCost, type AnalystFinding, type AnalystHooks, type AnalystInputKind, AnalystRegistry, type AnalystRegistryOptions, type AnalystRequirements, type AnalystRunEvent, type AnalystRunInputs, type AnalystRunResult, type AnalystRunSummary, type AnalystSeverity, AnalyzeTracesOptions, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, type BudgetPolicy, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, type ChatCallOpts, type ChatClient, type ChatRequest, type ChatResponse, type ChatTransport, CheckResult, type CliBridgeTransportOpts, type CodeMutationOutcome, type CodeMutationRunner, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CorrectnessChecker, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type 
CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateChatClientOpts, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type DirectProviderTransportOpts, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvidenceRef, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type 
GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAdapterOpts, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallRequest, LlmCallResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MockTransportOpts, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, 
type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, 
type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegistryRunOpts, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouterTransportOpts, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxSdkTransportOpts, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type 
TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, agentProfileHash, aggregatePrReviewScore, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, appendScorecard, assertRealBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, computeFindingId, containsAll, createAntiSlopJudge, createChatClient, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, 
exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, makeFinding, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, 
summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, wranglerDeployRunner };