@tangle-network/agent-eval 0.32.0 → 0.33.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/CHANGELOG.md +30 -0
  2. package/dist/benchmarks/index.d.ts +2 -2
  3. package/dist/chunk-DCZXFOQN.js +489 -0
  4. package/dist/chunk-DCZXFOQN.js.map +1 -0
  5. package/dist/{chunk-B73G44OH.js → chunk-FT3IAMQR.js} +5 -5
  6. package/dist/chunk-FT3IAMQR.js.map +1 -0
  7. package/dist/{chunk-GVQT44CS.js → chunk-KE7TDJUO.js} +2 -2
  8. package/dist/{chunk-4L3WJXQJ.js → chunk-KHZRNY3F.js} +163 -2
  9. package/dist/{chunk-4L3WJXQJ.js.map → chunk-KHZRNY3F.js.map} +1 -1
  10. package/dist/{chunk-WGXZAQLR.js → chunk-LGAPK7NA.js} +2 -2
  11. package/dist/{chunk-DTEJNZYK.js → chunk-SQYRO3BT.js} +47 -4
  12. package/dist/chunk-SQYRO3BT.js.map +1 -0
  13. package/dist/{chunk-CXJOVDJR.js → chunk-TQL7BAOY.js} +5 -175
  14. package/dist/chunk-TQL7BAOY.js.map +1 -0
  15. package/dist/{chunk-M6RZ5LJN.js → chunk-VXNVVBZO.js} +34 -5
  16. package/dist/chunk-VXNVVBZO.js.map +1 -0
  17. package/dist/{chunk-S4Y5VXMS.js → chunk-WRGHMGWT.js} +2 -2
  18. package/dist/{chunk-SMSGXM74.js → chunk-YU3G6I7F.js} +2 -2
  19. package/dist/cli.js +2 -2
  20. package/dist/{control-p2ns7elI.d.ts → control-C3k02SCP.d.ts} +1 -1
  21. package/dist/control.d.ts +2 -2
  22. package/dist/control.js +3 -2
  23. package/dist/governance/index.d.ts +2 -1
  24. package/dist/{index-DPILdKbP.d.ts → index-CN2agEaO.d.ts} +2 -142
  25. package/dist/{index-BTqhGHJT.d.ts → index-ClMxVqe_.d.ts} +1 -1
  26. package/dist/index.d.ts +39 -486
  27. package/dist/index.js +75 -68
  28. package/dist/index.js.map +1 -1
  29. package/dist/judge-calibration-DilmB3Ml.d.ts +142 -0
  30. package/dist/meta-eval/index.d.ts +2 -2
  31. package/dist/openapi.json +1 -1
  32. package/dist/optimization.d.ts +3 -3
  33. package/dist/optimization.js +6 -6
  34. package/dist/pipelines/index.js +2 -2
  35. package/dist/release-report-ChfmCmLi.d.ts +713 -0
  36. package/dist/reporting.d.ts +6 -4
  37. package/dist/reporting.js +10 -9
  38. package/dist/{researcher-BRHa5Jxo.d.ts → researcher-CfnL3HEb.d.ts} +34 -3
  39. package/dist/rl.d.ts +5 -5
  40. package/dist/rl.js +6 -6
  41. package/dist/rl.js.map +1 -1
  42. package/dist/{rubric-predictive-validity-CMHypZ_M.d.ts → rubric-predictive-validity-BvaNwfBE.d.ts} +1 -1
  43. package/dist/{run-record-BfX5y68A.d.ts → run-record-YinVdFwu.d.ts} +78 -2
  44. package/dist/{summary-report-D7AQS7eB.d.ts → summary-report-BPJVzIeW.d.ts} +2 -2
  45. package/dist/wire/index.js +2 -2
  46. package/docs/product-eval-adoption.md +18 -0
  47. package/package.json +22 -12
  48. package/dist/chunk-B73G44OH.js.map +0 -1
  49. package/dist/chunk-CXJOVDJR.js.map +0 -1
  50. package/dist/chunk-DTEJNZYK.js.map +0 -1
  51. package/dist/chunk-M6RZ5LJN.js.map +0 -1
  52. package/dist/chunk-ZN2CMQIW.js +0 -208
  53. package/dist/chunk-ZN2CMQIW.js.map +0 -1
  54. package/dist/release-report-DLWbBPtH.d.ts +0 -292
  55. /package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} +0 -0
  56. /package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} +0 -0
  57. /package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} +0 -0
  58. /package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} +0 -0
package/dist/index.d.ts CHANGED
@@ -1,16 +1,18 @@
1
- export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-p2ns7elI.js';
1
+ export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-C3k02SCP.js';
2
+ import { R as RunRecord, a as RunSplitTag } from './run-record-YinVdFwu.js';
3
+ export { A as AgentProfileCell, d as AgentProfileCellInput, e as AgentProfileCellSchemaVersion, f as AgentProfileCellValidationError, g as AgentProfileDimensionValue, h as AgentProfileHarness, i as AgentProfileJson, j as AgentProfileSource, k as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, l as RunOutcome, m as RunRecordValidationError, b as RunTokenUsage, n as agentProfileCellHashMaterial, o as agentProfileCellKey, p as assertRunAgentProfileCell, q as buildAgentProfileCell, r as groupRunsByAgentProfileCell, s as isRunRecord, t as parseRunRecordSafe, u as requireAgentProfileCell, v as roundTripRunRecord, w as validateAgentProfileCell, x as validateRunRecord, y as verifyAgentProfileCell } from './run-record-YinVdFwu.js';
2
4
  import { AxAIService, AxFunction } from '@ax-llm/ax';
3
5
  import { S as Severity, M as MultiLayerVerifier, a as VerifyOptions, L as Layer, b as LayerResult, c as VerifyContext } from './multi-layer-verifier-BNi4-8lR.js';
4
6
  export { F as Finding, d as LayerStatus, V as VerificationReport, g as gradeSemanticStatus } from './multi-layer-verifier-BNi4-8lR.js';
5
7
  import { R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, g as BudgetSpec, h as RunFilter, L as LlmSpan } from './store-Db2Bv8Cf.js';
6
8
  export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, M as Message, e as RetrievalSpan, m as RunLayer, n as RunStatus, f as SandboxSpan, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
7
- import { L as LlmClientOptions, m as LlmCallRequest, n as LlmCallResult } from './researcher-BRHa5Jxo.js';
8
- export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, o as LlmCallError, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './researcher-BRHa5Jxo.js';
9
+ import { L as LlmClientOptions, m as LlmCallRequest, n as LlmCallResult } from './researcher-CfnL3HEb.js';
10
+ export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, o as LlmCallError, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as backoffMs, x as callLlm, y as callLlmJson, z as isTransientLlmError, A as probeLlm, r as runEvalCampaign, B as stripFencedJson } from './researcher-CfnL3HEb.js';
9
11
  import { TraceAnalysisStore, AnalyzeTracesOptions } from './traces.js';
10
12
  export { AnalyzeTracesInput, AnalyzeTracesResult, AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
13
+ import { t as JudgeInput, u as JudgeFn, v as BenchmarkRunnerConfig, S as Scenario, x as BenchmarkReport, y as ProductClientConfig, C as CheckResult, T as TestResult, z as PersonaConfig, D as DriverResult, A as DriverState, E as CollectedArtifacts, F as ScenarioResult, i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard, G as TurnMetrics, H as ScenarioFile, I as CompletionCriterion } from './release-report-ChfmCmLi.js';
14
+ export { K as ArtifactCheck, L as ArtifactResult, B as BootstrapOptions, a as BootstrapResult, M as CorpusAgreementOptions, N as CorpusAgreementPerDimension, O as CorpusAgreementReport, Q as CorpusScoreRecord, U as EvalResult, W as FeedbackPattern, X as JudgeConfig, J as JudgeReplayGateArgs, Y as JudgeRubric, Z as JudgeScore, P as PairedBootstrapOptions, b as PairedBootstrapResult, _ as PersonaRigor, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, $ as RouteMap, a0 as RubricDimension, a1 as Turn, a2 as TurnResult, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, a3 as bonferroni, n as bootstrapCi, a4 as cohensD, a5 as confidenceInterval, a6 as corpusInterRaterAgreement, a7 as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, a8 as interRaterReliability, p as judgeReplayGate, a9 as mannWhitneyU, aa as normalizeScores, q as pairedBootstrap, ab as pairedMde, ac as pairedTTest, ad as partialCredit, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, ae as requiredSampleSize, af as weightedMean, w as wilcoxonSignedRank } from './release-report-ChfmCmLi.js';
11
15
  import { TCloud } from '@tangle-network/tcloud';
12
- import { R as RunRecord, a as RunSplitTag } from './run-record-BfX5y68A.js';
13
- export { J as JudgeScoresRecord, c as RunJudgeMetadata, d as RunOutcome, e as RunRecordValidationError, b as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-BfX5y68A.js';
14
16
  import { z } from 'zod';
15
17
  import { C as ControlEvalResult } from './control-runtime-BZ_lVLYW.js';
16
18
  export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
@@ -18,15 +20,11 @@ import { A as AgentEvalError } from './errors-mje_cKOs.js';
18
20
  export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-mje_cKOs.js';
19
21
  import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-iATEAHmc.js';
20
22
  export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-iATEAHmc.js';
21
- import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-D7AQS7eB.js';
22
- export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-D7AQS7eB.js';
23
+ import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-BPJVzIeW.js';
24
+ export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-BPJVzIeW.js';
23
25
  export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
24
- import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-DLWbBPtH.js';
25
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-DLWbBPtH.js';
26
26
  import { a as FailureCluster } from './failure-cluster-Cw65_5FY.js';
27
27
  export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-Cw65_5FY.js';
28
- import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './index-DPILdKbP.js';
29
- export { b as CalibrationResult, c as CandidateScore, d as ContinuousCalibrationResult, D as DEFAULT_RED_TEAM_CORPUS, E as EuRiskClass, G as GoldenItem, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, P as PositionalBiasResult, R as RedTeamCase, h as RedTeamCategory, i as RedTeamFinding, j as RedTeamPayload, k as RedTeamReport, S as SelfPreferenceResult, U as UseCaseSignals, V as VerbosityBiasResult, l as calibrateJudge, m as calibrateJudgeContinuous, n as classifyEuAiRisk, o as continuousAgreement, p as euAiActReport, q as nistAiRmfReport, r as positionalBias, s as redTeamDataset, t as redTeamReport, u as renderMarkdown, v as scoreRedTeamOutput, w as selfPreference, x as soc2Report, y as summarize, z as toolNamesForRun, A as verbosityBias } from './index-DPILdKbP.js';
30
28
  import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-B2kWEdh9.js';
31
29
  export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-B2kWEdh9.js';
32
30
  import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
@@ -39,8 +37,10 @@ import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
39
37
  export { b as buildTrajectory } from './trajectory-CnoBo-JY.js';
40
38
  import { a as DatasetScenario, c as Dataset } from './dataset-ueRVTUoY.js';
41
39
  export { d as DatasetDifficulty, b as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-ueRVTUoY.js';
40
+ export { b as CalibrationResult, c as CandidateScore, a as ContinuousAgreement, C as ContinuousAgreementOptions, d as ContinuousCalibrationResult, G as GoldenItem, P as PositionalBiasResult, S as SelfPreferenceResult, V as VerbosityBiasResult, e as calibrateJudge, f as calibrateJudgeContinuous, g as continuousAgreement, p as positionalBias, s as selfPreference, v as verbosityBias } from './judge-calibration-DilmB3Ml.js';
41
+ export { D as DEFAULT_RED_TEAM_CORPUS, E as EuRiskClass, G as GovernanceContext, a as GovernanceFinding, b as GovernanceReport, R as RedTeamCase, c as RedTeamCategory, d as RedTeamFinding, e as RedTeamPayload, f as RedTeamReport, U as UseCaseSignals, g as classifyEuAiRisk, h as euAiActReport, n as nistAiRmfReport, r as redTeamDataset, i as redTeamReport, j as renderMarkdown, s as scoreRedTeamOutput, k as soc2Report, l as summarize, t as toolNamesForRun } from './index-CN2agEaO.js';
42
42
  import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
43
- export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-BTqhGHJT.js';
43
+ export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-ClMxVqe_.js';
44
44
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
45
45
  import './outcome-store-D6KWmYvj.js';
46
46
 
@@ -230,299 +230,6 @@ declare function runSemanticConceptJudge(input: SemanticConceptJudgeInput, optio
230
230
  */
231
231
  declare function createSemanticConceptJudge(options?: SemanticConceptJudgeOptions): (input: SemanticConceptJudgeInput) => Promise<SemanticConceptJudgeResult>;
232
232
 
233
- interface Scenario {
234
- id: string;
235
- persona: string;
236
- label: string;
237
- thesis: string;
238
- dimensions: string[];
239
- turns: Turn[];
240
- artifactChecks: ArtifactCheck[];
241
- systemPromptAppend?: string;
242
- }
243
- interface Turn {
244
- user: string;
245
- expectedBehaviors: string[];
246
- adversarial?: boolean;
247
- feedbackType?: 'correction' | 'rejection' | 'vague' | 'contradictory' | 'escalation';
248
- }
249
- interface ArtifactCheck {
250
- type: 'vault_file_exists' | 'vault_file_contains' | 'block_extracted' | 'code_valid' | 'generation_produced' | 'tool_created' | string;
251
- target: string;
252
- contains?: string;
253
- minCount?: number;
254
- description: string;
255
- }
256
- interface JudgeConfig {
257
- model: string;
258
- temperature: number;
259
- rubric: JudgeRubric;
260
- }
261
- interface JudgeRubric {
262
- name: string;
263
- description: string;
264
- dimensions: RubricDimension[];
265
- }
266
- interface RubricDimension {
267
- name: string;
268
- description: string;
269
- anchor_low: string;
270
- anchor_high: string;
271
- weight: number;
272
- }
273
- interface ScenarioResult {
274
- scenarioId: string;
275
- persona: string;
276
- turns: TurnResult[];
277
- artifactResults: ArtifactResult[];
278
- judgeScores: JudgeScore[];
279
- judgeErrors: number;
280
- overallScore: number;
281
- totalDurationMs: number;
282
- artifacts: CollectedArtifacts;
283
- }
284
- interface TurnResult {
285
- turnIndex: number;
286
- userMessage: string;
287
- agentResponse: string;
288
- durationMs: number;
289
- blocksExtracted: {
290
- type: string;
291
- title: string;
292
- }[];
293
- containsCode: boolean;
294
- containsToolCall: boolean;
295
- }
296
- interface ArtifactResult {
297
- check: ArtifactCheck;
298
- passed: boolean;
299
- detail?: string;
300
- }
301
- interface JudgeScore {
302
- judgeName: string;
303
- dimension: string;
304
- score: number;
305
- reasoning: string;
306
- evidence?: string;
307
- }
308
- interface CollectedArtifacts {
309
- vaultFiles: {
310
- path: string;
311
- content: string;
312
- }[];
313
- blocksExtracted: {
314
- type: string;
315
- fields: Record<string, string>;
316
- }[];
317
- codeBlocks: {
318
- language: string;
319
- code: string;
320
- }[];
321
- toolCalls: string[];
322
- }
323
- interface BenchmarkReport {
324
- timestamp: string;
325
- generation: number;
326
- promptVersion: string;
327
- scenarioCount: number;
328
- results: ScenarioResult[];
329
- summary: {
330
- overallAvg: number;
331
- byPersona: Record<string, {
332
- avg: number;
333
- passed: number;
334
- total: number;
335
- }>;
336
- byDimension: Record<string, {
337
- avg: number;
338
- scores: number[];
339
- }>;
340
- weakest: {
341
- scenario: string;
342
- score: number;
343
- reason: string;
344
- }[];
345
- strongest: {
346
- scenario: string;
347
- score: number;
348
- reason: string;
349
- }[];
350
- };
351
- }
352
- interface RouteMap {
353
- signup?: string;
354
- login?: string;
355
- workspaces?: string;
356
- threads?: string;
357
- chat?: string;
358
- tasks?: string;
359
- events?: string;
360
- approvals?: string;
361
- vault?: string;
362
- generations?: string;
363
- [key: string]: string | undefined;
364
- }
365
- interface ProductClientConfig {
366
- baseUrl: string;
367
- routes: RouteMap;
368
- }
369
- interface ScenarioFile {
370
- id: string;
371
- category: string;
372
- persona: string;
373
- label: string;
374
- thesis: string;
375
- isControl?: boolean;
376
- rubric?: {
377
- dimensions: {
378
- name: string;
379
- description: string;
380
- weight: number;
381
- }[];
382
- };
383
- turns: Turn[];
384
- artifactChecks: ArtifactCheck[];
385
- }
386
- interface CompletionCriterion {
387
- name: string;
388
- check: (state: DriverState) => boolean;
389
- progress?: (state: DriverState) => number;
390
- }
391
- interface FeedbackPattern {
392
- trigger: string;
393
- response: string;
394
- }
395
- /**
396
- * How hard the simulated user pushes back. The driver LLM scales its tone
397
- * and follow-up aggression to this:
398
- * cooperative — forgiving early adopter; accepts reasonable answers.
399
- * demanding — experienced professional; rejects vague or hedged answers.
400
- * relentless — senior partner reviewing for a client who will litigate;
401
- * interrogates every claim, accepts nothing undefended.
402
- */
403
- type PersonaRigor = 'cooperative' | 'demanding' | 'relentless';
404
- interface PersonaConfig {
405
- id: string;
406
- role: string;
407
- goal: string;
408
- completionCriteria: CompletionCriterion[];
409
- feedbackPatterns?: FeedbackPattern[];
410
- maxTurns: number;
411
- driverModel?: string;
412
- /** How adversarial the simulated user is. Defaults to 'demanding'. */
413
- rigor?: PersonaRigor;
414
- /**
415
- * Domain expertise the simulated user holds — quoted into the driver
416
- * prompt so it challenges the agent with authority instead of vague
417
- * dissatisfaction. e.g. "a 15-year M&A partner who knows GAAP
418
- * working-capital mechanics cold".
419
- */
420
- expertise?: string;
421
- /**
422
- * Substantive issues a senior professional in this role would
423
- * interrogate — traps the scenario hides, claims that must be defended.
424
- * The driver probes these without revealing them verbatim; the agent
425
- * must surface them on its own.
426
- */
427
- pressurePoints?: string[];
428
- /**
429
- * Curveballs the driver may inject once the agent is coasting — changed
430
- * facts, a hostile counterparty position, a new constraint. Forces the
431
- * agent to re-derive rather than recite.
432
- */
433
- curveballs?: string[];
434
- }
435
- interface DriverState {
436
- tasks: number;
437
- events: number;
438
- proposals: {
439
- pending: number;
440
- approved: number;
441
- rejected: number;
442
- };
443
- vaultFiles: string[];
444
- codeBlocks: number;
445
- generations: number;
446
- }
447
- interface TurnMetrics {
448
- turn: number;
449
- timestamp: string;
450
- tasks: number;
451
- events: number;
452
- proposals: {
453
- pending: number;
454
- approved: number;
455
- rejected: number;
456
- };
457
- vaultFiles: number;
458
- responseLatencyMs: number;
459
- responseChars: number;
460
- codeBlocksProduced: number;
461
- blocksExtracted: number;
462
- qualityScore?: number;
463
- inputTokens: number;
464
- outputTokens: number;
465
- estimatedCostUsd: number;
466
- totalCostUsd: number;
467
- completionPercent: number;
468
- }
469
- interface DriverResult {
470
- personaId: string;
471
- /** True when the simulated user professionally signed off (driver said DONE). */
472
- completed: boolean;
473
- /** Turn at which the simulated user signed off, or null if it never did. */
474
- turnsToCompletion: number | null;
475
- /**
476
- * Turn at which nominal completionCriteria were first all met, or null.
477
- * Distinct from turnsToCompletion: criteria can be met while the
478
- * simulated professional is still unsatisfied with the work's rigor.
479
- */
480
- criteriaMetAtTurn: number | null;
481
- totalTurns: number;
482
- metrics: TurnMetrics[];
483
- finalState: DriverState;
484
- convergenceCurve: number[];
485
- totalCostUsd: number;
486
- finalQualityScore: number | null;
487
- }
488
- interface BenchmarkRunnerConfig {
489
- scenarios: Scenario[];
490
- judges: JudgeFn[];
491
- systemPrompt: string;
492
- model?: string;
493
- judgeModel?: string;
494
- passThreshold?: number;
495
- generation?: number;
496
- promptVersion?: string;
497
- }
498
- interface JudgeInput {
499
- scenario: Scenario;
500
- turns: TurnResult[];
501
- artifacts: CollectedArtifacts;
502
- }
503
- type JudgeFn = (tc: TCloud, input: JudgeInput) => Promise<JudgeScore[]>;
504
-
505
- interface TestResult {
506
- name: string;
507
- passed: boolean;
508
- duration: number;
509
- detail?: string;
510
- checks: CheckResult[];
511
- }
512
- interface CheckResult {
513
- name: string;
514
- passed: boolean;
515
- expected: string;
516
- actual: string;
517
- }
518
- interface EvalResult {
519
- scenario: string;
520
- status: 'pass' | 'fail' | 'skip';
521
- duration: number;
522
- detail?: string;
523
- artifact?: string;
524
- }
525
-
526
233
  /**
527
234
  * ChatClient — the single LLM abstraction analysts call.
528
235
  *
@@ -1811,6 +1518,27 @@ declare class AgentDriver {
1811
1518
  * — exported so harness authors can inspect and regression-test it.
1812
1519
  */
1813
1520
  declare function buildDriverSystemPrompt(persona: PersonaConfig, state: DriverState, productContext?: string): string;
1521
+ interface DecideNextUserTurnOpts {
1522
+ persona: PersonaConfig;
1523
+ state: DriverState;
1524
+ /** Conversation so far — alternating user/assistant messages, oldest first. */
1525
+ history: {
1526
+ role: string;
1527
+ content: string;
1528
+ }[];
1529
+ /** Optional product context woven into the driver prompt. */
1530
+ productContext?: string;
1531
+ /** Driver LLM model. Defaults to claude-sonnet-4-6. */
1532
+ model?: string;
1533
+ }
1534
+ /**
1535
+ * Decide the simulated user's next turn — the reactive, adversarial
1536
+ * turn-generation core of `AgentDriver`, exposed standalone so an in-process
1537
+ * eval harness can drive multi-shot conversations without the `ProductClient`
1538
+ * workspace machinery. Returns the next user message, or the literal "DONE"
1539
+ * when the simulated professional would sign off.
1540
+ */
1541
+ declare function decideNextUserTurn(tc: TCloud, opts: DecideNextUserTurnOpts): Promise<string>;
1814
1542
 
1815
1543
  interface ExecutorConfig {
1816
1544
  /** System prompt for the agent under test */
@@ -2289,139 +2017,6 @@ declare function formatDriverReport(results: DriverResult[]): string;
2289
2017
  /** Print a compact summary to console */
2290
2018
  declare function printDriverSummary(results: DriverResult[]): void;
2291
2019
 
2292
- /**
2293
- * Normalize scores so all dimensions follow "higher = better".
2294
- * Inverted dimensions (hallucination, false_confidence, worst_failure)
2295
- * already use inverted scoring in the prompt (10 = no hallucination),
2296
- * but this function ensures consistency if raw scores leak through.
2297
- */
2298
- declare function normalizeScores(scores: JudgeScore[]): JudgeScore[];
2299
- /** Weighted mean — falls back to uniform weights when omitted */
2300
- declare function weightedMean(scores: {
2301
- score: number;
2302
- weight?: number;
2303
- }[]): number;
2304
- /** Bootstrap confidence interval */
2305
- declare function confidenceInterval(scores: number[], confidence?: number): {
2306
- mean: number;
2307
- lower: number;
2308
- upper: number;
2309
- };
2310
- /**
2311
- * Inter-rater reliability — simplified Krippendorff's alpha.
2312
- *
2313
- * Each inner array is one judge's scores for all items.
2314
- * All arrays must have the same length (same items scored).
2315
- */
2316
- declare function interRaterReliability(judgeScores: JudgeScore[][]): number;
2317
- /**
2318
- * Mann-Whitney U test for comparing two independent groups.
2319
- * Returns U statistic and approximate p-value (normal approximation).
2320
- */
2321
- declare function mannWhitneyU(a: number[], b: number[]): {
2322
- u: number;
2323
- p: number;
2324
- };
2325
- /** Partial credit: returns 0-1 ratio of current toward target */
2326
- declare function partialCredit(current: number, target: number): number;
2327
- /**
2328
- * Paired t-test — before/after measurements on the SAME items.
2329
- * Pairing removes inter-item variance, giving tighter significance than
2330
- * an unpaired test when comparing prompt v1 vs prompt v2 on identical
2331
- * scenarios.
2332
- */
2333
- declare function pairedTTest(before: number[], after: number[]): {
2334
- t: number;
2335
- df: number;
2336
- p: number;
2337
- };
2338
- /**
2339
- * Wilcoxon signed-rank test — paired non-parametric alternative.
2340
- * Use when the differences aren't normally distributed.
2341
- */
2342
- declare function wilcoxonSignedRank(before: number[], after: number[]): {
2343
- w: number;
2344
- p: number;
2345
- };
2346
- /**
2347
- * Cohen's d — standardized effect size for two independent groups.
2348
- * Positive d means group b has higher mean than group a.
2349
- * Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
2350
- */
2351
- declare function cohensD(a: number[], b: number[]): number;
2352
- interface CorpusScoreRecord {
2353
- /** Stable identifier for the rated item (scenario, span, turn, …). */
2354
- itemId: string;
2355
- /** Identifier for the judge that produced this score. */
2356
- judgeName: string;
2357
- /** Dimension name (matches `JudgeScore.dimension`). */
2358
- dimension: string;
2359
- /** Numeric score; must be finite. */
2360
- score: number;
2361
- }
2362
- interface CorpusAgreementPerDimension extends ContinuousAgreement {
2363
- dimension: string;
2364
- /** Item IDs that contributed to this dimension's matrix (every judge scored them). */
2365
- itemIds: string[];
2366
- /** Judge IDs that contributed to this dimension's matrix. */
2367
- judgeIds: string[];
2368
- }
2369
- interface CorpusAgreementReport {
2370
- /** Per-dimension ICC(2,1) + κ_w + Pearson + Spearman + bootstrap CIs. */
2371
- perDimension: CorpusAgreementPerDimension[];
2372
- /** Mean ICC across dimensions (NaN if no dimension yielded a finite ICC). */
2373
- overallIcc: number;
2374
- /** Mean weighted κ across dimensions (NaN if none finite). */
2375
- overallWeightedKappa: number;
2376
- /** Dimensions evaluated (sorted). */
2377
- dimensions: string[];
2378
- /** Judges seen across the corpus (sorted). */
2379
- judgeIds: string[];
2380
- }
2381
- interface CorpusAgreementOptions extends ContinuousAgreementOptions {
2382
- /**
2383
- * Restrict the audit to these dimensions. Default = every dimension
2384
- * that appears in the input. A dimension named here but absent from
2385
- * the input throws — silent omission would corrupt the overall metric.
2386
- */
2387
- dimensions?: string[];
2388
- /**
2389
- * Restrict the audit to these judges. Default = every judge that
2390
- * appears in the input. A judge named here but absent from a
2391
- * dimension throws (see "fail loud" below).
2392
- */
2393
- judges?: string[];
2394
- }
2395
- /**
2396
- * Corpus-wide inter-rater agreement across N items × M judges × D dimensions.
2397
- *
2398
- * For each dimension, builds the [n_items][n_judges] matrix of scores
2399
- * (keeping only items every judge rated on that dimension), then runs
2400
- * `continuousAgreement` to get ICC(2,1), κ_w, Pearson, Spearman, and
2401
- * bootstrap CIs. Reports a pooled mean across dimensions as a single
2402
- * "is this judge panel reliable on this corpus?" number.
2403
- *
2404
- * Fail-loud contract:
2405
- * - Empty input throws.
2406
- * - Fewer than 2 judges or fewer than 2 items per dimension throws.
2407
- * - A judge present in some dimensions but with zero scored items on
2408
- * another dimension throws (would silently shrink the matrix).
2409
- * - Duplicate (itemId, judgeName, dimension) records throw.
2410
- */
2411
- declare function corpusInterRaterAgreement(records: CorpusScoreRecord[], opts?: CorpusAgreementOptions): CorpusAgreementReport;
2412
- /**
2413
- * Convenience adapter for `JudgeScore[]` data keyed externally by item.
2414
- *
2415
- * Use when you have per-item arrays of `JudgeScore[]` (e.g. one
2416
- * `ScenarioResult.judgeScores` per scenario) and want corpus-wide
2417
- * agreement without manually flattening. `itemId` must be unique per
2418
- * row of `itemsScores`.
2419
- */
2420
- declare function corpusInterRaterAgreementFromJudgeScores(itemsScores: Array<{
2421
- itemId: string;
2422
- scores: JudgeScore[];
2423
- }>, opts?: CorpusAgreementOptions): CorpusAgreementReport;
2424
-
2425
2020
  /**
2426
2021
  * Anti-slop quality judge.
2427
2022
  *
@@ -4159,49 +3754,6 @@ interface ParaphraseRobustnessScenarioResult {
4159
3754
  */
4160
3755
  declare function paraphraseRobustnessScenarios(args: ParaphraseRobustnessScenarioInput): Promise<ParaphraseRobustnessScenarioResult>;
4161
3756
 
4162
- /**
4163
- * Power analysis + multiple-comparison correction.
4164
- *
4165
- * Two jobs:
4166
- * 1. Before running: `requiredSampleSize({ effect, alpha, power })`
4167
- * returns the N per arm needed to detect a given effect size.
4168
- * 2. After running: `benjaminiHochberg(pValues, fdr)` and
4169
- * `bonferroni(pValues, alpha)` correct for multiple pairwise tests
4170
- * so pairwise variant comparisons stay statistically honest.
4171
- *
4172
- * Applying alpha directly across n*(n-1)/2 pairwise tests without
4173
- * correction inflates the false-positive rate when variants ≥ 3 — the
4174
- * BH and Bonferroni helpers prevent that.
4175
- */
4176
- /**
4177
- * Required N per arm for a two-sample comparison at target effect size,
4178
- * alpha, and power. Uses the normal-approximation formula:
4179
- *
4180
- * n = 2 * ( (z_{1-α/2} + z_{1-β}) / d )^2
4181
- *
4182
- * where d is Cohen's d. Returns Infinity for effect ≤ 0.
4183
- */
4184
- declare function requiredSampleSize(opts: {
4185
- effect: number;
4186
- alpha?: number;
4187
- power?: number;
4188
- twoSided?: boolean;
4189
- }): number;
4190
- /** Bonferroni adjustment: multiply every p-value by the number of tests, clamp at 1. */
4191
- declare function bonferroni(pValues: number[], alpha?: number): {
4192
- adjusted: number[];
4193
- significant: boolean[];
4194
- };
4195
- /**
4196
- * Benjamini–Hochberg false discovery rate. Returns adjusted q-values and
4197
- * significance at the target FDR. Properly handles ties and preserves
4198
- * monotonicity of q-values.
4199
- */
4200
- declare function benjaminiHochberg(pValues: number[], fdr?: number): {
4201
- qValues: number[];
4202
- significant: boolean[];
4203
- };
4204
-
4205
3757
  /**
4206
3758
  * Visual diff — pixel-delta scoring for UI / visual outputs.
4207
3759
  *
@@ -6316,10 +5868,11 @@ interface JudgeRetryPolicy {
6316
5868
  /** Exponential backoff function, default `attempt → min(500 * 2^attempt, 16_000)`. */
6317
5869
  backoffMs?: (attempt: number) => number;
6318
5870
  /**
6319
- * Predicate deciding whether an error should trigger a retry. Default
6320
- * retries on: AbortError, TimeoutError, `fetch failed`, `ECONNRESET`,
6321
- * `[This operation was aborted]`, and any LlmCallError with status in
6322
- * {429, 502, 503, 504}. JSON-parse errors are NOT retriable (the model
5871
+ * Predicate deciding whether an error should trigger a retry. Defaults to
5872
+ * `isTransientLlmError` — the package-wide classifier shared with
5873
+ * `callLlm` — which retries aborts/timeouts, network faults, HTTP/2
5874
+ * transport faults, and any `LlmCallError` with status in {429,502,503,504}.
5875
+ * JSON-parse and schema-rejection errors are NOT retriable (the model
6323
5876
  * needs prompt adjustment, not another shot).
6324
5877
  */
6325
5878
  isRetryable?: (err: unknown) => boolean;
@@ -6465,4 +6018,4 @@ declare function aggregateTrialsByMode(trials: TrialResult[], opts: {
6465
6018
  mode: AggregatorMode;
6466
6019
  }): TrialAggregate;
6467
6020
 
6468
- export { ANALYST_SEVERITIES, ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AggregatorMode, type AlignmentOp, type Analyst, type AnalystContext, type AnalystCost, type AnalystFinding, type AnalystHooks, type AnalystInputKind, AnalystRegistry, type AnalystRegistryOptions, type AnalystRequirements, type AnalystRunEvent, type AnalystRunInputs, type AnalystRunResult, type AnalystRunSummary, type AnalystSeverity, AnalyzeTracesOptions, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactResult, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, type BudgetPolicy, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type ChatCallOpts, type ChatClient, type ChatRequest, type ChatResponse, type ChatTransport, type CheckResult, type CliBridgeTransportOpts, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, ContinuousAgreement, ContinuousAgreementOptions, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CorpusAgreementOptions, type CorpusAgreementPerDimension, 
type CorpusAgreementReport, type CorpusScoreRecord, type CorrectnessChecker, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateChatClientOpts, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DirEntry, type DirectProviderTransportOpts, type DiscoverPersonasOptions, type DiscoveredPersona, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvalResult, type EvidenceRef, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type 
FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAdapterOpts, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, type JudgeRubric, JudgeRunner, type JudgeScore, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallRequest, LlmCallResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type 
MergeOptions, MetricsCollector, type MockTransportOpts, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersistedFinding, type PersonaConfig, type PersonaRigor, type Playbook, type PlaybookEntry, type PoolSlot, type ProducedProposal, type ProducedState, ProductClient, type ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type 
ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegistryRunOpts, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RouterTransportOpts, type RubricDimension, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxSdkTransportOpts, type SatisfiedBy, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, type TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, 
TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, assertRealBackend, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, buildDriverSystemPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, computeFindingId, confidenceInterval, containsAll, corpusInterRaterAgreement, corpusInterRaterAgreementFromJudgeScores, createAntiSlopJudge, createChatClient, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, 
extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorerFromGrader, localCommandRunner, lowercaseMutator, makeFinding, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, testJudge, 
textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedMean, weightedRecall, whitespaceCollapseMutator, wilcoxonSignedRank, withJudgeRetry, wranglerDeployRunner };
6021
+ export { ANALYST_SEVERITIES, ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AggregatorMode, type AlignmentOp, type Analyst, type AnalystContext, type AnalystCost, type AnalystFinding, type AnalystHooks, type AnalystInputKind, AnalystRegistry, type AnalystRegistryOptions, type AnalystRequirements, type AnalystRunEvent, type AnalystRunInputs, type AnalystRunResult, type AnalystRunSummary, type AnalystSeverity, AnalyzeTracesOptions, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, type BudgetPolicy, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type ChatCallOpts, type ChatClient, type ChatRequest, type ChatResponse, type ChatTransport, CheckResult, type CliBridgeTransportOpts, type CodeMutationOutcome, type CodeMutationRunner, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CorrectnessChecker, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type 
CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateChatClientOpts, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DirEntry, type DirectProviderTransportOpts, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvidenceRef, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type 
HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAdapterOpts, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallRequest, LlmCallResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MockTransportOpts, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type 
MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegistryRunOpts, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type 
RobustnessResult, type RouterTransportOpts, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxSdkTransportOpts, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type 
ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, assertRealBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, computeFindingId, containsAll, createAntiSlopJudge, createChatClient, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, 
integrationManifestValidatedPayload, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorerFromGrader, localCommandRunner, lowercaseMutator, makeFinding, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, wranglerDeployRunner };