@tangle-network/agent-eval 0.27.0 → 0.27.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/CHANGELOG.md +72 -0
  2. package/README.md +4 -5
  3. package/dist/builder-eval/index.js +1 -1
  4. package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
  5. package/dist/{chunk-K2TPS5LB.js → chunk-4U4BKCXK.js} +2 -2
  6. package/dist/chunk-4U4BKCXK.js.map +1 -0
  7. package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
  8. package/dist/chunk-5AKPEK5L.js.map +1 -0
  9. package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
  10. package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
  11. package/dist/chunk-K33INZHH.js.map +1 -0
  12. package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
  13. package/dist/chunk-MAZ26DC7.js.map +1 -0
  14. package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
  15. package/dist/chunk-NCRFYPS3.js.map +1 -0
  16. package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
  17. package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
  18. package/dist/chunk-QHF6EQKK.js.map +1 -0
  19. package/dist/chunk-R5UQJNKC.js +722 -0
  20. package/dist/chunk-R5UQJNKC.js.map +1 -0
  21. package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
  22. package/dist/chunk-RUI6SIHY.js.map +1 -0
  23. package/dist/{chunk-WHZMVFUV.js → chunk-SZSBQUIJ.js} +2 -2
  24. package/dist/chunk-SZSBQUIJ.js.map +1 -0
  25. package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
  26. package/dist/chunk-VSMTAMNK.js.map +1 -0
  27. package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
  28. package/dist/chunk-XFZCM5Z3.js.map +1 -0
  29. package/dist/cli.js +1 -1
  30. package/dist/{control-CBShYYA6.d.ts → control-BT4qnXiS.d.ts} +2 -2
  31. package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BZ_lVLYW.d.ts} +1 -0
  32. package/dist/control.d.ts +3 -3
  33. package/dist/control.js +2 -2
  34. package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -2
  35. package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-D1aGKusy.d.ts} +1 -1
  36. package/dist/governance/index.d.ts +1 -1
  37. package/dist/{index-D3iBCjdF.d.ts → index-BhLlu-qO.d.ts} +1 -1
  38. package/dist/index.d.ts +157 -167
  39. package/dist/index.js +25 -335
  40. package/dist/index.js.map +1 -1
  41. package/dist/knowledge/index.d.ts +1 -1
  42. package/dist/knowledge/index.js +2 -2
  43. package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-U-c8ge1k.d.ts} +1 -1
  44. package/dist/openapi.json +1 -1
  45. package/dist/optimization.d.ts +5 -5
  46. package/dist/optimization.js +5 -5
  47. package/dist/pipelines/index.d.ts +1 -1
  48. package/dist/pipelines/index.js +2 -2
  49. package/dist/{release-report-wfUySN5F.d.ts → release-report-CCQqnK46.d.ts} +1 -1
  50. package/dist/{replay-BL96gCEP.d.ts → replay-D7z0J43-.d.ts} +4 -5
  51. package/dist/reporting.d.ts +4 -4
  52. package/dist/reporting.js +5 -5
  53. package/dist/{researcher-bGkI7vCl.d.ts → researcher-G81CWc0q.d.ts} +9 -10
  54. package/dist/rl.d.ts +26 -44
  55. package/dist/rl.js +5 -5
  56. package/dist/rl.js.map +1 -1
  57. package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
  58. package/dist/{summary-report-DZVXOCK_.d.ts → summary-report-Dl4akLKX.d.ts} +5 -5
  59. package/dist/traces.d.ts +1 -1
  60. package/dist/traces.js +2 -2
  61. package/dist/wire/index.d.ts +2 -2
  62. package/dist/wire/index.js +1 -1
  63. package/docs/research-report-methodology.md +4 -4
  64. package/docs/three-package-architecture.md +12 -24
  65. package/package.json +1 -1
  66. package/dist/chunk-2A5XJB43.js.map +0 -1
  67. package/dist/chunk-4F5DQN55.js.map +0 -1
  68. package/dist/chunk-5LBB5B3Z.js.map +0 -1
  69. package/dist/chunk-I4MBDTY5.js +0 -272
  70. package/dist/chunk-I4MBDTY5.js.map +0 -1
  71. package/dist/chunk-JLZQWFV3.js.map +0 -1
  72. package/dist/chunk-K2TPS5LB.js.map +0 -1
  73. package/dist/chunk-LSH4MMOZ.js.map +0 -1
  74. package/dist/chunk-NU65VQ7M.js.map +0 -1
  75. package/dist/chunk-OWLAAMME.js.map +0 -1
  76. package/dist/chunk-SESZDQPX.js.map +0 -1
  77. package/dist/chunk-WHZMVFUV.js.map +0 -1
  78. package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
  79. package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
  80. package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0
package/dist/index.d.ts CHANGED
@@ -1,28 +1,30 @@
1
- export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-CBShYYA6.js';
1
+ export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-BT4qnXiS.js';
2
2
  import { TCloud } from '@tangle-network/tcloud';
3
- import { C as ControlEvalResult } from './control-runtime-BuJHoLg0.js';
4
- export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BuJHoLg0.js';
3
+ import { C as ControlEvalResult } from './control-runtime-BZ_lVLYW.js';
4
+ export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
5
5
  import { A as AgentEvalError } from './errors-BZ9sTdz7.js';
6
6
  export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-BZ9sTdz7.js';
7
- import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-DfFdrraJ.js';
8
- export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-DfFdrraJ.js';
9
- import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-DZVXOCK_.js';
10
- export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-DZVXOCK_.js';
7
+ import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-D1aGKusy.js';
8
+ export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-D1aGKusy.js';
9
+ import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-Dl4akLKX.js';
10
+ export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-Dl4akLKX.js';
11
11
  export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
12
- import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-wfUySN5F.js';
13
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-wfUySN5F.js';
14
- import { a as FailureCluster } from './failure-cluster-C2EGSDiT.js';
15
- export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-C2EGSDiT.js';
12
+ import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-CCQqnK46.js';
13
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-CCQqnK46.js';
14
+ import { a as FailureCluster } from './failure-cluster-Cw65_5FY.js';
15
+ export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-Cw65_5FY.js';
16
16
  import { a as RunSplitTag, R as RunRecord } from './run-record-CqzahIbx.js';
17
17
  export { c as RunJudgeMetadata, d as RunOutcome, e as RunRecordValidationError, b as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-CqzahIbx.js';
18
18
  import { T as TraceStore, R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, g as BudgetSpec, h as RunFilter, L as LlmSpan } from './store-Db2Bv8Cf.js';
19
19
  export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, M as Message, e as RetrievalSpan, m as RunLayer, n as RunStatus, f as SandboxSpan, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
20
+ import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './index-BhLlu-qO.js';
21
+ export { b as CalibrationResult, c as CandidateScore, d as ContinuousCalibrationResult, D as DEFAULT_RED_TEAM_CORPUS, E as EuRiskClass, G as GoldenItem, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, P as PositionalBiasResult, R as RedTeamCase, h as RedTeamCategory, i as RedTeamFinding, j as RedTeamPayload, k as RedTeamReport, S as SelfPreferenceResult, U as UseCaseSignals, V as VerbosityBiasResult, l as calibrateJudge, m as calibrateJudgeContinuous, n as classifyEuAiRisk, o as continuousAgreement, p as euAiActReport, q as nistAiRmfReport, r as positionalBias, s as redTeamDataset, t as redTeamReport, u as renderMarkdown, v as scoreRedTeamOutput, w as selfPreference, x as soc2Report, y as summarize, z as toolNamesForRun, A as verbosityBias } from './index-BhLlu-qO.js';
20
22
  import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-B2kWEdh9.js';
21
23
  export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-B2kWEdh9.js';
22
24
  import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
23
25
  export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DP_cSSiw.js';
24
26
  export { F as FileSystemRawProviderSink, d as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, e as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, f as RawProviderDirection, c as RawProviderEvent, R as RawProviderSink, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-DK2EBVZC.js';
25
- export { D as DEFAULT_REDACTION_RULES, O as OTEL_AGENT_EVAL_SCOPE, a as OtlpExport, b as OtlpResourceSpans, c as OtlpSpan, R as REDACTION_VERSION, d as RedactionReport, e as RedactionRule, f as ReplayCache, g as ReplayCacheEntry, h as ReplayCacheMissError, i as ReplayCacheStats, j as ReplayFetchOptions, k as createReplayFetch, l as exportRunAsOtlp, m as iterateRawCalls, r as redactString, n as redactValue } from './replay-BL96gCEP.js';
27
+ export { D as DEFAULT_REDACTION_RULES, O as OTEL_AGENT_EVAL_SCOPE, a as OtlpExport, b as OtlpResourceSpans, c as OtlpSpan, R as REDACTION_VERSION, d as RedactionReport, e as RedactionRule, f as ReplayCache, g as ReplayCacheEntry, h as ReplayCacheMissError, i as ReplayCacheStats, j as ReplayFetchOptions, k as createReplayFetch, l as exportRunAsOtlp, m as iterateRawCalls, r as redactString, n as redactValue } from './replay-D7z0J43-.js';
26
28
  export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
27
29
  import { a as BaselineReport } from './baseline-4R5deP0N.js';
28
30
  export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-4R5deP0N.js';
@@ -30,14 +32,13 @@ import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
30
32
  export { b as buildTrajectory } from './trajectory-CnoBo-JY.js';
31
33
  import { a as DatasetScenario, c as Dataset } from './dataset-CiK_3LDr.js';
32
34
  export { d as DatasetDifficulty, b as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-CiK_3LDr.js';
33
- export { C as CalibrationResult, a as CandidateScore, b as ContinuousAgreement, c as ContinuousAgreementOptions, d as ContinuousCalibrationResult, D as DEFAULT_RED_TEAM_CORPUS, E as EuRiskClass, G as GoldenItem, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, P as PositionalBiasResult, R as RedTeamCase, h as RedTeamCategory, i as RedTeamFinding, j as RedTeamPayload, k as RedTeamReport, S as SelfPreferenceResult, U as UseCaseSignals, V as VerbosityBiasResult, l as calibrateJudge, m as calibrateJudgeContinuous, n as classifyEuAiRisk, o as continuousAgreement, p as euAiActReport, q as nistAiRmfReport, r as positionalBias, s as redTeamDataset, t as redTeamReport, u as renderMarkdown, v as scoreRedTeamOutput, w as selfPreference, x as soc2Report, y as summarize, z as toolNamesForRun, A as verbosityBias } from './index-D3iBCjdF.js';
34
35
  import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
35
- import { L as Layer, S as Severity, a as LayerResult, b as VerifyContext } from './multi-layer-verifier-LkP3LVKj.js';
36
- export { F as Finding, c as LayerStatus, M as MultiLayerVerifier, V as VerificationReport, d as VerifyOptions, g as gradeSemanticStatus } from './multi-layer-verifier-LkP3LVKj.js';
37
- import { L as LlmClientOptions } from './researcher-bGkI7vCl.js';
38
- export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, m as LlmCallError, n as LlmCallRequest, o as LlmCallResult, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './researcher-bGkI7vCl.js';
36
+ import { L as Layer, S as Severity, a as LayerResult, b as VerifyContext } from './multi-layer-verifier-U-c8ge1k.js';
37
+ export { F as Finding, c as LayerStatus, M as MultiLayerVerifier, V as VerificationReport, d as VerifyOptions, g as gradeSemanticStatus } from './multi-layer-verifier-U-c8ge1k.js';
38
+ import { L as LlmClientOptions } from './researcher-G81CWc0q.js';
39
+ export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, m as LlmCallError, n as LlmCallRequest, o as LlmCallResult, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './researcher-G81CWc0q.js';
39
40
  export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index--fVrWDiR.js';
40
- export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-Dgz1n51-.js';
41
+ export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
41
42
  import './outcome-store-D6KWmYvj.js';
42
43
 
43
44
  /**
@@ -64,8 +65,7 @@ import './outcome-store-D6KWmYvj.js';
64
65
  * Both implement the small `AutoPrClient` interface, so tests substitute
65
66
  * a fake without spinning a process or network.
66
67
  *
67
- * @experimental — added in 0.25.0. Surface may evolve as consumers wire
68
- * it into CI workflows.
68
+ * @experimental — surface may evolve as consumers wire it into CI workflows.
69
69
  */
70
70
  interface FileChange {
71
71
  /** Repo-relative path. Forward slashes; no `..`. */
@@ -747,8 +747,7 @@ declare class MetricsCollector {
747
747
  * primitive is idempotent + replayable: re-running with the same
748
748
  * `runId` will produce the same plan.
749
749
  *
750
- * @experimental — added in 0.25.0. Surface may evolve as the 5 product
751
- * agents wire it in.
750
+ * @experimental — surface may evolve as product agents wire it in.
752
751
  */
753
752
 
754
753
  interface FailureClusterConfig {
@@ -998,6 +997,78 @@ declare function wilcoxonSignedRank(before: number[], after: number[]): {
998
997
  * Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
999
998
  */
1000
999
  declare function cohensD(a: number[], b: number[]): number;
1000
+ interface CorpusScoreRecord {
1001
+ /** Stable identifier for the rated item (scenario, span, turn, …). */
1002
+ itemId: string;
1003
+ /** Identifier for the judge that produced this score. */
1004
+ judgeName: string;
1005
+ /** Dimension name (matches `JudgeScore.dimension`). */
1006
+ dimension: string;
1007
+ /** Numeric score; must be finite. */
1008
+ score: number;
1009
+ }
1010
+ interface CorpusAgreementPerDimension extends ContinuousAgreement {
1011
+ dimension: string;
1012
+ /** Item IDs that contributed to this dimension's matrix (every judge scored them). */
1013
+ itemIds: string[];
1014
+ /** Judge IDs that contributed to this dimension's matrix. */
1015
+ judgeIds: string[];
1016
+ }
1017
+ interface CorpusAgreementReport {
1018
+ /** Per-dimension ICC(2,1) + κ_w + Pearson + Spearman + bootstrap CIs. */
1019
+ perDimension: CorpusAgreementPerDimension[];
1020
+ /** Mean ICC across dimensions (NaN if no dimension yielded a finite ICC). */
1021
+ overallIcc: number;
1022
+ /** Mean weighted κ across dimensions (NaN if none finite). */
1023
+ overallWeightedKappa: number;
1024
+ /** Dimensions evaluated (sorted). */
1025
+ dimensions: string[];
1026
+ /** Judges seen across the corpus (sorted). */
1027
+ judgeIds: string[];
1028
+ }
1029
+ interface CorpusAgreementOptions extends ContinuousAgreementOptions {
1030
+ /**
1031
+ * Restrict the audit to these dimensions. Default = every dimension
1032
+ * that appears in the input. A dimension named here but absent from
1033
+ * the input throws — silent omission would corrupt the overall metric.
1034
+ */
1035
+ dimensions?: string[];
1036
+ /**
1037
+ * Restrict the audit to these judges. Default = every judge that
1038
+ * appears in the input. A judge named here but absent from a
1039
+ * dimension throws (see "fail loud" below).
1040
+ */
1041
+ judges?: string[];
1042
+ }
1043
+ /**
1044
+ * Corpus-wide inter-rater agreement across N items × M judges × D dimensions.
1045
+ *
1046
+ * For each dimension, builds the [n_items][n_judges] matrix of scores
1047
+ * (keeping only items every judge rated on that dimension), then runs
1048
+ * `continuousAgreement` to get ICC(2,1), κ_w, Pearson, Spearman, and
1049
+ * bootstrap CIs. Reports a pooled mean across dimensions as a single
1050
+ * "is this judge panel reliable on this corpus?" number.
1051
+ *
1052
+ * Fail-loud contract:
1053
+ * - Empty input throws.
1054
+ * - Fewer than 2 judges or fewer than 2 items per dimension throws.
1055
+ * - A judge present in some dimensions but with zero scored items on
1056
+ * another dimension throws (would silently shrink the matrix).
1057
+ * - Duplicate (itemId, judgeName, dimension) records throw.
1058
+ */
1059
+ declare function corpusInterRaterAgreement(records: CorpusScoreRecord[], opts?: CorpusAgreementOptions): CorpusAgreementReport;
1060
+ /**
1061
+ * Convenience adapter for `JudgeScore[]` data keyed externally by item.
1062
+ *
1063
+ * Use when you have per-item arrays of `JudgeScore[]` (e.g. one
1064
+ * `ScenarioResult.judgeScores` per scenario) and want corpus-wide
1065
+ * agreement without manually flattening. `itemId` must be unique per
1066
+ * row of `itemsScores`.
1067
+ */
1068
+ declare function corpusInterRaterAgreementFromJudgeScores(itemsScores: Array<{
1069
+ itemId: string;
1070
+ scores: JudgeScore[];
1071
+ }>, opts?: CorpusAgreementOptions): CorpusAgreementReport;
1001
1072
 
1002
1073
  /**
1003
1074
  * Anti-slop quality judge.
@@ -2623,9 +2694,9 @@ declare function paraphraseRobustnessScenarios(args: ParaphraseRobustnessScenari
2623
2694
  * `bonferroni(pValues, alpha)` correct for multiple pairwise tests
2624
2695
  * so pairwise variant comparisons stay statistically honest.
2625
2696
  *
2626
- * Fixes the correctness bug in 0.2's pairwise optimizer which applied
2627
- * alpha directly across n*(n-1)/2 pairwise tests without correction
2628
- * dramatically inflating false-positive rate when variants ≥ 3.
2697
+ * Applying alpha directly across n*(n-1)/2 pairwise tests without
2698
+ * correction inflates the false-positive rate when variants ≥ 3 — the
2699
+ * BH and Bonferroni helpers prevent that.
2629
2700
  */
2630
2701
  /**
2631
2702
  * Required N per arm for a two-sample comparison at target effect size,
@@ -2940,12 +3011,9 @@ interface HypothesisManifest {
2940
3011
  * Identifier for the hashing scheme used to produce `contentHash`.
2941
3012
  *
2942
3013
  * `'sha256-content'` — sha256 hex over the canonicalized manifest with
2943
- * the `contentHash` and `algo` fields stripped. This is what
2944
- * `signManifest` produces today.
2945
- *
2946
- * Held as a string union so future schemes can be added without
2947
- * breaking parsers; legacy SignedManifest values written before this
2948
- * field existed will deserialize cleanly because the field is optional.
3014
+ * the `contentHash` and `algo` fields stripped. Held as a string union
3015
+ * so future schemes can be added without breaking parsers; SignedManifest
3016
+ * values without `algo` deserialize cleanly because the field is optional.
2949
3017
  */
2950
3018
  type SignedManifestAlgo = 'sha256-content';
2951
3019
  interface SignedManifest extends HypothesisManifest {
@@ -2954,10 +3022,10 @@ interface SignedManifest extends HypothesisManifest {
2954
3022
  /**
2955
3023
  * Algorithm string describing how `contentHash` was produced.
2956
3024
  *
2957
- * Optional on the type so legacy serialized manifests (pre-`algo`)
2958
- * still parse, but ALWAYS populated by {@link signManifest}.
2959
- * Consumers that want to enforce a known algorithm should reject
2960
- * manifests where this field is missing or unrecognized.
3025
+ * Optional on the type so serialized manifests without it still parse,
3026
+ * but ALWAYS populated by {@link signManifest}. Consumers that want to
3027
+ * enforce a known algorithm should reject manifests where this field
3028
+ * is missing or unrecognized.
2961
3029
  */
2962
3030
  algo?: SignedManifestAlgo;
2963
3031
  }
@@ -2996,10 +3064,9 @@ declare function canonicalize(v: unknown): unknown;
2996
3064
  * - encoder choice (UTF-8 via TextEncoder, fixed)
2997
3065
  * - runtime (uses the Web Crypto subtle digest, present in Node ≥18 and browsers)
2998
3066
  *
2999
- * Naming note: `hashJson` rather than `hashContent` because `hashContent` is
3000
- * already taken in `prompt-registry.ts` for the truncated 12-char prompt-id
3001
- * helper, which has different semantics (string input, short return). Both
3002
- * coexist; `hashJson` is the right name when you mean "canonicalize then hash."
3067
+ * Named `hashJson` to disambiguate from `prompt-registry.ts`'s `hashContent`,
3068
+ * which takes a string input and returns a truncated 12-char prompt id.
3069
+ * Use `hashJson` when you mean "canonicalize then hash."
3003
3070
  *
3004
3071
  * @example
3005
3072
  * const hash = await hashJson({ id: '1', kind: 'spec' })
@@ -3012,17 +3079,15 @@ declare function hashJson<T>(obj: T): Promise<string>;
3012
3079
  * The hash covers the canonicalized manifest with the `contentHash`
3013
3080
  * and `algo` fields stripped; this lets verifiers re-sign the rest and
3014
3081
  * compare. Returned manifest always carries `algo: 'sha256-content'`
3015
- * so downstream consumers can identify the scheme; legacy serialized
3016
- * manifests without `algo` still verify because it is stripped before
3017
- * hashing on both sides.
3082
+ * so downstream consumers can identify the scheme; manifests without
3083
+ * `algo` still verify because it is stripped before hashing on both sides.
3018
3084
  */
3019
3085
  declare function signManifest(m: HypothesisManifest): Promise<SignedManifest>;
3020
3086
  /**
3021
3087
  * Verify that a signed manifest has not been tampered with.
3022
3088
  *
3023
- * Strips `contentHash` and `algo` before re-signing so legacy manifests
3024
- * (written before `algo` was emitted) verify identically to current
3025
- * ones.
3089
+ * Strips `contentHash` and `algo` before re-signing so manifests without
3090
+ * `algo` verify identically to ones that carry it.
3026
3091
  */
3027
3092
  declare function verifyManifest(m: SignedManifest): Promise<boolean>;
3028
3093
  /**
@@ -3334,9 +3399,8 @@ declare const localCommandRunner: CommandRunner;
3334
3399
  * - artifact dir contains an entry point (index.html for static SPAs,
3335
3400
  * equivalent per framework family)
3336
3401
  *
3337
- * Shipped in 0.11 with the canonical `vite` runner. Future generations
3338
- * add wrangler-deploy --dry-run, next-build, etc — each as another
3339
- * runner factory.
3402
+ * Ships with a canonical `vite` runner. Additional runners
3403
+ * (wrangler-deploy --dry-run, next-build, etc.) plug in as factories.
3340
3404
  */
3341
3405
 
3342
3406
  type DeployFamily = 'frontend-static' | 'nextjs' | 'remix' | 'fullstack-ts';
@@ -3508,10 +3572,8 @@ declare function extractErrorCount(text: string, opts?: ExtractOptions): Extract
3508
3572
  * - test: in-memory mock that returns canned step outcomes
3509
3573
  * - future: Playwright, Puppeteer, custom scrapers
3510
3574
  *
3511
- * Shipped in 0.11 alongside {@link runIntentMatchJudge} — together they
3512
- * close the "the agent shipped the wrong app and we didn't catch it"
3513
- * blind spot. Intent-match catches "wrong app entirely"; flow catches
3514
- * "right app but the buttons don't work."
3575
+ * Paired with {@link runIntentMatchJudge}: intent-match catches "wrong
3576
+ * app entirely"; flow-layer catches "right app but the buttons don't work."
3515
3577
  */
3516
3578
 
3517
3579
  type FlowAction = 'navigate' | 'click' | 'fill' | 'expect-text' | 'expect-element' | 'expect-url' | 'wait';
@@ -3607,10 +3669,6 @@ declare function flowLayer<Env extends FlowLayerEnv = FlowLayerEnv>(input: FlowL
3607
3669
  *
3608
3670
  * Soft-fails on LLM/JSON error (`available: false`) so callers can
3609
3671
  * treat failure as "judge skipped."
3610
- *
3611
- * Added in 0.11 to replace the lying `completenessScore: 1` field that
3612
- * VerticalBench shipped pre-Gen-48 — that field was keyword-driven and
3613
- * fired true on builds with zero spec concepts implemented.
3614
3672
  */
3615
3673
 
3616
3674
  declare const INTENT_MATCH_JUDGE_VERSION = "intent-match-judge-v1-2026-04-24";
@@ -4142,7 +4200,7 @@ declare function createDefaultReviewer(options: CreateDefaultReviewerOptions): (
4142
4200
  */
4143
4201
 
4144
4202
  /**
4145
- * Implementation complexity class for weighted scoring (added 0.11).
4203
+ * Implementation complexity class for weighted scoring.
4146
4204
  *
4147
4205
  * - `render` (default): the concept is a UI surface that displays static
4148
4206
  * data — render a list, show a counter, lay out a button. Single-file
@@ -4212,11 +4270,10 @@ interface SemanticConceptJudgeResult {
4212
4270
  error?: string;
4213
4271
  }
4214
4272
  /**
4215
- * Score-aggregation strategy. Default `mean` (legacy behavior — 0.10
4216
- * and earlier always averaged 0-10 scores). `complexity` applies the
4217
- * default weight table (render=1, integrate=2, compute=2.5) unless a
4218
- * concept has an explicit `weight`. `explicit` honors only `weight`
4219
- * (defaulting to 1 for unspecified).
4273
+ * Score-aggregation strategy. `mean` averages 0-10 scores uniformly.
4274
+ * `complexity` applies the default weight table (render=1, integrate=2,
4275
+ * compute=2.5) unless a concept has an explicit `weight`. `explicit`
4276
+ * honors only `weight` (defaulting to 1 for unspecified).
4220
4277
  */
4221
4278
  type ConceptWeightStrategy = 'mean' | 'complexity' | 'explicit';
4222
4279
  declare const DEFAULT_COMPLEXITY_WEIGHTS: Record<ConceptComplexity, number>;
@@ -4234,9 +4291,9 @@ interface SemanticConceptJudgeOptions {
4234
4291
  /** LlmClient config (baseUrl, apiKey, authHeader, …). */
4235
4292
  llm?: LlmClientOptions;
4236
4293
  /**
4237
- * Score aggregation strategy. Default `mean` for backward compatibility
4238
- * with 0.10 and earlier callers. Cross-vertical comparisons should use
4239
- * `complexity` to neutralize the integrate-vs-render asymmetry.
4294
+ * Score aggregation strategy. Default `mean` — uniform average across
4295
+ * concepts. Cross-vertical comparisons should use `complexity` to
4296
+ * neutralize the integrate-vs-render asymmetry.
4240
4297
  */
4241
4298
  weightConcepts?: ConceptWeightStrategy;
4242
4299
  /** Override the default complexity → weight table. */
@@ -4458,9 +4515,8 @@ interface LineageNode {
4458
4515
  }
4459
4516
  /**
4460
4517
  * `kindOf` decides whether a variant is a seed (no parent), code mutation,
4461
- * or prompt mutation. Default looks at `variant.payload.codeMutation` —
4462
- * that field is part of the audit-bench convention but cheap enough to
4463
- * accept any payload that mirrors it. Override by passing your own.
4518
+ * or prompt mutation. Default looks at `variant.payload.codeMutation` and
4519
+ * accepts any payload that exposes that field; override by passing your own.
4464
4520
  */
4465
4521
  type LineageKindResolver<P> = (variant: EvolvableVariant<P>) => LineageKind;
4466
4522
  /**
@@ -4707,9 +4763,8 @@ declare function createSandboxCodeMutator<T, P>(opts: CreateSandboxCodeMutatorOp
4707
4763
  * Δ improvement (auto-detect when prompt evolution has
4708
4764
  * hit a structural ceiling).
4709
4765
  *
4710
- * Naming is generic: the original audit-bench version called the channels
4711
- * "prompt" and "code" — those are the canonical use cases, but the
4712
- * primitive doesn't care what each mutator actually does.
4766
+ * Naming is generic — the canonical use cases are "prompt" and "code"
4767
+ * channels, but the primitive doesn't care what each mutator actually does.
4713
4768
  */
4714
4769
 
4715
4770
  type CompositePolicy = 'primary-only' | 'secondary-only' | 'alternate' | 'plateau';
@@ -4754,25 +4809,15 @@ declare class Mutex {
4754
4809
  }
4755
4810
 
4756
4811
  /**
4757
- * Persona discovery replaces every consumer's hardcoded TRAINING_PERSONA_FILES.
4758
- *
4759
- * Today's failure mode: each product agent (legal/gtm/tax/creative) defines
4760
- * a TRAINING_PERSONA_FILES const with 5 hardcoded filenames. When the 2yr
4761
- * rewrite added 10+ new personas, those personas existed on disk but the
4762
- * evolve runner never loaded them — the new rubric dims (audit_defendability,
4763
- * intake_discipline, etc) got no training signal. The personas were
4764
- * cosmetic, the rewrites partially uninformed.
4765
- *
4766
- * `discoverPersonas` walks a personas directory and returns every persona
4767
- * file matching the convention. Consumers can filter by include/exclude
4768
- * patterns. Default behavior — discover everything — eliminates the
4769
- * "forgot to add the new persona to the list" failure mode.
4812
+ * Walk a personas directory and return every file matching the convention
4813
+ * `NN-slug.{yaml,yml,json,md}`. Sorted by filename so the numeric prefix
4814
+ * gives stable persona ordering for reproducibility. Consumers filter
4815
+ * through `include` / `exclude`.
4770
4816
  */
4771
4817
  interface DiscoverPersonasOptions {
4772
4818
  /**
4773
4819
  * Regex applied to filenames. Files that don't match are skipped.
4774
- * Default: `^[0-9]{2}-.+\.(yaml|yml|json|md)$` (the prevailing convention
4775
- * across legal/gtm/tax/creative: `NN-slug.yaml`).
4820
+ * Default: `^[0-9]{2}-.+\.(yaml|yml|json|md)$`.
4776
4821
  */
4777
4822
  pattern?: RegExp;
4778
4823
  /**
@@ -4782,14 +4827,10 @@ interface DiscoverPersonasOptions {
4782
4827
  exclude?: readonly string[];
4783
4828
  /**
4784
4829
  * If set, return only personas whose basename contains one of these
4785
- * substrings (post-pattern filter). Used by the CLI's `--personas a,b,c`
4786
- * flag — consumers pass through.
4830
+ * substrings (post-pattern filter).
4787
4831
  */
4788
4832
  include?: readonly string[];
4789
- /**
4790
- * Recurse into subdirectories. Default false (legal/gtm/tax/creative all
4791
- * store personas flat).
4792
- */
4833
+ /** Recurse into subdirectories. Default false. */
4793
4834
  recursive?: boolean;
4794
4835
  }
4795
4836
  interface DiscoveredPersona {
@@ -4800,14 +4841,6 @@ interface DiscoveredPersona {
4800
4841
  /** Filename without extension — the conventional persona id. */
4801
4842
  id: string;
4802
4843
  }
4803
- /**
4804
- * Walk `dir` and return every persona file matching the convention. Async
4805
- * because the consumer almost always wants this to be I/O-driven (so a new
4806
- * persona added on disk is picked up without a code change).
4807
- *
4808
- * Sorted by filename (which gives stable persona id order via the `NN-`
4809
- * numeric prefix convention) for reproducibility.
4810
- */
4811
4844
  declare function discoverPersonas(dir: string, opts?: DiscoverPersonasOptions): Promise<DiscoveredPersona[]>;
4812
4845
 
4813
4846
  /**
@@ -4914,43 +4947,17 @@ declare class JsonlTrialCache implements TrialCache {
4914
4947
  }
4915
4948
 
4916
4949
  /**
4917
- * Judge-retry wrapper.
4918
- *
4919
- * Today's failure mode: a judge LLM call aborts mid-stream (connection
4920
- * dropped, model timed out, schema rejected) — consumer's try/catch swallows
4921
- * the error and returns `score: 0`. The eval composite then weights that
4922
- * zero into the mean, silently corrupting the score. Today's tax/gtm evals
4923
- * had `judge=0` across every trial — the prompt rewrites couldn't be
4924
- * evaluated honestly because the measurement instrument was broken.
4950
+ * Wrap a single judge LLM call with retry, optional fallback-model
4951
+ * rotation, exponential backoff, and a typed `JudgeRetryOutcome`. Callers
4952
+ * MUST inspect `succeeded` before using `value`; on failure the library
4953
+ * returns `value: null` rather than substituting a default, so a judge
4954
+ * abort cannot silently corrupt a downstream composite.
4925
4955
  *
4926
- * `withJudgeRetry` is the substrate fix. It wraps a single judge invocation
4927
- * with:
4928
- *
4929
- * 1. N retry attempts on transient failures (abort, timeout, network).
4930
- * 2. Optional fallback-model rotation — try the next model in the list
4931
- * if the primary keeps aborting (a verbose new prompt may stream-abort
4932
- * on claude-code/sonnet but succeed on kimi-code/k2p6).
4933
- * 3. Exponential backoff between attempts.
4934
- * 4. A typed outcome `{ succeeded, attempts, value, error }` that callers
4935
- * MUST decide what to do with. No silent zero.
4936
- *
4937
- * The reporting contract: callers ship `TrialResult.judgeSucceeded = succeeded`
4938
- * and `TrialResult.judgeAttempts = attempts`. `aggregateTrials({mode: 'exclude-failed'})`
4939
- * then skips failed-judge trials when computing composites.
4940
- *
4941
- * The library does NOT decide what score to record on failure — that's the
4942
- * caller's product choice. Today's product agents (legal/gtm/tax/creative)
4943
- * should set `score: NaN` + `judgeSucceeded: false` + `error: ...` so the
4944
- * aggregator's exclude-failed mode drops the trial. Defaulting to 0 is what
4945
- * caused today's data corruption.
4946
- */
4947
- /**
4948
- * Retry policy for judge LLM calls.
4949
- *
4950
- * Defaults are tuned for the verbose post-2yr-rewrite prompts that exceed
4951
- * the 60s `callLlm` default and abort on streaming. Pick a different timeout
4952
- * for cheap-and-quick judges (e.g., 30s) or longer for thinking models.
4956
+ * Reporting contract: callers ship `TrialResult.judgeSucceeded = succeeded`
4957
+ * and `TrialResult.judgeAttempts = attempts` so `aggregateTrialsByMode`
4958
+ * with `mode: 'exclude-failed'` drops the trial.
4953
4959
  */
4960
+ /** Retry policy for judge LLM calls. */
4954
4961
  interface JudgeRetryPolicy {
4955
4962
  /** Max attempts per model. Default 3 (one initial + two retries). */
4956
4963
  maxAttempts?: number;
@@ -5003,8 +5010,8 @@ interface JudgeRetryOutcome<T> {
5003
5010
  * to their underlying fetch/SDK call so the abort actually fires.
5004
5011
  *
5005
5012
  * Returns a typed outcome — callers MUST inspect `succeeded` before using
5006
- * `value`. The library refuses to default to a silent zero score because that
5007
- * is exactly what caused today's eval data corruption.
5013
+ * `value`. The library refuses to default to a silent zero score because a
5014
+ * synthetic zero is indistinguishable from a real low score downstream.
5008
5015
  */
5009
5016
  declare function withJudgeRetry<T>(judgeFn: (model: string, signal: AbortSignal) => Promise<T>, policy?: JudgeRetryPolicy): Promise<JudgeRetryOutcome<T>>;
5010
5017
 
@@ -5070,42 +5077,30 @@ declare function referenceReplayRunsToSteeringRows<Input = unknown>(runs: Refere
5070
5077
  declare function referenceReplayScenarioToRunScore(scenarioScore: ReferenceReplayScenarioScore, durationMs?: number): RunScore;
5071
5078
 
5072
5079
  /**
5073
- * Trial-aggregator modes.
5074
- *
5075
- * The prompt-evolution loop's internal `aggregateTrials` defaulted to
5076
- * including every non-`error` trial in the mean — which corrupted the mean
5077
- * when a trial had `score: 0` because the judge silently aborted (the
5078
- * caller's try/catch swallowed the abort and returned zero). Today's
5079
- * tax/gtm evals show this: every trial scored judge=0 because the judge
5080
- * aborted, and the composite then reflected `structural * 0.3 + slop * 0.1`
5081
- * instead of the intended `judge * 0.6 + structural * 0.3 + slop * 0.1`.
5080
+ * Aggregate trials with explicit handling of judge failure. Three modes:
5082
5081
  *
5083
- * `aggregateTrialsByMode` is the substrate fix. Consumers can choose:
5082
+ * - `strict-fail` — any `judgeSucceeded === false` trial fails the whole
5083
+ * aggregate. Use for production gates: one corrupt trial halts the gate.
5084
5084
  *
5085
- * - `strict-fail` — any trial with `judgeSucceeded === false` fails the
5086
- * whole aggregate. Right for production-gate runs where one corrupted
5087
- * trial means "we don't know if the prompt is good, halt the gate."
5085
+ * - `exclude-failed` — drop `judgeSucceeded === false` trials from the
5086
+ * mean; report `excludedFailedTrials` separately. Default for new code.
5088
5087
  *
5089
- * - `exclude-failed` — drop trials with `judgeSucceeded === false` from
5090
- * the mean; report `failedTrials` separately. Right for research /
5091
- * comparison runs where you want to use the signal that DID land.
5092
- * Default for new code.
5088
+ * - `zero-fill` — failed trials count as `score: 0` in the mean. Available
5089
+ * only for adapters that don't yet set `judgeSucceeded`.
5093
5090
  *
5094
- * - `zero-fill` — legacy behavior: failed trials count as score=0 in
5095
- * the mean. Default ONLY for backwards-compat with adapters that
5096
- * don't yet set `judgeSucceeded`. Migrate off this — it's the source
5097
- * of today's data corruption.
5091
+ * Hard-errored trials (`t.error` set) are always excluded — those are
5092
+ * infrastructure failures, not eval signal.
5098
5093
  */
5099
5094
 
5100
5095
  type AggregatorMode = 'strict-fail' | 'exclude-failed' | 'zero-fill';
5101
5096
  interface TrialAggregate {
5102
5097
  /** Mean score over the trials counted by the chosen mode. */
5103
5098
  meanScore: number;
5104
- /** Mean cost (legacy, kept for compatibility). */
5099
+ /** Mean cost across counted trials. */
5105
5100
  meanCost: number;
5106
- /** Mean wall time (legacy). */
5101
+ /** Mean wall time across counted trials. */
5107
5102
  meanDurationMs: number;
5108
- /** ok-rate (legacy). */
5103
+ /** Fraction of counted trials with `ok === true`. */
5109
5104
  okRate: number;
5110
5105
  /** Trials counted in the mean (mode-dependent). */
5111
5106
  countedTrials: number;
@@ -5125,13 +5120,8 @@ interface TrialAggregate {
5125
5120
  firstError?: string;
5126
5121
  };
5127
5122
  }
5128
- /**
5129
- * Aggregate trials with explicit failed-judge handling. Returns counts for
5130
- * counted + excluded so callers can surface "the score is based on 7 of 10
5131
- * trials; 3 judges failed" instead of silently weighting zero.
5132
- */
5133
5123
  declare function aggregateTrialsByMode(trials: TrialResult[], opts: {
5134
5124
  mode: AggregatorMode;
5135
5125
  }): TrialAggregate;
5136
5126
 
5137
- export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AggregatorMode, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, DatasetScenario, type DeployFamily, type 
DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DirEntry, type DiscoverPersonasOptions, type DiscoveredPersona, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeConfig, type 
JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, type JudgeRubric, JudgeRunner, type JudgeScore, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, ProductClient, type ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type 
ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type 
SynthesisTarget, type TestResult, type ThresholdContract, TokenCounter, type TokenSpec, TraceEmitter, TraceEvent, TraceStore, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, buildReviewerPrompt, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, confidenceInterval, containsAll, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, 
inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, linterJudge, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedMean, weightedRecall, whitespaceCollapseMutator, wilcoxonSignedRank, withJudgeRetry, wranglerDeployRunner };
5127
+ export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AggregatorMode, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, ContinuousAgreement, ContinuousAgreementOptions, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CorpusAgreementOptions, type CorpusAgreementPerDimension, type CorpusAgreementReport, type CorpusScoreRecord, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, 
DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DirEntry, type DiscoverPersonasOptions, type DiscoveredPersona, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type 
IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, type JudgeRubric, JudgeRunner, type JudgeScore, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, ProductClient, type ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, 
type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type 
SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TestResult, type ThresholdContract, TokenCounter, type TokenSpec, TraceEmitter, TraceEvent, TraceStore, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, buildReviewerPrompt, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, confidenceInterval, containsAll, corpusInterRaterAgreement, corpusInterRaterAgreementFromJudgeScores, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, 
findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, linterJudge, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedMean, weightedRecall, whitespaceCollapseMutator, wilcoxonSignedRank, withJudgeRetry, wranglerDeployRunner };