@tangle-network/agent-eval 0.41.0 → 0.43.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/dist/benchmarks/index.js +2 -2
  2. package/dist/builder-eval/index.js +1 -1
  3. package/dist/campaign/index.d.ts +90 -368
  4. package/dist/campaign/index.js +74 -4
  5. package/dist/campaign/index.js.map +1 -1
  6. package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
  7. package/dist/chunk-H4TOS272.js.map +1 -0
  8. package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
  9. package/dist/chunk-KQ26DYTQ.js.map +1 -0
  10. package/dist/{chunk-6QDKWHLS.js → chunk-MHQPVHXU.js} +2 -2
  11. package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
  12. package/dist/chunk-MNL6LXGQ.js.map +1 -0
  13. package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
  14. package/dist/chunk-N4SBKEPJ.js.map +1 -0
  15. package/dist/chunk-NSBPE2FW.js +17 -0
  16. package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
  17. package/dist/{chunk-YNMCYUWT.js → chunk-RXK7FXLV.js} +92 -37
  18. package/dist/chunk-RXK7FXLV.js.map +1 -0
  19. package/dist/cli.js +1 -1
  20. package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
  21. package/dist/control.d.ts +2 -2
  22. package/dist/control.js +1 -1
  23. package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
  24. package/dist/governance/index.js +1 -1
  25. package/dist/index.d.ts +227 -687
  26. package/dist/index.js +755 -1239
  27. package/dist/index.js.map +1 -1
  28. package/dist/integrity-CTDhR1Sg.d.ts +81 -0
  29. package/dist/knowledge/index.js +1 -1
  30. package/dist/llm-client-BXVRUZyX.d.ts +234 -0
  31. package/dist/matrix/index.js +1 -1
  32. package/dist/meta-eval/index.js +1 -1
  33. package/dist/multishot/index.js +1 -1
  34. package/dist/openapi.json +1 -1
  35. package/dist/pipelines/index.js +68 -4
  36. package/dist/pipelines/index.js.map +1 -1
  37. package/dist/prm/index.js +1 -1
  38. package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
  39. package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
  40. package/dist/reporting.d.ts +2 -3
  41. package/dist/reporting.js +5 -9
  42. package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
  43. package/dist/rl.d.ts +103 -221
  44. package/dist/rl.js +45 -200
  45. package/dist/rl.js.map +1 -1
  46. package/dist/{run-campaign-KEJK5KFT.js → run-campaign-GNDO66B4.js} +3 -3
  47. package/dist/sequential-DdV5ShjT.d.ts +561 -0
  48. package/dist/telemetry/file.js +1 -1
  49. package/dist/telemetry/index.js +1 -1
  50. package/dist/traces.d.ts +3 -2
  51. package/dist/traces.js +6 -6
  52. package/dist/types-BLbRTxoc.d.ts +367 -0
  53. package/dist/wire/index.d.ts +1 -1
  54. package/dist/wire/index.js +1 -1
  55. package/package.json +26 -17
  56. package/dist/chunk-5U2DOJU4.js.map +0 -1
  57. package/dist/chunk-AU2JLNSZ.js.map +0 -1
  58. package/dist/chunk-DMW5VENN.js +0 -1412
  59. package/dist/chunk-DMW5VENN.js.map +0 -1
  60. package/dist/chunk-EGIPWXHL.js.map +0 -1
  61. package/dist/chunk-MAZ26DC7.js +0 -99
  62. package/dist/chunk-MAZ26DC7.js.map +0 -1
  63. package/dist/chunk-NKLGKF2Q.js.map +0 -1
  64. package/dist/chunk-PZ5AY32C.js +0 -10
  65. package/dist/chunk-YNMCYUWT.js.map +0 -1
  66. package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
  67. package/dist/optimization.d.ts +0 -11
  68. package/dist/optimization.js +0 -71
  69. package/dist/run-campaign-KEJK5KFT.js.map +0 -1
  70. package/dist/sequential-5iSVfzl2.d.ts +0 -139
  71. package/dist/summary-report-DuZXOk7K.d.ts +0 -917
  72. /package/dist/{chunk-6QDKWHLS.js.map → chunk-MHQPVHXU.js.map} +0 -0
  73. /package/dist/{chunk-PZ5AY32C.js.map → chunk-NSBPE2FW.js.map} +0 -0
  74. /package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
  75. /package/dist/{optimization.js.map → run-campaign-GNDO66B4.js.map} +0 -0
package/dist/index.d.ts CHANGED
@@ -1,36 +1,33 @@
1
- export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-CmLJk3IG.js';
2
- import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
3
- export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-BGY6bHRh.js';
1
+ export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-ojEWkMfJ.js';
2
+ import { R as RunRecord } from './run-record-BGY6bHRh.js';
3
+ export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, a as RunSplitTag, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-BGY6bHRh.js';
4
4
  import { AxAIService, AxFunction } from '@ax-llm/ax';
5
- import { S as Severity, M as MultiLayerVerifier, a as VerifyOptions, L as Layer, b as LayerResult, c as VerifyContext } from './multi-layer-verifier-BNi4-8lR.js';
6
- export { F as Finding, d as LayerStatus, V as VerificationReport, g as gradeSemanticStatus } from './multi-layer-verifier-BNi4-8lR.js';
5
+ import { d as Severity, M as MultiLayerVerifier, e as VerifyOptions, L as Layer, f as LayerResult, g as VerifyContext } from './researcher-CoJMs2Iz.js';
6
+ export { C as CallbackResearcher, h as CallbackResearcherOptions, i as CampaignFactoryParams, j as CampaignIntegrityPolicy, k as CampaignRunContext, l as CampaignRunOutcome, m as CampaignRunner, n as CampaignScenario, o as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, p as FailedRun, F as FailureMode, q as Finding, s as LayerStatus, N as NoopResearcher, R as Researcher, S as SteeringChange, V as VerificationReport, t as gradeSemanticStatus, r as runEvalCampaign } from './researcher-CoJMs2Iz.js';
7
7
  import { R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, g as BudgetSpec, h as RunFilter, L as LlmSpan } from './store-Db2Bv8Cf.js';
8
8
  export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, M as Message, e as RetrievalSpan, m as RunLayer, n as RunStatus, f as SandboxSpan, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
9
- import { L as LlmClientOptions, m as LlmCallRequest, n as LlmCallResult } from './researcher-DeZ_EArp.js';
10
- export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, o as LlmCallError, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as backoffMs, x as callLlm, y as callLlmJson, z as isTransientLlmError, A as probeLlm, r as runEvalCampaign, B as stripFencedJson } from './researcher-DeZ_EArp.js';
9
+ import { L as LlmClientOptions, b as LlmCallRequest, c as LlmCallResult } from './llm-client-BXVRUZyX.js';
10
+ export { d as LlmCallError, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-BXVRUZyX.js';
11
11
  import { TraceAnalysisStore, AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
12
12
  export { AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, ExportableSpan, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
13
- import { t as JudgeInput, u as JudgeFn, v as BenchmarkRunnerConfig, S as Scenario, x as BenchmarkReport, y as ProductClientConfig, C as CheckResult, T as TestResult, z as PersonaConfig, D as DriverResult, A as DriverState, E as CollectedArtifacts, F as ScenarioResult, i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard, G as TurnMetrics, H as ScenarioFile, I as CompletionCriterion } from './release-report-Di84bXD7.js';
14
- export { K as ArtifactCheck, L as ArtifactResult, B as BootstrapOptions, a as BootstrapResult, M as CorpusAgreementOptions, N as CorpusAgreementPerDimension, O as CorpusAgreementReport, Q as CorpusScoreRecord, U as EvalResult, W as FeedbackPattern, X as JudgeConfig, J as JudgeReplayGateArgs, Y as JudgeRubric, Z as JudgeScore, P as PairedBootstrapOptions, b as PairedBootstrapResult, _ as PersonaRigor, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, $ as RouteMap, a0 as RubricDimension, a1 as Turn, a2 as TurnResult, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, a3 as bonferroni, n as bootstrapCi, a4 as cohensD, a5 as confidenceInterval, a6 as corpusInterRaterAgreement, a7 as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, a8 as interRaterReliability, p as judgeReplayGate, a9 as mannWhitneyU, aa as normalizeScores, q as pairedBootstrap, ab as pairedMde, ac as pairedTTest, ad as partialCredit, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, ae as requiredSampleSize, af as weightedMean, w as wilcoxonSignedRank } from './release-report-Di84bXD7.js';
13
+ import { s as JudgeInput, t as JudgeFn, u as BenchmarkRunnerConfig, S as Scenario, v as BenchmarkReport, x as ProductClientConfig, C as CheckResult, T as TestResult, y as PersonaConfig, D as DriverResult, z as DriverState, A as CollectedArtifacts, E as ScenarioResult, i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard, F as TurnMetrics, G as ScenarioFile, H as CompletionCriterion } from './release-report-BtpgWRI0.js';
14
+ export { I as ActionableSideInfo, K as ArtifactCheck, L as ArtifactResult, M as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, N as CorpusAgreementOptions, O as CorpusAgreementPerDimension, Q as CorpusAgreementReport, U as CorpusScoreRecord, W as EvalResult, X as FeedbackPattern, Y as JudgeConfig, J as JudgeReplayGateArgs, Z as JudgeRubric, _ as JudgeScore, P as PairedBootstrapOptions, b as PairedBootstrapResult, $ as PersonaRigor, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, a0 as RouteMap, a1 as RubricDimension, a2 as Turn, a3 as TurnResult, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, a4 as bonferroni, n as bootstrapCi, a5 as cohensD, a6 as confidenceInterval, a7 as corpusInterRaterAgreement, a8 as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, a9 as interRaterReliability, p as judgeReplayGate, aa as mannWhitneyU, ab as normalizeScores, q as pairedBootstrap, ac as pairedMde, ad as pairedTTest, ae as partialCredit, r as renderReleaseReport, af as requiredSampleSize, ag as weightedMean, w as wilcoxonSignedRank } from './release-report-BtpgWRI0.js';
15
15
  import { TCloud } from '@tangle-network/tcloud';
16
16
  import { z } from 'zod';
17
- import { C as ControlEvalResult } from './control-runtime-BZ_lVLYW.js';
18
- export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
17
+ export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
19
18
  import { A as AgentEvalError } from './errors-mje_cKOs.js';
20
19
  export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-mje_cKOs.js';
21
- import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-Dvy-bt7x.js';
22
- export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-Dvy-bt7x.js';
23
- import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-DuZXOk7K.js';
24
- export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-DuZXOk7K.js';
20
+ import { a as FeedbackLabel, F as FeedbackTrajectoryStore, b as FeedbackTrajectory } from './feedback-trajectory-BSxqEpu7.js';
21
+ export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-BSxqEpu7.js';
25
22
  export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
26
- import { a as FailureCluster } from './failure-cluster-Cw65_5FY.js';
27
- export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-Cw65_5FY.js';
28
23
  import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-B2kWEdh9.js';
29
24
  export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-B2kWEdh9.js';
30
25
  import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
31
26
  export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DP_cSSiw.js';
32
- export { F as FileSystemRawProviderSink, c as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, d as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, e as RawProviderDirection, f as RawProviderEvent, R as RawProviderSink, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-DYR5gWlb.js';
27
+ export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CTDhR1Sg.js';
33
28
  export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
29
+ export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, c as RawProviderDirection, d as RawProviderEvent, R as RawProviderSink, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
30
+ export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-Cw65_5FY.js';
34
31
  import { a as BaselineReport } from './baseline-4R5deP0N.js';
35
32
  export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-4R5deP0N.js';
36
33
  import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
@@ -42,7 +39,7 @@ export { D as DEFAULT_RED_TEAM_CORPUS, R as RedTeamCase, a as RedTeamCategory, b
42
39
  import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
43
40
  export { EuRiskClass, GovernanceContext, GovernanceFinding, GovernanceReport, UseCaseSignals, classifyEuAiRisk, euAiActReport, nistAiRmfReport, renderMarkdown, soc2Report, summarize } from './governance/index.js';
44
41
  export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-0pu_fBwZ.js';
45
- export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
42
+ export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, w as GateDecision, x as GateEvidence, H as HeldOutGate, y as HeldOutGateConfig, z as HeldOutGateRejectionCode, I as InterimReleaseConfidence, c as InterimReleaseConfidenceInput, P as PairedEvalueOptions, d as PairedEvalueSequence, e as PairedEvalueStep, f as ParetoFigureSpec, g as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, h as ResearchReport, i as ResearchReportCandidate, j as ResearchReportDecision, k as ResearchReportMethodology, l as ResearchReportOptions, m as ResearchReportRecommendation, S as SequentialDecision, n as SummaryTable, o as SummaryTableOptions, p as SummaryTableRow, q as evaluateInterimReleaseConfidence, r as gainHistogram, s as pairedEvalueSequence, t as paretoChart, u as researchReport, v as summaryTable } from './sequential-DdV5ShjT.js';
46
43
  import './outcome-store-D6KWmYvj.js';
47
44
 
48
45
  interface RunScore {
@@ -1565,34 +1562,6 @@ interface ExecutorConfig {
1565
1562
  */
1566
1563
  declare function executeScenario(tc: TCloud, scenario: Scenario, config: ExecutorConfig): Promise<ScenarioResult>;
1567
1564
 
1568
- type IntegrationGateSurface = 'integration-manifest' | 'integration-connection' | 'integration-scope' | 'integration-approval' | 'integration-auth' | 'integration-provider' | 'integration-policy';
1569
- interface IntegrationManifestGateInput {
1570
- connectorId: string;
1571
- actionId?: string;
1572
- valid: boolean;
1573
- missingConnections?: string[];
1574
- missingScopes?: string[];
1575
- requiredScopes?: string[];
1576
- approvalRequired?: boolean;
1577
- status?: 'ready' | 'blocked' | 'approval_required';
1578
- reason?: string;
1579
- metadata?: Record<string, unknown>;
1580
- }
1581
- interface IntegrationInvokeFailureInput {
1582
- connectorId: string;
1583
- actionId: string;
1584
- code: 'auth_expired' | 'scope_denied' | 'approval_required' | 'unsafe_write_denied' | 'provider_failure' | 'manifest_invalid';
1585
- message: string;
1586
- status?: number;
1587
- retryable?: boolean;
1588
- metadata?: Record<string, unknown>;
1589
- }
1590
- declare function integrationManifestValidatedPayload(input: IntegrationManifestGateInput): Record<string, unknown>;
1591
- declare function integrationManifestResolvedPayload(input: IntegrationManifestGateInput): Record<string, unknown>;
1592
- declare function integrationInvokeFailedPayload(input: IntegrationInvokeFailureInput): Record<string, unknown>;
1593
- declare function integrationGateEvals(input: IntegrationManifestGateInput): ControlEvalResult[];
1594
- declare function integrationAsi(input: IntegrationManifestGateInput | IntegrationInvokeFailureInput): ActionableSideInfo;
1595
-
1596
1565
  /**
1597
1566
  * Backend-integrity guard: distinguish "agent failed" from "eval ran against
1598
1567
  * a stub / unconfigured backend." Without this guard a canonical eval can
@@ -1889,180 +1858,6 @@ declare function scorePrReviewComments(auditCase: PrReviewAuditCase, comments: P
1889
1858
  declare function summarizePrReviewBenchmark(scores: PrReviewScore[]): PrReviewBenchmarkSummary[];
1890
1859
  declare function aggregatePrReviewScore(dimensions: Pick<PrReviewScore, 'recall' | 'precision' | 'actionability' | 'severityCalibration' | 'lowNoise'>, weights?: Partial<PrReviewScoreWeights>): number;
1891
1860
 
1892
- /**
1893
- * ProductionLoop — the substrate that closes eval → prod → eval.
1894
- *
1895
- * Static prompts decay. Yesterday's regulation flips today; yesterday's
1896
- * tool quirk becomes today's incident. A production agent that ships a
1897
- * static prompt and never re-trains is on a clock.
1898
- *
1899
- * `runProductionLoop` is the orchestration layer over the eval substrate:
1900
- *
1901
- * 1. Ingest production traces + user feedback (via the wire HTTP
1902
- * ingestion endpoints, or directly through any `TraceStore` and
1903
- * `FeedbackTrajectoryStore` implementation).
1904
- * 2. Cluster the failures (`failureClusterView`) and prioritize by
1905
- * size × severity.
1906
- * 3. If any cluster crosses the consumer's threshold, run a
1907
- * `runMultiShotOptimization` round seeded by the current production
1908
- * prompt against holdout-shape scenarios derived from the offending
1909
- * cluster.
1910
- * 4. Gate the promoted prompt with `evaluateReleaseConfidence`. Fail
1911
- * closed.
1912
- * 5. If the gate passes and an `AutoPrClient` is wired, open a PR with
1913
- * the new prompt. Otherwise return the proposed change.
1914
- *
1915
- * One call = one cycle. Cron / GitHub Actions are the caller's job. The
1916
- * primitive is idempotent + replayable: re-running with the same
1917
- * `runId` will produce the same plan.
1918
- *
1919
- * @experimental — surface may evolve as product agents wire it in.
1920
- */
1921
-
1922
- interface FailureClusterConfig {
1923
- /** Minimum runs in a cluster before it triggers an evolve round. Default 5. */
1924
- minClusterSize?: number;
1925
- /**
1926
- * Severity threshold. A cluster is "actionable" when its size
1927
- * normalized by total runs exceeds this. Default 0.05 (5% of all runs).
1928
- */
1929
- minSeverityRatio?: number;
1930
- /**
1931
- * Maximum number of clusters to react to in one cycle. Acting on too
1932
- * many at once obscures attribution. Default 1 — the worst cluster.
1933
- */
1934
- maxClustersPerCycle?: number;
1935
- }
1936
- interface ProductionEvolveConfig<P = string> {
1937
- /** How to run a candidate prompt against a scenario. */
1938
- runner: MultiShotRunner<P>;
1939
- /** How to score the trajectory. Usually a calibrated judge. */
1940
- scorer: MultiShotScorer<P>;
1941
- /** How to mutate. Addendum-style mutators (append vs. rewrite) work best. */
1942
- mutator: MultiShotMutateAdapter<P>;
1943
- /** The current production prompt. Acts as the baseline + seed. */
1944
- baselinePrompt: P;
1945
- /** Stable id for the baseline variant. Default `'baseline'`. */
1946
- baselineId?: string;
1947
- /** Scenarios resembling production load. Used as the holdout split. */
1948
- holdoutScenarios: Scenario[];
1949
- /** Scenarios used during search. Default: derived from `holdoutScenarios` via deterministic split. */
1950
- searchScenarios?: Scenario[];
1951
- /** Gate config for the held-out promotion check. */
1952
- gate: HeldOutGateConfig;
1953
- /** Reps per (variant × scenario) cell. Default 3. */
1954
- reps?: number;
1955
- /** Number of mutation generations. Default 3. */
1956
- generations?: number;
1957
- /** Population size per generation. Default 4. */
1958
- populationSize?: number;
1959
- /** Concurrent score() calls. Default 1. */
1960
- scoreConcurrency?: number;
1961
- /**
1962
- * Optional bridge from a scored trial into a paper-grade RunRecord.
1963
- * If omitted, the loop synthesises a minimal record sufficient for
1964
- * `HeldOutGate` and `evaluateReleaseConfidence`.
1965
- */
1966
- toRunRecord?: (input: {
1967
- variant: EvolvableVariant<P>;
1968
- scenarioId: string;
1969
- rep: number;
1970
- split: RunSplitTag;
1971
- seed: number;
1972
- trial: MultiShotTrialResult;
1973
- }) => RunRecord;
1974
- }
1975
- interface ProductionShipConfig {
1976
- repo: RepoRef;
1977
- /** Branch name prefix. Final branch = `${branchPrefix}/${runId}`. */
1978
- branchPrefix: string;
1979
- /** Path (repo-relative) of the file holding the production prompt. */
1980
- promptFilePath: string;
1981
- /** Base branch for the PR. Default `'main'`. */
1982
- baseBranch?: string;
1983
- reviewers?: string[];
1984
- labels?: string[];
1985
- /** Required: the auto-PR transport. Use `ghCliClient()` or `httpGithubClient()`. */
1986
- client: AutoPrClient;
1987
- /** Skip the actual push + PR call — for sanity-checking the plan. Default false. */
1988
- dryRun?: boolean;
1989
- /** Render PR body from the loop's findings. Optional override. */
1990
- renderBody?: (ctx: ProductionLoopRenderContext) => string;
1991
- /** Render the file contents from the new prompt. Default: serialize as the file. */
1992
- renderPromptFile?: (newPrompt: string, oldFileContents: string | null) => string;
1993
- /** Read the current prompt file contents for diff context. Optional. */
1994
- readCurrentPromptFile?: () => Promise<string | null>;
1995
- }
1996
- interface ProductionLoopCronConfig {
1997
- cadence: 'weekly' | 'daily' | 'hourly';
1998
- /** Optional jitter (seconds) the consumer's scheduler should add. Surface-only. */
1999
- jitterSec?: number;
2000
- }
2001
- interface RunProductionLoopOptions<P = string> {
2002
- /** Stable id; deterministic outputs when reused. */
2003
- runId: string;
2004
- /** Human label — surfaces in PR titles and reports. */
2005
- target: string;
2006
- traceStore: TraceStore;
2007
- feedbackStore: FeedbackTrajectoryStore;
2008
- cluster: FailureClusterConfig;
2009
- evolve: ProductionEvolveConfig<P>;
2010
- /** When omitted, the loop returns the proposed prompt without opening a PR. */
2011
- ship?: ProductionShipConfig;
2012
- /** Surface-only — encodes scheduler expectations into the artifact. */
2013
- cron?: ProductionLoopCronConfig;
2014
- /** Release confidence thresholds. Default: library defaults. */
2015
- releaseThresholds?: ReleaseConfidenceThresholds;
2016
- /** Now() seam for reproducibility in tests. */
2017
- now?: () => Date;
2018
- }
2019
- type ProductionLoopDecision = 'no_actionable_failures' | 'evolve_yielded_no_improvement' | 'gate_failed' | 'proposed_change' | 'pr_opened';
2020
- interface ProductionLoopRenderContext {
2021
- runId: string;
2022
- target: string;
2023
- decision: ProductionLoopDecision;
2024
- /** Clusters seen in production this cycle, sorted by severity. */
2025
- clusters: FailureCluster[];
2026
- /** The cluster the loop acted on (if any). */
2027
- actedOnCluster: FailureCluster | null;
2028
- /** Production runs observed this cycle. */
2029
- observedRunCount: number;
2030
- /** Feedback trajectories observed this cycle. */
2031
- observedFeedbackCount: number;
2032
- /** Evolve result (if evolve ran). */
2033
- evolution: MultiShotOptimizationResult<unknown> | null;
2034
- /** Release gate verdict (if evolve ran). */
2035
- release: ReleaseConfidenceScorecard | null;
2036
- /** Held-out gate decision (if a candidate was paired against the baseline). */
2037
- gate: GateDecision | null;
2038
- /** The baseline (current production) prompt as a string. */
2039
- baselinePromptString: string;
2040
- /** The proposed new prompt as a string. Empty if no change was proposed. */
2041
- promotedPromptString: string;
2042
- }
2043
- interface ProductionLoopResult {
2044
- runId: string;
2045
- target: string;
2046
- decision: ProductionLoopDecision;
2047
- startedAt: string;
2048
- finishedAt: string;
2049
- observedRunCount: number;
2050
- observedFeedbackCount: number;
2051
- clusters: FailureCluster[];
2052
- actedOnCluster: FailureCluster | null;
2053
- evolution: MultiShotOptimizationResult<unknown> | null;
2054
- release: ReleaseConfidenceScorecard | null;
2055
- gate: GateDecision | null;
2056
- /** Baseline prompt as it entered the cycle. */
2057
- baselinePrompt: unknown;
2058
- /** Promoted prompt — equals baseline when no change is proposed. */
2059
- promotedPrompt: unknown;
2060
- /** PR artifact when `ship` was wired and gate passed. */
2061
- pullRequest: ProposeAutomatedPullRequestResult | null;
2062
- cron: ProductionLoopCronConfig | null;
2063
- }
2064
- declare function runProductionLoop<P = string>(opts: RunProductionLoopOptions<P>): Promise<ProductionLoopResult>;
2065
-
2066
1861
  /**
2067
1862
  * ScenarioRegistry — manages scenario discovery and filtering.
2068
1863
  *
@@ -2727,6 +2522,86 @@ declare class FileSystemExperimentStore implements ExperimentStore {
2727
2522
  private load;
2728
2523
  }
2729
2524
 
2525
+ /**
2526
+ * Pareto frontier — multi-objective optimization over candidate runs.
2527
+ *
2528
+ * Lifted from ADC pareto.ts and blueprint-agent frontier.ts. When you're
2529
+ * trading off (cost, latency, quality) or (passRate, tokenBudget,
2530
+ * ttfb), you rarely have a single "winner" — you have a set of
2531
+ * non-dominated candidates. This module exposes:
2532
+ *
2533
+ * - `paretoFrontier`: filter a set of candidates to the non-dominated ones
2534
+ * - `dominates`: does A dominate B across all objectives?
2535
+ *
2536
+ * Each objective is declared with a direction: 'maximize' (higher=better)
2537
+ * or 'minimize' (lower=better). Candidates are any object; pass an
2538
+ * `objective(candidate)` accessor.
2539
+ */
2540
+ type Direction = 'maximize' | 'minimize';
2541
+ interface Objective<T> {
2542
+ /** Stable label used in reports. */
2543
+ name: string;
2544
+ direction: Direction;
2545
+ value: (candidate: T) => number;
2546
+ }
2547
+ interface ParetoResult<T> {
2548
+ frontier: T[];
2549
+ dominated: T[];
2550
+ /** Dominance map: each entry pairs a `dominator` with the candidates it dominates (`dominated`). */
2551
+ dominanceMap: Array<{
2552
+ dominator: T;
2553
+ dominated: T[];
2554
+ }>;
2555
+ }
2556
+ /** Does candidate A Pareto-dominate B — no worse on every objective and strictly better on at least one? */
2557
+ declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
2558
+ /**
2559
+ * Compute the non-dominated frontier. Candidates with NaN/Infinity on any
2560
+ * objective are excluded (can't rank them). A candidate enters the frontier
2561
+ * iff no other candidate dominates it.
2562
+ */
2563
+ declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
2564
+ /**
2565
+ * Weighted-sum scalarisation. Use as a tie-break / single-winner selector
2566
+ * when callers don't want to consume a frontier. Each objective contributes
2567
+ * its normalised value (0..1 via min-max across the candidate pool) times
2568
+ * its weight; missing weights default to 1/N.
2569
+ *
2570
+ * Direction is honoured automatically — `minimize` axes have their values
2571
+ * inverted before scaling so "higher scalar = better" always holds.
2572
+ */
2573
+ declare function scalarScore<T>(candidates: T[], objectives: Objective<T>[], options?: {
2574
+ weights?: Partial<Record<string, number>>;
2575
+ }): Array<{
2576
+ candidate: T;
2577
+ score: number;
2578
+ }>;
2579
+ /**
2580
+ * NSGA-II crowding distance — secondary sort for ties on the frontier.
2581
+ *
2582
+ * When the Pareto front collapses to a single point (or many candidates tie
2583
+ * on dominance), naive selection picks arbitrarily and the population
2584
+ * degenerates over generations. NSGA-II preserves diversity by preferring
2585
+ * candidates with more empty space around them on the frontier.
2586
+ *
2587
+ * Returns an array of `{ candidate, distance }` in the SAME order as the
2588
+ * input. Higher distance = more isolated = should be preferred when
2589
+ * preserving diversity.
2590
+ */
2591
+ declare function crowdingDistance<T>(candidates: T[], objectives: Objective<T>[]): Array<{
2592
+ candidate: T;
2593
+ distance: number;
2594
+ }>;
2595
+ /**
2596
+ * Pareto frontier with tie-break by crowding distance — the canonical
2597
+ * NSGA-II selection step. Returns the frontier sorted by descending crowding
2598
+ * distance so callers can `.slice(0, k)` to pick K diverse winners.
2599
+ */
2600
+ declare function paretoFrontierWithCrowding<T>(candidates: T[], objectives: Objective<T>[]): Array<{
2601
+ candidate: T;
2602
+ distance: number;
2603
+ }>;
2604
+
2730
2605
  interface SteeringRolePrompt {
2731
2606
  system?: string;
2732
2607
  append?: string;
@@ -5567,378 +5442,6 @@ interface CanaryOptions {
5567
5442
  */
5568
5443
  declare function runCanaries(runs: RunRecord[], opts?: CanaryOptions): CanaryReport;
5569
5444
 
5570
- /**
5571
- * evolution-telemetry — durable JSONL/JSON sinks for the evolution loop.
5572
- *
5573
- * `runPromptEvolution` exposes generation-level events but doesn't persist
5574
- * the per-mutation, per-trial, lineage, or cost breakdown. These four
5575
- * sinks fill that gap so a finished autoresearch run leaves a forensically
5576
- * complete trail under one directory:
5577
- *
5578
- * - `mutations.jsonl` — every mutate attempt (success + failure) with
5579
- * latency, agent steps, diff stats, cost.
5580
- * - `trials.jsonl` — every TrialResult including cache hits, with
5581
- * provenance (channel, runtime slot, generation).
5582
- * - `lineage.json` — variant tree {id → {parent, generation, kind, …}},
5583
- * incremental upsert.
5584
- * - `cost-ledger.json` — running $ totals per source (mutator-prompt,
5585
- * mutator-code, scorer-prompt, scorer-code) plus pool utilisation.
5586
- *
5587
- * All writes are mutex-serialised. The append-only sinks (mutations,
5588
- * trials) survive a hard kill; the snapshot sinks (lineage, cost-ledger)
5589
- * rewrite on every update so the latest state is always on disk.
5590
- *
5591
- * Generic over a payload P so any consumer of `runPromptEvolution<P>` can
5592
- * record lineage without leaking domain types.
5593
- */
5594
-
5595
- type MutationChannel = 'prompt' | 'code';
5596
- interface MutationAttempt {
5597
- ts: number;
5598
- channel: MutationChannel;
5599
- generation: number;
5600
- parentId: string;
5601
- /** Successful child variant id, or null if the attempt failed. */
5602
- childId: string | null;
5603
- ok: boolean;
5604
- /**
5605
- * One of: 'parse_failure' | 'typecheck_failure' | 'no_changes' |
5606
- * 'agent_error' | 'commit_failure' | 'no_api_key' | 'no_valid_proposals'
5607
- * | 'reproduce_parent_failed' | 'branch_failed' | 'other'.
5608
- * Free-form to allow consumer-specific reasons.
5609
- */
5610
- failureReason?: string;
5611
- /** Free-form description of what the agent said it did. */
5612
- description?: string;
5613
- /** Latency of the LLM call (ms). */
5614
- latencyMs: number;
5615
- /** Bytes of generated diff (code channel only). */
5616
- diffBytes?: number;
5617
- /** Files touched (code channel only). */
5618
- filesTouched?: number;
5619
- /** Steps the agent ran (tool calls). */
5620
- agentSteps?: number;
5621
- /** Approx $ spent on this mutation (LLM tokens). */
5622
- costUsd?: number;
5623
- /** Runtime slot used (code channel only). */
5624
- runtimeSandboxId?: string;
5625
- }
5626
- declare class MutationTelemetry {
5627
- private readonly appender;
5628
- constructor(path: string);
5629
- record(attempt: MutationAttempt): Promise<void>;
5630
- }
5631
- interface TrialAttempt {
5632
- ts: number;
5633
- channel: MutationChannel;
5634
- generation: number;
5635
- variantId: string;
5636
- scenarioId: string;
5637
- rep: number;
5638
- ok: boolean;
5639
- score: number;
5640
- costUsd: number;
5641
- durationMs: number;
5642
- cached: boolean;
5643
- runtimeSandboxId?: string;
5644
- error?: string;
5645
- metrics?: Record<string, number>;
5646
- }
5647
- declare class TrialTelemetry {
5648
- private readonly appender;
5649
- constructor(path: string);
5650
- record(attempt: TrialAttempt): Promise<void>;
5651
- }
5652
- type LineageKind = 'seed' | 'prompt' | 'code';
5653
- interface LineageNode {
5654
- id: string;
5655
- parentId: string | null;
5656
- generation: number;
5657
- kind: LineageKind;
5658
- rationale?: string;
5659
- /** Filled when scoring lands. */
5660
- meanScore?: number;
5661
- promotedToFrontier?: boolean;
5662
- /**
5663
- * The variant payload (e.g. evolved persona text, code mutation diff).
5664
- * Persisted so a winning variant can be reproduced after a run completes
5665
- * without re-running the optimizer. Optional — pass `omitPayload: true` to
5666
- * `upsertVariant` for cases where the payload is too large to log.
5667
- */
5668
- payload?: unknown;
5669
- }
5670
- /**
5671
- * `kindOf` decides whether a variant is a seed (no parent), code mutation,
5672
- * or prompt mutation. Default looks at `variant.payload.codeMutation` and
5673
- * accepts any payload that exposes that field; override by passing your own.
5674
- */
5675
- type LineageKindResolver<P> = (variant: EvolvableVariant<P>) => LineageKind;
5676
- /**
5677
- * Persistence shape:
5678
- *
5679
- * `<path>` — JSONL of upserts (event log). Each line is a
5680
- * partial node; replay folds them into the current
5681
- * state. Append-only, so cost is O(1) per upsert
5682
- * instead of the previous O(n²) full rewrite.
5683
- * `<path>.snapshot` — Optional consolidated snapshot, written on
5684
- * demand via `compact()` (e.g. at end of run).
5685
- * Read by external tools that don't want to
5686
- * replay the log.
5687
- *
5688
- * Loaded at construction time: if `<path>.snapshot` exists, parse it
5689
- * first; then replay any newer log lines on top. Falls back to log-only
5690
- * when no snapshot is present.
5691
- */
5692
- declare class LineageRecorder<P = unknown> {
5693
- private readonly path;
5694
- private readonly snapshotPath;
5695
- private readonly mutex;
5696
- private readonly nodes;
5697
- private readonly kindOf;
5698
- constructor(path: string, kindOf?: LineageKindResolver<P>);
5699
- upsert(node: LineageNode): Promise<void>;
5700
- upsertVariant(variant: EvolvableVariant<P>, opts?: {
5701
- omitPayload?: boolean;
5702
- }): Promise<void>;
5703
- snapshot(): LineageNode[];
5704
- /**
5705
- * Write the current consolidated state to `<path>.snapshot` so external
5706
- * tools can read it without replaying the event log. Idempotent.
5707
- */
5708
- compact(): Promise<void>;
5709
- }
5710
- /** Per-generation cost rollup. Same shape as the totals, scoped to one gen. */
5711
- interface CostLedgerGeneration {
5712
- generation: number;
5713
- mutatorPromptUsd: number;
5714
- mutatorCodeUsd: number;
5715
- scorerPromptUsd: number;
5716
- scorerCodeUsd: number;
5717
- trialsCounted: number;
5718
- cachedTrials: number;
5719
- }
5720
- interface CostLedgerSnapshot {
5721
- totalUsd: number;
5722
- mutatorPromptUsd: number;
5723
- mutatorCodeUsd: number;
5724
- scorerPromptUsd: number;
5725
- scorerCodeUsd: number;
5726
- trialsCounted: number;
5727
- cachedTrials: number;
5728
- poolBusyMs?: number;
5729
- poolUtilizationPct?: number;
5730
- /** Per-generation breakdown, sorted ascending. Empty when generations
5731
- * weren't supplied to addMutation/addTrial. */
5732
- byGeneration: CostLedgerGeneration[];
5733
- }
5734
- declare class CostLedger {
5735
- private totals;
5736
- private readonly path;
5737
- private readonly mutex;
5738
- constructor(path: string);
5739
- private genBucket;
5740
- addMutation(channel: MutationChannel, usd: number, opts?: {
5741
- generation?: number;
5742
- }): Promise<void>;
5743
- addTrial(channel: MutationChannel, usd: number, cached: boolean, opts?: {
5744
- generation?: number;
5745
- }): Promise<void>;
5746
- setPoolUtilization(busyMs: number, totalMs: number): Promise<void>;
5747
- snapshot(): CostLedgerSnapshot;
5748
- private persist;
5749
- }
5750
-
5751
- /**
5752
- * SandboxPool — bounded checkout/release pool for mutation slots.
5753
- *
5754
- * The composite-mutator's `code` channel needs an isolated workspace per
5755
- * mutation attempt: a git worktree, a sandbox container, a tmpdir clone —
5756
- * whatever the consumer's runtime is. Without a pool, every consumer
5757
- * re-implements the same machinery (mint N slots, check one out per
5758
- * mutation, reset before reuse, drain at the end, track utilisation for
5759
- * the cost ledger). This primitive ships that machinery so consumers
5760
- * supply only a `SlotFactory`.
5761
- *
5762
- * Generic over a slot resource `T` so the same pool serves git worktrees
5763
- * (T = path), Tangle sandboxes (T = SandboxBox), or anything else with
5764
- * the create/reset/destroy lifecycle.
5765
- *
5766
- * Concurrency: FIFO via the shared `Mutex` primitive. Each `checkout()`
5767
- * either takes an idle slot or queues until one is released. Lifecycle
5768
- * is single-process — multi-process pools need external coordination
5769
- * (file locks, etc.) and are deliberately out of scope.
5770
- */
5771
- interface PoolSlot<T> {
5772
- /** Stable id assigned at slot creation. Use for telemetry / lineage. */
5773
- readonly id: string;
5774
- /** Consumer-defined resource. */
5775
- readonly resource: T;
5776
- }
5777
- interface SlotFactory<T> {
5778
- /** Build a new slot. Called lazily as the pool grows up to `size`. */
5779
- create(slotId: string): Promise<T>;
5780
- /**
5781
- * Reset a slot to a clean state before reuse. Called BEFORE every
5782
- * checkout returns it (including the first — so the factory's
5783
- * `create` can leave the slot dirty and let `reset` normalise).
5784
- * Optional; default is a no-op.
5785
- */
5786
- reset?(slot: PoolSlot<T>): Promise<void>;
5787
- /** Tear the slot down. Called by `drain()`. */
5788
- destroy(slot: PoolSlot<T>): Promise<void>;
5789
- }
5790
- interface SandboxPool<T> {
5791
- /**
5792
- * Take a slot. If all slots are busy, the promise resolves when one
5793
- * is released. Always pair with the returned `release` (or wrap with
5794
- * `withSlot`).
5795
- */
5796
- checkout(): Promise<{
5797
- slot: PoolSlot<T>;
5798
- release: () => void;
5799
- }>;
5800
- /**
5801
- * Run `fn` with a checked-out slot, releasing on completion or throw.
5802
- * The convenience wrapper most callers should use.
5803
- */
5804
- withSlot<R>(fn: (slot: PoolSlot<T>) => Promise<R>): Promise<R>;
5805
- /** Destroy every slot. Idempotent. */
5806
- drain(): Promise<void>;
5807
- /** How many slots have been minted (≤ `size`). */
5808
- poolSize(): number;
5809
- /** How many checkouts are currently outstanding. */
5810
- activeCheckouts(): number;
5811
- /** Snapshot of busy/total durations for the cost ledger. */
5812
- utilization(): {
5813
- busyMs: number;
5814
- totalMs: number;
5815
- checkouts: number;
5816
- };
5817
- }
5818
- interface CreateSandboxPoolOpts<T> {
5819
- /** Maximum concurrent slots. Slots are minted on first need, not eagerly. */
5820
- size: number;
5821
- factory: SlotFactory<T>;
5822
- }
5823
- declare function createSandboxPool<T>(opts: CreateSandboxPoolOpts<T>): SandboxPool<T>;
5824
-
5825
- /**
5826
- * createSandboxCodeMutator — `MutateAdapter<P>` that runs a coding agent
5827
- * inside a SandboxPool slot to produce code-channel variants.
5828
- *
5829
- * Composable shape (matches `reflective-mutation.ts`'s separation of
5830
- * "build the prompt" from "run the model"):
5831
- *
5832
- * pool → where mutations execute (any SlotFactory)
5833
- * runner → consumer-supplied: invokes the coding agent in a slot,
5834
- * returns the diff/branch/whatever as `CodeMutationOutcome`s
5835
- * toVariantPayload → maps outcome → P (consumer encodes the diff their
5836
- * way — patch string, branch ref, file map, etc)
5837
- *
5838
- * What this primitive owns (so consumers don't reinvent it every time):
5839
- * - Pool checkout / release with reset between attempts
5840
- * - Per-attempt mutex so a single slot can't be invoked concurrently
5841
- * - Telemetry write-through (mutations.jsonl, lineage.json,
5842
- * cost-ledger.json) when sinks are passed
5843
- * - Stable child-id generation
5844
- * - Failure capture (every attempt produces either a successful child
5845
- * or a recorded failure with reason — never a silent drop)
5846
- *
5847
- * Consumers stay focused on the actual interesting parts: building the
5848
- * agent prompt, running the agent, capturing the diff.
5849
- */
5850
-
5851
- /**
5852
- * Result of one coding-agent invocation. The runner produces 1..N of
5853
- * these per `runner` call (a single agent session can sometimes
5854
- * produce multiple sibling diffs cheaply — runner decides).
5855
- */
5856
- interface CodeMutationOutcome {
5857
- ok: boolean;
5858
- /** Stable id for the child variant if `ok`. The mutator falls back to
5859
- * a generated id when omitted. */
5860
- childId?: string;
5861
- /** Free-form one-liner: "tightened tool descriptions in forge-tools.ts". */
5862
- description?: string;
5863
- /** What the runner was trying to fix (carried into EvolvableVariant.rationale). */
5864
- rationale?: string;
5865
- /** Caller-defined diff payload. Mapped into the variant's payload by
5866
- * `toVariantPayload`; agent-eval treats it as opaque. */
5867
- artifact?: unknown;
5868
- /** When ok === false. Free-form: 'parse_failure' / 'agent_error' /
5869
- * 'no_changes' / 'commit_failed' / etc. */
5870
- failureReason?: string;
5871
- /** Telemetry stats. */
5872
- diffBytes?: number;
5873
- filesTouched?: number;
5874
- agentSteps?: number;
5875
- costUsd?: number;
5876
- latencyMs: number;
5877
- }
5878
- type CodeMutationRunner<T, P> = (args: {
5879
- slot: PoolSlot<T>;
5880
- parent: EvolvableVariant<P>;
5881
- parentAggregate: VariantAggregate;
5882
- topTrials: TrialResult[];
5883
- bottomTrials: TrialResult[];
5884
- childCount: number;
5885
- generation: number;
5886
- }) => Promise<CodeMutationOutcome[]>;
5887
- interface CreateSandboxCodeMutatorOpts<T, P> {
5888
- pool: SandboxPool<T>;
5889
- runner: CodeMutationRunner<T, P>;
5890
- /**
5891
- * Map an outcome into the variant payload `P`. Lets the consumer
5892
- * encode the diff however they want (file map, patch string, branch
5893
- * ref, snapshot id) without agent-eval taking a stance.
5894
- */
5895
- toVariantPayload(outcome: CodeMutationOutcome, parent: EvolvableVariant<P>): P;
5896
- /** Optional telemetry sinks. */
5897
- mutationTelemetry?: MutationTelemetry;
5898
- costLedger?: CostLedger;
5899
- lineage?: LineageRecorder<P>;
5900
- /** Override id generation. Default: `${parent.id}.g${generation}.code.${i}`. */
5901
- childIdFor?(parent: EvolvableVariant<P>, generation: number, index: number): string;
5902
- /** Default label for the variant (visible in reports). */
5903
- labelFor?(outcome: CodeMutationOutcome, parent: EvolvableVariant<P>, generation: number, index: number): string;
5904
- }
5905
- declare function createSandboxCodeMutator<T, P>(opts: CreateSandboxCodeMutatorOpts<T, P>): MutateAdapter<P>;
5906
-
5907
- /**
5908
- * createCompositeMutator — combines two `MutateAdapter<P>`s under a policy.
5909
- *
5910
- * primary-only — every generation runs `primary` (typical: a reflective
5911
- * prompt mutator). The default.
5912
- * secondary-only — every generation runs `secondary` (typical: a coding
5913
- * agent that edits the harness itself). Slow + expensive.
5914
- * alternate — even gens run `primary`, odd gens run `secondary`.
5915
- * plateau — start with `primary`; switch to a 50/50 split between
5916
- * `primary` and `secondary` after K gens with less than
5917
- * Δ improvement (auto-detect when prompt evolution has
5918
- * hit a structural ceiling).
5919
- *
5920
- * Naming is generic — the canonical use cases are "prompt" and "code"
5921
- * channels, but the primitive doesn't care what each mutator actually does.
5922
- */
5923
-
5924
- type CompositePolicy = 'primary-only' | 'secondary-only' | 'alternate' | 'plateau';
5925
- interface CreateCompositeMutatorOpts<P> {
5926
- primary: MutateAdapter<P>;
5927
- secondary?: MutateAdapter<P>;
5928
- policy: CompositePolicy;
5929
- /** For 'plateau': minimum improvement (Δ meanScore) to count as progress. Default 0.02. */
5930
- plateauThreshold?: number;
5931
- /** For 'plateau': consecutive gens without progress that trigger split mode. Default 2. */
5932
- plateauPatience?: number;
5933
- /** Optional progress hook. */
5934
- onPolicyDecision?: (info: {
5935
- generation: number;
5936
- chose: 'primary' | 'secondary' | 'split';
5937
- reason: string;
5938
- }) => void;
5939
- }
5940
- declare function createCompositeMutator<P>(opts: CreateCompositeMutatorOpts<P>): MutateAdapter<P>;
5941
-
5942
5445
  /**
5943
5446
  * concurrency — small primitives the evolution loop needs.
5944
5447
  *
@@ -6068,38 +5571,6 @@ declare function precision<T>(goldens: GoldenSpec[], candidates: T[], options?:
6068
5571
  text?: (candidate: T) => string;
6069
5572
  }): number;
6070
5573
 
6071
- /**
6072
- * JsonlTrialCache — `TrialCache` backed by a JSONL append-only file so a
6073
- * crashed `runPromptEvolution` can resume without re-running expensive
6074
- * trials. Last write wins on key collision; the file is forward-swept at
6075
- * construction.
6076
- *
6077
- * Tail corruption (partial line at the bottom from a hard kill) is
6078
- * tolerated — we skip unparseable lines and continue.
6079
- *
6080
- * The cache surface (`get` / `set`) is synchronous because `TrialCache`
6081
- * is. Writes are mutex-serialised through a `LockedJsonlAppender`
6082
- * (kicked off with `void`) so two in-process callers can't tear a long
6083
- * line that exceeds POSIX `PIPE_BUF`. Cross-process safety still
6084
- * requires fcntl/flock and is deliberately out of scope.
6085
- */
6086
-
6087
- declare class JsonlTrialCache implements TrialCache {
6088
- private readonly map;
6089
- private readonly path;
6090
- private readonly appender;
6091
- constructor(path: string);
6092
- get(key: string): TrialResult | undefined;
6093
- set(key: string, value: TrialResult): void;
6094
- size(): number;
6095
- /**
6096
- * Synchronous fallback path for tests / CLI tools that want to be sure
6097
- * the line is on disk before returning. Bypasses the mutex (single-
6098
- * threaded callers only).
6099
- */
6100
- setSync(key: string, value: TrialResult): void;
6101
- }
6102
-
6103
5574
  /**
6104
5575
  * Wrap a single judge LLM call with retry, optional fallback-model
6105
5576
  * rotation, exponential backoff, and a typed `JudgeRetryOutcome`. Callers
@@ -6232,52 +5703,145 @@ declare function referenceReplayRunsToSteeringRows<Input = unknown>(runs: Refere
6232
5703
  declare function referenceReplayScenarioToRunScore(scenarioScore: ReferenceReplayScenarioScore, durationMs?: number): RunScore;
6233
5704
 
6234
5705
  /**
6235
- * Aggregate trials with explicit handling of judge failure. Three modes:
6236
- *
6237
- * - `strict-fail` — any `judgeSucceeded === false` trial fails the whole
6238
- * aggregate. Use for production gates: one corrupt trial halts the gate.
5706
+ * Reflective mutation primitives for trace-conditioned prompt rewriting.
6239
5707
  *
6240
- * - `exclude-failed` — drop `judgeSucceeded === false` trials from the
6241
- * mean; report `excludedFailedTrials` separately. Default for new code.
5708
+ * Used by `prompt-evolution.ts` (and any consumer running iterative
5709
+ * improvement). Given a parent prompt + concrete trace evidence (top trials,
5710
+ * bottom trials, missed expectations), produce an LLM-ready prompt that
5711
+ * proposes targeted mutations — not blind rephrasings.
6242
5712
  *
6243
- * - `zero-fill` — failed trials count as `score: 0` in the mean. Available
6244
- * only for adapters that don't yet set `judgeSucceeded`.
5713
+ * Why this lives outside `prompt-evolution.ts`: any consumer that wants to
5714
+ * run reflective rewriting WITHOUT the population/Pareto machinery can
5715
+ * import these primitives directly.
6245
5716
  *
6246
- * Hard-errored trials (`t.error` set) are always excluded — those are
6247
- * infrastructure failures, not eval signal.
5717
+ * Quality bar (vs. naive "mutate this prompt"):
5718
+ * - Show parent ↔ children diff, not just one variant
5719
+ * - Quote specific missed goldens with their match phrases
5720
+ * - Surface the model's actual emitted output side-by-side with what was expected
5721
+ * - Quote concrete mutation primitives so the model has a vocabulary
6248
5722
  */
5723
+ interface TrialTrace {
5724
+ /** Stable id for the trial — surfaces in the prompt for grounding. */
5725
+ id: string;
5726
+ /** Score the trial received on its primary metric. */
5727
+ score: number;
5728
+ /** Candidate inputs the agent was given (e.g., the fixture or scenario). */
5729
+ inputName?: string;
5730
+ /**
5731
+ * Goldens / expectations this trial was tested against, with whether each
5732
+ * was matched. The reflection prompt quotes the missed ones specifically.
5733
+ */
5734
+ expectations?: Array<{
5735
+ id: string;
5736
+ phrase: string;
5737
+ matched: boolean;
5738
+ }>;
5739
+ /** Free-form text — what the agent actually emitted (e.g., findings, plan). */
5740
+ emitted?: string;
5741
+ /** Optional structured metrics (recall, precision, cost, latency). */
5742
+ metrics?: Record<string, number>;
5743
+ }
5744
+ interface ReflectionContext {
5745
+ /** What is being mutated — appears in the system prompt for orientation. */
5746
+ target: string;
5747
+ /** Current variant's payload — JSON-serialised for the prompt. */
5748
+ parentPayload: unknown;
5749
+ /** Best-performing trials this generation. */
5750
+ topTrials: TrialTrace[];
5751
+ /** Worst-performing trials this generation — the missed-golden source. */
5752
+ bottomTrials: TrialTrace[];
5753
+ /** How many children the mutator should propose. */
5754
+ childCount: number;
5755
+ /** Optional: domain-specific mutation primitives the model can pick from. */
5756
+ mutationPrimitives?: string[];
5757
+ }
5758
+ declare const DEFAULT_MUTATION_PRIMITIVES: string[];
5759
+ /**
5760
+ * Build the LLM-ready reflection prompt. Output is plain text — pass it as
5761
+ * the user message. The system message should be small and stable (e.g.
5762
+ * "Output ONLY a JSON object matching the schema below.").
5763
+ */
5764
+ declare function buildReflectionPrompt(ctx: ReflectionContext): string;
5765
+ interface ReflectionProposal {
5766
+ label: string;
5767
+ rationale: string;
5768
+ payload: unknown;
5769
+ }
5770
+ declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
6249
5771
 
6250
- type AggregatorMode = 'strict-fail' | 'exclude-failed' | 'zero-fill';
6251
- interface TrialAggregate {
6252
- /** Mean score over the trials counted by the chosen mode. */
6253
- meanScore: number;
6254
- /** Mean cost across counted trials. */
6255
- meanCost: number;
6256
- /** Mean wall time across counted trials. */
6257
- meanDurationMs: number;
6258
- /** Fraction of counted trials with `ok === true`. */
6259
- okRate: number;
6260
- /** Trials counted in the mean (mode-dependent). */
6261
- countedTrials: number;
6262
- /** Trials excluded because `judgeSucceeded === false` (exclude-failed mode). */
6263
- excludedFailedTrials: number;
6264
- /** Total trials passed in. */
6265
- totalTrials: number;
6266
- /** Mean of every numeric metric across counted trials. */
6267
- metrics: Record<string, number>;
5772
+ /**
5773
+ * SandboxPool — bounded checkout/release pool for mutation slots.
5774
+ *
5775
+ * The composite-mutator's `code` channel needs an isolated workspace per
5776
+ * mutation attempt: a git worktree, a sandbox container, a tmpdir clone —
5777
+ * whatever the consumer's runtime is. Without a pool, every consumer
5778
+ * re-implements the same machinery (mint N slots, check one out per
5779
+ * mutation, reset before reuse, drain at the end, track utilisation for
5780
+ * the cost ledger). This primitive ships that machinery so consumers
5781
+ * supply only a `SlotFactory`.
5782
+ *
5783
+ * Generic over a slot resource `T` so the same pool serves git worktrees
5784
+ * (T = path), Tangle sandboxes (T = SandboxBox), or anything else with
5785
+ * the create/reset/destroy lifecycle.
5786
+ *
5787
+ * Concurrency: FIFO via the shared `Mutex` primitive. Each `checkout()`
5788
+ * either takes an idle slot or queues until one is released. Lifecycle
5789
+ * is single-process — multi-process pools need external coordination
5790
+ * (file locks, etc.) and are deliberately out of scope.
5791
+ */
5792
+ interface PoolSlot<T> {
5793
+ /** Stable id assigned at slot creation. Use for telemetry / lineage. */
5794
+ readonly id: string;
5795
+ /** Consumer-defined resource. */
5796
+ readonly resource: T;
5797
+ }
5798
+ interface SlotFactory<T> {
5799
+ /** Build a new slot. Called lazily as the pool grows up to `size`. */
5800
+ create(slotId: string): Promise<T>;
5801
+ /**
5802
+ * Reset a slot to a clean state before reuse. Called BEFORE every
5803
+ * checkout returns it (including the first — so the factory's
5804
+ * `create` can leave the slot dirty and let `reset` normalise).
5805
+ * Optional; default is a no-op.
5806
+ */
5807
+ reset?(slot: PoolSlot<T>): Promise<void>;
5808
+ /** Tear the slot down. Called by `drain()`. */
5809
+ destroy(slot: PoolSlot<T>): Promise<void>;
5810
+ }
5811
+ interface SandboxPool<T> {
5812
+ /**
5813
+ * Take a slot. If all slots are busy, the promise resolves when one
5814
+ * is released. Always pair with the returned `release` (or wrap with
5815
+ * `withSlot`).
5816
+ */
5817
+ checkout(): Promise<{
5818
+ slot: PoolSlot<T>;
5819
+ release: () => void;
5820
+ }>;
6268
5821
  /**
6269
- * Set when mode is `strict-fail` AND at least one trial had
6270
- * `judgeSucceeded === false`. Caller should refuse to use this aggregate
6271
- * downstream — the eval is corrupt.
5822
+ * Run `fn` with a checked-out slot, releasing on completion or throw.
5823
+ * The convenience wrapper most callers should use.
6272
5824
  */
6273
- strictFailure?: {
6274
- failedCount: number;
6275
- firstError?: string;
5825
+ withSlot<R>(fn: (slot: PoolSlot<T>) => Promise<R>): Promise<R>;
5826
+ /** Destroy every slot. Idempotent. */
5827
+ drain(): Promise<void>;
5828
+ /** How many slots have been minted (≤ `size`). */
5829
+ poolSize(): number;
5830
+ /** How many checkouts are currently outstanding. */
5831
+ activeCheckouts(): number;
5832
+ /** Snapshot of busy/total durations for the cost ledger. */
5833
+ utilization(): {
5834
+ busyMs: number;
5835
+ totalMs: number;
5836
+ checkouts: number;
6276
5837
  };
6277
5838
  }
6278
- declare function aggregateTrialsByMode(trials: TrialResult[], opts: {
6279
- mode: AggregatorMode;
6280
- }): TrialAggregate;
5839
+ interface CreateSandboxPoolOpts<T> {
5840
+ /** Maximum concurrent slots. Slots are minted on first need, not eagerly. */
5841
+ size: number;
5842
+ factory: SlotFactory<T>;
5843
+ }
5844
+ declare function createSandboxPool<T>(opts: CreateSandboxPoolOpts<T>): SandboxPool<T>;
6281
5845
 
6282
5846
  /**
6283
5847
  * Pipeline-level OTEL integration — auto-attaches an OTEL exporter when
@@ -6363,28 +5927,4 @@ declare function traceJudge(judge: JudgeFn, judgeName: string, opts: TracedJudge
6363
5927
  */
6364
5928
  declare function traceJudgeEnsemble(judges: JudgeFn[], judgeNames: string[], opts: TracedJudgeOptions): JudgeFn;
6365
5929
 
6366
- /**
6367
- * Traced mutator wrapper — instruments reflective-mutation LLM calls.
6368
- *
6369
- * The reflective mutator (used by production-loop + multi-shot-optimization)
6370
- * builds a prompt via `buildReflectionPrompt` and calls an LLM to produce
6371
- * candidate mutations. This wrapper emits a span around each mutation call
6372
- * so OTEL sinks observe:
6373
- * - Model used for mutation
6374
- * - Input context (target, trial count, child count)
6375
- * - Output (proposal count, labels)
6376
- * - Duration + cost if available
6377
- */
6378
-
6379
- interface TracedMutatorOptions {
6380
- /** TraceEmitter for span emission. */
6381
- emitter: TraceEmitter;
6382
- /** Parent span id. If omitted, uses emitter stack. */
6383
- parentSpanId?: string;
6384
- }
6385
- /**
6386
- * Wrap a MutateAdapter so every mutate() call emits a span.
6387
- */
6388
- declare function traceMutator<P>(adapter: MutateAdapter<P>, opts: TracedMutatorOptions): MutateAdapter<P>;
6389
-
6390
- export { ANALYST_SEVERITIES, ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AgentProfile, type AggregatorMode, type AlignmentOp, type Analyst, type AnalystContext, type AnalystCost, type AnalystFinding, type AnalystHooks, type AnalystInputKind, AnalystRegistry, type AnalystRegistryOptions, type AnalystRequirements, type AnalystRunEvent, type AnalystRunInputs, type AnalystRunResult, type AnalystRunSummary, type AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, type BudgetPolicy, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, type ChatCallOpts, type ChatClient, type ChatRequest, type ChatResponse, type ChatTransport, CheckResult, type CliBridgeTransportOpts, type CodeMutationOutcome, type CodeMutationRunner, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CorrectnessChecker, type CostEntry, CostLedger, type CostLedgerGeneration, type 
CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateChatClientOpts, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type DirectProviderTransportOpts, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvidenceRef, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, 
type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAdapterOpts, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallRequest, LlmCallResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MockTransportOpts, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, MultiShotMutateAdapter, 
MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type 
ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegistryRunOpts, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouterTransportOpts, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxSdkTransportOpts, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type 
SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, type TracedMutatorOptions, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, agentProfileHash, aggregatePrReviewScore, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, appendScorecard, assertRealBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, computeFindingId, containsAll, createAntiSlopJudge, createChatClient, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, 
defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, makeFinding, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, 
runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, traceMutator, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
5930
+ export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AgentProfile, type AlignmentOp, type Analyst, type AnalystContext, type AnalystCost, type AnalystFinding, type AnalystHooks, type AnalystInputKind, AnalystRegistry, type AnalystRegistryOptions, type AnalystRequirements, type AnalystRunEvent, type AnalystRunInputs, type AnalystRunResult, type AnalystRunSummary, type AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, type BudgetPolicy, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, type ChatCallOpts, type ChatClient, type ChatRequest, type ChatResponse, type ChatTransport, CheckResult, type CliBridgeTransportOpts, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateChatClientOpts, type 
CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type DirectProviderTransportOpts, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvidenceRef, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type 
HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallRequest, LlmCallResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MockTransportOpts, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type 
PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegistryRunOpts, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouterTransportOpts, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, 
SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxSdkTransportOpts, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, agentProfileHash, aggregatePrReviewScore, aggregateRunScore, 
analyzeAntiSlop, analyzeSeries, appendScorecard, assertRealBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, computeFindingId, containsAll, createAntiSlopJudge, createChatClient, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, makeFinding, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, 
paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };