@tangle-network/agent-eval 0.24.0 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -6,12 +6,16 @@ import { A as AgentEvalError } from './errors-BZ9sTdz7.js';
6
6
  export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-BZ9sTdz7.js';
7
7
  import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-DfFdrraJ.js';
8
8
  export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-DfFdrraJ.js';
9
- import { A as ActionableSideInfo, a4 as Objective, a5 as ParetoResult, E as EvolvableVariant, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-BXGs_9V0.js';
10
- export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a6 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a3 as GateDecision, a7 as GateEvidence, G as GenerationReport, a8 as HeldOutGate, a9 as HeldOutGateConfig, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-BXGs_9V0.js';
9
+ import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-DZVXOCK_.js';
10
+ export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-DZVXOCK_.js';
11
11
  export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
12
- import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-TDPn1cxq.js';
13
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-TDPn1cxq.js';
14
- import { R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, g as BudgetSpec, h as RunFilter, L as LlmSpan } from './store-Db2Bv8Cf.js';
12
+ import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-wfUySN5F.js';
13
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-wfUySN5F.js';
14
+ import { a as FailureCluster } from './failure-cluster-C2EGSDiT.js';
15
+ export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-C2EGSDiT.js';
16
+ import { a as RunSplitTag, R as RunRecord } from './run-record-CqzahIbx.js';
17
+ export { c as RunJudgeMetadata, d as RunOutcome, e as RunRecordValidationError, b as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-CqzahIbx.js';
18
+ import { T as TraceStore, R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, g as BudgetSpec, h as RunFilter, L as LlmSpan } from './store-Db2Bv8Cf.js';
15
19
  export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, M as Message, e as RetrievalSpan, m as RunLayer, n as RunStatus, f as SandboxSpan, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
16
20
  import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-B2kWEdh9.js';
17
21
  export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-B2kWEdh9.js';
@@ -20,25 +24,146 @@ export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b a
20
24
  export { F as FileSystemRawProviderSink, d as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, e as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, f as RawProviderDirection, c as RawProviderEvent, R as RawProviderSink, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-DK2EBVZC.js';
21
25
  export { D as DEFAULT_REDACTION_RULES, O as OTEL_AGENT_EVAL_SCOPE, a as OtlpExport, b as OtlpResourceSpans, c as OtlpSpan, R as REDACTION_VERSION, d as RedactionReport, e as RedactionRule, f as ReplayCache, g as ReplayCacheEntry, h as ReplayCacheMissError, i as ReplayCacheStats, j as ReplayFetchOptions, k as createReplayFetch, l as exportRunAsOtlp, m as iterateRawCalls, r as redactString, n as redactValue } from './replay-BL96gCEP.js';
22
26
  export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
23
- export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-C2EGSDiT.js';
24
27
  import { a as BaselineReport } from './baseline-4R5deP0N.js';
25
28
  export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-4R5deP0N.js';
26
29
  import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
27
30
  export { b as buildTrajectory } from './trajectory-CnoBo-JY.js';
28
31
  import { a as DatasetScenario, c as Dataset } from './dataset-CiK_3LDr.js';
29
32
  export { d as DatasetDifficulty, b as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-CiK_3LDr.js';
30
- export { C as CalibrationResult, a as CandidateScore, D as DEFAULT_RED_TEAM_CORPUS, E as EuRiskClass, G as GoldenItem, b as GovernanceContext, c as GovernanceFinding, d as GovernanceReport, P as PositionalBiasResult, R as RedTeamCase, e as RedTeamCategory, f as RedTeamFinding, g as RedTeamPayload, h as RedTeamReport, S as SelfPreferenceResult, U as UseCaseSignals, V as VerbosityBiasResult, i as calibrateJudge, j as classifyEuAiRisk, k as euAiActReport, n as nistAiRmfReport, p as positionalBias, r as redTeamDataset, l as redTeamReport, m as renderMarkdown, s as scoreRedTeamOutput, o as selfPreference, q as soc2Report, t as summarize, u as toolNamesForRun, v as verbosityBias } from './index-Oj9fAPPN.js';
33
+ export { C as CalibrationResult, a as CandidateScore, b as ContinuousAgreement, c as ContinuousAgreementOptions, d as ContinuousCalibrationResult, D as DEFAULT_RED_TEAM_CORPUS, E as EuRiskClass, G as GoldenItem, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, P as PositionalBiasResult, R as RedTeamCase, h as RedTeamCategory, i as RedTeamFinding, j as RedTeamPayload, k as RedTeamReport, S as SelfPreferenceResult, U as UseCaseSignals, V as VerbosityBiasResult, l as calibrateJudge, m as calibrateJudgeContinuous, n as classifyEuAiRisk, o as continuousAgreement, p as euAiActReport, q as nistAiRmfReport, r as positionalBias, s as redTeamDataset, t as redTeamReport, u as renderMarkdown, v as scoreRedTeamOutput, w as selfPreference, x as soc2Report, y as summarize, z as toolNamesForRun, A as verbosityBias } from './index-D3iBCjdF.js';
31
34
  import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
32
35
  import { L as Layer, S as Severity, a as LayerResult, b as VerifyContext } from './multi-layer-verifier-LkP3LVKj.js';
33
36
  export { F as Finding, c as LayerStatus, M as MultiLayerVerifier, V as VerificationReport, d as VerifyOptions, g as gradeSemanticStatus } from './multi-layer-verifier-LkP3LVKj.js';
34
- import { L as LlmClientOptions } from './researcher-CUOiGcGv.js';
35
- export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, m as LlmCallError, n as LlmCallRequest, o as LlmCallResult, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './researcher-CUOiGcGv.js';
37
+ import { L as LlmClientOptions } from './researcher-bGkI7vCl.js';
38
+ export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, m as LlmCallError, n as LlmCallRequest, o as LlmCallResult, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './researcher-bGkI7vCl.js';
36
39
  export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index--fVrWDiR.js';
37
- import { R as RunRecord } from './run-record-CqzahIbx.js';
38
- export { c as RunJudgeMetadata, d as RunOutcome, e as RunRecordValidationError, a as RunSplitTag, b as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-CqzahIbx.js';
39
40
  export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-Dgz1n51-.js';
40
41
  import './outcome-store-D6KWmYvj.js';
41
42
 
43
+ /**
44
+ * Automated pull request opener for the production loop.
45
+ *
46
+ * `runProductionLoop` produces a `promotedPrompt` string and a release
47
+ * scorecard. To close the eval → prod → eval cycle the framework needs
48
+ * to land that prompt as a reviewable code change. This module does
49
+ * exactly that:
50
+ *
51
+ * 1. Stage a branch off `baseBranch`.
52
+ * 2. Write each `fileChange` into the worktree.
53
+ * 3. Commit + push.
54
+ * 4. Open a PR via the GitHub API.
55
+ *
56
+ * Two transports ship in core:
57
+ *
58
+ * - `ghCliClient(opts)` — shells out to the `gh` CLI. No extra deps,
59
+ * re-uses the developer machine's `gh auth` state, works with both
60
+ * github.com and GitHub Enterprise. This is the recommended default.
61
+ * - `httpGithubClient(opts)` — direct `fetch` against `api.github.com`
62
+ * with a bearer token. Useful in CI where `gh` may not be installed.
63
+ *
64
+ * Both implement the small `AutoPrClient` interface, so tests substitute
65
+ * a fake without spinning a process or network.
66
+ *
67
+ * @experimental — added in 0.25.0. Surface may evolve as consumers wire
68
+ * it into CI workflows.
69
+ */
70
+ interface FileChange {
71
+ /** Repo-relative path. Forward slashes; no `..`. */
72
+ path: string;
73
+ /** New file contents. UTF-8. */
74
+ contents: string;
75
+ /** Optional explanatory comment shown in the commit body. */
76
+ rationale?: string;
77
+ }
78
+ interface RepoRef {
79
+ owner: string;
80
+ name: string;
81
+ }
82
+ interface ProposeAutomatedPullRequestInput {
83
+ repo: RepoRef;
84
+ /** Branch to base the PR on. Default `'main'`. */
85
+ baseBranch?: string;
86
+ /** New branch name. Use a prefix + a short stable id; no spaces. */
87
+ branchName: string;
88
+ fileChanges: FileChange[];
89
+ title: string;
90
+ body: string;
91
+ /** Optional GitHub usernames to request review from. */
92
+ reviewers?: string[];
93
+ /** Optional labels to apply. */
94
+ labels?: string[];
95
+ /** Commit author name. Default: derived from the GitHub client. */
96
+ authorName?: string;
97
+ /** Commit author email. Default: derived from the GitHub client. */
98
+ authorEmail?: string;
99
+ /** Dry-run — do not push or open a PR; just return the would-be plan. */
100
+ dryRun?: boolean;
101
+ }
102
+ interface ProposeAutomatedPullRequestResult {
103
+ prUrl: string;
104
+ branchName: string;
105
+ headSha: string;
106
+ dryRun: boolean;
107
+ }
108
+ /** Pluggable transport for the auto-PR pipeline. */
109
+ interface AutoPrClient {
110
+ /**
111
+ * Create a branch from `baseBranch`, write file changes, commit, push,
112
+ * and open a PR. Returns the PR's HTML url and head SHA.
113
+ *
114
+ * Implementations must be idempotent on `branchName`: if the branch
115
+ * already exists with the same head SHA as the would-be commit, return
116
+ * the existing PR rather than failing. This makes the production loop
117
+ * safe to retry on transient errors.
118
+ */
119
+ proposeChange(input: ProposeAutomatedPullRequestInput): Promise<ProposeAutomatedPullRequestResult>;
120
+ }
121
+ declare function proposeAutomatedPullRequest(client: AutoPrClient, input: ProposeAutomatedPullRequestInput): Promise<ProposeAutomatedPullRequestResult>;
122
+ interface HttpGithubClientOptions {
123
+ /** Personal access token, GitHub App token, or `GITHUB_TOKEN` from Actions. */
124
+ token: string;
125
+ /** Override for GitHub Enterprise. Default `'https://api.github.com'`. */
126
+ apiBase?: string;
127
+ /** Test seam — defaults to global `fetch`. */
128
+ fetchImpl?: typeof fetch;
129
+ /** Test seam — clock for commit timestamps. */
130
+ now?: () => Date;
131
+ }
132
+ /**
133
+ * Direct REST-API GitHub client. No external deps.
134
+ *
135
+ * Idempotency strategy: before creating refs/commits/PRs, check whether
136
+ * the branch already exists at the desired tree. If so, return the
137
+ * existing PR (or open one if missing). Errors from concurrent runs
138
+ * (`Reference already exists`) are caught and treated as success.
139
+ */
140
+ declare function httpGithubClient(opts: HttpGithubClientOptions): AutoPrClient;
141
+ interface GhCliClientOptions {
142
+ /** Override the CLI binary (`gh`). For testing. */
143
+ bin?: string;
144
+ /** Working directory containing a clone of `repo`. Default: process cwd. */
145
+ cwd?: string;
146
+ /** Test seam: process spawner. Default: node:child_process spawn. */
147
+ exec?: (bin: string, args: string[], opts: {
148
+ cwd: string;
149
+ stdin?: string;
150
+ }) => Promise<{
151
+ stdout: string;
152
+ stderr: string;
153
+ exitCode: number;
154
+ }>;
155
+ }
156
+ /**
157
+ * `gh` CLI transport. Requires:
158
+ * - `gh` installed and authenticated (`gh auth status`).
159
+ * - A local clone of the repo with a clean working tree.
160
+ * - `git` on PATH.
161
+ *
162
+ * Uses `gh api` for repo metadata and `gh pr create` for the PR. The
163
+ * actual commit lands via `git`, which keeps `gh`'s footprint minimal.
164
+ */
165
+ declare function ghCliClient(opts?: GhCliClientOptions): AutoPrClient;
166
+
42
167
  interface Scenario {
43
168
  id: string;
44
169
  persona: string;
@@ -595,6 +720,181 @@ declare class MetricsCollector {
595
720
  getConvergenceCurve(): number[];
596
721
  }
597
722
 
723
+ /**
724
+ * ProductionLoop — the substrate that closes eval → prod → eval.
725
+ *
726
+ * Static prompts decay. Yesterday's regulation flips today; yesterday's
727
+ * tool quirk becomes today's incident. A production agent that ships a
728
+ * static prompt and never re-trains is on a clock.
729
+ *
730
+ * `runProductionLoop` is the orchestration layer over the eval substrate:
731
+ *
732
+ * 1. Ingest production traces + user feedback (via the wire HTTP
733
+ * ingestion endpoints, or directly through any `TraceStore` and
734
+ * `FeedbackTrajectoryStore` implementation).
735
+ * 2. Cluster the failures (`failureClusterView`) and prioritize by
736
+ * size × severity.
737
+ * 3. If any cluster crosses the consumer's threshold, run a
738
+ * `runMultiShotOptimization` round seeded by the current production
739
+ * prompt against holdout-shape scenarios derived from the offending
740
+ * cluster.
741
+ * 4. Gate the promoted prompt with `evaluateReleaseConfidence`. Fail
742
+ * closed.
743
+ * 5. If the gate passes and an `AutoPrClient` is wired, open a PR with
744
+ * the new prompt. Otherwise return the proposed change.
745
+ *
746
+ * One call = one cycle. Cron / GitHub Actions are the caller's job. The
747
+ * primitive is idempotent + replayable: re-running with the same
748
+ * `runId` will produce the same plan.
749
+ *
750
+ * @experimental — added in 0.25.0. Surface may evolve as the 5 product
751
+ * agents wire it in.
752
+ */
753
+
754
+ interface FailureClusterConfig {
755
+ /** Minimum runs in a cluster before it triggers an evolve round. Default 5. */
756
+ minClusterSize?: number;
757
+ /**
758
+ * Severity threshold. A cluster is "actionable" when its size
759
+ * normalized by total runs exceeds this. Default 0.05 (5% of all runs).
760
+ */
761
+ minSeverityRatio?: number;
762
+ /**
763
+ * Maximum number of clusters to react to in one cycle. Acting on too
764
+ * many at once obscures attribution. Default 1 — the worst cluster.
765
+ */
766
+ maxClustersPerCycle?: number;
767
+ }
768
+ interface ProductionEvolveConfig<P = string> {
769
+ /** How to run a candidate prompt against a scenario. */
770
+ runner: MultiShotRunner<P>;
771
+ /** How to score the trajectory. Usually a calibrated judge. */
772
+ scorer: MultiShotScorer<P>;
773
+ /** How to mutate. Addendum-style mutators (append vs. rewrite) work best. */
774
+ mutator: MultiShotMutateAdapter<P>;
775
+ /** The current production prompt. Acts as the baseline + seed. */
776
+ baselinePrompt: P;
777
+ /** Stable id for the baseline variant. Default `'baseline'`. */
778
+ baselineId?: string;
779
+ /** Scenarios resembling production load. Used as the holdout split. */
780
+ holdoutScenarios: Scenario[];
781
+ /** Scenarios used during search. Default: derived from `holdoutScenarios` via deterministic split. */
782
+ searchScenarios?: Scenario[];
783
+ /** Gate config for the held-out promotion check. */
784
+ gate: HeldOutGateConfig;
785
+ /** Reps per (variant × scenario) cell. Default 3. */
786
+ reps?: number;
787
+ /** Number of mutation generations. Default 3. */
788
+ generations?: number;
789
+ /** Population size per generation. Default 4. */
790
+ populationSize?: number;
791
+ /** Concurrent score() calls. Default 1. */
792
+ scoreConcurrency?: number;
793
+ /**
794
+ * Optional bridge from a scored trial into a paper-grade RunRecord.
795
+ * If omitted, the loop synthesises a minimal record sufficient for
796
+ * `HeldOutGate` and `evaluateReleaseConfidence`.
797
+ */
798
+ toRunRecord?: (input: {
799
+ variant: EvolvableVariant<P>;
800
+ scenarioId: string;
801
+ rep: number;
802
+ split: RunSplitTag;
803
+ seed: number;
804
+ trial: MultiShotTrialResult;
805
+ }) => RunRecord;
806
+ }
807
+ interface ProductionShipConfig {
808
+ repo: RepoRef;
809
+ /** Branch name prefix. Final branch = `${branchPrefix}/${runId}`. */
810
+ branchPrefix: string;
811
+ /** Path (repo-relative) of the file holding the production prompt. */
812
+ promptFilePath: string;
813
+ /** Base branch for the PR. Default `'main'`. */
814
+ baseBranch?: string;
815
+ reviewers?: string[];
816
+ labels?: string[];
817
+ /** Required: the auto-PR transport. Use `ghCliClient()` or `httpGithubClient()`. */
818
+ client: AutoPrClient;
819
+ /** Skip the actual push + PR call — for sanity-checking the plan. Default false. */
820
+ dryRun?: boolean;
821
+ /** Render PR body from the loop's findings. Optional override. */
822
+ renderBody?: (ctx: ProductionLoopRenderContext) => string;
823
+ /** Render the file contents from the new prompt. Default: serialize as the file. */
824
+ renderPromptFile?: (newPrompt: string, oldFileContents: string | null) => string;
825
+ /** Read the current prompt file contents for diff context. Optional. */
826
+ readCurrentPromptFile?: () => Promise<string | null>;
827
+ }
828
+ interface ProductionLoopCronConfig {
829
+ cadence: 'weekly' | 'daily' | 'hourly';
830
+ /** Optional jitter (seconds) the consumer's scheduler should add. Surface-only. */
831
+ jitterSec?: number;
832
+ }
833
+ interface RunProductionLoopOptions<P = string> {
834
+ /** Stable id; deterministic outputs when reused. */
835
+ runId: string;
836
+ /** Human label — surfaces in PR titles and reports. */
837
+ target: string;
838
+ traceStore: TraceStore;
839
+ feedbackStore: FeedbackTrajectoryStore;
840
+ cluster: FailureClusterConfig;
841
+ evolve: ProductionEvolveConfig<P>;
842
+ /** When omitted, the loop returns the proposed prompt without opening a PR. */
843
+ ship?: ProductionShipConfig;
844
+ /** Surface-only — encodes scheduler expectations into the artifact. */
845
+ cron?: ProductionLoopCronConfig;
846
+ /** Release confidence thresholds. Default: library defaults. */
847
+ releaseThresholds?: ReleaseConfidenceThresholds;
848
+ /** Now() seam for reproducibility in tests. */
849
+ now?: () => Date;
850
+ }
851
+ type ProductionLoopDecision = 'no_actionable_failures' | 'evolve_yielded_no_improvement' | 'gate_failed' | 'proposed_change' | 'pr_opened';
852
+ interface ProductionLoopRenderContext {
853
+ runId: string;
854
+ target: string;
855
+ decision: ProductionLoopDecision;
856
+ /** Clusters seen in production this cycle, sorted by severity. */
857
+ clusters: FailureCluster[];
858
+ /** The cluster the loop acted on (if any). */
859
+ actedOnCluster: FailureCluster | null;
860
+ /** Production runs observed this cycle. */
861
+ observedRunCount: number;
862
+ /** Feedback trajectories observed this cycle. */
863
+ observedFeedbackCount: number;
864
+ /** Evolve result (if evolve ran). */
865
+ evolution: MultiShotOptimizationResult<unknown> | null;
866
+ /** Release gate verdict (if evolve ran). */
867
+ release: ReleaseConfidenceScorecard | null;
868
+ /** Held-out gate decision (if a candidate was paired against the baseline). */
869
+ gate: GateDecision | null;
870
+ /** The baseline (current production) prompt as a string. */
871
+ baselinePromptString: string;
872
+ /** The proposed new prompt as a string. Empty if no change was proposed. */
873
+ promotedPromptString: string;
874
+ }
875
+ interface ProductionLoopResult {
876
+ runId: string;
877
+ target: string;
878
+ decision: ProductionLoopDecision;
879
+ startedAt: string;
880
+ finishedAt: string;
881
+ observedRunCount: number;
882
+ observedFeedbackCount: number;
883
+ clusters: FailureCluster[];
884
+ actedOnCluster: FailureCluster | null;
885
+ evolution: MultiShotOptimizationResult<unknown> | null;
886
+ release: ReleaseConfidenceScorecard | null;
887
+ gate: GateDecision | null;
888
+ /** Baseline prompt as it entered the cycle. */
889
+ baselinePrompt: unknown;
890
+ /** Promoted prompt — equals baseline when no change is proposed. */
891
+ promotedPrompt: unknown;
892
+ /** PR artifact when `ship` was wired and gate passed. */
893
+ pullRequest: ProposeAutomatedPullRequestResult | null;
894
+ cron: ProductionLoopCronConfig | null;
895
+ }
896
+ declare function runProductionLoop<P = string>(opts: RunProductionLoopOptions<P>): Promise<ProductionLoopResult>;
897
+
598
898
  /**
599
899
  * ScenarioRegistry — manages scenario discovery and filtering.
600
900
  *
@@ -4453,6 +4753,63 @@ declare class Mutex {
4453
4753
  get pending(): number;
4454
4754
  }
4455
4755
 
4756
+ /**
4757
+ * Persona discovery — replaces every consumer's hardcoded TRAINING_PERSONA_FILES.
4758
+ *
4759
+ * Today's failure mode: each product agent (legal/gtm/tax/creative) defines
4760
+ * a TRAINING_PERSONA_FILES const with 5 hardcoded filenames. When the 2yr
4761
+ * rewrite added 10+ new personas, those personas existed on disk but the
4762
+ * evolve runner never loaded them — the new rubric dims (audit_defendability,
4763
+ * intake_discipline, etc) got no training signal. The personas were
4764
+ * cosmetic, the rewrites partially uninformed.
4765
+ *
4766
+ * `discoverPersonas` walks a personas directory and returns every persona
4767
+ * file matching the convention. Consumers can filter by include/exclude
4768
+ * patterns. Default behavior — discover everything — eliminates the
4769
+ * "forgot to add the new persona to the list" failure mode.
4770
+ */
4771
+ interface DiscoverPersonasOptions {
4772
+ /**
4773
+ * Regex applied to filenames. Files that don't match are skipped.
4774
+ * Default: `^[0-9]{2}-.+\.(yaml|yml|json|md)$` (the prevailing convention
4775
+ * across legal/gtm/tax/creative: `NN-slug.yaml`).
4776
+ */
4777
+ pattern?: RegExp;
4778
+ /**
4779
+ * Filenames (or basenames) to skip. Use this to exclude WIP / archived
4780
+ * personas without removing the file.
4781
+ */
4782
+ exclude?: readonly string[];
4783
+ /**
4784
+ * If set, return only personas whose basename contains one of these
4785
+ * substrings (post-pattern filter). Used by the CLI's `--personas a,b,c`
4786
+ * flag — consumers pass through.
4787
+ */
4788
+ include?: readonly string[];
4789
+ /**
4790
+ * Recurse into subdirectories. Default false (legal/gtm/tax/creative all
4791
+ * store personas flat).
4792
+ */
4793
+ recursive?: boolean;
4794
+ }
4795
+ interface DiscoveredPersona {
4796
+ /** Absolute file path. */
4797
+ path: string;
4798
+ /** Filename without directory. */
4799
+ filename: string;
4800
+ /** Filename without extension — the conventional persona id. */
4801
+ id: string;
4802
+ }
4803
+ /**
4804
+ * Walk `dir` and return every persona file matching the convention. Async
4805
+ * because the consumer almost always wants this to be I/O-driven (so a new
4806
+ * persona added on disk is picked up without a code change).
4807
+ *
4808
+ * Sorted by filename (which gives stable persona id order via the `NN-`
4809
+ * numeric prefix convention) for reproducibility.
4810
+ */
4811
+ declare function discoverPersonas(dir: string, opts?: DiscoverPersonasOptions): Promise<DiscoveredPersona[]>;
4812
+
4456
4813
  /**
4457
4814
  * GoldenMatcher — fuzzy matcher for "did the agent produce the expected things?".
4458
4815
  *
@@ -4556,6 +4913,101 @@ declare class JsonlTrialCache implements TrialCache {
4556
4913
  setSync(key: string, value: TrialResult): void;
4557
4914
  }
4558
4915
 
4916
+ /**
4917
+ * Judge-retry wrapper.
4918
+ *
4919
+ * Today's failure mode: a judge LLM call aborts mid-stream (connection
4920
+ * dropped, model timed out, schema rejected) → consumer's try/catch swallows
4921
+ * the error and returns `score: 0`. The eval composite then weights that
4922
+ * zero into the mean, silently corrupting the score. Today's tax/gtm evals
4923
+ * had `judge=0` across every trial — the prompt rewrites couldn't be
4924
+ * evaluated honestly because the measurement instrument was broken.
4925
+ *
4926
+ * `withJudgeRetry` is the substrate fix. It wraps a single judge invocation
4927
+ * with:
4928
+ *
4929
+ * 1. N retry attempts on transient failures (abort, timeout, network).
4930
+ * 2. Optional fallback-model rotation — try the next model in the list
4931
+ * if the primary keeps aborting (a verbose new prompt may stream-abort
4932
+ * on claude-code/sonnet but succeed on kimi-code/k2p6).
4933
+ * 3. Exponential backoff between attempts.
4934
+ * 4. A typed outcome `{ succeeded, attempts, value, error }` that callers
4935
+ * MUST decide what to do with. No silent zero.
4936
+ *
4937
+ * The reporting contract: callers ship `TrialResult.judgeSucceeded = succeeded`
4938
+ * and `TrialResult.judgeAttempts = attempts`. `aggregateTrialsByMode(trials, { mode: 'exclude-failed' })`
4939
+ * then skips failed-judge trials when computing composites.
4940
+ *
4941
+ * The library does NOT decide what score to record on failure — that's the
4942
+ * caller's product choice. Today's product agents (legal/gtm/tax/creative)
4943
+ * should set `score: NaN` + `judgeSucceeded: false` + `error: ...` so the
4944
+ * aggregator's exclude-failed mode drops the trial. Defaulting to 0 is what
4945
+ * caused today's data corruption.
4946
+ */
4947
+ /**
4948
+ * Retry policy for judge LLM calls.
4949
+ *
4950
+ * Defaults are tuned for the verbose post-2yr-rewrite prompts that exceed
4951
+ * the 60s `callLlm` default and abort on streaming. Pick a different timeout
4952
+ * for cheap-and-quick judges (e.g., 30s) or longer for thinking models.
4953
+ */
4954
+ interface JudgeRetryPolicy {
4955
+ /** Max attempts per model. Default 3 (one initial + two retries). */
4956
+ maxAttempts?: number;
4957
+ /** Per-attempt timeout in ms. Default 90_000 (1.5×agent-eval's 60s default). */
4958
+ timeoutMs?: number;
4959
+ /**
4960
+ * Models to try, in order. The first model is the primary; subsequent
4961
+ * models are fallbacks invoked only when ALL retries on the previous
4962
+ * model have been exhausted. Example: `['claude-code/sonnet', 'kimi-code/k2p6']`
4963
+ * runs claude-code up to maxAttempts times, then falls back to kimi.
4964
+ * If omitted, the caller's judge function controls model selection and
4965
+ * the retries apply to that single model.
4966
+ */
4967
+ models?: readonly string[];
4968
+ /** Exponential backoff function, default `attempt → min(500 * 2^attempt, 16_000)`. */
4969
+ backoffMs?: (attempt: number) => number;
4970
+ /**
4971
+ * Predicate deciding whether an error should trigger a retry. Default
4972
+ * retries on: AbortError, TimeoutError, `fetch failed`, `ECONNRESET`,
4973
+ * `[This operation was aborted]`, and any LlmCallError with status in
4974
+ * {429, 502, 503, 504}. JSON-parse errors are NOT retriable (the model
4975
+ * needs prompt adjustment, not another shot).
4976
+ */
4977
+ isRetryable?: (err: unknown) => boolean;
4978
+ }
4979
+ /** Outcome of a wrapped judge invocation. */
4980
+ interface JudgeRetryOutcome<T> {
4981
+ /** The judge's returned value when `succeeded === true`; `null` otherwise. */
4982
+ value: T | null;
4983
+ /** True iff one of the attempts completed without throwing. */
4984
+ succeeded: boolean;
4985
+ /** Total attempts made across all models. */
4986
+ attempts: number;
4987
+ /** Which model the successful attempt used (when succeeded). */
4988
+ modelUsed?: string;
4989
+ /** Last error captured when `succeeded === false`. */
4990
+ error?: Error;
4991
+ /** Per-attempt error log for forensics. */
4992
+ attemptErrors: Array<{
4993
+ attempt: number;
4994
+ model: string;
4995
+ error: string;
4996
+ }>;
4997
+ }
4998
+ /**
4999
+ * Wrap a judge call with retry + fallback-model + typed outcome semantics.
5000
+ *
5001
+ * The `judgeFn` signature is `(model: string, signal: AbortSignal) => Promise<T>`.
5002
+ * The signal will be aborted at `timeoutMs`. Callers should pass the signal
5003
+ * to their underlying fetch/SDK call so the abort actually fires.
5004
+ *
5005
+ * Returns a typed outcome — callers MUST inspect `succeeded` before using
5006
+ * `value`. The library refuses to default to a silent zero score because that
5007
+ * is exactly what caused today's eval data corruption.
5008
+ */
5009
+ declare function withJudgeRetry<T>(judgeFn: (model: string, signal: AbortSignal) => Promise<T>, policy?: JudgeRetryPolicy): Promise<JudgeRetryOutcome<T>>;
5010
+
4559
5011
  /**
4560
5012
  * LockedJsonlAppender — mutex-serialized JSONL append helper for arbitrary
4561
5013
  * payloads. The reference-replay store does the same thing for typed
@@ -4617,4 +5069,69 @@ interface ReferenceReplaySteeringRowsOptions<Input = unknown> {
4617
5069
  declare function referenceReplayRunsToSteeringRows<Input = unknown>(runs: ReferenceReplayRun<Input>[], options?: ReferenceReplaySteeringRowsOptions<Input>): SteeringOptimizationRow[];
4618
5070
  declare function referenceReplayScenarioToRunScore(scenarioScore: ReferenceReplayScenarioScore, durationMs?: number): RunScore;
4619
5071
 
4620
- export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type 
DeployRunner, type DirEntry, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, 
type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, ProductClient, type ProductClientConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type 
ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TestResult, type ThresholdContract, TokenCounter, type TokenSpec, TraceEmitter, TraceEvent, TraceStore, Trajectory, TrajectoryStep, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateRunScore, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, 
benjaminiHochberg, bisect, bonferroni, buildReviewerPrompt, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, confidenceInterval, containsAll, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, linterJudge, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, 
renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedMean, weightedRecall, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };
5072
+ /**
5073
+ * Trial-aggregator modes.
5074
+ *
5075
+ * The prompt-evolution loop's internal `aggregateTrials` defaulted to
5076
+ * including every non-`error` trial in the mean — which corrupted the mean
5077
+ * when a trial had `score: 0` because the judge silently aborted (the
5078
+ * caller's try/catch swallowed the abort and returned zero). Today's
5079
+ * tax/gtm evals show this: every trial scored judge=0 because the judge
5080
+ * aborted, and the composite then reflected `structural * 0.3 + slop * 0.1`
5081
+ * instead of the intended `judge * 0.6 + structural * 0.3 + slop * 0.1`.
5082
+ *
5083
+ * `aggregateTrialsByMode` is the substrate fix. Consumers can choose:
5084
+ *
5085
+ * - `strict-fail` — any trial with `judgeSucceeded === false` fails the
5086
+ * whole aggregate. Right for production-gate runs where one corrupted
5087
+ * trial means "we don't know if the prompt is good, halt the gate."
5088
+ *
5089
+ * - `exclude-failed` — drop trials with `judgeSucceeded === false` from
5090
+ * the mean; report `failedTrials` separately. Right for research /
5091
+ * comparison runs where you want to use the signal that DID land.
5092
+ * Recommended for new code (`mode` is a required option, so nothing is applied implicitly).
5093
+ *
5094
+ * - `zero-fill` — legacy behavior: failed trials count as score=0 in
5095
+ * the mean. Default ONLY for backwards-compat with adapters that
5096
+ * don't yet set `judgeSucceeded`. Migrate off this — it's the source
5097
+ * of today's data corruption.
5098
+ */
5099
+
5100
+ type AggregatorMode = 'strict-fail' | 'exclude-failed' | 'zero-fill';
5101
+ interface TrialAggregate {
5102
+ /** Mean score over the trials counted by the chosen mode. */
5103
+ meanScore: number;
5104
+ /** Mean cost (legacy, kept for compatibility). */
5105
+ meanCost: number;
5106
+ /** Mean wall time (legacy). */
5107
+ meanDurationMs: number;
5108
+ /** ok-rate (legacy). */
5109
+ okRate: number;
5110
+ /** Trials counted in the mean (mode-dependent). */
5111
+ countedTrials: number;
5112
+ /** Trials excluded because `judgeSucceeded === false` (exclude-failed mode). */
5113
+ excludedFailedTrials: number;
5114
+ /** Total trials passed in. */
5115
+ totalTrials: number;
5116
+ /** Mean of every numeric metric across counted trials. */
5117
+ metrics: Record<string, number>;
5118
+ /**
5119
+ * Set when mode is `strict-fail` AND at least one trial had
5120
+ * `judgeSucceeded === false`. Caller should refuse to use this aggregate
5121
+ * downstream — the eval is corrupt.
5122
+ */
5123
+ strictFailure?: {
5124
+ failedCount: number;
5125
+ firstError?: string;
5126
+ };
5127
+ }
5128
+ /**
5129
+ * Aggregate trials with explicit failed-judge handling. Returns counts for
5130
+ * counted + excluded so callers can surface "the score is based on 7 of 10
5131
+ * trials; 3 judges failed" instead of silently weighting zero.
5132
+ */
5133
+ declare function aggregateTrialsByMode(trials: TrialResult[], opts: {
5134
+ mode: AggregatorMode;
5135
+ }): TrialAggregate;
5136
+
5137
+ export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AggregatorMode, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, DatasetScenario, type DeployFamily, type 
DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DirEntry, type DiscoverPersonasOptions, type DiscoveredPersona, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeConfig, type 
JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, type JudgeRubric, JudgeRunner, type JudgeScore, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, ProductClient, type ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type 
ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type 
SynthesisTarget, type TestResult, type ThresholdContract, TokenCounter, type TokenSpec, TraceEmitter, TraceEvent, TraceStore, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, buildReviewerPrompt, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, confidenceInterval, containsAll, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, 
inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, linterJudge, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedMean, weightedRecall, whitespaceCollapseMutator, wilcoxonSignedRank, withJudgeRetry, wranglerDeployRunner };