@tangle-network/agent-eval 0.27.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +72 -0
- package/README.md +4 -5
- package/dist/{baseline-4R5deP0N.d.ts → baseline-BwdCXUS8.d.ts} +1 -1
- package/dist/builder-eval/index.d.ts +3 -3
- package/dist/builder-eval/index.js +1 -1
- package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
- package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
- package/dist/chunk-5AKPEK5L.js.map +1 -0
- package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
- package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
- package/dist/chunk-K33INZHH.js.map +1 -0
- package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
- package/dist/chunk-MAZ26DC7.js.map +1 -0
- package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
- package/dist/chunk-NCRFYPS3.js.map +1 -0
- package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
- package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
- package/dist/chunk-QHF6EQKK.js.map +1 -0
- package/dist/chunk-R5UQJNKC.js +722 -0
- package/dist/chunk-R5UQJNKC.js.map +1 -0
- package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
- package/dist/chunk-RUI6SIHY.js.map +1 -0
- package/dist/{chunk-WHZMVFUV.js → chunk-SZSBQUIJ.js} +2 -2
- package/dist/chunk-SZSBQUIJ.js.map +1 -0
- package/dist/chunk-UW4NOOZI.js +1561 -0
- package/dist/chunk-UW4NOOZI.js.map +1 -0
- package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
- package/dist/chunk-VSMTAMNK.js.map +1 -0
- package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
- package/dist/chunk-XFZCM5Z3.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/{control-CBShYYA6.d.ts → control-rJhEDdpy.d.ts} +4 -4
- package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BRdQ0wrx.d.ts} +3 -2
- package/dist/control.d.ts +5 -5
- package/dist/control.js +2 -2
- package/dist/{emitter-DP_cSSiw.d.ts → emitter-BqjeOvJh.d.ts} +1 -1
- package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-D1NZKqYu.d.ts} +2 -3
- package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-j0nJFgC6.d.ts} +1 -1
- package/dist/governance/index.d.ts +2 -2
- package/dist/{index-D3iBCjdF.d.ts → index-Cgt3DKXr.d.ts} +2 -2
- package/dist/index.d.ts +1279 -468
- package/dist/index.js +1992 -1259
- package/dist/index.js.map +1 -1
- package/dist/{integrity-DK2EBVZC.d.ts → integrity-BAxLGJ9I.d.ts} +2 -2
- package/dist/knowledge/index.d.ts +3 -3
- package/dist/knowledge/index.js +2 -2
- package/dist/meta-eval/index.d.ts +1 -1
- package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-BNi4-8lR.d.ts} +2 -2
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +8 -8
- package/dist/optimization.js +5 -5
- package/dist/pipelines/index.d.ts +6 -6
- package/dist/pipelines/index.js +2 -2
- package/dist/prm/index.d.ts +4 -4
- package/dist/{query-DODUYdPg.d.ts → query-BFDT0kX_.d.ts} +1 -1
- package/dist/{release-report-wfUySN5F.d.ts → release-report-PWhGlpfO.d.ts} +1 -1
- package/dist/replay-BX5Fm8en.d.ts +529 -0
- package/dist/reporting.d.ts +5 -5
- package/dist/reporting.js +5 -5
- package/dist/{researcher-bGkI7vCl.d.ts → researcher-ClDX3KZx.d.ts} +13 -14
- package/dist/rl.d.ts +29 -47
- package/dist/rl.js +5 -5
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-D5tjHNJQ.d.ts → rubric-DgSqjqqj.d.ts} +2 -2
- package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
- package/dist/{store-Db2Bv8Cf.d.ts → store-BP5be6s7.d.ts} +1 -1
- package/dist/{summary-report-DZVXOCK_.d.ts → summary-report-jrSGb2xZ.d.ts} +5 -5
- package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BJ54PDan.d.ts} +2 -2
- package/dist/traces.d.ts +9 -311
- package/dist/traces.js +16 -987
- package/dist/traces.js.map +1 -1
- package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-BFmveYZt.d.ts} +1 -1
- package/dist/wire/index.d.ts +4 -4
- package/dist/wire/index.js +1 -1
- package/docs/research-report-methodology.md +4 -4
- package/docs/three-package-architecture.md +12 -24
- package/package.json +1 -1
- package/dist/chunk-2A5XJB43.js.map +0 -1
- package/dist/chunk-4F5DQN55.js.map +0 -1
- package/dist/chunk-5LBB5B3Z.js.map +0 -1
- package/dist/chunk-I4MBDTY5.js +0 -272
- package/dist/chunk-I4MBDTY5.js.map +0 -1
- package/dist/chunk-JLZQWFV3.js.map +0 -1
- package/dist/chunk-K2TPS5LB.js +0 -569
- package/dist/chunk-K2TPS5LB.js.map +0 -1
- package/dist/chunk-LSH4MMOZ.js.map +0 -1
- package/dist/chunk-NU65VQ7M.js.map +0 -1
- package/dist/chunk-OWLAAMME.js.map +0 -1
- package/dist/chunk-SESZDQPX.js.map +0 -1
- package/dist/chunk-WHZMVFUV.js.map +0 -1
- package/dist/replay-BL96gCEP.d.ts +0 -226
- /package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
- /package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
- /package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0
package/dist/index.d.ts
CHANGED
|
@@ -1,168 +1,234 @@
|
|
|
1
|
-
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-
|
|
1
|
+
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-rJhEDdpy.js';
|
|
2
|
+
import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
3
|
+
import { S as Severity, M as MultiLayerVerifier, a as VerifyOptions, L as Layer, b as LayerResult, c as VerifyContext } from './multi-layer-verifier-BNi4-8lR.js';
|
|
4
|
+
export { F as Finding, d as LayerStatus, V as VerificationReport, g as gradeSemanticStatus } from './multi-layer-verifier-BNi4-8lR.js';
|
|
5
|
+
import { R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, g as BudgetSpec, l as RunFilter, L as LlmSpan } from './store-BP5be6s7.js';
|
|
6
|
+
export { h as EventFilter, E as EventKind, i as FAILURE_CLASSES, F as FailureClass, j as FileSystemTraceStore, k as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, M as Message, e as RetrievalSpan, m as RunLayer, n as RunStatus, f as SandboxSpan, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-BP5be6s7.js';
|
|
7
|
+
import { L as LlmClientOptions, m as LlmCallRequest, n as LlmCallResult } from './researcher-ClDX3KZx.js';
|
|
8
|
+
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, o as LlmCallError, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './researcher-ClDX3KZx.js';
|
|
9
|
+
import { T as TraceAnalysisStore, A as AnalyzeTracesOptions } from './replay-BX5Fm8en.js';
|
|
10
|
+
export { g as DEFAULT_REDACTION_RULES, O as OTEL_AGENT_EVAL_SCOPE, i as OtlpExport, j as OtlpResourceSpans, k as OtlpSpan, R as REDACTION_VERSION, l as RedactionReport, m as RedactionRule, n as ReplayCache, o as ReplayCacheEntry, p as ReplayCacheMissError, q as ReplayCacheStats, r as ReplayFetchOptions, C as createReplayFetch, E as exportRunAsOtlp, F as iterateRawCalls, G as redactString, H as redactValue } from './replay-BX5Fm8en.js';
|
|
2
11
|
import { TCloud } from '@tangle-network/tcloud';
|
|
3
|
-
import {
|
|
4
|
-
export { c as
|
|
12
|
+
import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
|
|
13
|
+
export { c as RunJudgeMetadata, d as RunOutcome, e as RunRecordValidationError, b as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-CqzahIbx.js';
|
|
14
|
+
import { z } from 'zod';
|
|
15
|
+
import { C as ControlEvalResult } from './control-runtime-BRdQ0wrx.js';
|
|
16
|
+
export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BRdQ0wrx.js';
|
|
5
17
|
import { A as AgentEvalError } from './errors-BZ9sTdz7.js';
|
|
6
18
|
export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-BZ9sTdz7.js';
|
|
7
|
-
import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-
|
|
8
|
-
export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-
|
|
9
|
-
import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-
|
|
10
|
-
export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-
|
|
19
|
+
import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-j0nJFgC6.js';
|
|
20
|
+
export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-j0nJFgC6.js';
|
|
21
|
+
import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-jrSGb2xZ.js';
|
|
22
|
+
export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-jrSGb2xZ.js';
|
|
11
23
|
export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
|
|
12
|
-
import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-
|
|
13
|
-
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-
|
|
14
|
-
import { a as FailureCluster } from './failure-cluster-
|
|
15
|
-
export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-
|
|
16
|
-
import {
|
|
17
|
-
export { c as
|
|
18
|
-
import {
|
|
19
|
-
export {
|
|
20
|
-
import {
|
|
21
|
-
export {
|
|
22
|
-
|
|
23
|
-
export {
|
|
24
|
-
|
|
25
|
-
export {
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-4R5deP0N.js';
|
|
29
|
-
import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
|
|
30
|
-
export { b as buildTrajectory } from './trajectory-CnoBo-JY.js';
|
|
24
|
+
import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-PWhGlpfO.js';
|
|
25
|
+
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-PWhGlpfO.js';
|
|
26
|
+
import { a as FailureCluster } from './failure-cluster-D1NZKqYu.js';
|
|
27
|
+
export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-D1NZKqYu.js';
|
|
28
|
+
import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './index-Cgt3DKXr.js';
|
|
29
|
+
export { b as CalibrationResult, c as CandidateScore, d as ContinuousCalibrationResult, D as DEFAULT_RED_TEAM_CORPUS, E as EuRiskClass, G as GoldenItem, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, P as PositionalBiasResult, R as RedTeamCase, h as RedTeamCategory, i as RedTeamFinding, j as RedTeamPayload, k as RedTeamReport, S as SelfPreferenceResult, U as UseCaseSignals, V as VerbosityBiasResult, l as calibrateJudge, m as calibrateJudgeContinuous, n as classifyEuAiRisk, o as continuousAgreement, p as euAiActReport, q as nistAiRmfReport, r as positionalBias, s as redTeamDataset, t as redTeamReport, u as renderMarkdown, v as scoreRedTeamOutput, w as selfPreference, x as soc2Report, y as summarize, z as toolNamesForRun, A as verbosityBias } from './index-Cgt3DKXr.js';
|
|
30
|
+
import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-BJ54PDan.js';
|
|
31
|
+
export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-BJ54PDan.js';
|
|
32
|
+
import { T as TraceEmitter } from './emitter-BqjeOvJh.js';
|
|
33
|
+
export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-BqjeOvJh.js';
|
|
34
|
+
export { F as FileSystemRawProviderSink, c as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, d as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, e as RawProviderDirection, f as RawProviderEvent, R as RawProviderSink, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-BAxLGJ9I.js';
|
|
35
|
+
export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-BFDT0kX_.js';
|
|
36
|
+
import { a as BaselineReport } from './baseline-BwdCXUS8.js';
|
|
37
|
+
export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-BwdCXUS8.js';
|
|
38
|
+
import { T as Trajectory, a as TrajectoryStep } from './trajectory-BFmveYZt.js';
|
|
39
|
+
export { b as buildTrajectory } from './trajectory-BFmveYZt.js';
|
|
31
40
|
import { a as DatasetScenario, c as Dataset } from './dataset-CiK_3LDr.js';
|
|
32
41
|
export { d as DatasetDifficulty, b as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-CiK_3LDr.js';
|
|
33
|
-
|
|
34
|
-
import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
|
|
35
|
-
import { L as Layer, S as Severity, a as LayerResult, b as VerifyContext } from './multi-layer-verifier-LkP3LVKj.js';
|
|
36
|
-
export { F as Finding, c as LayerStatus, M as MultiLayerVerifier, V as VerificationReport, d as VerifyOptions, g as gradeSemanticStatus } from './multi-layer-verifier-LkP3LVKj.js';
|
|
37
|
-
import { L as LlmClientOptions } from './researcher-bGkI7vCl.js';
|
|
38
|
-
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, m as LlmCallError, n as LlmCallRequest, o as LlmCallResult, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './researcher-bGkI7vCl.js';
|
|
42
|
+
import { a as PrmGrader } from './rubric-DgSqjqqj.js';
|
|
39
43
|
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index--fVrWDiR.js';
|
|
40
|
-
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-
|
|
44
|
+
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
41
45
|
import './outcome-store-D6KWmYvj.js';
|
|
42
46
|
|
|
47
|
+
interface RunScore {
|
|
48
|
+
success: number;
|
|
49
|
+
goalProgress: number;
|
|
50
|
+
repoGroundedness: number;
|
|
51
|
+
driftPenalty: number;
|
|
52
|
+
toolUseQuality: number;
|
|
53
|
+
patchQuality: number;
|
|
54
|
+
testReality: number;
|
|
55
|
+
finalGate: number;
|
|
56
|
+
reviewerBlockers: number;
|
|
57
|
+
costUsd: number;
|
|
58
|
+
wallSeconds: number;
|
|
59
|
+
notes?: string[];
|
|
60
|
+
}
|
|
61
|
+
interface RunScoreWeights {
|
|
62
|
+
success: number;
|
|
63
|
+
goalProgress: number;
|
|
64
|
+
repoGroundedness: number;
|
|
65
|
+
driftPenalty: number;
|
|
66
|
+
toolUseQuality: number;
|
|
67
|
+
patchQuality: number;
|
|
68
|
+
testReality: number;
|
|
69
|
+
finalGate: number;
|
|
70
|
+
reviewerBlockers: number;
|
|
71
|
+
costUsd: number;
|
|
72
|
+
wallSeconds: number;
|
|
73
|
+
}
|
|
74
|
+
declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
|
|
75
|
+
declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
|
|
76
|
+
declare function clamp01(value: number): number;
|
|
77
|
+
|
|
78
|
+
interface RunTrace {
|
|
79
|
+
run: Run$1;
|
|
80
|
+
spans: Span[];
|
|
81
|
+
events: TraceEvent[];
|
|
82
|
+
artifacts: Artifact$1[];
|
|
83
|
+
budget: BudgetLedgerEntry[];
|
|
84
|
+
}
|
|
85
|
+
interface RunCriticOptions {
|
|
86
|
+
weights?: Partial<RunScoreWeights>;
|
|
87
|
+
driftPatterns?: RegExp[];
|
|
88
|
+
}
|
|
89
|
+
declare class RunCritic {
|
|
90
|
+
private readonly weights?;
|
|
91
|
+
private readonly driftPatterns;
|
|
92
|
+
constructor(options?: RunCriticOptions);
|
|
93
|
+
score(store: TraceStore, runId: string): Promise<RunScore>;
|
|
94
|
+
scoreTrace(trace: RunTrace): RunScore;
|
|
95
|
+
rank(score: RunScore): number;
|
|
96
|
+
private isDrift;
|
|
97
|
+
}
|
|
98
|
+
|
|
43
99
|
/**
|
|
44
|
-
*
|
|
100
|
+
* Semantic concept judge — "does the built artifact actually implement
|
|
101
|
+
* the features the user asked for?"
|
|
45
102
|
*
|
|
46
|
-
*
|
|
47
|
-
*
|
|
48
|
-
*
|
|
49
|
-
*
|
|
103
|
+
* Distinct from the domain/code/coherence judges in `judges.ts`:
|
|
104
|
+
* - those judges score free-form conversational agent outputs along
|
|
105
|
+
* quality dimensions (accuracy, depth, etc.)
|
|
106
|
+
* - this judge scores a *built artifact* (served HTML + source files)
|
|
107
|
+
* against an explicit list of expected concepts, returning per-concept
|
|
108
|
+
* {present, score 0-10, evidence, severity}.
|
|
50
109
|
*
|
|
51
|
-
*
|
|
52
|
-
*
|
|
53
|
-
*
|
|
54
|
-
* 4. Open a PR via the GitHub API.
|
|
110
|
+
* The judge is strict about distinguishing (a) a working implementation
|
|
111
|
+
* from (b) a keyword-present stub. "// TODO: mint button" is NOT present.
|
|
112
|
+
* Only real, functional, wired-up code counts.
|
|
55
113
|
*
|
|
56
|
-
*
|
|
114
|
+
* Use via {@link createSemanticConceptJudge} or directly via
|
|
115
|
+
* {@link runSemanticConceptJudge}. Soft-fails (available=false) on LLM
|
|
116
|
+
* or JSON-parse errors so the caller can treat that as "layer skipped"
|
|
117
|
+
* rather than "layer failed" in a multi-layer pipeline.
|
|
118
|
+
*/
|
|
119
|
+
|
|
120
|
+
/**
|
|
121
|
+
* Implementation complexity class for weighted scoring.
|
|
57
122
|
*
|
|
58
|
-
*
|
|
59
|
-
*
|
|
60
|
-
*
|
|
61
|
-
*
|
|
62
|
-
*
|
|
123
|
+
* - `render` (default): the concept is a UI surface that displays static
|
|
124
|
+
* data — render a list, show a counter, lay out a button. Single-file
|
|
125
|
+
* work, no external integration.
|
|
126
|
+
* - `integrate`: the concept requires wiring a real external system —
|
|
127
|
+
* wallet connect (wagmi + RainbowKit + chain config), payment provider
|
|
128
|
+
* (Stripe Elements + intent + webhook), an API client with auth.
|
|
129
|
+
* Multi-file, library-knowledge, runtime correctness matters.
|
|
130
|
+
* - `compute`: the concept requires algorithmic work — solver, simulator,
|
|
131
|
+
* constraint propagation, ML inference. Correctness > UI polish.
|
|
63
132
|
*
|
|
64
|
-
*
|
|
65
|
-
*
|
|
133
|
+
* Default weights (when applied via `weightConcepts: 'complexity'`):
|
|
134
|
+
* render=1.0, integrate=2.0, compute=2.5
|
|
66
135
|
*
|
|
67
|
-
*
|
|
68
|
-
*
|
|
136
|
+
* Cross-vertical scoring without complexity weighting silently inflates
|
|
137
|
+
* the rate of UI-heavy verticals (healthcare, fintech dashboards) vs
|
|
138
|
+
* integration-heavy verticals (DeFi, wallets) — all concepts treated
|
|
139
|
+
* equally even though the agent does 2-3x the work for `integrate`.
|
|
69
140
|
*/
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
path: string;
|
|
73
|
-
/** New file contents. UTF-8. */
|
|
74
|
-
contents: string;
|
|
75
|
-
/** Optional explanatory comment shown in the commit body. */
|
|
76
|
-
rationale?: string;
|
|
77
|
-
}
|
|
78
|
-
interface RepoRef {
|
|
79
|
-
owner: string;
|
|
141
|
+
type ConceptComplexity = 'render' | 'integrate' | 'compute';
|
|
142
|
+
interface ConceptSpec {
|
|
80
143
|
name: string;
|
|
144
|
+
/** Short hints that help the judge; not used for matching. */
|
|
145
|
+
keywords?: string[];
|
|
146
|
+
/** Optional explicit weight; default 1.0. Overrides complexity-derived weight. */
|
|
147
|
+
weight?: number;
|
|
148
|
+
/** Implementation complexity class. Default `render`. */
|
|
149
|
+
complexity?: ConceptComplexity;
|
|
81
150
|
}
|
|
82
|
-
interface
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
title: string;
|
|
90
|
-
body: string;
|
|
91
|
-
/** Optional GitHub usernames to request review from. */
|
|
92
|
-
reviewers?: string[];
|
|
93
|
-
/** Optional labels to apply. */
|
|
94
|
-
labels?: string[];
|
|
95
|
-
/** Commit author name. Default: derived from the GitHub client. */
|
|
96
|
-
authorName?: string;
|
|
97
|
-
/** Commit author email. Default: derived from the GitHub client. */
|
|
98
|
-
authorEmail?: string;
|
|
99
|
-
/** Dry-run — do not push or open a PR; just return the would-be plan. */
|
|
100
|
-
dryRun?: boolean;
|
|
151
|
+
interface ConceptFinding {
|
|
152
|
+
concept: string;
|
|
153
|
+
present: boolean;
|
|
154
|
+
/** 0..10. 10 = production-ready; 7 = functional thin; 4 = partial; 0 = absent. */
|
|
155
|
+
score: number;
|
|
156
|
+
evidence: string;
|
|
157
|
+
severity: Severity;
|
|
101
158
|
}
|
|
102
|
-
interface
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
159
|
+
interface SemanticConceptJudgeInput {
|
|
160
|
+
/** Full natural-language prompt the agent was handed. */
|
|
161
|
+
userRequest: string;
|
|
162
|
+
/** Rendered HTML the preview returns (UI artifacts). Optional. */
|
|
163
|
+
servedHtml?: string;
|
|
164
|
+
/** Top-level source files from the agent's workdir. */
|
|
165
|
+
sourceFiles: Array<{
|
|
166
|
+
path: string;
|
|
167
|
+
content: string;
|
|
168
|
+
}>;
|
|
169
|
+
/** The expected concept list. */
|
|
170
|
+
expectedConcepts: ConceptSpec[];
|
|
171
|
+
/** Free-form metadata (id, difficulty) to inject into the prompt. */
|
|
172
|
+
artifactLabel?: string;
|
|
173
|
+
artifactDescription?: string;
|
|
107
174
|
}
|
|
108
|
-
|
|
109
|
-
|
|
175
|
+
interface SemanticConceptJudgeResult {
|
|
176
|
+
kind: 'semantic-concept';
|
|
177
|
+
version: string;
|
|
178
|
+
/** Normalized 0..1 score — mean of per-concept scores / 10. */
|
|
179
|
+
score: number;
|
|
180
|
+
presentCount: number;
|
|
181
|
+
totalCount: number;
|
|
182
|
+
findings: ConceptFinding[];
|
|
183
|
+
summary: string;
|
|
184
|
+
durationMs: number;
|
|
185
|
+
costUsd: number | null;
|
|
186
|
+
/** False on LLM/JSON error — treat as "skipped / unable to judge" in pipelines. */
|
|
187
|
+
available: boolean;
|
|
188
|
+
error?: string;
|
|
189
|
+
}
|
|
190
|
+
/**
|
|
191
|
+
* Score-aggregation strategy. `mean` averages 0-10 scores uniformly.
|
|
192
|
+
* `complexity` applies the default weight table (render=1, integrate=2,
|
|
193
|
+
* compute=2.5) unless a concept has an explicit `weight`. `explicit`
|
|
194
|
+
* honors only `weight` (defaulting to 1 for unspecified).
|
|
195
|
+
*/
|
|
196
|
+
type ConceptWeightStrategy = 'mean' | 'complexity' | 'explicit';
|
|
197
|
+
declare const DEFAULT_COMPLEXITY_WEIGHTS: Record<ConceptComplexity, number>;
|
|
198
|
+
interface SemanticConceptJudgeOptions {
|
|
199
|
+
/** Model id to call. Default 'claude-sonnet-4-6' via agent-eval defaults. */
|
|
200
|
+
model?: string;
|
|
201
|
+
/** Per-call timeout. Default 180s. */
|
|
202
|
+
timeoutMs?: number;
|
|
203
|
+
/** Pipeline budget for the prompt (source blob truncation). Default 45000. */
|
|
204
|
+
maxSourceChars?: number;
|
|
205
|
+
/** Per-file cap before inclusion. Default 20000. */
|
|
206
|
+
maxPerFileChars?: number;
|
|
207
|
+
/** HTML cap. Default 30000. */
|
|
208
|
+
maxHtmlChars?: number;
|
|
209
|
+
/** LlmClient config (baseUrl, apiKey, authHeader, …). */
|
|
210
|
+
llm?: LlmClientOptions;
|
|
110
211
|
/**
|
|
111
|
-
*
|
|
112
|
-
*
|
|
113
|
-
*
|
|
114
|
-
* Implementations must be idempotent on `branchName`: if the branch
|
|
115
|
-
* already exists with the same head SHA as the would-be commit, return
|
|
116
|
-
* the existing PR rather than failing. This makes the production loop
|
|
117
|
-
* safe to retry on transient errors.
|
|
212
|
+
* Score aggregation strategy. Default `mean` — uniform average across
|
|
213
|
+
* concepts. Cross-vertical comparisons should use `complexity` to
|
|
214
|
+
* neutralize the integrate-vs-render asymmetry.
|
|
118
215
|
*/
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
interface HttpGithubClientOptions {
|
|
123
|
-
/** Personal access token, GitHub App token, or `GITHUB_TOKEN` from Actions. */
|
|
124
|
-
token: string;
|
|
125
|
-
/** Override for GitHub Enterprise. Default `'https://api.github.com'`. */
|
|
126
|
-
apiBase?: string;
|
|
127
|
-
/** Test seam — defaults to global `fetch`. */
|
|
128
|
-
fetchImpl?: typeof fetch;
|
|
129
|
-
/** Test seam — clock for commit timestamps. */
|
|
130
|
-
now?: () => Date;
|
|
216
|
+
weightConcepts?: ConceptWeightStrategy;
|
|
217
|
+
/** Override the default complexity → weight table. */
|
|
218
|
+
complexityWeights?: Partial<Record<ConceptComplexity, number>>;
|
|
131
219
|
}
|
|
220
|
+
declare const SEMANTIC_CONCEPT_JUDGE_VERSION = "semantic-concept-judge-v1-2026-04-24";
|
|
132
221
|
/**
|
|
133
|
-
*
|
|
134
|
-
*
|
|
135
|
-
*
|
|
136
|
-
* the branch already exists at the desired tree. If so, return the
|
|
137
|
-
* existing PR (or open one if missing). Errors from concurrent runs
|
|
138
|
-
* (`Reference already exists`) are caught and treated as success.
|
|
222
|
+
* Run the semantic concept judge. Soft-fails to available=false on
|
|
223
|
+
* LLM/JSON errors — callers in a MultiLayerVerifier pipeline can treat
|
|
224
|
+
* that as "skip" rather than "fail."
|
|
139
225
|
*/
|
|
140
|
-
declare function
|
|
141
|
-
interface GhCliClientOptions {
|
|
142
|
-
/** Override the CLI binary (`gh`). For testing. */
|
|
143
|
-
bin?: string;
|
|
144
|
-
/** Working directory containing a clone of `repo`. Default: process cwd. */
|
|
145
|
-
cwd?: string;
|
|
146
|
-
/** Test seam: process spawner. Default: node:child_process spawn. */
|
|
147
|
-
exec?: (bin: string, args: string[], opts: {
|
|
148
|
-
cwd: string;
|
|
149
|
-
stdin?: string;
|
|
150
|
-
}) => Promise<{
|
|
151
|
-
stdout: string;
|
|
152
|
-
stderr: string;
|
|
153
|
-
exitCode: number;
|
|
154
|
-
}>;
|
|
155
|
-
}
|
|
226
|
+
declare function runSemanticConceptJudge(input: SemanticConceptJudgeInput, options?: SemanticConceptJudgeOptions): Promise<SemanticConceptJudgeResult>;
|
|
156
227
|
/**
|
|
157
|
-
*
|
|
158
|
-
*
|
|
159
|
-
* - A local clone of the repo with a clean working tree.
|
|
160
|
-
* - `git` on PATH.
|
|
161
|
-
*
|
|
162
|
-
* Uses `gh api` for repo metadata and `gh pr create` for the PR. The
|
|
163
|
-
* actual commit lands via `git`, which keeps `gh`'s footprint minimal.
|
|
228
|
+
* Factory: pin LLM options once, return a closure that accepts inputs.
|
|
229
|
+
* Convenient for pipelines that want to share a single LlmClient config.
|
|
164
230
|
*/
|
|
165
|
-
declare function
|
|
231
|
+
declare function createSemanticConceptJudge(options?: SemanticConceptJudgeOptions): (input: SemanticConceptJudgeInput) => Promise<SemanticConceptJudgeResult>;
|
|
166
232
|
|
|
167
233
|
interface Scenario {
|
|
168
234
|
id: string;
|
|
@@ -410,13 +476,954 @@ interface CheckResult {
|
|
|
410
476
|
expected: string;
|
|
411
477
|
actual: string;
|
|
412
478
|
}
|
|
413
|
-
interface EvalResult {
|
|
414
|
-
scenario: string;
|
|
415
|
-
status: 'pass' | 'fail' | 'skip';
|
|
416
|
-
duration: number;
|
|
417
|
-
detail?: string;
|
|
418
|
-
artifact?: string;
|
|
479
|
+
interface EvalResult {
|
|
480
|
+
scenario: string;
|
|
481
|
+
status: 'pass' | 'fail' | 'skip';
|
|
482
|
+
duration: number;
|
|
483
|
+
detail?: string;
|
|
484
|
+
artifact?: string;
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
/**
|
|
488
|
+
* ChatClient — the single LLM abstraction analysts call.
|
|
489
|
+
*
|
|
490
|
+
* agent-eval already ships an `LlmClient` (OpenAI-compatible, retry,
|
|
491
|
+
* graceful JSON-schema degrade) and judges that talk to `TCloud`. Two
|
|
492
|
+
* mixed patterns force every analyst author to pick a transport, which
|
|
493
|
+
* couples analyst code to runtime concerns (cli-bridge vs router vs
|
|
494
|
+
* sandbox-sdk) it shouldn't know about.
|
|
495
|
+
*
|
|
496
|
+
* `ChatClient` is one interface every analyst takes via `AnalystContext.chat`.
|
|
497
|
+
* The operator decides at the registry boundary which transport binds
|
|
498
|
+
* to it. Analyst code stays transport-agnostic; swapping production
|
|
499
|
+
* (sandbox-sdk) for local dev (cli-bridge) or tests (mock) is a one-
|
|
500
|
+
* line factory call.
|
|
501
|
+
*
|
|
502
|
+
* Designed to coexist: existing `LlmClient` callers and existing
|
|
503
|
+
* `TCloud`-based judges keep working untouched. New analyst code uses
|
|
504
|
+
* `ChatClient`. When old call sites migrate, they pick up budgeting,
|
|
505
|
+
* cancellation, and unified telemetry for free.
|
|
506
|
+
*/
|
|
507
|
+
|
|
508
|
+
/**
|
|
509
|
+
* Unified chat interface. Mirrors LlmCallRequest/Result so the OpenAI-
|
|
510
|
+
* compatible mental model stays. Two methods: a one-shot `chat()` and
|
|
511
|
+
* a `streamChat()` for future agentic loops (not yet exposed).
|
|
512
|
+
*/
|
|
513
|
+
interface ChatClient {
|
|
514
|
+
/** Display name of the bound transport — included in telemetry. */
|
|
515
|
+
readonly transport: ChatTransport;
|
|
516
|
+
/** Default model when caller omits — operators bind this per environment. */
|
|
517
|
+
readonly defaultModel?: string;
|
|
518
|
+
chat(req: ChatRequest, opts?: ChatCallOpts): Promise<ChatResponse>;
|
|
519
|
+
}
|
|
520
|
+
type ChatTransport = 'router' | 'sandbox-sdk' | 'cli-bridge' | 'direct-provider' | 'mock';
|
|
521
|
+
interface ChatRequest extends Omit<LlmCallRequest, 'model'> {
|
|
522
|
+
/** Optional — falls back to ChatClient.defaultModel. */
|
|
523
|
+
model?: string;
|
|
524
|
+
}
|
|
525
|
+
type ChatResponse = LlmCallResult;
|
|
526
|
+
interface ChatCallOpts {
|
|
527
|
+
/** Cancel the in-flight request. */
|
|
528
|
+
signal?: AbortSignal;
|
|
529
|
+
/** Hard USD ceiling for this single call (informational; the underlying transport may not enforce it). */
|
|
530
|
+
maxCostUsd?: number;
|
|
531
|
+
/** Correlation tag carried into request headers when the transport allows. */
|
|
532
|
+
correlationId?: string;
|
|
533
|
+
}
|
|
534
|
+
type CreateChatClientOpts = RouterTransportOpts | CliBridgeTransportOpts | DirectProviderTransportOpts | SandboxSdkTransportOpts | MockTransportOpts;
|
|
535
|
+
interface BaseTransportOpts {
|
|
536
|
+
defaultModel?: string;
|
|
537
|
+
}
|
|
538
|
+
interface RouterTransportOpts extends BaseTransportOpts {
|
|
539
|
+
transport: 'router';
|
|
540
|
+
baseUrl?: string;
|
|
541
|
+
apiKey: string;
|
|
542
|
+
}
|
|
543
|
+
interface CliBridgeTransportOpts extends BaseTransportOpts {
|
|
544
|
+
transport: 'cli-bridge';
|
|
545
|
+
baseUrl?: string;
|
|
546
|
+
bearer?: string;
|
|
547
|
+
}
|
|
548
|
+
interface DirectProviderTransportOpts extends BaseTransportOpts {
|
|
549
|
+
transport: 'direct-provider';
|
|
550
|
+
baseUrl: string;
|
|
551
|
+
apiKey: string;
|
|
552
|
+
}
|
|
553
|
+
/**
|
|
554
|
+
* Sandbox-SDK transport. Provided as a thin pass-through: the caller
|
|
555
|
+
* supplies a callable that mimics LlmClient.chat() against an already-
|
|
556
|
+
* configured Sandbox handle. We don't import the SDK here to keep
|
|
557
|
+
* agent-eval dep-free of @tangle-network/sandbox.
|
|
558
|
+
*/
|
|
559
|
+
interface SandboxSdkTransportOpts extends BaseTransportOpts {
|
|
560
|
+
transport: 'sandbox-sdk';
|
|
561
|
+
chat: (req: ChatRequest, opts?: ChatCallOpts) => Promise<ChatResponse>;
|
|
562
|
+
}
|
|
563
|
+
/**
|
|
564
|
+
* Mock transport for tests. The handler receives the request and returns
|
|
565
|
+
* whatever the test wants. No retries, no JSON-schema degrade.
|
|
566
|
+
*/
|
|
567
|
+
interface MockTransportOpts extends BaseTransportOpts {
|
|
568
|
+
transport: 'mock';
|
|
569
|
+
handler: (req: ChatRequest, opts?: ChatCallOpts) => Promise<ChatResponse>;
|
|
570
|
+
}
|
|
571
|
+
/**
|
|
572
|
+
* Build a ChatClient bound to a specific transport. The returned client
|
|
573
|
+
* is safe to share across analysts in a single registry run.
|
|
574
|
+
*/
|
|
575
|
+
declare function createChatClient(opts: CreateChatClientOpts): ChatClient;
|
|
576
|
+
|
|
577
|
+
/**
|
|
578
|
+
* Analyst contract — the missing orchestration layer over agent-eval's
|
|
579
|
+
* existing analyzers (analyzeTraces, MultiLayerVerifier, RunCritic,
|
|
580
|
+
* SemanticConceptJudge, JudgeFn, ...).
|
|
581
|
+
*
|
|
582
|
+
* Each existing primitive returns its own output shape. The Analyst
|
|
583
|
+
* contract is the single envelope every primitive lifts into, so a
|
|
584
|
+
* registry can run N analysts against a run and a single renderer can
|
|
585
|
+
* compose findings without knowing which analyzer produced them.
|
|
586
|
+
*
|
|
587
|
+
* The contract is intentionally domain-agnostic: nothing here knows
|
|
588
|
+
* about code, voice, RAG, or any particular agent stack. Analysts
|
|
589
|
+
* declare what INPUT KIND they need (a trace store, an artifact dir,
|
|
590
|
+
* a RunRecord, a JudgeInput, or `custom`), and the registry routes
|
|
591
|
+
* the matching input from `AnalystRunInputs`.
|
|
592
|
+
*/
|
|
593
|
+
|
|
594
|
+
/**
|
|
595
|
+
* Unified envelope every analyst emits. Schema-versioned so renderers
|
|
596
|
+
* and time-series diffs survive future field additions.
|
|
597
|
+
*/
|
|
598
|
+
interface AnalystFinding {
|
|
599
|
+
schema_version: '1.0.0';
|
|
600
|
+
/**
|
|
601
|
+
* Stable hash over identity-defining fields (analyst_id + canonical
|
|
602
|
+
* claim + area + optional subject). Two findings from two runs that
|
|
603
|
+
* "are the same finding" share this id — that's what `diffFindings`
|
|
604
|
+
* uses to compute appeared/disappeared sets across runs.
|
|
605
|
+
*/
|
|
606
|
+
finding_id: string;
|
|
607
|
+
analyst_id: string;
|
|
608
|
+
produced_at: string;
|
|
609
|
+
severity: AnalystSeverity;
|
|
610
|
+
/**
|
|
611
|
+
* Coarse classification. Renderers group by this. Free-form so
|
|
612
|
+
* domain-specific analysts can introduce categories without a
|
|
613
|
+
* schema change ('agent-reasoning', 'verification', 'cost',
|
|
614
|
+
* 'tool-use', 'safety', 'latency', 'data-quality', ...).
|
|
615
|
+
*/
|
|
616
|
+
area: string;
|
|
617
|
+
claim: string;
|
|
618
|
+
rationale?: string;
|
|
619
|
+
evidence_refs: EvidenceRef[];
|
|
620
|
+
recommended_action?: string;
|
|
621
|
+
validation_plan?: string;
|
|
622
|
+
/** 0..1 — the analyst's own confidence. Not calibrated across analysts. */
|
|
623
|
+
confidence: number;
|
|
624
|
+
/**
|
|
625
|
+
* Optional subject the finding is about — leaf id, agent id, request
|
|
626
|
+
* id. Included in finding_id when present so per-subject findings
|
|
627
|
+
* diff cleanly across runs.
|
|
628
|
+
*/
|
|
629
|
+
subject?: string;
|
|
630
|
+
/** Analyst-private extras; renderers ignore unless they know the analyst. */
|
|
631
|
+
metadata?: Record<string, unknown>;
|
|
632
|
+
}
|
|
633
|
+
type AnalystSeverity = 'critical' | 'high' | 'medium' | 'low' | 'info';
|
|
634
|
+
interface EvidenceRef {
|
|
635
|
+
/**
|
|
636
|
+
* Where the evidence lives. `span` and `event` refer to OTLP trace
|
|
637
|
+
* elements; `artifact` to a file inside the run's artifact tree;
|
|
638
|
+
* `finding` to another AnalystFinding (cross-analyst chaining);
|
|
639
|
+
* `metric` to a named scalar reading the renderer knows how to read.
|
|
640
|
+
*/
|
|
641
|
+
kind: 'span' | 'event' | 'artifact' | 'finding' | 'metric';
|
|
642
|
+
uri: string;
|
|
643
|
+
excerpt?: string;
|
|
644
|
+
}
|
|
645
|
+
/**
|
|
646
|
+
* The discriminator the registry uses to pass the right input.
|
|
647
|
+
* `custom` is the escape hatch — analysts that need something else
|
|
648
|
+
* (e.g. an embedding cache, a partner SDK handle) read it from
|
|
649
|
+
* `AnalystRunInputs.custom[<analyst id>]`.
|
|
650
|
+
*/
|
|
651
|
+
type AnalystInputKind = 'trace-store' | 'artifact-dir' | 'run-record' | 'judge-input' | 'custom';
|
|
652
|
+
interface AnalystCost {
|
|
653
|
+
/** `deterministic` analysts MUST NOT call the LLM. */
|
|
654
|
+
kind: 'deterministic' | 'llm';
|
|
655
|
+
/** Optional declared upper bound; the registry can enforce a budget. */
|
|
656
|
+
est_usd_per_run?: number;
|
|
657
|
+
/** Models the analyst expects to use (informational). */
|
|
658
|
+
models?: string[];
|
|
659
|
+
}
|
|
660
|
+
interface AnalystRequirements {
|
|
661
|
+
/** Min number of shots / samples the analyst needs to produce signal. */
|
|
662
|
+
min_shots?: number;
|
|
663
|
+
/** Capabilities the runtime must supply (e.g. ['network', 'gpu']). */
|
|
664
|
+
capabilities?: string[];
|
|
665
|
+
}
|
|
666
|
+
/**
|
|
667
|
+
* What's passed to every analyst call. The registry resolves which
|
|
668
|
+
* field the analyst's `inputKind` selects and asserts it's present.
|
|
669
|
+
*/
|
|
670
|
+
interface AnalystRunInputs {
|
|
671
|
+
traceStore?: TraceAnalysisStore;
|
|
672
|
+
artifactDir?: string;
|
|
673
|
+
runRecord?: RunRecord;
|
|
674
|
+
judgeInput?: JudgeInput;
|
|
675
|
+
/** Keyed by analyst id; populated by callers that registered custom analysts. */
|
|
676
|
+
custom?: Record<string, unknown>;
|
|
677
|
+
}
|
|
678
|
+
interface AnalystContext {
|
|
679
|
+
runId: string;
|
|
680
|
+
/** Stable correlation id so logs from a single registry.run() share a tag. */
|
|
681
|
+
correlationId: string;
|
|
682
|
+
/** Wall-clock deadline (epoch ms). Analysts SHOULD honor for graceful cancel. */
|
|
683
|
+
deadlineMs?: number;
|
|
684
|
+
/** Per-analyst USD budget. Analysts MAY check before issuing LLM calls. */
|
|
685
|
+
budgetUsd?: number;
|
|
686
|
+
/**
|
|
687
|
+
* Shared chat client. Analysts that call an LLM go through this so
|
|
688
|
+
* the operator picks transport (sandbox-sdk | router | cli-bridge |
|
|
689
|
+
* direct-provider | mock) at the registry boundary without touching
|
|
690
|
+
* analyst code.
|
|
691
|
+
*/
|
|
692
|
+
chat?: ChatClient;
|
|
693
|
+
/** Free-form runtime tags (env, host, op). Findings can echo these into metadata. */
|
|
694
|
+
tags?: Record<string, string>;
|
|
695
|
+
/** Logger callback — analysts SHOULD prefer this over console.* for testability. */
|
|
696
|
+
log?: (msg: string, fields?: Record<string, unknown>) => void;
|
|
697
|
+
/** Optional abort signal. Analysts SHOULD pass it through to LLM calls. */
|
|
698
|
+
signal?: AbortSignal;
|
|
699
|
+
}
|
|
700
|
+
/**
|
|
701
|
+
* The minimal contract. Concrete analysts can refine `TInput` so
|
|
702
|
+
* implementations stay type-safe (e.g. a trace analyst's `TInput` is
|
|
703
|
+
* `TraceAnalysisStore`); the registry passes the right field from
|
|
704
|
+
* `AnalystRunInputs` based on `inputKind`.
|
|
705
|
+
*/
|
|
706
|
+
interface Analyst<TInput = unknown> {
|
|
707
|
+
/** Stable identifier — appears in finding_id, telemetry, and registry exclusion lists. */
|
|
708
|
+
readonly id: string;
|
|
709
|
+
/** Human-readable. One sentence. */
|
|
710
|
+
readonly description: string;
|
|
711
|
+
readonly inputKind: AnalystInputKind;
|
|
712
|
+
readonly cost: AnalystCost;
|
|
713
|
+
readonly requires?: AnalystRequirements;
|
|
714
|
+
/** Bump on breaking changes to claim wording or area so old finding_ids don't collide. */
|
|
715
|
+
readonly version: string;
|
|
716
|
+
analyze(input: TInput, ctx: AnalystContext): Promise<AnalystFinding[]>;
|
|
717
|
+
}
|
|
718
|
+
/**
|
|
719
|
+
* Compute the stable finding_id from the identity-defining fields.
|
|
720
|
+
* Default implementation hashes {analyst_id, area, subject, normalized claim}.
|
|
721
|
+
* Analysts that emit findings whose claim text varies per run (timestamps,
|
|
722
|
+
* counts) SHOULD either: (a) pass an explicit `id_basis` to fix the hash,
|
|
723
|
+
* or (b) move the variable part into `rationale`/`metadata` and keep the
|
|
724
|
+
* `claim` static.
|
|
725
|
+
*/
|
|
726
|
+
declare function computeFindingId(input: {
|
|
727
|
+
analyst_id: string;
|
|
728
|
+
area: string;
|
|
729
|
+
subject?: string;
|
|
730
|
+
claim: string;
|
|
731
|
+
/** Override the claim for hashing — use when the displayed claim has run-specific bits. */
|
|
732
|
+
id_basis?: string;
|
|
733
|
+
}): string;
|
|
734
|
+
/**
|
|
735
|
+
* Convenience factory: produce a fully-formed AnalystFinding with the
|
|
736
|
+
* id computed automatically. Analyst code stays terse.
|
|
737
|
+
*/
|
|
738
|
+
declare function makeFinding(init: Omit<AnalystFinding, 'schema_version' | 'finding_id' | 'produced_at'> & {
|
|
739
|
+
id_basis?: string;
|
|
740
|
+
produced_at?: string;
|
|
741
|
+
}): AnalystFinding;
|
|
742
|
+
interface AnalystRunSummary {
|
|
743
|
+
analyst_id: string;
|
|
744
|
+
status: 'ok' | 'skipped' | 'failed';
|
|
745
|
+
/** Why skipped — missing input, budget exceeded, capability unmet. */
|
|
746
|
+
reason?: string;
|
|
747
|
+
findings_count: number;
|
|
748
|
+
latency_ms: number;
|
|
749
|
+
cost_usd: number;
|
|
750
|
+
/** When `status='failed'`: the error class + message, never the full stack. */
|
|
751
|
+
error?: {
|
|
752
|
+
class: string;
|
|
753
|
+
message: string;
|
|
754
|
+
};
|
|
755
|
+
}
|
|
756
|
+
interface AnalystRunResult {
|
|
757
|
+
run_id: string;
|
|
758
|
+
correlation_id: string;
|
|
759
|
+
started_at: string;
|
|
760
|
+
ended_at: string;
|
|
761
|
+
findings: AnalystFinding[];
|
|
762
|
+
per_analyst: AnalystRunSummary[];
|
|
763
|
+
/** Total LLM cost in USD across all analysts in this registry.run(). */
|
|
764
|
+
total_cost_usd: number;
|
|
765
|
+
}
|
|
766
|
+
|
|
767
|
+
/**
|
|
768
|
+
* Adapter factories — lift each existing agent-eval primitive into the
|
|
769
|
+
* Analyst contract without re-implementing it.
|
|
770
|
+
*
|
|
771
|
+
* Five primitives, five factories. Each one:
|
|
772
|
+
* - Builds an Analyst with a stable id (caller chooses; defaults
|
|
773
|
+
* given), a sensible default `inputKind`, a version derived from
|
|
774
|
+
* the wrapped primitive's version + an adapter revision, and an
|
|
775
|
+
* `analyze()` that calls the primitive and lifts its output to
|
|
776
|
+
* AnalystFinding[] using `makeFinding()`.
|
|
777
|
+
* - Maps severities: the existing `Severity` ('critical' | 'major' |
|
|
778
|
+
* 'minor' | 'info') projects onto AnalystSeverity ('critical' |
|
|
779
|
+
* 'high' | 'medium' | 'low' | 'info'); 'major' → 'high', 'minor' →
|
|
780
|
+
* 'medium'. Domain analysts that want finer-grained mapping override.
|
|
781
|
+
*
|
|
782
|
+
* Adapters never own state. Calling the same factory twice with the
|
|
783
|
+
* same primitive instance is safe.
|
|
784
|
+
*/
|
|
785
|
+
|
|
786
|
+
declare function liftSeverity(s: Severity): AnalystSeverity;
|
|
787
|
+
interface TraceAnalystAdapterOpts {
|
|
788
|
+
id?: string;
|
|
789
|
+
area?: string;
|
|
790
|
+
/** The natural-language question(s) put to the analyst. One finding per question. */
|
|
791
|
+
questions: string[];
|
|
792
|
+
/** Caller-provided AxAI service — same one trace-analyst.ts expects. */
|
|
793
|
+
ai: AxAIService;
|
|
794
|
+
model?: string;
|
|
795
|
+
/** Forwarded to analyzeTraces. */
|
|
796
|
+
extra?: Omit<AnalyzeTracesOptions, 'source' | 'ai' | 'model'>;
|
|
797
|
+
}
|
|
798
|
+
/**
|
|
799
|
+
* @deprecated Prefer `createTraceAnalystKind` + one of the failure /
|
|
800
|
+
* improvement kinds from `./kinds`. This adapter wraps the legacy
|
|
801
|
+
* `analyzeTraces` flow whose output is `findings:string[]` — every
|
|
802
|
+
* bullet gets flat-defaulted severity `medium` / confidence `0.6`,
|
|
803
|
+
* which loses the per-finding grading that kinds provide via Ax structured
|
|
804
|
+
* output + Zod validation. Kept for one minor while consumers migrate.
|
|
805
|
+
*/
|
|
806
|
+
declare function createTraceAnalystAdapter(opts: TraceAnalystAdapterOpts): Analyst<TraceAnalysisStore>;
|
|
807
|
+
interface VerifierAdapterOpts<Env> {
|
|
808
|
+
id?: string;
|
|
809
|
+
area?: string;
|
|
810
|
+
verifier: MultiLayerVerifier<Env>;
|
|
811
|
+
/**
|
|
812
|
+
* The verifier expects an `env` per run. Adapters take it from
|
|
813
|
+
* `AnalystRunInputs.custom[<id>]` via the registry's 'custom' routing.
|
|
814
|
+
*/
|
|
815
|
+
options?: Omit<VerifyOptions<Env>, 'env'>;
|
|
816
|
+
}
|
|
817
|
+
declare function createVerifierAdapter<Env>(opts: VerifierAdapterOpts<Env>): Analyst<Env>;
|
|
818
|
+
interface RunCriticAdapterOpts {
|
|
819
|
+
id?: string;
|
|
820
|
+
area?: string;
|
|
821
|
+
critic?: RunCritic;
|
|
822
|
+
/** Optional threshold below which a dimension is reported as a finding. Default 0.5. */
|
|
823
|
+
threshold?: number;
|
|
824
|
+
}
|
|
825
|
+
declare function createRunCriticAdapter(opts?: RunCriticAdapterOpts): Analyst<RunTrace>;
|
|
826
|
+
interface JudgeAdapterOpts {
|
|
827
|
+
id?: string;
|
|
828
|
+
area?: string;
|
|
829
|
+
judge: JudgeFn;
|
|
830
|
+
/** TCloud handle the JudgeFn calls. */
|
|
831
|
+
tcloud: TCloud;
|
|
832
|
+
/** Optional cost classification — most judges call an LLM. */
|
|
833
|
+
cost?: Analyst['cost'];
|
|
834
|
+
/** Optional threshold below which a JudgeScore becomes a finding. Default 6 (on 0-10 scale). */
|
|
835
|
+
threshold?: number;
|
|
836
|
+
}
|
|
837
|
+
declare function createJudgeAdapter(opts: JudgeAdapterOpts): Analyst<JudgeInput>;
|
|
838
|
+
interface SemanticConceptJudgeAdapterOpts {
|
|
839
|
+
id?: string;
|
|
840
|
+
area?: string;
|
|
841
|
+
options?: SemanticConceptJudgeOptions;
|
|
842
|
+
}
|
|
843
|
+
declare function createSemanticConceptJudgeAdapter(opts?: SemanticConceptJudgeAdapterOpts): Analyst<SemanticConceptJudgeInput>;
|
|
844
|
+
|
|
845
|
+
/**
|
|
846
|
+
* FindingsStore — durable persistence for AnalystFinding rows + a diff
|
|
847
|
+
* helper so we can answer "what changed since the last run?" without
|
|
848
|
+
* recomputing analysts.
|
|
849
|
+
*
|
|
850
|
+
* On-disk shape is JSONL: one finding per line, append-only, locked via
|
|
851
|
+
* LockedJsonlAppender. Operators get crash-safety (no partial JSON),
|
|
852
|
+
* cheap reads (sequential parse), and trivial backup (rsync the file).
|
|
853
|
+
*
|
|
854
|
+
* Reads are non-locking: a reader sees a consistent snapshot of all
|
|
855
|
+
* fully-written lines and skips an incomplete trailing line if the
|
|
856
|
+
* writer is mid-append. Cross-process locking is intentionally out of
|
|
857
|
+
* scope (see locked-jsonl-appender.ts).
|
|
858
|
+
*
|
|
859
|
+
* The store is run-scoped: callers pass `runId` on append and on load,
|
|
860
|
+
* which keeps multi-run files cleanly partitioned. The `diffFindings`
|
|
861
|
+
* helper compares two run-id sets using stable `finding_id` semantics —
|
|
862
|
+
* the diff is the cross-run signal the regression dashboard renders.
|
|
863
|
+
*/
|
|
864
|
+
|
|
865
|
+
/**
|
|
866
|
+
* One persisted row. We attach `run_id` on disk so a single file can
|
|
867
|
+
* hold multiple runs and the diff helper can query without re-walking
|
|
868
|
+
* separate files.
|
|
869
|
+
*/
|
|
870
|
+
interface PersistedFinding extends AnalystFinding {
|
|
871
|
+
run_id: string;
|
|
872
|
+
}
|
|
873
|
+
declare class FindingsStore {
|
|
874
|
+
readonly path: string;
|
|
875
|
+
private readonly appender;
|
|
876
|
+
constructor(path: string);
|
|
877
|
+
append(runId: string, findings: AnalystFinding[]): Promise<void>;
|
|
878
|
+
/** Load every persisted finding. Discards malformed trailing lines silently. */
|
|
879
|
+
loadAll(): PersistedFinding[];
|
|
880
|
+
/** Filter to a single run. */
|
|
881
|
+
loadRun(runId: string): PersistedFinding[];
|
|
882
|
+
}
|
|
883
|
+
interface FindingsDiff {
|
|
884
|
+
/** New finding ids in `current` that weren't in `previous`. */
|
|
885
|
+
appeared: PersistedFinding[];
|
|
886
|
+
/** Finding ids in `previous` that aren't in `current`. */
|
|
887
|
+
disappeared: PersistedFinding[];
|
|
888
|
+
/** Same finding id present in both runs and unchanged per the materiality test. */
|
|
889
|
+
persisted: PersistedFinding[];
|
|
890
|
+
/**
|
|
891
|
+
* Same finding id in both runs but at least one non-identity field
|
|
892
|
+
* shifted per `DiffPolicy.isMaterial`. Reported as `{ previous, current }` pairs.
|
|
893
|
+
*/
|
|
894
|
+
changed: Array<{
|
|
895
|
+
previous: PersistedFinding;
|
|
896
|
+
current: PersistedFinding;
|
|
897
|
+
}>;
|
|
898
|
+
}
|
|
899
|
+
interface DiffPolicy {
|
|
900
|
+
/**
|
|
901
|
+
* Predicate that decides whether two findings (same finding_id) count
|
|
902
|
+
* as a material change. Defaults to {@link defaultIsMaterial}: severity
|
|
903
|
+
* shift, confidence Δ > 0.05, or evidence count change. Compliance /
|
|
904
|
+
* perf consumers MAY supply a stricter predicate (e.g. rationale text
|
|
905
|
+
* diff, metric Δ thresholds).
|
|
906
|
+
*/
|
|
907
|
+
isMaterial?: (previous: AnalystFinding, current: AnalystFinding) => boolean;
|
|
908
|
+
}
|
|
909
|
+
/**
|
|
910
|
+
* Default materiality test. Deliberately narrow so LLM-reword churn
|
|
911
|
+
* doesn't flood the diff. Stricter tests are opt-in via DiffPolicy.
|
|
912
|
+
*/
|
|
913
|
+
declare function defaultIsMaterial(a: AnalystFinding, b: AnalystFinding): boolean;
|
|
914
|
+
/**
|
|
915
|
+
* Diff two findings sets by stable finding_id. Callers typically load
|
|
916
|
+
* the two run-id slices from the same store and pass them in.
|
|
917
|
+
*/
|
|
918
|
+
declare function diffFindings(previous: PersistedFinding[], current: PersistedFinding[], policy?: DiffPolicy): FindingsDiff;
|
|
919
|
+
|
|
920
|
+
/**
|
|
921
|
+
* Typed Ax output for analyst findings.
|
|
922
|
+
*
|
|
923
|
+
* Replaces the legacy `findings:string[]` pattern (where every bullet
|
|
924
|
+
* became a flat-severity `AnalystFinding`) with a structured object
|
|
925
|
+
* array. Ax binds the field as `findings:json[]` so the provider emits
|
|
926
|
+
* native structured output; at the kind-factory boundary we Zod-validate
|
|
927
|
+
* each emitted finding so malformed rows fail loud instead of being
|
|
928
|
+
* silently lifted with default severity.
|
|
929
|
+
*
|
|
930
|
+
* Why not `f.object().array()` directly in the signature? The Ax
|
|
931
|
+
* signature string `question:string -> findings:json[]` already lets
|
|
932
|
+
* the provider emit JSON arrays. A Zod boundary is required either
|
|
933
|
+
* way (the provider can return any JSON), and Zod gives us a single
|
|
934
|
+
* validation surface independent of which Ax version is installed.
|
|
935
|
+
*/
|
|
936
|
+
|
|
937
|
+
declare const ANALYST_SEVERITIES: readonly ["critical", "high", "medium", "low", "info"];
|
|
938
|
+
declare const RawAnalystFindingSchema: z.ZodObject<{
|
|
939
|
+
severity: z.ZodEnum<{
|
|
940
|
+
info: "info";
|
|
941
|
+
critical: "critical";
|
|
942
|
+
medium: "medium";
|
|
943
|
+
low: "low";
|
|
944
|
+
high: "high";
|
|
945
|
+
}>;
|
|
946
|
+
claim: z.ZodString;
|
|
947
|
+
subject: z.ZodOptional<z.ZodString>;
|
|
948
|
+
evidence_uri: z.ZodString;
|
|
949
|
+
evidence_excerpt: z.ZodOptional<z.ZodString>;
|
|
950
|
+
confidence: z.ZodNumber;
|
|
951
|
+
rationale: z.ZodOptional<z.ZodString>;
|
|
952
|
+
recommended_action: z.ZodOptional<z.ZodString>;
|
|
953
|
+
}, z.core.$strict>;
|
|
954
|
+
type RawAnalystFinding = z.infer<typeof RawAnalystFindingSchema>;
|
|
955
|
+
/**
|
|
956
|
+
* Description embedded into the actor prompt so the LLM knows what
|
|
957
|
+
* shape to emit. Kept here so kinds share one source of truth rather
|
|
958
|
+
* than restating the schema in every prompt.
|
|
959
|
+
*/
|
|
960
|
+
declare const RAW_FINDING_SCHEMA_PROMPT = "Each finding MUST be a JSON object with these fields:\n - severity: one of \"critical\" | \"high\" | \"medium\" | \"low\" | \"info\"\n - claim: one-sentence statement (max 2000 chars)\n - subject?: the leaf id, agent id, span id, tool name, or noun phrase the finding is about\n - evidence_uri: \"span://<trace_id>/<span_id>\" for trace evidence, \"artifact://<relative-path>\" for files, \"metric://<name>\" for named scalars \u2014 ALWAYS cite a real id surfaced by the tools\n - evidence_excerpt?: short quote (<=2000 chars) from the cited span/artifact\n - confidence: number 0..1 \u2014 0.9+ when backed by exact quotes, 0.6-0.8 for inferred patterns, <0.5 for speculative\n - rationale?: one or two sentences explaining the reasoning\n - recommended_action?: concrete change phrased as an imperative (\"Add ...\", \"Replace ...\", \"Stop ...\") \u2014 omit when the finding is purely descriptive\n\nEmit an empty array when the question has no findings to report. Do not fabricate evidence.";
|
|
961
|
+
/**
|
|
962
|
+
* Validate one row emitted by the LLM. Returns the typed finding on
|
|
963
|
+
* success; returns `null` and logs the reason on failure so the kind
|
|
964
|
+
* factory can skip-and-count rather than abort the whole analyst run.
|
|
965
|
+
*/
|
|
966
|
+
declare function parseRawFinding(row: unknown, log?: (msg: string, fields?: Record<string, unknown>) => void): RawAnalystFinding | null;
|
|
967
|
+
|
|
968
|
+
/**
|
|
969
|
+
* Analyst-kind factory — the typed, focused replacement for the
|
|
970
|
+
* legacy `createTraceAnalystAdapter`.
|
|
971
|
+
*
|
|
972
|
+
* A "kind" is a specialized analyst whose actor prompt, tool subset,
|
|
973
|
+
* and Ax recursion config target one failure-mode lens (failure-mode
|
|
974
|
+
* classification, knowledge gap discovery, knowledge poisoning, recursive
|
|
975
|
+
* self-improvement, ...). Kinds emit findings in the typed `RawAnalystFinding`
|
|
976
|
+
* shape via a JSON-array Ax output; the factory validates each row with
|
|
977
|
+
* Zod and lifts it into `AnalystFinding[]` with no shape guessing.
|
|
978
|
+
*
|
|
979
|
+
* Composition rules:
|
|
980
|
+
* - Each kind owns its actor description. No generic "answer this
|
|
981
|
+
* question" prompt — the prompt names the failure lens.
|
|
982
|
+
* - Each kind picks a narrow tool subset from `ANALYST_TOOL_GROUPS`.
|
|
983
|
+
* A kind that never needs full-trace dumps can drop `viewTrace` /
|
|
984
|
+
* `viewSpans` and stay cheap.
|
|
985
|
+
* - Each kind declares its recursion + parallelism budget. Discovery-
|
|
986
|
+
* heavy kinds (failure-mode) get higher `maxDepth`; lens kinds
|
|
987
|
+
* (poisoning) usually stay at 0 since they have a tighter brief.
|
|
988
|
+
*
|
|
989
|
+
* Optimizer hook: kinds may declare `goldens` — labeled examples used
|
|
990
|
+
* by `AxMiPRO` / `AxBootstrapFewShot` / `AxGEPA` to fit the actor
|
|
991
|
+
* description programmatically. Stored on the kind, not the registry,
|
|
992
|
+
* because the right metric is kind-specific.
|
|
993
|
+
*/
|
|
994
|
+
|
|
995
|
+
/**
|
|
996
|
+
* Per-kind specification. The factory turns this into a regular
|
|
997
|
+
* `Analyst<TraceAnalysisStore>` ready for `AnalystRegistry.register()`.
|
|
998
|
+
*/
|
|
999
|
+
interface TraceAnalystKindSpec {
|
|
1000
|
+
/** Stable id. Appears in finding_id, telemetry, and registry exclusions. */
|
|
1001
|
+
id: string;
|
|
1002
|
+
/** One-sentence description shown in `registry.list()`. */
|
|
1003
|
+
description: string;
|
|
1004
|
+
/** Coarse classification stamped on every emitted finding (`failure-mode`, `knowledge-gap`, ...). */
|
|
1005
|
+
area: string;
|
|
1006
|
+
/** Bump on any breaking change to the actor prompt or output schema. */
|
|
1007
|
+
version: string;
|
|
1008
|
+
/** Actor system prompt. Must instruct the LLM to emit `findings` per the schema. */
|
|
1009
|
+
actorDescription: string;
|
|
1010
|
+
/** Responder system prompt; falls back to a minimal "format the findings" instruction. */
|
|
1011
|
+
responderDescription?: string;
|
|
1012
|
+
/** Tool functions the actor may call. Pick a narrow subset via `buildTraceToolsForGroup(group, store)` (group names: `TraceToolGroupName`). */
|
|
1013
|
+
buildTools: (store: TraceAnalysisStore) => AxFunction[];
|
|
1014
|
+
/** Recursion budget. `maxDepth: 0` disables subagents. */
|
|
1015
|
+
recursion?: {
|
|
1016
|
+
maxDepth: number;
|
|
1017
|
+
maxParallelSubagents?: number;
|
|
1018
|
+
};
|
|
1019
|
+
/** Actor turn cap. Default 12. */
|
|
1020
|
+
maxTurns?: number;
|
|
1021
|
+
/** Runtime char cap. Default 6000. */
|
|
1022
|
+
maxRuntimeChars?: number;
|
|
1023
|
+
/** Cost classification surfaced in `registry.list()` and budget enforcement. */
|
|
1024
|
+
cost: AnalystCost;
|
|
1025
|
+
/** Per-finding-row hook — kinds may reject / rewrite before lifting. */
|
|
1026
|
+
postProcess?: (row: RawAnalystFinding, ctx: AnalystContext) => RawAnalystFinding | null;
|
|
1027
|
+
/** Optional optimizer hook — populated when a kind wants to fit its prompt against labeled examples. */
|
|
1028
|
+
goldens?: TraceAnalystGolden[];
|
|
1029
|
+
}
|
|
1030
|
+
/**
|
|
1031
|
+
* One labeled example consumed by Ax optimizers (MIPRO / GEPA / Bootstrap).
|
|
1032
|
+
* Each input is the same `{question}` an analyst would receive; `expected`
|
|
1033
|
+
* is the ground-truth finding set a fitted prompt should produce on this
|
|
1034
|
+
* input. Metric: kind-specific (default: F1 on `finding_id` overlap).
|
|
1035
|
+
*/
|
|
1036
|
+
interface TraceAnalystGolden {
|
|
1037
|
+
question: string;
|
|
1038
|
+
expected: ReadonlyArray<Omit<RawAnalystFinding, 'confidence'>>;
|
|
1039
|
+
}
|
|
1040
|
+
interface CreateTraceAnalystKindOpts {
|
|
1041
|
+
/** AxAIService bound at registration time. */
|
|
1042
|
+
ai: AxAIService;
|
|
1043
|
+
/** Optional model override; falls back to the AI service's default. */
|
|
1044
|
+
model?: string;
|
|
1045
|
+
/** Override the spec's `version` (e.g. when an optimizer has fitted a new prompt). */
|
|
1046
|
+
versionSuffix?: string;
|
|
1047
|
+
}
|
|
1048
|
+
/**
|
|
1049
|
+
* Build an `Analyst<TraceAnalysisStore>` from a kind spec.
|
|
1050
|
+
*
|
|
1051
|
+
* Lifts the Ax pipeline once at registration time so the registry
|
|
1052
|
+
* gets a stateless analyst. The Ax agent is freshly constructed per
|
|
1053
|
+
* `analyze()` call (the agent carries chat-log + usage state we don't
|
|
1054
|
+
* want shared across analyst runs).
|
|
1055
|
+
*/
|
|
1056
|
+
declare function createTraceAnalystKind(spec: TraceAnalystKindSpec, opts: CreateTraceAnalystKindOpts): Analyst<TraceAnalysisStore>;
|
|
1057
|
+
|
|
1058
|
+
/**
|
|
1059
|
+
* Failure-mode analyst — classifies what went wrong and why.
|
|
1060
|
+
*
|
|
1061
|
+
* Brief: read the trace dataset, identify the top failure modes across
|
|
1062
|
+
* runs, classify each with severity + evidence, and surface them as
|
|
1063
|
+
* findings. The actor's job is *taxonomy + evidence*, not fix-design —
|
|
1064
|
+
* that's the improvement-analyst's job.
|
|
1065
|
+
*
|
|
1066
|
+
* Recursion is deep (`maxDepth: 3`) because real failure-mode
|
|
1067
|
+
* discovery is genuinely tree-shaped: the actor splits the dataset
|
|
1068
|
+
* into candidate clusters, each cluster spawns a focused investigator
|
|
1069
|
+
* that drills into representative traces, and a deeply-recursed
|
|
1070
|
+
* investigator may itself split a confounded mode into two sub-modes.
|
|
1071
|
+
* Each level fans out 4-way, so the analyst can investigate up to
|
|
1072
|
+
* ~16 leaf clusters before hitting the depth ceiling.
|
|
1073
|
+
*/
|
|
1074
|
+
|
|
1075
|
+
declare const FAILURE_MODE_KIND_SPEC: TraceAnalystKindSpec;
|
|
1076
|
+
|
|
1077
|
+
/**
|
|
1078
|
+
* Knowledge-gap analyst — what did the agent NOT know that it needed?
|
|
1079
|
+
*
|
|
1080
|
+
* Brief: find moments in the trace where the agent had to guess, ask
|
|
1081
|
+
* the user to fill in context, recover from a wrong assumption, or
|
|
1082
|
+
* loop on a retrieval. Each finding names a *missing or outdated piece
|
|
1083
|
+
* of knowledge* the agent's curated knowledge base should have held —
|
|
1084
|
+
* or a downstream lookup (web, docs, tool description) that surfaced
|
|
1085
|
+
* stale or outdated information.
|
|
1086
|
+
*
|
|
1087
|
+
* The primary expected store is `@tangle-network/agent-knowledge`: a
|
|
1088
|
+
* Karpathy-style wiki the agent maintains with raw ↔ curated pages,
|
|
1089
|
+
* source anchors, and claim/relation triples. A gap is anything the
|
|
1090
|
+
* agent had to discover at run-time that should already have lived
|
|
1091
|
+
* there. Secondary loci: web-search results that returned outdated
|
|
1092
|
+
* pages, tool descriptions that omitted critical behavior, system-
|
|
1093
|
+
* prompt sections that didn't cover the case.
|
|
1094
|
+
*
|
|
1095
|
+
* Distinct from failure-mode: failure-mode classifies *how* it broke;
|
|
1096
|
+
* knowledge-gap names the *information* whose absence (or staleness)
|
|
1097
|
+
* caused the break. One failure-mode often maps to several gaps.
|
|
1098
|
+
*
|
|
1099
|
+
* Recursion (`maxDepth: 2`) is enough to fan out one subagent per
|
|
1100
|
+
* candidate gap-source layer; each subagent runs a focused detection.
|
|
1101
|
+
*/
|
|
1102
|
+
|
|
1103
|
+
declare const KNOWLEDGE_GAP_KIND_SPEC: TraceAnalystKindSpec;
|
|
1104
|
+
|
|
1105
|
+
/**
|
|
1106
|
+
* Knowledge-poisoning analyst — what FALSE information misled the agent?
|
|
1107
|
+
*
|
|
1108
|
+
* Brief: find moments where the agent acted on information that was
|
|
1109
|
+
* *wrong* — stale memory, RAG documents that contradicted ground truth,
|
|
1110
|
+
* tool descriptions that lied about return shapes, system-prompt
|
|
1111
|
+
* instructions that no longer matched reality, prior-run summaries that
|
|
1112
|
+
* cached a wrong decision.
|
|
1113
|
+
*
|
|
1114
|
+
* Distinct from knowledge-gap: a gap is "the agent didn't know X"; a
|
|
1115
|
+
* poisoning is "the agent confidently used X, but X was wrong." Gaps
|
|
1116
|
+
* surface as questions / self-correction; poisonings surface as
|
|
1117
|
+
* confident-but-wrong actions that downstream evidence contradicts.
|
|
1118
|
+
*
|
|
1119
|
+
* Recursion is moderate (`maxDepth: 2`) because each candidate
|
|
1120
|
+
* poisoning typically needs two sub-investigations: one to confirm
|
|
1121
|
+
* the agent acted on the false belief, one to confirm the belief
|
|
1122
|
+
* itself is actually false in ground truth.
|
|
1123
|
+
*/
|
|
1124
|
+
|
|
1125
|
+
declare const KNOWLEDGE_POISONING_KIND_SPEC: TraceAnalystKindSpec;
|
|
1126
|
+
|
|
1127
|
+
/**
|
|
1128
|
+
* Improvement analyst — actionable, recursive self-improvement findings.
|
|
1129
|
+
*
|
|
1130
|
+
* Brief: read findings from upstream analysts (failure-mode,
|
|
1131
|
+
* knowledge-gap, knowledge-poisoning) AND the trace dataset itself,
|
|
1132
|
+
* then propose **concrete edits** to the agent's runtime: prompt
|
|
1133
|
+
* additions, RAG documents to ingest, tool descriptions to rewrite,
|
|
1134
|
+
* scaffolding changes to make, memory entries to invalidate. Each
|
|
1135
|
+
* finding is one proposed edit with the locus, the diff, and the
|
|
1136
|
+
* expected effect.
|
|
1137
|
+
*
|
|
1138
|
+
* This is the recursive-self-improvement loop's last mile: the prior
|
|
1139
|
+
* kinds describe *what's wrong*; this kind describes *what to change*.
|
|
1140
|
+
*
|
|
1141
|
+
* Recursion is deep (`maxDepth: 3`) because real improvement proposals
|
|
1142
|
+
* are competitive: for each failure-mode there are usually 2-3 viable
|
|
1143
|
+
* fix directions (tighten prompt vs add tool vs adjust scaffolding),
|
|
1144
|
+
* and the actor should explore each with a focused subagent before
|
|
1145
|
+
* picking the highest-leverage one to recommend.
|
|
1146
|
+
*/
|
|
1147
|
+
|
|
1148
|
+
declare const IMPROVEMENT_KIND_SPEC: TraceAnalystKindSpec;
|
|
1149
|
+
|
|
1150
|
+
/**
|
|
1151
|
+
* Default analyst kinds focused on agent failure + recursive
|
|
1152
|
+
* self-improvement.
|
|
1153
|
+
*
|
|
1154
|
+
* The four kinds chain: failure-mode classifies; knowledge-gap and
|
|
1155
|
+
* knowledge-poisoning explain *why* in two orthogonal ways; improvement
|
|
1156
|
+
* proposes concrete edits. Register all four against the same trace
|
|
1157
|
+
* store and the registry runs them in dependency order if the operator
|
|
1158
|
+
* pipes findings between them.
|
|
1159
|
+
*/
|
|
1160
|
+
|
|
1161
|
+
/**
|
|
1162
|
+
* The default kind suite. Order is the run order operators should
|
|
1163
|
+
* use: failure-mode first (no upstream deps), gap + poisoning next
|
|
1164
|
+
* (both depend on failures), improvement last (chains all three).
|
|
1165
|
+
*/
|
|
1166
|
+
declare const DEFAULT_TRACE_ANALYST_KINDS: readonly TraceAnalystKindSpec[];
|
|
1167
|
+
|
|
1168
|
+
/**
|
|
1169
|
+
* Pre-curated tool subsets for analyst kinds.
|
|
1170
|
+
*
|
|
1171
|
+
* The full trace-analyst tool set is seven functions. Most kinds only
|
|
1172
|
+
* need three or four. Picking from named groups instead of importing
|
|
1173
|
+
* the whole bundle keeps every kind's actor-context budget tight and
|
|
1174
|
+
* makes "what can this analyst see?" obvious at registration time.
|
|
1175
|
+
*
|
|
1176
|
+
* Each function in the group keeps its full `name`/`description` from
|
|
1177
|
+
* `buildTraceAnalystTools` — we filter, we don't re-implement.
|
|
1178
|
+
*/
|
|
1179
|
+
|
|
1180
|
+
/** Named tool groups. Kinds pass one of these names to `buildTraceToolsForGroup`, e.g. `buildTools: (store) => buildTraceToolsForGroup('discovery', store)`. */
|
|
1181
|
+
type TraceToolGroupName =
|
|
1182
|
+
/** All seven tools. Use for open-ended discovery kinds. */
|
|
1183
|
+
'all'
|
|
1184
|
+
/** Overview + paginated query + count. No deep reads. Cheap. */
|
|
1185
|
+
| 'discovery'
|
|
1186
|
+
/** Discovery + viewTrace + viewSpans. Deep-read but no regex search. */
|
|
1187
|
+
| 'discoveryAndRead'
|
|
1188
|
+
/** Discovery + search tools. For pattern-matching across many traces. */
|
|
1189
|
+
| 'discoveryAndSearch'
|
|
1190
|
+
/** Discovery + viewSpans + searchSpan. Targeted-span work after another kind narrows down. */
|
|
1191
|
+
| 'targeted';
|
|
1192
|
+
/**
|
|
1193
|
+
* Build the tool set for a named group bound to a specific trace store.
|
|
1194
|
+
*
|
|
1195
|
+
* `all` returns every tool. Other groups filter `buildTraceAnalystTools`
|
|
1196
|
+
* by name to the documented subset. An unrecognised group name throws —
|
|
1197
|
+
* silently returning all tools would defeat the cost-control point.
|
|
1198
|
+
*/
|
|
1199
|
+
declare function buildTraceToolsForGroup(group: TraceToolGroupName, store: TraceAnalysisStore): AxFunction[];
|
|
1200
|
+
|
|
1201
|
+
/**
|
|
1202
|
+
* AnalystRegistry — orchestrate N analysts against one run.
|
|
1203
|
+
*
|
|
1204
|
+
* Owns three responsibilities and only three:
|
|
1205
|
+
* 1. Registration — ids must be unique; bad registrations fail loudly
|
|
1206
|
+
* at register-time, not run-time.
|
|
1207
|
+
* 2. Routing — each analyst declares its `inputKind`; the registry
|
|
1208
|
+
* picks the matching field from AnalystRunInputs and skips the
|
|
1209
|
+
* analyst with a logged reason if it's missing.
|
|
1210
|
+
* 3. Isolation — one analyst's exception MUST NOT stop other analysts.
|
|
1211
|
+
* Failed analysts produce zero findings + a 'failed' summary row.
|
|
1212
|
+
*
|
|
1213
|
+
* Cross-cutting concerns (telemetry, error → finding conversion, cost
|
|
1214
|
+
* ingestion, storage rotation) live in `AnalystHooks`. Budget shaping
|
|
1215
|
+
* (equal split vs weighted vs custom) lives in `BudgetPolicy`. Both
|
|
1216
|
+
* have sensible defaults; consumers override only what they need.
|
|
1217
|
+
*/
|
|
1218
|
+
|
|
1219
|
+
interface AnalystHooks {
|
|
1220
|
+
/** Before analyze() — last chance to mutate ctx (e.g. inject tags, override budget). */
|
|
1221
|
+
onBeforeAnalyze?(args: {
|
|
1222
|
+
analyst: Analyst;
|
|
1223
|
+
ctx: AnalystContext;
|
|
1224
|
+
runId: string;
|
|
1225
|
+
}): void | Promise<void>;
|
|
1226
|
+
/** After every analyst (ok | failed | skipped). Use for telemetry, ingestion, rotation. */
|
|
1227
|
+
onAfterAnalyze?(args: {
|
|
1228
|
+
analyst: Analyst;
|
|
1229
|
+
summary: AnalystRunSummary;
|
|
1230
|
+
findings: AnalystFinding[];
|
|
1231
|
+
runId: string;
|
|
1232
|
+
}): void | Promise<void>;
|
|
1233
|
+
/**
|
|
1234
|
+
* On analyst exception. Hook MAY return findings to convert the
|
|
1235
|
+
* error into structured findings; the summary still reports 'failed'.
|
|
1236
|
+
* Return void to keep the default empty-findings behavior.
|
|
1237
|
+
*/
|
|
1238
|
+
onError?(args: {
|
|
1239
|
+
analyst: Analyst;
|
|
1240
|
+
error: Error;
|
|
1241
|
+
runId: string;
|
|
1242
|
+
}): AnalystFinding[] | void | Promise<AnalystFinding[] | void>;
|
|
1243
|
+
/** Once after registry.run() completes. Use for final aggregation, persistence. */
|
|
1244
|
+
onComplete?(args: {
|
|
1245
|
+
result: AnalystRunResult;
|
|
1246
|
+
}): void | Promise<void>;
|
|
1247
|
+
}
|
|
1248
|
+
interface BudgetPolicy {
|
|
1249
|
+
/** Overall USD cap across the registry.run(). */
|
|
1250
|
+
totalUsd?: number;
|
|
1251
|
+
/** Per-analyst weight for the default allocator. Missing ids get weight 1. */
|
|
1252
|
+
weights?: Record<string, number>;
|
|
1253
|
+
/**
|
|
1254
|
+
* Custom allocator — receives the analyst, remaining/total budget, and
|
|
1255
|
+
* the count of analysts that will run. Returns the per-analyst budget
|
|
1256
|
+
* (or undefined to leave it uncapped). Overrides weights when set.
|
|
1257
|
+
*/
|
|
1258
|
+
allocate?: (args: {
|
|
1259
|
+
analyst: Analyst;
|
|
1260
|
+
totalUsd: number | undefined;
|
|
1261
|
+
remainingUsd: number | undefined;
|
|
1262
|
+
runningCount: number;
|
|
1263
|
+
}) => number | undefined;
|
|
1264
|
+
}
|
|
1265
|
+
interface AnalystRegistryOptions {
|
|
1266
|
+
/** Shared chat client passed to every LLM analyst via AnalystContext. */
|
|
1267
|
+
chat?: ChatClient;
|
|
1268
|
+
/** Logger callback. Defaults to a no-op. */
|
|
1269
|
+
log?: (msg: string, fields?: Record<string, unknown>) => void;
|
|
1270
|
+
/** Hooks invoked around analyze() — observability + customization seam. */
|
|
1271
|
+
hooks?: AnalystHooks;
|
|
1272
|
+
/** Default budget when run() doesn't override. */
|
|
1273
|
+
defaultBudget?: BudgetPolicy;
|
|
1274
|
+
}
|
|
1275
|
+
interface RegistryRunOpts {
|
|
1276
|
+
/** Restrict to a subset of registered analysts by id. */
|
|
1277
|
+
only?: string[];
|
|
1278
|
+
/** Skip these analysts even if registered. Useful for cheap iteration. */
|
|
1279
|
+
skip?: string[];
|
|
1280
|
+
/** Budget policy — totalUsd + optional weights/allocator. Falls back to options.defaultBudget. */
|
|
1281
|
+
budget?: BudgetPolicy;
|
|
1282
|
+
/** Wall-clock cap. Analysts SHOULD honor `ctx.deadlineMs`. */
|
|
1283
|
+
timeoutMs?: number;
|
|
1284
|
+
/** Abort signal — forwarded into every analyst's context. */
|
|
1285
|
+
signal?: AbortSignal;
|
|
1286
|
+
/** Tags echoed into AnalystContext.tags — useful for tracking environment/version in findings. */
|
|
1287
|
+
tags?: Record<string, string>;
|
|
1288
|
+
}
|
|
1289
|
+
declare class AnalystRegistry {
|
|
1290
|
+
private readonly analysts;
|
|
1291
|
+
private readonly options;
|
|
1292
|
+
constructor(options?: AnalystRegistryOptions);
|
|
1293
|
+
register(analyst: Analyst): void;
|
|
1294
|
+
list(): ReadonlyArray<{
|
|
1295
|
+
id: string;
|
|
1296
|
+
description: string;
|
|
1297
|
+
version: string;
|
|
1298
|
+
cost: Analyst['cost'];
|
|
1299
|
+
}>;
|
|
1300
|
+
run(runId: string, inputs: AnalystRunInputs, runOpts?: RegistryRunOpts): Promise<AnalystRunResult>;
|
|
1301
|
+
private selectAnalysts;
|
|
1302
|
+
private routeInput;
|
|
1303
|
+
}
|
|
1304
|
+
|
|
1305
|
+
/**
|
|
1306
|
+
* Automated pull request opener for the production loop.
|
|
1307
|
+
*
|
|
1308
|
+
* `runProductionLoop` produces a `promotedPrompt` string and a release
|
|
1309
|
+
* scorecard. To close the eval → prod → eval cycle the framework needs
|
|
1310
|
+
* to land that prompt as a reviewable code change. This module does
|
|
1311
|
+
* exactly that:
|
|
1312
|
+
*
|
|
1313
|
+
* 1. Stage a branch off `baseBranch`.
|
|
1314
|
+
* 2. Write each `fileChange` into the worktree.
|
|
1315
|
+
* 3. Commit + push.
|
|
1316
|
+
* 4. Open a PR via the GitHub API.
|
|
1317
|
+
*
|
|
1318
|
+
* Two transports ship in core:
|
|
1319
|
+
*
|
|
1320
|
+
* - `ghCliClient(opts)` — shells out to the `gh` CLI. No extra deps,
|
|
1321
|
+
* re-uses the developer machine's `gh auth` state, works with both
|
|
1322
|
+
* github.com and GitHub Enterprise. This is the recommended default.
|
|
1323
|
+
* - `httpGithubClient(opts)` — direct `fetch` against `api.github.com`
|
|
1324
|
+
* with a bearer token. Useful in CI where `gh` may not be installed.
|
|
1325
|
+
*
|
|
1326
|
+
* Both implement the small `AutoPrClient` interface, so tests substitute
|
|
1327
|
+
* a fake without spinning a process or network.
|
|
1328
|
+
*
|
|
1329
|
+
* @experimental — surface may evolve as consumers wire it into CI workflows.
|
|
1330
|
+
*/
|
|
1331
|
+
interface FileChange {
|
|
1332
|
+
/** Repo-relative path. Forward slashes; no `..`. */
|
|
1333
|
+
path: string;
|
|
1334
|
+
/** New file contents. UTF-8. */
|
|
1335
|
+
contents: string;
|
|
1336
|
+
/** Optional explanatory comment shown in the commit body. */
|
|
1337
|
+
rationale?: string;
|
|
1338
|
+
}
|
|
1339
|
+
interface RepoRef {
|
|
1340
|
+
owner: string;
|
|
1341
|
+
name: string;
|
|
1342
|
+
}
|
|
1343
|
+
interface ProposeAutomatedPullRequestInput {
|
|
1344
|
+
repo: RepoRef;
|
|
1345
|
+
/** Branch to base the PR on. Default `'main'`. */
|
|
1346
|
+
baseBranch?: string;
|
|
1347
|
+
/** New branch name. Use a prefix + a short stable id; no spaces. */
|
|
1348
|
+
branchName: string;
|
|
1349
|
+
fileChanges: FileChange[];
|
|
1350
|
+
title: string;
|
|
1351
|
+
body: string;
|
|
1352
|
+
/** Optional GitHub usernames to request review from. */
|
|
1353
|
+
reviewers?: string[];
|
|
1354
|
+
/** Optional labels to apply. */
|
|
1355
|
+
labels?: string[];
|
|
1356
|
+
/** Commit author name. Default: derived from the GitHub client. */
|
|
1357
|
+
authorName?: string;
|
|
1358
|
+
/** Commit author email. Default: derived from the GitHub client. */
|
|
1359
|
+
authorEmail?: string;
|
|
1360
|
+
/** Dry-run — do not push or open a PR; just return the would-be plan. */
|
|
1361
|
+
dryRun?: boolean;
|
|
1362
|
+
}
|
|
1363
|
+
interface ProposeAutomatedPullRequestResult {
|
|
1364
|
+
prUrl: string;
|
|
1365
|
+
branchName: string;
|
|
1366
|
+
headSha: string;
|
|
1367
|
+
dryRun: boolean;
|
|
1368
|
+
}
|
|
1369
|
+
/** Pluggable transport for the auto-PR pipeline. */
|
|
1370
|
+
interface AutoPrClient {
|
|
1371
|
+
/**
|
|
1372
|
+
* Create a branch from `baseBranch`, write file changes, commit, push,
|
|
1373
|
+
* and open a PR. Returns the PR's HTML url and head SHA.
|
|
1374
|
+
*
|
|
1375
|
+
* Implementations must be idempotent on `branchName`: if the branch
|
|
1376
|
+
* already exists with the same head SHA as the would-be commit, return
|
|
1377
|
+
* the existing PR rather than failing. This makes the production loop
|
|
1378
|
+
* safe to retry on transient errors.
|
|
1379
|
+
*/
|
|
1380
|
+
proposeChange(input: ProposeAutomatedPullRequestInput): Promise<ProposeAutomatedPullRequestResult>;
|
|
1381
|
+
}
|
|
1382
|
+
declare function proposeAutomatedPullRequest(client: AutoPrClient, input: ProposeAutomatedPullRequestInput): Promise<ProposeAutomatedPullRequestResult>;
|
|
1383
|
+
interface HttpGithubClientOptions {
|
|
1384
|
+
/** Personal access token, GitHub App token, or `GITHUB_TOKEN` from Actions. */
|
|
1385
|
+
token: string;
|
|
1386
|
+
/** Override for GitHub Enterprise. Default `'https://api.github.com'`. */
|
|
1387
|
+
apiBase?: string;
|
|
1388
|
+
/** Test seam — defaults to global `fetch`. */
|
|
1389
|
+
fetchImpl?: typeof fetch;
|
|
1390
|
+
/** Test seam — clock for commit timestamps. */
|
|
1391
|
+
now?: () => Date;
|
|
1392
|
+
}
|
|
1393
|
+
/**
|
|
1394
|
+
* Direct REST-API GitHub client. No external deps.
|
|
1395
|
+
*
|
|
1396
|
+
* Idempotency strategy: before creating refs/commits/PRs, check whether
|
|
1397
|
+
* the branch already exists at the desired tree. If so, return the
|
|
1398
|
+
* existing PR (or open one if missing). Errors from concurrent runs
|
|
1399
|
+
* (`Reference already exists`) are caught and treated as success.
|
|
1400
|
+
*/
|
|
1401
|
+
declare function httpGithubClient(opts: HttpGithubClientOptions): AutoPrClient;
|
|
1402
|
+
interface GhCliClientOptions {
|
|
1403
|
+
/** Override the CLI binary (`gh`). For testing. */
|
|
1404
|
+
bin?: string;
|
|
1405
|
+
/** Working directory containing a clone of `repo`. Default: process cwd. */
|
|
1406
|
+
cwd?: string;
|
|
1407
|
+
/** Test seam: process spawner. Default: node:child_process spawn. */
|
|
1408
|
+
exec?: (bin: string, args: string[], opts: {
|
|
1409
|
+
cwd: string;
|
|
1410
|
+
stdin?: string;
|
|
1411
|
+
}) => Promise<{
|
|
1412
|
+
stdout: string;
|
|
1413
|
+
stderr: string;
|
|
1414
|
+
exitCode: number;
|
|
1415
|
+
}>;
|
|
419
1416
|
}
|
|
1417
|
+
/**
|
|
1418
|
+
* `gh` CLI transport. Requires:
|
|
1419
|
+
* - `gh` installed and authenticated (`gh auth status`).
|
|
1420
|
+
* - A local clone of the repo with a clean working tree.
|
|
1421
|
+
* - `git` on PATH.
|
|
1422
|
+
*
|
|
1423
|
+
* Uses `gh api` for repo metadata and `gh pr create` for the PR. The
|
|
1424
|
+
* actual commit lands via `git`, which keeps `gh`'s footprint minimal.
|
|
1425
|
+
*/
|
|
1426
|
+
declare function ghCliClient(opts?: GhCliClientOptions): AutoPrClient;
|
|
420
1427
|
|
|
421
1428
|
/**
|
|
422
1429
|
* BenchmarkRunner — orchestrates scenarios, executor, judges, and scoring.
|
|
@@ -747,8 +1754,7 @@ declare class MetricsCollector {
|
|
|
747
1754
|
* primitive is idempotent + replayable: re-running with the same
|
|
748
1755
|
* `runId` will produce the same plan.
|
|
749
1756
|
*
|
|
750
|
-
* @experimental —
|
|
751
|
-
* agents wire it in.
|
|
1757
|
+
* @experimental — surface may evolve as product agents wire it in.
|
|
752
1758
|
*/
|
|
753
1759
|
|
|
754
1760
|
interface FailureClusterConfig {
|
|
@@ -998,6 +2004,78 @@ declare function wilcoxonSignedRank(before: number[], after: number[]): {
|
|
|
998
2004
|
* Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
|
|
999
2005
|
*/
|
|
1000
2006
|
declare function cohensD(a: number[], b: number[]): number;
|
|
2007
|
+
interface CorpusScoreRecord {
|
|
2008
|
+
/** Stable identifier for the rated item (scenario, span, turn, …). */
|
|
2009
|
+
itemId: string;
|
|
2010
|
+
/** Identifier for the judge that produced this score. */
|
|
2011
|
+
judgeName: string;
|
|
2012
|
+
/** Dimension name (matches `JudgeScore.dimension`). */
|
|
2013
|
+
dimension: string;
|
|
2014
|
+
/** Numeric score; must be finite. */
|
|
2015
|
+
score: number;
|
|
2016
|
+
}
|
|
2017
|
+
interface CorpusAgreementPerDimension extends ContinuousAgreement {
|
|
2018
|
+
dimension: string;
|
|
2019
|
+
/** Item IDs that contributed to this dimension's matrix (every judge scored them). */
|
|
2020
|
+
itemIds: string[];
|
|
2021
|
+
/** Judge IDs that contributed to this dimension's matrix. */
|
|
2022
|
+
judgeIds: string[];
|
|
2023
|
+
}
|
|
2024
|
+
interface CorpusAgreementReport {
|
|
2025
|
+
/** Per-dimension ICC(2,1) + κ_w + Pearson + Spearman + bootstrap CIs. */
|
|
2026
|
+
perDimension: CorpusAgreementPerDimension[];
|
|
2027
|
+
/** Mean ICC across dimensions (NaN if no dimension yielded a finite ICC). */
|
|
2028
|
+
overallIcc: number;
|
|
2029
|
+
/** Mean weighted κ across dimensions (NaN if none finite). */
|
|
2030
|
+
overallWeightedKappa: number;
|
|
2031
|
+
/** Dimensions evaluated (sorted). */
|
|
2032
|
+
dimensions: string[];
|
|
2033
|
+
/** Judges seen across the corpus (sorted). */
|
|
2034
|
+
judgeIds: string[];
|
|
2035
|
+
}
|
|
2036
|
+
interface CorpusAgreementOptions extends ContinuousAgreementOptions {
|
|
2037
|
+
/**
|
|
2038
|
+
* Restrict the audit to these dimensions. Default = every dimension
|
|
2039
|
+
* that appears in the input. A dimension named here but absent from
|
|
2040
|
+
* the input throws — silent omission would corrupt the overall metric.
|
|
2041
|
+
*/
|
|
2042
|
+
dimensions?: string[];
|
|
2043
|
+
/**
|
|
2044
|
+
* Restrict the audit to these judges. Default = every judge that
|
|
2045
|
+
* appears in the input. A judge named here but absent from a
|
|
2046
|
+
* dimension throws (see the "Fail-loud contract" in the `corpusInterRaterAgreement` docs).
|
|
2047
|
+
*/
|
|
2048
|
+
judges?: string[];
|
|
2049
|
+
}
|
|
2050
|
+
/**
|
|
2051
|
+
* Corpus-wide inter-rater agreement across N items × M judges × D dimensions.
|
|
2052
|
+
*
|
|
2053
|
+
* For each dimension, builds the [n_items][n_judges] matrix of scores
|
|
2054
|
+
* (keeping only items every judge rated on that dimension), then runs
|
|
2055
|
+
* `continuousAgreement` to get ICC(2,1), κ_w, Pearson, Spearman, and
|
|
2056
|
+
* bootstrap CIs. Reports a pooled mean across dimensions as a single
|
|
2057
|
+
* "is this judge panel reliable on this corpus?" number.
|
|
2058
|
+
*
|
|
2059
|
+
* Fail-loud contract:
|
|
2060
|
+
* - Empty input throws.
|
|
2061
|
+
* - Fewer than 2 judges or fewer than 2 items per dimension throws.
|
|
2062
|
+
* - A judge present in some dimensions but with zero scored items on
|
|
2063
|
+
* another dimension throws (would silently shrink the matrix).
|
|
2064
|
+
* - Duplicate (itemId, judgeName, dimension) records throw.
|
|
2065
|
+
*/
|
|
2066
|
+
declare function corpusInterRaterAgreement(records: CorpusScoreRecord[], opts?: CorpusAgreementOptions): CorpusAgreementReport;
|
|
2067
|
+
/**
|
|
2068
|
+
* Convenience adapter for `JudgeScore[]` data keyed externally by item.
|
|
2069
|
+
*
|
|
2070
|
+
* Use when you have per-item arrays of `JudgeScore[]` (e.g. one
|
|
2071
|
+
* `ScenarioResult.judgeScores` per scenario) and want corpus-wide
|
|
2072
|
+
* agreement without manually flattening. `itemId` must be unique per
|
|
2073
|
+
* row of `itemsScores`.
|
|
2074
|
+
*/
|
|
2075
|
+
declare function corpusInterRaterAgreementFromJudgeScores(itemsScores: Array<{
|
|
2076
|
+
itemId: string;
|
|
2077
|
+
scores: JudgeScore[];
|
|
2078
|
+
}>, opts?: CorpusAgreementOptions): CorpusAgreementReport;
|
|
1001
2079
|
|
|
1002
2080
|
/**
|
|
1003
2081
|
* Anti-slop quality judge.
|
|
@@ -1241,58 +2319,6 @@ declare class DualAgentBench {
|
|
|
1241
2319
|
run(config: DualAgentBenchConfig): Promise<DualAgentReport>;
|
|
1242
2320
|
}
|
|
1243
2321
|
|
|
1244
|
-
interface RunScore {
|
|
1245
|
-
success: number;
|
|
1246
|
-
goalProgress: number;
|
|
1247
|
-
repoGroundedness: number;
|
|
1248
|
-
driftPenalty: number;
|
|
1249
|
-
toolUseQuality: number;
|
|
1250
|
-
patchQuality: number;
|
|
1251
|
-
testReality: number;
|
|
1252
|
-
finalGate: number;
|
|
1253
|
-
reviewerBlockers: number;
|
|
1254
|
-
costUsd: number;
|
|
1255
|
-
wallSeconds: number;
|
|
1256
|
-
notes?: string[];
|
|
1257
|
-
}
|
|
1258
|
-
interface RunScoreWeights {
|
|
1259
|
-
success: number;
|
|
1260
|
-
goalProgress: number;
|
|
1261
|
-
repoGroundedness: number;
|
|
1262
|
-
driftPenalty: number;
|
|
1263
|
-
toolUseQuality: number;
|
|
1264
|
-
patchQuality: number;
|
|
1265
|
-
testReality: number;
|
|
1266
|
-
finalGate: number;
|
|
1267
|
-
reviewerBlockers: number;
|
|
1268
|
-
costUsd: number;
|
|
1269
|
-
wallSeconds: number;
|
|
1270
|
-
}
|
|
1271
|
-
declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
|
|
1272
|
-
declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
|
|
1273
|
-
declare function clamp01(value: number): number;
|
|
1274
|
-
|
|
1275
|
-
interface RunTrace {
|
|
1276
|
-
run: Run$1;
|
|
1277
|
-
spans: Span[];
|
|
1278
|
-
events: TraceEvent[];
|
|
1279
|
-
artifacts: Artifact$1[];
|
|
1280
|
-
budget: BudgetLedgerEntry[];
|
|
1281
|
-
}
|
|
1282
|
-
interface RunCriticOptions {
|
|
1283
|
-
weights?: Partial<RunScoreWeights>;
|
|
1284
|
-
driftPatterns?: RegExp[];
|
|
1285
|
-
}
|
|
1286
|
-
declare class RunCritic {
|
|
1287
|
-
private readonly weights?;
|
|
1288
|
-
private readonly driftPatterns;
|
|
1289
|
-
constructor(options?: RunCriticOptions);
|
|
1290
|
-
score(store: TraceStore, runId: string): Promise<RunScore>;
|
|
1291
|
-
scoreTrace(trace: RunTrace): RunScore;
|
|
1292
|
-
rank(score: RunScore): number;
|
|
1293
|
-
private isDrift;
|
|
1294
|
-
}
|
|
1295
|
-
|
|
1296
2322
|
interface HostedJudgeDimension {
|
|
1297
2323
|
name: string;
|
|
1298
2324
|
weight: number;
|
|
@@ -2623,9 +3649,9 @@ declare function paraphraseRobustnessScenarios(args: ParaphraseRobustnessScenari
|
|
|
2623
3649
|
* `bonferroni(pValues, alpha)` correct for multiple pairwise tests
|
|
2624
3650
|
* so pairwise variant comparisons stay statistically honest.
|
|
2625
3651
|
*
|
|
2626
|
-
*
|
|
2627
|
-
*
|
|
2628
|
-
*
|
|
3652
|
+
* Applying alpha directly across n*(n-1)/2 pairwise tests without
|
|
3653
|
+
* correction inflates the false-positive rate when variants ≥ 3 — the
|
|
3654
|
+
* BH and Bonferroni helpers prevent that.
|
|
2629
3655
|
*/
|
|
2630
3656
|
/**
|
|
2631
3657
|
* Required N per arm for a two-sample comparison at target effect size,
|
|
@@ -2940,12 +3966,9 @@ interface HypothesisManifest {
|
|
|
2940
3966
|
* Identifier for the hashing scheme used to produce `contentHash`.
|
|
2941
3967
|
*
|
|
2942
3968
|
* `'sha256-content'` — sha256 hex over the canonicalized manifest with
|
|
2943
|
-
* the `contentHash` and `algo` fields stripped.
|
|
2944
|
-
*
|
|
2945
|
-
*
|
|
2946
|
-
* Held as a string union so future schemes can be added without
|
|
2947
|
-
* breaking parsers; legacy SignedManifest values written before this
|
|
2948
|
-
* field existed will deserialize cleanly because the field is optional.
|
|
3969
|
+
* the `contentHash` and `algo` fields stripped. Held as a string union
|
|
3970
|
+
* so future schemes can be added without breaking parsers; SignedManifest
|
|
3971
|
+
* values without `algo` deserialize cleanly because the field is optional.
|
|
2949
3972
|
*/
|
|
2950
3973
|
type SignedManifestAlgo = 'sha256-content';
|
|
2951
3974
|
interface SignedManifest extends HypothesisManifest {
|
|
@@ -2954,10 +3977,10 @@ interface SignedManifest extends HypothesisManifest {
|
|
|
2954
3977
|
/**
|
|
2955
3978
|
* Algorithm string describing how `contentHash` was produced.
|
|
2956
3979
|
*
|
|
2957
|
-
* Optional on the type so
|
|
2958
|
-
*
|
|
2959
|
-
*
|
|
2960
|
-
*
|
|
3980
|
+
* Optional on the type so serialized manifests without it still parse,
|
|
3981
|
+
* but ALWAYS populated by {@link signManifest}. Consumers that want to
|
|
3982
|
+
* enforce a known algorithm should reject manifests where this field
|
|
3983
|
+
* is missing or unrecognized.
|
|
2961
3984
|
*/
|
|
2962
3985
|
algo?: SignedManifestAlgo;
|
|
2963
3986
|
}
|
|
@@ -2996,10 +4019,9 @@ declare function canonicalize(v: unknown): unknown;
|
|
|
2996
4019
|
* - encoder choice (UTF-8 via TextEncoder, fixed)
|
|
2997
4020
|
* - runtime (uses the Web Crypto subtle digest, present in Node ≥18 and browsers)
|
|
2998
4021
|
*
|
|
2999
|
-
*
|
|
3000
|
-
*
|
|
3001
|
-
*
|
|
3002
|
-
* coexist; `hashJson` is the right name when you mean "canonicalize then hash."
|
|
4022
|
+
* Named `hashJson` to disambiguate from `prompt-registry.ts`'s `hashContent`,
|
|
4023
|
+
* which takes a string input and returns a truncated 12-char prompt id.
|
|
4024
|
+
* Use `hashJson` when you mean "canonicalize then hash."
|
|
3003
4025
|
*
|
|
3004
4026
|
* @example
|
|
3005
4027
|
* const hash = await hashJson({ id: '1', kind: 'spec' })
|
|
@@ -3012,17 +4034,15 @@ declare function hashJson<T>(obj: T): Promise<string>;
|
|
|
3012
4034
|
* The hash covers the canonicalized manifest with the `contentHash`
|
|
3013
4035
|
* and `algo` fields stripped; this lets verifiers re-sign the rest and
|
|
3014
4036
|
* compare. Returned manifest always carries `algo: 'sha256-content'`
|
|
3015
|
-
* so downstream consumers can identify the scheme;
|
|
3016
|
-
* manifests without `algo` still verify because it is stripped before
|
|
3017
|
-
* hashing on both sides.
|
|
4037
|
+
* so downstream consumers can identify the scheme; manifests without
|
|
4038
|
+
* `algo` still verify because it is stripped before hashing on both sides.
|
|
3018
4039
|
*/
|
|
3019
4040
|
declare function signManifest(m: HypothesisManifest): Promise<SignedManifest>;
|
|
3020
4041
|
/**
|
|
3021
4042
|
* Verify that a signed manifest has not been tampered with.
|
|
3022
4043
|
*
|
|
3023
|
-
* Strips `contentHash` and `algo` before re-signing so
|
|
3024
|
-
*
|
|
3025
|
-
* ones.
|
|
4044
|
+
* Strips `contentHash` and `algo` before re-signing so manifests without
|
|
4045
|
+
* `algo` verify identically to ones that carry it.
|
|
3026
4046
|
*/
|
|
3027
4047
|
declare function verifyManifest(m: SignedManifest): Promise<boolean>;
|
|
3028
4048
|
/**
|
|
@@ -3334,9 +4354,8 @@ declare const localCommandRunner: CommandRunner;
|
|
|
3334
4354
|
* - artifact dir contains an entry point (index.html for static SPAs,
|
|
3335
4355
|
* equivalent per framework family)
|
|
3336
4356
|
*
|
|
3337
|
-
*
|
|
3338
|
-
*
|
|
3339
|
-
* runner factory.
|
|
4357
|
+
* Ships with a canonical `vite` runner. Additional runners
|
|
4358
|
+
* (wrangler-deploy --dry-run, next-build, etc.) plug in as factories.
|
|
3340
4359
|
*/
|
|
3341
4360
|
|
|
3342
4361
|
type DeployFamily = 'frontend-static' | 'nextjs' | 'remix' | 'fullstack-ts';
|
|
@@ -3508,10 +4527,8 @@ declare function extractErrorCount(text: string, opts?: ExtractOptions): Extract
|
|
|
3508
4527
|
* - test: in-memory mock that returns canned step outcomes
|
|
3509
4528
|
* - future: Playwright, Puppeteer, custom scrapers
|
|
3510
4529
|
*
|
|
3511
|
-
*
|
|
3512
|
-
*
|
|
3513
|
-
* blind spot. Intent-match catches "wrong app entirely"; flow catches
|
|
3514
|
-
* "right app but the buttons don't work."
|
|
4530
|
+
* Paired with {@link runIntentMatchJudge}: intent-match catches "wrong
|
|
4531
|
+
* app entirely"; flow-layer catches "right app but the buttons don't work."
|
|
3515
4532
|
*/
|
|
3516
4533
|
|
|
3517
4534
|
type FlowAction = 'navigate' | 'click' | 'fill' | 'expect-text' | 'expect-element' | 'expect-url' | 'wait';
|
|
@@ -3607,10 +4624,6 @@ declare function flowLayer<Env extends FlowLayerEnv = FlowLayerEnv>(input: FlowL
|
|
|
3607
4624
|
*
|
|
3608
4625
|
* Soft-fails on LLM/JSON error (`available: false`) so callers can
|
|
3609
4626
|
* treat failure as "judge skipped."
|
|
3610
|
-
*
|
|
3611
|
-
* Added in 0.11 to replace the lying `completenessScore: 1` field that
|
|
3612
|
-
* VerticalBench shipped pre-Gen-48 — that field was keyword-driven and
|
|
3613
|
-
* fired true on builds with zero spec concepts implemented.
|
|
3614
4627
|
*/
|
|
3615
4628
|
|
|
3616
4629
|
declare const INTENT_MATCH_JUDGE_VERSION = "intent-match-judge-v1-2026-04-24";
|
|
@@ -4120,141 +5133,6 @@ declare function buildReviewerPrompt(input: ReviewerPromptInput): {
|
|
|
4120
5133
|
*/
|
|
4121
5134
|
declare function createDefaultReviewer(options: CreateDefaultReviewerOptions): (input: ReviewerPromptInput) => Promise<ReviewerOutput>;
|
|
4122
5135
|
|
|
4123
|
-
/**
|
|
4124
|
-
* Semantic concept judge — "does the built artifact actually implement
|
|
4125
|
-
* the features the user asked for?"
|
|
4126
|
-
*
|
|
4127
|
-
* Distinct from the domain/code/coherence judges in `judges.ts`:
|
|
4128
|
-
* - those judges score free-form conversational agent outputs along
|
|
4129
|
-
* quality dimensions (accuracy, depth, etc.)
|
|
4130
|
-
* - this judge scores a *built artifact* (served HTML + source files)
|
|
4131
|
-
* against an explicit list of expected concepts, returning per-concept
|
|
4132
|
-
* {present, score 0-10, evidence, severity}.
|
|
4133
|
-
*
|
|
4134
|
-
* The judge is strict about distinguishing (a) a working implementation
|
|
4135
|
-
* from (b) a keyword-present stub. "// TODO: mint button" is NOT present.
|
|
4136
|
-
* Only real, functional, wired-up code counts.
|
|
4137
|
-
*
|
|
4138
|
-
* Use via {@link createSemanticConceptJudge} or directly via
|
|
4139
|
-
* {@link runSemanticConceptJudge}. Soft-fails (available=false) on LLM
|
|
4140
|
-
* or JSON-parse errors so the caller can treat that as "layer skipped"
|
|
4141
|
-
* rather than "layer failed" in a multi-layer pipeline.
|
|
4142
|
-
*/
|
|
4143
|
-
|
|
4144
|
-
/**
|
|
4145
|
-
* Implementation complexity class for weighted scoring (added 0.11).
|
|
4146
|
-
*
|
|
4147
|
-
* - `render` (default): the concept is a UI surface that displays static
|
|
4148
|
-
* data — render a list, show a counter, lay out a button. Single-file
|
|
4149
|
-
* work, no external integration.
|
|
4150
|
-
* - `integrate`: the concept requires wiring a real external system —
|
|
4151
|
-
* wallet connect (wagmi + RainbowKit + chain config), payment provider
|
|
4152
|
-
* (Stripe Elements + intent + webhook), an API client with auth.
|
|
4153
|
-
* Multi-file, library-knowledge, runtime correctness matters.
|
|
4154
|
-
* - `compute`: the concept requires algorithmic work — solver, simulator,
|
|
4155
|
-
* constraint propagation, ML inference. Correctness > UI polish.
|
|
4156
|
-
*
|
|
4157
|
-
* Default weights (when applied via `weightConcepts: 'complexity'`):
|
|
4158
|
-
* render=1.0, integrate=2.0, compute=2.5
|
|
4159
|
-
*
|
|
4160
|
-
* Cross-vertical scoring without complexity weighting silently inflates
|
|
4161
|
-
* the rate of UI-heavy verticals (healthcare, fintech dashboards) vs
|
|
4162
|
-
* integration-heavy verticals (DeFi, wallets) — all concepts treated
|
|
4163
|
-
* equally even though the agent does 2-3x the work for `integrate`.
|
|
4164
|
-
*/
|
|
4165
|
-
type ConceptComplexity = 'render' | 'integrate' | 'compute';
|
|
4166
|
-
interface ConceptSpec {
|
|
4167
|
-
name: string;
|
|
4168
|
-
/** Short hints that help the judge; not used for matching. */
|
|
4169
|
-
keywords?: string[];
|
|
4170
|
-
/** Optional explicit weight; default 1.0. Overrides complexity-derived weight. */
|
|
4171
|
-
weight?: number;
|
|
4172
|
-
/** Implementation complexity class. Default `render`. */
|
|
4173
|
-
complexity?: ConceptComplexity;
|
|
4174
|
-
}
|
|
4175
|
-
interface ConceptFinding {
|
|
4176
|
-
concept: string;
|
|
4177
|
-
present: boolean;
|
|
4178
|
-
/** 0..10. 10 = production-ready; 7 = functional thin; 4 = partial; 0 = absent. */
|
|
4179
|
-
score: number;
|
|
4180
|
-
evidence: string;
|
|
4181
|
-
severity: Severity;
|
|
4182
|
-
}
|
|
4183
|
-
interface SemanticConceptJudgeInput {
|
|
4184
|
-
/** Full natural-language prompt the agent was handed. */
|
|
4185
|
-
userRequest: string;
|
|
4186
|
-
/** Rendered HTML the preview returns (UI artifacts). Optional. */
|
|
4187
|
-
servedHtml?: string;
|
|
4188
|
-
/** Top-level source files from the agent's workdir. */
|
|
4189
|
-
sourceFiles: Array<{
|
|
4190
|
-
path: string;
|
|
4191
|
-
content: string;
|
|
4192
|
-
}>;
|
|
4193
|
-
/** The expected concept list. */
|
|
4194
|
-
expectedConcepts: ConceptSpec[];
|
|
4195
|
-
/** Free-form metadata (id, difficulty) to inject into the prompt. */
|
|
4196
|
-
artifactLabel?: string;
|
|
4197
|
-
artifactDescription?: string;
|
|
4198
|
-
}
|
|
4199
|
-
interface SemanticConceptJudgeResult {
|
|
4200
|
-
kind: 'semantic-concept';
|
|
4201
|
-
version: string;
|
|
4202
|
-
/** Normalized 0..1 score — mean of per-concept scores / 10. */
|
|
4203
|
-
score: number;
|
|
4204
|
-
presentCount: number;
|
|
4205
|
-
totalCount: number;
|
|
4206
|
-
findings: ConceptFinding[];
|
|
4207
|
-
summary: string;
|
|
4208
|
-
durationMs: number;
|
|
4209
|
-
costUsd: number | null;
|
|
4210
|
-
/** False on LLM/JSON error — treat as "skipped / unable to judge" in pipelines. */
|
|
4211
|
-
available: boolean;
|
|
4212
|
-
error?: string;
|
|
4213
|
-
}
|
|
4214
|
-
/**
|
|
4215
|
-
* Score-aggregation strategy. Default `mean` (legacy behavior — 0.10
|
|
4216
|
-
* and earlier always averaged 0-10 scores). `complexity` applies the
|
|
4217
|
-
* default weight table (render=1, integrate=2, compute=2.5) unless a
|
|
4218
|
-
* concept has an explicit `weight`. `explicit` honors only `weight`
|
|
4219
|
-
* (defaulting to 1 for unspecified).
|
|
4220
|
-
*/
|
|
4221
|
-
type ConceptWeightStrategy = 'mean' | 'complexity' | 'explicit';
|
|
4222
|
-
declare const DEFAULT_COMPLEXITY_WEIGHTS: Record<ConceptComplexity, number>;
|
|
4223
|
-
interface SemanticConceptJudgeOptions {
|
|
4224
|
-
/** Model id to call. Default 'claude-sonnet-4-6' via agent-eval defaults. */
|
|
4225
|
-
model?: string;
|
|
4226
|
-
/** Per-call timeout. Default 180s. */
|
|
4227
|
-
timeoutMs?: number;
|
|
4228
|
-
/** Pipeline budget for the prompt (source blob truncation). Default 45000. */
|
|
4229
|
-
maxSourceChars?: number;
|
|
4230
|
-
/** Per-file cap before inclusion. Default 20000. */
|
|
4231
|
-
maxPerFileChars?: number;
|
|
4232
|
-
/** HTML cap. Default 30000. */
|
|
4233
|
-
maxHtmlChars?: number;
|
|
4234
|
-
/** LlmClient config (baseUrl, apiKey, authHeader, …). */
|
|
4235
|
-
llm?: LlmClientOptions;
|
|
4236
|
-
/**
|
|
4237
|
-
* Score aggregation strategy. Default `mean` for backward compatibility
|
|
4238
|
-
* with 0.10 and earlier callers. Cross-vertical comparisons should use
|
|
4239
|
-
* `complexity` to neutralize the integrate-vs-render asymmetry.
|
|
4240
|
-
*/
|
|
4241
|
-
weightConcepts?: ConceptWeightStrategy;
|
|
4242
|
-
/** Override the default complexity → weight table. */
|
|
4243
|
-
complexityWeights?: Partial<Record<ConceptComplexity, number>>;
|
|
4244
|
-
}
|
|
4245
|
-
declare const SEMANTIC_CONCEPT_JUDGE_VERSION = "semantic-concept-judge-v1-2026-04-24";
|
|
4246
|
-
/**
|
|
4247
|
-
* Run the semantic concept judge. Soft-fails to available=false on
|
|
4248
|
-
* LLM/JSON errors — callers in a MultiLayerVerifier pipeline can treat
|
|
4249
|
-
* that as "skip" rather than "fail."
|
|
4250
|
-
*/
|
|
4251
|
-
declare function runSemanticConceptJudge(input: SemanticConceptJudgeInput, options?: SemanticConceptJudgeOptions): Promise<SemanticConceptJudgeResult>;
|
|
4252
|
-
/**
|
|
4253
|
-
* Factory: pin LLM options once, return a closure that accepts inputs.
|
|
4254
|
-
* Convenient for pipelines that want to share a single LlmClient config.
|
|
4255
|
-
*/
|
|
4256
|
-
declare function createSemanticConceptJudge(options?: SemanticConceptJudgeOptions): (input: SemanticConceptJudgeInput) => Promise<SemanticConceptJudgeResult>;
|
|
4257
|
-
|
|
4258
5136
|
/**
|
|
4259
5137
|
* Liveness canaries — cheap statistical checks that catch the failure
|
|
4260
5138
|
* modes a green test suite never sees.
|
|
@@ -4458,9 +5336,8 @@ interface LineageNode {
|
|
|
4458
5336
|
}
|
|
4459
5337
|
/**
|
|
4460
5338
|
* `kindOf` decides whether a variant is a seed (no parent), code mutation,
|
|
4461
|
-
* or prompt mutation. Default looks at `variant.payload.codeMutation`
|
|
4462
|
-
*
|
|
4463
|
-
* accept any payload that mirrors it. Override by passing your own.
|
|
5339
|
+
* or prompt mutation. Default looks at `variant.payload.codeMutation` and
|
|
5340
|
+
* accepts any payload that exposes that field; override by passing your own.
|
|
4464
5341
|
*/
|
|
4465
5342
|
type LineageKindResolver<P> = (variant: EvolvableVariant<P>) => LineageKind;
|
|
4466
5343
|
/**
|
|
@@ -4707,9 +5584,8 @@ declare function createSandboxCodeMutator<T, P>(opts: CreateSandboxCodeMutatorOp
|
|
|
4707
5584
|
* Δ improvement (auto-detect when prompt evolution has
|
|
4708
5585
|
* hit a structural ceiling).
|
|
4709
5586
|
*
|
|
4710
|
-
* Naming is generic
|
|
4711
|
-
*
|
|
4712
|
-
* primitive doesn't care what each mutator actually does.
|
|
5587
|
+
* Naming is generic — the canonical use cases are "prompt" and "code"
|
|
5588
|
+
* channels, but the primitive doesn't care what each mutator actually does.
|
|
4713
5589
|
*/
|
|
4714
5590
|
|
|
4715
5591
|
type CompositePolicy = 'primary-only' | 'secondary-only' | 'alternate' | 'plateau';
|
|
@@ -4754,25 +5630,15 @@ declare class Mutex {
|
|
|
4754
5630
|
}
|
|
4755
5631
|
|
|
4756
5632
|
/**
|
|
4757
|
-
*
|
|
4758
|
-
*
|
|
4759
|
-
*
|
|
4760
|
-
*
|
|
4761
|
-
* rewrite added 10+ new personas, those personas existed on disk but the
|
|
4762
|
-
* evolve runner never loaded them — the new rubric dims (audit_defendability,
|
|
4763
|
-
* intake_discipline, etc) got no training signal. The personas were
|
|
4764
|
-
* cosmetic, the rewrites partially uninformed.
|
|
4765
|
-
*
|
|
4766
|
-
* `discoverPersonas` walks a personas directory and returns every persona
|
|
4767
|
-
* file matching the convention. Consumers can filter by include/exclude
|
|
4768
|
-
* patterns. Default behavior — discover everything — eliminates the
|
|
4769
|
-
* "forgot to add the new persona to the list" failure mode.
|
|
5633
|
+
* Walk a personas directory and return every file matching the convention
|
|
5634
|
+
* `NN-slug.{yaml,yml,json,md}`. Sorted by filename so the numeric prefix
|
|
5635
|
+
* gives stable persona ordering for reproducibility. Consumers filter
|
|
5636
|
+
* through `include` / `exclude`.
|
|
4770
5637
|
*/
|
|
4771
5638
|
interface DiscoverPersonasOptions {
|
|
4772
5639
|
/**
|
|
4773
5640
|
* Regex applied to filenames. Files that don't match are skipped.
|
|
4774
|
-
* Default: `^[0-9]{2}-.+\.(yaml|yml|json|md)$` (the convention used
|
|
4775
|
-
* across legal/gtm/tax/creative: `NN-slug.yaml`).
|
|
5641
|
+
* Default: `^[0-9]{2}-.+\.(yaml|yml|json|md)$`.
|
|
4776
5642
|
*/
|
|
4777
5643
|
pattern?: RegExp;
|
|
4778
5644
|
/**
|
|
@@ -4782,14 +5648,10 @@ interface DiscoverPersonasOptions {
|
|
|
4782
5648
|
exclude?: readonly string[];
|
|
4783
5649
|
/**
|
|
4784
5650
|
* If set, return only personas whose basename contains one of these
|
|
4785
|
-
* substrings (post-pattern filter).
|
|
4786
|
-
* flag — consumers pass through.
|
|
5651
|
+
* substrings (post-pattern filter).
|
|
4787
5652
|
*/
|
|
4788
5653
|
include?: readonly string[];
|
|
4789
|
-
/**
|
|
4790
|
-
* Recurse into subdirectories. Default false (legal/gtm/tax/creative all
|
|
4791
|
-
* store personas flat).
|
|
4792
|
-
*/
|
|
5654
|
+
/** Recurse into subdirectories. Default false. */
|
|
4793
5655
|
recursive?: boolean;
|
|
4794
5656
|
}
|
|
4795
5657
|
interface DiscoveredPersona {
|
|
@@ -4800,14 +5662,6 @@ interface DiscoveredPersona {
|
|
|
4800
5662
|
/** Filename without extension — the conventional persona id. */
|
|
4801
5663
|
id: string;
|
|
4802
5664
|
}
|
|
4803
|
-
/**
|
|
4804
|
-
* Walk `dir` and return every persona file matching the convention. Async
|
|
4805
|
-
* because the consumer almost always wants this to be I/O-driven (so a new
|
|
4806
|
-
* persona added on disk is picked up without a code change).
|
|
4807
|
-
*
|
|
4808
|
-
* Sorted by filename (which gives stable persona id order via the `NN-`
|
|
4809
|
-
* numeric prefix convention) for reproducibility.
|
|
4810
|
-
*/
|
|
4811
5665
|
declare function discoverPersonas(dir: string, opts?: DiscoverPersonasOptions): Promise<DiscoveredPersona[]>;
|
|
4812
5666
|
|
|
4813
5667
|
/**
|
|
@@ -4914,43 +5768,17 @@ declare class JsonlTrialCache implements TrialCache {
|
|
|
4914
5768
|
}
|
|
4915
5769
|
|
|
4916
5770
|
/**
|
|
4917
|
-
*
|
|
4918
|
-
*
|
|
4919
|
-
*
|
|
4920
|
-
*
|
|
4921
|
-
*
|
|
4922
|
-
* zero into the mean, silently corrupting the score. Today's tax/gtm evals
|
|
4923
|
-
* had `judge=0` across every trial — the prompt rewrites couldn't be
|
|
4924
|
-
* evaluated honestly because the measurement instrument was broken.
|
|
4925
|
-
*
|
|
4926
|
-
* `withJudgeRetry` is the substrate fix. It wraps a single judge invocation
|
|
4927
|
-
* with:
|
|
4928
|
-
*
|
|
4929
|
-
* 1. N retry attempts on transient failures (abort, timeout, network).
|
|
4930
|
-
* 2. Optional fallback-model rotation — try the next model in the list
|
|
4931
|
-
* if the primary keeps aborting (a verbose new prompt may stream-abort
|
|
4932
|
-
* on claude-code/sonnet but succeed on kimi-code/k2p6).
|
|
4933
|
-
* 3. Exponential backoff between attempts.
|
|
4934
|
-
* 4. A typed outcome `{ succeeded, attempts, value, error }` that callers
|
|
4935
|
-
* MUST decide what to do with. No silent zero.
|
|
4936
|
-
*
|
|
4937
|
-
* The reporting contract: callers ship `TrialResult.judgeSucceeded = succeeded`
|
|
4938
|
-
* and `TrialResult.judgeAttempts = attempts`. `aggregateTrials({mode: 'exclude-failed'})`
|
|
4939
|
-
* then skips failed-judge trials when computing composites.
|
|
4940
|
-
*
|
|
4941
|
-
* The library does NOT decide what score to record on failure — that's the
|
|
4942
|
-
* caller's product choice. Today's product agents (legal/gtm/tax/creative)
|
|
4943
|
-
* should set `score: NaN` + `judgeSucceeded: false` + `error: ...` so the
|
|
4944
|
-
* aggregator's exclude-failed mode drops the trial. Defaulting to 0 is what
|
|
4945
|
-
* caused today's data corruption.
|
|
4946
|
-
*/
|
|
4947
|
-
/**
|
|
4948
|
-
* Retry policy for judge LLM calls.
|
|
5771
|
+
* Wrap a single judge LLM call with retry, optional fallback-model
|
|
5772
|
+
* rotation, exponential backoff, and a typed `JudgeRetryOutcome`. Callers
|
|
5773
|
+
* MUST inspect `succeeded` before using `value`; on failure the library
|
|
5774
|
+
* returns `value: null` rather than substituting a default, so a judge
|
|
5775
|
+
* abort cannot silently corrupt a downstream composite.
|
|
4949
5776
|
*
|
|
4950
|
-
*
|
|
4951
|
-
*
|
|
4952
|
-
*
|
|
5777
|
+
* Reporting contract: callers ship `TrialResult.judgeSucceeded = succeeded`
|
|
5778
|
+
* and `TrialResult.judgeAttempts = attempts` so `aggregateTrialsByMode`
|
|
5779
|
+
* with `mode: 'exclude-failed'` drops the trial.
|
|
4953
5780
|
*/
|
|
5781
|
+
/** Retry policy for judge LLM calls. */
|
|
4954
5782
|
interface JudgeRetryPolicy {
|
|
4955
5783
|
/** Max attempts per model. Default 3 (one initial + two retries). */
|
|
4956
5784
|
maxAttempts?: number;
|
|
@@ -5003,8 +5831,8 @@ interface JudgeRetryOutcome<T> {
|
|
|
5003
5831
|
* to their underlying fetch/SDK call so the abort actually fires.
|
|
5004
5832
|
*
|
|
5005
5833
|
* Returns a typed outcome — callers MUST inspect `succeeded` before using
|
|
5006
|
-
* `value`. The library refuses to default to a silent zero score because
|
|
5007
|
-
* is
|
|
5834
|
+
* `value`. The library refuses to default to a silent zero score because a
|
|
5835
|
+
* synthetic zero is indistinguishable from a real low score downstream.
|
|
5008
5836
|
*/
|
|
5009
5837
|
declare function withJudgeRetry<T>(judgeFn: (model: string, signal: AbortSignal) => Promise<T>, policy?: JudgeRetryPolicy): Promise<JudgeRetryOutcome<T>>;
|
|
5010
5838
|
|
|
@@ -5070,42 +5898,30 @@ declare function referenceReplayRunsToSteeringRows<Input = unknown>(runs: Refere
|
|
|
5070
5898
|
declare function referenceReplayScenarioToRunScore(scenarioScore: ReferenceReplayScenarioScore, durationMs?: number): RunScore;
|
|
5071
5899
|
|
|
5072
5900
|
/**
|
|
5073
|
-
*
|
|
5074
|
-
*
|
|
5075
|
-
* The prompt-evolution loop's internal `aggregateTrials` defaulted to
|
|
5076
|
-
* including every non-`error` trial in the mean — which corrupted the mean
|
|
5077
|
-
* when a trial had `score: 0` because the judge silently aborted (the
|
|
5078
|
-
* caller's try/catch swallowed the abort and returned zero). Today's
|
|
5079
|
-
* tax/gtm evals show this: every trial scored judge=0 because the judge
|
|
5080
|
-
* aborted, and the composite then reflected `structural * 0.3 + slop * 0.1`
|
|
5081
|
-
* instead of the intended `judge * 0.6 + structural * 0.3 + slop * 0.1`.
|
|
5901
|
+
* Aggregate trials with explicit handling of judge failure. Three modes:
|
|
5082
5902
|
*
|
|
5083
|
-
* `
|
|
5903
|
+
* - `strict-fail` — any `judgeSucceeded === false` trial fails the whole
|
|
5904
|
+
* aggregate. Use for production gates: one corrupt trial halts the gate.
|
|
5084
5905
|
*
|
|
5085
|
-
* - `
|
|
5086
|
-
*
|
|
5087
|
-
* trial means "we don't know if the prompt is good, halt the gate."
|
|
5906
|
+
* - `exclude-failed` — drop `judgeSucceeded === false` trials from the
|
|
5907
|
+
* mean; report `excludedFailedTrials` separately. Recommended for new code.
|
|
5088
5908
|
*
|
|
5089
|
-
* - `
|
|
5090
|
-
*
|
|
5091
|
-
* comparison runs where you want to use the signal that DID land.
|
|
5092
|
-
* Default for new code.
|
|
5909
|
+
* - `zero-fill` — failed trials count as `score: 0` in the mean. Available
|
|
5910
|
+
* only for adapters that don't yet set `judgeSucceeded`.
|
|
5093
5911
|
*
|
|
5094
|
-
*
|
|
5095
|
-
*
|
|
5096
|
-
* don't yet set `judgeSucceeded`. Migrate off this — it's the source
|
|
5097
|
-
* of today's data corruption.
|
|
5912
|
+
* Hard-errored trials (`t.error` set) are always excluded — those are
|
|
5913
|
+
* infrastructure failures, not eval signal.
|
|
5098
5914
|
*/
|
|
5099
5915
|
|
|
5100
5916
|
type AggregatorMode = 'strict-fail' | 'exclude-failed' | 'zero-fill';
|
|
5101
5917
|
interface TrialAggregate {
|
|
5102
5918
|
/** Mean score over the trials counted by the chosen mode. */
|
|
5103
5919
|
meanScore: number;
|
|
5104
|
-
/** Mean cost. */
|
|
5920
|
+
/** Mean cost across counted trials. */
|
|
5105
5921
|
meanCost: number;
|
|
5106
|
-
/** Mean wall time. */
|
|
5922
|
+
/** Mean wall time across counted trials. */
|
|
5107
5923
|
meanDurationMs: number;
|
|
5108
|
-
/** ok rate. */
|
|
5924
|
+
/** Fraction of counted trials with `ok === true`. */
|
|
5109
5925
|
okRate: number;
|
|
5110
5926
|
/** Trials counted in the mean (mode-dependent). */
|
|
5111
5927
|
countedTrials: number;
|
|
@@ -5125,13 +5941,8 @@ interface TrialAggregate {
|
|
|
5125
5941
|
firstError?: string;
|
|
5126
5942
|
};
|
|
5127
5943
|
}
|
|
5128
|
-
/**
|
|
5129
|
-
* Aggregate trials with explicit failed-judge handling. Returns counts for
|
|
5130
|
-
* counted + excluded so callers can surface "the score is based on 7 of 10
|
|
5131
|
-
* trials; 3 judges failed" instead of silently weighting zero.
|
|
5132
|
-
*/
|
|
5133
5944
|
declare function aggregateTrialsByMode(trials: TrialResult[], opts: {
|
|
5134
5945
|
mode: AggregatorMode;
|
|
5135
5946
|
}): TrialAggregate;
|
|
5136
5947
|
|
|
5137
|
-
export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AggregatorMode, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, DatasetScenario, type DeployFamily, type 
DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DirEntry, type DiscoverPersonasOptions, type DiscoveredPersona, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeConfig, type 
JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, type JudgeRubric, JudgeRunner, type JudgeScore, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, ProductClient, type ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type 
ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type 
SynthesisTarget, type TestResult, type ThresholdContract, TokenCounter, type TokenSpec, TraceEmitter, TraceEvent, TraceStore, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, buildReviewerPrompt, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, confidenceInterval, containsAll, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, 
inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, linterJudge, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedMean, weightedRecall, whitespaceCollapseMutator, wilcoxonSignedRank, withJudgeRetry, wranglerDeployRunner };
|
|
5948
|
+
export { ANALYST_SEVERITIES, ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AggregatorMode, type AlignmentOp, type Analyst, type AnalystContext, type AnalystCost, type AnalystFinding, type AnalystHooks, type AnalystInputKind, AnalystRegistry, type AnalystRegistryOptions, type AnalystRequirements, type AnalystRunInputs, type AnalystRunResult, type AnalystRunSummary, type AnalystSeverity, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, type BudgetPolicy, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type ChatCallOpts, type ChatClient, type ChatRequest, type ChatResponse, type ChatTransport, type CheckResult, type CliBridgeTransportOpts, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, ContinuousAgreement, ContinuousAgreementOptions, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CorpusAgreementOptions, type CorpusAgreementPerDimension, type CorpusAgreementReport, type CorpusScoreRecord, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type 
CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateChatClientOpts, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DirEntry, type DirectProviderTransportOpts, type DiscoverPersonasOptions, type DiscoveredPersona, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvalResult, type EvidenceRef, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type 
HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAdapterOpts, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, type JudgeRubric, JudgeRunner, type JudgeScore, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallRequest, LlmCallResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MockTransportOpts, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type 
Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersistedFinding, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, ProductClient, type ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegistryRunOpts, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RouterTransportOpts, type RubricDimension, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, 
type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxSdkTransportOpts, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, 
adversarialJudge, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, computeFindingId, confidenceInterval, containsAll, corpusInterRaterAgreement, corpusInterRaterAgreementFromJudgeScores, createAntiSlopJudge, createChatClient, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createRunCriticAdapter, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorerFromGrader, localCommandRunner, 
lowercaseMutator, makeFinding, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, parseRawFinding, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedMean, weightedRecall, whitespaceCollapseMutator, wilcoxonSignedRank, withJudgeRetry, wranglerDeployRunner };
|