@tangle-network/agent-eval 0.25.0 → 0.27.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +145 -0
- package/README.md +5 -5
- package/dist/builder-eval/index.js +1 -1
- package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
- package/dist/{chunk-K2TPS5LB.js → chunk-4U4BKCXK.js} +2 -2
- package/dist/chunk-4U4BKCXK.js.map +1 -0
- package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
- package/dist/chunk-5AKPEK5L.js.map +1 -0
- package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
- package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
- package/dist/chunk-K33INZHH.js.map +1 -0
- package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
- package/dist/chunk-MAZ26DC7.js.map +1 -0
- package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
- package/dist/chunk-NCRFYPS3.js.map +1 -0
- package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
- package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
- package/dist/chunk-QHF6EQKK.js.map +1 -0
- package/dist/chunk-R5UQJNKC.js +722 -0
- package/dist/chunk-R5UQJNKC.js.map +1 -0
- package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
- package/dist/chunk-RUI6SIHY.js.map +1 -0
- package/dist/{chunk-EDUKQ5AM.js → chunk-SZSBQUIJ.js} +2 -2
- package/dist/chunk-SZSBQUIJ.js.map +1 -0
- package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
- package/dist/chunk-VSMTAMNK.js.map +1 -0
- package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
- package/dist/chunk-XFZCM5Z3.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/{control-CBShYYA6.d.ts → control-BT4qnXiS.d.ts} +2 -2
- package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BZ_lVLYW.d.ts} +1 -0
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -2
- package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-D1aGKusy.d.ts} +1 -1
- package/dist/governance/index.d.ts +1 -1
- package/dist/{index-Oj9fAPPN.d.ts → index-BhLlu-qO.d.ts} +63 -2
- package/dist/index.d.ts +279 -72
- package/dist/index.js +222 -136
- package/dist/index.js.map +1 -1
- package/dist/knowledge/index.d.ts +1 -1
- package/dist/knowledge/index.js +2 -2
- package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-U-c8ge1k.d.ts} +1 -1
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +5 -5
- package/dist/optimization.js +5 -5
- package/dist/pipelines/index.d.ts +1 -1
- package/dist/pipelines/index.js +2 -2
- package/dist/{release-report-BNgMdqPF.d.ts → release-report-CCQqnK46.d.ts} +1 -1
- package/dist/{replay-BL96gCEP.d.ts → replay-D7z0J43-.d.ts} +4 -5
- package/dist/reporting.d.ts +4 -4
- package/dist/reporting.js +5 -5
- package/dist/{researcher-BPT8x_NT.d.ts → researcher-G81CWc0q.d.ts} +9 -10
- package/dist/rl.d.ts +26 -44
- package/dist/rl.js +5 -5
- package/dist/rl.js.map +1 -1
- package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
- package/dist/{summary-report-C7VPYEj2.d.ts → summary-report-Dl4akLKX.d.ts} +13 -1
- package/dist/traces.d.ts +1 -1
- package/dist/traces.js +2 -2
- package/dist/wire/index.d.ts +2 -2
- package/dist/wire/index.js +1 -1
- package/docs/concepts.md +11 -0
- package/docs/research-report-methodology.md +4 -4
- package/docs/three-package-architecture.md +12 -24
- package/package.json +1 -1
- package/dist/chunk-2A5XJB43.js.map +0 -1
- package/dist/chunk-4F5DQN55.js.map +0 -1
- package/dist/chunk-5LBB5B3Z.js.map +0 -1
- package/dist/chunk-EDUKQ5AM.js.map +0 -1
- package/dist/chunk-I4MBDTY5.js +0 -272
- package/dist/chunk-I4MBDTY5.js.map +0 -1
- package/dist/chunk-JLZQWFV3.js.map +0 -1
- package/dist/chunk-K2TPS5LB.js.map +0 -1
- package/dist/chunk-LSH4MMOZ.js.map +0 -1
- package/dist/chunk-NU65VQ7M.js.map +0 -1
- package/dist/chunk-OWLAAMME.js.map +0 -1
- package/dist/chunk-SESZDQPX.js.map +0 -1
- /package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
- /package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
- /package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0
package/dist/index.d.ts
CHANGED
|
@@ -1,28 +1,30 @@
|
|
|
1
|
-
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-CBShYYA6.js';
|
|
1
|
+
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-BT4qnXiS.js';
|
|
2
2
|
import { TCloud } from '@tangle-network/tcloud';
|
|
3
|
-
import { C as ControlEvalResult } from './control-runtime-BuJHoLg0.js';
|
|
4
|
-
export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BuJHoLg0.js';
|
|
3
|
+
import { C as ControlEvalResult } from './control-runtime-BZ_lVLYW.js';
|
|
4
|
+
export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
|
|
5
5
|
import { A as AgentEvalError } from './errors-BZ9sTdz7.js';
|
|
6
6
|
export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-BZ9sTdz7.js';
|
|
7
|
-
import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-DfFdrraJ.js';
|
|
8
|
-
export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-DfFdrraJ.js';
|
|
9
|
-
import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-C7VPYEj2.js';
|
|
10
|
-
export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-C7VPYEj2.js';
|
|
7
|
+
import { b as FeedbackLabel, n as FeedbackTrajectoryStore, l as FeedbackTrajectory } from './feedback-trajectory-D1aGKusy.js';
|
|
8
|
+
export { F as FeedbackArtifactType, a as FeedbackAttempt, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, m as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-D1aGKusy.js';
|
|
9
|
+
import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-Dl4akLKX.js';
|
|
10
|
+
export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-Dl4akLKX.js';
|
|
11
11
|
export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
|
|
12
|
-
import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-BNgMdqPF.js';
|
|
13
|
-
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-BNgMdqPF.js';
|
|
14
|
-
import { a as FailureCluster } from './failure-cluster-C2EGSDiT.js';
|
|
15
|
-
export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-C2EGSDiT.js';
|
|
12
|
+
import { i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard } from './release-report-CCQqnK46.js';
|
|
13
|
+
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as bhAdjust, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as pairedWilcoxon, s as releaseTraceEvidenceFromMultiShotTrials, t as renderReleaseReport } from './release-report-CCQqnK46.js';
|
|
14
|
+
import { a as FailureCluster } from './failure-cluster-Cw65_5FY.js';
|
|
15
|
+
export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-Cw65_5FY.js';
|
|
16
16
|
import { a as RunSplitTag, R as RunRecord } from './run-record-CqzahIbx.js';
|
|
17
17
|
export { c as RunJudgeMetadata, d as RunOutcome, e as RunRecordValidationError, b as RunTokenUsage, i as isRunRecord, p as parseRunRecordSafe, r as roundTripRunRecord, v as validateRunRecord } from './run-record-CqzahIbx.js';
|
|
18
18
|
import { T as TraceStore, R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, g as BudgetSpec, h as RunFilter, L as LlmSpan } from './store-Db2Bv8Cf.js';
|
|
19
19
|
export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, M as Message, e as RetrievalSpan, m as RunLayer, n as RunStatus, f as SandboxSpan, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
|
|
20
|
+
import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './index-BhLlu-qO.js';
|
|
21
|
+
export { b as CalibrationResult, c as CandidateScore, d as ContinuousCalibrationResult, D as DEFAULT_RED_TEAM_CORPUS, E as EuRiskClass, G as GoldenItem, e as GovernanceContext, f as GovernanceFinding, g as GovernanceReport, P as PositionalBiasResult, R as RedTeamCase, h as RedTeamCategory, i as RedTeamFinding, j as RedTeamPayload, k as RedTeamReport, S as SelfPreferenceResult, U as UseCaseSignals, V as VerbosityBiasResult, l as calibrateJudge, m as calibrateJudgeContinuous, n as classifyEuAiRisk, o as continuousAgreement, p as euAiActReport, q as nistAiRmfReport, r as positionalBias, s as redTeamDataset, t as redTeamReport, u as renderMarkdown, v as scoreRedTeamOutput, w as selfPreference, x as soc2Report, y as summarize, z as toolNamesForRun, A as verbosityBias } from './index-BhLlu-qO.js';
|
|
20
22
|
import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-B2kWEdh9.js';
|
|
21
23
|
export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-B2kWEdh9.js';
|
|
22
24
|
import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
|
|
23
25
|
export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DP_cSSiw.js';
|
|
24
26
|
export { F as FileSystemRawProviderSink, d as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, e as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, f as RawProviderDirection, c as RawProviderEvent, R as RawProviderSink, g as RawProviderSinkFilter, h as RunIntegrityError, a as RunIntegrityExpectations, i as RunIntegrityIssue, j as RunIntegrityIssueCode, b as RunIntegrityReport, k as assertRunCaptured, l as defaultProviderRedactor, p as providerFromBaseUrl, t as throwIfRunIncomplete } from './integrity-DK2EBVZC.js';
|
|
25
|
-
export { D as DEFAULT_REDACTION_RULES, O as OTEL_AGENT_EVAL_SCOPE, a as OtlpExport, b as OtlpResourceSpans, c as OtlpSpan, R as REDACTION_VERSION, d as RedactionReport, e as RedactionRule, f as ReplayCache, g as ReplayCacheEntry, h as ReplayCacheMissError, i as ReplayCacheStats, j as ReplayFetchOptions, k as createReplayFetch, l as exportRunAsOtlp, m as iterateRawCalls, r as redactString, n as redactValue } from './replay-BL96gCEP.js';
|
|
27
|
+
export { D as DEFAULT_REDACTION_RULES, O as OTEL_AGENT_EVAL_SCOPE, a as OtlpExport, b as OtlpResourceSpans, c as OtlpSpan, R as REDACTION_VERSION, d as RedactionReport, e as RedactionRule, f as ReplayCache, g as ReplayCacheEntry, h as ReplayCacheMissError, i as ReplayCacheStats, j as ReplayFetchOptions, k as createReplayFetch, l as exportRunAsOtlp, m as iterateRawCalls, r as redactString, n as redactValue } from './replay-D7z0J43-.js';
|
|
26
28
|
export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
|
|
27
29
|
import { a as BaselineReport } from './baseline-4R5deP0N.js';
|
|
28
30
|
export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-4R5deP0N.js';
|
|
@@ -30,14 +32,13 @@ import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
|
|
|
30
32
|
export { b as buildTrajectory } from './trajectory-CnoBo-JY.js';
|
|
31
33
|
import { a as DatasetScenario, c as Dataset } from './dataset-CiK_3LDr.js';
|
|
32
34
|
export { d as DatasetDifficulty, b as DatasetManifest, e as DatasetProvenance, D as DatasetSplit, H as HoldoutLockedError, S as SliceOptions, h as hashScenarios } from './dataset-CiK_3LDr.js';
|
|
33
|
-
export { C as CalibrationResult, a as CandidateScore, D as DEFAULT_RED_TEAM_CORPUS, E as EuRiskClass, G as GoldenItem, b as GovernanceContext, c as GovernanceFinding, d as GovernanceReport, P as PositionalBiasResult, R as RedTeamCase, e as RedTeamCategory, f as RedTeamFinding, g as RedTeamPayload, h as RedTeamReport, S as SelfPreferenceResult, U as UseCaseSignals, V as VerbosityBiasResult, i as calibrateJudge, j as classifyEuAiRisk, k as euAiActReport, n as nistAiRmfReport, p as positionalBias, r as redTeamDataset, l as redTeamReport, m as renderMarkdown, s as scoreRedTeamOutput, o as selfPreference, q as soc2Report, t as summarize, u as toolNamesForRun, v as verbosityBias } from './index-Oj9fAPPN.js';
|
|
34
35
|
import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
|
|
35
|
-
import { L as Layer, S as Severity, a as LayerResult, b as VerifyContext } from './multi-layer-verifier-LkP3LVKj.js';
|
|
36
|
-
export { F as Finding, c as LayerStatus, M as MultiLayerVerifier, V as VerificationReport, d as VerifyOptions, g as gradeSemanticStatus } from './multi-layer-verifier-LkP3LVKj.js';
|
|
37
|
-
import { L as LlmClientOptions } from './researcher-BPT8x_NT.js';
|
|
38
|
-
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, m as LlmCallError, n as LlmCallRequest, o as LlmCallResult, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './researcher-BPT8x_NT.js';
|
|
36
|
+
import { L as Layer, S as Severity, a as LayerResult, b as VerifyContext } from './multi-layer-verifier-U-c8ge1k.js';
|
|
37
|
+
export { F as Finding, c as LayerStatus, M as MultiLayerVerifier, V as VerificationReport, d as VerifyOptions, g as gradeSemanticStatus } from './multi-layer-verifier-U-c8ge1k.js';
|
|
38
|
+
import { L as LlmClientOptions } from './researcher-G81CWc0q.js';
|
|
39
|
+
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, m as LlmCallError, n as LlmCallRequest, o as LlmCallResult, p as LlmClient, q as LlmMessage, s as LlmRouteAssertionError, t as LlmRouteRequirements, u as LlmUsage, N as NoopResearcher, R as Researcher, S as SteeringChange, v as assertLlmRoute, w as callLlm, x as callLlmJson, y as probeLlm, r as runEvalCampaign, z as stripFencedJson } from './researcher-G81CWc0q.js';
|
|
39
40
|
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index--fVrWDiR.js';
|
|
40
|
-
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-Dgz1n51-.js';
|
|
41
|
+
export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
|
|
41
42
|
import './outcome-store-D6KWmYvj.js';
|
|
42
43
|
|
|
43
44
|
/**
|
|
@@ -64,8 +65,7 @@ import './outcome-store-D6KWmYvj.js';
|
|
|
64
65
|
* Both implement the small `AutoPrClient` interface, so tests substitute
|
|
65
66
|
* a fake without spinning a process or network.
|
|
66
67
|
*
|
|
67
|
-
* @experimental —
|
|
68
|
-
* it into CI workflows.
|
|
68
|
+
* @experimental — surface may evolve as consumers wire it into CI workflows.
|
|
69
69
|
*/
|
|
70
70
|
interface FileChange {
|
|
71
71
|
/** Repo-relative path. Forward slashes; no `..`. */
|
|
@@ -747,8 +747,7 @@ declare class MetricsCollector {
|
|
|
747
747
|
* primitive is idempotent + replayable: re-running with the same
|
|
748
748
|
* `runId` will produce the same plan.
|
|
749
749
|
*
|
|
750
|
-
* @experimental —
|
|
751
|
-
* agents wire it in.
|
|
750
|
+
* @experimental — surface may evolve as product agents wire it in.
|
|
752
751
|
*/
|
|
753
752
|
|
|
754
753
|
interface FailureClusterConfig {
|
|
@@ -998,6 +997,78 @@ declare function wilcoxonSignedRank(before: number[], after: number[]): {
|
|
|
998
997
|
* Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.
|
|
999
998
|
*/
|
|
1000
999
|
declare function cohensD(a: number[], b: number[]): number;
|
|
1000
|
+
interface CorpusScoreRecord {
|
|
1001
|
+
/** Stable identifier for the rated item (scenario, span, turn, …). */
|
|
1002
|
+
itemId: string;
|
|
1003
|
+
/** Identifier for the judge that produced this score. */
|
|
1004
|
+
judgeName: string;
|
|
1005
|
+
/** Dimension name (matches `JudgeScore.dimension`). */
|
|
1006
|
+
dimension: string;
|
|
1007
|
+
/** Numeric score; must be finite. */
|
|
1008
|
+
score: number;
|
|
1009
|
+
}
|
|
1010
|
+
interface CorpusAgreementPerDimension extends ContinuousAgreement {
|
|
1011
|
+
dimension: string;
|
|
1012
|
+
/** Item IDs that contributed to this dimension's matrix (every judge scored them). */
|
|
1013
|
+
itemIds: string[];
|
|
1014
|
+
/** Judge IDs that contributed to this dimension's matrix. */
|
|
1015
|
+
judgeIds: string[];
|
|
1016
|
+
}
|
|
1017
|
+
interface CorpusAgreementReport {
|
|
1018
|
+
/** Per-dimension ICC(2,1) + κ_w + Pearson + Spearman + bootstrap CIs. */
|
|
1019
|
+
perDimension: CorpusAgreementPerDimension[];
|
|
1020
|
+
/** Mean ICC across dimensions (NaN if no dimension yielded a finite ICC). */
|
|
1021
|
+
overallIcc: number;
|
|
1022
|
+
/** Mean weighted κ across dimensions (NaN if none finite). */
|
|
1023
|
+
overallWeightedKappa: number;
|
|
1024
|
+
/** Dimensions evaluated (sorted). */
|
|
1025
|
+
dimensions: string[];
|
|
1026
|
+
/** Judges seen across the corpus (sorted). */
|
|
1027
|
+
judgeIds: string[];
|
|
1028
|
+
}
|
|
1029
|
+
interface CorpusAgreementOptions extends ContinuousAgreementOptions {
|
|
1030
|
+
/**
|
|
1031
|
+
* Restrict the audit to these dimensions. Default = every dimension
|
|
1032
|
+
* that appears in the input. A dimension named here but absent from
|
|
1033
|
+
* the input throws — silent omission would corrupt the overall metric.
|
|
1034
|
+
*/
|
|
1035
|
+
dimensions?: string[];
|
|
1036
|
+
/**
|
|
1037
|
+
* Restrict the audit to these judges. Default = every judge that
|
|
1038
|
+
* appears in the input. A judge named here but absent from a
|
|
1039
|
+
* dimension throws (see "fail loud" below).
|
|
1040
|
+
*/
|
|
1041
|
+
judges?: string[];
|
|
1042
|
+
}
|
|
1043
|
+
/**
|
|
1044
|
+
* Corpus-wide inter-rater agreement across N items × M judges × D dimensions.
|
|
1045
|
+
*
|
|
1046
|
+
* For each dimension, builds the [n_items][n_judges] matrix of scores
|
|
1047
|
+
* (keeping only items every judge rated on that dimension), then runs
|
|
1048
|
+
* `continuousAgreement` to get ICC(2,1), κ_w, Pearson, Spearman, and
|
|
1049
|
+
* bootstrap CIs. Reports a pooled mean across dimensions as a single
|
|
1050
|
+
* "is this judge panel reliable on this corpus?" number.
|
|
1051
|
+
*
|
|
1052
|
+
* Fail-loud contract:
|
|
1053
|
+
* - Empty input throws.
|
|
1054
|
+
* - Fewer than 2 judges or fewer than 2 items per dimension throws.
|
|
1055
|
+
* - A judge present in some dimensions but with zero scored items on
|
|
1056
|
+
* another dimension throws (would silently shrink the matrix).
|
|
1057
|
+
* - Duplicate (itemId, judgeName, dimension) records throw.
|
|
1058
|
+
*/
|
|
1059
|
+
declare function corpusInterRaterAgreement(records: CorpusScoreRecord[], opts?: CorpusAgreementOptions): CorpusAgreementReport;
|
|
1060
|
+
/**
|
|
1061
|
+
* Convenience adapter for `JudgeScore[]` data keyed externally by item.
|
|
1062
|
+
*
|
|
1063
|
+
* Use when you have per-item arrays of `JudgeScore[]` (e.g. one
|
|
1064
|
+
* `ScenarioResult.judgeScores` per scenario) and want corpus-wide
|
|
1065
|
+
* agreement without manually flattening. `itemId` must be unique per
|
|
1066
|
+
* row of `itemsScores`.
|
|
1067
|
+
*/
|
|
1068
|
+
declare function corpusInterRaterAgreementFromJudgeScores(itemsScores: Array<{
|
|
1069
|
+
itemId: string;
|
|
1070
|
+
scores: JudgeScore[];
|
|
1071
|
+
}>, opts?: CorpusAgreementOptions): CorpusAgreementReport;
|
|
1001
1072
|
|
|
1002
1073
|
/**
|
|
1003
1074
|
* Anti-slop quality judge.
|
|
@@ -2623,9 +2694,9 @@ declare function paraphraseRobustnessScenarios(args: ParaphraseRobustnessScenari
|
|
|
2623
2694
|
* `bonferroni(pValues, alpha)` correct for multiple pairwise tests
|
|
2624
2695
|
* so pairwise variant comparisons stay statistically honest.
|
|
2625
2696
|
*
|
|
2626
|
-
*
|
|
2627
|
-
*
|
|
2628
|
-
*
|
|
2697
|
+
* Applying alpha directly across n*(n-1)/2 pairwise tests without
|
|
2698
|
+
* correction inflates the false-positive rate when variants ≥ 3 — the
|
|
2699
|
+
* BH and Bonferroni helpers prevent that.
|
|
2629
2700
|
*/
|
|
2630
2701
|
/**
|
|
2631
2702
|
* Required N per arm for a two-sample comparison at target effect size,
|
|
@@ -2940,12 +3011,9 @@ interface HypothesisManifest {
|
|
|
2940
3011
|
* Identifier for the hashing scheme used to produce `contentHash`.
|
|
2941
3012
|
*
|
|
2942
3013
|
* `'sha256-content'` — sha256 hex over the canonicalized manifest with
|
|
2943
|
-
* the `contentHash` and `algo` fields stripped.
|
|
2944
|
-
*
|
|
2945
|
-
*
|
|
2946
|
-
* Held as a string union so future schemes can be added without
|
|
2947
|
-
* breaking parsers; legacy SignedManifest values written before this
|
|
2948
|
-
* field existed will deserialize cleanly because the field is optional.
|
|
3014
|
+
* the `contentHash` and `algo` fields stripped. Held as a string union
|
|
3015
|
+
* so future schemes can be added without breaking parsers; SignedManifest
|
|
3016
|
+
* values without `algo` deserialize cleanly because the field is optional.
|
|
2949
3017
|
*/
|
|
2950
3018
|
type SignedManifestAlgo = 'sha256-content';
|
|
2951
3019
|
interface SignedManifest extends HypothesisManifest {
|
|
@@ -2954,10 +3022,10 @@ interface SignedManifest extends HypothesisManifest {
|
|
|
2954
3022
|
/**
|
|
2955
3023
|
* Algorithm string describing how `contentHash` was produced.
|
|
2956
3024
|
*
|
|
2957
|
-
* Optional on the type so
|
|
2958
|
-
*
|
|
2959
|
-
*
|
|
2960
|
-
*
|
|
3025
|
+
* Optional on the type so serialized manifests without it still parse,
|
|
3026
|
+
* but ALWAYS populated by {@link signManifest}. Consumers that want to
|
|
3027
|
+
* enforce a known algorithm should reject manifests where this field
|
|
3028
|
+
* is missing or unrecognized.
|
|
2961
3029
|
*/
|
|
2962
3030
|
algo?: SignedManifestAlgo;
|
|
2963
3031
|
}
|
|
@@ -2996,10 +3064,9 @@ declare function canonicalize(v: unknown): unknown;
|
|
|
2996
3064
|
* - encoder choice (UTF-8 via TextEncoder, fixed)
|
|
2997
3065
|
* - runtime (uses the Web Crypto subtle digest, present in Node ≥18 and browsers)
|
|
2998
3066
|
*
|
|
2999
|
-
*
|
|
3000
|
-
*
|
|
3001
|
-
*
|
|
3002
|
-
* coexist; `hashJson` is the right name when you mean "canonicalize then hash."
|
|
3067
|
+
* Named `hashJson` to disambiguate from `prompt-registry.ts`'s `hashContent`,
|
|
3068
|
+
* which takes a string input and returns a truncated 12-char prompt id.
|
|
3069
|
+
* Use `hashJson` when you mean "canonicalize then hash."
|
|
3003
3070
|
*
|
|
3004
3071
|
* @example
|
|
3005
3072
|
* const hash = await hashJson({ id: '1', kind: 'spec' })
|
|
@@ -3012,17 +3079,15 @@ declare function hashJson<T>(obj: T): Promise<string>;
|
|
|
3012
3079
|
* The hash covers the canonicalized manifest with the `contentHash`
|
|
3013
3080
|
* and `algo` fields stripped; this lets verifiers re-sign the rest and
|
|
3014
3081
|
* compare. Returned manifest always carries `algo: 'sha256-content'`
|
|
3015
|
-
* so downstream consumers can identify the scheme;
|
|
3016
|
-
*
|
|
3017
|
-
* hashing on both sides.
|
|
3082
|
+
* so downstream consumers can identify the scheme; manifests without
|
|
3083
|
+
* `algo` still verify because it is stripped before hashing on both sides.
|
|
3018
3084
|
*/
|
|
3019
3085
|
declare function signManifest(m: HypothesisManifest): Promise<SignedManifest>;
|
|
3020
3086
|
/**
|
|
3021
3087
|
* Verify that a signed manifest has not been tampered with.
|
|
3022
3088
|
*
|
|
3023
|
-
* Strips `contentHash` and `algo` before re-signing so
|
|
3024
|
-
*
|
|
3025
|
-
* ones.
|
|
3089
|
+
* Strips `contentHash` and `algo` before re-signing so manifests without
|
|
3090
|
+
* `algo` verify identically to ones that carry it.
|
|
3026
3091
|
*/
|
|
3027
3092
|
declare function verifyManifest(m: SignedManifest): Promise<boolean>;
|
|
3028
3093
|
/**
|
|
@@ -3334,9 +3399,8 @@ declare const localCommandRunner: CommandRunner;
|
|
|
3334
3399
|
* - artifact dir contains an entry point (index.html for static SPAs,
|
|
3335
3400
|
* equivalent per framework family)
|
|
3336
3401
|
*
|
|
3337
|
-
*
|
|
3338
|
-
*
|
|
3339
|
-
* runner factory.
|
|
3402
|
+
* Ships with a canonical `vite` runner. Additional runners
|
|
3403
|
+
* (wrangler-deploy --dry-run, next-build, etc.) plug in as factories.
|
|
3340
3404
|
*/
|
|
3341
3405
|
|
|
3342
3406
|
type DeployFamily = 'frontend-static' | 'nextjs' | 'remix' | 'fullstack-ts';
|
|
@@ -3508,10 +3572,8 @@ declare function extractErrorCount(text: string, opts?: ExtractOptions): Extract
|
|
|
3508
3572
|
* - test: in-memory mock that returns canned step outcomes
|
|
3509
3573
|
* - future: Playwright, Puppeteer, custom scrapers
|
|
3510
3574
|
*
|
|
3511
|
-
*
|
|
3512
|
-
*
|
|
3513
|
-
* blind spot. Intent-match catches "wrong app entirely"; flow catches
|
|
3514
|
-
* "right app but the buttons don't work."
|
|
3575
|
+
* Paired with {@link runIntentMatchJudge}: intent-match catches "wrong
|
|
3576
|
+
* app entirely"; flow-layer catches "right app but the buttons don't work."
|
|
3515
3577
|
*/
|
|
3516
3578
|
|
|
3517
3579
|
type FlowAction = 'navigate' | 'click' | 'fill' | 'expect-text' | 'expect-element' | 'expect-url' | 'wait';
|
|
@@ -3607,10 +3669,6 @@ declare function flowLayer<Env extends FlowLayerEnv = FlowLayerEnv>(input: FlowL
|
|
|
3607
3669
|
*
|
|
3608
3670
|
* Soft-fails on LLM/JSON error (`available: false`) so callers can
|
|
3609
3671
|
* treat failure as "judge skipped."
|
|
3610
|
-
*
|
|
3611
|
-
* Added in 0.11 to replace the lying `completenessScore: 1` field that
|
|
3612
|
-
* VerticalBench shipped pre-Gen-48 — that field was keyword-driven and
|
|
3613
|
-
* fired true on builds with zero spec concepts implemented.
|
|
3614
3672
|
*/
|
|
3615
3673
|
|
|
3616
3674
|
declare const INTENT_MATCH_JUDGE_VERSION = "intent-match-judge-v1-2026-04-24";
|
|
@@ -4142,7 +4200,7 @@ declare function createDefaultReviewer(options: CreateDefaultReviewerOptions): (
|
|
|
4142
4200
|
*/
|
|
4143
4201
|
|
|
4144
4202
|
/**
|
|
4145
|
-
* Implementation complexity class for weighted scoring
|
|
4203
|
+
* Implementation complexity class for weighted scoring.
|
|
4146
4204
|
*
|
|
4147
4205
|
* - `render` (default): the concept is a UI surface that displays static
|
|
4148
4206
|
* data — render a list, show a counter, lay out a button. Single-file
|
|
@@ -4212,11 +4270,10 @@ interface SemanticConceptJudgeResult {
|
|
|
4212
4270
|
error?: string;
|
|
4213
4271
|
}
|
|
4214
4272
|
/**
|
|
4215
|
-
* Score-aggregation strategy.
|
|
4216
|
-
*
|
|
4217
|
-
*
|
|
4218
|
-
*
|
|
4219
|
-
* (defaulting to 1 for unspecified).
|
|
4273
|
+
* Score-aggregation strategy. `mean` averages 0-10 scores uniformly.
|
|
4274
|
+
* `complexity` applies the default weight table (render=1, integrate=2,
|
|
4275
|
+
* compute=2.5) unless a concept has an explicit `weight`. `explicit`
|
|
4276
|
+
* honors only `weight` (defaulting to 1 for unspecified).
|
|
4220
4277
|
*/
|
|
4221
4278
|
type ConceptWeightStrategy = 'mean' | 'complexity' | 'explicit';
|
|
4222
4279
|
declare const DEFAULT_COMPLEXITY_WEIGHTS: Record<ConceptComplexity, number>;
|
|
@@ -4234,9 +4291,9 @@ interface SemanticConceptJudgeOptions {
|
|
|
4234
4291
|
/** LlmClient config (baseUrl, apiKey, authHeader, …). */
|
|
4235
4292
|
llm?: LlmClientOptions;
|
|
4236
4293
|
/**
|
|
4237
|
-
* Score aggregation strategy. Default `mean`
|
|
4238
|
-
*
|
|
4239
|
-
*
|
|
4294
|
+
* Score aggregation strategy. Default `mean` — uniform average across
|
|
4295
|
+
* concepts. Cross-vertical comparisons should use `complexity` to
|
|
4296
|
+
* neutralize the integrate-vs-render asymmetry.
|
|
4240
4297
|
*/
|
|
4241
4298
|
weightConcepts?: ConceptWeightStrategy;
|
|
4242
4299
|
/** Override the default complexity → weight table. */
|
|
@@ -4458,9 +4515,8 @@ interface LineageNode {
|
|
|
4458
4515
|
}
|
|
4459
4516
|
/**
|
|
4460
4517
|
* `kindOf` decides whether a variant is a seed (no parent), code mutation,
|
|
4461
|
-
* or prompt mutation. Default looks at `variant.payload.codeMutation`
|
|
4462
|
-
*
|
|
4463
|
-
* accept any payload that mirrors it. Override by passing your own.
|
|
4518
|
+
* or prompt mutation. Default looks at `variant.payload.codeMutation` and
|
|
4519
|
+
* accepts any payload that exposes that field; override by passing your own.
|
|
4464
4520
|
*/
|
|
4465
4521
|
type LineageKindResolver<P> = (variant: EvolvableVariant<P>) => LineageKind;
|
|
4466
4522
|
/**
|
|
@@ -4707,9 +4763,8 @@ declare function createSandboxCodeMutator<T, P>(opts: CreateSandboxCodeMutatorOp
|
|
|
4707
4763
|
* Δ improvement (auto-detect when prompt evolution has
|
|
4708
4764
|
* hit a structural ceiling).
|
|
4709
4765
|
*
|
|
4710
|
-
* Naming is generic
|
|
4711
|
-
*
|
|
4712
|
-
* primitive doesn't care what each mutator actually does.
|
|
4766
|
+
* Naming is generic — the canonical use cases are "prompt" and "code"
|
|
4767
|
+
* channels, but the primitive doesn't care what each mutator actually does.
|
|
4713
4768
|
*/
|
|
4714
4769
|
|
|
4715
4770
|
type CompositePolicy = 'primary-only' | 'secondary-only' | 'alternate' | 'plateau';
|
|
@@ -4753,6 +4808,41 @@ declare class Mutex {
|
|
|
4753
4808
|
get pending(): number;
|
|
4754
4809
|
}
|
|
4755
4810
|
|
|
4811
|
+
/**
|
|
4812
|
+
* Walk a personas directory and return every file matching the convention
|
|
4813
|
+
* `NN-slug.{yaml,yml,json,md}`. Sorted by filename so the numeric prefix
|
|
4814
|
+
* gives stable persona ordering for reproducibility. Consumers filter
|
|
4815
|
+
* through `include` / `exclude`.
|
|
4816
|
+
*/
|
|
4817
|
+
interface DiscoverPersonasOptions {
|
|
4818
|
+
/**
|
|
4819
|
+
* Regex applied to filenames. Files that don't match are skipped.
|
|
4820
|
+
* Default: `^[0-9]{2}-.+\.(yaml|yml|json|md)$`.
|
|
4821
|
+
*/
|
|
4822
|
+
pattern?: RegExp;
|
|
4823
|
+
/**
|
|
4824
|
+
* Filenames (or basenames) to skip. Use this to exclude WIP / archived
|
|
4825
|
+
* personas without removing the file.
|
|
4826
|
+
*/
|
|
4827
|
+
exclude?: readonly string[];
|
|
4828
|
+
/**
|
|
4829
|
+
* If set, return only personas whose basename contains one of these
|
|
4830
|
+
* substrings (post-pattern filter).
|
|
4831
|
+
*/
|
|
4832
|
+
include?: readonly string[];
|
|
4833
|
+
/** Recurse into subdirectories. Default false. */
|
|
4834
|
+
recursive?: boolean;
|
|
4835
|
+
}
|
|
4836
|
+
interface DiscoveredPersona {
|
|
4837
|
+
/** Absolute file path. */
|
|
4838
|
+
path: string;
|
|
4839
|
+
/** Filename without directory. */
|
|
4840
|
+
filename: string;
|
|
4841
|
+
/** Filename without extension — the conventional persona id. */
|
|
4842
|
+
id: string;
|
|
4843
|
+
}
|
|
4844
|
+
declare function discoverPersonas(dir: string, opts?: DiscoverPersonasOptions): Promise<DiscoveredPersona[]>;
|
|
4845
|
+
|
|
4756
4846
|
/**
|
|
4757
4847
|
* GoldenMatcher — fuzzy matcher for "did the agent produce the expected things?".
|
|
4758
4848
|
*
|
|
@@ -4856,6 +4946,75 @@ declare class JsonlTrialCache implements TrialCache {
|
|
|
4856
4946
|
setSync(key: string, value: TrialResult): void;
|
|
4857
4947
|
}
|
|
4858
4948
|
|
|
4949
|
+
/**
|
|
4950
|
+
* Wrap a single judge LLM call with retry, optional fallback-model
|
|
4951
|
+
* rotation, exponential backoff, and a typed `JudgeRetryOutcome`. Callers
|
|
4952
|
+
* MUST inspect `succeeded` before using `value`; on failure the library
|
|
4953
|
+
* returns `value: null` rather than substituting a default, so a judge
|
|
4954
|
+
* abort cannot silently corrupt a downstream composite.
|
|
4955
|
+
*
|
|
4956
|
+
* Reporting contract: callers ship `TrialResult.judgeSucceeded = succeeded`
|
|
4957
|
+
* and `TrialResult.judgeAttempts = attempts` so `aggregateTrialsByMode`
|
|
4958
|
+
* with `mode: 'exclude-failed'` drops the trial.
|
|
4959
|
+
*/
|
|
4960
|
+
/** Retry policy for judge LLM calls. */
|
|
4961
|
+
interface JudgeRetryPolicy {
|
|
4962
|
+
/** Max attempts per model. Default 3 (one initial + two retries). */
|
|
4963
|
+
maxAttempts?: number;
|
|
4964
|
+
/** Per-attempt timeout in ms. Default 90_000 (1.5×agent-eval's 60s default). */
|
|
4965
|
+
timeoutMs?: number;
|
|
4966
|
+
/**
|
|
4967
|
+
* Models to try, in order. The first model is the primary; subsequent
|
|
4968
|
+
* models are fallbacks invoked only when ALL retries on the previous
|
|
4969
|
+
* model have been exhausted. Example: `['claude-code/sonnet', 'kimi-code/k2p6']`
|
|
4970
|
+
* runs claude-code up to maxAttempts times, then falls back to kimi.
|
|
4971
|
+
* If omitted, the caller's judge function controls model selection and
|
|
4972
|
+
* the retries apply to that single model.
|
|
4973
|
+
*/
|
|
4974
|
+
models?: readonly string[];
|
|
4975
|
+
/** Exponential backoff function, default `attempt → min(500 * 2^attempt, 16_000)`. */
|
|
4976
|
+
backoffMs?: (attempt: number) => number;
|
|
4977
|
+
/**
|
|
4978
|
+
* Predicate deciding whether an error should trigger a retry. Default
|
|
4979
|
+
* retries on: AbortError, TimeoutError, `fetch failed`, `ECONNRESET`,
|
|
4980
|
+
* `[This operation was aborted]`, and any LlmCallError with status in
|
|
4981
|
+
* {429, 502, 503, 504}. JSON-parse errors are NOT retriable (the model
|
|
4982
|
+
* needs prompt adjustment, not another shot).
|
|
4983
|
+
*/
|
|
4984
|
+
isRetryable?: (err: unknown) => boolean;
|
|
4985
|
+
}
|
|
4986
|
+
/** Outcome of a wrapped judge invocation. */
|
|
4987
|
+
interface JudgeRetryOutcome<T> {
|
|
4988
|
+
/** The judge's returned value when `succeeded === true`. */
|
|
4989
|
+
value: T | null;
|
|
4990
|
+
/** True iff one of the attempts completed without throwing. */
|
|
4991
|
+
succeeded: boolean;
|
|
4992
|
+
/** Total attempts made across all models. */
|
|
4993
|
+
attempts: number;
|
|
4994
|
+
/** Which model the successful attempt used (when succeeded). */
|
|
4995
|
+
modelUsed?: string;
|
|
4996
|
+
/** Last error captured when `succeeded === false`. */
|
|
4997
|
+
error?: Error;
|
|
4998
|
+
/** Per-attempt error log for forensics. */
|
|
4999
|
+
attemptErrors: Array<{
|
|
5000
|
+
attempt: number;
|
|
5001
|
+
model: string;
|
|
5002
|
+
error: string;
|
|
5003
|
+
}>;
|
|
5004
|
+
}
|
|
5005
|
+
/**
|
|
5006
|
+
* Wrap a judge call with retry + fallback-model + typed outcome semantics.
|
|
5007
|
+
*
|
|
5008
|
+
* The `judgeFn` signature is `(model: string, signal: AbortSignal) => Promise<T>`.
|
|
5009
|
+
* The signal will be aborted at `timeoutMs`. Callers should pass the signal
|
|
5010
|
+
* to their underlying fetch/SDK call so the abort actually fires.
|
|
5011
|
+
*
|
|
5012
|
+
* Returns a typed outcome — callers MUST inspect `succeeded` before using
|
|
5013
|
+
* `value`. The library refuses to default to a silent zero score because a
|
|
5014
|
+
* synthetic zero is indistinguishable from a real low score downstream.
|
|
5015
|
+
*/
|
|
5016
|
+
declare function withJudgeRetry<T>(judgeFn: (model: string, signal: AbortSignal) => Promise<T>, policy?: JudgeRetryPolicy): Promise<JudgeRetryOutcome<T>>;
|
|
5017
|
+
|
|
4859
5018
|
/**
|
|
4860
5019
|
* LockedJsonlAppender — mutex-serialized JSONL append helper for arbitrary
|
|
4861
5020
|
* payloads. The reference-replay store does the same thing for typed
|
|
@@ -4917,4 +5076,52 @@ interface ReferenceReplaySteeringRowsOptions<Input = unknown> {
|
|
|
4917
5076
|
declare function referenceReplayRunsToSteeringRows<Input = unknown>(runs: ReferenceReplayRun<Input>[], options?: ReferenceReplaySteeringRowsOptions<Input>): SteeringOptimizationRow[];
|
|
4918
5077
|
declare function referenceReplayScenarioToRunScore(scenarioScore: ReferenceReplayScenarioScore, durationMs?: number): RunScore;
|
|
4919
5078
|
|
|
4920
|
-
export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type 
DeployRunResult, type DeployRunner, type DirEntry, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgeReplayResult, type 
JudgeRubric, JudgeRunner, type JudgeScore, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, ProductClient, type ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type 
ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TestResult, type ThresholdContract, TokenCounter, type TokenSpec, TraceEmitter, TraceEvent, TraceStore, 
Trajectory, TrajectoryStep, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateRunScore, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, buildReviewerPrompt, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, confidenceInterval, containsAll, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, jsonHasKeys, 
jsonShape, jsonlReferenceReplayStore, keyPreserved, linterJudge, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedMean, weightedRecall, whitespaceCollapseMutator, wilcoxonSignedRank, wranglerDeployRunner };
|
|
5079
|
+
/**
|
|
5080
|
+
* Aggregate trials with explicit handling of judge failure. Three modes:
|
|
5081
|
+
*
|
|
5082
|
+
* - `strict-fail` — any `judgeSucceeded === false` trial fails the whole
|
|
5083
|
+
* aggregate. Use for production gates: one corrupt trial halts the gate.
|
|
5084
|
+
*
|
|
5085
|
+
* - `exclude-failed` — drop `judgeSucceeded === false` trials from the
|
|
5086
|
+
* mean; report `excludedFailedTrials` separately. Default for new code.
|
|
5087
|
+
*
|
|
5088
|
+
* - `zero-fill` — failed trials count as `score: 0` in the mean. Available
|
|
5089
|
+
* only for adapters that don't yet set `judgeSucceeded`.
|
|
5090
|
+
*
|
|
5091
|
+
* Hard-errored trials (`t.error` set) are always excluded — those are
|
|
5092
|
+
* infrastructure failures, not eval signal.
|
|
5093
|
+
*/
|
|
5094
|
+
|
|
5095
|
+
type AggregatorMode = 'strict-fail' | 'exclude-failed' | 'zero-fill';
|
|
5096
|
+
interface TrialAggregate {
|
|
5097
|
+
/** Mean score over the trials counted by the chosen mode. */
|
|
5098
|
+
meanScore: number;
|
|
5099
|
+
/** Mean cost across counted trials. */
|
|
5100
|
+
meanCost: number;
|
|
5101
|
+
/** Mean wall time across counted trials. */
|
|
5102
|
+
meanDurationMs: number;
|
|
5103
|
+
/** Fraction of counted trials with `ok === true`. */
|
|
5104
|
+
okRate: number;
|
|
5105
|
+
/** Trials counted in the mean (mode-dependent). */
|
|
5106
|
+
countedTrials: number;
|
|
5107
|
+
/** Trials excluded because `judgeSucceeded === false` (exclude-failed mode). */
|
|
5108
|
+
excludedFailedTrials: number;
|
|
5109
|
+
/** Total trials passed in. */
|
|
5110
|
+
totalTrials: number;
|
|
5111
|
+
/** Mean of every numeric metric across counted trials. */
|
|
5112
|
+
metrics: Record<string, number>;
|
|
5113
|
+
/**
|
|
5114
|
+
* Set when mode is `strict-fail` AND at least one trial had
|
|
5115
|
+
* `judgeSucceeded === false`. Caller should refuse to use this aggregate
|
|
5116
|
+
* downstream — the eval is corrupt.
|
|
5117
|
+
*/
|
|
5118
|
+
strictFailure?: {
|
|
5119
|
+
failedCount: number;
|
|
5120
|
+
firstError?: string;
|
|
5121
|
+
};
|
|
5122
|
+
}
|
|
5123
|
+
declare function aggregateTrialsByMode(trials: TrialResult[], opts: {
|
|
5124
|
+
mode: AggregatorMode;
|
|
5125
|
+
}): TrialAggregate;
|
|
5126
|
+
|
|
5127
|
+
export { ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AggregatorMode, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, ContinuousAgreement, ContinuousAgreementOptions, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CorpusAgreementOptions, type CorpusAgreementPerDimension, type CorpusAgreementReport, type CorpusScoreRecord, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, 
DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, DatasetScenario, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DirEntry, type DiscoverPersonasOptions, type DiscoveredPersona, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvalResult, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, type FeedbackPattern, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type 
IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, type JudgeRubric, JudgeRunner, type JudgeScore, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmClientOptions, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiShotMutateAdapter, MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, ProductClient, type ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, 
type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type 
SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TestResult, type ThresholdContract, TokenCounter, type TokenSpec, TraceEmitter, TraceEvent, TraceStore, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, VerifyContext, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, buildReviewerPrompt, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, confidenceInterval, containsAll, corpusInterRaterAgreement, corpusInterRaterAgreementFromJudgeScores, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, 
findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, interRaterReliability, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, linterJudge, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, normalizeScores, notBlocked, pairedTTest, paraphraseRobustness, paraphraseRobustnessScenarios, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, typoMutator, urlContains, verifyManifest, visualDiff, viteDeployRunner, weightedMean, weightedRecall, whitespaceCollapseMutator, wilcoxonSignedRank, withJudgeRetry, wranglerDeployRunner };
|