@tangle-network/agent-eval 0.41.0 → 0.42.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +48 -368
- package/dist/campaign/index.js +67 -1
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
- package/dist/chunk-H4TOS272.js.map +1 -0
- package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
- package/dist/chunk-KQ26DYTQ.js.map +1 -0
- package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
- package/dist/chunk-MNL6LXGQ.js.map +1 -0
- package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
- package/dist/chunk-N4SBKEPJ.js.map +1 -0
- package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
- package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
- package/dist/index.d.ts +227 -687
- package/dist/index.js +753 -1237
- package/dist/index.js.map +1 -1
- package/dist/integrity-CTDhR1Sg.d.ts +81 -0
- package/dist/llm-client-BXVRUZyX.d.ts +234 -0
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +67 -3
- package/dist/pipelines/index.js.map +1 -1
- package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
- package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
- package/dist/reporting.d.ts +2 -3
- package/dist/reporting.js +4 -8
- package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
- package/dist/rl.d.ts +103 -221
- package/dist/rl.js +44 -199
- package/dist/rl.js.map +1 -1
- package/dist/sequential-DdV5ShjT.d.ts +561 -0
- package/dist/traces.d.ts +3 -2
- package/dist/traces.js +5 -5
- package/dist/types-BLbRTxoc.d.ts +367 -0
- package/dist/wire/index.d.ts +1 -1
- package/package.json +1 -6
- package/dist/chunk-5U2DOJU4.js.map +0 -1
- package/dist/chunk-AU2JLNSZ.js.map +0 -1
- package/dist/chunk-DMW5VENN.js +0 -1412
- package/dist/chunk-DMW5VENN.js.map +0 -1
- package/dist/chunk-EGIPWXHL.js.map +0 -1
- package/dist/chunk-MAZ26DC7.js +0 -99
- package/dist/chunk-MAZ26DC7.js.map +0 -1
- package/dist/chunk-NKLGKF2Q.js.map +0 -1
- package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
- package/dist/optimization.d.ts +0 -11
- package/dist/optimization.js +0 -71
- package/dist/optimization.js.map +0 -1
- package/dist/sequential-5iSVfzl2.d.ts +0 -139
- package/dist/summary-report-DuZXOk7K.d.ts +0 -917
- /package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
package/dist/index.d.ts
CHANGED
|
@@ -1,36 +1,33 @@
|
|
|
1
|
-
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-
|
|
2
|
-
import { R as RunRecord
|
|
3
|
-
export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-BGY6bHRh.js';
|
|
1
|
+
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, L as LlmJsonCall, b as LlmReviewerConfig, P as ProposeFn, c as ProposeInput, d as ProposeOutput, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, k as ProposeReviewShot, R as Review, l as ReviewFn, m as ReviewInput, n as ReviewMemoryEntry, o as ReviewMemoryStore, p as RunEvidenceMetadata, V as Verification, q as VerifyFn, r as controlFailureClassFromVerification, s as controlRunToRunRecord, t as createLlmReviewer, u as evaluateActionPolicy, v as inMemoryReviewStore, w as jsonlReviewStore, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-ojEWkMfJ.js';
|
|
2
|
+
import { R as RunRecord } from './run-record-BGY6bHRh.js';
|
|
3
|
+
export { e as AGENT_PROFILE_KINDS, A as AgentProfileCell, d as AgentProfileCellInput, f as AgentProfileCellSchemaVersion, g as AgentProfileCellValidationError, h as AgentProfileDimensionValue, i as AgentProfileHarness, j as AgentProfileJson, k as AgentProfileKind, l as AgentProfileSource, m as AgentProfileSourceInput, J as JudgeScoresRecord, c as RunJudgeMetadata, n as RunOutcome, o as RunRecordValidationError, a as RunSplitTag, b as RunTokenUsage, S as SandboxAgentProfileLike, p as agentProfileCellHashMaterial, q as agentProfileCellKey, r as assertRunAgentProfileCell, s as buildAgentProfileCell, t as buildSandboxAgentProfileCell, u as groupRunsByAgentProfileCell, v as isRunRecord, w as parseRunRecordSafe, x as requireAgentProfileCell, y as roundTripRunRecord, z as toAgentProfileJson, B as validateAgentProfileCell, C as validateRunRecord, D as verifyAgentProfileCell } from './run-record-BGY6bHRh.js';
|
|
4
4
|
import { AxAIService, AxFunction } from '@ax-llm/ax';
|
|
5
|
-
import {
|
|
6
|
-
export { F as Finding,
|
|
5
|
+
import { d as Severity, M as MultiLayerVerifier, e as VerifyOptions, L as Layer, f as LayerResult, g as VerifyContext } from './researcher-CoJMs2Iz.js';
|
|
6
|
+
export { C as CallbackResearcher, h as CallbackResearcherOptions, i as CampaignFactoryParams, j as CampaignIntegrityPolicy, k as CampaignRunContext, l as CampaignRunOutcome, m as CampaignRunner, n as CampaignScenario, o as CampaignVariant, c as EvalCampaignOptions, b as EvalCampaignResult, E as ExperimentPlan, a as ExperimentResult, p as FailedRun, F as FailureMode, q as Finding, s as LayerStatus, N as NoopResearcher, R as Researcher, S as SteeringChange, V as VerificationReport, t as gradeSemanticStatus, r as runEvalCampaign } from './researcher-CoJMs2Iz.js';
|
|
7
7
|
import { R as Run$1, S as Span, b as TraceEvent, A as Artifact$1, B as BudgetLedgerEntry, T as TraceStore, g as BudgetSpec, h as RunFilter, L as LlmSpan } from './store-Db2Bv8Cf.js';
|
|
8
8
|
export { i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, M as Message, e as RetrievalSpan, m as RunLayer, n as RunStatus, f as SandboxSpan, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
|
|
9
|
-
import { L as LlmClientOptions,
|
|
10
|
-
export {
|
|
9
|
+
import { L as LlmClientOptions, b as LlmCallRequest, c as LlmCallResult } from './llm-client-BXVRUZyX.js';
|
|
10
|
+
export { d as LlmCallError, e as LlmClient, f as LlmMessage, g as LlmRouteAssertionError, a as LlmRouteRequirements, h as LlmUsage, i as assertLlmRoute, j as backoffMs, k as callLlm, l as callLlmJson, m as isTransientLlmError, p as probeLlm, s as stripFencedJson } from './llm-client-BXVRUZyX.js';
|
|
11
11
|
import { TraceAnalysisStore, AnalyzeTracesOptions, OtelExporter, OtelExportConfig, AnalyzeTracesInput, AnalyzeTracesResult } from './traces.js';
|
|
12
12
|
export { AnalyzeTracesTurnSnapshot, DEFAULT_REDACTION_RULES, DEFAULT_TRACE_ANALYST_BUDGETS, DatasetOverview, ExportableSpan, OTEL_AGENT_EVAL_SCOPE, OtlpExport, OtlpFileTraceStore, OtlpFileTraceStoreOptions, OtlpResourceSpans, OtlpSpan, QueryTracesPage, REDACTION_VERSION, RedactionReport, RedactionRule, ReplayCache, ReplayCacheEntry, ReplayCacheMissError, ReplayCacheStats, ReplayFetchOptions, SearchSpanResult, SearchTraceResult, SpanMatchRecord, SpanNotFoundError, TRACE_ANALYST_ACTOR_DESCRIPTION, TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION, TRACE_ANALYST_SUBAGENT_DESCRIPTION, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, TraceAnalystByteBudgets, TraceAnalystFilters, TraceAnalystHookOptions, TraceAnalystSpan, TraceAnalystSpanKind, TraceAnalystSpanStatus, TraceAnalystTraceSummary, TraceFileMissingError, TraceInsightContext, TraceInsightFinding, TraceInsightPanelRole, TraceInsightPromptInput, TraceInsightQualityGate, TraceInsightQuestion, TraceInsightReadiness, TraceInsightSuite, TraceInsightTask, TraceNotFoundError, ViewSpansResult, ViewTraceOversized, ViewTraceResult, analyzeTraces, buildTraceAnalystTools, buildTraceInsightContext, buildTraceInsightPrompt, createOtelExporter, createOtelTracingStore, createReplayFetch, defaultTraceInsightPanel, describeTraceInsightScope, domainEvidencePattern, exportRunAsOtlp, inferDomainKeywords, iterateRawCalls, otelRunCompleteHook, planTraceInsightQuestions, redactString, redactValue, scoreTraceInsightReadiness, tokenizeDomainWords, traceAnalystFunctionGroup, traceAnalystOnRunComplete } from './traces.js';
|
|
13
|
-
import {
|
|
14
|
-
export { K as ArtifactCheck, L as ArtifactResult, B as BootstrapOptions, a as BootstrapResult,
|
|
13
|
+
import { s as JudgeInput, t as JudgeFn, u as BenchmarkRunnerConfig, S as Scenario, v as BenchmarkReport, x as ProductClientConfig, C as CheckResult, T as TestResult, y as PersonaConfig, D as DriverResult, z as DriverState, A as CollectedArtifacts, E as ScenarioResult, i as ReleaseConfidenceThresholds, g as ReleaseConfidenceScorecard, F as TurnMetrics, G as ScenarioFile, H as CompletionCriterion } from './release-report-BtpgWRI0.js';
|
|
14
|
+
export { I as ActionableSideInfo, K as ArtifactCheck, L as ArtifactResult, M as AsiSeverity, B as BootstrapOptions, a as BootstrapResult, N as CorpusAgreementOptions, O as CorpusAgreementPerDimension, Q as CorpusAgreementReport, U as CorpusScoreRecord, W as EvalResult, X as FeedbackPattern, Y as JudgeConfig, J as JudgeReplayGateArgs, Z as JudgeRubric, _ as JudgeScore, P as PairedBootstrapOptions, b as PairedBootstrapResult, $ as PersonaRigor, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, h as ReleaseConfidenceStatus, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, a0 as RouteMap, a1 as RubricDimension, a2 as Turn, a3 as TurnResult, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, a4 as bonferroni, n as bootstrapCi, a5 as cohensD, a6 as confidenceInterval, a7 as corpusInterRaterAgreement, a8 as corpusInterRaterAgreementFromJudgeScores, o as evaluateReleaseConfidence, a9 as interRaterReliability, p as judgeReplayGate, aa as mannWhitneyU, ab as normalizeScores, q as pairedBootstrap, ac as pairedMde, ad as pairedTTest, ae as partialCredit, r as renderReleaseReport, af as requiredSampleSize, ag as weightedMean, w as wilcoxonSignedRank } from './release-report-BtpgWRI0.js';
|
|
15
15
|
import { TCloud } from '@tangle-network/tcloud';
|
|
16
16
|
import { z } from 'zod';
|
|
17
|
-
|
|
18
|
-
export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
|
|
17
|
+
export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
|
|
19
18
|
import { A as AgentEvalError } from './errors-mje_cKOs.js';
|
|
20
19
|
export { a as AgentEvalErrorCode, C as CaptureIntegrityError, b as ConfigError, J as JudgeError, N as NotFoundError, R as ReplayError, V as ValidationError, c as VerificationError } from './errors-mje_cKOs.js';
|
|
21
|
-
import {
|
|
22
|
-
export {
|
|
23
|
-
import { A as ActionableSideInfo, h as MultiShotRunner, j as MultiShotScorer, c as MultiShotMutateAdapter, a4 as HeldOutGateConfig, E as EvolvableVariant, m as MultiShotTrialResult, e as MultiShotOptimizationResult, a3 as GateDecision, a5 as Objective, a6 as ParetoResult, V as VariantAggregate, t as TrialResult, o as MutateAdapter, T as TrialCache } from './summary-report-DuZXOk7K.js';
|
|
24
|
-
export { a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, a7 as Direction, C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, a8 as GateEvidence, G as GenerationReport, a9 as HeldOutGate, aa as HeldOutGateRejectionCode, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, d as MultiShotOptimizationConfig, f as MultiShotRun, g as MultiShotRunInput, i as MultiShotScore, k as MultiShotSplit, l as MultiShotTrace, n as MultiShotVariant, J as ParetoFigureSpec, K as ParetoPoint, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, R as ReflectionContext, r as ReflectionProposal, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, S as ScenarioAggregate, s as ScoreAdapter, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, u as TrialTrace, v as buildReflectionPrompt, ab as crowdingDistance, w as defaultMultiShotObjectives, ac as dominates, $ as gainHistogram, a0 as paretoChart, ad as paretoFrontier, ae as paretoFrontierWithCrowding, x as parseReflectionResponse, a1 as researchReport, y as runMultiShotOptimization, z as runPromptEvolution, af as scalarScore, a2 as summaryTable, B as trialTraceFromMultiShotTrial } from './summary-report-DuZXOk7K.js';
|
|
20
|
+
import { a as FeedbackLabel, F as FeedbackTrajectoryStore, b as FeedbackTrajectory } from './feedback-trajectory-BSxqEpu7.js';
|
|
21
|
+
export { c as FeedbackArtifactType, d as FeedbackAttempt, e as FeedbackLabelKind, f as FeedbackLabelSource, g as FeedbackOptimizerRow, h as FeedbackOutcome, i as FeedbackReplayAdapter, j as FeedbackReplayResult, k as FeedbackSeverity, l as FeedbackSplitPolicy, m as FeedbackTask, n as FeedbackTrajectoryFilter, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-BSxqEpu7.js';
|
|
25
22
|
export { DataAcquisitionPlan, KnowledgeAcquisitionMode, KnowledgeBundle, KnowledgeFallbackPolicy, KnowledgeFreshness, KnowledgeImportance, KnowledgeReadinessReport, KnowledgeRecommendedAction, KnowledgeRequirement, KnowledgeRequirementCategory, KnowledgeResponsibleSurface, KnowledgeSensitivity, ScoreKnowledgeReadinessOptions, UserQuestion, acquisitionPlansForKnowledgeGaps, blockingKnowledgeEval, knowledgeReadinessTracePayload, scoreKnowledgeReadiness, userQuestionsForKnowledgeGaps } from './knowledge/index.js';
|
|
26
|
-
import { a as FailureCluster } from './failure-cluster-Cw65_5FY.js';
|
|
27
|
-
export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-Cw65_5FY.js';
|
|
28
23
|
import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult } from './test-graded-scenario-B2kWEdh9.js';
|
|
29
24
|
export { D as DockerSandboxDriver, c as SandboxHarness, d as SandboxResult, e as SubprocessSandboxDriver, f as SubprocessSandboxDriverOptions, g as TestGradedRunOptions, b as TestGradedRunResult, T as TestGradedScenario, h as TestOutputParser, i as composeParsers, j as jestTestParser, p as pytestTestParser, r as runTestGradedScenario, v as vitestTestParser } from './test-graded-scenario-B2kWEdh9.js';
|
|
30
25
|
import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
|
|
31
26
|
export { R as RunCompleteHook, a as RunCompleteHookContext, S as SpanHandle, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DP_cSSiw.js';
|
|
32
|
-
export {
|
|
27
|
+
export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CTDhR1Sg.js';
|
|
33
28
|
export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
|
|
29
|
+
export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, c as RawProviderDirection, d as RawProviderEvent, R as RawProviderSink, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
|
|
30
|
+
export { D as DEFAULT_FAILURE_RULES, b as FailureClassification, c as FailureContext, d as FailureRule, e as classifyFailure } from './failure-cluster-Cw65_5FY.js';
|
|
34
31
|
import { a as BaselineReport } from './baseline-4R5deP0N.js';
|
|
35
32
|
export { B as BaselineOptions, M as MetricSamples, b as MetricVerdict, T as ToolStats, d as ToolUseMetrics, e as ToolUseOptions, f as compareToBaseline, c as computeToolUseMetrics, i as iqr, w as welchsTTest } from './baseline-4R5deP0N.js';
|
|
36
33
|
import { T as Trajectory, a as TrajectoryStep } from './trajectory-CnoBo-JY.js';
|
|
@@ -42,7 +39,7 @@ export { D as DEFAULT_RED_TEAM_CORPUS, R as RedTeamCase, a as RedTeamCategory, b
|
|
|
42
39
|
import { a as PrmGrader } from './rubric-D5tjHNJQ.js';
|
|
43
40
|
export { EuRiskClass, GovernanceContext, GovernanceFinding, GovernanceReport, UseCaseSignals, classifyEuAiRisk, euAiActReport, nistAiRmfReport, renderMarkdown, soc2Report, summarize } from './governance/index.js';
|
|
44
41
|
export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as benchmarkDeterministicSplit, i as benchmarks } from './index-0pu_fBwZ.js';
|
|
45
|
-
export { I as InterimReleaseConfidence,
|
|
42
|
+
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, w as GateDecision, x as GateEvidence, H as HeldOutGate, y as HeldOutGateConfig, z as HeldOutGateRejectionCode, I as InterimReleaseConfidence, c as InterimReleaseConfidenceInput, P as PairedEvalueOptions, d as PairedEvalueSequence, e as PairedEvalueStep, f as ParetoFigureSpec, g as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, h as ResearchReport, i as ResearchReportCandidate, j as ResearchReportDecision, k as ResearchReportMethodology, l as ResearchReportOptions, m as ResearchReportRecommendation, S as SequentialDecision, n as SummaryTable, o as SummaryTableOptions, p as SummaryTableRow, q as evaluateInterimReleaseConfidence, r as gainHistogram, s as pairedEvalueSequence, t as paretoChart, u as researchReport, v as summaryTable } from './sequential-DdV5ShjT.js';
|
|
46
43
|
import './outcome-store-D6KWmYvj.js';
|
|
47
44
|
|
|
48
45
|
interface RunScore {
|
|
@@ -1565,34 +1562,6 @@ interface ExecutorConfig {
|
|
|
1565
1562
|
*/
|
|
1566
1563
|
declare function executeScenario(tc: TCloud, scenario: Scenario, config: ExecutorConfig): Promise<ScenarioResult>;
|
|
1567
1564
|
|
|
1568
|
-
type IntegrationGateSurface = 'integration-manifest' | 'integration-connection' | 'integration-scope' | 'integration-approval' | 'integration-auth' | 'integration-provider' | 'integration-policy';
|
|
1569
|
-
interface IntegrationManifestGateInput {
|
|
1570
|
-
connectorId: string;
|
|
1571
|
-
actionId?: string;
|
|
1572
|
-
valid: boolean;
|
|
1573
|
-
missingConnections?: string[];
|
|
1574
|
-
missingScopes?: string[];
|
|
1575
|
-
requiredScopes?: string[];
|
|
1576
|
-
approvalRequired?: boolean;
|
|
1577
|
-
status?: 'ready' | 'blocked' | 'approval_required';
|
|
1578
|
-
reason?: string;
|
|
1579
|
-
metadata?: Record<string, unknown>;
|
|
1580
|
-
}
|
|
1581
|
-
interface IntegrationInvokeFailureInput {
|
|
1582
|
-
connectorId: string;
|
|
1583
|
-
actionId: string;
|
|
1584
|
-
code: 'auth_expired' | 'scope_denied' | 'approval_required' | 'unsafe_write_denied' | 'provider_failure' | 'manifest_invalid';
|
|
1585
|
-
message: string;
|
|
1586
|
-
status?: number;
|
|
1587
|
-
retryable?: boolean;
|
|
1588
|
-
metadata?: Record<string, unknown>;
|
|
1589
|
-
}
|
|
1590
|
-
declare function integrationManifestValidatedPayload(input: IntegrationManifestGateInput): Record<string, unknown>;
|
|
1591
|
-
declare function integrationManifestResolvedPayload(input: IntegrationManifestGateInput): Record<string, unknown>;
|
|
1592
|
-
declare function integrationInvokeFailedPayload(input: IntegrationInvokeFailureInput): Record<string, unknown>;
|
|
1593
|
-
declare function integrationGateEvals(input: IntegrationManifestGateInput): ControlEvalResult[];
|
|
1594
|
-
declare function integrationAsi(input: IntegrationManifestGateInput | IntegrationInvokeFailureInput): ActionableSideInfo;
|
|
1595
|
-
|
|
1596
1565
|
/**
|
|
1597
1566
|
* Backend-integrity guard: distinguish "agent failed" from "eval ran against
|
|
1598
1567
|
* a stub / unconfigured backend." Without this guard a canonical eval can
|
|
@@ -1889,180 +1858,6 @@ declare function scorePrReviewComments(auditCase: PrReviewAuditCase, comments: P
|
|
|
1889
1858
|
declare function summarizePrReviewBenchmark(scores: PrReviewScore[]): PrReviewBenchmarkSummary[];
|
|
1890
1859
|
declare function aggregatePrReviewScore(dimensions: Pick<PrReviewScore, 'recall' | 'precision' | 'actionability' | 'severityCalibration' | 'lowNoise'>, weights?: Partial<PrReviewScoreWeights>): number;
|
|
1891
1860
|
|
|
1892
|
-
/**
|
|
1893
|
-
* ProductionLoop — the substrate that closes eval → prod → eval.
|
|
1894
|
-
*
|
|
1895
|
-
* Static prompts decay. Yesterday's regulation flips today; yesterday's
|
|
1896
|
-
* tool quirk becomes today's incident. A production agent that ships a
|
|
1897
|
-
* static prompt and never re-trains is on a clock.
|
|
1898
|
-
*
|
|
1899
|
-
* `runProductionLoop` is the orchestration layer over the eval substrate:
|
|
1900
|
-
*
|
|
1901
|
-
* 1. Ingest production traces + user feedback (via the wire HTTP
|
|
1902
|
-
* ingestion endpoints, or directly through any `TraceStore` and
|
|
1903
|
-
* `FeedbackTrajectoryStore` implementation).
|
|
1904
|
-
* 2. Cluster the failures (`failureClusterView`) and prioritize by
|
|
1905
|
-
* size × severity.
|
|
1906
|
-
* 3. If any cluster crosses the consumer's threshold, run a
|
|
1907
|
-
* `runMultiShotOptimization` round seeded by the current production
|
|
1908
|
-
* prompt against holdout-shape scenarios derived from the offending
|
|
1909
|
-
* cluster.
|
|
1910
|
-
* 4. Gate the promoted prompt with `evaluateReleaseConfidence`. Fail
|
|
1911
|
-
* closed.
|
|
1912
|
-
* 5. If the gate passes and an `AutoPrClient` is wired, open a PR with
|
|
1913
|
-
* the new prompt. Otherwise return the proposed change.
|
|
1914
|
-
*
|
|
1915
|
-
* One call = one cycle. Cron / GitHub Actions are the caller's job. The
|
|
1916
|
-
* primitive is idempotent + replayable: re-running with the same
|
|
1917
|
-
* `runId` will produce the same plan.
|
|
1918
|
-
*
|
|
1919
|
-
* @experimental — surface may evolve as product agents wire it in.
|
|
1920
|
-
*/
|
|
1921
|
-
|
|
1922
|
-
interface FailureClusterConfig {
|
|
1923
|
-
/** Minimum runs in a cluster before it triggers an evolve round. Default 5. */
|
|
1924
|
-
minClusterSize?: number;
|
|
1925
|
-
/**
|
|
1926
|
-
* Severity threshold. A cluster is "actionable" when its size
|
|
1927
|
-
* normalized by total runs exceeds this. Default 0.05 (5% of all runs).
|
|
1928
|
-
*/
|
|
1929
|
-
minSeverityRatio?: number;
|
|
1930
|
-
/**
|
|
1931
|
-
* Maximum number of clusters to react to in one cycle. Acting on too
|
|
1932
|
-
* many at once obscures attribution. Default 1 — the worst cluster.
|
|
1933
|
-
*/
|
|
1934
|
-
maxClustersPerCycle?: number;
|
|
1935
|
-
}
|
|
1936
|
-
interface ProductionEvolveConfig<P = string> {
|
|
1937
|
-
/** How to run a candidate prompt against a scenario. */
|
|
1938
|
-
runner: MultiShotRunner<P>;
|
|
1939
|
-
/** How to score the trajectory. Usually a calibrated judge. */
|
|
1940
|
-
scorer: MultiShotScorer<P>;
|
|
1941
|
-
/** How to mutate. Addendum-style mutators (append vs. rewrite) work best. */
|
|
1942
|
-
mutator: MultiShotMutateAdapter<P>;
|
|
1943
|
-
/** The current production prompt. Acts as the baseline + seed. */
|
|
1944
|
-
baselinePrompt: P;
|
|
1945
|
-
/** Stable id for the baseline variant. Default `'baseline'`. */
|
|
1946
|
-
baselineId?: string;
|
|
1947
|
-
/** Scenarios resembling production load. Used as the holdout split. */
|
|
1948
|
-
holdoutScenarios: Scenario[];
|
|
1949
|
-
/** Scenarios used during search. Default: derived from `holdoutScenarios` via deterministic split. */
|
|
1950
|
-
searchScenarios?: Scenario[];
|
|
1951
|
-
/** Gate config for the held-out promotion check. */
|
|
1952
|
-
gate: HeldOutGateConfig;
|
|
1953
|
-
/** Reps per (variant × scenario) cell. Default 3. */
|
|
1954
|
-
reps?: number;
|
|
1955
|
-
/** Number of mutation generations. Default 3. */
|
|
1956
|
-
generations?: number;
|
|
1957
|
-
/** Population size per generation. Default 4. */
|
|
1958
|
-
populationSize?: number;
|
|
1959
|
-
/** Concurrent score() calls. Default 1. */
|
|
1960
|
-
scoreConcurrency?: number;
|
|
1961
|
-
/**
|
|
1962
|
-
* Optional bridge from a scored trial into a paper-grade RunRecord.
|
|
1963
|
-
* If omitted, the loop synthesises a minimal record sufficient for
|
|
1964
|
-
* `HeldOutGate` and `evaluateReleaseConfidence`.
|
|
1965
|
-
*/
|
|
1966
|
-
toRunRecord?: (input: {
|
|
1967
|
-
variant: EvolvableVariant<P>;
|
|
1968
|
-
scenarioId: string;
|
|
1969
|
-
rep: number;
|
|
1970
|
-
split: RunSplitTag;
|
|
1971
|
-
seed: number;
|
|
1972
|
-
trial: MultiShotTrialResult;
|
|
1973
|
-
}) => RunRecord;
|
|
1974
|
-
}
|
|
1975
|
-
interface ProductionShipConfig {
|
|
1976
|
-
repo: RepoRef;
|
|
1977
|
-
/** Branch name prefix. Final branch = `${branchPrefix}/${runId}`. */
|
|
1978
|
-
branchPrefix: string;
|
|
1979
|
-
/** Path (repo-relative) of the file holding the production prompt. */
|
|
1980
|
-
promptFilePath: string;
|
|
1981
|
-
/** Base branch for the PR. Default `'main'`. */
|
|
1982
|
-
baseBranch?: string;
|
|
1983
|
-
reviewers?: string[];
|
|
1984
|
-
labels?: string[];
|
|
1985
|
-
/** Required: the auto-PR transport. Use `ghCliClient()` or `httpGithubClient()`. */
|
|
1986
|
-
client: AutoPrClient;
|
|
1987
|
-
/** Skip the actual push + PR call — for sanity-checking the plan. Default false. */
|
|
1988
|
-
dryRun?: boolean;
|
|
1989
|
-
/** Render PR body from the loop's findings. Optional override. */
|
|
1990
|
-
renderBody?: (ctx: ProductionLoopRenderContext) => string;
|
|
1991
|
-
/** Render the file contents from the new prompt. Default: serialize as the file. */
|
|
1992
|
-
renderPromptFile?: (newPrompt: string, oldFileContents: string | null) => string;
|
|
1993
|
-
/** Read the current prompt file contents for diff context. Optional. */
|
|
1994
|
-
readCurrentPromptFile?: () => Promise<string | null>;
|
|
1995
|
-
}
|
|
1996
|
-
interface ProductionLoopCronConfig {
|
|
1997
|
-
cadence: 'weekly' | 'daily' | 'hourly';
|
|
1998
|
-
/** Optional jitter (seconds) the consumer's scheduler should add. Surface-only. */
|
|
1999
|
-
jitterSec?: number;
|
|
2000
|
-
}
|
|
2001
|
-
interface RunProductionLoopOptions<P = string> {
|
|
2002
|
-
/** Stable id; deterministic outputs when reused. */
|
|
2003
|
-
runId: string;
|
|
2004
|
-
/** Human label — surfaces in PR titles and reports. */
|
|
2005
|
-
target: string;
|
|
2006
|
-
traceStore: TraceStore;
|
|
2007
|
-
feedbackStore: FeedbackTrajectoryStore;
|
|
2008
|
-
cluster: FailureClusterConfig;
|
|
2009
|
-
evolve: ProductionEvolveConfig<P>;
|
|
2010
|
-
/** When omitted, the loop returns the proposed prompt without opening a PR. */
|
|
2011
|
-
ship?: ProductionShipConfig;
|
|
2012
|
-
/** Surface-only — encodes scheduler expectations into the artifact. */
|
|
2013
|
-
cron?: ProductionLoopCronConfig;
|
|
2014
|
-
/** Release confidence thresholds. Default: library defaults. */
|
|
2015
|
-
releaseThresholds?: ReleaseConfidenceThresholds;
|
|
2016
|
-
/** Now() seam for reproducibility in tests. */
|
|
2017
|
-
now?: () => Date;
|
|
2018
|
-
}
|
|
2019
|
-
type ProductionLoopDecision = 'no_actionable_failures' | 'evolve_yielded_no_improvement' | 'gate_failed' | 'proposed_change' | 'pr_opened';
|
|
2020
|
-
interface ProductionLoopRenderContext {
|
|
2021
|
-
runId: string;
|
|
2022
|
-
target: string;
|
|
2023
|
-
decision: ProductionLoopDecision;
|
|
2024
|
-
/** Clusters seen in production this cycle, sorted by severity. */
|
|
2025
|
-
clusters: FailureCluster[];
|
|
2026
|
-
/** The cluster the loop acted on (if any). */
|
|
2027
|
-
actedOnCluster: FailureCluster | null;
|
|
2028
|
-
/** Production runs observed this cycle. */
|
|
2029
|
-
observedRunCount: number;
|
|
2030
|
-
/** Feedback trajectories observed this cycle. */
|
|
2031
|
-
observedFeedbackCount: number;
|
|
2032
|
-
/** Evolve result (if evolve ran). */
|
|
2033
|
-
evolution: MultiShotOptimizationResult<unknown> | null;
|
|
2034
|
-
/** Release gate verdict (if evolve ran). */
|
|
2035
|
-
release: ReleaseConfidenceScorecard | null;
|
|
2036
|
-
/** Held-out gate decision (if a candidate was paired against the baseline). */
|
|
2037
|
-
gate: GateDecision | null;
|
|
2038
|
-
/** The baseline (current production) prompt as a string. */
|
|
2039
|
-
baselinePromptString: string;
|
|
2040
|
-
/** The proposed new prompt as a string. Empty if no change was proposed. */
|
|
2041
|
-
promotedPromptString: string;
|
|
2042
|
-
}
|
|
2043
|
-
interface ProductionLoopResult {
|
|
2044
|
-
runId: string;
|
|
2045
|
-
target: string;
|
|
2046
|
-
decision: ProductionLoopDecision;
|
|
2047
|
-
startedAt: string;
|
|
2048
|
-
finishedAt: string;
|
|
2049
|
-
observedRunCount: number;
|
|
2050
|
-
observedFeedbackCount: number;
|
|
2051
|
-
clusters: FailureCluster[];
|
|
2052
|
-
actedOnCluster: FailureCluster | null;
|
|
2053
|
-
evolution: MultiShotOptimizationResult<unknown> | null;
|
|
2054
|
-
release: ReleaseConfidenceScorecard | null;
|
|
2055
|
-
gate: GateDecision | null;
|
|
2056
|
-
/** Baseline prompt as it entered the cycle. */
|
|
2057
|
-
baselinePrompt: unknown;
|
|
2058
|
-
/** Promoted prompt — equals baseline when no change is proposed. */
|
|
2059
|
-
promotedPrompt: unknown;
|
|
2060
|
-
/** PR artifact when `ship` was wired and gate passed. */
|
|
2061
|
-
pullRequest: ProposeAutomatedPullRequestResult | null;
|
|
2062
|
-
cron: ProductionLoopCronConfig | null;
|
|
2063
|
-
}
|
|
2064
|
-
declare function runProductionLoop<P = string>(opts: RunProductionLoopOptions<P>): Promise<ProductionLoopResult>;
|
|
2065
|
-
|
|
2066
1861
|
/**
|
|
2067
1862
|
* ScenarioRegistry — manages scenario discovery and filtering.
|
|
2068
1863
|
*
|
|
@@ -2727,6 +2522,86 @@ declare class FileSystemExperimentStore implements ExperimentStore {
|
|
|
2727
2522
|
private load;
|
|
2728
2523
|
}
|
|
2729
2524
|
|
|
2525
|
+
/**
|
|
2526
|
+
* Pareto frontier — multi-objective optimization over candidate runs.
|
|
2527
|
+
*
|
|
2528
|
+
* Lifted from ADC pareto.ts and blueprint-agent frontier.ts. When you're
|
|
2529
|
+
* trading off (cost, latency, quality) or (passRate, tokenBudget,
|
|
2530
|
+
* ttfb), you rarely have a single "winner" — you have a set of
|
|
2531
|
+
* non-dominated candidates. This module exposes:
|
|
2532
|
+
*
|
|
2533
|
+
* - `paretoFrontier`: filter a set of candidates to the non-dominated ones
|
|
2534
|
+
* - `dominates`: does A dominate B across all objectives?
|
|
2535
|
+
*
|
|
2536
|
+
* Each objective is declared with a direction: 'maximize' (higher=better)
|
|
2537
|
+
* or 'minimize' (lower=better). Candidates are any object; pass an
|
|
2538
|
+
* `objective(candidate)` accessor.
|
|
2539
|
+
*/
|
|
2540
|
+
type Direction = 'maximize' | 'minimize';
|
|
2541
|
+
interface Objective<T> {
|
|
2542
|
+
/** Stable label used in reports. */
|
|
2543
|
+
name: string;
|
|
2544
|
+
direction: Direction;
|
|
2545
|
+
value: (candidate: T) => number;
|
|
2546
|
+
}
|
|
2547
|
+
interface ParetoResult<T> {
|
|
2548
|
+
frontier: T[];
|
|
2549
|
+
dominated: T[];
|
|
2550
|
+
/** Dominance map: each entry pairs a frontier candidate (`dominator`) with the candidates it dominates (`dominated`). */
|
|
2551
|
+
dominanceMap: Array<{
|
|
2552
|
+
dominator: T;
|
|
2553
|
+
dominated: T[];
|
|
2554
|
+
}>;
|
|
2555
|
+
}
|
|
2556
|
+
/** Does candidate A Pareto-dominate B — no worse on every objective and strictly better on at least one? */
|
|
2557
|
+
declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
|
|
2558
|
+
/**
|
|
2559
|
+
* Compute the non-dominated frontier. Candidates with NaN/Infinity on any
|
|
2560
|
+
* objective are excluded (can't rank them). A candidate enters the frontier
|
|
2561
|
+
* iff no other candidate dominates it.
|
|
2562
|
+
*/
|
|
2563
|
+
declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
|
|
2564
|
+
/**
|
|
2565
|
+
* Weighted-sum scalarisation. Use as a tie-break / single-winner selector
|
|
2566
|
+
* when callers don't want to consume a frontier. Each objective contributes
|
|
2567
|
+
* its normalised value (0..1 via min-max across the candidate pool) times
|
|
2568
|
+
* its weight; missing weights default to 1/N.
|
|
2569
|
+
*
|
|
2570
|
+
* Direction is honoured automatically — `minimize` axes have their values
|
|
2571
|
+
* inverted before scaling so "higher scalar = better" always holds.
|
|
2572
|
+
*/
|
|
2573
|
+
declare function scalarScore<T>(candidates: T[], objectives: Objective<T>[], options?: {
|
|
2574
|
+
weights?: Partial<Record<string, number>>;
|
|
2575
|
+
}): Array<{
|
|
2576
|
+
candidate: T;
|
|
2577
|
+
score: number;
|
|
2578
|
+
}>;
|
|
2579
|
+
/**
|
|
2580
|
+
* NSGA-II crowding distance — secondary sort for ties on the frontier.
|
|
2581
|
+
*
|
|
2582
|
+
* When the Pareto front collapses to a single point (or many candidates tie
|
|
2583
|
+
* on dominance), naive selection picks arbitrarily and the population
|
|
2584
|
+
* degenerates over generations. NSGA-II preserves diversity by preferring
|
|
2585
|
+
* candidates with more empty space around them on the frontier.
|
|
2586
|
+
*
|
|
2587
|
+
* Returns an array of `{ candidate, distance }` in the SAME order as the
|
|
2588
|
+
* input. Higher distance = more isolated = should be preferred when
|
|
2589
|
+
* preserving diversity.
|
|
2590
|
+
*/
|
|
2591
|
+
declare function crowdingDistance<T>(candidates: T[], objectives: Objective<T>[]): Array<{
|
|
2592
|
+
candidate: T;
|
|
2593
|
+
distance: number;
|
|
2594
|
+
}>;
|
|
2595
|
+
/**
|
|
2596
|
+
* Pareto frontier with tie-break by crowding distance — the canonical
|
|
2597
|
+
* NSGA-II selection step. Returns the frontier sorted by descending crowding
|
|
2598
|
+
* distance so callers can `.slice(0, k)` to pick K diverse winners.
|
|
2599
|
+
*/
|
|
2600
|
+
declare function paretoFrontierWithCrowding<T>(candidates: T[], objectives: Objective<T>[]): Array<{
|
|
2601
|
+
candidate: T;
|
|
2602
|
+
distance: number;
|
|
2603
|
+
}>;
|
|
2604
|
+
|
|
2730
2605
|
interface SteeringRolePrompt {
|
|
2731
2606
|
system?: string;
|
|
2732
2607
|
append?: string;
|
|
@@ -5567,378 +5442,6 @@ interface CanaryOptions {
|
|
|
5567
5442
|
*/
|
|
5568
5443
|
declare function runCanaries(runs: RunRecord[], opts?: CanaryOptions): CanaryReport;
|
|
5569
5444
|
|
|
5570
|
-
/**
|
|
5571
|
-
* evolution-telemetry — durable JSONL/JSON sinks for the evolution loop.
|
|
5572
|
-
*
|
|
5573
|
-
* `runPromptEvolution` exposes generation-level events but doesn't persist
|
|
5574
|
-
* the per-mutation, per-trial, lineage, or cost breakdown. These four
|
|
5575
|
-
* sinks fill that gap so a finished autoresearch run leaves a forensically
|
|
5576
|
-
* complete trail under one directory:
|
|
5577
|
-
*
|
|
5578
|
-
* - `mutations.jsonl` — every mutate attempt (success + failure) with
|
|
5579
|
-
* latency, agent steps, diff stats, cost.
|
|
5580
|
-
* - `trials.jsonl` — every TrialResult including cache hits, with
|
|
5581
|
-
* provenance (channel, runtime slot, generation).
|
|
5582
|
-
* - `lineage.json` — variant tree {id → {parent, generation, kind, …}},
|
|
5583
|
-
* incremental upsert.
|
|
5584
|
-
* - `cost-ledger.json` — running $ totals per source (mutator-prompt,
|
|
5585
|
-
* mutator-code, scorer-prompt, scorer-code) plus pool utilisation.
|
|
5586
|
-
*
|
|
5587
|
-
* All writes are mutex-serialised. The append-only sinks (mutations,
|
|
5588
|
-
* trials) survive a hard kill; the snapshot sinks (lineage, cost-ledger)
|
|
5589
|
-
* rewrite on every update so the latest state is always on disk.
|
|
5590
|
-
*
|
|
5591
|
-
* Generic over a payload P so any consumer of `runPromptEvolution<P>` can
|
|
5592
|
-
* record lineage without leaking domain types.
|
|
5593
|
-
*/
|
|
5594
|
-
|
|
5595
|
-
type MutationChannel = 'prompt' | 'code';
|
|
5596
|
-
interface MutationAttempt {
|
|
5597
|
-
ts: number;
|
|
5598
|
-
channel: MutationChannel;
|
|
5599
|
-
generation: number;
|
|
5600
|
-
parentId: string;
|
|
5601
|
-
/** Successful child variant id, or null if the attempt failed. */
|
|
5602
|
-
childId: string | null;
|
|
5603
|
-
ok: boolean;
|
|
5604
|
-
/**
|
|
5605
|
-
* One of: 'parse_failure' | 'typecheck_failure' | 'no_changes' |
|
|
5606
|
-
* 'agent_error' | 'commit_failure' | 'no_api_key' | 'no_valid_proposals'
|
|
5607
|
-
* | 'reproduce_parent_failed' | 'branch_failed' | 'other'.
|
|
5608
|
-
* Free-form to allow consumer-specific reasons.
|
|
5609
|
-
*/
|
|
5610
|
-
failureReason?: string;
|
|
5611
|
-
/** Free-form description of what the agent said it did. */
|
|
5612
|
-
description?: string;
|
|
5613
|
-
/** Latency of the LLM call (ms). */
|
|
5614
|
-
latencyMs: number;
|
|
5615
|
-
/** Bytes of generated diff (code channel only). */
|
|
5616
|
-
diffBytes?: number;
|
|
5617
|
-
/** Files touched (code channel only). */
|
|
5618
|
-
filesTouched?: number;
|
|
5619
|
-
/** Steps the agent ran (tool calls). */
|
|
5620
|
-
agentSteps?: number;
|
|
5621
|
-
/** Approx $ spent on this mutation (LLM tokens). */
|
|
5622
|
-
costUsd?: number;
|
|
5623
|
-
/** Runtime slot used (code channel only). */
|
|
5624
|
-
runtimeSandboxId?: string;
|
|
5625
|
-
}
|
|
5626
|
-
declare class MutationTelemetry {
|
|
5627
|
-
private readonly appender;
|
|
5628
|
-
constructor(path: string);
|
|
5629
|
-
record(attempt: MutationAttempt): Promise<void>;
|
|
5630
|
-
}
|
|
5631
|
-
interface TrialAttempt {
|
|
5632
|
-
ts: number;
|
|
5633
|
-
channel: MutationChannel;
|
|
5634
|
-
generation: number;
|
|
5635
|
-
variantId: string;
|
|
5636
|
-
scenarioId: string;
|
|
5637
|
-
rep: number;
|
|
5638
|
-
ok: boolean;
|
|
5639
|
-
score: number;
|
|
5640
|
-
costUsd: number;
|
|
5641
|
-
durationMs: number;
|
|
5642
|
-
cached: boolean;
|
|
5643
|
-
runtimeSandboxId?: string;
|
|
5644
|
-
error?: string;
|
|
5645
|
-
metrics?: Record<string, number>;
|
|
5646
|
-
}
|
|
5647
|
-
declare class TrialTelemetry {
|
|
5648
|
-
private readonly appender;
|
|
5649
|
-
constructor(path: string);
|
|
5650
|
-
record(attempt: TrialAttempt): Promise<void>;
|
|
5651
|
-
}
|
|
5652
|
-
type LineageKind = 'seed' | 'prompt' | 'code';
|
|
5653
|
-
interface LineageNode {
|
|
5654
|
-
id: string;
|
|
5655
|
-
parentId: string | null;
|
|
5656
|
-
generation: number;
|
|
5657
|
-
kind: LineageKind;
|
|
5658
|
-
rationale?: string;
|
|
5659
|
-
/** Filled when scoring lands. */
|
|
5660
|
-
meanScore?: number;
|
|
5661
|
-
promotedToFrontier?: boolean;
|
|
5662
|
-
/**
|
|
5663
|
-
* The variant payload (e.g. evolved persona text, code mutation diff).
|
|
5664
|
-
* Persisted so a winning variant can be reproduced after a run completes
|
|
5665
|
-
* without re-running the optimizer. Optional — pass `omitPayload: true` to
|
|
5666
|
-
* `upsertVariant` for cases where the payload is too large to log.
|
|
5667
|
-
*/
|
|
5668
|
-
payload?: unknown;
|
|
5669
|
-
}
|
|
5670
|
-
/**
|
|
5671
|
-
* `kindOf` decides whether a variant is a seed (no parent), code mutation,
|
|
5672
|
-
* or prompt mutation. Default looks at `variant.payload.codeMutation` and
|
|
5673
|
-
* accepts any payload that exposes that field; override by passing your own.
|
|
5674
|
-
*/
|
|
5675
|
-
type LineageKindResolver<P> = (variant: EvolvableVariant<P>) => LineageKind;
|
|
5676
|
-
/**
|
|
5677
|
-
* Persistence shape:
|
|
5678
|
-
*
|
|
5679
|
-
* `<path>` — JSONL of upserts (event log). Each line is a
|
|
5680
|
-
* partial node; replay folds them into the current
|
|
5681
|
-
* state. Append-only, so cost is O(1) per upsert
|
|
5682
|
-
* instead of the previous O(n²) full rewrite.
|
|
5683
|
-
* `<path>.snapshot` — Optional consolidated snapshot, written on
|
|
5684
|
-
* demand via `compact()` (e.g. at end of run).
|
|
5685
|
-
* Read by external tools that don't want to
|
|
5686
|
-
* replay the log.
|
|
5687
|
-
*
|
|
5688
|
-
* Loaded at construction time: if `<path>.snapshot` exists, parse it
|
|
5689
|
-
* first; then replay any newer log lines on top. Falls back to log-only
|
|
5690
|
-
* when no snapshot is present.
|
|
5691
|
-
*/
|
|
5692
|
-
declare class LineageRecorder<P = unknown> {
|
|
5693
|
-
private readonly path;
|
|
5694
|
-
private readonly snapshotPath;
|
|
5695
|
-
private readonly mutex;
|
|
5696
|
-
private readonly nodes;
|
|
5697
|
-
private readonly kindOf;
|
|
5698
|
-
constructor(path: string, kindOf?: LineageKindResolver<P>);
|
|
5699
|
-
upsert(node: LineageNode): Promise<void>;
|
|
5700
|
-
upsertVariant(variant: EvolvableVariant<P>, opts?: {
|
|
5701
|
-
omitPayload?: boolean;
|
|
5702
|
-
}): Promise<void>;
|
|
5703
|
-
snapshot(): LineageNode[];
|
|
5704
|
-
/**
|
|
5705
|
-
* Write the current consolidated state to `<path>.snapshot` so external
|
|
5706
|
-
* tools can read it without replaying the event log. Idempotent.
|
|
5707
|
-
*/
|
|
5708
|
-
compact(): Promise<void>;
|
|
5709
|
-
}
|
|
5710
|
-
/** Per-generation cost rollup. Same shape as the totals, scoped to one gen. */
|
|
5711
|
-
interface CostLedgerGeneration {
|
|
5712
|
-
generation: number;
|
|
5713
|
-
mutatorPromptUsd: number;
|
|
5714
|
-
mutatorCodeUsd: number;
|
|
5715
|
-
scorerPromptUsd: number;
|
|
5716
|
-
scorerCodeUsd: number;
|
|
5717
|
-
trialsCounted: number;
|
|
5718
|
-
cachedTrials: number;
|
|
5719
|
-
}
|
|
5720
|
-
interface CostLedgerSnapshot {
|
|
5721
|
-
totalUsd: number;
|
|
5722
|
-
mutatorPromptUsd: number;
|
|
5723
|
-
mutatorCodeUsd: number;
|
|
5724
|
-
scorerPromptUsd: number;
|
|
5725
|
-
scorerCodeUsd: number;
|
|
5726
|
-
trialsCounted: number;
|
|
5727
|
-
cachedTrials: number;
|
|
5728
|
-
poolBusyMs?: number;
|
|
5729
|
-
poolUtilizationPct?: number;
|
|
5730
|
-
/** Per-generation breakdown, sorted ascending. Empty when generations
|
|
5731
|
-
* weren't supplied to addMutation/addTrial. */
|
|
5732
|
-
byGeneration: CostLedgerGeneration[];
|
|
5733
|
-
}
|
|
5734
|
-
declare class CostLedger {
|
|
5735
|
-
private totals;
|
|
5736
|
-
private readonly path;
|
|
5737
|
-
private readonly mutex;
|
|
5738
|
-
constructor(path: string);
|
|
5739
|
-
private genBucket;
|
|
5740
|
-
addMutation(channel: MutationChannel, usd: number, opts?: {
|
|
5741
|
-
generation?: number;
|
|
5742
|
-
}): Promise<void>;
|
|
5743
|
-
addTrial(channel: MutationChannel, usd: number, cached: boolean, opts?: {
|
|
5744
|
-
generation?: number;
|
|
5745
|
-
}): Promise<void>;
|
|
5746
|
-
setPoolUtilization(busyMs: number, totalMs: number): Promise<void>;
|
|
5747
|
-
snapshot(): CostLedgerSnapshot;
|
|
5748
|
-
private persist;
|
|
5749
|
-
}
|
|
5750
|
-
|
|
5751
|
-
/**
|
|
5752
|
-
* SandboxPool — bounded checkout/release pool for mutation slots.
|
|
5753
|
-
*
|
|
5754
|
-
* The composite-mutator's `code` channel needs an isolated workspace per
|
|
5755
|
-
* mutation attempt: a git worktree, a sandbox container, a tmpdir clone —
|
|
5756
|
-
* whatever the consumer's runtime is. Without a pool, every consumer
|
|
5757
|
-
* re-implements the same machinery (mint N slots, check one out per
|
|
5758
|
-
* mutation, reset before reuse, drain at the end, track utilisation for
|
|
5759
|
-
* the cost ledger). This primitive ships that machinery so consumers
|
|
5760
|
-
* supply only a `SlotFactory`.
|
|
5761
|
-
*
|
|
5762
|
-
* Generic over a slot resource `T` so the same pool serves git worktrees
|
|
5763
|
-
* (T = path), Tangle sandboxes (T = SandboxBox), or anything else with
|
|
5764
|
-
* the create/reset/destroy lifecycle.
|
|
5765
|
-
*
|
|
5766
|
-
* Concurrency: FIFO via the shared `Mutex` primitive. Each `checkout()`
|
|
5767
|
-
* either takes an idle slot or queues until one is released. Lifecycle
|
|
5768
|
-
* is single-process — multi-process pools need external coordination
|
|
5769
|
-
* (file locks, etc.) and are deliberately out of scope.
|
|
5770
|
-
*/
|
|
5771
|
-
interface PoolSlot<T> {
|
|
5772
|
-
/** Stable id assigned at slot creation. Use for telemetry / lineage. */
|
|
5773
|
-
readonly id: string;
|
|
5774
|
-
/** Consumer-defined resource. */
|
|
5775
|
-
readonly resource: T;
|
|
5776
|
-
}
|
|
5777
|
-
interface SlotFactory<T> {
|
|
5778
|
-
/** Build a new slot. Called lazily as the pool grows up to `size`. */
|
|
5779
|
-
create(slotId: string): Promise<T>;
|
|
5780
|
-
/**
|
|
5781
|
-
* Reset a slot to a clean state before reuse. Called BEFORE every
|
|
5782
|
-
* checkout returns it (including the first — so the factory's
|
|
5783
|
-
* `create` can leave the slot dirty and let `reset` normalise).
|
|
5784
|
-
* Optional; default is a no-op.
|
|
5785
|
-
*/
|
|
5786
|
-
reset?(slot: PoolSlot<T>): Promise<void>;
|
|
5787
|
-
/** Tear the slot down. Called by `drain()`. */
|
|
5788
|
-
destroy(slot: PoolSlot<T>): Promise<void>;
|
|
5789
|
-
}
|
|
5790
|
-
interface SandboxPool<T> {
|
|
5791
|
-
/**
|
|
5792
|
-
* Take a slot. If all slots are busy, the promise resolves when one
|
|
5793
|
-
* is released. Always pair with the returned `release` (or wrap with
|
|
5794
|
-
* `withSlot`).
|
|
5795
|
-
*/
|
|
5796
|
-
checkout(): Promise<{
|
|
5797
|
-
slot: PoolSlot<T>;
|
|
5798
|
-
release: () => void;
|
|
5799
|
-
}>;
|
|
5800
|
-
/**
|
|
5801
|
-
* Run `fn` with a checked-out slot, releasing on completion or throw.
|
|
5802
|
-
* The convenience wrapper most callers should use.
|
|
5803
|
-
*/
|
|
5804
|
-
withSlot<R>(fn: (slot: PoolSlot<T>) => Promise<R>): Promise<R>;
|
|
5805
|
-
/** Destroy every slot. Idempotent. */
|
|
5806
|
-
drain(): Promise<void>;
|
|
5807
|
-
/** How many slots have been minted (≤ `size`). */
|
|
5808
|
-
poolSize(): number;
|
|
5809
|
-
/** How many checkouts are currently outstanding. */
|
|
5810
|
-
activeCheckouts(): number;
|
|
5811
|
-
/** Snapshot of busy/total durations for the cost ledger. */
|
|
5812
|
-
utilization(): {
|
|
5813
|
-
busyMs: number;
|
|
5814
|
-
totalMs: number;
|
|
5815
|
-
checkouts: number;
|
|
5816
|
-
};
|
|
5817
|
-
}
|
|
5818
|
-
interface CreateSandboxPoolOpts<T> {
|
|
5819
|
-
/** Maximum concurrent slots. Slots are minted on first need, not eagerly. */
|
|
5820
|
-
size: number;
|
|
5821
|
-
factory: SlotFactory<T>;
|
|
5822
|
-
}
|
|
5823
|
-
declare function createSandboxPool<T>(opts: CreateSandboxPoolOpts<T>): SandboxPool<T>;
|
|
5824
|
-
|
|
5825
|
-
/**
|
|
5826
|
-
* createSandboxCodeMutator — `MutateAdapter<P>` that runs a coding agent
|
|
5827
|
-
* inside a SandboxPool slot to produce code-channel variants.
|
|
5828
|
-
*
|
|
5829
|
-
* Composable shape (matches `reflective-mutation.ts`'s separation of
|
|
5830
|
-
* "build the prompt" from "run the model"):
|
|
5831
|
-
*
|
|
5832
|
-
* pool → where mutations execute (any SlotFactory)
|
|
5833
|
-
* runner → consumer-supplied: invokes the coding agent in a slot,
|
|
5834
|
-
* returns the diff/branch/whatever as `CodeMutationOutcome`s
|
|
5835
|
-
* toVariantPayload → maps outcome → P (consumer encodes the diff their
|
|
5836
|
-
* way — patch string, branch ref, file map, etc)
|
|
5837
|
-
*
|
|
5838
|
-
* What this primitive owns (so consumers don't reinvent it every time):
|
|
5839
|
-
* - Pool checkout / release with reset between attempts
|
|
5840
|
-
* - Per-attempt mutex so a single slot can't be invoked concurrently
|
|
5841
|
-
* - Telemetry write-through (mutations.jsonl, lineage.json,
|
|
5842
|
-
* cost-ledger.json) when sinks are passed
|
|
5843
|
-
* - Stable child-id generation
|
|
5844
|
-
* - Failure capture (every attempt produces either a successful child
|
|
5845
|
-
* or a recorded failure with reason — never a silent drop)
|
|
5846
|
-
*
|
|
5847
|
-
* Consumers stay focused on the actual interesting parts: building the
|
|
5848
|
-
* agent prompt, running the agent, capturing the diff.
|
|
5849
|
-
*/
|
|
5850
|
-
|
|
5851
|
-
/**
|
|
5852
|
-
* Result of one coding-agent invocation. The runner produces 1..N of
|
|
5853
|
-
* these per `runner` call (a single agent session can sometimes
|
|
5854
|
-
* produce multiple sibling diffs cheaply — runner decides).
|
|
5855
|
-
*/
|
|
5856
|
-
interface CodeMutationOutcome {
|
|
5857
|
-
ok: boolean;
|
|
5858
|
-
/** Stable id for the child variant if `ok`. The mutator falls back to
|
|
5859
|
-
* a generated id when omitted. */
|
|
5860
|
-
childId?: string;
|
|
5861
|
-
/** Free-form one-liner: "tightened tool descriptions in forge-tools.ts". */
|
|
5862
|
-
description?: string;
|
|
5863
|
-
/** What the runner was trying to fix (carried into EvolvableVariant.rationale). */
|
|
5864
|
-
rationale?: string;
|
|
5865
|
-
/** Caller-defined diff payload. Mapped into the variant's payload by
|
|
5866
|
-
* `toVariantPayload`; agent-eval treats it as opaque. */
|
|
5867
|
-
artifact?: unknown;
|
|
5868
|
-
/** When ok === false. Free-form: 'parse_failure' / 'agent_error' /
|
|
5869
|
-
* 'no_changes' / 'commit_failed' / etc. */
|
|
5870
|
-
failureReason?: string;
|
|
5871
|
-
/** Telemetry stats. */
|
|
5872
|
-
diffBytes?: number;
|
|
5873
|
-
filesTouched?: number;
|
|
5874
|
-
agentSteps?: number;
|
|
5875
|
-
costUsd?: number;
|
|
5876
|
-
latencyMs: number;
|
|
5877
|
-
}
|
|
5878
|
-
type CodeMutationRunner<T, P> = (args: {
|
|
5879
|
-
slot: PoolSlot<T>;
|
|
5880
|
-
parent: EvolvableVariant<P>;
|
|
5881
|
-
parentAggregate: VariantAggregate;
|
|
5882
|
-
topTrials: TrialResult[];
|
|
5883
|
-
bottomTrials: TrialResult[];
|
|
5884
|
-
childCount: number;
|
|
5885
|
-
generation: number;
|
|
5886
|
-
}) => Promise<CodeMutationOutcome[]>;
|
|
5887
|
-
interface CreateSandboxCodeMutatorOpts<T, P> {
|
|
5888
|
-
pool: SandboxPool<T>;
|
|
5889
|
-
runner: CodeMutationRunner<T, P>;
|
|
5890
|
-
/**
|
|
5891
|
-
* Map an outcome into the variant payload `P`. Lets the consumer
|
|
5892
|
-
* encode the diff however they want (file map, patch string, branch
|
|
5893
|
-
* ref, snapshot id) without agent-eval taking a stance.
|
|
5894
|
-
*/
|
|
5895
|
-
toVariantPayload(outcome: CodeMutationOutcome, parent: EvolvableVariant<P>): P;
|
|
5896
|
-
/** Optional telemetry sinks. */
|
|
5897
|
-
mutationTelemetry?: MutationTelemetry;
|
|
5898
|
-
costLedger?: CostLedger;
|
|
5899
|
-
lineage?: LineageRecorder<P>;
|
|
5900
|
-
/** Override id generation. Default: `${parent.id}.g${generation}.code.${i}`. */
|
|
5901
|
-
childIdFor?(parent: EvolvableVariant<P>, generation: number, index: number): string;
|
|
5902
|
-
/** Default label for the variant (visible in reports). */
|
|
5903
|
-
labelFor?(outcome: CodeMutationOutcome, parent: EvolvableVariant<P>, generation: number, index: number): string;
|
|
5904
|
-
}
|
|
5905
|
-
declare function createSandboxCodeMutator<T, P>(opts: CreateSandboxCodeMutatorOpts<T, P>): MutateAdapter<P>;
|
|
5906
|
-
|
|
5907
|
-
/**
|
|
5908
|
-
* createCompositeMutator — combines two `MutateAdapter<P>`s under a policy.
|
|
5909
|
-
*
|
|
5910
|
-
* primary-only — every generation runs `primary` (typical: a reflective
|
|
5911
|
-
* prompt mutator). The default.
|
|
5912
|
-
* secondary-only — every generation runs `secondary` (typical: a coding
|
|
5913
|
-
* agent that edits the harness itself). Slow + expensive.
|
|
5914
|
-
* alternate — even gens run `primary`, odd gens run `secondary`.
|
|
5915
|
-
* plateau — start with `primary`; switch to a 50/50 split between
|
|
5916
|
-
* `primary` and `secondary` after K gens with less than
|
|
5917
|
-
* Δ improvement (auto-detect when prompt evolution has
|
|
5918
|
-
* hit a structural ceiling).
|
|
5919
|
-
*
|
|
5920
|
-
* Naming is generic — the canonical use cases are "prompt" and "code"
|
|
5921
|
-
* channels, but the primitive doesn't care what each mutator actually does.
|
|
5922
|
-
*/
|
|
5923
|
-
|
|
5924
|
-
type CompositePolicy = 'primary-only' | 'secondary-only' | 'alternate' | 'plateau';
|
|
5925
|
-
interface CreateCompositeMutatorOpts<P> {
|
|
5926
|
-
primary: MutateAdapter<P>;
|
|
5927
|
-
secondary?: MutateAdapter<P>;
|
|
5928
|
-
policy: CompositePolicy;
|
|
5929
|
-
/** For 'plateau': minimum improvement (Δ meanScore) to count as progress. Default 0.02. */
|
|
5930
|
-
plateauThreshold?: number;
|
|
5931
|
-
/** For 'plateau': consecutive gens without progress that trigger split mode. Default 2. */
|
|
5932
|
-
plateauPatience?: number;
|
|
5933
|
-
/** Optional progress hook. */
|
|
5934
|
-
onPolicyDecision?: (info: {
|
|
5935
|
-
generation: number;
|
|
5936
|
-
chose: 'primary' | 'secondary' | 'split';
|
|
5937
|
-
reason: string;
|
|
5938
|
-
}) => void;
|
|
5939
|
-
}
|
|
5940
|
-
declare function createCompositeMutator<P>(opts: CreateCompositeMutatorOpts<P>): MutateAdapter<P>;
|
|
5941
|
-
|
|
5942
5445
|
/**
|
|
5943
5446
|
* concurrency — small primitives the evolution loop needs.
|
|
5944
5447
|
*
|
|
@@ -6068,38 +5571,6 @@ declare function precision<T>(goldens: GoldenSpec[], candidates: T[], options?:
|
|
|
6068
5571
|
text?: (candidate: T) => string;
|
|
6069
5572
|
}): number;
|
|
6070
5573
|
|
|
6071
|
-
/**
|
|
6072
|
-
* JsonlTrialCache — `TrialCache` backed by a JSONL append-only file so a
|
|
6073
|
-
* crashed `runPromptEvolution` can resume without re-running expensive
|
|
6074
|
-
* trials. Last write wins on key collision; the file is forward-swept at
|
|
6075
|
-
* construction.
|
|
6076
|
-
*
|
|
6077
|
-
* Tail corruption (partial line at the bottom from a hard kill) is
|
|
6078
|
-
* tolerated — we skip unparseable lines and continue.
|
|
6079
|
-
*
|
|
6080
|
-
* The cache surface (`get` / `set`) is synchronous because `TrialCache`
|
|
6081
|
-
* is. Writes are mutex-serialised through a `LockedJsonlAppender`
|
|
6082
|
-
* (kicked off with `void`) so two in-process callers can't tear a long
|
|
6083
|
-
* line that exceeds POSIX `PIPE_BUF`. Cross-process safety still
|
|
6084
|
-
* requires fcntl/flock and is deliberately out of scope.
|
|
6085
|
-
*/
|
|
6086
|
-
|
|
6087
|
-
declare class JsonlTrialCache implements TrialCache {
|
|
6088
|
-
private readonly map;
|
|
6089
|
-
private readonly path;
|
|
6090
|
-
private readonly appender;
|
|
6091
|
-
constructor(path: string);
|
|
6092
|
-
get(key: string): TrialResult | undefined;
|
|
6093
|
-
set(key: string, value: TrialResult): void;
|
|
6094
|
-
size(): number;
|
|
6095
|
-
/**
|
|
6096
|
-
* Synchronous fallback path for tests / CLI tools that want to be sure
|
|
6097
|
-
* the line is on disk before returning. Bypasses the mutex (single-
|
|
6098
|
-
* threaded callers only).
|
|
6099
|
-
*/
|
|
6100
|
-
setSync(key: string, value: TrialResult): void;
|
|
6101
|
-
}
|
|
6102
|
-
|
|
6103
5574
|
/**
|
|
6104
5575
|
* Wrap a single judge LLM call with retry, optional fallback-model
|
|
6105
5576
|
* rotation, exponential backoff, and a typed `JudgeRetryOutcome`. Callers
|
|
@@ -6232,52 +5703,145 @@ declare function referenceReplayRunsToSteeringRows<Input = unknown>(runs: Refere
|
|
|
6232
5703
|
declare function referenceReplayScenarioToRunScore(scenarioScore: ReferenceReplayScenarioScore, durationMs?: number): RunScore;
|
|
6233
5704
|
|
|
6234
5705
|
/**
|
|
6235
|
-
*
|
|
6236
|
-
*
|
|
6237
|
-
* - `strict-fail` — any `judgeSucceeded === false` trial fails the whole
|
|
6238
|
-
* aggregate. Use for production gates: one corrupt trial halts the gate.
|
|
5706
|
+
* Reflective mutation — primitives for trace-conditioned prompt rewriting.
|
|
6239
5707
|
*
|
|
6240
|
-
*
|
|
6241
|
-
*
|
|
5708
|
+
* Used by `prompt-evolution.ts` (and any consumer running iterative
|
|
5709
|
+
* improvement). Given a parent prompt + concrete trace evidence (top trials,
|
|
5710
|
+
* bottom trials, missed expectations), produce an LLM-ready prompt that
|
|
5711
|
+
* proposes targeted mutations — not blind rephrasings.
|
|
6242
5712
|
*
|
|
6243
|
-
*
|
|
6244
|
-
*
|
|
5713
|
+
* Why this lives outside `prompt-evolution.ts`: any consumer that wants to
|
|
5714
|
+
* run reflective rewriting WITHOUT the population/Pareto machinery can
|
|
5715
|
+
* import these primitives directly.
|
|
6245
5716
|
*
|
|
6246
|
-
*
|
|
6247
|
-
*
|
|
5717
|
+
* Quality bar (vs. naive "mutate this prompt"):
|
|
5718
|
+
* - Show parent ↔ children diff, not just one variant
|
|
5719
|
+
* - Quote specific missed goldens with their match phrases
|
|
5720
|
+
* - Surface the model's actual emitted output side-by-side with what was expected
|
|
5721
|
+
* - Quote concrete mutation primitives so the model has a vocabulary
|
|
6248
5722
|
*/
|
|
5723
|
+
interface TrialTrace {
|
|
5724
|
+
/** Stable id for the trial — surfaces in the prompt for grounding. */
|
|
5725
|
+
id: string;
|
|
5726
|
+
/** Score the trial received on its primary metric. */
|
|
5727
|
+
score: number;
|
|
5728
|
+
/** Candidate inputs the agent was given (e.g., the fixture or scenario). */
|
|
5729
|
+
inputName?: string;
|
|
5730
|
+
/**
|
|
5731
|
+
* Goldens / expectations this trial was tested against, with whether each
|
|
5732
|
+
* was matched. The reflection prompt quotes the missed ones specifically.
|
|
5733
|
+
*/
|
|
5734
|
+
expectations?: Array<{
|
|
5735
|
+
id: string;
|
|
5736
|
+
phrase: string;
|
|
5737
|
+
matched: boolean;
|
|
5738
|
+
}>;
|
|
5739
|
+
/** Free-form text — what the agent actually emitted (e.g., findings, plan). */
|
|
5740
|
+
emitted?: string;
|
|
5741
|
+
/** Optional structured metrics (recall, precision, cost, latency). */
|
|
5742
|
+
metrics?: Record<string, number>;
|
|
5743
|
+
}
|
|
5744
|
+
interface ReflectionContext {
|
|
5745
|
+
/** What is being mutated — appears in the system prompt for orientation. */
|
|
5746
|
+
target: string;
|
|
5747
|
+
/** Current variant's payload — JSON-serialised for the prompt. */
|
|
5748
|
+
parentPayload: unknown;
|
|
5749
|
+
/** Best-performing trials this generation. */
|
|
5750
|
+
topTrials: TrialTrace[];
|
|
5751
|
+
/** Worst-performing trials this generation — the missed-golden source. */
|
|
5752
|
+
bottomTrials: TrialTrace[];
|
|
5753
|
+
/** How many children the mutator should propose. */
|
|
5754
|
+
childCount: number;
|
|
5755
|
+
/** Optional: domain-specific mutation primitives the model can pick from. */
|
|
5756
|
+
mutationPrimitives?: string[];
|
|
5757
|
+
}
|
|
5758
|
+
declare const DEFAULT_MUTATION_PRIMITIVES: string[];
|
|
5759
|
+
/**
|
|
5760
|
+
* Build the LLM-ready reflection prompt. Output is plain text — pass it as
|
|
5761
|
+
* the user message. The system message should be small and stable (e.g.
|
|
5762
|
+
* "Output ONLY a JSON object matching the schema below.").
|
|
5763
|
+
*/
|
|
5764
|
+
declare function buildReflectionPrompt(ctx: ReflectionContext): string;
|
|
5765
|
+
interface ReflectionProposal {
|
|
5766
|
+
label: string;
|
|
5767
|
+
rationale: string;
|
|
5768
|
+
payload: unknown;
|
|
5769
|
+
}
|
|
5770
|
+
declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
|
|
6249
5771
|
|
|
6250
|
-
|
|
6251
|
-
|
|
6252
|
-
|
|
6253
|
-
|
|
6254
|
-
|
|
6255
|
-
|
|
6256
|
-
|
|
6257
|
-
|
|
6258
|
-
|
|
6259
|
-
|
|
6260
|
-
|
|
6261
|
-
|
|
6262
|
-
|
|
6263
|
-
|
|
6264
|
-
|
|
6265
|
-
|
|
6266
|
-
|
|
6267
|
-
|
|
5772
|
+
/**
|
|
5773
|
+
* SandboxPool — bounded checkout/release pool for mutation slots.
|
|
5774
|
+
*
|
|
5775
|
+
* The composite-mutator's `code` channel needs an isolated workspace per
|
|
5776
|
+
* mutation attempt: a git worktree, a sandbox container, a tmpdir clone —
|
|
5777
|
+
* whatever the consumer's runtime is. Without a pool, every consumer
|
|
5778
|
+
* re-implements the same machinery (mint N slots, check one out per
|
|
5779
|
+
* mutation, reset before reuse, drain at the end, track utilisation for
|
|
5780
|
+
* the cost ledger). This primitive ships that machinery so consumers
|
|
5781
|
+
* supply only a `SlotFactory`.
|
|
5782
|
+
*
|
|
5783
|
+
* Generic over a slot resource `T` so the same pool serves git worktrees
|
|
5784
|
+
* (T = path), Tangle sandboxes (T = SandboxBox), or anything else with
|
|
5785
|
+
* the create/reset/destroy lifecycle.
|
|
5786
|
+
*
|
|
5787
|
+
* Concurrency: FIFO via the shared `Mutex` primitive. Each `checkout()`
|
|
5788
|
+
* either takes an idle slot or queues until one is released. Lifecycle
|
|
5789
|
+
* is single-process — multi-process pools need external coordination
|
|
5790
|
+
* (file locks, etc.) and are deliberately out of scope.
|
|
5791
|
+
*/
|
|
5792
|
+
interface PoolSlot<T> {
|
|
5793
|
+
/** Stable id assigned at slot creation. Use for telemetry / lineage. */
|
|
5794
|
+
readonly id: string;
|
|
5795
|
+
/** Consumer-defined resource. */
|
|
5796
|
+
readonly resource: T;
|
|
5797
|
+
}
|
|
5798
|
+
interface SlotFactory<T> {
|
|
5799
|
+
/** Build a new slot. Called lazily as the pool grows up to `size`. */
|
|
5800
|
+
create(slotId: string): Promise<T>;
|
|
5801
|
+
/**
|
|
5802
|
+
* Reset a slot to a clean state before reuse. Called BEFORE every
|
|
5803
|
+
* checkout returns it (including the first — so the factory's
|
|
5804
|
+
* `create` can leave the slot dirty and let `reset` normalise).
|
|
5805
|
+
* Optional; default is a no-op.
|
|
5806
|
+
*/
|
|
5807
|
+
reset?(slot: PoolSlot<T>): Promise<void>;
|
|
5808
|
+
/** Tear the slot down. Called by `drain()`. */
|
|
5809
|
+
destroy(slot: PoolSlot<T>): Promise<void>;
|
|
5810
|
+
}
|
|
5811
|
+
interface SandboxPool<T> {
|
|
5812
|
+
/**
|
|
5813
|
+
* Take a slot. If all slots are busy, the promise resolves when one
|
|
5814
|
+
* is released. Always pair with the returned `release` (or wrap with
|
|
5815
|
+
* `withSlot`).
|
|
5816
|
+
*/
|
|
5817
|
+
checkout(): Promise<{
|
|
5818
|
+
slot: PoolSlot<T>;
|
|
5819
|
+
release: () => void;
|
|
5820
|
+
}>;
|
|
6268
5821
|
/**
|
|
6269
|
-
*
|
|
6270
|
-
*
|
|
6271
|
-
* downstream — the eval is corrupt.
|
|
5822
|
+
* Run `fn` with a checked-out slot, releasing on completion or throw.
|
|
5823
|
+
* The convenience wrapper most callers should use.
|
|
6272
5824
|
*/
|
|
6273
|
-
|
|
6274
|
-
|
|
6275
|
-
|
|
5825
|
+
withSlot<R>(fn: (slot: PoolSlot<T>) => Promise<R>): Promise<R>;
|
|
5826
|
+
/** Destroy every slot. Idempotent. */
|
|
5827
|
+
drain(): Promise<void>;
|
|
5828
|
+
/** How many slots have been minted (≤ `size`). */
|
|
5829
|
+
poolSize(): number;
|
|
5830
|
+
/** How many checkouts are currently outstanding. */
|
|
5831
|
+
activeCheckouts(): number;
|
|
5832
|
+
/** Snapshot of busy/total durations for the cost ledger. */
|
|
5833
|
+
utilization(): {
|
|
5834
|
+
busyMs: number;
|
|
5835
|
+
totalMs: number;
|
|
5836
|
+
checkouts: number;
|
|
6276
5837
|
};
|
|
6277
5838
|
}
|
|
6278
|
-
|
|
6279
|
-
|
|
6280
|
-
|
|
5839
|
+
interface CreateSandboxPoolOpts<T> {
|
|
5840
|
+
/** Maximum concurrent slots. Slots are minted on first need, not eagerly. */
|
|
5841
|
+
size: number;
|
|
5842
|
+
factory: SlotFactory<T>;
|
|
5843
|
+
}
|
|
5844
|
+
declare function createSandboxPool<T>(opts: CreateSandboxPoolOpts<T>): SandboxPool<T>;
|
|
6281
5845
|
|
|
6282
5846
|
/**
|
|
6283
5847
|
* Pipeline-level OTEL integration — auto-attaches an OTEL exporter when
|
|
@@ -6363,28 +5927,4 @@ declare function traceJudge(judge: JudgeFn, judgeName: string, opts: TracedJudge
|
|
|
6363
5927
|
*/
|
|
6364
5928
|
declare function traceJudgeEnsemble(judges: JudgeFn[], judgeNames: string[], opts: TracedJudgeOptions): JudgeFn;
|
|
6365
5929
|
|
|
6366
|
-
/**
|
|
6367
|
-
* Traced mutator wrapper — instruments reflective-mutation LLM calls.
|
|
6368
|
-
*
|
|
6369
|
-
* The reflective mutator (used by production-loop + multi-shot-optimization)
|
|
6370
|
-
* builds a prompt via `buildReflectionPrompt` and calls an LLM to produce
|
|
6371
|
-
* candidate mutations. This wrapper emits a span around each mutation call
|
|
6372
|
-
* so OTEL sinks observe:
|
|
6373
|
-
* - Model used for mutation
|
|
6374
|
-
* - Input context (target, trial count, child count)
|
|
6375
|
-
* - Output (proposal count, labels)
|
|
6376
|
-
* - Duration + cost if available
|
|
6377
|
-
*/
|
|
6378
|
-
|
|
6379
|
-
interface TracedMutatorOptions {
|
|
6380
|
-
/** TraceEmitter for span emission. */
|
|
6381
|
-
emitter: TraceEmitter;
|
|
6382
|
-
/** Parent span id. If omitted, uses emitter stack. */
|
|
6383
|
-
parentSpanId?: string;
|
|
6384
|
-
}
|
|
6385
|
-
/**
|
|
6386
|
-
* Wrap a MutateAdapter so every mutate() call emits a span.
|
|
6387
|
-
*/
|
|
6388
|
-
declare function traceMutator<P>(adapter: MutateAdapter<P>, opts: TracedMutatorOptions): MutateAdapter<P>;
|
|
6389
|
-
|
|
6390
|
-
export { ANALYST_SEVERITIES, ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AgentProfile, type AggregatorMode, type AlignmentOp, type Analyst, type AnalystContext, type AnalystCost, type AnalystFinding, type AnalystHooks, type AnalystInputKind, AnalystRegistry, type AnalystRegistryOptions, type AnalystRequirements, type AnalystRunEvent, type AnalystRunInputs, type AnalystRunResult, type AnalystRunSummary, type AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, type BudgetPolicy, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, type ChatCallOpts, type ChatClient, type ChatRequest, type ChatResponse, type ChatTransport, CheckResult, type CliBridgeTransportOpts, type CodeMutationOutcome, type CodeMutationRunner, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ControlEvalResult, ConvergenceTracker, type CorrectnessChecker, type CostEntry, CostLedger, type CostLedgerGeneration, type 
CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateChatClientOpts, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type DirectProviderTransportOpts, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvidenceRef, type EvolutionRound, EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, type FailureClusterConfig, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, 
type FlowStep, GateDecision, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGateConfig, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntegrationGateSurface, type IntegrationInvokeFailureInput, type IntegrationManifestGateInput, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAdapterOpts, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallRequest, LlmCallResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MockTransportOpts, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, MultiShotMutateAdapter, 
MultiShotOptimizationResult, MultiShotRunner, MultiShotScorer, MultiShotTrialResult, type MultiToolchainLayerConfig, MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type ProductionEvolveConfig, type ProductionLoopCronConfig, type ProductionLoopDecision, type ProductionLoopRenderContext, type ProductionLoopResult, type ProductionShipConfig, type PromptHandle, PromptRegistry, TrialResult as PromptTrialResult, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type 
ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegistryRunOpts, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouterTransportOpts, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, type RunProductionLoopOptions, RunRecord, type RunScore, type RunScoreWeights, RunSplitTag, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxSdkTransportOpts, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type 
SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, type TracedMutatorOptions, Trajectory, TrajectoryStep, type TrialAggregate, type TrialAttempt, TrialCache, TrialTelemetry, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, VariantAggregate, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, agentProfileHash, aggregatePrReviewScore, aggregateRunScore, aggregateTrialsByMode, analyzeAntiSlop, analyzeSeries, appendScorecard, assertRealBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, computeFindingId, containsAll, createAntiSlopJudge, createChatClient, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, 
defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, integrationAsi, integrationGateEvals, integrationInvokeFailedPayload, integrationManifestResolvedPayload, integrationManifestValidatedPayload, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, makeFinding, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, paraphraseRobustness, paraphraseRobustnessScenarios, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runProductionLoop, 
runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, traceMutator, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
|
|
5930
|
+
export { ANALYST_SEVERITIES, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, AgentEvalError, type AgentProfile, type AlignmentOp, type Analyst, type AnalystContext, type AnalystCost, type AnalystFinding, type AnalystHooks, type AnalystInputKind, AnalystRegistry, type AnalystRegistryOptions, type AnalystRequirements, type AnalystRunEvent, type AnalystRunInputs, type AnalystRunResult, type AnalystRunSummary, type AnalystSeverity, AnalyzeTracesInput, AnalyzeTracesOptions, AnalyzeTracesResult, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, Artifact$1 as Artifact, type Artifact as ArtifactCheckArtifact, type ArtifactEventLike, type ArtifactValidator, type AutoPrClient, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BackendIntegrityError, type BackendIntegrityReport, BaselineReport, BehaviorAssertion, BenchmarkReport, BenchmarkRunner, BenchmarkRunnerConfig, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, BudgetGuard, BudgetLedgerEntry, type BudgetPolicy, BudgetSpec, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CausalAttributionReport, type CellVerdict, type ChatCallOpts, type ChatClient, type ChatRequest, type ChatResponse, type ChatTransport, CheckResult, type CliBridgeTransportOpts, CollectedArtifacts, type CommandRunner, CompletionCriterion, type CompletionRequirement, type CompletionVerdict, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrectnessChecker, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateChatClientOpts, type 
CreateDefaultReviewerOptions, type CreateSandboxPoolOpts, type CreateTraceAnalystKindOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_PR_REVIEW_SCORE_WEIGHTS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, DEFAULT_TRACE_ANALYST_KINDS, Dataset, DatasetScenario, type DecideNextUserTurnOpts, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DiffPolicy, type DiffScorecardOptions, type DirEntry, type DirectProviderTransportOpts, type Direction, type DiscoverPersonasOptions, type DiscoveredPersona, DriverResult, DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EvidenceRef, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_MODE_KIND_SPEC, FINDING_SUBJECT_GRAMMAR_PROMPT, FINDING_SUBJECT_KINDS, type FactorContribution, type FactorialCell, FeedbackLabel, FeedbackTrajectory, FeedbackTrajectoryStore, type FileChange, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, type FindingSubject, type FindingSubjectKind, FindingSubjectStringSchema, type FindingsDiff, FindingsStore, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GhCliClientOptions, type GoldenSeverity, type GoldenSpec, type HarnessAdapter, HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type 
HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HttpGithubClientOptions, type HypothesisManifest, type HypothesisResult, IMPROVEMENT_KIND_SPEC, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAdapterOpts, type JudgeFleetOptions, JudgeFn, JudgeInput, type JudgeReplayResult, type JudgeRetryOutcome, type JudgeRetryPolicy, JudgeRunner, KIND_EXPECTED_SUBJECTS, KNOWLEDGE_GAP_KIND_SPEC, KNOWLEDGE_POISONING_KIND_SPEC, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, Layer, LayerResult, type LiveProofArtifact, type LiveProofConfig, type LiveProofContext, type LiveProofResult, LlmCallRequest, LlmCallResult, LlmClientOptions, type LlmCorrectnessCheckerOpts, LlmSpan, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, MetricsCollector, type MockTransportOpts, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, Mutex, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, OtelExportConfig, OtelExporter, type OtelPipelineHandle, type OtelPipelineOptions, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoResult, type PersistedFinding, PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PrReviewAuditCase, type 
PrReviewBenchmarkSummary, type PrReviewComment, type PrReviewMatchedFinding, type PrReviewOutcome, type PrReviewReferenceFinding, type PrReviewScore, type PrReviewScoreWeights, type PrReviewSeverity, type PrReviewSource, type ProducedProposal, type ProducedState, ProductClient, ProductClientConfig, type PromptHandle, PromptRegistry, type ProposalEventLike, type ProposeAutomatedPullRequestInput, type ProposeAutomatedPullRequestResult, RAW_FINDING_SCHEMA_PROMPT, type RawAnalystFinding, RawAnalystFindingSchema, type RecordRunsOptions, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegistryRunOpts, ReleaseConfidenceScorecard, ReleaseConfidenceThresholds, type RepoRef, type RequirementCheck, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouterTransportOpts, Run$1 as Run, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticAdapterOpts, type RunCriticOptions, type RunDiff, RunFilter, RunRecord, type RunScore, type RunScoreWeights, type RunTrace, type RuntimeEventLike, SEMANTIC_CONCEPT_JUDGE_VERSION, 
SandboxDriver, SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxSdkTransportOpts, type SatisfiedBy, type ScanOptions, Scenario, type ScenarioCost, ScenarioFile, ScenarioRegistry, ScenarioResult, type Scorecard, type ScorecardCell, type ScorecardCellDiff, type ScorecardDiff, type ScorecardEntry, type ScorecardLogLine, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SemanticConceptJudgeAdapterOpts, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, Severity, type SignedManifest, type SignedManifestAlgo, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, Span, type SteeringBundle, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type SynthesisReason, type SynthesisTarget, type TaskGold, TestResult, type ThresholdContract, TokenCounter, type TokenSpec, type ToolCallEventLike, TraceAnalysisStore, type TraceAnalystAdapterOpts, type TraceAnalystGolden, type TraceAnalystKindSpec, TraceEmitter, TraceEvent, TraceStore, type TraceToolGroupName, type TracedAnalystOptions, type TracedJudgeOptions, Trajectory, TrajectoryStep, type TrialTrace, TurnMetrics, UNIVERSAL_FINDERS, type ValidationContext, type ValidationIssue, type ValidationResult, type VerifierAdapterOpts, VerifyContext, VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, agentProfileHash, aggregatePrReviewScore, aggregateRunScore, 
analyzeAntiSlop, analyzeSeries, appendScorecard, assertRealBackend, attributeCounterfactuals, bisect, buildDriverSystemPrompt, buildReflectionPrompt, buildReviewerPrompt, buildTraceToolsForGroup, byteLengthRange, canaryLeakView, canonicalize, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, codeExecutionJudge, coherenceJudge, collectionPreserved, commentsForSource, commitBisect, compareReferenceReplay, compilerJudge, composeValidators, computeFindingId, containsAll, createAntiSlopJudge, createChatClient, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createJudgeAdapter, createLlmCorrectnessChecker, createRunCriticAdapter, createSandboxPool, createSemanticConceptJudge, createSemanticConceptJudgeAdapter, createTraceAnalystAdapter, createTraceAnalystKind, createVerifierAdapter, crossTraceDiff, crowdingDistance, decideNextUserTurn, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultIsMaterial, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, diffFindings, diffScorecard, discoverPersonas, distillPlaybook, dominates, estimateCost, estimateTokens, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, extractAssetUrls, extractErrorCount, extractProducedState, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, formatScorecardDiff, ghCliClient, precision as goldenPrecision, hashContent, hashJson, htmlContainsElement, httpGithubClient, inMemoryReferenceReplayStore, isOtelConfigured, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, keyPreserved, liftSeverity, linterJudge, loadScorecard, loadScorerFromGrader, localCommandRunner, lowercaseMutator, makeFinding, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, notBlocked, 
paraphraseRobustness, paraphraseRobustnessScenarios, paretoFrontier, paretoFrontierWithCrowding, parseCorrectnessResponse, parseFindingSubject, parseRawFinding, parseReflectionResponse, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, printDriverSummary, promptBisect, proposeAutomatedPullRequest, proposeSynthesisTargets, recordRuns, recordRunsToScorecard, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, renderFindingSubject, renderMarkdownReport, renderPlaybookMarkdown, renderPriorFindings, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, resetLockedAppendersForTesting, rowCount, rowWhere, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runLiveProof, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, scalarScore, scanForMuffledGates, scoreContinuity, scorePrReviewComments, scorePrReviewSource, scoreReferenceReplay, securityJudge, selectHarnessVariant, sentenceReorderMutator, signManifest, statusAdvanced, summarizeBackendIntegrity, summarizeHarnessResults, summarizePrReviewBenchmark, testJudge, textInSnapshot, toLangfuseEnvelope, toPrometheusText, traceJudge, traceJudgeEnsemble, tracedAnalyzeTraces, typoMutator, urlContains, verifyCompletion, verifyManifest, visualDiff, viteDeployRunner, weightedRecall, whitespaceCollapseMutator, withJudgeRetry, withOtelPipeline, wranglerDeployRunner };
|