@tangle-network/agent-eval 0.11.1 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +489 -4
- package/dist/index.js +659 -35
- package/dist/index.js.map +1 -1
- package/dist/sink-fetch-C0B8ximv.d.ts +101 -0
- package/dist/telemetry/file.d.ts +19 -0
- package/dist/telemetry/file.js +40 -0
- package/dist/telemetry/file.js.map +1 -0
- package/dist/telemetry/index.d.ts +38 -0
- package/dist/telemetry/index.js +128 -0
- package/dist/telemetry/index.js.map +1 -0
- package/package.json +19 -10
package/dist/index.d.ts
CHANGED
|
@@ -959,13 +959,13 @@ interface RunDiff {
|
|
|
959
959
|
* and returns a number per scenario. This lets the optimizer stay small +
|
|
960
960
|
* testable.
|
|
961
961
|
*/
|
|
962
|
-
interface PromptVariant {
|
|
962
|
+
interface PromptVariant$1 {
|
|
963
963
|
id: string;
|
|
964
964
|
prompt: string;
|
|
965
965
|
metadata?: Record<string, unknown>;
|
|
966
966
|
}
|
|
967
967
|
interface OptimizationConfig {
|
|
968
|
-
variants: PromptVariant[];
|
|
968
|
+
variants: PromptVariant$1[];
|
|
969
969
|
/** How many trials per (variant, scenario) — controls CI tightness. Default 3. */
|
|
970
970
|
trialsPerScenario?: number;
|
|
971
971
|
/** Significance threshold for pairwise comparison (default 0.05). */
|
|
@@ -976,7 +976,7 @@ interface OptimizationConfig {
|
|
|
976
976
|
* monotonicity).
|
|
977
977
|
*/
|
|
978
978
|
scoreVariant: (args: {
|
|
979
|
-
variant: PromptVariant;
|
|
979
|
+
variant: PromptVariant$1;
|
|
980
980
|
scenarioId: string;
|
|
981
981
|
trialIndex: number;
|
|
982
982
|
}) => Promise<number>;
|
|
@@ -1750,6 +1750,46 @@ declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
|
|
|
1750
1750
|
* iff no other candidate dominates it.
|
|
1751
1751
|
*/
|
|
1752
1752
|
declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
|
|
1753
|
+
/**
|
|
1754
|
+
* Weighted-sum scalarisation. Use as a tie-break / single-winner selector
|
|
1755
|
+
* when callers don't want to consume a frontier. Each objective contributes
|
|
1756
|
+
* its normalised value (0..1 via min-max across the candidate pool) times
|
|
1757
|
+
* its weight; missing weights default to 1/N.
|
|
1758
|
+
*
|
|
1759
|
+
* Direction is honoured automatically — `minimize` axes have their values
|
|
1760
|
+
* inverted before scaling so "higher scalar = better" always holds.
|
|
1761
|
+
*/
|
|
1762
|
+
declare function scalarScore<T>(candidates: T[], objectives: Objective<T>[], options?: {
|
|
1763
|
+
weights?: Partial<Record<string, number>>;
|
|
1764
|
+
}): Array<{
|
|
1765
|
+
candidate: T;
|
|
1766
|
+
score: number;
|
|
1767
|
+
}>;
|
|
1768
|
+
/**
|
|
1769
|
+
* NSGA-II crowding distance — secondary sort for ties on the frontier.
|
|
1770
|
+
*
|
|
1771
|
+
* When the Pareto front collapses to a single point (or many candidates tie
|
|
1772
|
+
* on dominance), naive selection picks arbitrarily and the population
|
|
1773
|
+
* degenerates over generations. NSGA-II preserves diversity by preferring
|
|
1774
|
+
* candidates with more empty space around them on the frontier.
|
|
1775
|
+
*
|
|
1776
|
+
* Returns an array of `{ candidate, distance }` in the SAME order as the
|
|
1777
|
+
* input. Higher distance = more isolated = should be preferred when
|
|
1778
|
+
* preserving diversity.
|
|
1779
|
+
*/
|
|
1780
|
+
declare function crowdingDistance<T>(candidates: T[], objectives: Objective<T>[]): Array<{
|
|
1781
|
+
candidate: T;
|
|
1782
|
+
distance: number;
|
|
1783
|
+
}>;
|
|
1784
|
+
/**
|
|
1785
|
+
* Pareto frontier with tie-break by crowding distance — the canonical
|
|
1786
|
+
* NSGA-II selection step. Returns the frontier sorted by descending crowding
|
|
1787
|
+
* distance so callers can `.slice(0, k)` to pick K diverse winners.
|
|
1788
|
+
*/
|
|
1789
|
+
declare function paretoFrontierWithCrowding<T>(candidates: T[], objectives: Objective<T>[]): Array<{
|
|
1790
|
+
candidate: T;
|
|
1791
|
+
distance: number;
|
|
1792
|
+
}>;
|
|
1753
1793
|
|
|
1754
1794
|
type HarnessIntervention = 'continue' | 'plan' | 'audit' | 'recover' | 'repair' | 'verify' | 'final_gate' | 'wait_for_measurement' | 'abort';
|
|
1755
1795
|
interface WorkflowTopology {
|
|
@@ -6079,4 +6119,449 @@ interface ReferenceReplaySteeringRowsOptions<Input = unknown> {
|
|
|
6079
6119
|
declare function referenceReplayRunsToSteeringRows<Input = unknown>(runs: ReferenceReplayRun<Input>[], options?: ReferenceReplaySteeringRowsOptions<Input>): SteeringOptimizationRow[];
|
|
6080
6120
|
declare function referenceReplayScenarioToRunScore(scenarioScore: ReferenceReplayScenarioScore, durationMs?: number): RunScore;
|
|
6081
6121
|
|
|
6082
|
-
export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type 
DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, 
InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, MODEL_PRICING, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, 
type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type 
SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, 
analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, 
normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
|
|
6122
|
+
/**
|
|
6123
|
+
* PromptEvolutionLoop — population-based reflective-mutation evolution.
|
|
6124
|
+
*
|
|
6125
|
+
* Above the existing `AxGepaSteeringOptimizer` (which RANKS variants),
|
|
6126
|
+
* this loop GENERATES variants. Each generation:
|
|
6127
|
+
* 1. Score the population across (variant × scenario × rep).
|
|
6128
|
+
* 2. Pick survivors from the Pareto frontier (with crowding-distance tie-break).
|
|
6129
|
+
* 3. Ask the mutator for replacements until population size is restored.
|
|
6130
|
+
* 4. Repeat for N generations OR until convergence.
|
|
6131
|
+
*
|
|
6132
|
+
* Domain-agnostic. Consumers supply:
|
|
6133
|
+
* - A seed population of `PromptVariant`s.
|
|
6134
|
+
* - A `ScoreAdapter` that runs (variant, scenario, rep) → `TrialResult`.
|
|
6135
|
+
* - A `MutateAdapter` that produces children given trace evidence.
|
|
6136
|
+
* - Pareto `Objective<VariantAggregate>[]` defining the multi-objective vector.
|
|
6137
|
+
*
|
|
6138
|
+
* The loop owns: population management, parallel scheduling (concurrency-
|
|
6139
|
+
* limited), Pareto selection with crowding distance, generation reporting.
|
|
6140
|
+
*
|
|
6141
|
+
* It does NOT own: rendering trials to a model, executing prompts, choosing
|
|
6142
|
+
* mutation primitives, persisting to disk. Those are the consumer's call.
|
|
6143
|
+
*/
|
|
6144
|
+
|
|
6145
|
+
interface PromptVariant<P = unknown> {
|
|
6146
|
+
/** Stable id for the variant — surfaces in reports and trial results. */
|
|
6147
|
+
id: string;
|
|
6148
|
+
/** Variant payload — interpretation is the consumer's responsibility. */
|
|
6149
|
+
payload: P;
|
|
6150
|
+
/** Generation index (0 = seed, then 1, 2, ...). */
|
|
6151
|
+
generation: number;
|
|
6152
|
+
/** Parent variant id when produced via mutation; absent for seeds. */
|
|
6153
|
+
parentId?: string;
|
|
6154
|
+
/** Human label for reports. */
|
|
6155
|
+
label: string;
|
|
6156
|
+
/** What the mutator was trying to fix. */
|
|
6157
|
+
rationale?: string;
|
|
6158
|
+
}
|
|
6159
|
+
interface TrialResult {
|
|
6160
|
+
variantId: string;
|
|
6161
|
+
scenarioId: string;
|
|
6162
|
+
rep: number;
|
|
6163
|
+
ok: boolean;
|
|
6164
|
+
/** Primary scalar score the consumer cares about (e.g., recall, accuracy). */
|
|
6165
|
+
score: number;
|
|
6166
|
+
/** Token cost (or any cost-like dimension). */
|
|
6167
|
+
cost?: number;
|
|
6168
|
+
/** Wall time in ms. */
|
|
6169
|
+
durationMs?: number;
|
|
6170
|
+
/** Free-form metric bag for objective accessors. */
|
|
6171
|
+
metrics?: Record<string, number>;
|
|
6172
|
+
error?: string;
|
|
6173
|
+
}
|
|
6174
|
+
/** Aggregated trial summary for one (variant, scenario) pair across reps. */
|
|
6175
|
+
interface ScenarioAggregate {
|
|
6176
|
+
variantId: string;
|
|
6177
|
+
scenarioId: string;
|
|
6178
|
+
meanScore: number;
|
|
6179
|
+
meanCost: number;
|
|
6180
|
+
meanDurationMs: number;
|
|
6181
|
+
okRate: number;
|
|
6182
|
+
trials: number;
|
|
6183
|
+
/** Mean of every numeric metric across reps. */
|
|
6184
|
+
metrics: Record<string, number>;
|
|
6185
|
+
}
|
|
6186
|
+
/** Aggregated trial summary for one variant across all scenarios. */
|
|
6187
|
+
interface VariantAggregate {
|
|
6188
|
+
variantId: string;
|
|
6189
|
+
meanScore: number;
|
|
6190
|
+
meanCost: number;
|
|
6191
|
+
meanDurationMs: number;
|
|
6192
|
+
okRate: number;
|
|
6193
|
+
scenarios: ScenarioAggregate[];
|
|
6194
|
+
/** Mean of every numeric metric, averaged across scenarios. */
|
|
6195
|
+
metrics: Record<string, number>;
|
|
6196
|
+
}
|
|
6197
|
+
interface ScoreAdapter<P = unknown> {
|
|
6198
|
+
score(args: {
|
|
6199
|
+
variant: PromptVariant<P>;
|
|
6200
|
+
scenarioId: string;
|
|
6201
|
+
rep: number;
|
|
6202
|
+
}): Promise<TrialResult>;
|
|
6203
|
+
}
|
|
6204
|
+
interface MutateAdapter<P = unknown> {
|
|
6205
|
+
mutate(args: {
|
|
6206
|
+
parent: PromptVariant<P>;
|
|
6207
|
+
parentAggregate: VariantAggregate;
|
|
6208
|
+
topTrials: TrialResult[];
|
|
6209
|
+
bottomTrials: TrialResult[];
|
|
6210
|
+
childCount: number;
|
|
6211
|
+
generation: number;
|
|
6212
|
+
}): Promise<PromptVariant<P>[]>;
|
|
6213
|
+
}
|
|
6214
|
+
interface PromptEvolutionConfig<P = unknown> {
|
|
6215
|
+
runId: string;
|
|
6216
|
+
/** What component is being mutated — surfaces in reports + reflection prompts. */
|
|
6217
|
+
target: string;
|
|
6218
|
+
seedVariants: PromptVariant<P>[];
|
|
6219
|
+
scenarioIds: string[];
|
|
6220
|
+
reps: number;
|
|
6221
|
+
generations: number;
|
|
6222
|
+
populationSize: number;
|
|
6223
|
+
/** Maximum concurrent score() calls. */
|
|
6224
|
+
scoreConcurrency: number;
|
|
6225
|
+
scoreAdapter: ScoreAdapter<P>;
|
|
6226
|
+
mutateAdapter: MutateAdapter<P>;
|
|
6227
|
+
/** Pareto objectives over `VariantAggregate`. Ordered by importance. */
|
|
6228
|
+
objectives: Objective<VariantAggregate>[];
|
|
6229
|
+
/** Optional weights for the scalar tie-break selector (by objective name). */
|
|
6230
|
+
scalarWeights?: Record<string, number>;
|
|
6231
|
+
/** Stop early if a generation produces no Pareto improvement. Default true. */
|
|
6232
|
+
earlyStopOnNoImprovement?: boolean;
|
|
6233
|
+
onProgress?: (event: PromptEvolutionEvent) => void;
|
|
6234
|
+
/**
|
|
6235
|
+
* Optional cache instance for memoising scored (variantId, scenarioId, rep)
|
|
6236
|
+
* tuples. When a cache instance is provided, repeated trials
|
|
6237
|
+
* skip re-scoring. Cache keys are stable across runs.
|
|
6238
|
+
*/
|
|
6239
|
+
cache?: TrialCache;
|
|
6240
|
+
}
|
|
6241
|
+
interface TrialCache {
|
|
6242
|
+
get(key: string): TrialResult | undefined;
|
|
6243
|
+
set(key: string, value: TrialResult): void;
|
|
6244
|
+
}
|
|
6245
|
+
declare class InMemoryTrialCache implements TrialCache {
|
|
6246
|
+
private store;
|
|
6247
|
+
get(key: string): TrialResult | undefined;
|
|
6248
|
+
set(key: string, value: TrialResult): void;
|
|
6249
|
+
size(): number;
|
|
6250
|
+
clear(): void;
|
|
6251
|
+
}
|
|
6252
|
+
type PromptEvolutionEvent = {
|
|
6253
|
+
type: 'generation-start';
|
|
6254
|
+
generation: number;
|
|
6255
|
+
populationSize: number;
|
|
6256
|
+
} | {
|
|
6257
|
+
type: 'trial-complete';
|
|
6258
|
+
generation: number;
|
|
6259
|
+
variantId: string;
|
|
6260
|
+
scenarioId: string;
|
|
6261
|
+
rep: number;
|
|
6262
|
+
ok: boolean;
|
|
6263
|
+
score: number;
|
|
6264
|
+
cached: boolean;
|
|
6265
|
+
} | {
|
|
6266
|
+
type: 'generation-complete';
|
|
6267
|
+
report: GenerationReport<unknown>;
|
|
6268
|
+
} | {
|
|
6269
|
+
type: 'converged';
|
|
6270
|
+
generation: number;
|
|
6271
|
+
reason: string;
|
|
6272
|
+
};
|
|
6273
|
+
interface GenerationReport<P = unknown> {
|
|
6274
|
+
runId: string;
|
|
6275
|
+
target: string;
|
|
6276
|
+
generation: number;
|
|
6277
|
+
variants: PromptVariant<P>[];
|
|
6278
|
+
aggregates: VariantAggregate[];
|
|
6279
|
+
/** Frontier candidates, sorted by descending crowding distance. */
|
|
6280
|
+
paretoFrontIds: string[];
|
|
6281
|
+
/** Scalar-best variant id — used for the single "winner" if callers want one. */
|
|
6282
|
+
winnerId: string;
|
|
6283
|
+
/** Trials that fed this generation (kept for downstream reporting). */
|
|
6284
|
+
trials: TrialResult[];
|
|
6285
|
+
}
|
|
6286
|
+
interface PromptEvolutionResult<P = unknown> {
|
|
6287
|
+
runId: string;
|
|
6288
|
+
target: string;
|
|
6289
|
+
generations: GenerationReport<P>[];
|
|
6290
|
+
/** Best variant by scalar score in the final generation. */
|
|
6291
|
+
bestVariant: PromptVariant<P>;
|
|
6292
|
+
/** Best aggregate (matches bestVariant). */
|
|
6293
|
+
bestAggregate: VariantAggregate;
|
|
6294
|
+
}
|
|
6295
|
+
declare function runPromptEvolution<P>(config: PromptEvolutionConfig<P>): Promise<PromptEvolutionResult<P>>;
|
|
6296
|
+
|
|
6297
|
+
/**
|
|
6298
|
+
* GoldenMatcher — fuzzy matcher for "did the agent produce the expected things?".
|
|
6299
|
+
*
|
|
6300
|
+
* Universal primitive across agent-eval consumers. Use it for:
|
|
6301
|
+
* - Test suites: did the run hit the expected assertions?
|
|
6302
|
+
* - Tool agents: did the agent emit the expected tool call sequence?
|
|
6303
|
+
* - Judges: did the verdict include the expected concepts?
|
|
6304
|
+
* - Design audits: did the auditor surface the planted defects?
|
|
6305
|
+
*
|
|
6306
|
+
* Match rule (per golden):
|
|
6307
|
+
* - Any phrase in `golden.any` (case-insensitive substring) appears in the
|
|
6308
|
+
* candidate's text fields, OR
|
|
6309
|
+
* - Any pattern in `golden.anyRegex` (case-insensitive) matches.
|
|
6310
|
+
*
|
|
6311
|
+
* Recall is severity-weighted by default: critical=3, major=2, minor=1.
|
|
6312
|
+
* Missing one critical hurts more than missing three minors.
|
|
6313
|
+
*/
|
|
6314
|
+
type GoldenSeverity = 'critical' | 'major' | 'minor';
|
|
6315
|
+
interface GoldenSpec {
|
|
6316
|
+
/** Stable identifier — survives across runs so consumers can grep by id. */
|
|
6317
|
+
id: string;
|
|
6318
|
+
/** Severity drives recall weighting. */
|
|
6319
|
+
severity: GoldenSeverity;
|
|
6320
|
+
/**
|
|
6321
|
+
* Substring phrases (case-insensitive). A hit on ANY phrase counts as a
|
|
6322
|
+
* match. Keep these SHORT (3-6 words) and SPECIFIC.
|
|
6323
|
+
*/
|
|
6324
|
+
any: string[];
|
|
6325
|
+
/** Optional regex patterns. ORed with `any`. */
|
|
6326
|
+
anyRegex?: string[];
|
|
6327
|
+
/** Free-form note — surfaces in reports for humans. */
|
|
6328
|
+
hint?: string;
|
|
6329
|
+
/** Optional category for grouping/filtering. */
|
|
6330
|
+
category?: string;
|
|
6331
|
+
}
|
|
6332
|
+
interface MatchResult {
|
|
6333
|
+
/** Same length as goldens; `true` when matched. */
|
|
6334
|
+
matches: boolean[];
|
|
6335
|
+
/** Convenience: count of hits. */
|
|
6336
|
+
hits: number;
|
|
6337
|
+
/** Convenience: total goldens. */
|
|
6338
|
+
total: number;
|
|
6339
|
+
}
|
|
6340
|
+
/**
|
|
6341
|
+
* Match each golden against `candidates`, where each candidate exposes one or
|
|
6342
|
+
* more text fields the matcher should search. Defaults to searching all
|
|
6343
|
+
* string-typed fields concatenated.
|
|
6344
|
+
*/
|
|
6345
|
+
declare function matchGoldens<T>(goldens: GoldenSpec[], candidates: T[], options?: {
|
|
6346
|
+
/**
|
|
6347
|
+
* Extract the searchable text for a candidate. Default: concatenate every
|
|
6348
|
+
* top-level string field with a space.
|
|
6349
|
+
*/
|
|
6350
|
+
text?: (candidate: T) => string;
|
|
6351
|
+
}): MatchResult;
|
|
6352
|
+
/** Severity weights — exposed so consumers can override (rare). */
|
|
6353
|
+
declare const DEFAULT_SEVERITY_WEIGHTS: Record<GoldenSeverity, number>;
|
|
6354
|
+
/** Severity-weighted recall over a MatchResult + the goldens that produced it. */
|
|
6355
|
+
declare function weightedRecall(goldens: GoldenSpec[], result: MatchResult, weights?: Record<GoldenSeverity, number>): number;
|
|
6356
|
+
/**
|
|
6357
|
+
* Precision proxy: fraction of emitted candidates that match SOME golden.
|
|
6358
|
+
*
|
|
6359
|
+
* No human-labelled negatives means unmatched candidates are SOFT false
|
|
6360
|
+
* positives — punishes verbose agents that pad with filler. Doesn't punish
|
|
6361
|
+
* unknown-but-real findings; the way to tighten this is to grow the golden
|
|
6362
|
+
* set, not to invent a stricter score.
|
|
6363
|
+
*/
|
|
6364
|
+
declare function precision<T>(goldens: GoldenSpec[], candidates: T[], options?: {
|
|
6365
|
+
text?: (candidate: T) => string;
|
|
6366
|
+
}): number;
|
|
6367
|
+
|
|
6368
|
+
/**
|
|
6369
|
+
* Inter-critic / inter-pass orthogonality.
|
|
6370
|
+
*
|
|
6371
|
+
* Detects redundant ensembles. When you run N critics (or N audit passes,
|
|
6372
|
+
* or N specialized agents) on the same input, you want them to disagree —
|
|
6373
|
+
* each contributing distinct signal. If they all converge on the same set
|
|
6374
|
+
* of findings, you're paying N× cost for ~1× signal.
|
|
6375
|
+
*
|
|
6376
|
+
* The metric is `1 − mean pairwise cosine similarity` over bags of words
|
|
6377
|
+
* extracted from each pass's outputs. 1.0 = fully orthogonal,
|
|
6378
|
+
* 0.0 = fully redundant.
|
|
6379
|
+
*
|
|
6380
|
+
* Universal primitive: pass anything that produces text (findings, tool
|
|
6381
|
+
* calls rendered as JSON, verdict strings) and the metric derives its own
|
|
6382
|
+
* vocabulary.
|
|
6383
|
+
*/
|
|
6384
|
+
interface OrthogonalityInput<T> {
|
|
6385
|
+
passes: Array<{
|
|
6386
|
+
findings: T[];
|
|
6387
|
+
}>;
|
|
6388
|
+
/** Render one element to text. Default: defaultRender (concatenates string fields). */
|
|
6389
|
+
text?: (item: T) => string;
|
|
6390
|
+
/** Minimum token length kept in the bag. Default 4 (drops short fillers). */
|
|
6391
|
+
minTokenLength?: number;
|
|
6392
|
+
}
|
|
6393
|
+
interface OrthogonalityResult {
|
|
6394
|
+
/** 1 − mean pairwise cosine similarity across passes. 1=fully orthogonal, 0=fully redundant. */
|
|
6395
|
+
orthogonality: number;
|
|
6396
|
+
/** Number of passes considered. */
|
|
6397
|
+
passCount: number;
|
|
6398
|
+
/** Pairwise cosine similarities, in upper-triangular order (for debugging). */
|
|
6399
|
+
similarities: number[];
|
|
6400
|
+
}
|
|
6401
|
+
declare function passOrthogonality<T>(input: OrthogonalityInput<T>): OrthogonalityResult;
|
|
6402
|
+
|
|
6403
|
+
/**
|
|
6404
|
+
* Bootstrap-CI promotion gate.
|
|
6405
|
+
*
|
|
6406
|
+
* In any iterative-improvement loop (GEPA, prompt evolution, dataset
|
|
6407
|
+
* curation), the question is "did this generation actually improve, or are
|
|
6408
|
+
* we celebrating noise?". With small N and noisy outcomes, point-estimate
|
|
6409
|
+
* deltas lie. Bootstrap confidence intervals tell the operator whether the
|
|
6410
|
+
* delta is real before code or prompts get promoted.
|
|
6411
|
+
*
|
|
6412
|
+
* This module is pure functions — no I/O, no model calls. Easy to unit-test
|
|
6413
|
+
* and to compose into any verdict gate.
|
|
6414
|
+
*
|
|
6415
|
+
* Default gate:
|
|
6416
|
+
* - Bootstrap mean baseline vs candidate (1k resamples).
|
|
6417
|
+
* - Compute the delta distribution; pass if the lower CI bound > 0.
|
|
6418
|
+
* - Tunable confidence (default 95%) and resample count.
|
|
6419
|
+
*
|
|
6420
|
+
* Verdict semantics intentionally match the existing `experiments.jsonl`
|
|
6421
|
+
* vocabulary:
|
|
6422
|
+
* - ADVANCE: lower bound of the CI on the delta > 0 (real win)
|
|
6423
|
+
* - KEEP: CI on the delta straddles 0, but candidate point estimate >= baseline (neutral)
|
|
6424
|
+
* - REVERT: upper bound of the CI on the delta < 0 (real regression)
|
|
6425
|
+
* - INCONCLUSIVE: not enough samples or CI straddles zero with no signal
|
|
6426
|
+
*/
|
|
6427
|
+
type Verdict = 'ADVANCE' | 'KEEP' | 'REVERT' | 'INCONCLUSIVE';
|
|
6428
|
+
interface BootstrapResult {
|
|
6429
|
+
baselineMean: number;
|
|
6430
|
+
candidateMean: number;
|
|
6431
|
+
/** candidateMean - baselineMean, point estimate. */
|
|
6432
|
+
delta: number;
|
|
6433
|
+
/** Lower bound of the (1 - alpha) CI on the delta. */
|
|
6434
|
+
ciLower: number;
|
|
6435
|
+
/** Upper bound of the (1 - alpha) CI on the delta. */
|
|
6436
|
+
ciUpper: number;
|
|
6437
|
+
/** Number of bootstrap resamples used. */
|
|
6438
|
+
iterations: number;
|
|
6439
|
+
alpha: number;
|
|
6440
|
+
verdict: Verdict;
|
|
6441
|
+
}
|
|
6442
|
+
interface BootstrapOptions {
|
|
6443
|
+
/** Significance level alpha; the CI covers (1 - alpha) (default 0.05 → 95% CI). */
|
|
6444
|
+
alpha?: number;
|
|
6445
|
+
/** Number of resamples (default 1000). */
|
|
6446
|
+
iterations?: number;
|
|
6447
|
+
/**
|
|
6448
|
+
* Minimum total samples (baseline + candidate) below which we always
|
|
6449
|
+
* return INCONCLUSIVE — bootstrap with too few samples is meaningless.
|
|
6450
|
+
* Default 6 (combined).
|
|
6451
|
+
*/
|
|
6452
|
+
minTotalSamples?: number;
|
|
6453
|
+
/** RNG seed for reproducibility. When omitted, resampling falls back to unseeded Math.random. */
|
|
6454
|
+
seed?: number;
|
|
6455
|
+
}
|
|
6456
|
+
/**
|
|
6457
|
+
* Compute the bootstrap CI on (candidateMean - baselineMean) and a verdict.
|
|
6458
|
+
*
|
|
6459
|
+
* Uses simple percentile bootstrap on the difference of resampled means.
|
|
6460
|
+
* That's the standard non-parametric primitive — no distributional
|
|
6461
|
+
* assumptions, robust to skew, easy to reason about.
|
|
6462
|
+
*/
|
|
6463
|
+
declare function bootstrapCi(baseline: number[], candidate: number[], options?: BootstrapOptions): BootstrapResult;
|
|
6464
|
+
/**
|
|
6465
|
+
* Judge-replay promotion gate.
|
|
6466
|
+
*
|
|
6467
|
+
* The cheap inner-loop judge that drives an evolution run is by definition
|
|
6468
|
+
* fast and noisy. When you're about to promote a winning variant to the
|
|
6469
|
+
* canonical default, you want a STRONGER judge (a more expensive model, a
|
|
6470
|
+
* human grader, a separately-trained reward model) to confirm the win
|
|
6471
|
+
* generalises beyond the inner loop.
|
|
6472
|
+
*
|
|
6473
|
+
* This helper takes raw winner + baseline outputs, scores both through the
|
|
6474
|
+
* stronger judge, and applies `bootstrapCi`. ADVANCE means the stronger
|
|
6475
|
+
* judge agrees the winner is real with the configured confidence. Doesn't
|
|
6476
|
+
* matter what shape your "output" is — pass a string, an object, anything
|
|
6477
|
+
* the judge can read.
|
|
6478
|
+
*/
|
|
6479
|
+
interface JudgeReplayGateArgs<TOutput> {
|
|
6480
|
+
baselineOutputs: TOutput[];
|
|
6481
|
+
candidateOutputs: TOutput[];
|
|
6482
|
+
/** Stronger judge — may be sync or async (async allows LLM calls). Return a 0..N scalar score. */
|
|
6483
|
+
judge: (output: TOutput) => Promise<number> | number;
|
|
6484
|
+
alpha?: number;
|
|
6485
|
+
iterations?: number;
|
|
6486
|
+
/** RNG seed for reproducibility. */
|
|
6487
|
+
seed?: number;
|
|
6488
|
+
/** Maximum concurrent judge calls. Default 4. */
|
|
6489
|
+
judgeConcurrency?: number;
|
|
6490
|
+
}
|
|
6491
|
+
declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): Promise<BootstrapResult & {
|
|
6492
|
+
baselineSamples: number;
|
|
6493
|
+
candidateSamples: number;
|
|
6494
|
+
}>;
|
|
6495
|
+
|
|
6496
|
+
/**
|
|
6497
|
+
* Reflective mutation — primitives for trace-conditioned prompt rewriting.
|
|
6498
|
+
*
|
|
6499
|
+
* Used by `prompt-evolution.ts` (and any consumer running iterative
|
|
6500
|
+
* improvement). Given a parent prompt + concrete trace evidence (top trials,
|
|
6501
|
+
* bottom trials, missed expectations), produce an LLM-ready prompt that
|
|
6502
|
+
* proposes targeted mutations — not blind rephrasings.
|
|
6503
|
+
*
|
|
6504
|
+
* Why this lives outside `prompt-evolution.ts`: any consumer that wants to
|
|
6505
|
+
* run reflective rewriting WITHOUT the population/Pareto machinery can
|
|
6506
|
+
* import these primitives directly.
|
|
6507
|
+
*
|
|
6508
|
+
* Quality bar (vs. naive "mutate this prompt"):
|
|
6509
|
+
* - Show parent ↔ children diff, not just one variant
|
|
6510
|
+
* - Quote specific missed goldens with their match phrases
|
|
6511
|
+
* - Surface the model's actual emitted output side-by-side with what was expected
|
|
6512
|
+
* - Quote concrete mutation primitives so the model has a vocabulary
|
|
6513
|
+
*/
|
|
6514
|
+
interface TrialTrace {
|
|
6515
|
+
/** Stable id for the trial — surfaces in the prompt for grounding. */
|
|
6516
|
+
id: string;
|
|
6517
|
+
/** Score the trial received on its primary metric. */
|
|
6518
|
+
score: number;
|
|
6519
|
+
/** Name of the input the agent was given (e.g., the fixture or scenario). */
|
|
6520
|
+
inputName?: string;
|
|
6521
|
+
/**
|
|
6522
|
+
* Goldens / expectations this trial was tested against, with whether each
|
|
6523
|
+
* was matched. The reflection prompt quotes the missed ones specifically.
|
|
6524
|
+
*/
|
|
6525
|
+
expectations?: Array<{
|
|
6526
|
+
id: string;
|
|
6527
|
+
phrase: string;
|
|
6528
|
+
matched: boolean;
|
|
6529
|
+
}>;
|
|
6530
|
+
/** Free-form text — what the agent actually emitted (e.g., findings, plan). */
|
|
6531
|
+
emitted?: string;
|
|
6532
|
+
/** Optional structured metrics (recall, precision, cost, latency). */
|
|
6533
|
+
metrics?: Record<string, number>;
|
|
6534
|
+
}
|
|
6535
|
+
interface ReflectionContext {
|
|
6536
|
+
/** What is being mutated — appears in the system prompt for orientation. */
|
|
6537
|
+
target: string;
|
|
6538
|
+
/** Current variant's payload — JSON-serialised for the prompt. */
|
|
6539
|
+
parentPayload: unknown;
|
|
6540
|
+
/** Best-performing trials this generation. */
|
|
6541
|
+
topTrials: TrialTrace[];
|
|
6542
|
+
/** Worst-performing trials this generation — the missed-golden source. */
|
|
6543
|
+
bottomTrials: TrialTrace[];
|
|
6544
|
+
/** How many children the mutator should propose. */
|
|
6545
|
+
childCount: number;
|
|
6546
|
+
/** Optional: domain-specific mutation primitives the model can pick from. */
|
|
6547
|
+
mutationPrimitives?: string[];
|
|
6548
|
+
}
|
|
6549
|
+
declare const DEFAULT_MUTATION_PRIMITIVES: string[];
|
|
6550
|
+
/**
|
|
6551
|
+
* Build the LLM-ready reflection prompt. Output is plain text — pass it as
|
|
6552
|
+
* the user message. The system message should be small and stable (e.g.
|
|
6553
|
+
* "Output ONLY a JSON object matching the schema below.").
|
|
6554
|
+
*/
|
|
6555
|
+
declare function buildReflectionPrompt(ctx: ReflectionContext): string;
|
|
6556
|
+
interface ReflectionProposal {
|
|
6557
|
+
label: string;
|
|
6558
|
+
rationale: string;
|
|
6559
|
+
payload: unknown;
|
|
6560
|
+
}
|
|
6561
|
+
/**
|
|
6562
|
+
* Parse the model's JSON response back into proposals. Tolerates markdown
|
|
6563
|
+
* fences and surrounding prose. Returns at most `maxProposals`.
|
|
6564
|
+
*/
|
|
6565
|
+
declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
|
|
6566
|
+
|
|
6567
|
+
export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type 
DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type 
HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type MutateAdapter, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type 
ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, 
type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type 
TrialCache, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, 
firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, paretoFrontierWithCrowding, parseReflectionResponse, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runPromptEvolution, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, 
signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
|