@tangle-network/agent-eval 0.7.1 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +1397 -1
- package/dist/index.js +2132 -42
- package/dist/index.js.map +1 -1
- package/package.json +9 -8
package/dist/index.d.ts
CHANGED
|
@@ -2828,6 +2828,26 @@ declare class CostTracker {
|
|
|
2828
2828
|
timestamp?: number;
|
|
2829
2829
|
}): CostEntry;
|
|
2830
2830
|
markOutcome(scenarioId: string, completed: boolean): void;
|
|
2831
|
+
/**
|
|
2832
|
+
* Convenience: record + markOutcome in one call from a
|
|
2833
|
+
* `{ usage, verdict }`-shaped response (starter-foundry's
|
|
2834
|
+
* `invokeMetaJudge` returns this shape; consumers that wrap any
|
|
2835
|
+
* judge/critic can follow the same convention).
|
|
2836
|
+
*
|
|
2837
|
+
* `usage.model` must be present in `MODEL_PRICING` for cost math to
|
|
2838
|
+
* populate; otherwise totalCostUsd stays at 0 for the entry but
|
|
2839
|
+
* tokens still aggregate.
|
|
2840
|
+
*/
|
|
2841
|
+
recordVerdict(verdict: {
|
|
2842
|
+
usage?: {
|
|
2843
|
+
inputTokens: number;
|
|
2844
|
+
outputTokens: number;
|
|
2845
|
+
model: string;
|
|
2846
|
+
cachedTokens?: number;
|
|
2847
|
+
reasoningTokens?: number;
|
|
2848
|
+
};
|
|
2849
|
+
verdict?: 'pass' | 'fail' | 'borderline' | string;
|
|
2850
|
+
}, scenarioId: string, tags?: Record<string, string>): CostEntry | null;
|
|
2831
2851
|
get(scenarioId: string): ScenarioCost | undefined;
|
|
2832
2852
|
list(): ScenarioCost[];
|
|
2833
2853
|
summary(): CostSummary;
|
|
@@ -2843,6 +2863,108 @@ interface CostSummary {
|
|
|
2843
2863
|
costPerCompletedTaskUsd: number | null;
|
|
2844
2864
|
}
|
|
2845
2865
|
|
|
2866
|
+
/**
|
|
2867
|
+
* muffled-gate-scanner — test helper that greps consumer source for
|
|
2868
|
+
* gate + measurement anti-patterns and fails with file:line locations.
|
|
2869
|
+
*
|
|
2870
|
+
* Named pattern lives at starter-foundry's `.evolve/patterns/muffled-gate.md`;
|
|
2871
|
+
* same shape applies to every consumer (a gate that should fail loud
|
|
2872
|
+
* returns silent success; a metric that should emit a real number
|
|
2873
|
+
* reports noise/empty).
|
|
2874
|
+
*
|
|
2875
|
+
* Usage (in a consumer project's test file):
|
|
2876
|
+
*
|
|
2877
|
+
* import { scanForMuffledGates, formatFindings, DEFAULT_FINDERS } from '@tangle-network/agent-eval'
|
|
2878
|
+
*
|
|
2879
|
+
* test('no muffled gates in eval surface', () => {
|
|
2880
|
+
* const findings = scanForMuffledGates({
|
|
2881
|
+
* repoRoot: process.cwd(),
|
|
2882
|
+
* scanFiles: ['src/eval/scaffold.ts', 'scripts/promote.mjs'],
|
|
2883
|
+
* finders: DEFAULT_FINDERS,
|
|
2884
|
+
* })
|
|
2885
|
+
* if (findings.length) assert.fail(formatFindings(findings))
|
|
2886
|
+
* })
|
|
2887
|
+
*
|
|
2888
|
+
* Customize by passing your own `finders` — each finder is
|
|
2889
|
+
* `(file, text) => Finding[]` and runs per-file.
|
|
2890
|
+
*
|
|
2891
|
+
* Escape hatch: any line containing `muffle-ok:` is excluded from all
|
|
2892
|
+
* finders, letting consumers opt a legitimate fallback out explicitly.
|
|
2893
|
+
*/
|
|
2894
|
+
interface MuffledFinding {
|
|
2895
|
+
file: string;
|
|
2896
|
+
line: number;
|
|
2897
|
+
lineText: string;
|
|
2898
|
+
pattern: string;
|
|
2899
|
+
}
|
|
2900
|
+
type MuffledFinder = (file: string, text: string) => MuffledFinding[];
|
|
2901
|
+
interface ScanOptions {
|
|
2902
|
+
/** Absolute path to the repo root. */
|
|
2903
|
+
repoRoot: string;
|
|
2904
|
+
/** Explicit file list (paths relative to repoRoot) for context-specific finders. */
|
|
2905
|
+
scanFiles: string[];
|
|
2906
|
+
/**
|
|
2907
|
+
* Auto-derived scan: walk `roots` for files matching `extensions` that contain the
|
|
2908
|
+
* string `importsContain` and run the universal finders on them. Pattern
|
|
2909
|
+
* from starter-foundry H4 (research/decisions/001) — catches new files
|
|
2910
|
+
* with agent-eval import that would otherwise escape context-specific
|
|
2911
|
+
* scan lists.
|
|
2912
|
+
*/
|
|
2913
|
+
autoDerive?: {
|
|
2914
|
+
roots: string[];
|
|
2915
|
+
extensions: RegExp;
|
|
2916
|
+
importsContain: string;
|
|
2917
|
+
universalFinders: MuffledFinder[];
|
|
2918
|
+
};
|
|
2919
|
+
/** Per-file finders (context-specific patterns). */
|
|
2920
|
+
finders: MuffledFinder[];
|
|
2921
|
+
}
|
|
2922
|
+
/**
|
|
2923
|
+
* Default finder: `command || true` in a testCommand/setupCommand/cmd/command
|
|
2924
|
+
* string. Swallows exit codes.
|
|
2925
|
+
*/
|
|
2926
|
+
declare const findFallbackToPass: MuffledFinder;
|
|
2927
|
+
/**
|
|
2928
|
+
* `testCommand: 'true'` literal silent-pass — an unknown-language dispatch
|
|
2929
|
+
* arm that returns a no-op instead of throwing.
|
|
2930
|
+
*/
|
|
2931
|
+
declare const findLiteralTruePass: MuffledFinder;
|
|
2932
|
+
/**
|
|
2933
|
+
* `new SubprocessSandboxDriver({ cwd: ... })` — constructor arg silently
|
|
2934
|
+
* dropped in agent-eval <0.7.1. 0.7.1+ honors as fallback, but the form
|
|
2935
|
+
* still invites confusion; prefer `new SubprocessSandboxDriver()` with
|
|
2936
|
+
* cwd in the per-call HarnessConfig.
|
|
2937
|
+
*/
|
|
2938
|
+
declare const findConstructorCwdDropped: MuffledFinder;
|
|
2939
|
+
/**
|
|
2940
|
+
* `if (!expected) return true` — matcher auto-passes when ground truth is
|
|
2941
|
+
* absent. Inflates accuracy metrics for scenarios without expectations.
|
|
2942
|
+
*/
|
|
2943
|
+
declare const findAutoMatchNoExpectation: MuffledFinder;
|
|
2944
|
+
/**
|
|
2945
|
+
* `if (p.skipped) return true` — skip-counts-as-pass in quality scorers.
|
|
2946
|
+
* Use three-valued `true | false | 'skipped'` return + explicit partial
|
|
2947
|
+
* credit instead.
|
|
2948
|
+
*/
|
|
2949
|
+
declare const findSkipCountsAsPass: MuffledFinder;
|
|
2950
|
+
/**
|
|
2951
|
+
* The canonical default bundle. Callers can import these individually,
|
|
2952
|
+
* replace them, or append custom finders for project-specific patterns.
|
|
2953
|
+
*/
|
|
2954
|
+
declare const DEFAULT_FINDERS: MuffledFinder[];
|
|
2955
|
+
/** Finders that should run on EVERY file with the target import, not just the files listed in `scanFiles`. */
|
|
2956
|
+
declare const UNIVERSAL_FINDERS: MuffledFinder[];
|
|
2957
|
+
/**
|
|
2958
|
+
* Run all finders against the configured files. Returns a flat list of
|
|
2959
|
+
* findings. Callers format + assert as they prefer.
|
|
2960
|
+
*/
|
|
2961
|
+
declare function scanForMuffledGates(opts: ScanOptions): MuffledFinding[];
|
|
2962
|
+
/**
|
|
2963
|
+
* Format findings into a single assert.fail-ready message. Each finding
|
|
2964
|
+
* carries file:line + pattern name + the offending line.
|
|
2965
|
+
*/
|
|
2966
|
+
declare function formatFindings(findings: MuffledFinding[]): string;
|
|
2967
|
+
|
|
2846
2968
|
/**
|
|
2847
2969
|
* Series convergence — detects whether a sequence of scalar measurements
|
|
2848
2970
|
* is stabilizing, drifting, or noisy.
|
|
@@ -4674,4 +4796,1278 @@ interface UseCaseSignals {
|
|
|
4674
4796
|
declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
|
|
4675
4797
|
declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
|
|
4676
4798
|
|
|
4677
|
-
export { type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type 
DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmJsonCall, type LlmReviewerConfig, type LlmSpan, MODEL_PRICING, type MatcherResult, type MeasurementPolicy, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, 
type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type 
SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type Verification, type VerifyFn, type VisualDiffOptions, type VisualDiffResult, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, 
collectionPreserved, commitBisect, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, createLlmReviewer, crossTraceDiff, defaultJudges, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, failureClusterView, fileContains, fileExists, firstDivergenceView, formatBenchmarkReport, formatDriverReport, groupBy, hashContent, hashScenarios, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReviewStore, judgeAgreementView, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator, mannWhitneyU, mergeSteeringBundle, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runJudgeFleet, runProposeReview, runSelfPlay, runTestGradedScenario, runsForScenario, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, 
statusAdvanced, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
|
|
4799
|
+
/**
|
|
4800
|
+
* LLM client with graceful degrade.
|
|
4801
|
+
*
|
|
4802
|
+
* OpenAI-compatible `/v1/chat/completions` client with:
|
|
4803
|
+
* - Exponential-backoff retry on 429 + 5xx gateway errors (502/503/504).
|
|
4804
|
+
* - Retry on transient network errors (fetch failed, AbortError, ECONNRESET).
|
|
4805
|
+
* - Graceful json_schema → json_object degrade on 400 with schema-reject body.
|
|
4806
|
+
* - Fenced-JSON stripping (```json ... ```) for models that wrap structured output.
|
|
4807
|
+
* - Configurable base URL + api key / bearer, works with LiteLLM proxies, OpenAI
|
|
4808
|
+
* directly, cli-bridge subscriptions, and any router that speaks the spec.
|
|
4809
|
+
*
|
|
4810
|
+
* Usage:
|
|
4811
|
+
* const { value, result } = await callLlmJson<MyType>(
|
|
4812
|
+
* { model: 'gpt-4o', messages: [...], jsonSchema: { name: 'x', schema: {...} } },
|
|
4813
|
+
* { baseUrl: 'https://router.tangle.tools/v1', apiKey: process.env.KEY },
|
|
4814
|
+
* )
|
|
4815
|
+
*
|
|
4816
|
+
* This is THE llm-calling seam for agent-eval primitives that need structured
|
|
4817
|
+
* output (semantic concept judge, reviewer directives, critic scores). Primitives
|
|
4818
|
+
* that need free-form text use `callLlm` and parse output themselves.
|
|
4819
|
+
*/
|
|
4820
|
+
interface LlmMessage {
|
|
4821
|
+
role: 'system' | 'user' | 'assistant';
|
|
4822
|
+
/**
|
|
4823
|
+
* Either a plain text content string OR a multimodal content array
|
|
4824
|
+
* (text + image_url parts) for vision-capable models.
|
|
4825
|
+
*/
|
|
4826
|
+
content: string | Array<{
|
|
4827
|
+
type: 'text';
|
|
4828
|
+
text: string;
|
|
4829
|
+
} | {
|
|
4830
|
+
type: 'image_url';
|
|
4831
|
+
image_url: {
|
|
4832
|
+
url: string;
|
|
4833
|
+
detail?: 'auto' | 'low' | 'high';
|
|
4834
|
+
};
|
|
4835
|
+
}>;
|
|
4836
|
+
}
|
|
4837
|
+
interface LlmCallRequest {
|
|
4838
|
+
model: string;
|
|
4839
|
+
messages: LlmMessage[];
|
|
4840
|
+
/** Optional JSON-mode response format (response_format: json_object). */
|
|
4841
|
+
jsonMode?: boolean;
|
|
4842
|
+
/** Optional structured output via JSON Schema. Falls back to json_object on 400. */
|
|
4843
|
+
jsonSchema?: {
|
|
4844
|
+
name: string;
|
|
4845
|
+
schema: Record<string, unknown>;
|
|
4846
|
+
};
|
|
4847
|
+
temperature?: number;
|
|
4848
|
+
maxTokens?: number;
|
|
4849
|
+
/** Per-call timeout, default 60s. */
|
|
4850
|
+
timeoutMs?: number;
|
|
4851
|
+
}
|
|
4852
|
+
interface LlmUsage {
|
|
4853
|
+
promptTokens: number;
|
|
4854
|
+
completionTokens: number;
|
|
4855
|
+
totalTokens: number;
|
|
4856
|
+
/** Proxies populate this when prompt caching is on. */
|
|
4857
|
+
cachedPromptTokens?: number;
|
|
4858
|
+
}
|
|
4859
|
+
interface LlmCallResult {
|
|
4860
|
+
/** The text content of the first choice. Empty string if none. */
|
|
4861
|
+
content: string;
|
|
4862
|
+
usage: LlmUsage;
|
|
4863
|
+
/**
|
|
4864
|
+
* Cost in USD. Pulled from proxy's `_response_cost` field when present;
|
|
4865
|
+
* `null` when neither the proxy nor the caller can derive it.
|
|
4866
|
+
*/
|
|
4867
|
+
costUsd: number | null;
|
|
4868
|
+
/** Model name actually used (echoed from response). */
|
|
4869
|
+
model: string;
|
|
4870
|
+
/** Wall-clock duration of the HTTP call (last attempt, if retried). */
|
|
4871
|
+
durationMs: number;
|
|
4872
|
+
/** Raw response body. */
|
|
4873
|
+
raw: Record<string, unknown>;
|
|
4874
|
+
}
|
|
4875
|
+
declare class LlmCallError extends Error {
|
|
4876
|
+
readonly status: number;
|
|
4877
|
+
readonly body: string;
|
|
4878
|
+
readonly model: string;
|
|
4879
|
+
constructor(message: string, status: number, body: string, model: string);
|
|
4880
|
+
}
|
|
4881
|
+
interface LlmClientOptions {
|
|
4882
|
+
/** Base URL (without trailing slash). Must end at the `/v1` prefix. */
|
|
4883
|
+
baseUrl?: string;
|
|
4884
|
+
/** Bearer token — either `apiKey` or `bearer` populates `Authorization: Bearer ...`. */
|
|
4885
|
+
apiKey?: string;
|
|
4886
|
+
bearer?: string;
|
|
4887
|
+
/** Override for the `Authorization` header (e.g. `X-Auth: ...`). Takes precedence over apiKey/bearer. */
|
|
4888
|
+
authHeader?: {
|
|
4889
|
+
name: string;
|
|
4890
|
+
value: string;
|
|
4891
|
+
};
|
|
4892
|
+
/** Default timeout in ms. Per-call can override. */
|
|
4893
|
+
defaultTimeoutMs?: number;
|
|
4894
|
+
/** Max total attempts on retriable errors, counting the initial call. Default 3 (1 initial + 2 retries). */
|
|
4895
|
+
maxRetries?: number;
|
|
4896
|
+
/** Fetch implementation — defaults to global `fetch`. Override for custom transport (e.g. tests). */
|
|
4897
|
+
fetch?: typeof fetch;
|
|
4898
|
+
}
|
|
4899
|
+
/**
|
|
4900
|
+
* Strip a ```json / ``` code fence if the model emitted one.
|
|
4901
|
+
* Idempotent for naked JSON. Some models (claude-code via router, certain
|
|
4902
|
+
* deepseek models) wrap output even under json_object.
|
|
4903
|
+
*/
|
|
4904
|
+
declare function stripFencedJson(raw: string): string;
|
|
4905
|
+
/**
|
|
4906
|
+
* Low-level call. Returns raw content + usage + cost. Retries on transient
|
|
4907
|
+
* failures; does NOT degrade schema here — callers that want graceful
|
|
4908
|
+
* degrade use `callLlmJson`.
|
|
4909
|
+
*/
|
|
4910
|
+
declare function callLlm(req: LlmCallRequest, opts?: LlmClientOptions): Promise<LlmCallResult>;
|
|
4911
|
+
/**
|
|
4912
|
+
* Structured-output call. Returns parsed JSON plus the raw result envelope.
|
|
4913
|
+
* Degrades `jsonSchema` → `jsonMode` on a 400 that names the schema param —
|
|
4914
|
+
* critical for deepseek-v3/v4, kimi-k2.6, and other models that don't accept
|
|
4915
|
+
* the `response_format.json_schema` shape but DO accept `json_object`.
|
|
4916
|
+
*/
|
|
4917
|
+
declare function callLlmJson<T = unknown>(req: LlmCallRequest, opts?: LlmClientOptions): Promise<{
|
|
4918
|
+
value: T;
|
|
4919
|
+
result: LlmCallResult;
|
|
4920
|
+
}>;
|
|
4921
|
+
/**
|
|
4922
|
+
* Probe whether a model is reachable. Returns latency + null error on
|
|
4923
|
+
* success; `ok=false` + error message on any failure (HTTP, timeout,
|
|
4924
|
+
* network, parse). Designed for sweep preflights — fail loud at the
|
|
4925
|
+
* boundary before burning a 30-leaf run on a misconfigured router.
|
|
4926
|
+
*
|
|
4927
|
+
* Sends a tiny `ping` message with `maxTokens=64`. Reasoning models
|
|
4928
|
+
* (glm-5.1, deepseek-v4) can burn the entire budget on internal reasoning
|
|
4929
|
+
* for short prompts, so don't tighten this further. We don't validate
|
|
4930
|
+
* content; HTTP 200 means reachable.
|
|
4931
|
+
*/
|
|
4932
|
+
declare function probeLlm(model: string, opts?: LlmClientOptions & {
|
|
4933
|
+
timeoutMs?: number;
|
|
4934
|
+
}): Promise<{
|
|
4935
|
+
ok: boolean;
|
|
4936
|
+
latencyMs: number;
|
|
4937
|
+
error: string | null;
|
|
4938
|
+
}>;
|
|
4939
|
+
/**
|
|
4940
|
+
* Stateful client — construct once with defaults, call many times.
|
|
4941
|
+
* Thin wrapper around the free functions; exists for callers that want
|
|
4942
|
+
* to inject a single configured instance into multiple primitives.
|
|
4943
|
+
*/
|
|
4944
|
+
declare class LlmClient {
|
|
4945
|
+
private readonly opts;
|
|
4946
|
+
constructor(opts?: LlmClientOptions);
|
|
4947
|
+
call(req: LlmCallRequest, per?: LlmClientOptions): Promise<LlmCallResult>;
|
|
4948
|
+
callJson<T = unknown>(req: LlmCallRequest, per?: LlmClientOptions): Promise<{
|
|
4949
|
+
value: T;
|
|
4950
|
+
result: LlmCallResult;
|
|
4951
|
+
}>;
|
|
4952
|
+
}
|
|
4953
|
+
|
|
4954
|
+
/**
|
|
4955
|
+
* Multi-layer verifier — ordered pipeline of verification layers.
|
|
4956
|
+
*
|
|
4957
|
+
* Different contract from {@link JudgeRunner} (which runs parallel
|
|
4958
|
+
* specs against a sandbox). MultiLayerVerifier is a DAG of layers
|
|
4959
|
+
* (install → typecheck → build → lint → serve → semantic → …) with
|
|
4960
|
+
* dependency-based skip, per-layer findings, soft-fail semantics, and
|
|
4961
|
+
* an aggregated `blendedScore` across all passed layers.
|
|
4962
|
+
*
|
|
4963
|
+
* Use when you want:
|
|
4964
|
+
* - ordered stages where a failing upstream stage skips downstream ones
|
|
4965
|
+
* - each stage produces rich `findings` (severity + message + evidence)
|
|
4966
|
+
* - a single composite score across stages with per-stage weights
|
|
4967
|
+
* - soft-fail stages whose failure doesn't abort the pipeline
|
|
4968
|
+
*
|
|
4969
|
+
* Use {@link JudgeRunner} when you want:
|
|
4970
|
+
* - N independent judges running in parallel against the same artifact
|
|
4971
|
+
* - no inter-judge dependencies
|
|
4972
|
+
* - boolean `passed` per judge + overall
|
|
4973
|
+
*
|
|
4974
|
+
* Both primitives compose — JudgeRunner can be invoked as a single
|
|
4975
|
+
* layer inside a MultiLayerVerifier if that suits the caller.
|
|
4976
|
+
*/
|
|
4977
|
+
type LayerStatus = 'pass' | 'fail' | 'skipped' | 'error' | 'timeout';
|
|
4978
|
+
type Severity = 'critical' | 'major' | 'minor' | 'info';
|
|
4979
|
+
interface Finding {
|
|
4980
|
+
severity: Severity;
|
|
4981
|
+
message: string;
|
|
4982
|
+
evidence?: string;
|
|
4983
|
+
/** Optional layer name the finding belongs to (set by the verifier if omitted). */
|
|
4984
|
+
layer?: string;
|
|
4985
|
+
/**
|
|
4986
|
+
* Free-form structured payload — used by `multiToolchainLayer` to attach
|
|
4987
|
+
* `{ adapter: 'pnpm' }`, by judges to attach evidence pointers, etc.
|
|
4988
|
+
* Renderers MAY interrogate; agent-eval primitives never assume shape.
|
|
4989
|
+
*/
|
|
4990
|
+
detail?: Record<string, unknown>;
|
|
4991
|
+
}
|
|
4992
|
+
interface LayerResult {
|
|
4993
|
+
layer: string;
|
|
4994
|
+
status: LayerStatus;
|
|
4995
|
+
/** 0..1 score, optional — layers that don't produce a numeric score omit. */
|
|
4996
|
+
score?: number;
|
|
4997
|
+
durationMs: number;
|
|
4998
|
+
findings: Finding[];
|
|
4999
|
+
/** Short human-readable summary (one line). */
|
|
5000
|
+
reason?: string;
|
|
5001
|
+
/**
|
|
5002
|
+
* Numeric layer-level diagnostics: error counts, warning counts,
|
|
5003
|
+
* cyclomatic complexity, total adapter wall-time, etc. Keyed by
|
|
5004
|
+
* diagnostic name; null = "diagnostic not applicable / not measured."
|
|
5005
|
+
* Renderers that know the keys can display them; ones that don't,
|
|
5006
|
+
* ignore. Free-form on purpose — consumers type the value shape in
|
|
5007
|
+
* their own namespace. Added in 0.10.
|
|
5008
|
+
*/
|
|
5009
|
+
diagnostics?: Record<string, number | null>;
|
|
5010
|
+
/** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
|
|
5011
|
+
detail?: Record<string, unknown>;
|
|
5012
|
+
}
|
|
5013
|
+
interface VerifyContext<Env = unknown> {
|
|
5014
|
+
/** Per-run opaque context the caller provides. Layers destructure what they need. */
|
|
5015
|
+
env: Env;
|
|
5016
|
+
/** Previously-computed results from layers that already ran. */
|
|
5017
|
+
prior: Record<string, LayerResult>;
|
|
5018
|
+
/** Signal — if aborted, layers MUST bail within a reasonable wall-clock time. */
|
|
5019
|
+
signal: AbortSignal;
|
|
5020
|
+
}
|
|
5021
|
+
interface Layer<Env = unknown> {
|
|
5022
|
+
name: string;
|
|
5023
|
+
/** Stages that must have `status: 'pass'` before this layer runs. */
|
|
5024
|
+
dependsOn?: string[];
|
|
5025
|
+
/**
|
|
5026
|
+
* Weight in the composite `blendedScore`. Default 1.0. Layers with weight 0
|
|
5027
|
+
* contribute findings but not score.
|
|
5028
|
+
*/
|
|
5029
|
+
weight?: number;
|
|
5030
|
+
/**
|
|
5031
|
+
* If true, a `fail` status contributes to `blendedScore` (as 0) instead of
|
|
5032
|
+
* being dropped — use for layers whose failure is a real signal. Default:
|
|
5033
|
+
* fail drops from numerator + denominator, matching VB's existing semantics.
|
|
5034
|
+
*/
|
|
5035
|
+
failContributesToScore?: boolean;
|
|
5036
|
+
/** Optional per-layer wall-cap in ms. Honored by the verifier (AbortSignal). */
|
|
5037
|
+
capMs?: number;
|
|
5038
|
+
run: (ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
|
|
5039
|
+
}
|
|
5040
|
+
interface VerifyOptions<Env = unknown> {
|
|
5041
|
+
env: Env;
|
|
5042
|
+
/**
|
|
5043
|
+
* Overall wall cap. Default: sum of layer capMs, or Infinity if any layer
|
|
5044
|
+
* omits a cap. The verifier short-circuits remaining layers on overall cap.
|
|
5045
|
+
*/
|
|
5046
|
+
overallCapMs?: number;
|
|
5047
|
+
/** Called with each layer result as it completes. */
|
|
5048
|
+
onLayer?: (result: LayerResult) => void;
|
|
5049
|
+
}
|
|
5050
|
+
interface VerificationReport {
|
|
5051
|
+
layers: LayerResult[];
|
|
5052
|
+
passCount: number;
|
|
5053
|
+
failCount: number;
|
|
5054
|
+
skippedCount: number;
|
|
5055
|
+
errorCount: number;
|
|
5056
|
+
/** True iff at least one scored layer ran AND every scored layer passed. */
|
|
5057
|
+
allPass: boolean;
|
|
5058
|
+
/**
|
|
5059
|
+
* Weighted mean of `score` across contributing layers. 0 when no layers
|
|
5060
|
+
* contributed. See {@link Layer.failContributesToScore} for fail semantics.
|
|
5061
|
+
*/
|
|
5062
|
+
blendedScore: number;
|
|
5063
|
+
durationMs: number;
|
|
5064
|
+
startedAt: string;
|
|
5065
|
+
finishedAt: string;
|
|
5066
|
+
}
|
|
5067
|
+
/**
|
|
5068
|
+
* Grade a semantic-concept-style judge result into a single layer status.
|
|
5069
|
+
*
|
|
5070
|
+
* Pass when overall score >= threshold AND no critical-severity concept gap.
|
|
5071
|
+
* Fail otherwise. Use inside a `Layer.run` when wrapping a concept judge.
|
|
5072
|
+
*
|
|
5073
|
+
* Generalized from VerticalBench H3 fix: `failingConcepts.length === 0` was
|
|
5074
|
+
* too strict — a single concept at 6/10 failed the entire layer despite
|
|
5075
|
+
* overall score being >= 0.7. Now we trust the judge's own `severity` field:
|
|
5076
|
+
* `critical` findings veto; `major`/`minor` reduce the score but don't veto.
|
|
5077
|
+
*/
|
|
5078
|
+
declare function gradeSemanticStatus(input: {
|
|
5079
|
+
score: number;
|
|
5080
|
+
findings: Array<{
|
|
5081
|
+
severity: Severity;
|
|
5082
|
+
present?: boolean;
|
|
5083
|
+
score?: number;
|
|
5084
|
+
}>;
|
|
5085
|
+
available: boolean;
|
|
5086
|
+
threshold?: number;
|
|
5087
|
+
}): LayerStatus;
|
|
5088
|
+
declare class MultiLayerVerifier<Env = unknown> {
|
|
5089
|
+
private readonly layers;
|
|
5090
|
+
constructor(layers: Layer<Env>[]);
|
|
5091
|
+
run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
|
|
5092
|
+
}
|
|
5093
|
+
|
|
5094
|
+
/**
|
|
5095
|
+
* CommandRunner — abstract subprocess execution surface.
|
|
5096
|
+
*
|
|
5097
|
+
* Layers in a {@link MultiLayerVerifier} that need to invoke external
|
|
5098
|
+
* tools (compilers, test runners, package managers) call out via this
|
|
5099
|
+
* interface rather than directly using `child_process`. Two reasons:
|
|
5100
|
+
*
|
|
5101
|
+
* 1. **Sandbox interchangeability.** A run that targets a sandbox box
|
|
5102
|
+
* (via SDK-specific Box.exec) and a run that targets the host both
|
|
5103
|
+
* satisfy this same contract. The harness doesn't care which.
|
|
5104
|
+
* 2. **Testability.** Tests inject a fake runner and assert on calls
|
|
5105
|
+
* without spawning real subprocesses.
|
|
5106
|
+
*
|
|
5107
|
+
* agent-eval ships only the local implementation (host-process). Sandbox
|
|
5108
|
+
* implementations live with their consumer because they depend on
|
|
5109
|
+
* SDK-specific Box / Sandbox types that don't belong in this package.
|
|
5110
|
+
*/
|
|
5111
|
+
interface RunCommandInput {
|
|
5112
|
+
/** Executable name, looked up via PATH unless absolute. */
|
|
5113
|
+
cmd: string;
|
|
5114
|
+
/** Argument vector, NOT shell-interpolated. Each element passed to argv. */
|
|
5115
|
+
argv: string[];
|
|
5116
|
+
/** Working directory. Defaults to runner's notion of cwd if omitted. */
|
|
5117
|
+
cwd?: string;
|
|
5118
|
+
/**
|
|
5119
|
+
* Wall-clock cap in ms. The runner SHOULD return `timedOut: true` when
|
|
5120
|
+
* exceeded; callers MAY treat status null + timedOut as "killed."
|
|
5121
|
+
*/
|
|
5122
|
+
capMs?: number;
|
|
5123
|
+
/** Env overrides merged on top of the runner's base environment. */
|
|
5124
|
+
env?: Record<string, string>;
|
|
5125
|
+
/** Optional stdin payload. */
|
|
5126
|
+
stdin?: string;
|
|
5127
|
+
}
|
|
5128
|
+
interface RunCommandResult {
|
|
5129
|
+
/** Exit code, or null when the process couldn't start / was killed. */
|
|
5130
|
+
status: number | null;
|
|
5131
|
+
stdout: string;
|
|
5132
|
+
stderr: string;
|
|
5133
|
+
durationMs: number;
|
|
5134
|
+
timedOut: boolean;
|
|
5135
|
+
/** Non-fatal runner-side error (binary missing, signal, etc.). */
|
|
5136
|
+
runnerError?: string;
|
|
5137
|
+
}
|
|
5138
|
+
interface DirEntry {
|
|
5139
|
+
name: string;
|
|
5140
|
+
isDirectory: boolean;
|
|
5141
|
+
isFile: boolean;
|
|
5142
|
+
/** File size in bytes. `null` for directories (not stat'd). */
|
|
5143
|
+
sizeBytes: number | null;
|
|
5144
|
+
}
|
|
5145
|
+
interface CommandRunner {
|
|
5146
|
+
/** Identifier for telemetry + logs. Typed as a plain `string` so new runners can choose any name. */
|
|
5147
|
+
readonly name: string;
|
|
5148
|
+
/** Execute a command in the runner's environment. */
|
|
5149
|
+
run(input: RunCommandInput): Promise<RunCommandResult>;
|
|
5150
|
+
/** True iff `<name>` resolves on the runner's PATH. */
|
|
5151
|
+
hasBin(name: string): Promise<boolean>;
|
|
5152
|
+
/** True iff the given path exists in the runner's filesystem. */
|
|
5153
|
+
fileExists(path: string): Promise<boolean>;
|
|
5154
|
+
/** Read a file. Returns `null` if missing or unreadable. */
|
|
5155
|
+
readFile(path: string): Promise<string | null>;
|
|
5156
|
+
/** List a directory. Returns `[]` if unreadable / missing. */
|
|
5157
|
+
readDir(path: string): Promise<DirEntry[]>;
|
|
5158
|
+
}
|
|
5159
|
+
/**
|
|
5160
|
+
* Host-process runner. Uses node:child_process spawnSync (synchronous
|
|
5161
|
+
* under the hood — wrapped in a Promise to satisfy the interface). For
|
|
5162
|
+
* very long-running commands consider an async-spawn variant; this
|
|
5163
|
+
* shape matches VB's existing behavior and is fine for build/test/lint
|
|
5164
|
+
* subprocesses that finish in seconds-to-minutes.
|
|
5165
|
+
*/
|
|
5166
|
+
declare const localCommandRunner: CommandRunner;
|
|
5167
|
+
|
|
5168
|
+
/**
|
|
5169
|
+
* Multi-toolchain layer factory + merge helper.
|
|
5170
|
+
*
|
|
5171
|
+
* Some verification stages (install, typecheck, build, lint) run the
|
|
5172
|
+
* SAME logical layer across multiple parallel adapters — pnpm AND npm
|
|
5173
|
+
* AND cargo AND forge for a polyglot scaffold. The verifier presents
|
|
5174
|
+
* one row per stage; the toolchain breakdown lives in `findings.detail`.
|
|
5175
|
+
*
|
|
5176
|
+
* This module provides the merge: take N independent `LayerResult`s
|
|
5177
|
+
* (one per adapter) and reduce them to a single `LayerResult` whose
|
|
5178
|
+
* status is the worst of the parts and whose findings cite the adapter
|
|
5179
|
+
* that produced each one. Plus a {@link multiToolchainLayer} factory
|
|
5180
|
+
* that runs the adapter calls in parallel + applies the reducer.
|
|
5181
|
+
*
|
|
5182
|
+
* Pure utility — composes with {@link MultiLayerVerifier.run}.
|
|
5183
|
+
*/
|
|
5184
|
+
|
|
5185
|
+
interface AdapterRun {
|
|
5186
|
+
/** Identifier for the adapter (e.g. 'pnpm', 'npm', 'cargo', 'forge'). */
|
|
5187
|
+
adapter: string;
|
|
5188
|
+
result: LayerResult;
|
|
5189
|
+
}
|
|
5190
|
+
interface MergeOptions {
|
|
5191
|
+
/**
|
|
5192
|
+
* How to combine per-adapter `durationMs`. Default `'max'` (parallel
|
|
5193
|
+
* wall-clock). Set `'sum'` when reporting total work done across
|
|
5194
|
+
* adapters rather than wall time.
|
|
5195
|
+
*/
|
|
5196
|
+
mergeDuration?: 'max' | 'sum';
|
|
5197
|
+
/**
|
|
5198
|
+
* Prefix finding messages with a per-adapter tag (e.g. `[pnpm] typecheck failed`).
|
|
5199
|
+
* Default: no prefix (renderers read `detail.adapter` instead).
|
|
5200
|
+
*/
|
|
5201
|
+
messagePrefixer?: (adapter: string) => string;
|
|
5202
|
+
/**
|
|
5203
|
+
* How to reduce per-adapter `LayerResult.diagnostics` into the merged
|
|
5204
|
+
* result's diagnostics. `'max'` (default) — for each key, merged =
|
|
5205
|
+
* max across adapters where value is non-null (matches "if ANY adapter
|
|
5206
|
+
* saw N errors, merged saw N"). `'sum'` — sum non-null values.
|
|
5207
|
+
*/
|
|
5208
|
+
mergeDiagnostics?: 'max' | 'sum';
|
|
5209
|
+
}
|
|
5210
|
+
/**
|
|
5211
|
+
* Reduce N adapter runs to a single `LayerResult` for a logical layer.
|
|
5212
|
+
*
|
|
5213
|
+
* - status: worst of the parts (pass < skipped < fail < timeout < error)
|
|
5214
|
+
* - score: weighted mean of numeric scores (skip = no contribution)
|
|
5215
|
+
* - findings: union, each tagged with `detail.adapter`
|
|
5216
|
+
* - durationMs: `mergeDuration` option (default 'max' for parallel wall-clock)
|
|
5217
|
+
* - diagnostics: `mergeDiagnostics` option (default 'max' per key)
|
|
5218
|
+
* - reason: " · "-joined `name: status` per adapter
|
|
5219
|
+
*/
|
|
5220
|
+
declare function mergeLayerResults(name: string, perAdapter: AdapterRun[], options?: MergeOptions): LayerResult;
|
|
5221
|
+
interface MultiToolchainLayerConfig<Env, Adapter> {
|
|
5222
|
+
name: string;
|
|
5223
|
+
adapters: ReadonlyArray<Adapter>;
|
|
5224
|
+
/** Adapter identifier — used in findings + reason. */
|
|
5225
|
+
adapterName: (a: Adapter) => string;
|
|
5226
|
+
/** Run a single adapter against the verify context. */
|
|
5227
|
+
run: (a: Adapter, ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
|
|
5228
|
+
dependsOn?: string[];
|
|
5229
|
+
weight?: number;
|
|
5230
|
+
failContributesToScore?: boolean;
|
|
5231
|
+
capMs?: number;
|
|
5232
|
+
/**
|
|
5233
|
+
* Per-adapter parallel cap. Defaults to 8 — defense in depth against a
|
|
5234
|
+
* caller passing 50 adapters and fanning out 50 simultaneous subprocesses.
|
|
5235
|
+
* Callers that need higher concurrency raise this explicitly.
|
|
5236
|
+
*/
|
|
5237
|
+
maxParallel?: number;
|
|
5238
|
+
}
|
|
5239
|
+
/**
|
|
5240
|
+
* Build a {@link Layer} that fans the same logical stage across N adapters
|
|
5241
|
+
* in parallel and merges via {@link mergeLayerResults}.
|
|
5242
|
+
*
|
|
5243
|
+
* Per-adapter throws are caught + converted to `status: 'error'` results
|
|
5244
|
+
* so one bad adapter doesn't poison the whole layer.
|
|
5245
|
+
*/
|
|
5246
|
+
declare function multiToolchainLayer<Env, Adapter>(config: MultiToolchainLayerConfig<Env, Adapter>): Layer<Env>;
|
|
5247
|
+
|
|
5248
|
+
/**
|
|
5249
|
+
* Reviewer primitives — prompt builder + default ReviewFn factory.
|
|
5250
|
+
*
|
|
5251
|
+
* `buildReviewerPrompt` is the pure, LLM-agnostic piece: takes
|
|
5252
|
+
* `ReviewerPromptInput` (user request, trace summary, verification
|
|
5253
|
+
* summary, memory, optional extra context) and emits the system +
|
|
5254
|
+
* user message pair. No LLM dependency — callers that want to drive
|
|
5255
|
+
* their own transport get full control.
|
|
5256
|
+
*
|
|
5257
|
+
* `createDefaultReviewer` is the convenience factory: wires the prompt
|
|
5258
|
+
* builder to `callLlmJson` with a default schema + soft-fail policy.
|
|
5259
|
+
* Returns a function that maps `ReviewerPromptInput` to `ReviewerOutput`.
|
|
5260
|
+
*
|
|
5261
|
+
* Same pattern as `runSemanticConceptJudge` / `createSemanticConceptJudge`:
|
|
5262
|
+
* low-level pure builder + high-level factory built on top.
|
|
5263
|
+
*/
|
|
5264
|
+
|
|
5265
|
+
interface ReviewerMemoryEntry {
|
|
5266
|
+
shot: number;
|
|
5267
|
+
ts?: string;
|
|
5268
|
+
observations?: string;
|
|
5269
|
+
diagnosis?: string;
|
|
5270
|
+
nextShotInstruction?: string;
|
|
5271
|
+
shouldContinue?: boolean;
|
|
5272
|
+
confidence?: number;
|
|
5273
|
+
}
|
|
5274
|
+
interface ReviewerVerificationSummary {
|
|
5275
|
+
blendedScore: number;
|
|
5276
|
+
allPass: boolean;
|
|
5277
|
+
failCount: number;
|
|
5278
|
+
failingLayers?: string[];
|
|
5279
|
+
}
|
|
5280
|
+
interface ReviewerPromptInput {
|
|
5281
|
+
shot: number;
|
|
5282
|
+
userRequest: string;
|
|
5283
|
+
/**
|
|
5284
|
+
* Compact trace summary — tool-call counts, errors, recent activity
|
|
5285
|
+
* lines. Built by the caller from whatever trace format they have;
|
|
5286
|
+
* agent-eval does not prescribe.
|
|
5287
|
+
*/
|
|
5288
|
+
traceSummary: string;
|
|
5289
|
+
verification: ReviewerVerificationSummary;
|
|
5290
|
+
memory: ReviewerMemoryEntry[];
|
|
5291
|
+
/**
|
|
5292
|
+
* Optional extra context injected into the prompt between the trace
|
|
5293
|
+
* and the verification blocks. Use for workdir file-tree snapshots,
|
|
5294
|
+
* scaffold descriptions, or any environmental fact the reviewer
|
|
5295
|
+
* needs to direct the next shot accurately.
|
|
5296
|
+
*/
|
|
5297
|
+
extraContext?: string;
|
|
5298
|
+
/**
|
|
5299
|
+
* Optional extra section appended at the end of the prompt (e.g.
|
|
5300
|
+
* leaf metadata, scenario id). Free-form — no agent-eval-shaped
|
|
5301
|
+
* schema.
|
|
5302
|
+
*/
|
|
5303
|
+
trailingContext?: string;
|
|
5304
|
+
}
|
|
5305
|
+
interface ReviewerOutput {
|
|
5306
|
+
shot: number;
|
|
5307
|
+
observations: string;
|
|
5308
|
+
diagnosis: string;
|
|
5309
|
+
nextShotInstruction: string;
|
|
5310
|
+
shouldContinue: boolean;
|
|
5311
|
+
/** 0..1 self-assessed confidence in the directive. */
|
|
5312
|
+
confidence: number;
|
|
5313
|
+
/** LLM cost in USD if the transport reports it, else null. */
|
|
5314
|
+
costUsd: number | null;
|
|
5315
|
+
durationMs: number;
|
|
5316
|
+
/** False when the LLM errored or returned malformed JSON; caller soft-fails to defaults. */
|
|
5317
|
+
available: boolean;
|
|
5318
|
+
error?: string;
|
|
5319
|
+
}
|
|
5320
|
+
interface ReviewerSoftFailDefaults {
|
|
5321
|
+
observations?: string;
|
|
5322
|
+
diagnosis?: string;
|
|
5323
|
+
nextShotInstruction?: string;
|
|
5324
|
+
shouldContinue?: boolean;
|
|
5325
|
+
confidence?: number;
|
|
5326
|
+
}
|
|
5327
|
+
interface CreateDefaultReviewerOptions {
|
|
5328
|
+
/** Model id to call. */
|
|
5329
|
+
model: string;
|
|
5330
|
+
/** Per-call timeout. Default 180s. */
|
|
5331
|
+
timeoutMs?: number;
|
|
5332
|
+
/** LlmClient transport config (baseUrl, apiKey, authHeader, etc.). */
|
|
5333
|
+
llm?: LlmClientOptions;
|
|
5334
|
+
/**
|
|
5335
|
+
* Override the prompt builder. Default: `buildReviewerPrompt`.
|
|
5336
|
+
* Consumers with different reviewer voices pass their own.
|
|
5337
|
+
*/
|
|
5338
|
+
promptBuilder?: (input: ReviewerPromptInput) => {
|
|
5339
|
+
system: string;
|
|
5340
|
+
user: string;
|
|
5341
|
+
};
|
|
5342
|
+
/**
|
|
5343
|
+
* Soft-fail values when the LLM throws or returns unparseable JSON.
|
|
5344
|
+
* Matches VerticalBench's shipped policy: continue with generic
|
|
5345
|
+
* instruction at confidence 0.3 so the worker keeps trying.
|
|
5346
|
+
*/
|
|
5347
|
+
softFailDefaults?: ReviewerSoftFailDefaults;
|
|
5348
|
+
}
|
|
5349
|
+
/**
|
|
5350
|
+
* Build the reviewer's system + user messages. Pure function, no LLM
|
|
5351
|
+
* call. Callers that want their own transport or a different structured
|
|
5352
|
+
* output can use this and skip `createDefaultReviewer` entirely.
|
|
5353
|
+
*/
|
|
5354
|
+
declare function buildReviewerPrompt(input: ReviewerPromptInput): {
|
|
5355
|
+
system: string;
|
|
5356
|
+
user: string;
|
|
5357
|
+
};
|
|
5358
|
+
/**
|
|
5359
|
+
* Factory: returns a function that invokes the default reviewer against
|
|
5360
|
+
* an LLM and parses the structured output. Soft-fails to the provided
|
|
5361
|
+
* defaults on LLM throw or JSON-parse error so the shot loop keeps
|
|
5362
|
+
* moving rather than crashing.
|
|
5363
|
+
*/
|
|
5364
|
+
declare function createDefaultReviewer(options: CreateDefaultReviewerOptions): (input: ReviewerPromptInput) => Promise<ReviewerOutput>;
|
|
5365
|
+
|
|
5366
|
+
/**
|
|
5367
|
+
* Semantic concept judge — "does the built artifact actually implement
|
|
5368
|
+
* the features the user asked for?"
|
|
5369
|
+
*
|
|
5370
|
+
* Distinct from the domain/code/coherence judges in `judges.ts`:
|
|
5371
|
+
* - those judges score free-form conversational agent outputs along
|
|
5372
|
+
* quality dimensions (accuracy, depth, etc.)
|
|
5373
|
+
* - this judge scores a *built artifact* (served HTML + source files)
|
|
5374
|
+
* against an explicit list of expected concepts, returning per-concept
|
|
5375
|
+
* {present, score 0-10, evidence, severity}.
|
|
5376
|
+
*
|
|
5377
|
+
* The judge is strict about distinguishing (a) a working implementation
|
|
5378
|
+
* from (b) a keyword-present stub. "// TODO: mint button" is NOT present.
|
|
5379
|
+
* Only real, functional, wired-up code counts.
|
|
5380
|
+
*
|
|
5381
|
+
* Use via {@link createSemanticConceptJudge} or directly via
|
|
5382
|
+
* {@link runSemanticConceptJudge}. Soft-fails (available=false) on LLM
|
|
5383
|
+
* or JSON-parse errors so the caller can treat that as "layer skipped"
|
|
5384
|
+
* rather than "layer failed" in a multi-layer pipeline.
|
|
5385
|
+
*/
|
|
5386
|
+
|
|
5387
|
+
/**
|
|
5388
|
+
* Implementation complexity class for weighted scoring (added 0.11).
|
|
5389
|
+
*
|
|
5390
|
+
* - `render` (default): the concept is a UI surface that displays static
|
|
5391
|
+
* data — render a list, show a counter, lay out a button. Single-file
|
|
5392
|
+
* work, no external integration.
|
|
5393
|
+
* - `integrate`: the concept requires wiring a real external system —
|
|
5394
|
+
* wallet connect (wagmi + RainbowKit + chain config), payment provider
|
|
5395
|
+
* (Stripe Elements + intent + webhook), an API client with auth.
|
|
5396
|
+
* Multi-file, library-knowledge, runtime correctness matters.
|
|
5397
|
+
* - `compute`: the concept requires algorithmic work — solver, simulator,
|
|
5398
|
+
* constraint propagation, ML inference. Correctness > UI polish.
|
|
5399
|
+
*
|
|
5400
|
+
* Default weights (when applied via `weightConcepts: 'complexity'`):
|
|
5401
|
+
* render=1.0, integrate=2.0, compute=2.5
|
|
5402
|
+
*
|
|
5403
|
+
* Cross-vertical scoring without complexity weighting silently inflates
|
|
5404
|
+
* the rate of UI-heavy verticals (healthcare, fintech dashboards) vs
|
|
5405
|
+
* integration-heavy verticals (DeFi, wallets) — all concepts treated
|
|
5406
|
+
* equally even though the agent does 2-3x the work for `integrate`.
|
|
5407
|
+
*/
|
|
5408
|
+
type ConceptComplexity = 'render' | 'integrate' | 'compute';
|
|
5409
|
+
interface ConceptSpec {
|
|
5410
|
+
name: string;
|
|
5411
|
+
/** Short hints that help the judge; not used for matching. */
|
|
5412
|
+
keywords?: string[];
|
|
5413
|
+
/** Optional explicit weight; default 1.0. Overrides complexity-derived weight. */
|
|
5414
|
+
weight?: number;
|
|
5415
|
+
/** Implementation complexity class. Default `render`. */
|
|
5416
|
+
complexity?: ConceptComplexity;
|
|
5417
|
+
}
|
|
5418
|
+
interface ConceptFinding {
|
|
5419
|
+
concept: string;
|
|
5420
|
+
present: boolean;
|
|
5421
|
+
/** 0..10. 10 = production-ready; 7 = functional but thin; 4 = partial; 0 = absent. */
|
|
5422
|
+
score: number;
|
|
5423
|
+
evidence: string;
|
|
5424
|
+
severity: Severity;
|
|
5425
|
+
}
|
|
5426
|
+
interface SemanticConceptJudgeInput {
|
|
5427
|
+
/** Full natural-language prompt the agent was handed. */
|
|
5428
|
+
userRequest: string;
|
|
5429
|
+
/** Rendered HTML the preview returns (UI artifacts). Optional. */
|
|
5430
|
+
servedHtml?: string;
|
|
5431
|
+
/** Top-level source files from the agent's workdir. */
|
|
5432
|
+
sourceFiles: Array<{
|
|
5433
|
+
path: string;
|
|
5434
|
+
content: string;
|
|
5435
|
+
}>;
|
|
5436
|
+
/** The expected concept list. */
|
|
5437
|
+
expectedConcepts: ConceptSpec[];
|
|
5438
|
+
/** Free-form metadata (id, difficulty) to inject into the prompt. */
|
|
5439
|
+
artifactLabel?: string;
|
|
5440
|
+
artifactDescription?: string;
|
|
5441
|
+
}
|
|
5442
|
+
interface SemanticConceptJudgeResult {
|
|
5443
|
+
kind: 'semantic-concept';
|
|
5444
|
+
version: string;
|
|
5445
|
+
/** Normalized 0..1 score — mean of per-concept scores / 10, weighted per `weightConcepts` (plain mean by default). */
|
|
5446
|
+
score: number;
|
|
5447
|
+
presentCount: number;
|
|
5448
|
+
totalCount: number;
|
|
5449
|
+
findings: ConceptFinding[];
|
|
5450
|
+
summary: string;
|
|
5451
|
+
durationMs: number;
|
|
5452
|
+
costUsd: number | null;
|
|
5453
|
+
/** False on LLM/JSON error — treat as "skipped / unable to judge" in pipelines. */
|
|
5454
|
+
available: boolean;
|
|
5455
|
+
error?: string;
|
|
5456
|
+
}
|
|
5457
|
+
/**
|
|
5458
|
+
* Score-aggregation strategy. Default `mean` (legacy behavior — 0.10
|
|
5459
|
+
* and earlier always averaged 0-10 scores). `complexity` applies the
|
|
5460
|
+
* default weight table (render=1, integrate=2, compute=2.5) unless a
|
|
5461
|
+
* concept has an explicit `weight`. `explicit` honors only `weight`
|
|
5462
|
+
* (defaulting to 1 for unspecified).
|
|
5463
|
+
*/
|
|
5464
|
+
type ConceptWeightStrategy = 'mean' | 'complexity' | 'explicit';
|
|
5465
|
+
declare const DEFAULT_COMPLEXITY_WEIGHTS: Record<ConceptComplexity, number>;
|
|
5466
|
+
interface SemanticConceptJudgeOptions {
|
|
5467
|
+
/** Model id to call. Default 'claude-sonnet-4-6' via agent-eval defaults. */
|
|
5468
|
+
model?: string;
|
|
5469
|
+
/** Per-call timeout. Default 180s. */
|
|
5470
|
+
timeoutMs?: number;
|
|
5471
|
+
/** Pipeline budget for the prompt (source blob truncation). Default 45000. */
|
|
5472
|
+
maxSourceChars?: number;
|
|
5473
|
+
/** Per-file cap before inclusion. Default 20000. */
|
|
5474
|
+
maxPerFileChars?: number;
|
|
5475
|
+
/** HTML cap. Default 30000. */
|
|
5476
|
+
maxHtmlChars?: number;
|
|
5477
|
+
/** LlmClient config (baseUrl, apiKey, authHeader, …). */
|
|
5478
|
+
llm?: LlmClientOptions;
|
|
5479
|
+
/**
|
|
5480
|
+
* Score aggregation strategy. Default `mean` for backward compatibility
|
|
5481
|
+
* with 0.10 and earlier callers. Cross-vertical comparisons should use
|
|
5482
|
+
* `complexity` to neutralize the integrate-vs-render asymmetry.
|
|
5483
|
+
*/
|
|
5484
|
+
weightConcepts?: ConceptWeightStrategy;
|
|
5485
|
+
/** Override the default complexity → weight table. */
|
|
5486
|
+
complexityWeights?: Partial<Record<ConceptComplexity, number>>;
|
|
5487
|
+
}
|
|
5488
|
+
declare const SEMANTIC_CONCEPT_JUDGE_VERSION = "semantic-concept-judge-v1-2026-04-24";
|
|
5489
|
+
/**
|
|
5490
|
+
* Run the semantic concept judge. Soft-fails to available=false on
|
|
5491
|
+
* LLM/JSON errors — callers in a MultiLayerVerifier pipeline can treat
|
|
5492
|
+
* that as "skip" rather than "fail."
|
|
5493
|
+
*/
|
|
5494
|
+
declare function runSemanticConceptJudge(input: SemanticConceptJudgeInput, options?: SemanticConceptJudgeOptions): Promise<SemanticConceptJudgeResult>;
|
|
5495
|
+
/**
|
|
5496
|
+
* Factory: pin LLM options once, return a closure that accepts inputs.
|
|
5497
|
+
* Convenient for pipelines that want to share a single LlmClient config.
|
|
5498
|
+
*/
|
|
5499
|
+
declare function createSemanticConceptJudge(options?: SemanticConceptJudgeOptions): (input: SemanticConceptJudgeInput) => Promise<SemanticConceptJudgeResult>;
|
|
5500
|
+
|
|
5501
|
+
/**
|
|
5502
|
+
* Intent-match judge — "did the agent build the right APP, ignoring
|
|
5503
|
+
* whether every feature is wired up?"
|
|
5504
|
+
*
|
|
5505
|
+
* Distinct from {@link runSemanticConceptJudge} which scores per-concept
|
|
5506
|
+
* presence. The semantic judge can return 0/4 concepts present even
|
|
5507
|
+
* when the agent built a thoughtful, polished, on-brief app that just
|
|
5508
|
+
* lacks one or two features. The semantic judge can also return 4/4
|
|
5509
|
+
* present even when the agent shipped the wrong project (keyword-rich
|
|
5510
|
+
* stub).
|
|
5511
|
+
*
|
|
5512
|
+
* Intent-match asks ONE question:
|
|
5513
|
+
* "Looking at the agent's work as a whole — independent of feature
|
|
5514
|
+
* coverage — is this an honest attempt at the user's request?"
|
|
5515
|
+
*
|
|
5516
|
+
* Returns a 0–1 score and a 1-sentence evidence string. Use as a sanity
|
|
5517
|
+
* check on `completenessScore`-style metrics: if intent-match is high
|
|
5518
|
+
* and concept count is low, the agent built the right thing but is
|
|
5519
|
+
* missing features (ship and iterate). If intent-match is low, the
|
|
5520
|
+
* agent built the wrong thing (reject regardless of concept count).
|
|
5521
|
+
*
|
|
5522
|
+
* Soft-fails on LLM/JSON error (`available: false`) so callers can
|
|
5523
|
+
* treat failure as "judge skipped."
|
|
5524
|
+
*
|
|
5525
|
+
* Added in 0.11 to replace the lying `completenessScore: 1` field that
|
|
5526
|
+
* VerticalBench shipped pre-Gen-48 — that field was keyword-driven and
|
|
5527
|
+
* reported 1 even on builds with zero spec concepts implemented.
|
|
5528
|
+
*/
|
|
5529
|
+
|
|
5530
|
+
declare const INTENT_MATCH_JUDGE_VERSION = "intent-match-judge-v1-2026-04-24";
|
|
5531
|
+
interface IntentMatchInput {
|
|
5532
|
+
/** The full natural-language prompt the agent was handed. */
|
|
5533
|
+
userRequest: string;
|
|
5534
|
+
/** Top-level source files from the agent's workdir. */
|
|
5535
|
+
sourceFiles: Array<{
|
|
5536
|
+
path: string;
|
|
5537
|
+
content: string;
|
|
5538
|
+
}>;
|
|
5539
|
+
/** Rendered HTML the preview returned, when available. */
|
|
5540
|
+
servedHtml?: string;
|
|
5541
|
+
/** Optional metadata to inject (id, vertical, difficulty). */
|
|
5542
|
+
artifactLabel?: string;
|
|
5543
|
+
artifactDescription?: string;
|
|
5544
|
+
}
|
|
5545
|
+
interface IntentMatchResult {
|
|
5546
|
+
kind: 'intent-match';
|
|
5547
|
+
version: string;
|
|
5548
|
+
/** 0..1 — 1 = unmistakably the right app, 0 = unrelated to the brief. */
|
|
5549
|
+
score: number;
|
|
5550
|
+
/** One-sentence rationale citing concrete evidence (file or HTML). */
|
|
5551
|
+
evidence: string;
|
|
5552
|
+
durationMs: number;
|
|
5553
|
+
costUsd: number | null;
|
|
5554
|
+
available: boolean;
|
|
5555
|
+
error?: string;
|
|
5556
|
+
}
|
|
5557
|
+
/**
 * Optional tuning knobs for the intent-match judge, accepted by
 * `runIntentMatchJudge` and `createIntentMatchJudge`.
 * NOTE(review): field names mirror `SemanticConceptJudgeOptions`; the defaults
 * this judge applies are not visible in the declaration — confirm in the
 * implementation before relying on them.
 */
interface IntentMatchOptions {
|
|
5558
|
+
/** Model id to call. Default not shown here — TODO confirm. */
model?: string;
|
|
5559
|
+
/** Per-call timeout in ms (per the `Ms` suffix). Default not shown here. */
timeoutMs?: number;
|
|
5560
|
+
/** Presumably the character budget for the combined source files in the prompt — TODO confirm. */
maxSourceChars?: number;
|
|
5561
|
+
/** Presumably the per-file character cap applied before inclusion — TODO confirm. */
maxPerFileChars?: number;
|
|
5562
|
+
/** Presumably the character cap applied to `servedHtml` — TODO confirm. */
maxHtmlChars?: number;
|
|
5563
|
+
/** LLM client transport config. */
llm?: LlmClientOptions;
|
|
5564
|
+
}
|
|
5565
|
+
/**
|
|
5566
|
+
* Run the intent-match judge. Soft-fails to available=false on error.
|
|
5567
|
+
*/
|
|
5568
|
+
declare function runIntentMatchJudge(input: IntentMatchInput, options?: IntentMatchOptions): Promise<IntentMatchResult>;
|
|
5569
|
+
/**
|
|
5570
|
+
* Factory: pin LLM options once, return a closure.
|
|
5571
|
+
*/
|
|
5572
|
+
declare function createIntentMatchJudge(options?: IntentMatchOptions): (input: IntentMatchInput) => Promise<IntentMatchResult>;
|
|
5573
|
+
|
|
5574
|
+
/**
|
|
5575
|
+
* Flow layer — drive a previewed app through a scripted user walk.
|
|
5576
|
+
*
|
|
5577
|
+
* The MultiLayerVerifier already had a `flow` slot wired in
|
|
5578
|
+
* VerticalBench's verification-harness, but the layer module was
|
|
5579
|
+
* always-skipped ("flow layer module not yet wired"). This adds the
|
|
5580
|
+
* module: a Layer<Env> that takes a {@link FlowSpec} (URL + steps),
|
|
5581
|
+
* opens its URL via the supplied {@link FlowRunner}, executes
|
|
5582
|
+
* each step, and returns a LayerResult whose `findings` enumerate
|
|
5583
|
+
* which step failed.
|
|
5584
|
+
*
|
|
5585
|
+
* The runner is injected so this module can swap between:
|
|
5586
|
+
* - production: agent-browser CLI (a11y-tree based steps)
|
|
5587
|
+
* - test: in-memory mock that returns canned step outcomes
|
|
5588
|
+
* - future: Playwright, Puppeteer, custom scrapers
|
|
5589
|
+
*
|
|
5590
|
+
* Shipped in 0.11 alongside {@link runIntentMatchJudge} — together they
|
|
5591
|
+
* close the "the agent shipped the wrong app and we didn't catch it"
|
|
5592
|
+
* blind spot. Intent-match catches "wrong app entirely"; flow catches
|
|
5593
|
+
* "right app but the buttons don't work."
|
|
5594
|
+
*/
|
|
5595
|
+
|
|
5596
|
+
type FlowAction = 'navigate' | 'click' | 'fill' | 'expect-text' | 'expect-element' | 'expect-url' | 'wait';
|
|
5597
|
+
interface FlowStep {
|
|
5598
|
+
/** What this step does. */
|
|
5599
|
+
action: FlowAction;
|
|
5600
|
+
/** Human-readable description for findings. */
|
|
5601
|
+
describe?: string;
|
|
5602
|
+
/**
|
|
5603
|
+
* For navigate/expect-url: full URL. For click/fill/expect-element:
|
|
5604
|
+
* accessible-name selector or CSS selector.
|
|
5605
|
+
* For expect-text: substring expected on the page.
|
|
5606
|
+
* For wait: ignored (use `value` for ms).
|
|
5607
|
+
*/
|
|
5608
|
+
target?: string;
|
|
5609
|
+
/** For fill: text to enter. For wait: duration in ms, written as a string (e.g. `'500'`). */
|
|
5610
|
+
value?: string;
|
|
5611
|
+
/** Severity of a failure. Default `major`. */
|
|
5612
|
+
severity?: Severity;
|
|
5613
|
+
}
|
|
5614
|
+
interface FlowSpec {
|
|
5615
|
+
/** Initial URL the runner should open. */
|
|
5616
|
+
url: string;
|
|
5617
|
+
/** Ordered steps. Stops at the first failure unless `continueOnFail: true`. */
|
|
5618
|
+
steps: FlowStep[];
|
|
5619
|
+
/** When true, execute every step even after a failure (collect all findings). */
|
|
5620
|
+
continueOnFail?: boolean;
|
|
5621
|
+
/** Per-step wall cap (ms). Default 15s. */
|
|
5622
|
+
stepTimeoutMs?: number;
|
|
5623
|
+
}
|
|
5624
|
+
interface FlowRunnerStepResult {
|
|
5625
|
+
ok: boolean;
|
|
5626
|
+
/** Concrete observation: matched text snippet, captured URL, error message. */
|
|
5627
|
+
evidence?: string;
|
|
5628
|
+
/** Wall-clock duration of the step. */
|
|
5629
|
+
durationMs?: number;
|
|
5630
|
+
}
|
|
5631
|
+
interface FlowRunner {
|
|
5632
|
+
/** Open the target URL. Returns when the page is interactable. */
|
|
5633
|
+
open(url: string): Promise<FlowRunnerStepResult>;
|
|
5634
|
+
/** Execute one step. The runner owns interpretation of `target`. */
|
|
5635
|
+
step(step: FlowStep): Promise<FlowRunnerStepResult>;
|
|
5636
|
+
/** Tear down browser, free resources. Always called once per layer.run. */
|
|
5637
|
+
close(): Promise<void>;
|
|
5638
|
+
}
|
|
5639
|
+
interface FlowLayerEnv {
|
|
5640
|
+
/** Optional override per-call. Defaults supplied by the layer factory. */
|
|
5641
|
+
flowSpec?: FlowSpec;
|
|
5642
|
+
}
|
|
5643
|
+
interface FlowLayerFactoryInput {
|
|
5644
|
+
/** Static spec (used when env doesn't supply one). */
|
|
5645
|
+
flowSpec?: FlowSpec;
|
|
5646
|
+
/** Build the runner per call (lets the layer create + tear down per leaf). */
|
|
5647
|
+
runner: () => FlowRunner | Promise<FlowRunner>;
|
|
5648
|
+
/** Layer name. Default `flow`. */
|
|
5649
|
+
name?: string;
|
|
5650
|
+
/** Layer dependencies — default `['serve']` so a non-booting preview skips us. */
|
|
5651
|
+
dependsOn?: string[];
|
|
5652
|
+
/** Layer weight for blendedScore (0..1+). Default 1. */
|
|
5653
|
+
weight?: number;
|
|
5654
|
+
/** Cap for the entire flow run (ms). Default 60s. */
|
|
5655
|
+
capMs?: number;
|
|
5656
|
+
}
|
|
5657
|
+
/**
|
|
5658
|
+
* Build a flow layer that scripts a user walk via the supplied runner.
|
|
5659
|
+
*
|
|
5660
|
+
* Score: 1.0 when every step passed; otherwise 1 - (failedSteps / totalSteps).
|
|
5661
|
+
* Status: `pass` iff every step passed; `fail` if any step failed; `error`
|
|
5662
|
+
* on runner setup error; `skipped` when no flowSpec is available.
|
|
5663
|
+
*/
|
|
5664
|
+
declare function flowLayer<Env extends FlowLayerEnv = FlowLayerEnv>(input: FlowLayerFactoryInput): Layer<Env>;
|
|
5665
|
+
|
|
5666
|
+
/**
|
|
5667
|
+
* Deploy gate layer — would the agent's build actually publish?
|
|
5668
|
+
*
|
|
5669
|
+
* The product that Blueprint Agent fronts promises "go from idea to live URL."
|
|
5670
|
+
* Pre-Gen-48 the eval stopped at install/typecheck/build/serve — every
|
|
5671
|
+
* one of which can pass while `vite build` (or `next build`, etc) fails
|
|
5672
|
+
* on a production-only constraint (env-var requirement, dynamic import
|
|
5673
|
+
* not statically resolvable, missing public asset).
|
|
5674
|
+
*
|
|
5675
|
+
* Deploy gate runs the production build via the supplied {@link DeployRunner}
|
|
5676
|
+
* and asserts:
|
|
5677
|
+
* - command exited 0
|
|
5678
|
+
* - artifact dir contains an entry point (index.html for static SPAs,
|
|
5679
|
+
* equivalent per framework family)
|
|
5680
|
+
*
|
|
5681
|
+
* Shipped in 0.11 with the canonical `vite` runner. Future generations
|
|
5682
|
+
* add wrangler-deploy --dry-run, next-build, etc — each as another
|
|
5683
|
+
* runner factory.
|
|
5684
|
+
*/
|
|
5685
|
+
|
|
5686
|
+
type DeployFamily = 'frontend-static' | 'nextjs' | 'remix' | 'fullstack-ts';
|
|
5687
|
+
interface DeployRunResult {
|
|
5688
|
+
ok: boolean;
|
|
5689
|
+
/** Stdout/stderr tail surfaced as evidence. Bounded in caller. */
|
|
5690
|
+
output?: string;
|
|
5691
|
+
/** Wall-clock duration of the build command. */
|
|
5692
|
+
durationMs?: number;
|
|
5693
|
+
/** Path to artifact directory the runner expects (dist/, .next/, build/, etc). */
|
|
5694
|
+
artifactDir?: string;
|
|
5695
|
+
/** True iff artifactDir contains the family's expected entry point. */
|
|
5696
|
+
artifactValid?: boolean;
|
|
5697
|
+
}
|
|
5698
|
+
interface DeployRunner {
|
|
5699
|
+
/** Run the production build. The runner owns command + cwd. */
|
|
5700
|
+
run(): Promise<DeployRunResult>;
|
|
5701
|
+
}
|
|
5702
|
+
interface DeployGateLayerInput {
|
|
5703
|
+
/** Build the runner per call. */
|
|
5704
|
+
runner: () => DeployRunner | Promise<DeployRunner>;
|
|
5705
|
+
/** Family hint — for logging, surfaced in diagnostics. */
|
|
5706
|
+
family?: DeployFamily;
|
|
5707
|
+
/** Layer name. Default `deploy`. */
|
|
5708
|
+
name?: string;
|
|
5709
|
+
/** Layer dependencies — default `['build']`. */
|
|
5710
|
+
dependsOn?: string[];
|
|
5711
|
+
/** Weight in blendedScore. Default 1. */
|
|
5712
|
+
weight?: number;
|
|
5713
|
+
/** Cap (ms). Default 120s — prod builds are slower than dev. */
|
|
5714
|
+
capMs?: number;
|
|
5715
|
+
/** When true, treat artifactValid=false as a fail (default true). */
|
|
5716
|
+
requireArtifact?: boolean;
|
|
5717
|
+
}
|
|
5718
|
+
/**
|
|
5719
|
+
* Build a deploy gate layer that runs the production build and verifies
|
|
5720
|
+
* the artifact. Pass: ok && artifactValid. Score: 1.0 (pass) or 0 (fail).
|
|
5721
|
+
*
|
|
5722
|
+
* For families where artifact-validation isn't applicable (e.g. a
|
|
5723
|
+
* server-rendered build that prints a manifest), set `requireArtifact:
|
|
5724
|
+
* false` and rely on the runner's own ok signal.
|
|
5725
|
+
*/
|
|
5726
|
+
declare function deployGateLayer<Env = unknown>(input: DeployGateLayerInput): Layer<Env>;
|
|
5727
|
+
interface ViteDeployRunnerInput {
|
|
5728
|
+
/** Workdir to build. The runner cd's here. */
|
|
5729
|
+
workdir: string;
|
|
5730
|
+
/**
|
|
5731
|
+
* Function to run a shell command in `workdir`. Same shape as
|
|
5732
|
+
* agent-eval's CommandRunner.run for compositional reuse.
|
|
5733
|
+
*/
|
|
5734
|
+
exec: (cmd: string, opts?: {
|
|
5735
|
+
cwd?: string;
|
|
5736
|
+
timeoutMs?: number;
|
|
5737
|
+
}) => Promise<{
|
|
5738
|
+
stdout: string;
|
|
5739
|
+
stderr: string;
|
|
5740
|
+
exitCode: number;
|
|
5741
|
+
}>;
|
|
5742
|
+
/**
|
|
5743
|
+
* Function to test whether a path exists in the workdir. Inject
|
|
5744
|
+
* `(p) => existsSync(join(workdir, p))` for host runs.
|
|
5745
|
+
*/
|
|
5746
|
+
exists: (relativePath: string) => boolean | Promise<boolean>;
|
|
5747
|
+
/** Build command. Default `npm run build`. */
|
|
5748
|
+
buildCommand?: string;
|
|
5749
|
+
/** Artifact directory to validate. Default `dist`. */
|
|
5750
|
+
artifactDir?: string;
|
|
5751
|
+
/** Entry-point file under artifactDir. Default `index.html`. */
|
|
5752
|
+
artifactEntry?: string;
|
|
5753
|
+
/** Per-build cap (ms). Default 90s. */
|
|
5754
|
+
timeoutMs?: number;
|
|
5755
|
+
}
|
|
5756
|
+
/**
|
|
5757
|
+
* Canonical runner for `frontend-static` family — runs the build script,
|
|
5758
|
+
* validates `<artifactDir>/<artifactEntry>` exists. Use as the `runner:`
|
|
5759
|
+
* factory for {@link deployGateLayer}.
|
|
5760
|
+
*/
|
|
5761
|
+
declare function viteDeployRunner(input: ViteDeployRunnerInput): DeployRunner;
|
|
5762
|
+
|
|
5763
|
+
/**
|
|
5764
|
+
* Keyword-coverage judge — baseline complement to the semantic concept
|
|
5765
|
+
* judge.
|
|
5766
|
+
*
|
|
5767
|
+
* Where {@link runSemanticConceptJudge} uses an LLM to read source code
|
|
5768
|
+
* and decide whether a concept is REALLY implemented (not just
|
|
5769
|
+
* keyword-mentioned), this judge does the cheap, deterministic version:
|
|
5770
|
+
* fetch the served preview, concatenate every linked CSS/JS asset, and
|
|
5771
|
+
* substring-match each expected concept's keywords against the
|
|
5772
|
+
* concatenated haystack. Optional `requiredElement` selector adds a
|
|
5773
|
+
* structural gate so "supply counter" can require an actual `<input>` or
|
|
5774
|
+
* `<table>`, not just a comment containing the word.
|
|
5775
|
+
*
|
|
5776
|
+
* Use both judges. Keyword coverage is a fast 0-cost gate — a stub page
|
|
5777
|
+
* with the right keywords passes here, fails the semantic judge. Score
|
|
5778
|
+
* divergence between the two is itself a signal: high keyword coverage
|
|
5779
|
+
* + low semantic = "the agent slapped the right words on the right
|
|
5780
|
+
* scaffold but didn't wire any of it up."
|
|
5781
|
+
*
|
|
5782
|
+
* Pure functions, soft-fail on fetch error, no LLM dependency.
|
|
5783
|
+
*/
|
|
5784
|
+
interface KeywordConceptSpec {
|
|
5785
|
+
name: string;
|
|
5786
|
+
keywords: string[];
|
|
5787
|
+
/**
|
|
5788
|
+
* Optional CSS selector that must match in the HTML for the concept
|
|
5789
|
+
* to count as present. Tiny subset:
|
|
5790
|
+
* - `tag` (e.g. `form`)
|
|
5791
|
+
* - `tag[attr="value"]` (e.g. `input[type="number"]`)
|
|
5792
|
+
* - `tag[attr]` (presence only)
|
|
5793
|
+
* Anything more complex is rejected with `null` (treated as
|
|
5794
|
+
* "unenforced", not "failed").
|
|
5795
|
+
*/
|
|
5796
|
+
requiredElement?: string;
|
|
5797
|
+
}
|
|
5798
|
+
interface KeywordCoverageFinding {
|
|
5799
|
+
concept: string;
|
|
5800
|
+
found: boolean;
|
|
5801
|
+
matchedKeywords: string[];
|
|
5802
|
+
/** True iff the optional requiredElement selector matched; null when no selector. */
|
|
5803
|
+
requiredElementPresent: boolean | null;
|
|
5804
|
+
}
|
|
5805
|
+
interface KeywordCoverageResult {
|
|
5806
|
+
/** 0..1 share of concepts satisfied. */
|
|
5807
|
+
score: number;
|
|
5808
|
+
presentCount: number;
|
|
5809
|
+
totalCount: number;
|
|
5810
|
+
findings: KeywordCoverageFinding[];
|
|
5811
|
+
durationMs: number;
|
|
5812
|
+
/** Total bytes assembled across html + linked assets. */
|
|
5813
|
+
totalAssembledBytes: number;
|
|
5814
|
+
/** Soft-failure reason if the audit couldn't run. */
|
|
5815
|
+
error?: string;
|
|
5816
|
+
}
|
|
5817
|
+
interface KeywordCoverageOptions {
|
|
5818
|
+
/** Override fetch implementation — for tests. */
|
|
5819
|
+
fetch?: typeof fetch;
|
|
5820
|
+
/** Per-asset fetch timeout (default 3s). */
|
|
5821
|
+
assetTimeoutMs?: number;
|
|
5822
|
+
/** Initial-HTML fetch timeout (default 5s). */
|
|
5823
|
+
htmlTimeoutMs?: number;
|
|
5824
|
+
}
|
|
5825
|
+
/**
|
|
5826
|
+
* Element-presence check using a tiny CSS-selector subset. Returns
|
|
5827
|
+
* null when the selector isn't supported — caller treats that as
|
|
5828
|
+
* "unenforced" rather than "failed."
|
|
5829
|
+
*/
|
|
5830
|
+
declare function htmlContainsElement(html: string, selector: string): boolean | null;
|
|
5831
|
+
/**
|
|
5832
|
+
* Pull every `<link rel=stylesheet href>` and `<script src>` from a
|
|
5833
|
+
* raw HTML body. Returns absolute URLs resolved against `baseUrl`.
|
|
5834
|
+
* Permissive regex — agent-authored markup doesn't always quote
|
|
5835
|
+
* attributes the same way.
|
|
5836
|
+
*/
|
|
5837
|
+
declare function extractAssetUrls(html: string, baseUrl: string): string[];
|
|
5838
|
+
/**
|
|
5839
|
+
* Score expected concepts against an already-fetched HTML payload + any
|
|
5840
|
+
* pre-fetched CSS/JS assets. Use when the runner has the bytes in hand
|
|
5841
|
+
* and doesn't want a fresh HTTP round-trip — e.g. sandbox runtime where
|
|
5842
|
+
* the preview content was fetched via curl from inside the container.
|
|
5843
|
+
*/
|
|
5844
|
+
declare function runKeywordCoverageJudge(html: string, expectedConcepts: ReadonlyArray<KeywordConceptSpec>, assets?: ReadonlyArray<string>): KeywordCoverageResult;
|
|
5845
|
+
/**
|
|
5846
|
+
* URL-fetch flavor — GET the preview, parallel-fetch every linked
|
|
5847
|
+
* stylesheet + script (with bounded timeouts, soft-fail individually),
|
|
5848
|
+
* then score via {@link runKeywordCoverageJudge}.
|
|
5849
|
+
*/
|
|
5850
|
+
declare function runKeywordCoverageJudgeUrl(previewUrl: string, expectedConcepts: ReadonlyArray<KeywordConceptSpec>, options?: KeywordCoverageOptions): Promise<KeywordCoverageResult>;
|
|
5851
|
+
|
|
5852
|
+
/**
|
|
5853
|
+
* Toolchain error-count extractor.
|
|
5854
|
+
*
|
|
5855
|
+
* Given stderr/stdout from a compiler or test runner, count the number
|
|
5856
|
+
* of reported errors/failures. Patterns are deliberately narrow —
|
|
5857
|
+
* unknown stderr returns `null` rather than zero so callers can
|
|
5858
|
+
* distinguish "no errors" from "different toolchain, couldn't parse".
|
|
5859
|
+
*
|
|
5860
|
+
* All patterns are anchored to the start of a line and use bounded
|
|
5861
|
+
* character classes to avoid catastrophic backtracking on pathological
|
|
5862
|
+
* inputs.
|
|
5863
|
+
*
|
|
5864
|
+
* Add new toolchains by appending to {@link ERROR_COUNT_PATTERNS};
|
|
5865
|
+
* order matters only in the sense that the first matching pattern wins.
|
|
5866
|
+
*/
|
|
5867
|
+
interface ErrorCountPattern {
|
|
5868
|
+
/** Stable identifier for logging + tests. */
|
|
5869
|
+
name: string;
|
|
5870
|
+
/** Must be global (`g` flag) — the extractor counts matches. */
|
|
5871
|
+
regex: RegExp;
|
|
5872
|
+
/** Optional post-processing to extract a count from a single captured match. */
|
|
5873
|
+
transform?: (match: RegExpMatchArray) => number;
|
|
5874
|
+
}
|
|
5875
|
+
declare const ERROR_COUNT_PATTERNS: ErrorCountPattern[];
|
|
5876
|
+
interface ExtractOptions {
|
|
5877
|
+
/** Restrict to named patterns — default: all patterns. */
|
|
5878
|
+
only?: string[];
|
|
5879
|
+
/** Additional patterns to consider BEFORE the built-in list. */
|
|
5880
|
+
extra?: ErrorCountPattern[];
|
|
5881
|
+
}
|
|
5882
|
+
interface ExtractResult {
|
|
5883
|
+
/** Total count of matched errors, or null when no pattern matched. */
|
|
5884
|
+
count: number | null;
|
|
5885
|
+
/** Name of the pattern that matched, or null. */
|
|
5886
|
+
matched: string | null;
|
|
5887
|
+
/** Original matches for callers that want to surface specifics. */
|
|
5888
|
+
samples: string[];
|
|
5889
|
+
}
|
|
5890
|
+
/**
|
|
5891
|
+
* Try each pattern in order; return the first with matches.
|
|
5892
|
+
*
|
|
5893
|
+
* Returning `null` (instead of zero) on no-match is deliberate — a
|
|
5894
|
+
* callsite that greps for "typescript errors" on cargo output should
|
|
5895
|
+
* NOT treat that as "zero TS errors" because the toolchain is wrong.
|
|
5896
|
+
*/
|
|
5897
|
+
declare function extractErrorCount(text: string, opts?: ExtractOptions): ExtractResult;
|
|
5898
|
+
|
|
5899
|
+
/**
|
|
5900
|
+
* Reference replay — score an agent against withheld historical outcomes.
|
|
5901
|
+
*
|
|
5902
|
+
* This is the generic version of the public-audit replay pattern:
|
|
5903
|
+
* run a candidate system on an old task, keep the reference answers hidden
|
|
5904
|
+
* until after execution, then score recall/precision and gate promotion
|
|
5905
|
+
* across train/dev/test/holdout splits.
|
|
5906
|
+
*/
|
|
5907
|
+
type ReferenceReplaySplit = 'train' | 'dev' | 'test' | 'holdout';
|
|
5908
|
+
interface ReferenceReplayItem {
|
|
5909
|
+
id: string;
|
|
5910
|
+
title: string;
|
|
5911
|
+
description?: string;
|
|
5912
|
+
severity?: string;
|
|
5913
|
+
tags?: string[];
|
|
5914
|
+
weight?: number;
|
|
5915
|
+
}
|
|
5916
|
+
interface ReferenceReplayCandidate {
|
|
5917
|
+
id: string;
|
|
5918
|
+
title: string;
|
|
5919
|
+
description?: string;
|
|
5920
|
+
severity?: string;
|
|
5921
|
+
tags?: string[];
|
|
5922
|
+
metadata?: Record<string, unknown>;
|
|
5923
|
+
}
|
|
5924
|
+
interface ReferenceReplayScenario {
|
|
5925
|
+
id: string;
|
|
5926
|
+
split?: ReferenceReplaySplit;
|
|
5927
|
+
references: ReferenceReplayItem[];
|
|
5928
|
+
candidates: ReferenceReplayCandidate[];
|
|
5929
|
+
metadata?: Record<string, unknown>;
|
|
5930
|
+
}
|
|
5931
|
+
interface ReferenceReplayCase<Input = unknown> {
|
|
5932
|
+
id: string;
|
|
5933
|
+
split?: ReferenceReplaySplit;
|
|
5934
|
+
input: Input;
|
|
5935
|
+
references: ReferenceReplayItem[];
|
|
5936
|
+
metadata?: Record<string, unknown>;
|
|
5937
|
+
}
|
|
5938
|
+
interface ReferenceReplayExecutionScenario<Input = unknown> {
|
|
5939
|
+
id: string;
|
|
5940
|
+
split: ReferenceReplaySplit;
|
|
5941
|
+
input: Input;
|
|
5942
|
+
metadata?: Record<string, unknown>;
|
|
5943
|
+
}
|
|
5944
|
+
interface ReferenceReplayRunContext {
|
|
5945
|
+
runId: string;
|
|
5946
|
+
caseIndex: number;
|
|
5947
|
+
abortSignal?: AbortSignal;
|
|
5948
|
+
}
|
|
5949
|
+
interface ReferenceReplayAdapter<Input = unknown> {
|
|
5950
|
+
run(scenario: ReferenceReplayExecutionScenario<Input>, context: ReferenceReplayRunContext): Promise<ReferenceReplayCandidate[]>;
|
|
5951
|
+
}
|
|
5952
|
+
type ReferenceReplayAdapterFn<Input = unknown> = (scenario: ReferenceReplayExecutionScenario<Input>, context: ReferenceReplayRunContext) => Promise<ReferenceReplayCandidate[]>;
|
|
5953
|
+
type ReferenceReplayAdapterLike<Input = unknown> = ReferenceReplayAdapter<Input> | ReferenceReplayAdapterFn<Input>;
|
|
5954
|
+
interface ReferenceReplayMatch {
|
|
5955
|
+
scenarioId: string;
|
|
5956
|
+
referenceId: string;
|
|
5957
|
+
candidateId: string | null;
|
|
5958
|
+
score: number;
|
|
5959
|
+
matched: boolean;
|
|
5960
|
+
weight: number;
|
|
5961
|
+
reason: string;
|
|
5962
|
+
}
|
|
5963
|
+
interface ReferenceReplayScenarioScore {
|
|
5964
|
+
scenarioId: string;
|
|
5965
|
+
split: ReferenceReplaySplit;
|
|
5966
|
+
matched: number;
|
|
5967
|
+
total: number;
|
|
5968
|
+
falsePositives: number;
|
|
5969
|
+
matchedWeight: number;
|
|
5970
|
+
totalWeight: number;
|
|
5971
|
+
precision: number;
|
|
5972
|
+
recall: number;
|
|
5973
|
+
f1: number;
|
|
5974
|
+
matches: ReferenceReplayMatch[];
|
|
5975
|
+
}
|
|
5976
|
+
interface ReferenceReplayAggregate {
|
|
5977
|
+
matched: number;
|
|
5978
|
+
total: number;
|
|
5979
|
+
falsePositives: number;
|
|
5980
|
+
matchedWeight: number;
|
|
5981
|
+
totalWeight: number;
|
|
5982
|
+
precision: number;
|
|
5983
|
+
recall: number;
|
|
5984
|
+
f1: number;
|
|
5985
|
+
weightedRecall: number;
|
|
5986
|
+
}
|
|
5987
|
+
interface ReferenceReplayScore {
|
|
5988
|
+
scenarios: ReferenceReplayScenarioScore[];
|
|
5989
|
+
aggregate: ReferenceReplayAggregate;
|
|
5990
|
+
bySplit: Partial<Record<ReferenceReplaySplit, ReferenceReplayAggregate>>;
|
|
5991
|
+
}
|
|
5992
|
+
interface ReferenceMatchResult {
|
|
5993
|
+
score: number;
|
|
5994
|
+
reason?: string;
|
|
5995
|
+
}
|
|
5996
|
+
type ReferenceReplayMatcher = (reference: ReferenceReplayItem, candidate: ReferenceReplayCandidate, scenario: ReferenceReplayScenario) => ReferenceMatchResult;
|
|
5997
|
+
interface ReferenceReplayScoreOptions {
|
|
5998
|
+
matcher?: ReferenceReplayMatcher;
|
|
5999
|
+
matchThreshold?: number;
|
|
6000
|
+
includeHoldout?: boolean;
|
|
6001
|
+
splits?: ReferenceReplaySplit[];
|
|
6002
|
+
}
|
|
6003
|
+
interface ReferenceReplayPromotionPolicy {
|
|
6004
|
+
/** Splits that must improve or stay flat. Default: ['dev', 'test']. */
|
|
6005
|
+
requiredSplits?: ReferenceReplaySplit[];
|
|
6006
|
+
/** Minimum aggregate F1 lift required on required splits. Default 0. */
|
|
6007
|
+
minF1Delta?: number;
|
|
6008
|
+
/** Maximum F1 drop allowed on any compared split. Default 0. */
|
|
6009
|
+
maxRegression?: number;
|
|
6010
|
+
/** If true, holdout must be present and must not regress. Default true. */
|
|
6011
|
+
requireHoldoutNonRegression?: boolean;
|
|
6012
|
+
}
|
|
6013
|
+
interface ReferenceReplaySplitComparison {
|
|
6014
|
+
split: ReferenceReplaySplit;
|
|
6015
|
+
baselineF1: number;
|
|
6016
|
+
candidateF1: number;
|
|
6017
|
+
f1Delta: number;
|
|
6018
|
+
baselineRecall: number;
|
|
6019
|
+
candidateRecall: number;
|
|
6020
|
+
recallDelta: number;
|
|
6021
|
+
}
|
|
6022
|
+
interface ReferenceReplayPromotionDecision {
|
|
6023
|
+
promote: boolean;
|
|
6024
|
+
reason: string;
|
|
6025
|
+
aggregateDelta: number;
|
|
6026
|
+
comparisons: ReferenceReplaySplitComparison[];
|
|
6027
|
+
regressions: ReferenceReplaySplitComparison[];
|
|
6028
|
+
}
|
|
6029
|
+
interface ReferenceReplayCaseRun<Input = unknown> {
|
|
6030
|
+
caseId: string;
|
|
6031
|
+
split: ReferenceReplaySplit;
|
|
6032
|
+
input: Input;
|
|
6033
|
+
metadata?: Record<string, unknown>;
|
|
6034
|
+
references: ReferenceReplayItem[];
|
|
6035
|
+
candidates: ReferenceReplayCandidate[];
|
|
6036
|
+
score: ReferenceReplayScenarioScore;
|
|
6037
|
+
durationMs: number;
|
|
6038
|
+
error?: string;
|
|
6039
|
+
}
|
|
6040
|
+
interface ReferenceReplayRun<Input = unknown> {
|
|
6041
|
+
id: string;
|
|
6042
|
+
variantId?: string;
|
|
6043
|
+
startedAt: number;
|
|
6044
|
+
completedAt: number;
|
|
6045
|
+
durationMs: number;
|
|
6046
|
+
cases: ReferenceReplayCaseRun<Input>[];
|
|
6047
|
+
score: ReferenceReplayScore;
|
|
6048
|
+
metadata?: Record<string, unknown>;
|
|
6049
|
+
}
|
|
6050
|
+
interface ReferenceReplayRunOptions<Input = unknown> extends ReferenceReplayScoreOptions {
|
|
6051
|
+
adapter: ReferenceReplayAdapterLike<Input>;
|
|
6052
|
+
runId?: string;
|
|
6053
|
+
variantId?: string;
|
|
6054
|
+
metadata?: Record<string, unknown>;
|
|
6055
|
+
store?: ReferenceReplayRunStore<Input>;
|
|
6056
|
+
abortSignal?: AbortSignal;
|
|
6057
|
+
continueOnError?: boolean;
|
|
6058
|
+
now?: () => number;
|
|
6059
|
+
}
|
|
6060
|
+
interface ReferenceReplayRunStore<Input = unknown> {
|
|
6061
|
+
save(run: ReferenceReplayRun<Input>): Promise<void>;
|
|
6062
|
+
list(): Promise<ReferenceReplayRun<Input>[]>;
|
|
6063
|
+
}
|
|
6064
|
+
declare function runReferenceReplay<Input = unknown>(cases: ReferenceReplayCase<Input>[], options: ReferenceReplayRunOptions<Input>): Promise<ReferenceReplayRun<Input>>;
|
|
6065
|
+
declare function decideReferenceReplayRunPromotion(baseline: ReferenceReplayRun, candidate: ReferenceReplayRun, policy?: ReferenceReplayPromotionPolicy): ReferenceReplayPromotionDecision;
|
|
6066
|
+
declare function inMemoryReferenceReplayStore<Input = unknown>(initial?: ReferenceReplayRun<Input>[]): ReferenceReplayRunStore<Input>;
|
|
6067
|
+
declare function jsonlReferenceReplayStore<Input = unknown>(path: string): ReferenceReplayRunStore<Input>;
|
|
6068
|
+
declare function scoreReferenceReplay(scenarios: ReferenceReplayScenario[], options?: ReferenceReplayScoreOptions): ReferenceReplayScore;
|
|
6069
|
+
declare function compareReferenceReplay(baseline: ReferenceReplayScore, candidate: ReferenceReplayScore): ReferenceReplaySplitComparison[];
|
|
6070
|
+
declare function decideReferenceReplayPromotion(baseline: ReferenceReplayScore, candidate: ReferenceReplayScore, policy?: ReferenceReplayPromotionPolicy): ReferenceReplayPromotionDecision;
|
|
6071
|
+
declare function defaultReferenceReplayMatcher(reference: ReferenceReplayItem, candidate: ReferenceReplayCandidate): ReferenceMatchResult;
|
|
6072
|
+
|
|
6073
|
+
export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type 
DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, 
InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, MODEL_PRICING, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, 
type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, 
type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, 
bonferroni, budgetBreachView, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, 
paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
|