@tangle-network/agent-eval 0.7.2 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -4796,4 +4796,1278 @@ interface UseCaseSignals {
4796
4796
  declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
4797
4797
  declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
4798
4798
 
4799
- export { type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type 
DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmJsonCall, type LlmReviewerConfig, type LlmSpan, MODEL_PRICING, type MatcherResult, type MeasurementPolicy, type Message, type MetricSamples, type MetricVerdict, 
MetricsCollector, type MuffledFinder, type MuffledFinding, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type 
SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type Verification, type VerifyFn, type VisualDiffOptions, type VisualDiffResult, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries, 
checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, createLlmReviewer, crossTraceDiff, defaultJudges, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, formatBenchmarkReport, formatDriverReport, formatFindings, groupBy, hashContent, hashScenarios, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReviewStore, judgeAgreementView, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator, mannWhitneyU, mergeSteeringBundle, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runJudgeFleet, 
runProposeReview, runSelfPlay, runTestGradedScenario, runsForScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };
4799
+ /**
4800
+ * LLM client with graceful degrade.
4801
+ *
4802
+ * OpenAI-compatible `/v1/chat/completions` client with:
4803
+ * - Exponential-backoff retry on 429 + 5xx gateway errors (502/503/504).
4804
+ * - Retry on transient network errors (fetch failed, AbortError, ECONNRESET).
4805
+ * - Graceful json_schema → json_object degrade on 400 with schema-reject body.
4806
+ * - Fenced-JSON stripping (```json ... ```) for models that wrap structured output.
4807
+ * - Configurable base URL + api key / bearer, works with LiteLLM proxies, OpenAI
4808
+ * directly, cli-bridge subscriptions, and any router that speaks the spec.
4809
+ *
4810
+ * Usage:
4811
+ * const { value, result } = await callLlmJson<MyType>(
4812
+ * { model: 'gpt-4o', messages: [...], jsonSchema: { name: 'x', schema: {...} } },
4813
+ * { baseUrl: 'https://router.tangle.tools/v1', apiKey: process.env.KEY },
4814
+ * )
4815
+ *
4816
+ * This is THE llm-calling seam for agent-eval primitives that need structured
4817
+ * output (semantic concept judge, reviewer directives, critic scores). Primitives
4818
+ * that need free-form text use `callLlm` and parse output themselves.
4819
+ */
4820
+ interface LlmMessage {
4821
+ role: 'system' | 'user' | 'assistant';
4822
+ /**
4823
+ * Either a plain text content string OR a multimodal content array
4824
+ * (text + image_url parts) for vision-capable models.
4825
+ */
4826
+ content: string | Array<{
4827
+ type: 'text';
4828
+ text: string;
4829
+ } | {
4830
+ type: 'image_url';
4831
+ image_url: {
4832
+ url: string;
4833
+ detail?: 'auto' | 'low' | 'high';
4834
+ };
4835
+ }>;
4836
+ }
4837
+ interface LlmCallRequest {
4838
+ model: string;
4839
+ messages: LlmMessage[];
4840
+ /** Optional JSON-mode response format (response_format: json_object). */
4841
+ jsonMode?: boolean;
4842
+ /** Optional structured output via JSON Schema. `callLlmJson` falls back to json_object on a schema-rejecting 400; `callLlm` does not degrade. */
4843
+ jsonSchema?: {
4844
+ name: string;
4845
+ schema: Record<string, unknown>;
4846
+ };
4847
+ temperature?: number;
4848
+ maxTokens?: number;
4849
+ /** Per-call timeout, default 60s. */
4850
+ timeoutMs?: number;
4851
+ }
4852
+ interface LlmUsage {
4853
+ promptTokens: number;
4854
+ completionTokens: number;
4855
+ totalTokens: number;
4856
+ /** Proxies populate this when prompt caching is on. */
4857
+ cachedPromptTokens?: number;
4858
+ }
4859
+ interface LlmCallResult {
4860
+ /** The text content of the first choice. Empty string if none. */
4861
+ content: string;
4862
+ usage: LlmUsage;
4863
+ /**
4864
+ * Cost in USD. Pulled from proxy's `_response_cost` field when present;
4865
+ * `null` when neither the proxy nor the caller can derive it.
4866
+ */
4867
+ costUsd: number | null;
4868
+ /** Model name actually used (echoed from response). */
4869
+ model: string;
4870
+ /** Wall-clock duration of the HTTP call (last attempt, if retried). */
4871
+ durationMs: number;
4872
+ /** Raw response body. */
4873
+ raw: Record<string, unknown>;
4874
+ }
4875
+ declare class LlmCallError extends Error {
4876
+ readonly status: number;
4877
+ readonly body: string;
4878
+ readonly model: string;
4879
+ constructor(message: string, status: number, body: string, model: string);
4880
+ }
4881
+ interface LlmClientOptions {
4882
+ /** Base URL (without trailing slash). Must end at the `/v1` prefix. */
4883
+ baseUrl?: string;
4884
+ /** Bearer token — either `apiKey` or `bearer` populates `Authorization: Bearer ...`. */
4885
+ apiKey?: string;
4886
+ bearer?: string;
4887
+ /** Override for the `Authorization` header (e.g. `X-Auth: ...`). Takes precedence over apiKey/bearer. */
4888
+ authHeader?: {
4889
+ name: string;
4890
+ value: string;
4891
+ };
4892
+ /** Default timeout in ms. Per-call can override. */
4893
+ defaultTimeoutMs?: number;
4894
+ /** Max total attempts on retriable errors, counting the initial call. Default 3 (1 initial + 2 retries). */
4895
+ maxRetries?: number;
4896
+ /** Fetch implementation — defaults to global `fetch`. Override for custom transport (e.g. tests). */
4897
+ fetch?: typeof fetch;
4898
+ }
4899
+ /**
4900
+ * Strip a ```json / ``` code fence if the model emitted one.
4901
+ * Idempotent for naked JSON. Some models (claude-code via router, certain
4902
+ * deepseek models) wrap output even under json_object.
4903
+ */
4904
+ declare function stripFencedJson(raw: string): string;
4905
+ /**
4906
+ * Low-level call. Returns raw content + usage + cost. Retries on transient
4907
+ * failures; does NOT degrade schema here — callers that want graceful
4908
+ * degrade use `callLlmJson`.
4909
+ */
4910
+ declare function callLlm(req: LlmCallRequest, opts?: LlmClientOptions): Promise<LlmCallResult>;
4911
+ /**
4912
+ * Structured-output call. Returns parsed JSON plus the raw result envelope.
4913
+ * Degrades `jsonSchema` → `jsonMode` on a 400 that names the schema param —
4914
+ * critical for deepseek-v3/v4, kimi-k2.6, and other models that don't accept
4915
+ * the `response_format.json_schema` shape but DO accept `json_object`.
4916
+ */
4917
+ declare function callLlmJson<T = unknown>(req: LlmCallRequest, opts?: LlmClientOptions): Promise<{
4918
+ value: T;
4919
+ result: LlmCallResult;
4920
+ }>;
4921
+ /**
4922
+ * Probe whether a model is reachable. Returns latency + null error on
4923
+ * success; `ok=false` + error message on any failure (HTTP, timeout,
4924
+ * network, parse). Designed for sweep preflights — fail loud at the
4925
+ * boundary before burning a 30-leaf run on a misconfigured router.
4926
+ *
4927
+ * Sends a tiny `ping` message with `maxTokens=64`. Reasoning models
4928
+ * (glm-5.1, deepseek-v4) can burn the entire budget on internal reasoning
4929
+ * for short prompts, so don't tighten this further. We don't validate
4930
+ * content; HTTP 200 means reachable.
4931
+ */
4932
+ declare function probeLlm(model: string, opts?: LlmClientOptions & {
4933
+ timeoutMs?: number;
4934
+ }): Promise<{
4935
+ ok: boolean;
4936
+ latencyMs: number;
4937
+ error: string | null;
4938
+ }>;
4939
+ /**
4940
+ * Stateful client — construct once with defaults, call many times.
4941
+ * Thin wrapper around the free functions; exists for callers that want
4942
+ * to inject a single configured instance into multiple primitives.
4943
+ */
4944
+ declare class LlmClient {
4945
+ private readonly opts;
4946
+ constructor(opts?: LlmClientOptions);
4947
+ call(req: LlmCallRequest, per?: LlmClientOptions): Promise<LlmCallResult>;
4948
+ callJson<T = unknown>(req: LlmCallRequest, per?: LlmClientOptions): Promise<{
4949
+ value: T;
4950
+ result: LlmCallResult;
4951
+ }>;
4952
+ }
4953
+
4954
+ /**
4955
+ * Multi-layer verifier — ordered pipeline of verification layers.
4956
+ *
4957
+ * Different contract from {@link JudgeRunner} (which runs parallel
4958
+ * specs against a sandbox). MultiLayerVerifier is a DAG of layers
4959
+ * (install → typecheck → build → lint → serve → semantic → …) with
4960
+ * dependency-based skip, per-layer findings, soft-fail semantics, and
4961
+ * an aggregated `blendedScore` across all passed layers.
4962
+ *
4963
+ * Use when you want:
4964
+ * - ordered stages where a failing upstream stage skips downstream ones
4965
+ * - each stage produces rich `findings` (severity + message + evidence)
4966
+ * - a single composite score across stages with per-stage weights
4967
+ * - soft-fail stages whose failure doesn't abort the pipeline
4968
+ *
4969
+ * Use {@link JudgeRunner} when you want:
4970
+ * - N independent judges running in parallel against the same artifact
4971
+ * - no inter-judge dependencies
4972
+ * - boolean `passed` per judge + overall
4973
+ *
4974
+ * Both primitives compose — JudgeRunner can be invoked as a single
4975
+ * layer inside a MultiLayerVerifier if that suits the caller.
4976
+ */
4977
+ type LayerStatus = 'pass' | 'fail' | 'skipped' | 'error' | 'timeout';
4978
+ type Severity = 'critical' | 'major' | 'minor' | 'info';
4979
+ interface Finding {
4980
+ severity: Severity;
4981
+ message: string;
4982
+ evidence?: string;
4983
+ /** Optional layer name the finding belongs to (set by the verifier if omitted). */
4984
+ layer?: string;
4985
+ /**
4986
+ * Free-form structured payload — used by `multiToolchainLayer` to attach
4987
+ * `{ adapter: 'pnpm' }`, by judges to attach evidence pointers, etc.
4988
+ * Renderers MAY interrogate; agent-eval primitives never assume shape.
4989
+ */
4990
+ detail?: Record<string, unknown>;
4991
+ }
4992
+ interface LayerResult {
4993
+ layer: string;
4994
+ status: LayerStatus;
4995
+ /** 0..1 score, optional — layers that don't produce a numeric score omit it. */
4996
+ score?: number;
4997
+ durationMs: number;
4998
+ findings: Finding[];
4999
+ /** Short human-readable summary (one line). */
5000
+ reason?: string;
5001
+ /**
5002
+ * Numeric layer-level diagnostics: error counts, warning counts,
5003
+ * cyclomatic complexity, total adapter wall-time, etc. Keyed by
5004
+ * diagnostic name; null = "diagnostic not applicable / not measured."
5005
+ * Renderers that know the keys can display them; ones that don't,
5006
+ * ignore. Free-form on purpose — consumers type the value shape in
5007
+ * their own namespace. Added in 0.10.
5008
+ */
5009
+ diagnostics?: Record<string, number | null>;
5010
+ /** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
5011
+ detail?: Record<string, unknown>;
5012
+ }
5013
+ interface VerifyContext<Env = unknown> {
5014
+ /** Per-run opaque context the caller provides. Layers destructure what they need. */
5015
+ env: Env;
5016
+ /** Previously-computed results from layers that already ran. */
5017
+ prior: Record<string, LayerResult>;
5018
+ /** Abort signal — once aborted, layers MUST bail out within a reasonable wall-clock time. */
5019
+ signal: AbortSignal;
5020
+ }
5021
+ interface Layer<Env = unknown> {
5022
+ name: string;
5023
+ /** Stages that must have `status: 'pass'` before this layer runs. */
5024
+ dependsOn?: string[];
5025
+ /**
5026
+ * Weight in the composite `blendedScore`. Default 1.0. Layers with weight 0
5027
+ * contribute findings but not score.
5028
+ */
5029
+ weight?: number;
5030
+ /**
5031
+ * If true, a `fail` status contributes to `blendedScore` (as 0) instead of
5032
+ * being dropped — use for layers whose failure is a real signal. Default:
5033
+ * fail drops from numerator + denominator, matching VB's existing semantics.
5034
+ */
5035
+ failContributesToScore?: boolean;
5036
+ /** Optional per-layer wall-cap in ms. Honored by the verifier (AbortSignal). */
5037
+ capMs?: number;
5038
+ run: (ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
5039
+ }
5040
+ interface VerifyOptions<Env = unknown> {
5041
+ env: Env;
5042
+ /**
5043
+ * Overall wall cap. Default: sum of layer capMs, or Infinity if any layer
5044
+ * omits a cap. The verifier short-circuits remaining layers on overall cap.
5045
+ */
5046
+ overallCapMs?: number;
5047
+ /** Called with each layer result as it completes. */
5048
+ onLayer?: (result: LayerResult) => void;
5049
+ }
5050
+ interface VerificationReport {
5051
+ layers: LayerResult[];
5052
+ passCount: number;
5053
+ failCount: number;
5054
+ skippedCount: number;
5055
+ errorCount: number;
5056
+ /** True iff at least one scored layer ran AND every scored layer passed. */
5057
+ allPass: boolean;
5058
+ /**
5059
+ * Weighted mean of `score` across contributing layers. 0 when no layers
5060
+ * contributed. See {@link Layer.failContributesToScore} for fail semantics.
5061
+ */
5062
+ blendedScore: number;
5063
+ durationMs: number;
5064
+ startedAt: string;
5065
+ finishedAt: string;
5066
+ }
5067
+ /**
5068
+ * Grade a semantic-concept-style judge result into a single layer status.
5069
+ *
5070
+ * Pass when overall score >= threshold AND no critical-severity concept gap.
5071
+ * Fail otherwise. Use inside a `Layer.run` when wrapping a concept judge.
5072
+ *
5073
+ * Generalized from VerticalBench H3 fix: `failingConcepts.length === 0` was
5074
+ * too strict — a single concept at 6/10 failed the entire layer despite
5075
+ * overall score being >= 0.7. Now we trust the judge's own `severity` field:
5076
+ * `critical` findings veto; `major`/`minor` reduce the score but don't veto.
5077
+ */
5078
+ declare function gradeSemanticStatus(input: {
5079
+ score: number;
5080
+ findings: Array<{
5081
+ severity: Severity;
5082
+ present?: boolean;
5083
+ score?: number;
5084
+ }>;
5085
+ available: boolean;
5086
+ threshold?: number;
5087
+ }): LayerStatus;
5088
+ declare class MultiLayerVerifier<Env = unknown> {
5089
+ private readonly layers;
5090
+ constructor(layers: Layer<Env>[]);
5091
+ run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
5092
+ }
5093
+
5094
+ /**
5095
+ * CommandRunner — abstract subprocess execution surface.
5096
+ *
5097
+ * Layers in a {@link MultiLayerVerifier} that need to invoke external
5098
+ * tools (compilers, test runners, package managers) call out via this
5099
+ * interface rather than directly using `child_process`. Two reasons:
5100
+ *
5101
+ * 1. **Sandbox interchangeability.** A run that targets a sandbox box
5102
+ * (via SDK-specific Box.exec) and a run that targets the host both
5103
+ * satisfy this same contract. The harness doesn't care which.
5104
+ * 2. **Testability.** Tests inject a fake runner and assert on calls
5105
+ * without spawning real subprocesses.
5106
+ *
5107
+ * agent-eval ships only the local implementation (host-process). Sandbox
5108
+ * implementations live with their consumer because they depend on
5109
+ * SDK-specific Box / Sandbox types that don't belong in this package.
5110
+ */
5111
+ interface RunCommandInput {
5112
+ /** Executable name, looked up via PATH unless absolute. */
5113
+ cmd: string;
5114
+ /** Argument vector, NOT shell-interpolated. Each element passed to argv. */
5115
+ argv: string[];
5116
+ /** Working directory. Defaults to runner's notion of cwd if omitted. */
5117
+ cwd?: string;
5118
+ /**
5119
+ * Wall-clock cap in ms. The runner SHOULD return `timedOut: true` when
5120
+ * exceeded; callers MAY treat status null + timedOut as "killed."
5121
+ */
5122
+ capMs?: number;
5123
+ /** Env overrides merged on top of the runner's base environment. */
5124
+ env?: Record<string, string>;
5125
+ /** Optional stdin payload. */
5126
+ stdin?: string;
5127
+ }
5128
+ interface RunCommandResult {
5129
+ /** Exit code, or null when the process couldn't start / was killed. */
5130
+ status: number | null;
5131
+ stdout: string;
5132
+ stderr: string;
5133
+ durationMs: number;
5134
+ timedOut: boolean;
5135
+ /** Non-fatal runner-side error (binary missing, signal, etc.). */
5136
+ runnerError?: string;
5137
+ }
5138
+ interface DirEntry {
5139
+ name: string;
5140
+ isDirectory: boolean;
5141
+ isFile: boolean;
5142
+ /** File size in bytes. `null` for directories (not stat'd). */
5143
+ sizeBytes: number | null;
5144
+ }
5145
+ interface CommandRunner {
5146
+ /** Identifier for telemetry + logs. Typed as a plain `string`, so new runners may use any name. */
5147
+ readonly name: string;
5148
+ /** Execute a command in the runner's environment. */
5149
+ run(input: RunCommandInput): Promise<RunCommandResult>;
5150
+ /** True iff `<name>` resolves on the runner's PATH. */
5151
+ hasBin(name: string): Promise<boolean>;
5152
+ /** True iff the given path exists in the runner's filesystem. */
5153
+ fileExists(path: string): Promise<boolean>;
5154
+ /** Read a file. Returns `null` if missing or unreadable. */
5155
+ readFile(path: string): Promise<string | null>;
5156
+ /** List a directory. Returns `[]` if unreadable / missing. */
5157
+ readDir(path: string): Promise<DirEntry[]>;
5158
+ }
5159
+ /**
5160
+ * Host-process runner. Uses node:child_process spawnSync (synchronous
5161
+ * under the hood — wrapped in a Promise to satisfy the interface). For
5162
+ * very long-running commands consider an async-spawn variant; this
5163
+ * shape matches VB's existing behavior and is fine for build/test/lint
5164
+ * subprocesses that finish in seconds-to-minutes.
5165
+ */
5166
+ declare const localCommandRunner: CommandRunner;
5167
+
5168
+ /**
5169
+ * Multi-toolchain layer factory + merge helper.
5170
+ *
5171
+ * Some verification stages (install, typecheck, build, lint) run the
5172
+ * SAME logical layer across multiple parallel adapters — pnpm AND npm
5173
+ * AND cargo AND forge for a polyglot scaffold. The verifier presents
5174
+ * one row per stage; the toolchain breakdown lives in each finding's `detail` (e.g. `detail.adapter`).
5175
+ *
5176
+ * This module provides the merge: take N independent `LayerResult`s
5177
+ * (one per adapter) and reduce them to a single `LayerResult` whose
5178
+ * status is the worst of the parts and whose findings cite the adapter
5179
+ * that produced each one. Plus a {@link multiToolchainLayer} factory
5180
+ * that runs the adapter calls in parallel + applies the reducer.
5181
+ *
5182
+ * Pure utility — composes with {@link MultiLayerVerifier.run}.
5183
+ */
5184
+
5185
+ interface AdapterRun {
5186
+ /** Identifier for the adapter (e.g. 'pnpm', 'npm', 'cargo', 'forge'). */
5187
+ adapter: string;
5188
+ result: LayerResult;
5189
+ }
5190
+ interface MergeOptions {
5191
+ /**
5192
+ * How to combine per-adapter `durationMs`. Default `'max'` (parallel
5193
+ * wall-clock). Set `'sum'` when reporting total work done across
5194
+ * adapters rather than wall time.
5195
+ */
5196
+ mergeDuration?: 'max' | 'sum';
5197
+ /**
5198
+ * Prefix finding messages with a per-adapter tag (e.g. `[pnpm] typecheck failed`).
5199
+ * Default: no prefix (renderers read `detail.adapter` instead).
5200
+ */
5201
+ messagePrefixer?: (adapter: string) => string;
5202
+ /**
5203
+ * How to reduce per-adapter `LayerResult.diagnostics` into the merged
5204
+ * result's diagnostics. `'max'` (default) — for each key, merged =
5205
+ * max across adapters where value is non-null (matches "if ANY adapter
5206
+ * saw N errors, merged saw N"). `'sum'` — sum non-null values.
5207
+ */
5208
+ mergeDiagnostics?: 'max' | 'sum';
5209
+ }
5210
+ /**
5211
+ * Reduce N adapter runs to a single `LayerResult` for a logical layer.
5212
+ *
5213
+ * - status: worst of the parts (pass < skipped < fail < timeout < error)
5214
+ * - score: weighted mean of numeric scores (skip = no contribution)
5215
+ * - findings: union, each tagged with `detail.adapter`
5216
+ * - durationMs: `mergeDuration` option (default 'max' for parallel wall-clock)
5217
+ * - diagnostics: `mergeDiagnostics` option (default 'max' per key)
5218
+ * - reason: " · "-joined `name: status` per adapter
5219
+ */
5220
+ declare function mergeLayerResults(name: string, perAdapter: AdapterRun[], options?: MergeOptions): LayerResult;
5221
+ interface MultiToolchainLayerConfig<Env, Adapter> {
5222
+ name: string;
5223
+ adapters: ReadonlyArray<Adapter>;
5224
+ /** Adapter identifier — used in findings + reason. */
5225
+ adapterName: (a: Adapter) => string;
5226
+ /** Run a single adapter against the verify context. */
5227
+ run: (a: Adapter, ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
5228
+ dependsOn?: string[];
5229
+ weight?: number;
5230
+ failContributesToScore?: boolean;
5231
+ capMs?: number;
5232
+ /**
5233
+ * Max number of adapters run in parallel. Defaults to 8 — defense in depth against a
5234
+ * caller passing 50 adapters and fanning out 50 simultaneous subprocesses.
5235
+ * Callers that need higher concurrency raise this explicitly.
5236
+ */
5237
+ maxParallel?: number;
5238
+ }
5239
+ /**
5240
+ * Build a {@link Layer} that fans the same logical stage across N adapters
5241
+ * in parallel and merges via {@link mergeLayerResults}.
5242
+ *
5243
+ * Per-adapter throws are caught + converted to `status: 'error'` results
5244
+ * so one bad adapter doesn't poison the whole layer.
5245
+ */
5246
+ declare function multiToolchainLayer<Env, Adapter>(config: MultiToolchainLayerConfig<Env, Adapter>): Layer<Env>;
5247
+
5248
+ /**
5249
+ * Reviewer primitives — prompt builder + default ReviewFn factory.
5250
+ *
5251
+ * `buildReviewerPrompt` is the pure, LLM-agnostic piece: takes
5252
+ * `ReviewerPromptInput` (user request, trace summary, verification
5253
+ * summary, memory, optional extra context) and emits the system +
5254
+ * user message pair. No LLM dependency — callers that want to drive
5255
+ * their own transport get full control.
5256
+ *
5257
+ * `createDefaultReviewer` is the convenience factory: wires the prompt
5258
+ * builder to `callLlmJson` with a default schema + soft-fail policy.
5259
+ * Returns a function that maps `ReviewerPromptInput` to `ReviewerOutput`.
5260
+ *
5261
+ * Same pattern as `runSemanticConceptJudge` / `createSemanticConceptJudge`:
5262
+ * low-level pure builder + high-level factory built on top.
5263
+ */
5264
+
5265
+ interface ReviewerMemoryEntry {
5266
+ shot: number;
5267
+ ts?: string;
5268
+ observations?: string;
5269
+ diagnosis?: string;
5270
+ nextShotInstruction?: string;
5271
+ shouldContinue?: boolean;
5272
+ confidence?: number;
5273
+ }
5274
+ interface ReviewerVerificationSummary {
5275
+ blendedScore: number;
5276
+ allPass: boolean;
5277
+ failCount: number;
5278
+ failingLayers?: string[];
5279
+ }
5280
+ interface ReviewerPromptInput {
5281
+ shot: number;
5282
+ userRequest: string;
5283
+ /**
5284
+ * Compact trace summary — tool-call counts, errors, recent activity
5285
+ * lines. Built by the caller from whatever trace format they have;
5286
+ * agent-eval does not prescribe.
5287
+ */
5288
+ traceSummary: string;
5289
+ verification: ReviewerVerificationSummary;
5290
+ memory: ReviewerMemoryEntry[];
5291
+ /**
5292
+ * Optional extra context injected into the prompt between the trace
5293
+ * and the verification blocks. Use for workdir file-tree snapshots,
5294
+ * scaffold descriptions, or any environmental fact the reviewer
5295
+ * needs to direct the next shot accurately.
5296
+ */
5297
+ extraContext?: string;
5298
+ /**
5299
+ * Optional extra section appended at the end of the prompt (e.g.
5300
+ * leaf metadata, scenario id). Free-form — no agent-eval-shaped
5301
+ * schema.
5302
+ */
5303
+ trailingContext?: string;
5304
+ }
5305
+ interface ReviewerOutput {
5306
+ shot: number;
5307
+ observations: string;
5308
+ diagnosis: string;
5309
+ nextShotInstruction: string;
5310
+ shouldContinue: boolean;
5311
+ /** 0..1 self-assessed confidence in the directive. */
5312
+ confidence: number;
5313
+ /** LLM cost in USD if the transport reports it, else null. */
5314
+ costUsd: number | null;
5315
+ durationMs: number;
5316
+ /** False when the LLM errored or returned malformed JSON; caller soft-fails to defaults. */
5317
+ available: boolean;
5318
+ error?: string;
5319
+ }
5320
+ interface ReviewerSoftFailDefaults {
5321
+ observations?: string;
5322
+ diagnosis?: string;
5323
+ nextShotInstruction?: string;
5324
+ shouldContinue?: boolean;
5325
+ confidence?: number;
5326
+ }
5327
+ interface CreateDefaultReviewerOptions {
5328
+ /** Model id to call. */
5329
+ model: string;
5330
+ /** Per-call timeout. Default 180s. */
5331
+ timeoutMs?: number;
5332
+ /** LlmClient transport config (baseUrl, apiKey, authHeader, etc.). */
5333
+ llm?: LlmClientOptions;
5334
+ /**
5335
+ * Override the prompt builder. Default: `buildReviewerPrompt`.
5336
+ * Consumers with different reviewer voices pass their own.
5337
+ */
5338
+ promptBuilder?: (input: ReviewerPromptInput) => {
5339
+ system: string;
5340
+ user: string;
5341
+ };
5342
+ /**
5343
+ * Soft-fail values when the LLM throws or returns unparseable JSON.
5344
+ * Matches VerticalBench's shipped policy: continue with generic
5345
+ * instruction at confidence 0.3 so the worker keeps trying.
5346
+ */
5347
+ softFailDefaults?: ReviewerSoftFailDefaults;
5348
+ }
5349
+ /**
5350
+ * Build the reviewer's system + user messages. Pure function, no LLM
5351
+ * call. Callers that want their own transport or a different structured
5352
+ * output can use this and skip `createDefaultReviewer` entirely.
5353
+ */
5354
+ declare function buildReviewerPrompt(input: ReviewerPromptInput): {
5355
+ system: string;
5356
+ user: string;
5357
+ };
5358
+ /**
5359
+ * Factory: returns a function that invokes the default reviewer against
5360
+ * an LLM and parses the structured output. Soft-fails to the provided
5361
+ * defaults on LLM throw or JSON-parse error so the shot loop keeps
5362
+ * moving rather than crashing.
5363
+ */
5364
+ declare function createDefaultReviewer(options: CreateDefaultReviewerOptions): (input: ReviewerPromptInput) => Promise<ReviewerOutput>;
5365
+
5366
+ /**
5367
+ * Semantic concept judge — "does the built artifact actually implement
5368
+ * the features the user asked for?"
5369
+ *
5370
+ * Distinct from the domain/code/coherence judges in `judges.ts`:
5371
+ * - those judges score free-form conversational agent outputs along
5372
+ * quality dimensions (accuracy, depth, etc.)
5373
+ * - this judge scores a *built artifact* (served HTML + source files)
5374
+ * against an explicit list of expected concepts, returning per-concept
5375
+ * {present, score 0-10, evidence, severity}.
5376
+ *
5377
+ * The judge is strict about distinguishing (a) a working implementation
5378
+ * from (b) a keyword-present stub. "// TODO: mint button" is NOT present.
5379
+ * Only real, functional, wired-up code counts.
5380
+ *
5381
+ * Use via {@link createSemanticConceptJudge} or directly via
5382
+ * {@link runSemanticConceptJudge}. Soft-fails (available=false) on LLM
5383
+ * or JSON-parse errors so the caller can treat that as "layer skipped"
5384
+ * rather than "layer failed" in a multi-layer pipeline.
5385
+ */
5386
+
5387
+ /**
5388
+ * Implementation complexity class for weighted scoring (added 0.11).
5389
+ *
5390
+ * - `render` (default): the concept is a UI surface that displays static
5391
+ * data — render a list, show a counter, lay out a button. Single-file
5392
+ * work, no external integration.
5393
+ * - `integrate`: the concept requires wiring a real external system —
5394
+ * wallet connect (wagmi + RainbowKit + chain config), payment provider
5395
+ * (Stripe Elements + intent + webhook), an API client with auth.
5396
+ * Multi-file, library-knowledge, runtime correctness matters.
5397
+ * - `compute`: the concept requires algorithmic work — solver, simulator,
5398
+ * constraint propagation, ML inference. Correctness > UI polish.
5399
+ *
5400
+ * Default weights (when applied via `weightConcepts: 'complexity'`):
5401
+ * render=1.0, integrate=2.0, compute=2.5
5402
+ *
5403
+ * Cross-vertical scoring without complexity weighting silently inflates
5404
+ * the rate of UI-heavy verticals (healthcare, fintech dashboards) vs
5405
+ * integration-heavy verticals (DeFi, wallets) — all concepts treated
5406
+ * equally even though the agent does 2-3x the work for `integrate`.
5407
+ */
5408
+ type ConceptComplexity = 'render' | 'integrate' | 'compute';
5409
+ interface ConceptSpec {
5410
+ name: string;
5411
+ /** Short hints that help the judge; not used for matching. */
5412
+ keywords?: string[];
5413
+ /** Optional explicit weight. Overrides the complexity-derived weight under `weightConcepts: 'complexity'`; counts as 1.0 when unset under `'explicit'`. NOTE(review): presumably ignored under the default `'mean'` strategy — confirm. */
5414
+ weight?: number;
5415
+ /** Implementation complexity class. Default `render`. */
5416
+ complexity?: ConceptComplexity;
5417
+ }
5418
+ interface ConceptFinding {
5419
+ concept: string;
5420
+ present: boolean;
5421
+ /** 0..10. 10 = production-ready; 7 = functional thin; 4 = partial; 0 = absent. */
5422
+ score: number;
5423
+ evidence: string;
5424
+ severity: Severity;
5425
+ }
5426
+ interface SemanticConceptJudgeInput {
5427
+ /** Full natural-language prompt the agent was handed. */
5428
+ userRequest: string;
5429
+ /** Rendered HTML the preview returns (UI artifacts). Optional. */
5430
+ servedHtml?: string;
5431
+ /** Top-level source files from the agent's workdir. */
5432
+ sourceFiles: Array<{
5433
+ path: string;
5434
+ content: string;
5435
+ }>;
5436
+ /** The expected concept list. */
5437
+ expectedConcepts: ConceptSpec[];
5438
+ /** Free-form metadata (id, difficulty) to inject into the prompt. */
5439
+ artifactLabel?: string;
5440
+ artifactDescription?: string;
5441
+ }
5442
+ interface SemanticConceptJudgeResult {
5443
+ kind: 'semantic-concept';
5444
+ version: string;
5445
+ /** Normalized 0..1 score — per-concept scores / 10, aggregated per `weightConcepts` (plain mean by default, weighted mean for `complexity` / `explicit`). */
5446
+ score: number;
5447
+ presentCount: number;
5448
+ totalCount: number;
5449
+ findings: ConceptFinding[];
5450
+ summary: string;
5451
+ durationMs: number;
5452
+ costUsd: number | null;
5453
+ /** False on LLM/JSON error — treat as "skipped / unable to judge" in pipelines. */
5454
+ available: boolean;
5455
+ error?: string;
5456
+ }
5457
+ /**
5458
+ * Score-aggregation strategy. Default `mean` (legacy behavior — 0.10
5459
+ * and earlier always averaged 0-10 scores). `complexity` applies the
5460
+ * default weight table (render=1, integrate=2, compute=2.5) unless a
5461
+ * concept has an explicit `weight`. `explicit` honors only `weight`
5462
+ * (defaulting to 1 for unspecified).
5463
+ */
5464
+ type ConceptWeightStrategy = 'mean' | 'complexity' | 'explicit';
5465
+ declare const DEFAULT_COMPLEXITY_WEIGHTS: Record<ConceptComplexity, number>;
5466
+ interface SemanticConceptJudgeOptions {
5467
+ /** Model id to call. Default 'claude-sonnet-4-6' via agent-eval defaults. */
5468
+ model?: string;
5469
+ /** Per-call timeout. Default 180s. */
5470
+ timeoutMs?: number;
5471
+ /** Pipeline budget for the prompt (source blob truncation). Default 45000. */
5472
+ maxSourceChars?: number;
5473
+ /** Per-file cap before inclusion. Default 20000. */
5474
+ maxPerFileChars?: number;
5475
+ /** HTML cap. Default 30000. */
5476
+ maxHtmlChars?: number;
5477
+ /** LlmClient config (baseUrl, apiKey, authHeader, …). */
5478
+ llm?: LlmClientOptions;
5479
+ /**
5480
+ * Score aggregation strategy. Default `mean` for backward compatibility
5481
+ * with 0.10 and earlier callers. Cross-vertical comparisons should use
5482
+ * `complexity` to neutralize the integrate-vs-render asymmetry.
5483
+ */
5484
+ weightConcepts?: ConceptWeightStrategy;
5485
+ /** Override the default complexity → weight table. */
5486
+ complexityWeights?: Partial<Record<ConceptComplexity, number>>;
5487
+ }
5488
+ declare const SEMANTIC_CONCEPT_JUDGE_VERSION = "semantic-concept-judge-v1-2026-04-24";
5489
+ /**
5490
+ * Run the semantic concept judge. Soft-fails to available=false on
5491
+ * LLM/JSON errors — callers in a MultiLayerVerifier pipeline can treat
5492
+ * that as "skip" rather than "fail."
5493
+ */
5494
+ declare function runSemanticConceptJudge(input: SemanticConceptJudgeInput, options?: SemanticConceptJudgeOptions): Promise<SemanticConceptJudgeResult>;
5495
+ /**
5496
+ * Factory: pin LLM options once, return a closure that accepts inputs.
5497
+ * Convenient for pipelines that want to share a single LlmClient config.
5498
+ */
5499
+ declare function createSemanticConceptJudge(options?: SemanticConceptJudgeOptions): (input: SemanticConceptJudgeInput) => Promise<SemanticConceptJudgeResult>;
5500
+
5501
+ /**
5502
+ * Intent-match judge — "did the agent build the right APP, ignoring
5503
+ * whether every feature is wired up?"
5504
+ *
5505
+ * Distinct from {@link runSemanticConceptJudge} which scores per-concept
5506
+ * presence. The semantic judge can return 0/4 concepts present even
5507
+ * when the agent built a thoughtful, polished, on-brief app that just
5508
+ * lacks one or two features. The semantic judge can also return 4/4
5509
+ * present even when the agent shipped the wrong project (keyword-rich
5510
+ * stub).
5511
+ *
5512
+ * Intent-match asks ONE question:
5513
+ * "Looking at the agent's work as a whole — independent of feature
5514
+ * coverage — is this an honest attempt at the user's request?"
5515
+ *
5516
+ * Returns a 0–1 score and a 1-sentence evidence string. Use as a sanity
5517
+ * check on `completenessScore`-style metrics: if intent-match is high
5518
+ * and concept count is low, the agent built the right thing but is
5519
+ * missing features (ship and iterate). If intent-match is low, the
5520
+ * agent built the wrong thing (reject regardless of concept count).
5521
+ *
5522
+ * Soft-fails on LLM/JSON error (`available: false`) so callers can
5523
+ * treat failure as "judge skipped."
5524
+ *
5525
+ * Added in 0.11 to replace the lying `completenessScore: 1` field that
5526
+ * VerticalBench shipped pre-Gen-48 — that field was keyword-driven and
5527
+ * fired true on builds with zero spec concepts implemented.
5528
+ */
5529
+
5530
+ declare const INTENT_MATCH_JUDGE_VERSION = "intent-match-judge-v1-2026-04-24";
5531
+ interface IntentMatchInput {
5532
+ /** The full natural-language prompt the agent was handed. */
5533
+ userRequest: string;
5534
+ /** Top-level source files from the agent's workdir. */
5535
+ sourceFiles: Array<{
5536
+ path: string;
5537
+ content: string;
5538
+ }>;
5539
+ /** Rendered HTML the preview returned, when available. */
5540
+ servedHtml?: string;
5541
+ /** Optional metadata to inject (id, vertical, difficulty). */
5542
+ artifactLabel?: string;
5543
+ artifactDescription?: string;
5544
+ }
5545
+ interface IntentMatchResult {
5546
+ kind: 'intent-match';
5547
+ version: string;
5548
+ /** 0..1 — 1 = unmistakably the right app, 0 = unrelated to the brief. */
5549
+ score: number;
5550
+ /** One-sentence rationale citing concrete evidence (file or HTML). */
5551
+ evidence: string;
5552
+ durationMs: number;
5553
+ costUsd: number | null;
5554
+ available: boolean;
5555
+ error?: string;
5556
+ }
5557
+ interface IntentMatchOptions {
5558
+ model?: string;
5559
+ timeoutMs?: number;
5560
+ maxSourceChars?: number;
5561
+ maxPerFileChars?: number;
5562
+ maxHtmlChars?: number;
5563
+ llm?: LlmClientOptions;
5564
+ }
5565
+ /**
5566
+ * Run the intent-match judge. Soft-fails to available=false on error.
5567
+ */
5568
+ declare function runIntentMatchJudge(input: IntentMatchInput, options?: IntentMatchOptions): Promise<IntentMatchResult>;
5569
+ /**
5570
+ * Factory: pin LLM options once, return a closure.
5571
+ */
5572
+ declare function createIntentMatchJudge(options?: IntentMatchOptions): (input: IntentMatchInput) => Promise<IntentMatchResult>;
5573
+
5574
+ /**
5575
+ * Flow layer — drive a previewed app through a scripted user walk.
5576
+ *
5577
+ * The MultiLayerVerifier already had a `flow` slot wired in
5578
+ * VerticalBench's verification-harness, but the layer module was
5579
+ * always-skipped ("flow layer module not yet wired"). This adds the
5580
+ * module: a Layer<Env> that takes a {@link FlowSpec} (URL + steps),
5581
+ * boots a preview server via the supplied {@link FlowRunner}, executes
5582
+ * each step, and returns a LayerResult whose `findings` enumerate
5583
+ * which step failed.
5584
+ *
5585
+ * The runner is injected so this module can swap between:
5586
+ * - production: agent-browser CLI (a11y-tree based steps)
5587
+ * - test: in-memory mock that returns canned step outcomes
5588
+ * - future: Playwright, Puppeteer, custom scrapers
5589
+ *
5590
+ * Shipped in 0.11 alongside {@link runIntentMatchJudge} — together they
5591
+ * close the "the agent shipped the wrong app and we didn't catch it"
5592
+ * blind spot. Intent-match catches "wrong app entirely"; flow catches
5593
+ * "right app but the buttons don't work."
5594
+ */
5595
+
5596
+ type FlowAction = 'navigate' | 'click' | 'fill' | 'expect-text' | 'expect-element' | 'expect-url' | 'wait';
5597
+ interface FlowStep {
5598
+ /** What this step does. */
5599
+ action: FlowAction;
5600
+ /** Human-readable description for findings. */
5601
+ describe?: string;
5602
+ /**
5603
+ * For navigate/expect-url: full URL. For click/fill/expect-element:
5604
+ * accessible-name selector or CSS selector.
5605
+ * For expect-text: substring expected on the page.
5606
+ * For wait: ignored (use `value` for ms).
5607
+ */
5608
+ target?: string;
5609
+ /** For fill: text to enter. For wait: duration in milliseconds, written as a numeric string (the field is typed `string`). */
5610
+ value?: string;
5611
+ /** Severity of a failure. Default `major`. */
5612
+ severity?: Severity;
5613
+ }
5614
+ interface FlowSpec {
5615
+ /** Initial URL the runner should open. */
5616
+ url: string;
5617
+ /** Ordered steps. Stops at the first failure unless `continueOnFail: true`. */
5618
+ steps: FlowStep[];
5619
+ /** When true, execute every step even after a failure (collect all findings). */
5620
+ continueOnFail?: boolean;
5621
+ /** Per-step wall cap (ms). Default 15s. */
5622
+ stepTimeoutMs?: number;
5623
+ }
5624
+ interface FlowRunnerStepResult {
5625
+ ok: boolean;
5626
+ /** Concrete observation: matched text snippet, captured URL, error message. */
5627
+ evidence?: string;
5628
+ /** Wall-clock duration of the step. */
5629
+ durationMs?: number;
5630
+ }
5631
+ interface FlowRunner {
5632
+ /** Open the target URL. Returns when the page is interactable. */
5633
+ open(url: string): Promise<FlowRunnerStepResult>;
5634
+ /** Execute one step. The runner owns interpretation of `target`. */
5635
+ step(step: FlowStep): Promise<FlowRunnerStepResult>;
5636
+ /** Tear down browser, free resources. Always called once per layer.run. */
5637
+ close(): Promise<void>;
5638
+ }
5639
+ interface FlowLayerEnv {
5640
+ /** Optional override per-call. Defaults supplied by the layer factory. */
5641
+ flowSpec?: FlowSpec;
5642
+ }
5643
+ interface FlowLayerFactoryInput {
5644
+ /** Static spec (used when env doesn't supply one). */
5645
+ flowSpec?: FlowSpec;
5646
+ /** Build the runner per call (lets the layer create + tear down per leaf). */
5647
+ runner: () => FlowRunner | Promise<FlowRunner>;
5648
+ /** Layer name. Default `flow`. */
5649
+ name?: string;
5650
+ /** Layer dependencies — default `['serve']` so a non-booting preview skips us. */
5651
+ dependsOn?: string[];
5652
+ /** Layer weight for blendedScore (0..1+). Default 1. */
5653
+ weight?: number;
5654
+ /** Cap for the entire flow run (ms). Default 60s. */
5655
+ capMs?: number;
5656
+ }
5657
+ /**
5658
+ * Build a flow layer that scripts a user walk via the supplied runner.
5659
+ *
5660
+ * Score: 1.0 when every step passed; otherwise 1 - (failedSteps / totalSteps).
5661
+ * Status: `pass` iff every step passed; `fail` if any step failed; `error`
5662
+ * on runner setup error; `skipped` when no flowSpec is available.
5663
+ */
5664
+ declare function flowLayer<Env extends FlowLayerEnv = FlowLayerEnv>(input: FlowLayerFactoryInput): Layer<Env>;
5665
+
5666
+ /**
5667
+ * Deploy gate layer — would the agent's build actually publish?
5668
+ *
5669
+ * The product that Blueprint Agent fronts promises "go from idea to live URL."
5670
+ * Pre-Gen-48 the eval stopped at install/typecheck/build/serve — every
5671
+ * one of which can pass while `vite build` (or `next build`, etc) fails
5672
+ * on a production-only constraint (env-var requirement, dynamic import
5673
+ * not statically resolvable, missing public asset).
5674
+ *
5675
+ * Deploy gate runs the production build via the supplied {@link DeployRunner}
5676
+ * and asserts:
5677
+ * - command exited 0
5678
+ * - artifact dir contains an entry point (index.html for static SPAs,
5679
+ * equivalent per framework family)
5680
+ *
5681
+ * Shipped in 0.11 with the canonical `vite` runner. Future generations
5682
+ * add wrangler-deploy --dry-run, next-build, etc — each as another
5683
+ * runner factory.
5684
+ */
5685
+
5686
+ type DeployFamily = 'frontend-static' | 'nextjs' | 'remix' | 'fullstack-ts';
5687
+ interface DeployRunResult {
5688
+ ok: boolean;
5689
+ /** Stdout/stderr tail surfaced as evidence. Bounded in caller. */
5690
+ output?: string;
5691
+ /** Wall-clock duration of the build command. */
5692
+ durationMs?: number;
5693
+ /** Path to artifact directory the runner expects (dist/, .next/, build/, etc). */
5694
+ artifactDir?: string;
5695
+ /** True iff artifactDir contains the family's expected entry point. */
5696
+ artifactValid?: boolean;
5697
+ }
5698
+ interface DeployRunner {
5699
+ /** Run the production build. The runner owns command + cwd. */
5700
+ run(): Promise<DeployRunResult>;
5701
+ }
5702
+ interface DeployGateLayerInput {
5703
+ /** Build the runner per call. */
5704
+ runner: () => DeployRunner | Promise<DeployRunner>;
5705
+ /** Family hint — for logging, surfaced in diagnostics. */
5706
+ family?: DeployFamily;
5707
+ /** Layer name. Default `deploy`. */
5708
+ name?: string;
5709
+ /** Layer dependencies — default `['build']`. */
5710
+ dependsOn?: string[];
5711
+ /** Weight in blendedScore. Default 1. */
5712
+ weight?: number;
5713
+ /** Cap (ms). Default 120s — prod builds are slower than dev. */
5714
+ capMs?: number;
5715
+ /** When true, treat artifactValid=false as a fail (default true). */
5716
+ requireArtifact?: boolean;
5717
+ }
5718
+ /**
5719
+ * Build a deploy gate layer that runs the production build and verifies
5720
+ * the artifact. Pass: ok && artifactValid (just ok when `requireArtifact` is false). Score: 1.0 (pass) or 0 (fail).
5721
+ *
5722
+ * For families where artifact-validation isn't applicable (e.g. a
5723
+ * server-rendered build that prints a manifest), set `requireArtifact:
5724
+ * false` and rely on the runner's own ok signal.
5725
+ */
5726
+ declare function deployGateLayer<Env = unknown>(input: DeployGateLayerInput): Layer<Env>;
5727
+ interface ViteDeployRunnerInput {
5728
+ /** Workdir to build. The runner cd's here. */
5729
+ workdir: string;
5730
+ /**
5731
+ * Function to run a shell command in `workdir`. Same shape as
5732
+ * agent-eval's CommandRunner.run for compositional reuse.
5733
+ */
5734
+ exec: (cmd: string, opts?: {
5735
+ cwd?: string;
5736
+ timeoutMs?: number;
5737
+ }) => Promise<{
5738
+ stdout: string;
5739
+ stderr: string;
5740
+ exitCode: number;
5741
+ }>;
5742
+ /**
5743
+ * Function to test whether a path exists in the workdir. Inject
5744
+ * `(p) => existsSync(join(workdir, p))` for host runs.
5745
+ */
5746
+ exists: (relativePath: string) => boolean | Promise<boolean>;
5747
+ /** Build command. Default `npm run build`. */
5748
+ buildCommand?: string;
5749
+ /** Artifact directory to validate. Default `dist`. */
5750
+ artifactDir?: string;
5751
+ /** Entry-point file under artifactDir. Default `index.html`. */
5752
+ artifactEntry?: string;
5753
+ /** Per-build cap (ms). Default 90s. */
5754
+ timeoutMs?: number;
5755
+ }
5756
+ /**
5757
+ * Canonical runner for `frontend-static` family — runs the build script,
5758
+ * validates `<artifactDir>/<artifactEntry>` exists. Use as the `runner:`
5759
+ * factory for {@link deployGateLayer}.
5760
+ */
5761
+ declare function viteDeployRunner(input: ViteDeployRunnerInput): DeployRunner;
5762
+
5763
+ /**
5764
+ * Keyword-coverage judge — baseline complement to the semantic concept
5765
+ * judge.
5766
+ *
5767
+ * Where {@link runSemanticConceptJudge} uses an LLM to read source code
5768
+ * and decide whether a concept is REALLY implemented (not just
5769
+ * keyword-mentioned), this judge does the cheap, deterministic version:
5770
+ * fetch the served preview, concatenate every linked CSS/JS asset, and
5771
+ * substring-match each expected concept's keywords against the
5772
+ * concatenated haystack. Optional `requiredElement` selector adds a
5773
+ * structural gate so "supply counter" can require an actual `<input>` or
5774
+ * `<table>`, not just a comment containing the word.
5775
+ *
5776
+ * Use both judges. Keyword coverage is a fast 0-cost gate — a stub page
5777
+ * with the right keywords passes here, fails the semantic judge. Score
5778
+ * divergence between the two is itself a signal: high keyword coverage
5779
+ * + low semantic = "the agent slapped the right words on the right
5780
+ * scaffold but didn't wire any of it up."
5781
+ *
5782
+ * Pure functions, soft-fail on fetch error, no LLM dependency.
5783
+ */
5784
+ interface KeywordConceptSpec {
5785
+ name: string;
5786
+ keywords: string[];
5787
+ /**
5788
+ * Optional CSS selector that must match in the HTML for the concept
5789
+ * to count as present. Tiny subset:
5790
+ * - `tag` (e.g. `form`)
5791
+ * - `tag[attr="value"]` (e.g. `input[type="number"]`)
5792
+ * - `tag[attr]` (presence only)
5793
+ * Anything more complex is rejected with `null` (treated as
5794
+ * "unenforced", not "failed").
5795
+ */
5796
+ requiredElement?: string;
5797
+ }
5798
+ interface KeywordCoverageFinding {
5799
+ concept: string;
5800
+ found: boolean;
5801
+ matchedKeywords: string[];
5802
+ /** True iff the optional requiredElement selector matched; null when no selector was given or the selector is outside the supported subset (unenforced). */
5803
+ requiredElementPresent: boolean | null;
5804
+ }
5805
+ interface KeywordCoverageResult {
5806
+ /** 0..1 share of concepts satisfied. */
5807
+ score: number;
5808
+ presentCount: number;
5809
+ totalCount: number;
5810
+ findings: KeywordCoverageFinding[];
5811
+ durationMs: number;
5812
+ /** Total bytes assembled across html + linked assets. */
5813
+ totalAssembledBytes: number;
5814
+ /** Soft-failure reason if the audit couldn't run. */
5815
+ error?: string;
5816
+ }
5817
+ interface KeywordCoverageOptions {
5818
+ /** Override fetch implementation — for tests. */
5819
+ fetch?: typeof fetch;
5820
+ /** Per-asset fetch timeout (default 3s). */
5821
+ assetTimeoutMs?: number;
5822
+ /** Initial-HTML fetch timeout (default 5s). */
5823
+ htmlTimeoutMs?: number;
5824
+ }
5825
+ /**
5826
+ * Element-presence check using a tiny CSS-selector subset. Returns
5827
+ * null when the selector isn't supported — caller treats that as
5828
+ * "unenforced" rather than "failed."
5829
+ */
5830
+ declare function htmlContainsElement(html: string, selector: string): boolean | null;
5831
+ /**
5832
+ * Pull every `<link rel=stylesheet href>` and `<script src>` from a
5833
+ * raw HTML body. Returns absolute URLs resolved against `baseUrl`.
5834
+ * Permissive regex — agent-authored markup doesn't always quote
5835
+ * attributes the same way.
5836
+ */
5837
+ declare function extractAssetUrls(html: string, baseUrl: string): string[];
5838
+ /**
5839
+ * Score expected concepts against an already-fetched HTML payload + any
5840
+ * pre-fetched CSS/JS assets. Use when the runner has the bytes in hand
5841
+ * and doesn't want a fresh HTTP round-trip — e.g. sandbox runtime where
5842
+ * the preview content was fetched via curl from inside the container.
5843
+ */
5844
+ declare function runKeywordCoverageJudge(html: string, expectedConcepts: ReadonlyArray<KeywordConceptSpec>, assets?: ReadonlyArray<string>): KeywordCoverageResult;
5845
+ /**
5846
+ * URL-fetch flavor — GET the preview, parallel-fetch every linked
5847
+ * stylesheet + script (with bounded timeouts, soft-fail individually),
5848
+ * then score via {@link runKeywordCoverageJudge}.
5849
+ */
5850
+ declare function runKeywordCoverageJudgeUrl(previewUrl: string, expectedConcepts: ReadonlyArray<KeywordConceptSpec>, options?: KeywordCoverageOptions): Promise<KeywordCoverageResult>;
5851
+
5852
+ /**
5853
+ * Toolchain error-count extractor.
5854
+ *
5855
+ * Given stderr/stdout from a compiler or test runner, count the number
5856
+ * of reported errors/failures. Patterns are deliberately narrow —
5857
+ * unknown stderr returns `null` rather than zero so callers can
5858
+ * distinguish "no errors" from "different toolchain, couldn't parse".
5859
+ *
5860
+ * All patterns are anchored to the start of a line and use bounded
5861
+ * character classes to avoid catastrophic backtracking on pathological
5862
+ * inputs.
5863
+ *
5864
+ * Add new toolchains by appending to {@link ERROR_COUNT_PATTERNS};
5865
+ * order matters only in the sense that the first matching pattern wins.
5866
+ */
5867
+ interface ErrorCountPattern {
5868
+ /** Stable identifier for logging + tests. */
5869
+ name: string;
5870
+ /** Must be global (`g` flag) — the extractor counts matches. */
5871
+ regex: RegExp;
5872
+ /** Optional post-processing to extract a count from a single captured match. */
5873
+ transform?: (match: RegExpMatchArray) => number;
5874
+ }
5875
+ declare const ERROR_COUNT_PATTERNS: ErrorCountPattern[];
5876
+ interface ExtractOptions {
5877
+ /** Restrict to named patterns — default: all patterns. */
5878
+ only?: string[];
5879
+ /** Additional patterns to consider BEFORE the built-in list. */
5880
+ extra?: ErrorCountPattern[];
5881
+ }
5882
+ interface ExtractResult {
5883
+ /** Total count of matched errors, or null when no pattern matched. */
5884
+ count: number | null;
5885
+ /** Name of the pattern that matched, or null. */
5886
+ matched: string | null;
5887
+ /** Original matches for callers that want to surface specifics. */
5888
+ samples: string[];
5889
+ }
5890
+ /**
5891
+ * Try each pattern in order; return the first with matches.
5892
+ *
5893
+ * Returning `null` (instead of zero) on no-match is deliberate — a
5894
+ * callsite that greps for "typescript errors" on cargo output should
5895
+ * NOT treat that as "zero TS errors" because the toolchain is wrong.
5896
+ */
5897
+ declare function extractErrorCount(text: string, opts?: ExtractOptions): ExtractResult;
5898
+
5899
+ /**
5900
+ * Reference replay — score an agent against withheld historical outcomes.
5901
+ *
5902
+ * This is the generic version of the public-audit replay pattern:
5903
+ * run a candidate system on an old task, keep the reference answers hidden
5904
+ * until after execution, then score recall/precision and gate promotion
5905
+ * across train/dev/test/holdout splits.
5906
+ */
5907
+ type ReferenceReplaySplit = 'train' | 'dev' | 'test' | 'holdout';
5908
+ interface ReferenceReplayItem {
5909
+ id: string;
5910
+ title: string;
5911
+ description?: string;
5912
+ severity?: string;
5913
+ tags?: string[];
5914
+ weight?: number;
5915
+ }
5916
+ interface ReferenceReplayCandidate {
5917
+ id: string;
5918
+ title: string;
5919
+ description?: string;
5920
+ severity?: string;
5921
+ tags?: string[];
5922
+ metadata?: Record<string, unknown>;
5923
+ }
5924
+ interface ReferenceReplayScenario {
5925
+ id: string;
5926
+ split?: ReferenceReplaySplit;
5927
+ references: ReferenceReplayItem[];
5928
+ candidates: ReferenceReplayCandidate[];
5929
+ metadata?: Record<string, unknown>;
5930
+ }
5931
+ interface ReferenceReplayCase<Input = unknown> {
5932
+ id: string;
5933
+ split?: ReferenceReplaySplit;
5934
+ input: Input;
5935
+ references: ReferenceReplayItem[];
5936
+ metadata?: Record<string, unknown>;
5937
+ }
5938
+ interface ReferenceReplayExecutionScenario<Input = unknown> {
5939
+ id: string;
5940
+ split: ReferenceReplaySplit;
5941
+ input: Input;
5942
+ metadata?: Record<string, unknown>;
5943
+ }
5944
+ interface ReferenceReplayRunContext {
5945
+ runId: string;
5946
+ caseIndex: number;
5947
+ abortSignal?: AbortSignal;
5948
+ }
5949
+ interface ReferenceReplayAdapter<Input = unknown> {
5950
+ run(scenario: ReferenceReplayExecutionScenario<Input>, context: ReferenceReplayRunContext): Promise<ReferenceReplayCandidate[]>;
5951
+ }
5952
+ type ReferenceReplayAdapterFn<Input = unknown> = (scenario: ReferenceReplayExecutionScenario<Input>, context: ReferenceReplayRunContext) => Promise<ReferenceReplayCandidate[]>;
5953
+ type ReferenceReplayAdapterLike<Input = unknown> = ReferenceReplayAdapter<Input> | ReferenceReplayAdapterFn<Input>;
5954
+ interface ReferenceReplayMatch {
5955
+ scenarioId: string;
5956
+ referenceId: string;
5957
+ candidateId: string | null;
5958
+ score: number;
5959
+ matched: boolean;
5960
+ weight: number;
5961
+ reason: string;
5962
+ }
5963
+ interface ReferenceReplayScenarioScore {
5964
+ scenarioId: string;
5965
+ split: ReferenceReplaySplit;
5966
+ matched: number;
5967
+ total: number;
5968
+ falsePositives: number;
5969
+ matchedWeight: number;
5970
+ totalWeight: number;
5971
+ precision: number;
5972
+ recall: number;
5973
+ f1: number;
5974
+ matches: ReferenceReplayMatch[];
5975
+ }
5976
+ interface ReferenceReplayAggregate {
5977
+ matched: number;
5978
+ total: number;
5979
+ falsePositives: number;
5980
+ matchedWeight: number;
5981
+ totalWeight: number;
5982
+ precision: number;
5983
+ recall: number;
5984
+ f1: number;
5985
+ weightedRecall: number;
5986
+ }
5987
+ interface ReferenceReplayScore {
5988
+ scenarios: ReferenceReplayScenarioScore[];
5989
+ aggregate: ReferenceReplayAggregate;
5990
+ bySplit: Partial<Record<ReferenceReplaySplit, ReferenceReplayAggregate>>;
5991
+ }
5992
+ interface ReferenceMatchResult {
5993
+ score: number;
5994
+ reason?: string;
5995
+ }
5996
+ type ReferenceReplayMatcher = (reference: ReferenceReplayItem, candidate: ReferenceReplayCandidate, scenario: ReferenceReplayScenario) => ReferenceMatchResult;
5997
+ interface ReferenceReplayScoreOptions {
5998
+ matcher?: ReferenceReplayMatcher;
5999
+ matchThreshold?: number;
6000
+ includeHoldout?: boolean;
6001
+ splits?: ReferenceReplaySplit[];
6002
+ }
6003
+ interface ReferenceReplayPromotionPolicy {
6004
+ /** Splits that must improve or stay flat. Default: ['dev', 'test']. */
6005
+ requiredSplits?: ReferenceReplaySplit[];
6006
+ /** Minimum aggregate F1 lift required on required splits. Default 0. */
6007
+ minF1Delta?: number;
6008
+ /** Maximum F1 drop allowed on any compared split. Default 0. */
6009
+ maxRegression?: number;
6010
+ /** If true, holdout must be present and must not regress. Default true. */
6011
+ requireHoldoutNonRegression?: boolean;
6012
+ }
6013
+ interface ReferenceReplaySplitComparison {
6014
+ split: ReferenceReplaySplit;
6015
+ baselineF1: number;
6016
+ candidateF1: number;
6017
+ f1Delta: number;
6018
+ baselineRecall: number;
6019
+ candidateRecall: number;
6020
+ recallDelta: number;
6021
+ }
6022
+ interface ReferenceReplayPromotionDecision {
6023
+ promote: boolean;
6024
+ reason: string;
6025
+ aggregateDelta: number;
6026
+ comparisons: ReferenceReplaySplitComparison[];
6027
+ regressions: ReferenceReplaySplitComparison[];
6028
+ }
6029
+ interface ReferenceReplayCaseRun<Input = unknown> {
6030
+ caseId: string;
6031
+ split: ReferenceReplaySplit;
6032
+ input: Input;
6033
+ metadata?: Record<string, unknown>;
6034
+ references: ReferenceReplayItem[];
6035
+ candidates: ReferenceReplayCandidate[];
6036
+ score: ReferenceReplayScenarioScore;
6037
+ durationMs: number;
6038
+ error?: string;
6039
+ }
6040
+ interface ReferenceReplayRun<Input = unknown> {
6041
+ id: string;
6042
+ variantId?: string;
6043
+ startedAt: number;
6044
+ completedAt: number;
6045
+ durationMs: number;
6046
+ cases: ReferenceReplayCaseRun<Input>[];
6047
+ score: ReferenceReplayScore;
6048
+ metadata?: Record<string, unknown>;
6049
+ }
6050
+ interface ReferenceReplayRunOptions<Input = unknown> extends ReferenceReplayScoreOptions {
6051
+ adapter: ReferenceReplayAdapterLike<Input>;
6052
+ runId?: string;
6053
+ variantId?: string;
6054
+ metadata?: Record<string, unknown>;
6055
+ store?: ReferenceReplayRunStore<Input>;
6056
+ abortSignal?: AbortSignal;
6057
+ continueOnError?: boolean;
6058
+ now?: () => number;
6059
+ }
6060
+ interface ReferenceReplayRunStore<Input = unknown> {
6061
+ save(run: ReferenceReplayRun<Input>): Promise<void>;
6062
+ list(): Promise<ReferenceReplayRun<Input>[]>;
6063
+ }
6064
+ declare function runReferenceReplay<Input = unknown>(cases: ReferenceReplayCase<Input>[], options: ReferenceReplayRunOptions<Input>): Promise<ReferenceReplayRun<Input>>;
6065
+ declare function decideReferenceReplayRunPromotion(baseline: ReferenceReplayRun, candidate: ReferenceReplayRun, policy?: ReferenceReplayPromotionPolicy): ReferenceReplayPromotionDecision;
6066
+ declare function inMemoryReferenceReplayStore<Input = unknown>(initial?: ReferenceReplayRun<Input>[]): ReferenceReplayRunStore<Input>;
6067
+ declare function jsonlReferenceReplayStore<Input = unknown>(path: string): ReferenceReplayRunStore<Input>;
6068
+ declare function scoreReferenceReplay(scenarios: ReferenceReplayScenario[], options?: ReferenceReplayScoreOptions): ReferenceReplayScore;
6069
+ declare function compareReferenceReplay(baseline: ReferenceReplayScore, candidate: ReferenceReplayScore): ReferenceReplaySplitComparison[];
6070
+ declare function decideReferenceReplayPromotion(baseline: ReferenceReplayScore, candidate: ReferenceReplayScore, policy?: ReferenceReplayPromotionPolicy): ReferenceReplayPromotionDecision;
6071
+ declare function defaultReferenceReplayMatcher(reference: ReferenceReplayItem, candidate: ReferenceReplayCandidate): ReferenceMatchResult;
6072
+
6073
+ export { type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateDefaultReviewerOptions, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type 
DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, 
InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, MODEL_PRICING, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiToolchainLayerConfig, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewReport, type ProposeReviewShot, REDACTION_VERSION, 
type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunScore, type RunScoreWeights, type RunStatus, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, 
type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, aggregateLlm, aggregateRunScore, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, 
bonferroni, budgetBreachView, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createIntentMatchJudge, createLlmReviewer, createSemanticConceptJudge, crossTraceDiff, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, 
paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderSteeringText, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runProposeReview, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stripFencedJson, stuckLoopView, summarize, summarizeHarnessResults, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };