npm - @tangle-network/agent-eval - Versions diffs - 0.40.5 → 0.42.0 - Mend

@tangle-network/agent-eval 0.40.5 → 0.42.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52)

package/dist/campaign/index.d.ts +48 -355
package/dist/campaign/index.js +106 -6
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
package/dist/chunk-H4TOS272.js.map +1 -0
package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
package/dist/chunk-KQ26DYTQ.js.map +1 -0
package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
package/dist/chunk-MNL6LXGQ.js.map +1 -0
package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
package/dist/chunk-N4SBKEPJ.js.map +1 -0
package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
package/dist/index.d.ts +227 -687
package/dist/index.js +753 -1237
package/dist/index.js.map +1 -1
package/dist/integrity-CTDhR1Sg.d.ts +81 -0
package/dist/llm-client-BXVRUZyX.d.ts +234 -0
package/dist/openapi.json +1 -1
package/dist/pipelines/index.js +67 -3
package/dist/pipelines/index.js.map +1 -1
package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
package/dist/reporting.d.ts +2 -3
package/dist/reporting.js +4 -8
package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
package/dist/rl.d.ts +103 -221
package/dist/rl.js +44 -199
package/dist/rl.js.map +1 -1
package/dist/sequential-DdV5ShjT.d.ts +561 -0
package/dist/traces.d.ts +3 -2
package/dist/traces.js +5 -5
package/dist/types-BLbRTxoc.d.ts +367 -0
package/dist/wire/index.d.ts +1 -1
package/package.json +1 -6
package/dist/chunk-5U2DOJU4.js.map +0 -1
package/dist/chunk-AU2JLNSZ.js.map +0 -1
package/dist/chunk-DMW5VENN.js +0 -1412
package/dist/chunk-DMW5VENN.js.map +0 -1
package/dist/chunk-EGIPWXHL.js.map +0 -1
package/dist/chunk-MAZ26DC7.js +0 -99
package/dist/chunk-MAZ26DC7.js.map +0 -1
package/dist/chunk-NKLGKF2Q.js.map +0 -1
package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
package/dist/optimization.d.ts +0 -11
package/dist/optimization.js +0 -71
package/dist/optimization.js.map +0 -1
package/dist/sequential-5iSVfzl2.d.ts +0 -139
package/dist/summary-report-DuZXOk7K.d.ts +0 -917
package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0

package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} RENAMED Viewed

@@ -1,7 +1,7 @@
 import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
 import { TCloud } from '@tangle-network/tcloud';
 import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-BlwAtYYf.js';
-import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-DuZXOk7K.js';
+import { w as GateDecision } from './sequential-DdV5ShjT.js';
 import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
 interface Scenario {
@@ -311,6 +311,25 @@ interface EvalResult {
  * fails closed instead of being treated as a neutral zero.
  */
+/** Severity of an actionable finding attached to a run/trace. */
+type AsiSeverity = 'info' | 'warning' | 'error' | 'critical';
+/** Actionable side-info — a diagnosed finding the loop can act on. */
+interface ActionableSideInfo {
+    /** Stable expectation/check id when available. */
+    expectationId?: string;
+    /** Human-readable diagnosis of what happened. */
+    message: string;
+    severity?: AsiSeverity;
+    /** Concrete trace excerpt, file path, tool call, screenshot id, etc. */
+    evidence?: string;
+    /** Prompt/tool/context surface likely responsible. */
+    responsibleSurface?: string;
+    /** Suggested fix in natural language. */
+    suggestion?: string;
+    /** Whether this expectation was satisfied. Defaults to false for ASI rows. */
+    matched?: boolean;
+    metadata?: Record<string, unknown>;
+}
 type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail';
 type ReleaseConfidenceAxisName = 'corpus' | 'quality' | 'generalization' | 'diagnostics' | 'efficiency';
 interface ReleaseTraceEvidence {
@@ -401,7 +420,6 @@ interface ReleaseConfidenceScorecard {
     gateDecision: GateDecision | null;
     summary: string;
 }
-declare function releaseTraceEvidenceFromMultiShotTrials(trials: readonly MultiShotTrialResult[]): ReleaseTraceEvidence[];
 declare function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
 declare function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
@@ -713,4 +731,4 @@ interface RenderReleaseReportOptions {
 }
 declare function renderReleaseReport(scorecard: ReleaseConfidenceScorecard, options?: RenderReleaseReportOptions): string;
-export { type RouteMap as $, type DriverState as A, type BootstrapOptions as B, type CheckResult as C, type DriverResult as D, type CollectedArtifacts as E, type ScenarioResult as F, type TurnMetrics as G, type ScenarioFile as H, type CompletionCriterion as I, type JudgeReplayGateArgs as J, type ArtifactCheck as K, type ArtifactResult as L, type CorpusAgreementOptions as M, type CorpusAgreementPerDimension as N, type CorpusAgreementReport as O, type PairedBootstrapOptions as P, type CorpusScoreRecord as Q, type ReleaseConfidenceAxis as R, type Scenario as S, type TestResult as T, type EvalResult as U, type Verdict as V, type FeedbackPattern as W, type JudgeConfig as X, type JudgeRubric as Y, type JudgeScore as Z, type PersonaRigor as _, type BootstrapResult as a, type RubricDimension as a0, type Turn as a1, type TurnResult as a2, bonferroni as a3, cohensD as a4, confidenceInterval as a5, corpusInterRaterAgreement as a6, corpusInterRaterAgreementFromJudgeScores as a7, interRaterReliability as a8, mannWhitneyU as a9, normalizeScores as aa, pairedMde as ab, pairedTTest as ac, partialCredit as ad, requiredSampleSize as ae, weightedMean as af, type PairedBootstrapResult as b, type ReleaseConfidenceAxisName as c, type ReleaseConfidenceInput as d, type ReleaseConfidenceIssue as e, type ReleaseConfidenceMetrics as f, type ReleaseConfidenceScorecard as g, type ReleaseConfidenceStatus as h, type ReleaseConfidenceThresholds as i, type ReleaseTraceEvidence as j, type RenderReleaseReportOptions as k, assertReleaseConfidence as l, benjaminiHochberg as m, bootstrapCi as n, evaluateReleaseConfidence as o, judgeReplayGate as p, pairedBootstrap as q, releaseTraceEvidenceFromMultiShotTrials as r, renderReleaseReport as s, type JudgeInput as t, type JudgeFn as u, type BenchmarkRunnerConfig as v, wilcoxonSignedRank as w, type BenchmarkReport as x, type ProductClientConfig as y, type PersonaConfig as z };
+export { type PersonaRigor as $, type CollectedArtifacts as A, type BootstrapOptions as B, type CheckResult as C, type DriverResult as D, type ScenarioResult as E, type TurnMetrics as F, type ScenarioFile as G, type CompletionCriterion as H, type ActionableSideInfo as I, type JudgeReplayGateArgs as J, type ArtifactCheck as K, type ArtifactResult as L, type AsiSeverity as M, type CorpusAgreementOptions as N, type CorpusAgreementPerDimension as O, type PairedBootstrapOptions as P, type CorpusAgreementReport as Q, type ReleaseConfidenceAxis as R, type Scenario as S, type TestResult as T, type CorpusScoreRecord as U, type Verdict as V, type EvalResult as W, type FeedbackPattern as X, type JudgeConfig as Y, type JudgeRubric as Z, type JudgeScore as _, type BootstrapResult as a, type RouteMap as a0, type RubricDimension as a1, type Turn as a2, type TurnResult as a3, bonferroni as a4, cohensD as a5, confidenceInterval as a6, corpusInterRaterAgreement as a7, corpusInterRaterAgreementFromJudgeScores as a8, interRaterReliability as a9, mannWhitneyU as aa, normalizeScores as ab, pairedMde as ac, pairedTTest as ad, partialCredit as ae, requiredSampleSize as af, weightedMean as ag, type PairedBootstrapResult as b, type ReleaseConfidenceAxisName as c, type ReleaseConfidenceInput as d, type ReleaseConfidenceIssue as e, type ReleaseConfidenceMetrics as f, type ReleaseConfidenceScorecard as g, type ReleaseConfidenceStatus as h, type ReleaseConfidenceThresholds as i, type ReleaseTraceEvidence as j, type RenderReleaseReportOptions as k, assertReleaseConfidence as l, benjaminiHochberg as m, bootstrapCi as n, evaluateReleaseConfidence as o, judgeReplayGate as p, pairedBootstrap as q, renderReleaseReport as r, type JudgeInput as s, type JudgeFn as t, type BenchmarkRunnerConfig as u, type BenchmarkReport as v, wilcoxonSignedRank as w, type ProductClientConfig as x, type PersonaConfig as y, type DriverState as z };

package/dist/reporting.d.ts CHANGED Viewed

@@ -1,7 +1,6 @@
 export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-ByZEC3BX.js';
-export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-Di84bXD7.js';
-export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
-export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-DuZXOk7K.js';
+export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-BtpgWRI0.js';
+export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, I as InterimReleaseConfidence, c as InterimReleaseConfidenceInput, P as PairedEvalueOptions, d as PairedEvalueSequence, e as PairedEvalueStep, f as ParetoFigureSpec, g as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, h as ResearchReport, i as ResearchReportCandidate, j as ResearchReportDecision, k as ResearchReportMethodology, l as ResearchReportOptions, m as ResearchReportRecommendation, S as SequentialDecision, n as SummaryTable, o as SummaryTableOptions, p as SummaryTableRow, q as evaluateInterimReleaseConfidence, r as gainHistogram, s as pairedEvalueSequence, t as paretoChart, u as researchReport, v as summaryTable } from './sequential-DdV5ShjT.js';
 import './run-record-BGY6bHRh.js';
 import './errors-mje_cKOs.js';
 import './outcome-store-D6KWmYvj.js';

package/dist/reporting.js CHANGED Viewed

@@ -3,23 +3,20 @@ import {
   bootstrapCi,
   evaluateReleaseConfidence,
   judgeReplayGate,
-  releaseTraceEvidenceFromMultiShotTrials,
   renderReleaseReport
-} from "./chunk-NKLGKF2Q.js";
+} from "./chunk-KQ26DYTQ.js";
 import {
   rubricPredictiveValidity
 } from "./chunk-YRZ4M5GS.js";
-import {
-  evaluateInterimReleaseConfidence,
-  pairedEvalueSequence
-} from "./chunk-MAZ26DC7.js";
 import {
   RESEARCH_REPORT_HARD_PAIR_FLOOR,
+  evaluateInterimReleaseConfidence,
   gainHistogram,
+  pairedEvalueSequence,
   paretoChart,
   researchReport,
   summaryTable
-} from "./chunk-EGIPWXHL.js";
+} from "./chunk-MNL6LXGQ.js";
 import {
   benjaminiHochberg,
   pairedBootstrap,
@@ -40,7 +37,6 @@ export {
   pairedBootstrap,
   pairedEvalueSequence,
   paretoChart,
-  releaseTraceEvidenceFromMultiShotTrials,
   renderReleaseReport,
   researchReport,
   rubricPredictiveValidity,

package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} RENAMED Viewed

@@ -1,238 +1,149 @@
 import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BGY6bHRh.js';
-import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-mje_cKOs.js';
-import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-DYR5gWlb.js';
-import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-DuZXOk7K.js';
+import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-BXVRUZyX.js';
+import { l as ResearchReportOptions, h as ResearchReport, w as GateDecision } from './sequential-DdV5ShjT.js';
 import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
+import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CTDhR1Sg.js';
+import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
 import { T as TraceStore } from './store-Db2Bv8Cf.js';
 /**
- * LLM client with graceful degrade.
+ * Multi-layer verifier — ordered pipeline of verification layers.
  *
- * OpenAI-compatible `/v1/chat/completions` client with:
- *   - Exponential-backoff retry on 429 + 5xx gateway errors (502/503/504).
- *   - Retry on transient network errors (fetch failed, AbortError, ECONNRESET).
- *   - Graceful json_schema → json_object degrade on 400 with schema-reject body.
- *   - Fenced-JSON stripping (```json ... ```) for models that wrap structured output.
- *   - Configurable base URL + api key / bearer, works with LiteLLM proxies, OpenAI
- *     directly, cli-bridge subscriptions, and any router that speaks the spec.
+ * Different contract from {@link JudgeRunner} (which runs parallel
+ * specs against a sandbox). MultiLayerVerifier is a DAG of layers
+ * (install → typecheck → build → lint → serve → semantic → …) with
+ * dependency-based skip, per-layer findings, soft-fail semantics, and
+ * an aggregated `blendedScore` across all passed layers.
  *
- * Usage:
- *   const { value, result } = await callLlmJson<MyType>(
- *     { model: 'gpt-4o', messages: [...], jsonSchema: { name: 'x', schema: {...} } },
- *     { baseUrl: 'https://router.tangle.tools/v1', apiKey: process.env.KEY },
- *   )
+ * Use when you want:
+ *   - ordered stages where a failing upstream stage skips downstream ones
+ *   - each stage produces rich `findings` (severity + message + evidence)
+ *   - a single composite score across stages with per-stage weights
+ *   - soft-fail stages whose failure doesn't abort the pipeline
  *
- * This is THE llm-calling seam for agent-eval primitives that need structured
- * output (semantic concept judge, reviewer directives, critic scores). Primitives
- * that need free-form text use `callLlm` and parse output themselves.
+ * Use {@link JudgeRunner} when you want:
+ *   - N independent judges running in parallel against the same artifact
+ *   - no inter-judge dependencies
+ *   - boolean `passed` per judge + overall
+ *
+ * Both primitives compose — JudgeRunner can be invoked as a single
+ * layer inside a MultiLayerVerifier if that suits the caller.
  */
-interface LlmMessage {
-    role: 'system' | 'user' | 'assistant';
+type LayerStatus = 'pass' | 'fail' | 'skipped' | 'error' | 'timeout';
+type Severity = 'critical' | 'major' | 'minor' | 'info';
+interface Finding {
+    severity: Severity;
+    message: string;
+    evidence?: string;
+    /** Optional layer name the finding belongs to (set by the verifier if omitted). */
+    layer?: string;
     /**
-     * Either a plain text content string OR a multimodal content array
-     * (text + image_url parts) for vision-capable models.
+     * Free-form structured payload — used by `multiToolchainLayer` to attach
+     * `{ adapter: 'pnpm' }`, by judges to attach evidence pointers, etc.
+     * Renderers MAY interrogate; agent-eval primitives never assume shape.
      */
-    content: string | Array<{
-        type: 'text';
-        text: string;
-    } | {
-        type: 'image_url';
-        image_url: {
-            url: string;
-            detail?: 'auto' | 'low' | 'high';
-        };
-    }>;
-}
-interface LlmCallRequest {
-    model: string;
-    messages: LlmMessage[];
-    /** Optional JSON-mode response format (response_format: json_object). */
-    jsonMode?: boolean;
-    /** Optional structured output via JSON Schema. Falls back to json_object on 400. */
-    jsonSchema?: {
-        name: string;
-        schema: Record<string, unknown>;
-    };
-    temperature?: number;
-    maxTokens?: number;
-    /** Per-call timeout, default 60s. */
-    timeoutMs?: number;
+    detail?: Record<string, unknown>;
 }
-interface LlmUsage {
-    promptTokens: number;
-    completionTokens: number;
-    totalTokens: number;
-    /** Proxies populate this when prompt caching is on. */
-    cachedPromptTokens?: number;
-}
-interface LlmCallResult {
-    /** The text content of the first choice. Empty string if none. */
-    content: string;
-    usage: LlmUsage;
+interface LayerResult {
+    layer: string;
+    status: LayerStatus;
+    /** 0..1 score, optional — layers that don't produce a numeric score omit. */
+    score?: number;
+    durationMs: number;
+    findings: Finding[];
+    /** Short human-readable summary (one line). */
+    reason?: string;
     /**
-     * Cost in USD. Pulled from proxy's `_response_cost` field when present;
-     * `null` when neither the proxy nor the caller can derive it.
+     * Numeric layer-level diagnostics: error counts, warning counts,
+     * cyclomatic complexity, total adapter wall-time, etc. Keyed by
+     * diagnostic name; null = "diagnostic not applicable / not measured."
+     * Renderers that know the keys can display them; ones that don't,
+     * ignore. Free-form on purpose — consumers type the value shape in
+     * their own namespace.
      */
-    costUsd: number | null;
-    /** Model name actually used (echoed from response). */
-    model: string;
-    /** Wall-clock duration of the HTTP call (last attempt, if retried). */
-    durationMs: number;
-    /** Raw response body. */
-    raw: Record<string, unknown>;
+    diagnostics?: Record<string, number | null>;
+    /** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
+    detail?: Record<string, unknown>;
 }
-declare class LlmCallError extends AgentEvalError {
-    readonly status: number;
-    readonly body: string;
-    readonly model: string;
-    constructor(message: string, status: number, body: string, model: string);
+interface VerifyContext<Env = unknown> {
+    /** Per-run opaque context the caller provides. Layers destructure what they need. */
+    env: Env;
+    /** Previously-computed results from layers that already ran. */
+    prior: Record<string, LayerResult>;
+    /** Signal — if aborted, layers MUST bail within reasonable wall. */
+    signal: AbortSignal;
 }
-interface LlmClientOptions {
-    /** Base URL (without trailing slash). Must end at the `/v1` prefix. */
-    baseUrl?: string;
-    /** Bearer token — either `apiKey` or `bearer` populates `Authorization: Bearer ...`. */
-    apiKey?: string;
-    bearer?: string;
-    /** Override for the `Authorization` header (e.g. `X-Auth: ...`). Takes precedence over apiKey/bearer. */
-    authHeader?: {
-        name: string;
-        value: string;
-    };
-    /** Default timeout in ms. Per-call can override. */
-    defaultTimeoutMs?: number;
-    /** Max retry attempts on retriable errors. Default 3 (1 initial + 2 retries). */
-    maxRetries?: number;
-    /** Fetch implementation — defaults to global `fetch`. Override for custom transport (e.g. tests). */
-    fetch?: typeof fetch;
+interface Layer<Env = unknown> {
+    name: string;
+    /** Stages that must have `status: 'pass'` before this layer runs. */
+    dependsOn?: string[];
     /**
-     * Optional raw HTTP capture sink. When provided, every request, response,
-     * and error (across all retry attempts) is recorded to the sink, with auth
-     * headers and credential-shaped body fields redacted by default. This is
-     * the layer-1 forensics primitive: structured `LlmSpan`s record intent,
-     * raw events record what actually crossed the wire.
+     * Weight in the composite `blendedScore`. Default 1.0. Layers with weight 0
+     * contribute findings but not score.
      */
-    rawSink?: RawProviderSink;
+    weight?: number;
     /**
-     * Logical provider id attached to raw events. When omitted, derived from
-     * `baseUrl` via `providerFromBaseUrl`.
+     * If true, a `fail` status contributes to `blendedScore` (as 0) instead of
+     * being dropped — use for layers whose failure is a real signal. Default:
+     * fail drops from numerator + denominator, matching VB's existing semantics.
      */
-    provider?: string;
-    /** Trace context attached to raw events; populated by emitter-aware callers. */
-    traceContext?: {
-        runId?: string;
-        spanId?: string;
-    };
-    /** Override the redaction strategy for this call. Defaults to `defaultProviderRedactor`. */
-    redactor?: ProviderRedactor;
-}
-/**
- * True when an error is a transient transport/network fault worth retrying,
- * as opposed to a deterministic failure (4xx schema reject, JSON parse) that
- * a retry cannot fix. Inspects `LlmCallError.status`, then the error's
- * name/message/code, then recurses into `error.cause` — undici nests the
- * real socket fault one or more levels under `.cause`.
- *
- * This is THE retry classifier for the package: `callLlm` and
- * `withJudgeRetry` both route through it, so a connection-class error is
- * treated identically whether it surfaces in the HTTP client or a
- * TCloud-backed judge.
- */
-declare function isTransientLlmError(err: unknown): boolean;
-/** Exponential backoff: 500ms, 1s, 2s, 4s, ... capped at 16s. Attempt is 0-indexed. */
-declare function backoffMs(attempt: number): number;
-/**
- * Strip a ```json / ``` code fence if the model emitted one.
- * Idempotent for naked JSON. Some models (claude-code via router, certain
- * deepseek models) wrap output even under json_object.
- */
-declare function stripFencedJson(raw: string): string;
-/**
- * Low-level call. Returns raw content + usage + cost. Retries on transient
- * failures; does NOT degrade schema here — callers that want graceful
- * degrade use `callLlmJson`.
- */
-declare function callLlm(req: LlmCallRequest, opts?: LlmClientOptions): Promise<LlmCallResult>;
-/**
- * Structured-output call. Returns parsed JSON plus the raw result envelope.
- * Degrades `jsonSchema` → `jsonMode` on a 400 that names the schema param —
- * critical for deepseek-v3/v4, kimi-k2.6, and other models that don't accept
- * the `response_format.json_schema` shape but DO accept `json_object`.
- */
-declare function callLlmJson<T = unknown>(req: LlmCallRequest, opts?: LlmClientOptions): Promise<{
-    value: T;
-    result: LlmCallResult;
-}>;
-type LlmRouteAssertionReason = 'no_explicit_base_url' | 'base_url_blocked' | 'base_url_not_allowed' | 'no_auth' | 'wrong_provider';
-declare class LlmRouteAssertionError extends CaptureIntegrityError {
-    readonly reason: LlmRouteAssertionReason;
-    readonly baseUrl: string;
-    constructor(message: string, reason: LlmRouteAssertionReason, baseUrl: string);
+    failContributesToScore?: boolean;
+    /** Optional per-layer wall-cap in ms. Honored by the verifier (AbortSignal). */
+    capMs?: number;
+    run: (ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
 }
-interface LlmRouteRequirements {
-    /**
-     * Throw if `opts.baseUrl` is undefined, i.e. the call would fall back to
-     * `DEFAULT_BASE_URL`. Set this for evaluation runs where silently using
-     * the public/free-tier router is a defect — the launch reviewer needs to
-     * know exactly which provider answered.
-     */
-    requireExplicitBaseUrl?: boolean;
+interface VerifyOptions<Env = unknown> {
+    env: Env;
     /**
-     * Allowlist of acceptable base URLs. Strings match by prefix
-     * (case-insensitive); RegExps test against the full base URL.
+     * Overall wall cap. Default: sum of layer capMs, or Infinity if any layer
+     * omits a cap. The verifier short-circuits remaining layers on overall cap.
      */
-    allowedBaseUrls?: Array<string | RegExp>;
-    /** Blocklist that takes precedence over `allowedBaseUrls`. */
-    blockedBaseUrls?: Array<string | RegExp>;
-    /** Throw if no auth header / api key is configured. */
-    requireAuth?: boolean;
+    overallCapMs?: number;
+    /** Called with each layer result as it completes. */
+    onLayer?: (result: LayerResult) => void;
+}
+interface VerificationReport {
+    layers: LayerResult[];
+    passCount: number;
+    failCount: number;
+    skippedCount: number;
+    errorCount: number;
+    /** True iff at least one scored layer ran AND every scored layer passed. */
+    allPass: boolean;
     /**
-     * Logical provider id the configured `baseUrl` is expected to match (via
-     * `providerFromBaseUrl`). Mainly useful when paired with `requireExplicitBaseUrl`.
+     * Weighted mean of `score` across contributing layers. 0 when no layers
+     * contributed. See {@link Layer.failContributesToScore} for fail semantics.
      */
-    expectedProvider?: string;
+    blendedScore: number;
+    durationMs: number;
+    startedAt: string;
+    finishedAt: string;
 }
 /**
- * Fail-loud assertion that the configured LLM client points at the route
- * the caller intends. Designed for the matrix-runner preflight: invoke
- * once before any LLM call to catch misconfiguration before a sweep burns
- * dollars on the wrong provider.
+ * Grade a semantic-concept-style judge result into a single layer status.
  *
- * Throws `LlmRouteAssertionError`. Pure — no I/O — so it's safe to call
- * from constructors and CI gates.
- */
-declare function assertLlmRoute(opts: LlmClientOptions, req?: LlmRouteRequirements): void;
-/**
- * Probe whether a model is reachable. Returns latency + null error on
- * success; `ok=false` + error message on any failure (HTTP, timeout,
- * network, parse). Designed for sweep preflights — fail loud at the
- * boundary before burning a 30-leaf run on a misconfigured router.
+ * Pass when overall score >= threshold AND no critical-severity concept gap.
+ * Fail otherwise. Use inside a `Layer.run` when wrapping a concept judge.
  *
- * Sends a tiny `ping` message with `maxTokens=64`. Reasoning models
- * (glm-5.1, deepseek-v4) can burn the entire budget on internal reasoning
- * for short prompts, so don't tighten this further. We don't validate
- * content; HTTP 200 means reachable.
- */
-declare function probeLlm(model: string, opts?: LlmClientOptions & {
-    timeoutMs?: number;
-}): Promise<{
-    ok: boolean;
-    latencyMs: number;
-    error: string | null;
-}>;
-/**
- * Stateful client — construct once with defaults, call many times.
- * Thin wrapper around the free functions; exists for callers that want
- * to inject a single configured instance into multiple primitives.
+ * Generalized from VerticalBench H3 fix: `failingConcepts.length === 0` was
+ * too strict — a single concept at 6/10 failed the entire layer despite
+ * overall score being >= 0.7. Now we trust the judge's own `severity` field:
+ * `critical` findings veto; `major`/`minor` reduce the score but don't veto.
  */
-declare class LlmClient {
-    private readonly opts;
-    constructor(opts?: LlmClientOptions);
-    call(req: LlmCallRequest, per?: LlmClientOptions): Promise<LlmCallResult>;
-    callJson<T = unknown>(req: LlmCallRequest, per?: LlmClientOptions): Promise<{
-        value: T;
-        result: LlmCallResult;
+declare function gradeSemanticStatus(input: {
+    score: number;
+    findings: Array<{
+        severity: Severity;
+        present?: boolean;
+        score?: number;
     }>;
+    available: boolean;
+    threshold?: number;
+}): LayerStatus;
+declare class MultiLayerVerifier<Env = unknown> {
+    private readonly layers;
+    constructor(layers: Layer<Env>[]);
+    run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
 }
 /**
@@ -608,4 +519,4 @@ declare class NoopResearcher implements Researcher {
     evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
 }
-export { probeLlm as A, stripFencedJson as B, CallbackResearcher as C, type EvalCampaignOptions as E, type FailedRun as F, type LlmClientOptions as L, NoopResearcher as N, type Researcher as R, type SteeringChange as S, type CallbackResearcherOptions as a, type CampaignFactoryParams as b, type CampaignIntegrityPolicy as c, type CampaignRunContext as d, type CampaignRunOutcome as e, type CampaignRunner as f, type CampaignScenario as g, type CampaignVariant as h, type EvalCampaignResult as i, type ExperimentPlan as j, type ExperimentResult as k, type FailureMode as l, type LlmCallRequest as m, type LlmCallResult as n, LlmCallError as o, LlmClient as p, type LlmMessage as q, runEvalCampaign as r, LlmRouteAssertionError as s, type LlmRouteRequirements as t, type LlmUsage as u, assertLlmRoute as v, backoffMs as w, callLlm as x, callLlmJson as y, isTransientLlmError as z };
+export { CallbackResearcher as C, type ExperimentPlan as E, type FailureMode as F, type Layer as L, MultiLayerVerifier as M, NoopResearcher as N, type Researcher as R, type SteeringChange as S, type VerificationReport as V, type ExperimentResult as a, type EvalCampaignResult as b, type EvalCampaignOptions as c, type Severity as d, type VerifyOptions as e, type LayerResult as f, type VerifyContext as g, type CallbackResearcherOptions as h, type CampaignFactoryParams as i, type CampaignIntegrityPolicy as j, type CampaignRunContext as k, type CampaignRunOutcome as l, type CampaignRunner as m, type CampaignScenario as n, type CampaignVariant as o, type FailedRun as p, type Finding as q, runEvalCampaign as r, type LayerStatus as s, gradeSemanticStatus as t };