@tangle-network/agent-eval 0.40.5 → 0.42.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/dist/campaign/index.d.ts +48 -355
  2. package/dist/campaign/index.js +106 -6
  3. package/dist/campaign/index.js.map +1 -1
  4. package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
  5. package/dist/chunk-H4TOS272.js.map +1 -0
  6. package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
  7. package/dist/chunk-KQ26DYTQ.js.map +1 -0
  8. package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
  9. package/dist/chunk-MNL6LXGQ.js.map +1 -0
  10. package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
  11. package/dist/chunk-N4SBKEPJ.js.map +1 -0
  12. package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
  13. package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
  14. package/dist/control.d.ts +2 -2
  15. package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
  16. package/dist/index.d.ts +227 -687
  17. package/dist/index.js +753 -1237
  18. package/dist/index.js.map +1 -1
  19. package/dist/integrity-CTDhR1Sg.d.ts +81 -0
  20. package/dist/llm-client-BXVRUZyX.d.ts +234 -0
  21. package/dist/openapi.json +1 -1
  22. package/dist/pipelines/index.js +67 -3
  23. package/dist/pipelines/index.js.map +1 -1
  24. package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
  25. package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
  26. package/dist/reporting.d.ts +2 -3
  27. package/dist/reporting.js +4 -8
  28. package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
  29. package/dist/rl.d.ts +103 -221
  30. package/dist/rl.js +44 -199
  31. package/dist/rl.js.map +1 -1
  32. package/dist/sequential-DdV5ShjT.d.ts +561 -0
  33. package/dist/traces.d.ts +3 -2
  34. package/dist/traces.js +5 -5
  35. package/dist/types-BLbRTxoc.d.ts +367 -0
  36. package/dist/wire/index.d.ts +1 -1
  37. package/package.json +1 -6
  38. package/dist/chunk-5U2DOJU4.js.map +0 -1
  39. package/dist/chunk-AU2JLNSZ.js.map +0 -1
  40. package/dist/chunk-DMW5VENN.js +0 -1412
  41. package/dist/chunk-DMW5VENN.js.map +0 -1
  42. package/dist/chunk-EGIPWXHL.js.map +0 -1
  43. package/dist/chunk-MAZ26DC7.js +0 -99
  44. package/dist/chunk-MAZ26DC7.js.map +0 -1
  45. package/dist/chunk-NKLGKF2Q.js.map +0 -1
  46. package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
  47. package/dist/optimization.d.ts +0 -11
  48. package/dist/optimization.js +0 -71
  49. package/dist/optimization.js.map +0 -1
  50. package/dist/sequential-5iSVfzl2.d.ts +0 -139
  51. package/dist/summary-report-DuZXOk7K.d.ts +0 -917
  52. package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
@@ -1,7 +1,7 @@
1
1
  import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
2
2
  import { TCloud } from '@tangle-network/tcloud';
3
3
  import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-BlwAtYYf.js';
4
- import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-DuZXOk7K.js';
4
+ import { w as GateDecision } from './sequential-DdV5ShjT.js';
5
5
  import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
6
6
 
7
7
  interface Scenario {
@@ -311,6 +311,25 @@ interface EvalResult {
311
311
  * fails closed instead of being treated as a neutral zero.
312
312
  */
313
313
 
314
+ /** Severity of an actionable finding attached to a run/trace. */
315
+ type AsiSeverity = 'info' | 'warning' | 'error' | 'critical';
316
+ /** Actionable side-info — a diagnosed finding the loop can act on. */
317
+ interface ActionableSideInfo {
318
+ /** Stable expectation/check id when available. */
319
+ expectationId?: string;
320
+ /** Human-readable diagnosis of what happened. */
321
+ message: string;
322
+ severity?: AsiSeverity;
323
+ /** Concrete trace excerpt, file path, tool call, screenshot id, etc. */
324
+ evidence?: string;
325
+ /** Prompt/tool/context surface likely responsible. */
326
+ responsibleSurface?: string;
327
+ /** Suggested fix in natural language. */
328
+ suggestion?: string;
329
+ /** Whether this expectation was satisfied. Defaults to false for ASI rows. */
330
+ matched?: boolean;
331
+ metadata?: Record<string, unknown>;
332
+ }
314
333
  type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail';
315
334
  type ReleaseConfidenceAxisName = 'corpus' | 'quality' | 'generalization' | 'diagnostics' | 'efficiency';
316
335
  interface ReleaseTraceEvidence {
@@ -401,7 +420,6 @@ interface ReleaseConfidenceScorecard {
401
420
  gateDecision: GateDecision | null;
402
421
  summary: string;
403
422
  }
404
- declare function releaseTraceEvidenceFromMultiShotTrials(trials: readonly MultiShotTrialResult[]): ReleaseTraceEvidence[];
405
423
  declare function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
406
424
  declare function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
407
425
 
@@ -713,4 +731,4 @@ interface RenderReleaseReportOptions {
713
731
  }
714
732
  declare function renderReleaseReport(scorecard: ReleaseConfidenceScorecard, options?: RenderReleaseReportOptions): string;
715
733
 
716
- export { type RouteMap as $, type DriverState as A, type BootstrapOptions as B, type CheckResult as C, type DriverResult as D, type CollectedArtifacts as E, type ScenarioResult as F, type TurnMetrics as G, type ScenarioFile as H, type CompletionCriterion as I, type JudgeReplayGateArgs as J, type ArtifactCheck as K, type ArtifactResult as L, type CorpusAgreementOptions as M, type CorpusAgreementPerDimension as N, type CorpusAgreementReport as O, type PairedBootstrapOptions as P, type CorpusScoreRecord as Q, type ReleaseConfidenceAxis as R, type Scenario as S, type TestResult as T, type EvalResult as U, type Verdict as V, type FeedbackPattern as W, type JudgeConfig as X, type JudgeRubric as Y, type JudgeScore as Z, type PersonaRigor as _, type BootstrapResult as a, type RubricDimension as a0, type Turn as a1, type TurnResult as a2, bonferroni as a3, cohensD as a4, confidenceInterval as a5, corpusInterRaterAgreement as a6, corpusInterRaterAgreementFromJudgeScores as a7, interRaterReliability as a8, mannWhitneyU as a9, normalizeScores as aa, pairedMde as ab, pairedTTest as ac, partialCredit as ad, requiredSampleSize as ae, weightedMean as af, type PairedBootstrapResult as b, type ReleaseConfidenceAxisName as c, type ReleaseConfidenceInput as d, type ReleaseConfidenceIssue as e, type ReleaseConfidenceMetrics as f, type ReleaseConfidenceScorecard as g, type ReleaseConfidenceStatus as h, type ReleaseConfidenceThresholds as i, type ReleaseTraceEvidence as j, type RenderReleaseReportOptions as k, assertReleaseConfidence as l, benjaminiHochberg as m, bootstrapCi as n, evaluateReleaseConfidence as o, judgeReplayGate as p, pairedBootstrap as q, releaseTraceEvidenceFromMultiShotTrials as r, renderReleaseReport as s, type JudgeInput as t, type JudgeFn as u, type BenchmarkRunnerConfig as v, wilcoxonSignedRank as w, type BenchmarkReport as x, type ProductClientConfig as y, type PersonaConfig as z };
734
+ export { type PersonaRigor as $, type CollectedArtifacts as A, type BootstrapOptions as B, type CheckResult as C, type DriverResult as D, type ScenarioResult as E, type TurnMetrics as F, type ScenarioFile as G, type CompletionCriterion as H, type ActionableSideInfo as I, type JudgeReplayGateArgs as J, type ArtifactCheck as K, type ArtifactResult as L, type AsiSeverity as M, type CorpusAgreementOptions as N, type CorpusAgreementPerDimension as O, type PairedBootstrapOptions as P, type CorpusAgreementReport as Q, type ReleaseConfidenceAxis as R, type Scenario as S, type TestResult as T, type CorpusScoreRecord as U, type Verdict as V, type EvalResult as W, type FeedbackPattern as X, type JudgeConfig as Y, type JudgeRubric as Z, type JudgeScore as _, type BootstrapResult as a, type RouteMap as a0, type RubricDimension as a1, type Turn as a2, type TurnResult as a3, bonferroni as a4, cohensD as a5, confidenceInterval as a6, corpusInterRaterAgreement as a7, corpusInterRaterAgreementFromJudgeScores as a8, interRaterReliability as a9, mannWhitneyU as aa, normalizeScores as ab, pairedMde as ac, pairedTTest as ad, partialCredit as ae, requiredSampleSize as af, weightedMean as ag, type PairedBootstrapResult as b, type ReleaseConfidenceAxisName as c, type ReleaseConfidenceInput as d, type ReleaseConfidenceIssue as e, type ReleaseConfidenceMetrics as f, type ReleaseConfidenceScorecard as g, type ReleaseConfidenceStatus as h, type ReleaseConfidenceThresholds as i, type ReleaseTraceEvidence as j, type RenderReleaseReportOptions as k, assertReleaseConfidence as l, benjaminiHochberg as m, bootstrapCi as n, evaluateReleaseConfidence as o, judgeReplayGate as p, pairedBootstrap as q, renderReleaseReport as r, type JudgeInput as s, type JudgeFn as t, type BenchmarkRunnerConfig as u, type BenchmarkReport as v, wilcoxonSignedRank as w, type ProductClientConfig as x, type PersonaConfig as y, type DriverState as z };
@@ -1,7 +1,6 @@
1
1
  export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-ByZEC3BX.js';
2
- export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as releaseTraceEvidenceFromMultiShotTrials, s as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-Di84bXD7.js';
3
- export { I as InterimReleaseConfidence, a as InterimReleaseConfidenceInput, P as PairedEvalueOptions, b as PairedEvalueSequence, c as PairedEvalueStep, S as SequentialDecision, e as evaluateInterimReleaseConfidence, p as pairedEvalueSequence } from './sequential-5iSVfzl2.js';
4
- export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-DuZXOk7K.js';
2
+ export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-BtpgWRI0.js';
3
+ export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, I as InterimReleaseConfidence, c as InterimReleaseConfidenceInput, P as PairedEvalueOptions, d as PairedEvalueSequence, e as PairedEvalueStep, f as ParetoFigureSpec, g as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, h as ResearchReport, i as ResearchReportCandidate, j as ResearchReportDecision, k as ResearchReportMethodology, l as ResearchReportOptions, m as ResearchReportRecommendation, S as SequentialDecision, n as SummaryTable, o as SummaryTableOptions, p as SummaryTableRow, q as evaluateInterimReleaseConfidence, r as gainHistogram, s as pairedEvalueSequence, t as paretoChart, u as researchReport, v as summaryTable } from './sequential-DdV5ShjT.js';
5
4
  import './run-record-BGY6bHRh.js';
6
5
  import './errors-mje_cKOs.js';
7
6
  import './outcome-store-D6KWmYvj.js';
package/dist/reporting.js CHANGED
@@ -3,23 +3,20 @@ import {
3
3
  bootstrapCi,
4
4
  evaluateReleaseConfidence,
5
5
  judgeReplayGate,
6
- releaseTraceEvidenceFromMultiShotTrials,
7
6
  renderReleaseReport
8
- } from "./chunk-NKLGKF2Q.js";
7
+ } from "./chunk-KQ26DYTQ.js";
9
8
  import {
10
9
  rubricPredictiveValidity
11
10
  } from "./chunk-YRZ4M5GS.js";
12
- import {
13
- evaluateInterimReleaseConfidence,
14
- pairedEvalueSequence
15
- } from "./chunk-MAZ26DC7.js";
16
11
  import {
17
12
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
13
+ evaluateInterimReleaseConfidence,
18
14
  gainHistogram,
15
+ pairedEvalueSequence,
19
16
  paretoChart,
20
17
  researchReport,
21
18
  summaryTable
22
- } from "./chunk-EGIPWXHL.js";
19
+ } from "./chunk-MNL6LXGQ.js";
23
20
  import {
24
21
  benjaminiHochberg,
25
22
  pairedBootstrap,
@@ -40,7 +37,6 @@ export {
40
37
  pairedBootstrap,
41
38
  pairedEvalueSequence,
42
39
  paretoChart,
43
- releaseTraceEvidenceFromMultiShotTrials,
44
40
  renderReleaseReport,
45
41
  researchReport,
46
42
  rubricPredictiveValidity,
@@ -1,238 +1,149 @@
1
1
  import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BGY6bHRh.js';
2
- import { A as AgentEvalError, C as CaptureIntegrityError } from './errors-mje_cKOs.js';
3
- import { R as RawProviderSink, P as ProviderRedactor, a as RunIntegrityExpectations, b as RunIntegrityReport } from './integrity-DYR5gWlb.js';
4
- import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-DuZXOk7K.js';
2
+ import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-BXVRUZyX.js';
3
+ import { l as ResearchReportOptions, h as ResearchReport, w as GateDecision } from './sequential-DdV5ShjT.js';
5
4
  import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
5
+ import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CTDhR1Sg.js';
6
+ import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
6
7
  import { T as TraceStore } from './store-Db2Bv8Cf.js';
7
8
 
8
9
  /**
9
- * LLM client with graceful degrade.
10
+ * Multi-layer verifier — ordered pipeline of verification layers.
10
11
  *
11
- * OpenAI-compatible `/v1/chat/completions` client with:
12
- * - Exponential-backoff retry on 429 + 5xx gateway errors (502/503/504).
13
- * - Retry on transient network errors (fetch failed, AbortError, ECONNRESET).
14
- * - Graceful json_schema → json_object degrade on 400 with schema-reject body.
15
- * - Fenced-JSON stripping (```json ... ```) for models that wrap structured output.
16
- * - Configurable base URL + api key / bearer, works with LiteLLM proxies, OpenAI
17
- * directly, cli-bridge subscriptions, and any router that speaks the spec.
12
+ * Different contract from {@link JudgeRunner} (which runs parallel
13
+ * specs against a sandbox). MultiLayerVerifier is a DAG of layers
14
+ * (install → typecheck → build → lint → serve → semantic → …) with
15
+ * dependency-based skip, per-layer findings, soft-fail semantics, and
16
+ * an aggregated `blendedScore` across all passed layers.
18
17
  *
19
- * Usage:
20
- * const { value, result } = await callLlmJson<MyType>(
21
- * { model: 'gpt-4o', messages: [...], jsonSchema: { name: 'x', schema: {...} } },
22
- * { baseUrl: 'https://router.tangle.tools/v1', apiKey: process.env.KEY },
23
- * )
18
+ * Use when you want:
19
+ * - ordered stages where a failing upstream stage skips downstream ones
20
+ * - each stage produces rich `findings` (severity + message + evidence)
21
+ * - a single composite score across stages with per-stage weights
22
+ * - soft-fail stages whose failure doesn't abort the pipeline
24
23
  *
25
- * This is THE llm-calling seam for agent-eval primitives that need structured
26
- * output (semantic concept judge, reviewer directives, critic scores). Primitives
27
- * that need free-form text use `callLlm` and parse output themselves.
24
+ * Use {@link JudgeRunner} when you want:
25
+ * - N independent judges running in parallel against the same artifact
26
+ * - no inter-judge dependencies
27
+ * - boolean `passed` per judge + overall
28
+ *
29
+ * Both primitives compose — JudgeRunner can be invoked as a single
30
+ * layer inside a MultiLayerVerifier if that suits the caller.
28
31
  */
29
-
30
- interface LlmMessage {
31
- role: 'system' | 'user' | 'assistant';
32
+ type LayerStatus = 'pass' | 'fail' | 'skipped' | 'error' | 'timeout';
33
+ type Severity = 'critical' | 'major' | 'minor' | 'info';
34
+ interface Finding {
35
+ severity: Severity;
36
+ message: string;
37
+ evidence?: string;
38
+ /** Optional layer name the finding belongs to (set by the verifier if omitted). */
39
+ layer?: string;
32
40
  /**
33
- * Either a plain text content string OR a multimodal content array
34
- * (text + image_url parts) for vision-capable models.
41
+ * Free-form structured payload used by `multiToolchainLayer` to attach
42
+ * `{ adapter: 'pnpm' }`, by judges to attach evidence pointers, etc.
43
+ * Renderers MAY interrogate; agent-eval primitives never assume shape.
35
44
  */
36
- content: string | Array<{
37
- type: 'text';
38
- text: string;
39
- } | {
40
- type: 'image_url';
41
- image_url: {
42
- url: string;
43
- detail?: 'auto' | 'low' | 'high';
44
- };
45
- }>;
46
- }
47
- interface LlmCallRequest {
48
- model: string;
49
- messages: LlmMessage[];
50
- /** Optional JSON-mode response format (response_format: json_object). */
51
- jsonMode?: boolean;
52
- /** Optional structured output via JSON Schema. Falls back to json_object on 400. */
53
- jsonSchema?: {
54
- name: string;
55
- schema: Record<string, unknown>;
56
- };
57
- temperature?: number;
58
- maxTokens?: number;
59
- /** Per-call timeout, default 60s. */
60
- timeoutMs?: number;
45
+ detail?: Record<string, unknown>;
61
46
  }
62
- interface LlmUsage {
63
- promptTokens: number;
64
- completionTokens: number;
65
- totalTokens: number;
66
- /** Proxies populate this when prompt caching is on. */
67
- cachedPromptTokens?: number;
68
- }
69
- interface LlmCallResult {
70
- /** The text content of the first choice. Empty string if none. */
71
- content: string;
72
- usage: LlmUsage;
47
+ interface LayerResult {
48
+ layer: string;
49
+ status: LayerStatus;
50
+ /** 0..1 score, optional — layers that don't produce a numeric score omit. */
51
+ score?: number;
52
+ durationMs: number;
53
+ findings: Finding[];
54
+ /** Short human-readable summary (one line). */
55
+ reason?: string;
73
56
  /**
74
- * Cost in USD. Pulled from proxy's `_response_cost` field when present;
75
- * `null` when neither the proxy nor the caller can derive it.
57
+ * Numeric layer-level diagnostics: error counts, warning counts,
58
+ * cyclomatic complexity, total adapter wall-time, etc. Keyed by
59
+ * diagnostic name; null = "diagnostic not applicable / not measured."
60
+ * Renderers that know the keys can display them; ones that don't,
61
+ * ignore. Free-form on purpose — consumers type the value shape in
62
+ * their own namespace.
76
63
  */
77
- costUsd: number | null;
78
- /** Model name actually used (echoed from response). */
79
- model: string;
80
- /** Wall-clock duration of the HTTP call (last attempt, if retried). */
81
- durationMs: number;
82
- /** Raw response body. */
83
- raw: Record<string, unknown>;
64
+ diagnostics?: Record<string, number | null>;
65
+ /** Any rich per-layer detail rendered as-is by consumers that know the layer. */
66
+ detail?: Record<string, unknown>;
84
67
  }
85
- declare class LlmCallError extends AgentEvalError {
86
- readonly status: number;
87
- readonly body: string;
88
- readonly model: string;
89
- constructor(message: string, status: number, body: string, model: string);
68
+ interface VerifyContext<Env = unknown> {
69
+ /** Per-run opaque context the caller provides. Layers destructure what they need. */
70
+ env: Env;
71
+ /** Previously-computed results from layers that already ran. */
72
+ prior: Record<string, LayerResult>;
73
+ /** Signal — if aborted, layers MUST bail within reasonable wall. */
74
+ signal: AbortSignal;
90
75
  }
91
- interface LlmClientOptions {
92
- /** Base URL (without trailing slash). Must end at the `/v1` prefix. */
93
- baseUrl?: string;
94
- /** Bearer token — either `apiKey` or `bearer` populates `Authorization: Bearer ...`. */
95
- apiKey?: string;
96
- bearer?: string;
97
- /** Override for the `Authorization` header (e.g. `X-Auth: ...`). Takes precedence over apiKey/bearer. */
98
- authHeader?: {
99
- name: string;
100
- value: string;
101
- };
102
- /** Default timeout in ms. Per-call can override. */
103
- defaultTimeoutMs?: number;
104
- /** Max retry attempts on retriable errors. Default 3 (1 initial + 2 retries). */
105
- maxRetries?: number;
106
- /** Fetch implementation — defaults to global `fetch`. Override for custom transport (e.g. tests). */
107
- fetch?: typeof fetch;
76
+ interface Layer<Env = unknown> {
77
+ name: string;
78
+ /** Stages that must have `status: 'pass'` before this layer runs. */
79
+ dependsOn?: string[];
108
80
  /**
109
- * Optional raw HTTP capture sink. When provided, every request, response,
110
- * and error (across all retry attempts) is recorded to the sink, with auth
111
- * headers and credential-shaped body fields redacted by default. This is
112
- * the layer-1 forensics primitive: structured `LlmSpan`s record intent,
113
- * raw events record what actually crossed the wire.
81
+ * Weight in the composite `blendedScore`. Default 1.0. Layers with weight 0
82
+ * contribute findings but not score.
114
83
  */
115
- rawSink?: RawProviderSink;
84
+ weight?: number;
116
85
  /**
117
- * Logical provider id attached to raw events. When omitted, derived from
118
- * `baseUrl` via `providerFromBaseUrl`.
86
+ * If true, a `fail` status contributes to `blendedScore` (as 0) instead of
87
+ * being dropped — use for layers whose failure is a real signal. Default:
88
+ * fail drops from numerator + denominator, matching VB's existing semantics.
119
89
  */
120
- provider?: string;
121
- /** Trace context attached to raw events; populated by emitter-aware callers. */
122
- traceContext?: {
123
- runId?: string;
124
- spanId?: string;
125
- };
126
- /** Override the redaction strategy for this call. Defaults to `defaultProviderRedactor`. */
127
- redactor?: ProviderRedactor;
128
- }
129
- /**
130
- * True when an error is a transient transport/network fault worth retrying,
131
- * as opposed to a deterministic failure (4xx schema reject, JSON parse) that
132
- * a retry cannot fix. Inspects `LlmCallError.status`, then the error's
133
- * name/message/code, then recurses into `error.cause` — undici nests the
134
- * real socket fault one or more levels under `.cause`.
135
- *
136
- * This is THE retry classifier for the package: `callLlm` and
137
- * `withJudgeRetry` both route through it, so a connection-class error is
138
- * treated identically whether it surfaces in the HTTP client or a
139
- * TCloud-backed judge.
140
- */
141
- declare function isTransientLlmError(err: unknown): boolean;
142
- /** Exponential backoff: 500ms, 1s, 2s, 4s, ... capped at 16s. Attempt is 0-indexed. */
143
- declare function backoffMs(attempt: number): number;
144
- /**
145
- * Strip a ```json / ``` code fence if the model emitted one.
146
- * Idempotent for naked JSON. Some models (claude-code via router, certain
147
- * deepseek models) wrap output even under json_object.
148
- */
149
- declare function stripFencedJson(raw: string): string;
150
- /**
151
- * Low-level call. Returns raw content + usage + cost. Retries on transient
152
- * failures; does NOT degrade schema here — callers that want graceful
153
- * degrade use `callLlmJson`.
154
- */
155
- declare function callLlm(req: LlmCallRequest, opts?: LlmClientOptions): Promise<LlmCallResult>;
156
- /**
157
- * Structured-output call. Returns parsed JSON plus the raw result envelope.
158
- * Degrades `jsonSchema` → `jsonMode` on a 400 that names the schema param —
159
- * critical for deepseek-v3/v4, kimi-k2.6, and other models that don't accept
160
- * the `response_format.json_schema` shape but DO accept `json_object`.
161
- */
162
- declare function callLlmJson<T = unknown>(req: LlmCallRequest, opts?: LlmClientOptions): Promise<{
163
- value: T;
164
- result: LlmCallResult;
165
- }>;
166
- type LlmRouteAssertionReason = 'no_explicit_base_url' | 'base_url_blocked' | 'base_url_not_allowed' | 'no_auth' | 'wrong_provider';
167
- declare class LlmRouteAssertionError extends CaptureIntegrityError {
168
- readonly reason: LlmRouteAssertionReason;
169
- readonly baseUrl: string;
170
- constructor(message: string, reason: LlmRouteAssertionReason, baseUrl: string);
90
+ failContributesToScore?: boolean;
91
+ /** Optional per-layer wall-cap in ms. Honored by the verifier (AbortSignal). */
92
+ capMs?: number;
93
+ run: (ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
171
94
  }
172
- interface LlmRouteRequirements {
173
- /**
174
- * Throw if `opts.baseUrl` is undefined, i.e. the call would fall back to
175
- * `DEFAULT_BASE_URL`. Set this for evaluation runs where silently using
176
- * the public/free-tier router is a defect — the launch reviewer needs to
177
- * know exactly which provider answered.
178
- */
179
- requireExplicitBaseUrl?: boolean;
95
+ interface VerifyOptions<Env = unknown> {
96
+ env: Env;
180
97
  /**
181
- * Allowlist of acceptable base URLs. Strings match by prefix
182
- * (case-insensitive); RegExps test against the full base URL.
98
+ * Overall wall cap. Default: sum of layer capMs, or Infinity if any layer
99
+ * omits a cap. The verifier short-circuits remaining layers on overall cap.
183
100
  */
184
- allowedBaseUrls?: Array<string | RegExp>;
185
- /** Blocklist that takes precedence over `allowedBaseUrls`. */
186
- blockedBaseUrls?: Array<string | RegExp>;
187
- /** Throw if no auth header / api key is configured. */
188
- requireAuth?: boolean;
101
+ overallCapMs?: number;
102
+ /** Called with each layer result as it completes. */
103
+ onLayer?: (result: LayerResult) => void;
104
+ }
105
+ interface VerificationReport {
106
+ layers: LayerResult[];
107
+ passCount: number;
108
+ failCount: number;
109
+ skippedCount: number;
110
+ errorCount: number;
111
+ /** True iff at least one scored layer ran AND every scored layer passed. */
112
+ allPass: boolean;
189
113
  /**
190
- * Logical provider id the configured `baseUrl` is expected to match (via
191
- * `providerFromBaseUrl`). Mainly useful when paired with `requireExplicitBaseUrl`.
114
+ * Weighted mean of `score` across contributing layers. 0 when no layers
115
+ * contributed. See {@link Layer.failContributesToScore} for fail semantics.
192
116
  */
193
- expectedProvider?: string;
117
+ blendedScore: number;
118
+ durationMs: number;
119
+ startedAt: string;
120
+ finishedAt: string;
194
121
  }
195
122
  /**
196
- * Fail-loud assertion that the configured LLM client points at the route
197
- * the caller intends. Designed for the matrix-runner preflight: invoke
198
- * once before any LLM call to catch misconfiguration before a sweep burns
199
- * dollars on the wrong provider.
123
+ * Grade a semantic-concept-style judge result into a single layer status.
200
124
  *
201
- * Throws `LlmRouteAssertionError`. Pure — no I/O — so it's safe to call
202
- * from constructors and CI gates.
203
- */
204
- declare function assertLlmRoute(opts: LlmClientOptions, req?: LlmRouteRequirements): void;
205
- /**
206
- * Probe whether a model is reachable. Returns latency + null error on
207
- * success; `ok=false` + error message on any failure (HTTP, timeout,
208
- * network, parse). Designed for sweep preflights — fail loud at the
209
- * boundary before burning a 30-leaf run on a misconfigured router.
125
+ * Pass when overall score >= threshold AND no critical-severity concept gap.
126
+ * Fail otherwise. Use inside a `Layer.run` when wrapping a concept judge.
210
127
  *
211
- * Sends a tiny `ping` message with `maxTokens=64`. Reasoning models
212
- * (glm-5.1, deepseek-v4) can burn the entire budget on internal reasoning
213
- * for short prompts, so don't tighten this further. We don't validate
214
- * content; HTTP 200 means reachable.
215
- */
216
- declare function probeLlm(model: string, opts?: LlmClientOptions & {
217
- timeoutMs?: number;
218
- }): Promise<{
219
- ok: boolean;
220
- latencyMs: number;
221
- error: string | null;
222
- }>;
223
- /**
224
- * Stateful client — construct once with defaults, call many times.
225
- * Thin wrapper around the free functions; exists for callers that want
226
- * to inject a single configured instance into multiple primitives.
128
+ * Generalized from VerticalBench H3 fix: `failingConcepts.length === 0` was
129
+ * too strict — a single concept at 6/10 failed the entire layer despite
130
+ * overall score being >= 0.7. Now we trust the judge's own `severity` field:
131
+ * `critical` findings veto; `major`/`minor` reduce the score but don't veto.
227
132
  */
228
- declare class LlmClient {
229
- private readonly opts;
230
- constructor(opts?: LlmClientOptions);
231
- call(req: LlmCallRequest, per?: LlmClientOptions): Promise<LlmCallResult>;
232
- callJson<T = unknown>(req: LlmCallRequest, per?: LlmClientOptions): Promise<{
233
- value: T;
234
- result: LlmCallResult;
133
+ declare function gradeSemanticStatus(input: {
134
+ score: number;
135
+ findings: Array<{
136
+ severity: Severity;
137
+ present?: boolean;
138
+ score?: number;
235
139
  }>;
140
+ available: boolean;
141
+ threshold?: number;
142
+ }): LayerStatus;
143
+ declare class MultiLayerVerifier<Env = unknown> {
144
+ private readonly layers;
145
+ constructor(layers: Layer<Env>[]);
146
+ run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
236
147
  }
237
148
 
238
149
  /**
@@ -608,4 +519,4 @@ declare class NoopResearcher implements Researcher {
608
519
  evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
609
520
  }
610
521
 
611
- export { probeLlm as A, stripFencedJson as B, CallbackResearcher as C, type EvalCampaignOptions as E, type FailedRun as F, type LlmClientOptions as L, NoopResearcher as N, type Researcher as R, type SteeringChange as S, type CallbackResearcherOptions as a, type CampaignFactoryParams as b, type CampaignIntegrityPolicy as c, type CampaignRunContext as d, type CampaignRunOutcome as e, type CampaignRunner as f, type CampaignScenario as g, type CampaignVariant as h, type EvalCampaignResult as i, type ExperimentPlan as j, type ExperimentResult as k, type FailureMode as l, type LlmCallRequest as m, type LlmCallResult as n, LlmCallError as o, LlmClient as p, type LlmMessage as q, runEvalCampaign as r, LlmRouteAssertionError as s, type LlmRouteRequirements as t, type LlmUsage as u, assertLlmRoute as v, backoffMs as w, callLlm as x, callLlmJson as y, isTransientLlmError as z };
522
+ export { CallbackResearcher as C, type ExperimentPlan as E, type FailureMode as F, type Layer as L, MultiLayerVerifier as M, NoopResearcher as N, type Researcher as R, type SteeringChange as S, type VerificationReport as V, type ExperimentResult as a, type EvalCampaignResult as b, type EvalCampaignOptions as c, type Severity as d, type VerifyOptions as e, type LayerResult as f, type VerifyContext as g, type CallbackResearcherOptions as h, type CampaignFactoryParams as i, type CampaignIntegrityPolicy as j, type CampaignRunContext as k, type CampaignRunOutcome as l, type CampaignRunner as m, type CampaignScenario as n, type CampaignVariant as o, type FailedRun as p, type Finding as q, runEvalCampaign as r, type LayerStatus as s, gradeSemanticStatus as t };