@tangle-network/agent-eval 0.41.0 → 0.42.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +48 -368
- package/dist/campaign/index.js +67 -1
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
- package/dist/chunk-H4TOS272.js.map +1 -0
- package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
- package/dist/chunk-KQ26DYTQ.js.map +1 -0
- package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
- package/dist/chunk-MNL6LXGQ.js.map +1 -0
- package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
- package/dist/chunk-N4SBKEPJ.js.map +1 -0
- package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
- package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
- package/dist/index.d.ts +227 -687
- package/dist/index.js +753 -1237
- package/dist/index.js.map +1 -1
- package/dist/integrity-CTDhR1Sg.d.ts +81 -0
- package/dist/llm-client-BXVRUZyX.d.ts +234 -0
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +67 -3
- package/dist/pipelines/index.js.map +1 -1
- package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
- package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
- package/dist/reporting.d.ts +2 -3
- package/dist/reporting.js +4 -8
- package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
- package/dist/rl.d.ts +103 -221
- package/dist/rl.js +44 -199
- package/dist/rl.js.map +1 -1
- package/dist/sequential-DdV5ShjT.d.ts +561 -0
- package/dist/traces.d.ts +3 -2
- package/dist/traces.js +5 -5
- package/dist/types-BLbRTxoc.d.ts +367 -0
- package/dist/wire/index.d.ts +1 -1
- package/package.json +1 -6
- package/dist/chunk-5U2DOJU4.js.map +0 -1
- package/dist/chunk-AU2JLNSZ.js.map +0 -1
- package/dist/chunk-DMW5VENN.js +0 -1412
- package/dist/chunk-DMW5VENN.js.map +0 -1
- package/dist/chunk-EGIPWXHL.js.map +0 -1
- package/dist/chunk-MAZ26DC7.js +0 -99
- package/dist/chunk-MAZ26DC7.js.map +0 -1
- package/dist/chunk-NKLGKF2Q.js.map +0 -1
- package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
- package/dist/optimization.d.ts +0 -11
- package/dist/optimization.js +0 -71
- package/dist/optimization.js.map +0 -1
- package/dist/sequential-5iSVfzl2.d.ts +0 -139
- package/dist/summary-report-DuZXOk7K.d.ts +0 -917
- /package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { C as ContinuousAgreementOptions, a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
|
|
2
2
|
import { TCloud } from '@tangle-network/tcloud';
|
|
3
3
|
import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-BlwAtYYf.js';
|
|
4
|
-
import {
|
|
4
|
+
import { w as GateDecision } from './sequential-DdV5ShjT.js';
|
|
5
5
|
import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
|
|
6
6
|
|
|
7
7
|
interface Scenario {
|
|
@@ -311,6 +311,25 @@ interface EvalResult {
|
|
|
311
311
|
* fails closed instead of being treated as a neutral zero.
|
|
312
312
|
*/
|
|
313
313
|
|
|
314
|
+
/** Severity of an actionable finding attached to a run/trace. */
|
|
315
|
+
type AsiSeverity = 'info' | 'warning' | 'error' | 'critical';
|
|
316
|
+
/** Actionable side-info — a diagnosed finding the loop can act on. */
|
|
317
|
+
interface ActionableSideInfo {
|
|
318
|
+
/** Stable expectation/check id when available. */
|
|
319
|
+
expectationId?: string;
|
|
320
|
+
/** Human-readable diagnosis of what happened. */
|
|
321
|
+
message: string;
|
|
322
|
+
severity?: AsiSeverity;
|
|
323
|
+
/** Concrete trace excerpt, file path, tool call, screenshot id, etc. */
|
|
324
|
+
evidence?: string;
|
|
325
|
+
/** Prompt/tool/context surface likely responsible. */
|
|
326
|
+
responsibleSurface?: string;
|
|
327
|
+
/** Suggested fix in natural language. */
|
|
328
|
+
suggestion?: string;
|
|
329
|
+
/** Whether this expectation was satisfied. Defaults to false for ASI rows. */
|
|
330
|
+
matched?: boolean;
|
|
331
|
+
metadata?: Record<string, unknown>;
|
|
332
|
+
}
|
|
314
333
|
type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail';
|
|
315
334
|
type ReleaseConfidenceAxisName = 'corpus' | 'quality' | 'generalization' | 'diagnostics' | 'efficiency';
|
|
316
335
|
interface ReleaseTraceEvidence {
|
|
@@ -401,7 +420,6 @@ interface ReleaseConfidenceScorecard {
|
|
|
401
420
|
gateDecision: GateDecision | null;
|
|
402
421
|
summary: string;
|
|
403
422
|
}
|
|
404
|
-
declare function releaseTraceEvidenceFromMultiShotTrials(trials: readonly MultiShotTrialResult[]): ReleaseTraceEvidence[];
|
|
405
423
|
declare function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
|
|
406
424
|
declare function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
|
|
407
425
|
|
|
@@ -713,4 +731,4 @@ interface RenderReleaseReportOptions {
|
|
|
713
731
|
}
|
|
714
732
|
declare function renderReleaseReport(scorecard: ReleaseConfidenceScorecard, options?: RenderReleaseReportOptions): string;
|
|
715
733
|
|
|
716
|
-
export { type
|
|
734
|
+
export { type PersonaRigor as $, type CollectedArtifacts as A, type BootstrapOptions as B, type CheckResult as C, type DriverResult as D, type ScenarioResult as E, type TurnMetrics as F, type ScenarioFile as G, type CompletionCriterion as H, type ActionableSideInfo as I, type JudgeReplayGateArgs as J, type ArtifactCheck as K, type ArtifactResult as L, type AsiSeverity as M, type CorpusAgreementOptions as N, type CorpusAgreementPerDimension as O, type PairedBootstrapOptions as P, type CorpusAgreementReport as Q, type ReleaseConfidenceAxis as R, type Scenario as S, type TestResult as T, type CorpusScoreRecord as U, type Verdict as V, type EvalResult as W, type FeedbackPattern as X, type JudgeConfig as Y, type JudgeRubric as Z, type JudgeScore as _, type BootstrapResult as a, type RouteMap as a0, type RubricDimension as a1, type Turn as a2, type TurnResult as a3, bonferroni as a4, cohensD as a5, confidenceInterval as a6, corpusInterRaterAgreement as a7, corpusInterRaterAgreementFromJudgeScores as a8, interRaterReliability as a9, mannWhitneyU as aa, normalizeScores as ab, pairedMde as ac, pairedTTest as ad, partialCredit as ae, requiredSampleSize as af, weightedMean as ag, type PairedBootstrapResult as b, type ReleaseConfidenceAxisName as c, type ReleaseConfidenceInput as d, type ReleaseConfidenceIssue as e, type ReleaseConfidenceMetrics as f, type ReleaseConfidenceScorecard as g, type ReleaseConfidenceStatus as h, type ReleaseConfidenceThresholds as i, type ReleaseTraceEvidence as j, type RenderReleaseReportOptions as k, assertReleaseConfidence as l, benjaminiHochberg as m, bootstrapCi as n, evaluateReleaseConfidence as o, judgeReplayGate as p, pairedBootstrap as q, renderReleaseReport as r, type JudgeInput as s, type JudgeFn as t, type BenchmarkRunnerConfig as u, type BenchmarkReport as v, wilcoxonSignedRank as w, type ProductClientConfig as x, type PersonaConfig as y, type DriverState as z };
|
package/dist/reporting.d.ts
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
export { R as RubricOutcomePair, a as RubricPredictiveValidityInput, b as RubricPredictiveValidityReport, c as RubricRanking, r as rubricPredictiveValidity } from './rubric-predictive-validity-ByZEC3BX.js';
|
|
2
|
-
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as
|
|
3
|
-
export { I as InterimReleaseConfidence,
|
|
4
|
-
export { C as GainDistributionBin, F as GainDistributionFigureSpec, H as GainDistributionOptions, J as ParetoFigureSpec, K as ParetoPoint, L as RESEARCH_REPORT_HARD_PAIR_FLOOR, N as ResearchReport, O as ResearchReportCandidate, Q as ResearchReportDecision, U as ResearchReportMethodology, W as ResearchReportOptions, X as ResearchReportRecommendation, Y as SummaryTable, Z as SummaryTableOptions, _ as SummaryTableRow, $ as gainHistogram, a0 as paretoChart, a1 as researchReport, a2 as summaryTable } from './summary-report-DuZXOk7K.js';
|
|
2
|
+
export { B as BootstrapOptions, a as BootstrapResult, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, b as PairedBootstrapResult, R as ReleaseConfidenceAxis, c as ReleaseConfidenceAxisName, d as ReleaseConfidenceInput, e as ReleaseConfidenceIssue, f as ReleaseConfidenceMetrics, g as ReleaseConfidenceScorecard, h as ReleaseConfidenceStatus, i as ReleaseConfidenceThresholds, j as ReleaseTraceEvidence, k as RenderReleaseReportOptions, V as Verdict, l as assertReleaseConfidence, m as benjaminiHochberg, n as bootstrapCi, o as evaluateReleaseConfidence, p as judgeReplayGate, q as pairedBootstrap, r as renderReleaseReport, w as wilcoxonSignedRank } from './release-report-BtpgWRI0.js';
|
|
3
|
+
export { G as GainDistributionBin, a as GainDistributionFigureSpec, b as GainDistributionOptions, I as InterimReleaseConfidence, c as InterimReleaseConfidenceInput, P as PairedEvalueOptions, d as PairedEvalueSequence, e as PairedEvalueStep, f as ParetoFigureSpec, g as ParetoPoint, R as RESEARCH_REPORT_HARD_PAIR_FLOOR, h as ResearchReport, i as ResearchReportCandidate, j as ResearchReportDecision, k as ResearchReportMethodology, l as ResearchReportOptions, m as ResearchReportRecommendation, S as SequentialDecision, n as SummaryTable, o as SummaryTableOptions, p as SummaryTableRow, q as evaluateInterimReleaseConfidence, r as gainHistogram, s as pairedEvalueSequence, t as paretoChart, u as researchReport, v as summaryTable } from './sequential-DdV5ShjT.js';
|
|
5
4
|
import './run-record-BGY6bHRh.js';
|
|
6
5
|
import './errors-mje_cKOs.js';
|
|
7
6
|
import './outcome-store-D6KWmYvj.js';
|
package/dist/reporting.js
CHANGED
|
@@ -3,23 +3,20 @@ import {
|
|
|
3
3
|
bootstrapCi,
|
|
4
4
|
evaluateReleaseConfidence,
|
|
5
5
|
judgeReplayGate,
|
|
6
|
-
releaseTraceEvidenceFromMultiShotTrials,
|
|
7
6
|
renderReleaseReport
|
|
8
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-KQ26DYTQ.js";
|
|
9
8
|
import {
|
|
10
9
|
rubricPredictiveValidity
|
|
11
10
|
} from "./chunk-YRZ4M5GS.js";
|
|
12
|
-
import {
|
|
13
|
-
evaluateInterimReleaseConfidence,
|
|
14
|
-
pairedEvalueSequence
|
|
15
|
-
} from "./chunk-MAZ26DC7.js";
|
|
16
11
|
import {
|
|
17
12
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
13
|
+
evaluateInterimReleaseConfidence,
|
|
18
14
|
gainHistogram,
|
|
15
|
+
pairedEvalueSequence,
|
|
19
16
|
paretoChart,
|
|
20
17
|
researchReport,
|
|
21
18
|
summaryTable
|
|
22
|
-
} from "./chunk-
|
|
19
|
+
} from "./chunk-MNL6LXGQ.js";
|
|
23
20
|
import {
|
|
24
21
|
benjaminiHochberg,
|
|
25
22
|
pairedBootstrap,
|
|
@@ -40,7 +37,6 @@ export {
|
|
|
40
37
|
pairedBootstrap,
|
|
41
38
|
pairedEvalueSequence,
|
|
42
39
|
paretoChart,
|
|
43
|
-
releaseTraceEvidenceFromMultiShotTrials,
|
|
44
40
|
renderReleaseReport,
|
|
45
41
|
researchReport,
|
|
46
42
|
rubricPredictiveValidity,
|
|
@@ -1,238 +1,149 @@
|
|
|
1
1
|
import { a as RunSplitTag, b as RunTokenUsage, c as RunJudgeMetadata, J as JudgeScoresRecord, A as AgentProfileCell, d as AgentProfileCellInput, R as RunRecord } from './run-record-BGY6bHRh.js';
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
import { W as ResearchReportOptions, N as ResearchReport, a3 as GateDecision } from './summary-report-DuZXOk7K.js';
|
|
2
|
+
import { L as LlmClientOptions, a as LlmRouteRequirements } from './llm-client-BXVRUZyX.js';
|
|
3
|
+
import { l as ResearchReportOptions, h as ResearchReport, w as GateDecision } from './sequential-DdV5ShjT.js';
|
|
5
4
|
import { T as TraceEmitter, R as RunCompleteHook } from './emitter-DP_cSSiw.js';
|
|
5
|
+
import { R as RunIntegrityExpectations, a as RunIntegrityReport } from './integrity-CTDhR1Sg.js';
|
|
6
|
+
import { R as RawProviderSink } from './raw-provider-sink-C46HDghv.js';
|
|
6
7
|
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
7
8
|
|
|
8
9
|
/**
|
|
9
|
-
*
|
|
10
|
+
* Multi-layer verifier — ordered pipeline of verification layers.
|
|
10
11
|
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
* - Configurable base URL + api key / bearer, works with LiteLLM proxies, OpenAI
|
|
17
|
-
* directly, cli-bridge subscriptions, and any router that speaks the spec.
|
|
12
|
+
* Different contract from {@link JudgeRunner} (which runs parallel
|
|
13
|
+
* specs against a sandbox). MultiLayerVerifier is a DAG of layers
|
|
14
|
+
* (install → typecheck → build → lint → serve → semantic → …) with
|
|
15
|
+
* dependency-based skip, per-layer findings, soft-fail semantics, and
|
|
16
|
+
* an aggregated `blendedScore` across all passed layers.
|
|
18
17
|
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
21
|
-
*
|
|
22
|
-
*
|
|
23
|
-
*
|
|
18
|
+
* Use when you want:
|
|
19
|
+
* - ordered stages where a failing upstream stage skips downstream ones
|
|
20
|
+
* - each stage produces rich `findings` (severity + message + evidence)
|
|
21
|
+
* - a single composite score across stages with per-stage weights
|
|
22
|
+
* - soft-fail stages whose failure doesn't abort the pipeline
|
|
24
23
|
*
|
|
25
|
-
*
|
|
26
|
-
*
|
|
27
|
-
*
|
|
24
|
+
* Use {@link JudgeRunner} when you want:
|
|
25
|
+
* - N independent judges running in parallel against the same artifact
|
|
26
|
+
* - no inter-judge dependencies
|
|
27
|
+
* - boolean `passed` per judge + overall
|
|
28
|
+
*
|
|
29
|
+
* Both primitives compose — JudgeRunner can be invoked as a single
|
|
30
|
+
* layer inside a MultiLayerVerifier if that suits the caller.
|
|
28
31
|
*/
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
+
type LayerStatus = 'pass' | 'fail' | 'skipped' | 'error' | 'timeout';
|
|
33
|
+
type Severity = 'critical' | 'major' | 'minor' | 'info';
|
|
34
|
+
interface Finding {
|
|
35
|
+
severity: Severity;
|
|
36
|
+
message: string;
|
|
37
|
+
evidence?: string;
|
|
38
|
+
/** Optional layer name the finding belongs to (set by the verifier if omitted). */
|
|
39
|
+
layer?: string;
|
|
32
40
|
/**
|
|
33
|
-
*
|
|
34
|
-
*
|
|
41
|
+
* Free-form structured payload — used by `multiToolchainLayer` to attach
|
|
42
|
+
* `{ adapter: 'pnpm' }`, by judges to attach evidence pointers, etc.
|
|
43
|
+
* Renderers MAY interrogate; agent-eval primitives never assume shape.
|
|
35
44
|
*/
|
|
36
|
-
|
|
37
|
-
type: 'text';
|
|
38
|
-
text: string;
|
|
39
|
-
} | {
|
|
40
|
-
type: 'image_url';
|
|
41
|
-
image_url: {
|
|
42
|
-
url: string;
|
|
43
|
-
detail?: 'auto' | 'low' | 'high';
|
|
44
|
-
};
|
|
45
|
-
}>;
|
|
46
|
-
}
|
|
47
|
-
interface LlmCallRequest {
|
|
48
|
-
model: string;
|
|
49
|
-
messages: LlmMessage[];
|
|
50
|
-
/** Optional JSON-mode response format (response_format: json_object). */
|
|
51
|
-
jsonMode?: boolean;
|
|
52
|
-
/** Optional structured output via JSON Schema. Falls back to json_object on 400. */
|
|
53
|
-
jsonSchema?: {
|
|
54
|
-
name: string;
|
|
55
|
-
schema: Record<string, unknown>;
|
|
56
|
-
};
|
|
57
|
-
temperature?: number;
|
|
58
|
-
maxTokens?: number;
|
|
59
|
-
/** Per-call timeout, default 60s. */
|
|
60
|
-
timeoutMs?: number;
|
|
45
|
+
detail?: Record<string, unknown>;
|
|
61
46
|
}
|
|
62
|
-
interface
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
content: string;
|
|
72
|
-
usage: LlmUsage;
|
|
47
|
+
interface LayerResult {
|
|
48
|
+
layer: string;
|
|
49
|
+
status: LayerStatus;
|
|
50
|
+
/** 0..1 score, optional — layers that don't produce a numeric score omit. */
|
|
51
|
+
score?: number;
|
|
52
|
+
durationMs: number;
|
|
53
|
+
findings: Finding[];
|
|
54
|
+
/** Short human-readable summary (one line). */
|
|
55
|
+
reason?: string;
|
|
73
56
|
/**
|
|
74
|
-
*
|
|
75
|
-
*
|
|
57
|
+
* Numeric layer-level diagnostics: error counts, warning counts,
|
|
58
|
+
* cyclomatic complexity, total adapter wall-time, etc. Keyed by
|
|
59
|
+
* diagnostic name; null = "diagnostic not applicable / not measured."
|
|
60
|
+
* Renderers that know the keys can display them; ones that don't,
|
|
61
|
+
* ignore. Free-form on purpose — consumers type the value shape in
|
|
62
|
+
* their own namespace.
|
|
76
63
|
*/
|
|
77
|
-
|
|
78
|
-
/**
|
|
79
|
-
|
|
80
|
-
/** Wall-clock duration of the HTTP call (last attempt, if retried). */
|
|
81
|
-
durationMs: number;
|
|
82
|
-
/** Raw response body. */
|
|
83
|
-
raw: Record<string, unknown>;
|
|
64
|
+
diagnostics?: Record<string, number | null>;
|
|
65
|
+
/** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
|
|
66
|
+
detail?: Record<string, unknown>;
|
|
84
67
|
}
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
68
|
+
interface VerifyContext<Env = unknown> {
|
|
69
|
+
/** Per-run opaque context the caller provides. Layers destructure what they need. */
|
|
70
|
+
env: Env;
|
|
71
|
+
/** Previously-computed results from layers that already ran. */
|
|
72
|
+
prior: Record<string, LayerResult>;
|
|
73
|
+
/** Signal — if aborted, layers MUST bail within reasonable wall. */
|
|
74
|
+
signal: AbortSignal;
|
|
90
75
|
}
|
|
91
|
-
interface
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
apiKey?: string;
|
|
96
|
-
bearer?: string;
|
|
97
|
-
/** Override for the `Authorization` header (e.g. `X-Auth: ...`). Takes precedence over apiKey/bearer. */
|
|
98
|
-
authHeader?: {
|
|
99
|
-
name: string;
|
|
100
|
-
value: string;
|
|
101
|
-
};
|
|
102
|
-
/** Default timeout in ms. Per-call can override. */
|
|
103
|
-
defaultTimeoutMs?: number;
|
|
104
|
-
/** Max retry attempts on retriable errors. Default 3 (1 initial + 2 retries). */
|
|
105
|
-
maxRetries?: number;
|
|
106
|
-
/** Fetch implementation — defaults to global `fetch`. Override for custom transport (e.g. tests). */
|
|
107
|
-
fetch?: typeof fetch;
|
|
76
|
+
interface Layer<Env = unknown> {
|
|
77
|
+
name: string;
|
|
78
|
+
/** Stages that must have `status: 'pass'` before this layer runs. */
|
|
79
|
+
dependsOn?: string[];
|
|
108
80
|
/**
|
|
109
|
-
*
|
|
110
|
-
*
|
|
111
|
-
* headers and credential-shaped body fields redacted by default. This is
|
|
112
|
-
* the layer-1 forensics primitive: structured `LlmSpan`s record intent,
|
|
113
|
-
* raw events record what actually crossed the wire.
|
|
81
|
+
* Weight in the composite `blendedScore`. Default 1.0. Layers with weight 0
|
|
82
|
+
* contribute findings but not score.
|
|
114
83
|
*/
|
|
115
|
-
|
|
84
|
+
weight?: number;
|
|
116
85
|
/**
|
|
117
|
-
*
|
|
118
|
-
*
|
|
86
|
+
* If true, a `fail` status contributes to `blendedScore` (as 0) instead of
|
|
87
|
+
* being dropped — use for layers whose failure is a real signal. Default:
|
|
88
|
+
* fail drops from numerator + denominator, matching VB's existing semantics.
|
|
119
89
|
*/
|
|
120
|
-
|
|
121
|
-
/**
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
spanId?: string;
|
|
125
|
-
};
|
|
126
|
-
/** Override the redaction strategy for this call. Defaults to `defaultProviderRedactor`. */
|
|
127
|
-
redactor?: ProviderRedactor;
|
|
128
|
-
}
|
|
129
|
-
/**
|
|
130
|
-
* True when an error is a transient transport/network fault worth retrying,
|
|
131
|
-
* as opposed to a deterministic failure (4xx schema reject, JSON parse) that
|
|
132
|
-
* a retry cannot fix. Inspects `LlmCallError.status`, then the error's
|
|
133
|
-
* name/message/code, then recurses into `error.cause` — undici nests the
|
|
134
|
-
* real socket fault one or more levels under `.cause`.
|
|
135
|
-
*
|
|
136
|
-
* This is THE retry classifier for the package: `callLlm` and
|
|
137
|
-
* `withJudgeRetry` both route through it, so a connection-class error is
|
|
138
|
-
* treated identically whether it surfaces in the HTTP client or a
|
|
139
|
-
* TCloud-backed judge.
|
|
140
|
-
*/
|
|
141
|
-
declare function isTransientLlmError(err: unknown): boolean;
|
|
142
|
-
/** Exponential backoff: 500ms, 1s, 2s, 4s, ... capped at 16s. Attempt is 0-indexed. */
|
|
143
|
-
declare function backoffMs(attempt: number): number;
|
|
144
|
-
/**
|
|
145
|
-
* Strip a ```json / ``` code fence if the model emitted one.
|
|
146
|
-
* Idempotent for naked JSON. Some models (claude-code via router, certain
|
|
147
|
-
* deepseek models) wrap output even under json_object.
|
|
148
|
-
*/
|
|
149
|
-
declare function stripFencedJson(raw: string): string;
|
|
150
|
-
/**
|
|
151
|
-
* Low-level call. Returns raw content + usage + cost. Retries on transient
|
|
152
|
-
* failures; does NOT degrade schema here — callers that want graceful
|
|
153
|
-
* degrade use `callLlmJson`.
|
|
154
|
-
*/
|
|
155
|
-
declare function callLlm(req: LlmCallRequest, opts?: LlmClientOptions): Promise<LlmCallResult>;
|
|
156
|
-
/**
|
|
157
|
-
* Structured-output call. Returns parsed JSON plus the raw result envelope.
|
|
158
|
-
* Degrades `jsonSchema` → `jsonMode` on a 400 that names the schema param —
|
|
159
|
-
* critical for deepseek-v3/v4, kimi-k2.6, and other models that don't accept
|
|
160
|
-
* the `response_format.json_schema` shape but DO accept `json_object`.
|
|
161
|
-
*/
|
|
162
|
-
declare function callLlmJson<T = unknown>(req: LlmCallRequest, opts?: LlmClientOptions): Promise<{
|
|
163
|
-
value: T;
|
|
164
|
-
result: LlmCallResult;
|
|
165
|
-
}>;
|
|
166
|
-
type LlmRouteAssertionReason = 'no_explicit_base_url' | 'base_url_blocked' | 'base_url_not_allowed' | 'no_auth' | 'wrong_provider';
|
|
167
|
-
declare class LlmRouteAssertionError extends CaptureIntegrityError {
|
|
168
|
-
readonly reason: LlmRouteAssertionReason;
|
|
169
|
-
readonly baseUrl: string;
|
|
170
|
-
constructor(message: string, reason: LlmRouteAssertionReason, baseUrl: string);
|
|
90
|
+
failContributesToScore?: boolean;
|
|
91
|
+
/** Optional per-layer wall-cap in ms. Honored by the verifier (AbortSignal). */
|
|
92
|
+
capMs?: number;
|
|
93
|
+
run: (ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
|
|
171
94
|
}
|
|
172
|
-
interface
|
|
173
|
-
|
|
174
|
-
* Throw if `opts.baseUrl` is undefined, i.e. the call would fall back to
|
|
175
|
-
* `DEFAULT_BASE_URL`. Set this for evaluation runs where silently using
|
|
176
|
-
* the public/free-tier router is a defect — the launch reviewer needs to
|
|
177
|
-
* know exactly which provider answered.
|
|
178
|
-
*/
|
|
179
|
-
requireExplicitBaseUrl?: boolean;
|
|
95
|
+
interface VerifyOptions<Env = unknown> {
|
|
96
|
+
env: Env;
|
|
180
97
|
/**
|
|
181
|
-
*
|
|
182
|
-
*
|
|
98
|
+
* Overall wall cap. Default: sum of layer capMs, or Infinity if any layer
|
|
99
|
+
* omits a cap. The verifier short-circuits remaining layers on overall cap.
|
|
183
100
|
*/
|
|
184
|
-
|
|
185
|
-
/**
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
101
|
+
overallCapMs?: number;
|
|
102
|
+
/** Called with each layer result as it completes. */
|
|
103
|
+
onLayer?: (result: LayerResult) => void;
|
|
104
|
+
}
|
|
105
|
+
interface VerificationReport {
|
|
106
|
+
layers: LayerResult[];
|
|
107
|
+
passCount: number;
|
|
108
|
+
failCount: number;
|
|
109
|
+
skippedCount: number;
|
|
110
|
+
errorCount: number;
|
|
111
|
+
/** True iff at least one scored layer ran AND every scored layer passed. */
|
|
112
|
+
allPass: boolean;
|
|
189
113
|
/**
|
|
190
|
-
*
|
|
191
|
-
*
|
|
114
|
+
* Weighted mean of `score` across contributing layers. 0 when no layers
|
|
115
|
+
* contributed. See {@link Layer.failContributesToScore} for fail semantics.
|
|
192
116
|
*/
|
|
193
|
-
|
|
117
|
+
blendedScore: number;
|
|
118
|
+
durationMs: number;
|
|
119
|
+
startedAt: string;
|
|
120
|
+
finishedAt: string;
|
|
194
121
|
}
|
|
195
122
|
/**
|
|
196
|
-
*
|
|
197
|
-
* the caller intends. Designed for the matrix-runner preflight: invoke
|
|
198
|
-
* once before any LLM call to catch misconfiguration before a sweep burns
|
|
199
|
-
* dollars on the wrong provider.
|
|
123
|
+
* Grade a semantic-concept-style judge result into a single layer status.
|
|
200
124
|
*
|
|
201
|
-
*
|
|
202
|
-
*
|
|
203
|
-
*/
|
|
204
|
-
declare function assertLlmRoute(opts: LlmClientOptions, req?: LlmRouteRequirements): void;
|
|
205
|
-
/**
|
|
206
|
-
* Probe whether a model is reachable. Returns latency + null error on
|
|
207
|
-
* success; `ok=false` + error message on any failure (HTTP, timeout,
|
|
208
|
-
* network, parse). Designed for sweep preflights — fail loud at the
|
|
209
|
-
* boundary before burning a 30-leaf run on a misconfigured router.
|
|
125
|
+
* Pass when overall score >= threshold AND no critical-severity concept gap.
|
|
126
|
+
* Fail otherwise. Use inside a `Layer.run` when wrapping a concept judge.
|
|
210
127
|
*
|
|
211
|
-
*
|
|
212
|
-
*
|
|
213
|
-
*
|
|
214
|
-
*
|
|
215
|
-
*/
|
|
216
|
-
declare function probeLlm(model: string, opts?: LlmClientOptions & {
|
|
217
|
-
timeoutMs?: number;
|
|
218
|
-
}): Promise<{
|
|
219
|
-
ok: boolean;
|
|
220
|
-
latencyMs: number;
|
|
221
|
-
error: string | null;
|
|
222
|
-
}>;
|
|
223
|
-
/**
|
|
224
|
-
* Stateful client — construct once with defaults, call many times.
|
|
225
|
-
* Thin wrapper around the free functions; exists for callers that want
|
|
226
|
-
* to inject a single configured instance into multiple primitives.
|
|
128
|
+
* Generalized from VerticalBench H3 fix: `failingConcepts.length === 0` was
|
|
129
|
+
* too strict — a single concept at 6/10 failed the entire layer despite
|
|
130
|
+
* overall score being >= 0.7. Now we trust the judge's own `severity` field:
|
|
131
|
+
* `critical` findings veto; `major`/`minor` reduce the score but don't veto.
|
|
227
132
|
*/
|
|
228
|
-
declare
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
result: LlmCallResult;
|
|
133
|
+
declare function gradeSemanticStatus(input: {
|
|
134
|
+
score: number;
|
|
135
|
+
findings: Array<{
|
|
136
|
+
severity: Severity;
|
|
137
|
+
present?: boolean;
|
|
138
|
+
score?: number;
|
|
235
139
|
}>;
|
|
140
|
+
available: boolean;
|
|
141
|
+
threshold?: number;
|
|
142
|
+
}): LayerStatus;
|
|
143
|
+
declare class MultiLayerVerifier<Env = unknown> {
|
|
144
|
+
private readonly layers;
|
|
145
|
+
constructor(layers: Layer<Env>[]);
|
|
146
|
+
run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
|
|
236
147
|
}
|
|
237
148
|
|
|
238
149
|
/**
|
|
@@ -608,4 +519,4 @@ declare class NoopResearcher implements Researcher {
|
|
|
608
519
|
evaluateChange(_plan: ExperimentPlan): Promise<ExperimentResult>;
|
|
609
520
|
}
|
|
610
521
|
|
|
611
|
-
export {
|
|
522
|
+
export { CallbackResearcher as C, type ExperimentPlan as E, type FailureMode as F, type Layer as L, MultiLayerVerifier as M, NoopResearcher as N, type Researcher as R, type SteeringChange as S, type VerificationReport as V, type ExperimentResult as a, type EvalCampaignResult as b, type EvalCampaignOptions as c, type Severity as d, type VerifyOptions as e, type LayerResult as f, type VerifyContext as g, type CallbackResearcherOptions as h, type CampaignFactoryParams as i, type CampaignIntegrityPolicy as j, type CampaignRunContext as k, type CampaignRunOutcome as l, type CampaignRunner as m, type CampaignScenario as n, type CampaignVariant as o, type FailedRun as p, type Finding as q, runEvalCampaign as r, type LayerStatus as s, gradeSemanticStatus as t };
|