@sanity/ailf 4.6.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/services/index.js +5 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
- package/dist/_vendor/ailf-core/types/confidence.js +7 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
- package/dist/_vendor/ailf-core/types/index.js +15 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +36 -0
- package/dist/composition-root.js +48 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +14 -0
- package/dist/grader/agent-harness.js +17 -0
- package/dist/grader/common.d.ts +17 -0
- package/dist/grader/common.js +21 -0
- package/dist/grader/index.d.ts +38 -0
- package/dist/grader/index.js +75 -0
- package/dist/grader/knowledge-probe.d.ts +14 -0
- package/dist/grader/knowledge-probe.js +18 -0
- package/dist/grader/literacy.d.ts +13 -0
- package/dist/grader/literacy.js +17 -0
- package/dist/grader/mcp.d.ts +14 -0
- package/dist/grader/mcp.js +18 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +6 -4
|
@@ -84,11 +84,24 @@ export function generateRunId() {
|
|
|
84
84
|
.toISOString()
|
|
85
85
|
.replace(/[-:]/g, "")
|
|
86
86
|
.replace(/\.\d{3}Z$/, "Z");
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
87
|
+
// Rejection-sample bytes against the largest multiple of 36 ≤ 256
|
|
88
|
+
// (252) before applying `% 36`. Naive `b % 36` over [0, 255] biases
|
|
89
|
+
// digits 0..3 (probability 8/256) over 4..35 (probability 7/256) by
|
|
90
|
+
// ~14% per character. Drawing fresh bytes whenever the buffer runs
|
|
91
|
+
// dry keeps the loop terminating with overwhelming probability
|
|
92
|
+
// (each byte is kept with probability 252/256 ≈ 98.4%).
|
|
93
|
+
const suffixChars = [];
|
|
94
|
+
while (suffixChars.length < 8) {
|
|
95
|
+
const buf = crypto.getRandomValues(new Uint8Array(8));
|
|
96
|
+
for (const b of buf) {
|
|
97
|
+
if (b >= 252)
|
|
98
|
+
continue; // reject biased range
|
|
99
|
+
suffixChars.push((b % 36).toString(36));
|
|
100
|
+
if (suffixChars.length === 8)
|
|
101
|
+
break;
|
|
102
|
+
}
|
|
91
103
|
}
|
|
104
|
+
const suffix = suffixChars.join("");
|
|
92
105
|
return `run_${ts}_${suffix}`;
|
|
93
106
|
}
|
|
94
107
|
/**
|
|
@@ -166,3 +179,66 @@ export function fixtureId(raw) {
|
|
|
166
179
|
}
|
|
167
180
|
return ok(raw);
|
|
168
181
|
}
|
|
182
|
+
/**
|
|
183
|
+
* Canonical shape for a `JudgmentId`.
|
|
184
|
+
*
|
|
185
|
+
* Two accepted forms:
|
|
186
|
+
* - `judgment_<runId-suffix>_<sanitized-task>__<sanitized-model>__<dimension>`
|
|
187
|
+
* — minted by `generateJudgmentId` for synthesized fall-back judgments
|
|
188
|
+
* so dedup is per-run (a re-run of the same task produces a distinct id).
|
|
189
|
+
* - `j_<alphanumeric>` (at least 4 characters; `_` and `-` are also accepted) — short form used by test fixtures and any caller
|
|
190
|
+
* that wants a stable, opaque id without the structured composite.
|
|
191
|
+
*
|
|
192
|
+
* The dimension segment is limited to lowercase alphanumerics and hyphens; task and model segments may also carry underscores, dots, and colons (the
|
|
193
|
+
* provider id surface is colon-separated). The full string is bounded to
|
|
194
|
+
* 256 characters to keep the id index-friendly downstream.
|
|
195
|
+
*/
|
|
196
|
+
const JUDGMENT_ID_RE = /^(?:judgment_[0-9a-z]{1,16}_[a-z0-9][a-z0-9.:_-]*__[a-z0-9][a-z0-9.:_-]*__[a-z0-9][a-z0-9-]*|j_[A-Za-z0-9_-]{4,})$/;
|
|
197
|
+
/**
|
|
198
|
+
* Parse a raw string into a `JudgmentId`.
|
|
199
|
+
*
|
|
200
|
+
* See `JUDGMENT_ID_RE` for the accepted formats.
|
|
201
|
+
*/
|
|
202
|
+
export function judgmentId(raw) {
|
|
203
|
+
if (raw.length === 0 || raw.length > 256 || !JUDGMENT_ID_RE.test(raw)) {
|
|
204
|
+
return err({
|
|
205
|
+
code: "INVALID_JUDGMENT_ID",
|
|
206
|
+
raw,
|
|
207
|
+
message: `Invalid JudgmentId "${raw}": must match judgment_<runSalt>_<task>__<model>__<dimension> or j_<alnum>`,
|
|
208
|
+
});
|
|
209
|
+
}
|
|
210
|
+
return ok(raw);
|
|
211
|
+
}
|
|
212
|
+
/** Lowercase a value and collapse anything outside `[a-z0-9.:_-]` into single hyphens, trimming leading/trailing hyphens. */
|
|
213
|
+
function sanitizeJudgmentSegment(value) {
|
|
214
|
+
// Lowercase + replace runs of characters outside [a-z0-9.:_-] with a single hyphen,
|
|
215
|
+
// trim leading/trailing hyphens. Keeps dots and colons in `modelId`-like
|
|
216
|
+
// values (`openai:gpt-5.2`) since the regex permits them.
|
|
217
|
+
return value
|
|
218
|
+
.toLowerCase()
|
|
219
|
+
.replace(/[^a-z0-9.:_-]+/g, "-")
|
|
220
|
+
.replace(/^-+|-+$/g, "");
|
|
221
|
+
}
|
|
222
|
+
/**
|
|
223
|
+
* Generate a deterministic `JudgmentId` for a synthesized fall-back
|
|
224
|
+
* judgment. Salting with `runId` (when supplied) makes the id unique
|
|
225
|
+
* per-run so consumers' `(taskId, modelId, dimension)` dedup key
|
|
226
|
+
* doesn't collide across re-runs of the same task — every run writes
|
|
227
|
+
* fresh ids that still encode the natural composite key.
|
|
228
|
+
*
|
|
229
|
+
* When `runId` is absent the salt collapses to `nosalt`, preserving the
|
|
230
|
+
* legacy "deterministic across runs" shape for callers that explicitly
|
|
231
|
+
* want it (e.g. unit tests that assert the exact id string).
|
|
232
|
+
*/
|
|
233
|
+
export function generateJudgmentId(input) {
|
|
234
|
+
// Take the trailing 8 chars of the runId (the random base36 suffix on
|
|
235
|
+
// the canonical shape) so the salt stays compact while still cycling
|
|
236
|
+
// every run. Falls back to a constant marker when runId isn't passed.
|
|
237
|
+
const runSalt = input.runId
|
|
238
|
+
? sanitizeJudgmentSegment(String(input.runId).slice(-8)) || "nosalt"
|
|
239
|
+
: "nosalt";
|
|
240
|
+
const task = sanitizeJudgmentSegment(input.taskId);
|
|
241
|
+
const model = sanitizeJudgmentSegment(input.modelId);
|
|
242
|
+
const dimension = sanitizeJudgmentSegment(input.dimension);
|
|
243
|
+
return `judgment_${runSalt}_${task}__${model}__${dimension}`;
|
|
244
|
+
}
|
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
* is an open tag (see `ConfidenceDerivation`). The list is the
|
|
20
20
|
* recommended starting set, not the universe.
|
|
21
21
|
*/
|
|
22
|
-
export declare const CONVENTIONAL_DERIVATIONS: readonly ["ensemble-stdev", "ceiling-cross-check", "regression-gate", "card-type-specific"];
|
|
22
|
+
export declare const CONVENTIONAL_DERIVATIONS: readonly ["ensemble-stdev", "ceiling-cross-check", "regression-gate", "card-type-specific", "synthesized-pre-cross-check"];
|
|
23
23
|
/**
|
|
24
24
|
* Tag identifying the formula used to derive `Confidence.level`.
|
|
25
25
|
*
|
|
@@ -24,6 +24,13 @@ export const CONVENTIONAL_DERIVATIONS = [
|
|
|
24
24
|
"ceiling-cross-check",
|
|
25
25
|
"regression-gate",
|
|
26
26
|
"card-type-specific",
|
|
27
|
+
// Sentinel placeholder used by the eval pipeline's
|
|
28
|
+
// `synthesizeUnparsedJudgment` fall-back. The validator
|
|
29
|
+
// (`validateGraderJudgmentsCalibration`) overwrites it with
|
|
30
|
+
// "ceiling-cross-check" before judgments leave the live pipeline,
|
|
31
|
+
// so it should not appear on stored reports — the literal is in
|
|
32
|
+
// this list so a leaked sentinel is greppable.
|
|
33
|
+
"synthesized-pre-cross-check",
|
|
27
34
|
];
|
|
28
35
|
/**
|
|
29
36
|
* Structural type guard for `Confidence`. Verifies the runtime shape
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Diagnosis core domain types — canonical shapes for the post-run
|
|
3
|
+
* synthesis layer (Doc 05).
|
|
4
|
+
*
|
|
5
|
+
* `Diagnosis.inputs` is the four-version cache envelope (VER-01); any
|
|
6
|
+
* segment bump invalidates a cached Diagnosis. `DiagnosisCard` is an
|
|
7
|
+
* outer-`status` discriminated union with a nested `cardType`
|
|
8
|
+
* discriminator inside the `ready` variant.
|
|
9
|
+
*
|
|
10
|
+
* Phase 1 lands placeholder body shapes; Phase 5 enriches each per
|
|
11
|
+
* Doc 05 specs.
|
|
12
|
+
*
|
|
13
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
14
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
15
|
+
* @see docs/decisions/D0050-artifact-registry-post-hoc-versioned-extension.md
|
|
16
|
+
*/
|
|
17
|
+
import type { RunId } from "./branded-ids.js";
|
|
18
|
+
import type { ReportId } from "./index.js";
|
|
19
|
+
/**
|
|
20
|
+
* The four-version cache envelope. Every cached `Diagnosis` carries the
|
|
21
|
+
* versions of the inputs that produced it; any bump in any segment
|
|
22
|
+
* invalidates the cache (cross-package contract test asserts this).
|
|
23
|
+
*
|
|
24
|
+
* Strings everywhere; semver convention by humans where authored
|
|
25
|
+
* manually. No branding, no tuples, no content-hash typing — keeps the
|
|
26
|
+
* envelope trivially serializable + greppable.
|
|
27
|
+
*/
|
|
28
|
+
export interface VersionedInputs {
|
|
29
|
+
graderJudgmentsVersion: string;
|
|
30
|
+
ensembleVersion: string;
|
|
31
|
+
diagnosisVersion: string;
|
|
32
|
+
cardVersion: string;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* The 8 ready-card archetypes. Phase 5 cards register against these
|
|
36
|
+
* literals; the slim-shape boundary in Phase 7 reads them to render the
|
|
37
|
+
* Studio diagnosis renderer.
|
|
38
|
+
*/
|
|
39
|
+
export type CardType = "area-summary" | "failure-mode-summary" | "no-issues" | "top-recommendations" | "weakest-area" | "low-confidence-attribution" | "doc-attribution-spotlight" | "regression-vs-baseline";
|
|
40
|
+
/**
|
|
41
|
+
* Per-card telemetry envelope. `cardVersion` here is the per-card
|
|
42
|
+
* version (e.g. `"area-summary@0.1.0"`), not the compound. Drives
|
|
43
|
+
* DIAG-06 cost telemetry in Phase 6.
|
|
44
|
+
*/
|
|
45
|
+
export interface CardMeta {
|
|
46
|
+
cardVersion: string;
|
|
47
|
+
tokenUsage?: {
|
|
48
|
+
input: number;
|
|
49
|
+
output: number;
|
|
50
|
+
};
|
|
51
|
+
latencyMs?: number;
|
|
52
|
+
/** ISO 8601 UTC timestamp. */
|
|
53
|
+
generatedAt: string;
|
|
54
|
+
}
|
|
55
|
+
/**
|
|
56
|
+
* A single actionable suggestion surfaced by a recommendations card.
|
|
57
|
+
* The full Phase 5 shape may add fields (per Doc 05 specs); Phase 1
|
|
58
|
+
* locks the minimum required surface.
|
|
59
|
+
*/
|
|
60
|
+
export interface ActionSuggestion {
|
|
61
|
+
title: string;
|
|
62
|
+
body: string;
|
|
63
|
+
priority: "high" | "medium" | "low";
|
|
64
|
+
}
|
|
65
|
+
/**
|
|
66
|
+
* Phase 1 body placeholders. Each shape is intentionally minimal; Phase 5
|
|
67
|
+
* card files enrich them per Doc 05 specs and assert
|
|
68
|
+
* `satisfies z.ZodType<Extract<DiagnosisCard, { status: "ready"; cardType: "X" }>["body"]>`
|
|
69
|
+
* against these declarations.
|
|
70
|
+
*/
|
|
71
|
+
export interface AreaSummaryBody {
|
|
72
|
+
summary: string;
|
|
73
|
+
}
|
|
74
|
+
export interface FailureModeSummaryBody {
|
|
75
|
+
summary: string;
|
|
76
|
+
}
|
|
77
|
+
export interface NoIssuesBody {
|
|
78
|
+
summary: string;
|
|
79
|
+
}
|
|
80
|
+
export interface TopRecommendationsBody {
|
|
81
|
+
summary: string;
|
|
82
|
+
suggestions: ActionSuggestion[];
|
|
83
|
+
}
|
|
84
|
+
export interface WeakestAreaBody {
|
|
85
|
+
summary: string;
|
|
86
|
+
}
|
|
87
|
+
export interface LowConfidenceAttributionBody {
|
|
88
|
+
summary: string;
|
|
89
|
+
}
|
|
90
|
+
export interface DocAttributionSpotlightBody {
|
|
91
|
+
summary: string;
|
|
92
|
+
}
|
|
93
|
+
export interface RegressionVsBaselineBody {
|
|
94
|
+
summary: string;
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Outer-`status` discriminated union: 8 ready variants (one per
|
|
98
|
+
* `cardType`, each carrying its per-cardType body), plus a `degraded`
|
|
99
|
+
* variant (parse failed or downgraded by the runner) and a `missing`
|
|
100
|
+
* variant (card not produced for this run).
|
|
101
|
+
*
|
|
102
|
+
* No `not-yet-generated` variant — old-report fallback is a Phase 7
|
|
103
|
+
* concern at the slim-shape boundary, handled at fetch-time, not in
|
|
104
|
+
* `DiagnosisCard` itself.
|
|
105
|
+
*/
|
|
106
|
+
export type DiagnosisCard = {
|
|
107
|
+
status: "ready";
|
|
108
|
+
cardType: "area-summary";
|
|
109
|
+
body: AreaSummaryBody;
|
|
110
|
+
meta: CardMeta;
|
|
111
|
+
} | {
|
|
112
|
+
status: "ready";
|
|
113
|
+
cardType: "failure-mode-summary";
|
|
114
|
+
body: FailureModeSummaryBody;
|
|
115
|
+
meta: CardMeta;
|
|
116
|
+
} | {
|
|
117
|
+
status: "ready";
|
|
118
|
+
cardType: "no-issues";
|
|
119
|
+
body: NoIssuesBody;
|
|
120
|
+
meta: CardMeta;
|
|
121
|
+
} | {
|
|
122
|
+
status: "ready";
|
|
123
|
+
cardType: "top-recommendations";
|
|
124
|
+
body: TopRecommendationsBody;
|
|
125
|
+
meta: CardMeta;
|
|
126
|
+
} | {
|
|
127
|
+
status: "ready";
|
|
128
|
+
cardType: "weakest-area";
|
|
129
|
+
body: WeakestAreaBody;
|
|
130
|
+
meta: CardMeta;
|
|
131
|
+
} | {
|
|
132
|
+
status: "ready";
|
|
133
|
+
cardType: "low-confidence-attribution";
|
|
134
|
+
body: LowConfidenceAttributionBody;
|
|
135
|
+
meta: CardMeta;
|
|
136
|
+
} | {
|
|
137
|
+
status: "ready";
|
|
138
|
+
cardType: "doc-attribution-spotlight";
|
|
139
|
+
body: DocAttributionSpotlightBody;
|
|
140
|
+
meta: CardMeta;
|
|
141
|
+
} | {
|
|
142
|
+
status: "ready";
|
|
143
|
+
cardType: "regression-vs-baseline";
|
|
144
|
+
body: RegressionVsBaselineBody;
|
|
145
|
+
meta: CardMeta;
|
|
146
|
+
} | {
|
|
147
|
+
status: "degraded";
|
|
148
|
+
cardType: CardType;
|
|
149
|
+
reason: string;
|
|
150
|
+
parseFailed: boolean;
|
|
151
|
+
meta: CardMeta;
|
|
152
|
+
} | {
|
|
153
|
+
status: "missing";
|
|
154
|
+
cardType: CardType;
|
|
155
|
+
reason: string;
|
|
156
|
+
};
|
|
157
|
+
/**
|
|
158
|
+
* The post-run synthesis aggregate. Consumed by Phase 5 (runner +
|
|
159
|
+
* cards), Phase 6 (CLI) and Phase 7 (Studio). Phase 1 lands the
|
|
160
|
+
* declarative shape; runtime construction lands in Phase 5.
|
|
161
|
+
*/
|
|
162
|
+
export interface Diagnosis {
|
|
163
|
+
runId: RunId;
|
|
164
|
+
reportId: ReportId;
|
|
165
|
+
inputs: VersionedInputs;
|
|
166
|
+
cards: DiagnosisCard[];
|
|
167
|
+
/** ISO 8601 UTC timestamp. */
|
|
168
|
+
generatedAt: string;
|
|
169
|
+
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Diagnosis core domain types — canonical shapes for the post-run
|
|
3
|
+
* synthesis layer (Doc 05).
|
|
4
|
+
*
|
|
5
|
+
* `Diagnosis.inputs` is the four-version cache envelope (VER-01); any
|
|
6
|
+
* segment bump invalidates a cached Diagnosis. `DiagnosisCard` is an
|
|
7
|
+
* outer-`status` discriminated union with a nested `cardType`
|
|
8
|
+
* discriminator inside the `ready` variant.
|
|
9
|
+
*
|
|
10
|
+
* Phase 1 lands placeholder body shapes; Phase 5 enriches each per
|
|
11
|
+
* Doc 05 specs.
|
|
12
|
+
*
|
|
13
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
14
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
15
|
+
* @see docs/decisions/D0050-artifact-registry-post-hoc-versioned-extension.md
|
|
16
|
+
*/
|
|
17
|
+
export {};
|
|
@@ -53,11 +53,26 @@ export interface PerspectiveDocRef {
|
|
|
53
53
|
perspective: string;
|
|
54
54
|
reason?: string;
|
|
55
55
|
}
|
|
56
|
+
/**
|
|
57
|
+
* A single criterion within a templated llm-rubric assertion.
|
|
58
|
+
*
|
|
59
|
+
* The `id` is a stable, slug-formatted identifier — auto-derived from
|
|
60
|
+
* `text` in Studio (via slug `options.source: "text"`), or backfilled
|
|
61
|
+
* from Sanity's `_key` for pre-migration documents. Survives criterion
|
|
62
|
+
* text edits; downstream judgments and diagnosis cards reference by
|
|
63
|
+
* `id` per D0052 (judgment-ref granularity).
|
|
64
|
+
*/
|
|
65
|
+
export interface CriterionRef {
|
|
66
|
+
/** Stable per-criterion identifier — slug-format (`[a-z0-9][a-z0-9-]*`). */
|
|
67
|
+
id: string;
|
|
68
|
+
/** Author-facing criterion text (the original bullet). */
|
|
69
|
+
text: string;
|
|
70
|
+
}
|
|
56
71
|
/** A templated assertion referencing a rubric template */
|
|
57
72
|
export interface GeneralizedTemplatedAssertion {
|
|
58
73
|
type: "llm-rubric";
|
|
59
74
|
template: string;
|
|
60
|
-
criteria:
|
|
75
|
+
criteria: CriterionRef[];
|
|
61
76
|
weight?: number;
|
|
62
77
|
}
|
|
63
78
|
/** A value-based assertion (contains, javascript, cost, latency, etc.) */
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GraderJudgment core domain types — canonical shapes for structured
|
|
3
|
+
* grader output (Doc 03, GRAD-02).
|
|
4
|
+
*
|
|
5
|
+
* Authored INDEPENDENTLY of any Zod schema (D0045 doctrine — Plan 02's
|
|
6
|
+
* `GraderJudgmentSchema` `satisfies` against this type, not the other
|
|
7
|
+
* way around). A tautological `satisfies z.ZodType<z.infer<typeof
|
|
8
|
+
* GraderJudgmentSchema>>` is forbidden.
|
|
9
|
+
*
|
|
10
|
+
* Phase 1 retained the existing pipeline core (`taskId`, `modelId`,
|
|
11
|
+
* `dimension`, `reason`, `score`, `outputFailure?`) as required for
|
|
12
|
+
* backward compat with Phase 0 callers (Doc 03 §"existing, unchanged")
|
|
13
|
+
* and added the GRAD-02 additive fields (`judgmentId`, `subJudgments`,
|
|
14
|
+
* `docCitations`, `failureMode`, `confidence`,
|
|
15
|
+
* `hallucinationCheckedAgainst`, `metadata`) as additive in Phase 1;
|
|
16
|
+
* required from Phase 3 GRAD-05.
|
|
17
|
+
*
|
|
18
|
+
* Phase 3 GRAD-05 has flipped the additive fields to required (this
|
|
19
|
+
* file) and the corresponding Zod schema in
|
|
20
|
+
* `packages/eval/src/adapters/grader-outputs/promptfoo-grader-output.ts`
|
|
21
|
+
* is `.strict()` with `graderJudgmentsVersion = "1.0.0"`. The
|
|
22
|
+
* read-only legacy parser at `…/legacy/promptfoo-grader-output-legacy.ts`
|
|
23
|
+
* (against `LegacyGraderJudgment`) is the named consumer for already-
|
|
24
|
+
* stored historical reports through GRAD-06 cutover.
|
|
25
|
+
*
|
|
26
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
27
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
28
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
29
|
+
*/
|
|
30
|
+
import type { JudgmentId } from "./branded-ids.js";
|
|
31
|
+
import type { Confidence } from "./confidence.js";
|
|
32
|
+
/**
|
|
33
|
+
* Role enum for doc citations attached to a grader judgment (GRAD-02).
|
|
34
|
+
* Closed string-literal union — Phase 3 may extend.
|
|
35
|
+
*/
|
|
36
|
+
export type DocCitationRole = "supports" | "contradicts" | "missing" | "irrelevant";
|
|
37
|
+
/**
|
|
38
|
+
* A single doc the grader cited while reasoning. `documentId` is the
|
|
39
|
+
* canonical D0052 reference (id, not slug); `slug` is a human-readable
|
|
40
|
+
* annotation only. `hallucinated` is set true at adapter time when the
|
|
41
|
+
* `slug` does not resolve against the task's `contextDocs` set.
|
|
42
|
+
*/
|
|
43
|
+
export interface DocCitation {
|
|
44
|
+
/** Canonical D0052 document ref (id, not slug). */
|
|
45
|
+
documentId: string;
|
|
46
|
+
/** Optional human-readable annotation. Never the identity. */
|
|
47
|
+
slug?: string;
|
|
48
|
+
role: DocCitationRole;
|
|
49
|
+
/** True when `slug` is not in the resolvable-set. */
|
|
50
|
+
hallucinated?: boolean;
|
|
51
|
+
}
|
|
52
|
+
/**
|
|
53
|
+
* Per-criterion sub-judgment — one entry per task-criterion bullet
|
|
54
|
+
* (Doc 03 §"per-criterion sub-judgments"). The `criterionId` is the
|
|
55
|
+
* stable identifier declared on the task's `criteria` array (Phase 2
|
|
56
|
+
* GRAD-01 schema-sync), not synthesized at grade time.
|
|
57
|
+
*/
|
|
58
|
+
export interface CriterionSubJudgment {
|
|
59
|
+
/** Stable criterion identifier — matches `CriterionRef.id` from the task definition (D0052). */
|
|
60
|
+
criterionId: string;
|
|
61
|
+
met: boolean;
|
|
62
|
+
/** ≤280 chars — quote or paraphrase. */
|
|
63
|
+
evidence: string;
|
|
64
|
+
/** Grader self-confidence on this single criterion (D0049). */
|
|
65
|
+
confidence: Confidence;
|
|
66
|
+
}
|
|
67
|
+
/**
|
|
68
|
+
* The structured grader judgment — Phase 3 GRAD-05 shape.
|
|
69
|
+
*
|
|
70
|
+
* Existing pipeline core (Doc 03 §"existing, unchanged"): `taskId`,
|
|
71
|
+
* `modelId`, `dimension`, `reason`, `score`. The pre-existing
|
|
72
|
+
* `outputFailure?` remains optional. The
|
|
73
|
+
* `contextDocs? (legacy alias: canonicalDocs)` annotation (StoredJudgment
|
|
74
|
+
* extension) lives on the storage extension type, not here.
|
|
75
|
+
*
|
|
76
|
+
* Additive in Phase 1; required from Phase 3 GRAD-05: `judgmentId`,
|
|
77
|
+
* `subJudgments`, `docCitations`, `failureMode`, `confidence`,
|
|
78
|
+
* `hallucinationCheckedAgainst`, `metadata`. The corresponding Zod
|
|
79
|
+
* schema in `packages/eval/src/adapters/grader-outputs/promptfoo-grader-output.ts`
|
|
80
|
+
* is `.strict()` with `graderJudgmentsVersion = "1.0.0"`.
|
|
81
|
+
*/
|
|
82
|
+
export interface GraderJudgment {
|
|
83
|
+
/** Rubric template name (e.g. "task-completion", "code-correctness"). */
|
|
84
|
+
dimension: string;
|
|
85
|
+
/** The model that produced the response being graded. */
|
|
86
|
+
modelId: string;
|
|
87
|
+
/**
|
|
88
|
+
* True when the model failed to produce meaningful output (empty
|
|
89
|
+
* response, API error, or refusal). Distinguishes infrastructure
|
|
90
|
+
* failures from genuinely incorrect responses — a score of 0 from no
|
|
91
|
+
* output is fundamentally different from a score of 0 from wrong
|
|
92
|
+
* output.
|
|
93
|
+
*/
|
|
94
|
+
outputFailure?: boolean;
|
|
95
|
+
/** The grader's natural-language reasoning. */
|
|
96
|
+
reason: string;
|
|
97
|
+
/** Numeric score in [0, 100] (normalized). */
|
|
98
|
+
score: number;
|
|
99
|
+
/** The task this judgment belongs to. */
|
|
100
|
+
taskId: string;
|
|
101
|
+
/**
|
|
102
|
+
* D0052 granular branded id. Required from Phase 3 GRAD-05 — every
|
|
103
|
+
* grader emission carries one.
|
|
104
|
+
*/
|
|
105
|
+
judgmentId: JudgmentId;
|
|
106
|
+
/** Per-criterion sub-judgments. */
|
|
107
|
+
subJudgments: CriterionSubJudgment[];
|
|
108
|
+
/** Doc citations with role + hallucinated flag. */
|
|
109
|
+
docCitations: DocCitation[];
|
|
110
|
+
/**
|
|
111
|
+
* Per-dimension failure mode. Phase 3 GRAD-03 stamps the taxonomy
|
|
112
|
+
* literal at the runtime grader-prompt; the value is a free-form
|
|
113
|
+
* string for forward compat with future taxonomy extensions.
|
|
114
|
+
*/
|
|
115
|
+
failureMode: string;
|
|
116
|
+
/** Grader self-confidence per D0049. */
|
|
117
|
+
confidence: Confidence;
|
|
118
|
+
/** Hallucination cross-check (Pitfall #11) — union of task.context.docs and run.documentManifest. */
|
|
119
|
+
hallucinationCheckedAgainst: string[];
|
|
120
|
+
/** Metadata about the grader run. */
|
|
121
|
+
metadata: {
|
|
122
|
+
graderModel: string;
|
|
123
|
+
graderJudgmentsVersion: string;
|
|
124
|
+
};
|
|
125
|
+
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GraderJudgment core domain types — canonical shapes for structured
|
|
3
|
+
* grader output (Doc 03, GRAD-02).
|
|
4
|
+
*
|
|
5
|
+
* Authored INDEPENDENTLY of any Zod schema (D0045 doctrine — Plan 02's
|
|
6
|
+
* `GraderJudgmentSchema` `satisfies` against this type, not the other
|
|
7
|
+
* way around). A tautological `satisfies z.ZodType<z.infer<typeof
|
|
8
|
+
* GraderJudgmentSchema>>` is forbidden.
|
|
9
|
+
*
|
|
10
|
+
* Phase 1 retained the existing pipeline core (`taskId`, `modelId`,
|
|
11
|
+
* `dimension`, `reason`, `score`, `outputFailure?`) as required for
|
|
12
|
+
* backward compat with Phase 0 callers (Doc 03 §"existing, unchanged")
|
|
13
|
+
* and added the GRAD-02 additive fields (`judgmentId`, `subJudgments`,
|
|
14
|
+
* `docCitations`, `failureMode`, `confidence`,
|
|
15
|
+
* `hallucinationCheckedAgainst`, `metadata`) as additive in Phase 1;
|
|
16
|
+
* required from Phase 3 GRAD-05.
|
|
17
|
+
*
|
|
18
|
+
* Phase 3 GRAD-05 has flipped the additive fields to required (this
|
|
19
|
+
* file) and the corresponding Zod schema in
|
|
20
|
+
* `packages/eval/src/adapters/grader-outputs/promptfoo-grader-output.ts`
|
|
21
|
+
* is `.strict()` with `graderJudgmentsVersion = "1.0.0"`. The
|
|
22
|
+
* read-only legacy parser at `…/legacy/promptfoo-grader-output-legacy.ts`
|
|
23
|
+
* (against `LegacyGraderJudgment`) is the named consumer for already-
|
|
24
|
+
* stored historical reports through GRAD-06 cutover.
|
|
25
|
+
*
|
|
26
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
27
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
28
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
29
|
+
*/
|
|
30
|
+
export {};
|
|
@@ -13,6 +13,7 @@ import type { DocumentRef as _DocumentRef, EvalMode, RunContext } from "../../ai
|
|
|
13
13
|
import type { ArtifactType } from "../artifact-registry.js";
|
|
14
14
|
import type { SymbolPreflightReport } from "./symbol-preflight-report.js";
|
|
15
15
|
import type { AssociationValues, RunId } from "./branded-ids.js";
|
|
16
|
+
import type { GraderJudgment } from "./grader-judgment.js";
|
|
16
17
|
export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "./scoring-input.js";
|
|
17
18
|
export type { DocumentRef, RunContext, RunTrigger } from "../../ailf-shared/index.d.ts";
|
|
18
19
|
export type { StoredBaseline, StoredReport, StoredRun, StoredTaskResult, StoredTrace, SchemaVersioned, } from "./storage-schema.js";
|
|
@@ -32,9 +33,13 @@ export type { SymbolPreflightDeduction, SymbolPreflightFinding, SymbolPreflightR
|
|
|
32
33
|
export { DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT, type PreflightRubricContext, type PreflightScoringConfig, } from "./preflight-scoring.js";
|
|
33
34
|
export type { Confidence, ConfidenceDerivation } from "./confidence.js";
|
|
34
35
|
export { CONVENTIONAL_DERIVATIONS, isConfidence } from "./confidence.js";
|
|
35
|
-
export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, Err, FixtureId, IdValidationError, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
|
|
36
|
-
export { err, fixtureId, generateRunId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
37
|
-
export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLakeAuthorableTask, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PromptVars, PathDocRef, PerspectiveDocRef, ReservedPromptVarKey, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
|
|
36
|
+
export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, Err, FixtureId, IdValidationError, JudgmentId, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
|
|
37
|
+
export { err, fixtureId, generateJudgmentId, generateRunId, judgmentId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
38
|
+
export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLakeAuthorableTask, CriterionRef, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PromptVars, PathDocRef, PerspectiveDocRef, ReservedPromptVarKey, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
|
|
39
|
+
export type { ActionSuggestion, AreaSummaryBody, CardMeta, CardType, Diagnosis, DiagnosisCard, DocAttributionSpotlightBody, FailureModeSummaryBody, LowConfidenceAttributionBody, NoIssuesBody, RegressionVsBaselineBody, TopRecommendationsBody, VersionedInputs, WeakestAreaBody, } from "./diagnosis.js";
|
|
40
|
+
export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "./attribution.js";
|
|
41
|
+
export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderJudgment, } from "./grader-judgment.js";
|
|
42
|
+
export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
|
|
38
43
|
type DocumentRef = _DocumentRef;
|
|
39
44
|
/** Aggregated retrieval metrics for a feature area */
|
|
40
45
|
export interface AreaRetrievalMetrics {
|
|
@@ -128,8 +133,31 @@ export interface FailureModeReport {
|
|
|
128
133
|
/** Total judgments analyzed */
|
|
129
134
|
totalJudgments: number;
|
|
130
135
|
}
|
|
131
|
-
/**
|
|
132
|
-
|
|
136
|
+
/**
|
|
137
|
+
* Failure mode classification for a low-scoring judgment.
|
|
138
|
+
*
|
|
139
|
+
* Open-set string: Plan 03-02 per-dimension taxonomies introduce modes
|
|
140
|
+
* outside the original literacy enum (`false-floor`, `spec-mismatch`,
|
|
141
|
+
* `tool-misuse`, `factual-error`, `hallucination`, etc.), and the grader
|
|
142
|
+
* is told these are legal answers via the rubric prompt. The legacy
|
|
143
|
+
* literacy enum survives as `LegacyFailureModeType` for the
|
|
144
|
+
* report-aggregation helpers that need stable bucket ordering and
|
|
145
|
+
* icon tables; consumers that only care about presence/absence treat
|
|
146
|
+
* `FailureModeType` as `string`.
|
|
147
|
+
*/
|
|
148
|
+
export type FailureModeType = string;
|
|
149
|
+
/**
|
|
150
|
+
* Closed string-literal union of the original literacy failure modes — used by the
|
|
151
|
+
* report formatters that iterate buckets in a stable order. Adding to
|
|
152
|
+
* this list is a deliberate extension; modes outside it still flow
|
|
153
|
+
* through the report (per-area `modes` record), just without a
|
|
154
|
+
* pre-allocated bucket in `summary`.
|
|
155
|
+
*/
|
|
156
|
+
export type LegacyFailureModeType = "api-error" | "incorrect-docs" | "missing-docs" | "model-limitation" | "outdated-docs" | "poor-structure" | "unclassified";
|
|
157
|
+
/** Read-only array (not a `Set`) of the canonical legacy modes — exported for report-formatter use. */
|
|
158
|
+
export declare const LEGACY_FAILURE_MODES: readonly LegacyFailureModeType[];
|
|
159
|
+
/** Type guard: narrows `mode` to `LegacyFailureModeType` when it is one of the entries in `LEGACY_FAILURE_MODES`. */
|
|
160
|
+
export declare function isLegacyFailureMode(mode: string): mode is LegacyFailureModeType;
|
|
133
161
|
/** Per-feature-area score breakdown */
|
|
134
162
|
export interface FeatureScore {
|
|
135
163
|
/**
|
|
@@ -261,30 +289,16 @@ export interface GapEstimate {
|
|
|
261
289
|
/** Specific remediation description */
|
|
262
290
|
remediation: string;
|
|
263
291
|
}
|
|
264
|
-
/**
|
|
265
|
-
export interface GraderJudgment {
|
|
266
|
-
/** The rubric template used (task-completion, code-correctness, doc-coverage) */
|
|
267
|
-
dimension: string;
|
|
268
|
-
/** The model that produced the response being graded */
|
|
269
|
-
modelId: string;
|
|
292
|
+
/** Enriched grader judgment with stored documentation context refs. */
|
|
293
|
+
export interface StoredJudgment extends GraderJudgment {
|
|
270
294
|
/**
|
|
271
|
-
*
|
|
272
|
-
*
|
|
273
|
-
*
|
|
274
|
-
*
|
|
295
|
+
* Documentation context the task expected the model to use.
|
|
296
|
+
*
|
|
297
|
+
* Legacy alias `canonicalDocs` may appear on stored reports written
|
|
298
|
+
* before Phase 2 — readers should tolerate both. Writers (the pipeline)
|
|
299
|
+
* always emit `contextDocs`.
|
|
275
300
|
*/
|
|
276
|
-
|
|
277
|
-
/** The grader's natural language reasoning */
|
|
278
|
-
reason: string;
|
|
279
|
-
/** The numeric score (0–100) */
|
|
280
|
-
score: number;
|
|
281
|
-
/** The task this judgment belongs to */
|
|
282
|
-
taskId: string;
|
|
283
|
-
}
|
|
284
|
-
/** Enriched grader judgment with canonical doc references, stored in reports */
|
|
285
|
-
export interface StoredJudgment extends GraderJudgment {
|
|
286
|
-
/** Canonical docs that the task expected the model to use */
|
|
287
|
-
canonicalDocs?: DocumentRef[];
|
|
301
|
+
contextDocs?: DocumentRef[];
|
|
288
302
|
}
|
|
289
303
|
/**
|
|
290
304
|
* Per-test result stored in reports for drill-down and audit.
|
|
@@ -296,8 +310,11 @@ export interface StoredJudgment extends GraderJudgment {
|
|
|
296
310
|
export interface StoredTestResult {
|
|
297
311
|
/** Resolved feature area (from __featureArea or description) */
|
|
298
312
|
area: string;
|
|
299
|
-
/**
|
|
300
|
-
|
|
313
|
+
/**
|
|
314
|
+
* Documentation context the task expected the model to use.
|
|
315
|
+
* Legacy alias `canonicalDocs` may appear on pre-Phase-2 reports.
|
|
316
|
+
*/
|
|
317
|
+
contextDocs?: DocumentRef[];
|
|
301
318
|
/** Weighted composite score (gold variant only) */
|
|
302
319
|
compositeScore?: number;
|
|
303
320
|
/** Per-test cost (USD) */
|
|
@@ -349,6 +366,40 @@ export interface StoredTestResult {
|
|
|
349
366
|
}
|
|
350
367
|
/** Grader consistency diagnostics — does not affect scores, reported alongside */
|
|
351
368
|
export interface GraderReliability {
|
|
369
|
+
/**
|
|
370
|
+
* Plan 03-03 — count of grader-emission vs ceiling cross-check disagreements.
|
|
371
|
+
*
|
|
372
|
+
* Incremented by the live pipeline when `validateFailureMode(...)` returns
|
|
373
|
+
* `level: "medium"` (the grader's emitted `failureMode` does not agree with
|
|
374
|
+
* the ceiling-decomposition mode). Surfaces calibration drift over time
|
|
375
|
+
* without affecting scores. Optional — undefined when the run did not
|
|
376
|
+
* exercise the failure-mode validator (e.g., grader-consistency-only paths).
|
|
377
|
+
*
|
|
378
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
379
|
+
*/
|
|
380
|
+
failureModeCalibration?: number;
|
|
381
|
+
/**
|
|
382
|
+
* Plan 03-03 — count of strict-schema parse failures during grader-output
|
|
383
|
+
* extraction. Wired at the parse-fail branch in `extractGraderJudgments`;
|
|
384
|
+
* incremented when `GraderJudgmentSchema.safeParse` rejects a payload and
|
|
385
|
+
* the pipeline drops to the Phase 1 minimal-shape fallback.
|
|
386
|
+
*
|
|
387
|
+
* Plan 03-04 will tighten the strict schema (`.strict()` + GRAD-02 fields
|
|
388
|
+
* required) and graders will emit the structured wire format in earnest;
|
|
389
|
+
* this counter measures pre-hard-fail drift.
|
|
390
|
+
*/
|
|
391
|
+
parseFailures?: number;
|
|
392
|
+
/**
|
|
393
|
+
* Phase 4 ATTR-01 — count of grader citations whose `slug` was not
|
|
394
|
+
* in the resolvable-set (`hallucinationCheckedAgainst`).
|
|
395
|
+
* Incremented by `computeJudgmentAttribution(...)` for every
|
|
396
|
+
* citation that fails the hallucination short-circuit (Success
|
|
397
|
+
* Criterion #5). A counter, not a ratio — consumers compute the
|
|
398
|
+
* rate by dividing by total-citations if needed.
|
|
399
|
+
*
|
|
400
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
401
|
+
*/
|
|
402
|
+
hallucinationCount?: number;
|
|
352
403
|
/** Inter-grader agreement (from multi-grader comparison) — Phase 3 */
|
|
353
404
|
agreement?: {
|
|
354
405
|
/** Models compared against the primary grader */
|
|
@@ -18,7 +18,21 @@ export { InMemoryPluginRegistry } from "./plugin-registry.js";
|
|
|
18
18
|
export { evalModeType } from "./eval-mode-config.js";
|
|
19
19
|
export { DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT, } from "./preflight-scoring.js";
|
|
20
20
|
export { CONVENTIONAL_DERIVATIONS, isConfidence } from "./confidence.js";
|
|
21
|
-
export { err, fixtureId, generateRunId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
21
|
+
export { err, fixtureId, generateJudgmentId, generateRunId, judgmentId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
22
|
+
/**
 * Canonical legacy failure modes — exported for report-formatter use.
 *
 * This is an ordered array, not a `Set`. It is frozen so the runtime value
 * honours the `readonly` contract published in the accompanying declaration
 * file: an importer cannot add, remove, or reorder entries and thereby change
 * what `isLegacyFailureMode` accepts for every other consumer.
 */
export const LEGACY_FAILURE_MODES = Object.freeze([
    "api-error",
    "incorrect-docs",
    "missing-docs",
    "model-limitation",
    "outdated-docs",
    "poor-structure",
    "unclassified",
]);
/**
 * Type guard for legacy modes.
 *
 * @param mode - Failure-mode string to test.
 * @returns `true` when `mode` is one of the entries in `LEGACY_FAILURE_MODES`,
 *   `false` for any other value.
 */
export function isLegacyFailureMode(mode) {
    return LEGACY_FAILURE_MODES.includes(mode);
}
|
|
22
36
|
// ---------------------------------------------------------------------------
|
|
23
37
|
// Comparison (Approach 2: structured comparison output)
|
|
24
38
|
// ---------------------------------------------------------------------------
|