@sanity/ailf 4.5.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +197 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +419 -5
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/ports/context.d.ts +26 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/llm-client.d.ts +112 -0
- package/dist/_vendor/ailf-core/ports/llm-client.js +68 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/services/index.js +5 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +68 -0
- package/dist/_vendor/ailf-core/types/confidence.js +56 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +82 -29
- package/dist/_vendor/ailf-core/types/index.js +16 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/llm/anthropic-llm-client.d.ts +48 -0
- package/dist/adapters/llm/anthropic-llm-client.js +205 -0
- package/dist/adapters/llm/fake-llm-client.d.ts +49 -0
- package/dist/adapters/llm/fake-llm-client.js +63 -0
- package/dist/adapters/llm/index.d.ts +9 -0
- package/dist/adapters/llm/index.js +4 -0
- package/dist/adapters/llm/openai-llm-client.d.ts +44 -0
- package/dist/adapters/llm/openai-llm-client.js +168 -0
- package/dist/adapters/llm/pricing.d.ts +12 -0
- package/dist/adapters/llm/pricing.js +8 -0
- package/dist/adapters/llm/retry.d.ts +56 -0
- package/dist/adapters/llm/retry.js +66 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +90 -22
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/artifact-capture/api-gateway-artifact-writer.js +2 -1
- package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +2 -1
- package/dist/artifact-capture/gcs-artifact-writer.js +3 -1
- package/dist/artifact-capture/local-fs-artifact-writer.js +3 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +59 -1
- package/dist/composition-root.js +95 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +14 -0
- package/dist/grader/agent-harness.js +17 -0
- package/dist/grader/common.d.ts +17 -0
- package/dist/grader/common.js +21 -0
- package/dist/grader/index.d.ts +38 -0
- package/dist/grader/index.js +75 -0
- package/dist/grader/knowledge-probe.d.ts +14 -0
- package/dist/grader/knowledge-probe.js +18 -0
- package/dist/grader/literacy.d.ts +13 -0
- package/dist/grader/literacy.js +17 -0
- package/dist/grader/mcp.d.ts +14 -0
- package/dist/grader/mcp.js +18 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +6 -4
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/compute-attribution.ts
|
|
3
|
+
*
|
|
4
|
+
* Pure three-signal attribution ensemble helper (v0).
|
|
5
|
+
*
|
|
6
|
+
* Computes per-document attribution scores for a single grader judgment
|
|
7
|
+
* using three signals:
|
|
8
|
+
* - citation: the grader's explicit doc citation with a role weight
|
|
9
|
+
* - canonical: whether the doc appears in the task's declared context.docs
|
|
10
|
+
* - retrieved: whether the doc slug was visited by the agent (agentic mode)
|
|
11
|
+
*
|
|
12
|
+
* No AppContext, no filesystem I/O — this is a pure function importable
|
|
13
|
+
* by calibration scripts and orchestration steps alike.
|
|
14
|
+
*
|
|
15
|
+
* Weights (v0): citation=0.55, canonical=0.30, retrieved=0.15.
|
|
16
|
+
* Embedding model: "none" (v0 — no embedding calls yet).
|
|
17
|
+
*
|
|
18
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
19
|
+
* @see docs/decisions/D0050-per-entry-attribution-layout.md
|
|
20
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
21
|
+
* @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
|
|
22
|
+
*/
|
|
23
|
+
import type { Confidence, DocumentRef, GraderJudgment, JudgmentAttribution } from "../_vendor/ailf-core/index.d.ts";
|
|
24
|
+
/**
 * Structural (widened) shape of the ensemble weights.
 *
 * Calibration and grid-search callers build their weights dynamically as
 * plain `number`s, which could never satisfy the literal type that
 * `typeof V0_WEIGHTS as const` would produce — hence this explicit,
 * `number`-valued interface.
 */
export interface EnsembleWeights {
    /** Multiplier applied to the citation signal. */
    citation: number;
    /** Multiplier applied to the canonical (declared context doc) signal. */
    canonical: number;
    /** Multiplier applied to the retrieved (agent-visited doc) signal. */
    retrieved: number;
}
|
|
34
|
+
/** v0 ensemble weights. Callers may pass custom weights for calibration. */
|
|
35
|
+
export declare const V0_WEIGHTS: EnsembleWeights;
|
|
36
|
+
/**
|
|
37
|
+
* Compute the per-document attribution for a single grader judgment.
|
|
38
|
+
*
|
|
39
|
+
* The function is side-effect free except for the `reliability` accumulator —
|
|
40
|
+
* the caller owns the `GraderReliability`-shaped mutable object and should
|
|
41
|
+
* persist it after iterating over all judgments in a run.
|
|
42
|
+
*
|
|
43
|
+
* Hallucination short-circuit: when a candidate doc has a citation AND
|
|
44
|
+
* `doc.slug` is absent from `judgment.hallucinationCheckedAgainst`,
|
|
45
|
+
* the doc's attribution short-circuits to `score: 0`,
|
|
46
|
+
* `confidence.level: "low"`, `signalsPresent: 0`, and
|
|
47
|
+
* `reliability.hallucinationCount` is incremented. The function does NOT
|
|
48
|
+
* throw — the pipeline continues and the count is persisted by the caller.
|
|
49
|
+
*
|
|
50
|
+
* @param judgment - The grader judgment being attributed.
|
|
51
|
+
* @param candidates - DocumentRefs to compute attribution for. Callers
|
|
52
|
+
* control the set (contextDocs ∪ cited docs ∪
|
|
53
|
+
* retrieved docs).
|
|
54
|
+
* @param contextDocIds - Set of documentIds declared in task.context.docs
|
|
55
|
+
* (the canonical signal source).
|
|
56
|
+
* @param retrievedSlugs - Set of doc slugs visited by the agent, or
|
|
57
|
+
* `undefined` when running in baseline mode (the
|
|
58
|
+
* retrieved signal is then dropped from the ensemble).
|
|
59
|
+
* @param reliability - Mutable accumulator; `hallucinationCount` is
|
|
60
|
+
* incremented for each hallucinated citation.
|
|
61
|
+
* @param weights - Ensemble weights; defaults to V0_WEIGHTS.
|
|
62
|
+
*/
|
|
63
|
+
export declare function computeJudgmentAttribution(judgment: GraderJudgment, candidates: DocumentRef[], contextDocIds: ReadonlySet<string>, retrievedSlugs: ReadonlySet<string> | undefined, reliability: {
|
|
64
|
+
hallucinationCount: number;
|
|
65
|
+
}, weights?: EnsembleWeights): JudgmentAttribution;
|
|
66
|
+
/**
|
|
67
|
+
* Derive a `Confidence` triple from the array of present signal values
|
|
68
|
+
* using standard-deviation-based agreement scoring.
|
|
69
|
+
*
|
|
70
|
+
* When `present.length <= 1`, the result is always `"low"` — a single
|
|
71
|
+
* signal cannot speak to agreement.
|
|
72
|
+
*
|
|
73
|
+
* The `0.5` normalization constant is the maximum standard deviation for
|
|
74
|
+
* a 2-or-3-point series whose values are in [0, 1] (achieved by
|
|
75
|
+
* the [0, 1] pattern; a 3-point series such as [0, 0, 1] peaks lower, at ≈ 0.471). Dividing the actual stdev by 0.5
|
|
76
|
+
* normalises agreement to [0, 1].
|
|
77
|
+
*
|
|
78
|
+
* Buckets: agreement > 0.7 → "high", > 0.4 → "medium", else → "low".
|
|
79
|
+
*/
|
|
80
|
+
export declare function deriveEnsembleConfidence(present: readonly number[]): Confidence;
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/compute-attribution.ts
|
|
3
|
+
*
|
|
4
|
+
* Pure three-signal attribution ensemble helper (v0).
|
|
5
|
+
*
|
|
6
|
+
* Computes per-document attribution scores for a single grader judgment
|
|
7
|
+
* using three signals:
|
|
8
|
+
* - citation: the grader's explicit doc citation with a role weight
|
|
9
|
+
* - canonical: whether the doc appears in the task's declared context.docs
|
|
10
|
+
* - retrieved: whether the doc slug was visited by the agent (agentic mode)
|
|
11
|
+
*
|
|
12
|
+
* No AppContext, no filesystem I/O — this is a pure function importable
|
|
13
|
+
* by calibration scripts and orchestration steps alike.
|
|
14
|
+
*
|
|
15
|
+
* Weights (v0): citation=0.55, canonical=0.30, retrieved=0.15.
|
|
16
|
+
* Embedding model: "none" (v0 — no embedding calls yet).
|
|
17
|
+
*
|
|
18
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
19
|
+
* @see docs/decisions/D0050-per-entry-attribution-layout.md
|
|
20
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
21
|
+
* @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
|
|
22
|
+
*/
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
// Constants
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
/**
 * Value of the citation signal, keyed by the role the grader attached to a
 * doc citation. A role of `irrelevant` yields a signal of 0 — the signal is
 * still present, it simply carries no weight.
 */
const ROLE_WEIGHTS = {
    supports: 1,
    contradicts: 0.8,
    missing: 0.6,
    irrelevant: 0,
};
/**
 * Default (v0) ensemble weights for the citation / canonical / retrieved
 * signals. Calibration callers may supply their own weights instead.
 */
export const V0_WEIGHTS = {
    citation: 0.55,
    canonical: 0.3,
    retrieved: 0.15,
};
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
// Public API
|
|
40
|
+
// ---------------------------------------------------------------------------
|
|
41
|
+
/**
|
|
42
|
+
* Compute the per-document attribution for a single grader judgment.
|
|
43
|
+
*
|
|
44
|
+
* The function is side-effect free except for the `reliability` accumulator —
|
|
45
|
+
* the caller owns the `GraderReliability`-shaped mutable object and should
|
|
46
|
+
* persist it after iterating over all judgments in a run.
|
|
47
|
+
*
|
|
48
|
+
* Hallucination short-circuit: when a candidate doc has a citation AND
|
|
49
|
+
* `doc.slug` is absent from `judgment.hallucinationCheckedAgainst`,
|
|
50
|
+
* the doc's attribution short-circuits to `score: 0`,
|
|
51
|
+
* `confidence.level: "low"`, `signalsPresent: 0`, and
|
|
52
|
+
* `reliability.hallucinationCount` is incremented. The function does NOT
|
|
53
|
+
* throw — the pipeline continues and the count is persisted by the caller.
|
|
54
|
+
*
|
|
55
|
+
* @param judgment - The grader judgment being attributed.
|
|
56
|
+
* @param candidates - DocumentRefs to compute attribution for. Callers
|
|
57
|
+
* control the set (contextDocs ∪ cited docs ∪
|
|
58
|
+
* retrieved docs).
|
|
59
|
+
* @param contextDocIds - Set of documentIds declared in task.context.docs
|
|
60
|
+
* (the canonical signal source).
|
|
61
|
+
* @param retrievedSlugs - Set of doc slugs visited by the agent, or
|
|
62
|
+
* `undefined` when running in baseline mode (the
|
|
63
|
+
* retrieved signal is then dropped from the ensemble).
|
|
64
|
+
* @param reliability - Mutable accumulator; `hallucinationCount` is
|
|
65
|
+
* incremented for each hallucinated citation.
|
|
66
|
+
* @param weights - Ensemble weights; defaults to V0_WEIGHTS.
|
|
67
|
+
*/
|
|
68
|
+
export function computeJudgmentAttribution(judgment, candidates, contextDocIds, retrievedSlugs, reliability, weights = V0_WEIGHTS) {
|
|
69
|
+
const resolvableSet = new Set(judgment.hallucinationCheckedAgainst);
|
|
70
|
+
// Build a map of citations by documentId for O(1) lookups (D0052).
|
|
71
|
+
const citationsByDocId = new Map();
|
|
72
|
+
for (const cit of judgment.docCitations) {
|
|
73
|
+
citationsByDocId.set(cit.documentId, cit);
|
|
74
|
+
}
|
|
75
|
+
const attributions = [];
|
|
76
|
+
// Defensive dedup by documentId. Callers may pass candidates that contain
|
|
77
|
+
// duplicate DocumentRef instances sharing a documentId (e.g. when
|
|
78
|
+
// contextSlugs has duplicate slug entries or two manifest slug aliases
|
|
79
|
+
// resolve to the same documentId). Without dedup, the hallucinationCount
|
|
80
|
+
// accumulator and the per-doc attribution array would double-count.
|
|
81
|
+
const seenDocIds = new Set();
|
|
82
|
+
for (const candidate of candidates) {
|
|
83
|
+
if (seenDocIds.has(candidate.documentId))
|
|
84
|
+
continue;
|
|
85
|
+
seenDocIds.add(candidate.documentId);
|
|
86
|
+
const cit = citationsByDocId.get(candidate.documentId);
|
|
87
|
+
// Hallucination short-circuit (Pitfall #11, T-04-01-01).
|
|
88
|
+
//
|
|
89
|
+
// A citation is hallucinated when:
|
|
90
|
+
// (a) the candidate has a slug and that slug is not in resolvableSet, OR
|
|
91
|
+
// (b) the candidate has no slug — we cannot verify resolvability, so
|
|
92
|
+
// treat it conservatively as a hallucination (D0052 keys
|
|
93
|
+
// resolvableSet by slug; no slug means no positive proof of
|
|
94
|
+
// resolvability).
|
|
95
|
+
//
|
|
96
|
+
// Score is forced to 0 and reliability.hallucinationCount is incremented
|
|
97
|
+
// so downstream consumers can audit citation grounding without
|
|
98
|
+
// re-deriving the set.
|
|
99
|
+
if (cit && (!candidate.slug || !resolvableSet.has(candidate.slug))) {
|
|
100
|
+
reliability.hallucinationCount += 1;
|
|
101
|
+
attributions.push({
|
|
102
|
+
documentId: candidate.documentId,
|
|
103
|
+
...(candidate.slug !== undefined ? { slug: candidate.slug } : {}),
|
|
104
|
+
score: 0,
|
|
105
|
+
signals: {},
|
|
106
|
+
confidence: {
|
|
107
|
+
level: "low",
|
|
108
|
+
signalsPresent: 0,
|
|
109
|
+
derivation: "ensemble-stdev",
|
|
110
|
+
},
|
|
111
|
+
});
|
|
112
|
+
continue;
|
|
113
|
+
}
|
|
114
|
+
// Citation signal: role weight when cited; undefined when not cited at all.
|
|
115
|
+
// Do NOT coerce to 0 — undefined signals are dropped from signalsPresent.
|
|
116
|
+
const citation = cit
|
|
117
|
+
? ROLE_WEIGHTS[cit.role]
|
|
118
|
+
: undefined;
|
|
119
|
+
// Canonical signal: binary presence in task's context.docs (always present).
|
|
120
|
+
const canonical = contextDocIds.has(candidate.documentId) ? 1 : 0;
|
|
121
|
+
// Retrieved signal: slug presence in agent retrieval set, or undefined
|
|
122
|
+
// when running in baseline mode (agentBehavior absent — Pitfall #4).
|
|
123
|
+
const retrieved = retrievedSlugs === undefined
|
|
124
|
+
? undefined
|
|
125
|
+
: retrievedSlugs.has(candidate.slug ?? "")
|
|
126
|
+
? 1
|
|
127
|
+
: 0;
|
|
128
|
+
// Collect present (non-undefined) signals for confidence derivation.
|
|
129
|
+
const present = [citation, canonical, retrieved].filter((s) => s !== undefined);
|
|
130
|
+
const score = clamp((citation ?? 0) * weights.citation +
|
|
131
|
+
canonical * weights.canonical +
|
|
132
|
+
(retrieved ?? 0) * weights.retrieved, 0, 1);
|
|
133
|
+
// Build the signals object omitting undefined members so JSON output is clean.
|
|
134
|
+
const signals = { canonical };
|
|
135
|
+
if (citation !== undefined)
|
|
136
|
+
signals.citation = citation;
|
|
137
|
+
if (retrieved !== undefined)
|
|
138
|
+
signals.retrieved = retrieved;
|
|
139
|
+
attributions.push({
|
|
140
|
+
documentId: candidate.documentId,
|
|
141
|
+
...(candidate.slug !== undefined ? { slug: candidate.slug } : {}),
|
|
142
|
+
score,
|
|
143
|
+
signals,
|
|
144
|
+
confidence: deriveEnsembleConfidence(present),
|
|
145
|
+
});
|
|
146
|
+
}
|
|
147
|
+
const judgmentRef = `${judgment.taskId}--${judgment.modelId}--${judgment.dimension}`;
|
|
148
|
+
return {
|
|
149
|
+
judgmentRef,
|
|
150
|
+
taskId: judgment.taskId,
|
|
151
|
+
modelId: judgment.modelId,
|
|
152
|
+
dimension: judgment.dimension,
|
|
153
|
+
attributions,
|
|
154
|
+
hallucinationCheckedAgainst: judgment.hallucinationCheckedAgainst,
|
|
155
|
+
};
|
|
156
|
+
}
|
|
157
|
+
/**
 * Turn the list of present signal values into a `Confidence` triple by
 * measuring how closely the signals agree (population standard deviation).
 *
 * With zero or one signal there is nothing to compare, so the level is
 * always `"low"`.
 *
 * Agreement is `1 - stdev / 0.5`. The constant `0.5` is the largest
 * population standard deviation any series bounded to [0, 1] can have; it
 * is reached by the two-point series [0, 1]. Three-point series peak a
 * little lower (≈ 0.471 for [0, 0, 1]), so for inputs in [0, 1] the
 * agreement always lands in [0, 1].
 *
 * Buckets: agreement > 0.7 → "high", > 0.4 → "medium", otherwise "low".
 *
 * @param present - Values of the signals that were present.
 * @returns `{ level, signalsPresent, derivation: "ensemble-stdev" }` where
 *          `signalsPresent` is `present.length`.
 */
export function deriveEnsembleConfidence(present) {
    const signalsPresent = present.length;
    let level = "low";
    if (signalsPresent > 1) {
        let total = 0;
        for (const value of present) {
            total += value;
        }
        const mean = total / signalsPresent;
        let squaredDeviations = 0;
        for (const value of present) {
            squaredDeviations += (value - mean) ** 2;
        }
        const stdev = Math.sqrt(squaredDeviations / signalsPresent);
        // Scale by the 0.5 upper bound on stdev for [0, 1]-bounded values.
        const agreement = 1 - stdev / 0.5;
        if (agreement > 0.7) {
            level = "high";
        }
        else if (agreement > 0.4) {
            level = "medium";
        }
    }
    return {
        level,
        signalsPresent,
        derivation: "ensemble-stdev",
    };
}
|
|
191
|
+
// ---------------------------------------------------------------------------
|
|
192
|
+
// Internal helpers
|
|
193
|
+
// ---------------------------------------------------------------------------
|
|
194
|
+
/**
 * Restrict `value` to the inclusive range [`min`, `max`].
 *
 * The lower bound is applied first and the upper bound second, so the
 * upper bound wins if the two ever conflict. NaN propagates unchanged.
 */
function clamp(value, min, max) {
    const raisedToMin = Math.max(min, value);
    return Math.min(max, raisedToMin);
}
|
|
@@ -1,41 +1,76 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* pipeline/failure-modes.ts
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* cross-referenced with ceiling decomposition data.
|
|
4
|
+
* Ceiling-cross-check failure-mode validator + report assembly.
|
|
6
5
|
*
|
|
7
|
-
*
|
|
6
|
+
* The grader emits `failureMode` directly under the per-dimension taxonomy
|
|
7
|
+
* (Plan 03-02 — `packages/eval/src/grader/`). This module consumes the
|
|
8
|
+
* grader's emission as the source of truth and uses the surviving ceiling
|
|
9
|
+
* decomposition (`classifyByCeiling`) only as a CONFIDENCE VALIDATOR — it
|
|
10
|
+
* cross-checks the emitted mode against structural score signals and emits
|
|
11
|
+
* a D0049 `Confidence` triple stamped with `derivation: "ceiling-cross-check"`.
|
|
8
12
|
*
|
|
9
|
-
* The classifier
|
|
10
|
-
*
|
|
11
|
-
*
|
|
13
|
+
* The legacy keyword-pattern classifier (and its five regex pattern
|
|
14
|
+
* constants) was deleted in Plan 03-03 — its production coverage was ~1%
|
|
15
|
+
* (Doc 03 §"Where classification lives"), and resurrecting it as a fallback
|
|
16
|
+
* is explicitly out of scope.
|
|
12
17
|
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
18
|
+
* @see docs/decisions/D0005-grader-model-separation.md — single grader emits
|
|
19
|
+
* failureMode under the per-dimension taxonomy
|
|
20
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md — Confidence triple
|
|
21
|
+
* shape and `ceiling-cross-check` derivation tag
|
|
17
22
|
*/
|
|
18
|
-
import type {
|
|
23
|
+
import type { Confidence } from "../_vendor/ailf-core/index.d.ts";
|
|
24
|
+
import type { FailureModeReport, FeatureScore, GraderJudgment } from "./types.js";
|
|
19
25
|
/**
|
|
20
26
|
* Build a complete failure mode report from grader judgments and scores.
|
|
21
27
|
*
|
|
28
|
+
* The grader-emitted `judgment.failureMode` is the source of truth (Plan
|
|
29
|
+
* 03-03 — keyword classifier deleted). `validateFailureMode` cross-checks
|
|
30
|
+
* the emission against ceiling decomposition and stamps a D0049 confidence.
|
|
31
|
+
*
|
|
32
|
+
* The `FailureMode` triple shape (`mode`, `confidence`, `source`) is
|
|
33
|
+
* preserved for backward compatibility with downstream consumers
|
|
34
|
+
* (gap-analysis, manifest emission) — the bucketed `confidence` enum maps
|
|
35
|
+
* 1:1 from `Confidence.level`, and `source` is always `"ceiling"` now that
|
|
36
|
+
* the keyword path is gone.
|
|
37
|
+
*
|
|
22
38
|
* @param judgments - All grader judgments from the evaluation
|
|
23
39
|
* @param scores - Per-area feature scores (for ceiling decomposition)
|
|
24
40
|
* @returns Failure mode report with per-area breakdowns
|
|
25
41
|
*/
|
|
26
42
|
export declare function buildFailureModeReport(judgments: GraderJudgment[], scores: FeatureScore[]): FailureModeReport;
|
|
27
43
|
/**
|
|
28
|
-
*
|
|
44
|
+
* Cross-check a grader-emitted `failureMode` against ceiling decomposition
|
|
45
|
+
* and emit a D0049 `Confidence` triple stamped with
|
|
46
|
+
* `derivation: "ceiling-cross-check"`.
|
|
29
47
|
*
|
|
30
|
-
*
|
|
31
|
-
*
|
|
48
|
+
* Replaces the deleted keyword-pattern + ceiling combine classifier — the
|
|
49
|
+
* grader's emission is now the source of truth for the mode itself; this
|
|
50
|
+
* function only stamps confidence based on whether the structural ceiling
|
|
51
|
+
* signal agrees.
|
|
32
52
|
*
|
|
33
|
-
*
|
|
53
|
+
* - `level: "high"` (`signalsPresent: 2`) — grader emission and ceiling
|
|
54
|
+
* decomposition agree on the same mode.
|
|
55
|
+
* - `level: "medium"` (`signalsPresent: 2`) — both signals present but
|
|
56
|
+
* disagree. The live pipeline increments
|
|
57
|
+
* `GraderReliability.failureModeCalibration` ONLY on this branch — a
|
|
58
|
+
* true calibration miss requires both signals to be present.
|
|
59
|
+
* - `level: "low"` (`signalsPresent: 1`) — only the grader's emission;
|
|
60
|
+
* ceiling decomposition produced no structural signal. Not a
|
|
61
|
+
* calibration miss (we have nothing to cross-check against).
|
|
62
|
+
* - `level: "low"` (`signalsPresent: 0`) — passing scores
|
|
63
|
+
* (`>= CLASSIFICATION_THRESHOLD`) don't classify; emit absent.
|
|
64
|
+
*
|
|
65
|
+
* @param judgment - The grader judgment carrying `failureMode` + `score`
|
|
34
66
|
* @param ceilingScore - The area's ceiling score (with-docs best case)
|
|
35
67
|
* @param floorScore - The area's floor score (no-docs baseline)
|
|
36
|
-
* @returns
|
|
68
|
+
* @returns D0049 Confidence triple stamped `derivation: "ceiling-cross-check"`
|
|
69
|
+
*
|
|
70
|
+
* @see docs/decisions/D0005-grader-model-separation.md
|
|
71
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
37
72
|
*/
|
|
38
|
-
export declare function
|
|
73
|
+
export declare function validateFailureMode(judgment: GraderJudgment, ceilingScore: number, floorScore: number): Confidence;
|
|
39
74
|
/**
|
|
40
75
|
* Format a failure mode report for console output.
|
|
41
76
|
*/
|