@sanity/ailf 4.5.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. package/canonical/grader-references/agent-harness-tools.yaml +42 -0
  2. package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
  3. package/canonical/grader-references/mcp-server-spec.yaml +51 -0
  4. package/canonical/grader-references/portable-text.yaml +48 -0
  5. package/config/rubrics.ts +38 -2
  6. package/dist/_vendor/ailf-core/artifact-registry.d.ts +197 -2
  7. package/dist/_vendor/ailf-core/artifact-registry.js +419 -5
  8. package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
  9. package/dist/_vendor/ailf-core/examples/index.js +146 -47
  10. package/dist/_vendor/ailf-core/ports/context.d.ts +26 -0
  11. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  12. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  13. package/dist/_vendor/ailf-core/ports/llm-client.d.ts +112 -0
  14. package/dist/_vendor/ailf-core/ports/llm-client.js +68 -0
  15. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
  16. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
  17. package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
  18. package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
  19. package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
  20. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  21. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
  22. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  23. package/dist/_vendor/ailf-core/schemas/index.js +9 -0
  24. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  25. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
  26. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
  27. package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
  28. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
  29. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
  30. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
  31. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
  32. package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
  33. package/dist/_vendor/ailf-core/services/index.js +5 -0
  34. package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
  35. package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
  36. package/dist/_vendor/ailf-core/types/attribution.js +18 -0
  37. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
  38. package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
  39. package/dist/_vendor/ailf-core/types/confidence.d.ts +68 -0
  40. package/dist/_vendor/ailf-core/types/confidence.js +56 -0
  41. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
  42. package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
  43. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
  44. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
  45. package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
  46. package/dist/_vendor/ailf-core/types/index.d.ts +82 -29
  47. package/dist/_vendor/ailf-core/types/index.js +16 -1
  48. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
  49. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
  50. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  51. package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
  52. package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
  53. package/dist/adapters/api-client/build-request.d.ts +1 -0
  54. package/dist/adapters/api-client/build-request.js +3 -0
  55. package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
  56. package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
  57. package/dist/adapters/attribution/index.d.ts +9 -0
  58. package/dist/adapters/attribution/index.js +8 -0
  59. package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
  60. package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
  61. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  62. package/dist/adapters/grader-outputs/index.d.ts +10 -0
  63. package/dist/adapters/grader-outputs/index.js +8 -0
  64. package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
  65. package/dist/adapters/grader-outputs/legacy/index.js +10 -0
  66. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
  67. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
  68. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
  69. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
  70. package/dist/adapters/index.d.ts +3 -0
  71. package/dist/adapters/index.js +4 -0
  72. package/dist/adapters/llm/anthropic-llm-client.d.ts +48 -0
  73. package/dist/adapters/llm/anthropic-llm-client.js +205 -0
  74. package/dist/adapters/llm/fake-llm-client.d.ts +49 -0
  75. package/dist/adapters/llm/fake-llm-client.js +63 -0
  76. package/dist/adapters/llm/index.d.ts +9 -0
  77. package/dist/adapters/llm/index.js +4 -0
  78. package/dist/adapters/llm/openai-llm-client.d.ts +44 -0
  79. package/dist/adapters/llm/openai-llm-client.js +168 -0
  80. package/dist/adapters/llm/pricing.d.ts +12 -0
  81. package/dist/adapters/llm/pricing.js +8 -0
  82. package/dist/adapters/llm/retry.d.ts +56 -0
  83. package/dist/adapters/llm/retry.js +66 -0
  84. package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
  85. package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
  86. package/dist/adapters/task-sources/repo-schemas.d.ts +90 -22
  87. package/dist/adapters/task-sources/repo-schemas.js +19 -2
  88. package/dist/artifact-capture/api-gateway-artifact-writer.js +2 -1
  89. package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +2 -1
  90. package/dist/artifact-capture/gcs-artifact-writer.js +3 -1
  91. package/dist/artifact-capture/local-fs-artifact-writer.js +3 -1
  92. package/dist/commands/calculate-scores.js +1 -1
  93. package/dist/commands/explain-handler.js +1 -1
  94. package/dist/commands/lookup-doc.d.ts +1 -1
  95. package/dist/commands/lookup-doc.js +3 -3
  96. package/dist/commands/pipeline-action.d.ts +6 -0
  97. package/dist/commands/pipeline-action.js +2 -0
  98. package/dist/commands/remote-pipeline.js +1 -0
  99. package/dist/composition-root.d.ts +59 -1
  100. package/dist/composition-root.js +95 -0
  101. package/dist/config/rubrics.ts +38 -2
  102. package/dist/grader/agent-harness.d.ts +14 -0
  103. package/dist/grader/agent-harness.js +17 -0
  104. package/dist/grader/common.d.ts +17 -0
  105. package/dist/grader/common.js +21 -0
  106. package/dist/grader/index.d.ts +38 -0
  107. package/dist/grader/index.js +75 -0
  108. package/dist/grader/knowledge-probe.d.ts +14 -0
  109. package/dist/grader/knowledge-probe.js +18 -0
  110. package/dist/grader/literacy.d.ts +13 -0
  111. package/dist/grader/literacy.js +17 -0
  112. package/dist/grader/mcp.d.ts +14 -0
  113. package/dist/grader/mcp.js +18 -0
  114. package/dist/orchestration/build-app-context.js +1 -0
  115. package/dist/orchestration/build-step-sequence.js +5 -0
  116. package/dist/orchestration/steps/calculate-scores-step.js +23 -1
  117. package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
  118. package/dist/orchestration/steps/compute-attribution-step.js +279 -0
  119. package/dist/orchestration/steps/gap-analysis-step.js +35 -7
  120. package/dist/orchestration/steps/index.d.ts +1 -0
  121. package/dist/orchestration/steps/index.js +1 -0
  122. package/dist/pipeline/attribution.d.ts +15 -0
  123. package/dist/pipeline/attribution.js +18 -9
  124. package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
  125. package/dist/pipeline/borderline-consensus-runner.js +124 -0
  126. package/dist/pipeline/borderline-detector.d.ts +24 -0
  127. package/dist/pipeline/borderline-detector.js +26 -0
  128. package/dist/pipeline/calculate-scores.d.ts +114 -3
  129. package/dist/pipeline/calculate-scores.js +426 -24
  130. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  131. package/dist/pipeline/compiler/literacy-bridge.js +35 -17
  132. package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
  133. package/dist/pipeline/compiler/rubric-resolution.js +9 -1
  134. package/dist/pipeline/compute-attribution.d.ts +80 -0
  135. package/dist/pipeline/compute-attribution.js +196 -0
  136. package/dist/pipeline/failure-modes.d.ts +52 -17
  137. package/dist/pipeline/failure-modes.js +178 -117
  138. package/dist/pipeline/map-request-to-config.js +1 -0
  139. package/package.json +6 -4
@@ -0,0 +1,80 @@
1
+ /**
2
+ * pipeline/compute-attribution.ts
3
+ *
4
+ * Pure three-signal attribution ensemble helper (v0).
5
+ *
6
+ * Computes per-document attribution scores for a single grader judgment
7
+ * using three signals:
8
+ * - citation: the grader's explicit doc citation with a role weight
9
+ * - canonical: whether the doc appears in the task's declared context.docs
10
+ * - retrieved: whether the doc slug was visited by the agent (agentic mode)
11
+ *
12
+ * No AppContext, no filesystem I/O — this is a pure function importable
13
+ * by calibration scripts and orchestration steps alike.
14
+ *
15
+ * Weights (v0): citation=0.55, canonical=0.30, retrieved=0.15.
16
+ * Embedding model: "none" (v0 — no embedding calls yet).
17
+ *
18
+ * @see docs/decisions/D0049-shared-confidence-contract.md
19
+ * @see docs/decisions/D0050-per-entry-attribution-layout.md
20
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
21
+ * @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
22
+ */
23
+ import type { Confidence, DocumentRef, GraderJudgment, JudgmentAttribution } from "../_vendor/ailf-core/index.d.ts";
24
+ /**
25
+ * Widened structural type for ensemble weights. Calibration / grid-search
26
+ * callers pass dynamic `number`-valued tuples — they cannot satisfy the
27
+ * literal type produced by `typeof V0_WEIGHTS as const`.
28
+ */
29
+ export interface EnsembleWeights {
30
+ citation: number;
31
+ canonical: number;
32
+ retrieved: number;
33
+ }
34
+ /** v0 ensemble weights. Callers may pass custom weights for calibration. */
35
+ export declare const V0_WEIGHTS: EnsembleWeights;
36
+ /**
37
+ * Compute the per-document attribution for a single grader judgment.
38
+ *
39
+ * The function is side-effect free except for the `reliability` accumulator —
40
+ * the caller owns the `GraderReliability`-shaped mutable object and should
41
+ * persist it after iterating over all judgments in a run.
42
+ *
43
+ * Hallucination short-circuit: when a candidate doc has a citation AND
44
+ * `doc.slug` is absent from `judgment.hallucinationCheckedAgainst`,
45
+ * the doc's attribution short-circuits to `score: 0`,
46
+ * `confidence.level: "low"`, `signalsPresent: 0`, and
47
+ * `reliability.hallucinationCount` is incremented. The function does NOT
48
+ * throw — the pipeline continues and the count is persisted by the caller.
49
+ *
50
+ * @param judgment - The grader judgment being attributed.
51
+ * @param candidates - DocumentRefs to compute attribution for. Callers
52
+ * control the set (contextDocs ∪ cited docs ∪
53
+ * retrieved docs).
54
+ * @param contextDocIds - Set of documentIds declared in task.context.docs
55
+ * (the canonical signal source).
56
+ * @param retrievedSlugs - Set of doc slugs visited by the agent, or
57
+ * `undefined` when running in baseline mode (the
58
+ * retrieved signal is then dropped from the ensemble).
59
+ * @param reliability - Mutable accumulator; `hallucinationCount` is
60
+ * incremented for each hallucinated citation.
61
+ * @param weights - Ensemble weights; defaults to V0_WEIGHTS.
62
+ */
63
+ export declare function computeJudgmentAttribution(judgment: GraderJudgment, candidates: DocumentRef[], contextDocIds: ReadonlySet<string>, retrievedSlugs: ReadonlySet<string> | undefined, reliability: {
64
+ hallucinationCount: number;
65
+ }, weights?: EnsembleWeights): JudgmentAttribution;
66
+ /**
67
+ * Derive a `Confidence` triple from the array of present signal values
68
+ * using standard-deviation-based agreement scoring.
69
+ *
70
+ * When `present.length <= 1`, the result is always `"low"` — a single
71
+ * signal cannot speak to agreement.
72
+ *
73
+ * The `0.5` normalization constant is the maximum standard deviation for
74
+ * a 2-or-3-point series whose values are in [0, 1] (achieved by
75
+ * the [0, 1] pattern; [0, 0, 1] reaches only ≈0.471). Dividing the actual stdev by 0.5
76
+ * normalises agreement to [0, 1].
77
+ *
78
+ * Buckets: agreement > 0.7 → "high", > 0.4 → "medium", else → "low".
79
+ */
80
+ export declare function deriveEnsembleConfidence(present: readonly number[]): Confidence;
@@ -0,0 +1,196 @@
1
+ /**
2
+ * pipeline/compute-attribution.ts
3
+ *
4
+ * Pure three-signal attribution ensemble helper (v0).
5
+ *
6
+ * Computes per-document attribution scores for a single grader judgment
7
+ * using three signals:
8
+ * - citation: the grader's explicit doc citation with a role weight
9
+ * - canonical: whether the doc appears in the task's declared context.docs
10
+ * - retrieved: whether the doc slug was visited by the agent (agentic mode)
11
+ *
12
+ * No AppContext, no filesystem I/O — this is a pure function importable
13
+ * by calibration scripts and orchestration steps alike.
14
+ *
15
+ * Weights (v0): citation=0.55, canonical=0.30, retrieved=0.15.
16
+ * Embedding model: "none" (v0 — no embedding calls yet).
17
+ *
18
+ * @see docs/decisions/D0049-shared-confidence-contract.md
19
+ * @see docs/decisions/D0050-per-entry-attribution-layout.md
20
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
21
+ * @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
22
+ */
23
+ // ---------------------------------------------------------------------------
24
+ // Constants
25
+ // ---------------------------------------------------------------------------
26
/**
 * Citation-signal value per citation role. The attribution routine looks a
 * citation's `role` up in this table and uses the result as the `citation`
 * signal for that document.
 */
const ROLE_WEIGHTS = {
    supports: 1,
    contradicts: 0.8,
    missing: 0.6,
    irrelevant: 0,
};
/**
 * v0 ensemble weights: the multiplier applied to each signal when the
 * per-document score is summed. Used as the default `weights` argument;
 * callers may pass their own weights (e.g. for calibration).
 */
export const V0_WEIGHTS = {
    citation: 0.55,
    canonical: 0.3,
    retrieved: 0.15,
};
38
+ // ---------------------------------------------------------------------------
39
+ // Public API
40
+ // ---------------------------------------------------------------------------
41
/**
 * Compute the per-document attribution for a single grader judgment.
 *
 * The function is side-effect free except for the `reliability` accumulator —
 * the caller owns that mutable object and should persist it after iterating
 * over all judgments in a run.
 *
 * Per candidate (deduplicated by `documentId`, first occurrence wins):
 *
 * - Hallucination short-circuit: when the candidate is cited by the judgment
 *   AND either has no slug or its slug is absent from
 *   `judgment.hallucinationCheckedAgainst`, the entry is emitted with
 *   `score: 0`, empty `signals`, `confidence.level: "low"`,
 *   `signalsPresent: 0`, and `reliability.hallucinationCount` is incremented.
 *   The function does NOT throw.
 * - Otherwise the score is the weighted sum of the citation, canonical and
 *   retrieved signals (absent signals contribute 0), clamped to [0, 1], and
 *   confidence is derived from the signals that are actually present.
 *
 * When `judgment.docCitations` holds several citations for one `documentId`,
 * the last one is used.
 *
 * @param judgment       - The grader judgment being attributed.
 * @param candidates     - DocumentRefs to compute attribution for. Callers
 *                         control the set (contextDocs ∪ cited docs ∪
 *                         retrieved docs).
 * @param contextDocIds  - Set of documentIds declared in task.context.docs
 *                         (the canonical signal source).
 * @param retrievedSlugs - Set of doc slugs visited by the agent, or
 *                         `undefined` when running in baseline mode (the
 *                         retrieved signal is then dropped from the ensemble).
 * @param reliability    - Mutable accumulator; `hallucinationCount` is
 *                         incremented for each hallucinated citation.
 * @param weights        - Ensemble weights; defaults to V0_WEIGHTS.
 * @returns The judgment's attribution record: a `judgmentRef` of the form
 *          `taskId--modelId--dimension`, the identifying fields, the
 *          per-document `attributions`, and the judgment's
 *          `hallucinationCheckedAgainst` passed through unchanged.
 */
export function computeJudgmentAttribution(judgment, candidates, contextDocIds, retrievedSlugs, reliability, weights = V0_WEIGHTS) {
    const resolvableSet = new Set(judgment.hallucinationCheckedAgainst);
    // Build a map of citations by documentId for O(1) lookups (D0052).
    const citationsByDocId = new Map();
    for (const cit of judgment.docCitations) {
        citationsByDocId.set(cit.documentId, cit);
    }
    const attributions = [];
    // Defensive dedup by documentId. Callers may pass candidates that contain
    // duplicate DocumentRef instances sharing a documentId (e.g. when
    // contextSlugs has duplicate slug entries or two manifest slug aliases
    // resolve to the same documentId). Without dedup, the hallucinationCount
    // accumulator and the per-doc attribution array would double-count.
    const seenDocIds = new Set();
    for (const candidate of candidates) {
        if (seenDocIds.has(candidate.documentId))
            continue;
        seenDocIds.add(candidate.documentId);
        const cit = citationsByDocId.get(candidate.documentId);
        // Hallucination short-circuit (Pitfall #11, T-04-01-01).
        //
        // A citation is hallucinated when:
        //   (a) the candidate has a slug and that slug is not in resolvableSet, OR
        //   (b) the candidate has no slug — we cannot verify resolvability, so
        //       treat it conservatively as a hallucination (D0052 keys
        //       resolvableSet by slug; no slug means no positive proof of
        //       resolvability).
        //
        // Score is forced to 0 and reliability.hallucinationCount is incremented
        // so downstream consumers can audit citation grounding without
        // re-deriving the set.
        if (cit && (!candidate.slug || !resolvableSet.has(candidate.slug))) {
            reliability.hallucinationCount += 1;
            attributions.push({
                documentId: candidate.documentId,
                ...(candidate.slug !== undefined ? { slug: candidate.slug } : {}),
                score: 0,
                signals: {},
                confidence: {
                    level: "low",
                    signalsPresent: 0,
                    derivation: "ensemble-stdev",
                },
            });
            continue;
        }
        // Citation signal: role weight when cited; undefined when not cited at all.
        // Do NOT coerce to 0 — undefined signals are dropped from signalsPresent.
        //
        // The own-property guard keeps role strings that collide with inherited
        // Object.prototype members ("toString", "constructor", ...) from
        // resolving to a function, which would otherwise turn the score into NaN.
        // Such roles are treated like any other unrecognised role: signal absent.
        const citation = cit && Object.prototype.hasOwnProperty.call(ROLE_WEIGHTS, cit.role)
            ? ROLE_WEIGHTS[cit.role]
            : undefined;
        // Canonical signal: binary presence in task's context.docs (always present).
        const canonical = contextDocIds.has(candidate.documentId) ? 1 : 0;
        // Retrieved signal: slug presence in agent retrieval set, or undefined
        // when running in baseline mode (agentBehavior absent — Pitfall #4).
        const retrieved = retrievedSlugs === undefined
            ? undefined
            : retrievedSlugs.has(candidate.slug ?? "")
                ? 1
                : 0;
        // Collect present (non-undefined) signals for confidence derivation.
        const present = [citation, canonical, retrieved].filter((s) => s !== undefined);
        const score = clamp((citation ?? 0) * weights.citation +
            canonical * weights.canonical +
            (retrieved ?? 0) * weights.retrieved, 0, 1);
        // Build the signals object omitting undefined members so JSON output is clean.
        const signals = { canonical };
        if (citation !== undefined)
            signals.citation = citation;
        if (retrieved !== undefined)
            signals.retrieved = retrieved;
        attributions.push({
            documentId: candidate.documentId,
            ...(candidate.slug !== undefined ? { slug: candidate.slug } : {}),
            score,
            signals,
            confidence: deriveEnsembleConfidence(present),
        });
    }
    const judgmentRef = `${judgment.taskId}--${judgment.modelId}--${judgment.dimension}`;
    return {
        judgmentRef,
        taskId: judgment.taskId,
        modelId: judgment.modelId,
        dimension: judgment.dimension,
        attributions,
        hallucinationCheckedAgainst: judgment.hallucinationCheckedAgainst,
    };
}
157
/**
 * Turn the list of present signal values into a `Confidence` triple by
 * measuring how tightly the values cluster (population standard deviation).
 *
 * With zero or one signal there is nothing to compare, so the level is
 * always `"low"`.
 *
 * Agreement is `1 - stdev / 0.5`. For values in [0, 1] the population
 * standard deviation never exceeds 0.5 (reached by the two-point series
 * [0, 1]; the three-point series [0, 0, 1] reaches only ≈0.471), so
 * agreement stays within [0, 1] for such inputs.
 *
 * Buckets: agreement > 0.7 → "high", > 0.4 → "medium", else → "low".
 *
 * @param present - Signal values that are actually available.
 * @returns `{ level, signalsPresent, derivation: "ensemble-stdev" }`, where
 *          `signalsPresent` is `present.length`.
 */
export function deriveEnsembleConfidence(present) {
    const signalsPresent = present.length;
    const derivation = "ensemble-stdev";
    // A lone signal (or none) cannot speak to agreement.
    if (signalsPresent <= 1) {
        return { level: "low", signalsPresent, derivation };
    }
    let total = 0;
    for (const value of present) {
        total += value;
    }
    const mean = total / signalsPresent;
    let squaredDeviationSum = 0;
    for (const value of present) {
        squaredDeviationSum += (value - mean) ** 2;
    }
    // Population (not sample) standard deviation: divide by n.
    const stdev = Math.sqrt(squaredDeviationSum / signalsPresent);
    // Normalise against 0.5, the largest stdev a [0,1]-bounded series can have.
    const agreement = 1 - stdev / 0.5;
    let level;
    if (agreement > 0.7) {
        level = "high";
    }
    else if (agreement > 0.4) {
        level = "medium";
    }
    else {
        level = "low";
    }
    return { level, signalsPresent, derivation };
}
191
+ // ---------------------------------------------------------------------------
192
+ // Internal helpers
193
+ // ---------------------------------------------------------------------------
194
/**
 * Restrict `value` to the closed interval [`min`, `max`].
 *
 * Built on `Math.max` / `Math.min`, so a `NaN` input yields `NaN`.
 */
function clamp(value, min, max) {
    const raisedToMin = Math.max(min, value);
    const cappedAtMax = Math.min(max, raisedToMin);
    return cappedAtMax;
}
@@ -1,41 +1,76 @@
1
1
  /**
2
2
  * pipeline/failure-modes.ts
3
3
  *
4
- * Keyword-based failure mode classifier for grader reasoning text,
5
- * cross-referenced with ceiling decomposition data.
4
+ * Ceiling-cross-check failure-mode validator + report assembly.
6
5
  *
7
- * Phase 3a of the Scenario Matrix implementation.
6
+ * The grader emits `failureMode` directly under the per-dimension taxonomy
7
+ * (Plan 03-02 — `packages/eval/src/grader/`). This module consumes the
8
+ * grader's emission as the source of truth and uses the surviving ceiling
9
+ * decomposition (`classifyByCeiling`) only as a CONFIDENCE VALIDATOR — it
10
+ * cross-checks the emitted mode against structural score signals and emits
11
+ * a D0049 `Confidence` triple stamped with `derivation: "ceiling-cross-check"`.
8
12
  *
9
- * The classifier uses two signal sources:
10
- * 1. Keyword matching on grader reason text (primary)
11
- * 2. Ceiling decomposition structural signals (supplementary)
13
+ * The legacy keyword-pattern classifier (and its five regex pattern
14
+ * constants) was deleted in Plan 03-03 — its production coverage was ~1%
15
+ * (Doc 03 §"Where classification lives"), and resurrecting it as a fallback
16
+ * is explicitly out of scope.
12
17
  *
13
- * When both sources agree, confidence is boosted. When only ceiling
14
- * signals are available, they serve as a fallback for unclassified cases.
15
- *
16
- * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
18
+ * @see docs/decisions/D0005-grader-model-separation.md — single grader emits
19
+ * failureMode under the per-dimension taxonomy
20
+ * @see docs/decisions/D0049-shared-confidence-contract.md — Confidence triple
21
+ * shape and `ceiling-cross-check` derivation tag
17
22
  */
18
- import type { FailureMode, FailureModeReport, FeatureScore, GraderJudgment } from "./types.js";
23
+ import type { Confidence } from "../_vendor/ailf-core/index.d.ts";
24
+ import type { FailureModeReport, FeatureScore, GraderJudgment } from "./types.js";
19
25
  /**
20
26
  * Build a complete failure mode report from grader judgments and scores.
21
27
  *
28
+ * The grader-emitted `judgment.failureMode` is the source of truth (Plan
29
+ * 03-03 — keyword classifier deleted). `validateFailureMode` cross-checks
30
+ * the emission against ceiling decomposition and stamps a D0049 confidence.
31
+ *
32
+ * The `FailureMode` triple shape (`mode`, `confidence`, `source`) is
33
+ * preserved for backward compatibility with downstream consumers
34
+ * (gap-analysis, manifest emission) — the bucketed `confidence` enum maps
35
+ * 1:1 from `Confidence.level`, and `source` is always `"ceiling"` now that
36
+ * the keyword path is gone.
37
+ *
22
38
  * @param judgments - All grader judgments from the evaluation
23
39
  * @param scores - Per-area feature scores (for ceiling decomposition)
24
40
  * @returns Failure mode report with per-area breakdowns
25
41
  */
26
42
  export declare function buildFailureModeReport(judgments: GraderJudgment[], scores: FeatureScore[]): FailureModeReport;
27
43
  /**
28
- * Classify the failure mode of a low-scoring grader judgment.
44
+ * Cross-check a grader-emitted `failureMode` against ceiling decomposition
45
+ * and emit a D0049 `Confidence` triple stamped with
46
+ * `derivation: "ceiling-cross-check"`.
29
47
  *
30
- * Uses keyword matching on the reason text, then cross-references with
31
- * ceiling decomposition data for structural confirmation.
48
+ * Replaces the deleted keyword-pattern + ceiling combine classifier — the
49
+ * grader's emission is now the source of truth for the mode itself; this
50
+ * function only stamps confidence based on whether the structural ceiling
51
+ * signal agrees.
32
52
  *
33
- * @param judgment - The grader judgment to classify
53
+ * - `level: "high"` (`signalsPresent: 2`) — grader emission and ceiling
54
+ * decomposition agree on the same mode.
55
+ * - `level: "medium"` (`signalsPresent: 2`) — both signals present but
56
+ * disagree. The live pipeline increments
57
+ * `GraderReliability.failureModeCalibration` ONLY on this branch — a
58
+ * true calibration miss requires both signals to be present.
59
+ * - `level: "low"` (`signalsPresent: 1`) — only the grader's emission;
60
+ * ceiling decomposition produced no structural signal. Not a
61
+ * calibration miss (we have nothing to cross-check against).
62
+ * - `level: "low"` (`signalsPresent: 0`) — passing scores
63
+ * (`>= CLASSIFICATION_THRESHOLD`) don't classify; emit absent.
64
+ *
65
+ * @param judgment - The grader judgment carrying `failureMode` + `score`
34
66
  * @param ceilingScore - The area's ceiling score (with-docs best case)
35
67
  * @param floorScore - The area's floor score (no-docs baseline)
36
- * @returns Classified failure mode with confidence level
68
+ * @returns D0049 Confidence triple stamped `derivation: "ceiling-cross-check"`
69
+ *
70
+ * @see docs/decisions/D0005-grader-model-separation.md
71
+ * @see docs/decisions/D0049-shared-confidence-contract.md
37
72
  */
38
- export declare function classifyFailureMode(judgment: GraderJudgment, ceilingScore: number, floorScore: number): FailureMode;
73
+ export declare function validateFailureMode(judgment: GraderJudgment, ceilingScore: number, floorScore: number): Confidence;
39
74
  /**
40
75
  * Format a failure mode report for console output.
41
76
  */