@sanity/ailf 4.6.0 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. package/canonical/grader-references/agent-harness-tools.yaml +42 -0
  2. package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
  3. package/canonical/grader-references/mcp-server-spec.yaml +51 -0
  4. package/canonical/grader-references/portable-text.yaml +48 -0
  5. package/config/diagnosis-cards.ts +318 -0
  6. package/config/models.ts +12 -0
  7. package/config/rubrics.ts +38 -2
  8. package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
  9. package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
  10. package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
  11. package/dist/_vendor/ailf-core/examples/index.js +146 -47
  12. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
  13. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
  14. package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
  15. package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
  16. package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
  17. package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
  18. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
  19. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
  20. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
  21. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
  22. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
  23. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
  24. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  25. package/dist/_vendor/ailf-core/index.js +4 -0
  26. package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
  27. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
  28. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
  29. package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
  30. package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
  31. package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
  32. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  33. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
  34. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  35. package/dist/_vendor/ailf-core/schemas/index.js +9 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
  40. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
  41. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
  42. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
  43. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
  44. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
  45. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
  46. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
  47. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
  48. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
  49. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
  50. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
  51. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
  52. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
  53. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
  54. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
  55. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
  56. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
  57. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
  58. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
  59. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
  60. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
  61. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
  62. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
  63. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
  64. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
  65. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
  66. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
  67. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
  68. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
  69. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
  70. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
  71. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
  72. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
  73. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
  74. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
  75. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
  76. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
  77. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
  78. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
  79. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
  80. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +50 -0
  81. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +35 -0
  82. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +136 -0
  83. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +153 -0
  84. package/dist/_vendor/ailf-core/services/index.d.ts +6 -0
  85. package/dist/_vendor/ailf-core/services/index.js +18 -0
  86. package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
  87. package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
  88. package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
  89. package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
  90. package/dist/_vendor/ailf-core/types/attribution.js +18 -0
  91. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
  92. package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
  93. package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
  94. package/dist/_vendor/ailf-core/types/confidence.js +7 -0
  95. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +271 -0
  96. package/dist/_vendor/ailf-core/types/diagnosis.js +19 -0
  97. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
  98. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
  99. package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
  100. package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
  101. package/dist/_vendor/ailf-core/types/index.js +15 -1
  102. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
  103. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
  104. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  105. package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
  106. package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
  107. package/dist/adapters/api-client/build-request.d.ts +1 -0
  108. package/dist/adapters/api-client/build-request.js +3 -0
  109. package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
  110. package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
  111. package/dist/adapters/attribution/index.d.ts +9 -0
  112. package/dist/adapters/attribution/index.js +8 -0
  113. package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
  114. package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
  115. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  116. package/dist/adapters/grader-outputs/index.d.ts +10 -0
  117. package/dist/adapters/grader-outputs/index.js +8 -0
  118. package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
  119. package/dist/adapters/grader-outputs/legacy/index.js +10 -0
  120. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
  121. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
  122. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
  123. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
  124. package/dist/adapters/index.d.ts +3 -0
  125. package/dist/adapters/index.js +4 -0
  126. package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
  127. package/dist/adapters/llm/fake-llm-client.js +38 -1
  128. package/dist/adapters/llm/openai-llm-client.js +52 -3
  129. package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
  130. package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
  131. package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
  132. package/dist/adapters/task-sources/repo-schemas.js +19 -2
  133. package/dist/cli-program.js +3 -0
  134. package/dist/commands/calculate-scores.js +1 -1
  135. package/dist/commands/explain-handler.js +1 -1
  136. package/dist/commands/interpret.d.ts +50 -0
  137. package/dist/commands/interpret.js +212 -0
  138. package/dist/commands/lookup-doc.d.ts +1 -1
  139. package/dist/commands/lookup-doc.js +3 -3
  140. package/dist/commands/pipeline-action.d.ts +6 -0
  141. package/dist/commands/pipeline-action.js +2 -0
  142. package/dist/commands/remote-pipeline.js +1 -0
  143. package/dist/composition-root.d.ts +57 -23
  144. package/dist/composition-root.js +155 -41
  145. package/dist/config/diagnosis-cards.ts +318 -0
  146. package/dist/config/models.ts +12 -0
  147. package/dist/config/rubrics.ts +38 -2
  148. package/dist/grader/agent-harness.d.ts +9 -0
  149. package/dist/grader/agent-harness.js +9 -0
  150. package/dist/grader/common.d.ts +9 -0
  151. package/dist/grader/common.js +9 -0
  152. package/dist/grader/index.d.ts +24 -0
  153. package/dist/grader/index.js +24 -0
  154. package/dist/grader/knowledge-probe.d.ts +9 -0
  155. package/dist/grader/knowledge-probe.js +9 -0
  156. package/dist/grader/literacy.d.ts +9 -0
  157. package/dist/grader/literacy.js +9 -0
  158. package/dist/grader/mcp.d.ts +9 -0
  159. package/dist/grader/mcp.js +9 -0
  160. package/dist/orchestration/build-app-context.js +1 -0
  161. package/dist/orchestration/build-step-sequence.js +5 -0
  162. package/dist/orchestration/steps/calculate-scores-step.js +23 -1
  163. package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
  164. package/dist/orchestration/steps/compute-attribution-step.js +279 -0
  165. package/dist/orchestration/steps/gap-analysis-step.js +35 -7
  166. package/dist/orchestration/steps/index.d.ts +1 -0
  167. package/dist/orchestration/steps/index.js +1 -0
  168. package/dist/pipeline/attribution.d.ts +15 -0
  169. package/dist/pipeline/attribution.js +18 -9
  170. package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
  171. package/dist/pipeline/borderline-consensus-runner.js +124 -0
  172. package/dist/pipeline/borderline-detector.d.ts +24 -0
  173. package/dist/pipeline/borderline-detector.js +26 -0
  174. package/dist/pipeline/calculate-scores.d.ts +114 -3
  175. package/dist/pipeline/calculate-scores.js +426 -24
  176. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  177. package/dist/pipeline/compiler/literacy-bridge.js +35 -17
  178. package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
  179. package/dist/pipeline/compiler/rubric-resolution.js +9 -1
  180. package/dist/pipeline/compute-attribution.d.ts +80 -0
  181. package/dist/pipeline/compute-attribution.js +196 -0
  182. package/dist/pipeline/failure-modes.d.ts +52 -17
  183. package/dist/pipeline/failure-modes.js +178 -117
  184. package/dist/pipeline/map-request-to-config.js +1 -0
  185. package/package.json +7 -5
@@ -1,52 +1,47 @@
1
1
  /**
2
2
  * pipeline/failure-modes.ts
3
3
  *
4
- * Keyword-based failure mode classifier for grader reasoning text,
5
- * cross-referenced with ceiling decomposition data.
4
+ * Ceiling-cross-check failure-mode validator + report assembly.
6
5
  *
7
- * Phase 3a of the Scenario Matrix implementation.
6
+ * The grader emits `failureMode` directly under the per-dimension taxonomy
7
+ * (Plan 03-02 — `packages/eval/src/grader/`). This module consumes the
8
+ * grader's emission as the source of truth and uses the surviving ceiling
9
+ * decomposition (`classifyByCeiling`) only as a CONFIDENCE VALIDATOR — it
10
+ * cross-checks the emitted mode against structural score signals and emits
11
+ * a D0049 `Confidence` triple stamped with `derivation: "ceiling-cross-check"`.
8
12
  *
9
- * The classifier uses two signal sources:
10
- * 1. Keyword matching on grader reason text (primary)
11
- * 2. Ceiling decomposition structural signals (supplementary)
13
+ * The legacy keyword-pattern classifier (and its five regex pattern
14
+ * constants) was deleted in Plan 03-03 — its production coverage was ~1%
15
+ * (Doc 03 §"Where classification lives"), and resurrecting it as a fallback
16
+ * is explicitly out of scope.
12
17
  *
13
- * When both sources agree, confidence is boosted. When only ceiling
14
- * signals are available, they serve as a fallback for unclassified cases.
15
- *
16
- * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
18
+ * @see docs/decisions/D0005-grader-model-separation.md — single grader emits
19
+ * failureMode under the per-dimension taxonomy
20
+ * @see docs/decisions/D0049-shared-confidence-contract.md — Confidence triple
21
+ * shape and `ceiling-cross-check` derivation tag
17
22
  */
18
- import { detectFeatureArea } from "../_vendor/ailf-core/index.js";
23
+ import { LEGACY_FAILURE_MODES, detectFeatureArea } from "../_vendor/ailf-core/index.js";
19
24
  // ---------------------------------------------------------------------------
20
25
  // Constants
21
26
  // ---------------------------------------------------------------------------
22
27
  /** Only classify judgments with scores below this threshold */
23
28
  const CLASSIFICATION_THRESHOLD = 60;
24
- /** All failure mode types for initializing empty counts */
25
- const ALL_MODES = [
26
- "api-error",
27
- "incorrect-docs",
28
- "missing-docs",
29
- "model-limitation",
30
- "outdated-docs",
31
- "poor-structure",
32
- "unclassified",
33
- ];
34
- // ---------------------------------------------------------------------------
35
- // Keyword patterns
36
- // ---------------------------------------------------------------------------
37
- /** API error pattern — checked FIRST to prevent timeout errors containing
38
- * "deprecated" from being misclassified as outdated-docs. */
39
- const API_ERROR_PATTERN = /\[api-error\]|timeout|timed out|rate limit|429|503|ECONNRESET|ETIMEDOUT|socket hang up|fetch failed/i;
40
- const OUTDATED_PATTERN = /deprecated|old api|v[0-9]+ syntax|no longer supported|legacy|previous version|outdated|superseded|replaced by/i;
41
- const MISSING_PATTERN = /no documentation|not covered|had to guess|not found|missing.*doc|no.*information|undocumented|couldn't find|without.*documentation/i;
42
- const INCORRECT_PATTERN = /contradicts|incorrect.*doc|doc.*incorrect|wrong.*doc|doc.*wrong|documentation says.*but|factual error|inaccurate|misleading.*doc/i;
43
- const POOR_STRUCTURE_PATTERN = /unclear|ambiguous|couldn't determine|conflicting|confusing|hard to follow|poorly organized|scattered|fragmented/i;
44
29
  // ---------------------------------------------------------------------------
45
30
  // Public API
46
31
  // ---------------------------------------------------------------------------
47
32
  /**
48
33
  * Build a complete failure mode report from grader judgments and scores.
49
34
  *
35
+ * The grader-emitted `judgment.failureMode` is the source of truth (Plan
36
+ * 03-03 — keyword classifier deleted). `validateFailureMode` cross-checks
37
+ * the emission against ceiling decomposition and stamps a D0049 confidence.
38
+ *
39
+ * The `FailureMode` triple shape (`mode`, `confidence`, `source`) is
40
+ * preserved for backward compatibility with downstream consumers
41
+ * (gap-analysis, manifest emission) — the bucketed `confidence` enum maps
42
+ * 1:1 from `Confidence.level`, and `source` is always `"ceiling"` now that
43
+ * the keyword path is gone.
44
+ *
50
45
  * @param judgments - All grader judgments from the evaluation
51
46
  * @param scores - Per-area feature scores (for ceiling decomposition)
52
47
  * @returns Failure mode report with per-area breakdowns
@@ -66,9 +61,23 @@ export function buildFailureModeReport(judgments, scores) {
66
61
  const areaScore = area ? scoreByArea.get(area) : undefined;
67
62
  const ceilingScore = areaScore?.ceilingScore ?? 100;
68
63
  const floorScore = areaScore?.floorScore ?? 0;
69
- const classification = classifyFailureMode(judgment, ceilingScore, floorScore);
64
+ // Source the failure mode from the grader's emission. CR-02:
65
+ // FailureModeType is open-set (`string`) since Plan 03-02
66
+ // introduced per-dimension extensions (`false-floor`,
67
+ // `spec-mismatch`, `tool-misuse`, `factual-error`, …). The report
68
+ // surfaces the emission directly so downstream consumers see the
69
+ // grader's actual taxonomy choice rather than a collapsed
70
+ // `"unclassified"` bucket.
71
+ const emittedMode = readEmittedMode(judgment);
72
+ // Cross-check the grader's emission against ceiling decomposition.
73
+ const stamp = validateFailureMode(judgment, ceilingScore, floorScore);
74
+ const classification = {
75
+ confidence: stamp.level,
76
+ mode: emittedMode,
77
+ source: "ceiling",
78
+ };
70
79
  classifiedJudgments.push({ classification, judgment });
71
- summary[classification.mode]++;
80
+ summary[classification.mode] = (summary[classification.mode] ?? 0) + 1;
72
81
  // Per-area tracking
73
82
  if (area) {
74
83
  if (!byArea[area]) {
@@ -79,7 +88,8 @@ export function buildFailureModeReport(judgments, scores) {
79
88
  totalJudgments: 0,
80
89
  };
81
90
  }
82
- byArea[area].modes[classification.mode]++;
91
+ byArea[area].modes[classification.mode] =
92
+ (byArea[area].modes[classification.mode] ?? 0) + 1;
83
93
  byArea[area].totalJudgments++;
84
94
  }
85
95
  }
@@ -99,28 +109,74 @@ export function buildFailureModeReport(judgments, scores) {
99
109
  };
100
110
  }
101
111
  /**
102
- * Classify the failure mode of a low-scoring grader judgment.
112
+ * Cross-check a grader-emitted `failureMode` against ceiling decomposition
113
+ * and emit a D0049 `Confidence` triple stamped with
114
+ * `derivation: "ceiling-cross-check"`.
103
115
  *
104
- * Uses keyword matching on the reason text, then cross-references with
105
- * ceiling decomposition data for structural confirmation.
116
+ * Replaces the deleted keyword-pattern + ceiling combine classifier — the
117
+ * grader's emission is now the source of truth for the mode itself; this
118
+ * function only stamps confidence based on whether the structural ceiling
119
+ * signal agrees.
106
120
  *
107
- * @param judgment - The grader judgment to classify
121
+ * - `level: "high"` (`signalsPresent: 2`) — grader emission and ceiling
122
+ * decomposition agree on the same mode.
123
+ * - `level: "medium"` (`signalsPresent: 2`) — both signals present but
124
+ * disagree. The live pipeline increments
125
+ * `GraderReliability.failureModeCalibration` ONLY on this branch — a
126
+ * true calibration miss requires both signals to be present.
127
+ * - `level: "low"` (`signalsPresent: 1`) — only the grader's emission;
128
+ * ceiling decomposition produced no structural signal. Not a
129
+ * calibration miss (we have nothing to cross-check against).
130
+ * - `level: "low"` (`signalsPresent: 0`) — passing scores
131
+ * (`>= CLASSIFICATION_THRESHOLD`) don't classify; emit absent.
132
+ *
133
+ * @param judgment - The grader judgment carrying `failureMode` + `score`
108
134
  * @param ceilingScore - The area's ceiling score (with-docs best case)
109
135
  * @param floorScore - The area's floor score (no-docs baseline)
110
- * @returns Classified failure mode with confidence level
136
+ * @returns D0049 Confidence triple stamped `derivation: "ceiling-cross-check"`
137
+ *
138
+ * @see docs/decisions/D0005-grader-model-separation.md
139
+ * @see docs/decisions/D0049-shared-confidence-contract.md
111
140
  */
112
- export function classifyFailureMode(judgment, ceilingScore, floorScore) {
113
- // Passing scores don't need failure mode analysis
141
+ export function validateFailureMode(judgment, ceilingScore, floorScore) {
142
+ // Passing scores don't classify — emit low-confidence absent.
114
143
  if (judgment.score >= CLASSIFICATION_THRESHOLD) {
115
- return { confidence: "low", mode: "unclassified", source: "keyword" };
144
+ return {
145
+ level: "low",
146
+ signalsPresent: 0,
147
+ derivation: "ceiling-cross-check",
148
+ };
116
149
  }
117
- const reason = judgment.reason.toLowerCase();
118
- // Step 1: Keyword-based classification
119
- const keywordMode = classifyByKeyword(reason);
120
- // Step 2: Ceiling-based structural classification
121
150
  const ceilingMode = classifyByCeiling(judgment.score, ceilingScore, floorScore);
122
- // Step 3: Combine signals
123
- return combineClassifications(keywordMode, ceilingMode);
151
+ if (!ceilingMode) {
152
+ // No structural ceiling signal — the grader's emission stands but
153
+ // there's nothing to cross-check against. Surface as low-confidence
154
+ // (signalsPresent: 1) so the caller can distinguish "we have one
155
+ // signal, not two" from "the two signals disagree" — and leave
156
+ // failureModeCalibration alone (folding this case in over-counts
157
+ // the reliability metric, see CR-04 in the Phase 3 review).
158
+ return {
159
+ level: "low",
160
+ signalsPresent: 1,
161
+ derivation: "ceiling-cross-check",
162
+ };
163
+ }
164
+ if (ceilingMode.mode === judgment.failureMode) {
165
+ // Both signals agree → high confidence stamp.
166
+ return {
167
+ level: "high",
168
+ signalsPresent: 2,
169
+ derivation: "ceiling-cross-check",
170
+ };
171
+ }
172
+ // Both signals present and disagree — the actual calibration-miss
173
+ // branch. The caller increments GraderReliability.failureModeCalibration
174
+ // only when signalsPresent === 2 here.
175
+ return {
176
+ level: "medium",
177
+ signalsPresent: 2,
178
+ derivation: "ceiling-cross-check",
179
+ };
124
180
  }
125
181
  // ---------------------------------------------------------------------------
126
182
  // Formatting
@@ -134,10 +190,13 @@ export function formatFailureModesConsole(report) {
134
190
  lines.push("");
135
191
  lines.push(` ${report.totalJudgments} judgments analyzed, ${report.classificationRate.toFixed(0)}% classified`);
136
192
  lines.push("");
137
- // Summary table
193
+ // Summary table — legacy modes first (in canonical order), then any
194
+ // per-dimension extensions present in the run sorted by count desc.
195
+ // CR-02: extensions are no longer narrowed to "unclassified"; the
196
+ // formatter now surfaces them rather than dropping the signal.
138
197
  lines.push(" Mode Count");
139
198
  lines.push(" ────────────────── ─────");
140
- for (const mode of ALL_MODES) {
199
+ for (const mode of orderedSummaryKeys(report.summary)) {
141
200
  const count = report.summary[mode] ?? 0;
142
201
  if (count > 0) {
143
202
  const icon = modeIcon(mode);
@@ -169,10 +228,12 @@ export function formatFailureModesMarkdown(report) {
169
228
  }
170
229
  lines.push(`**${report.totalJudgments} judgments** analyzed, **${report.classificationRate.toFixed(0)}%** classified`);
171
230
  lines.push("");
172
- // Summary table
231
+ // Summary table — legacy modes first, per-dimension extensions after
232
+ // (CR-02 — emission is now visible in aggregation rather than being
233
+ // collapsed to 'unclassified').
173
234
  lines.push("| Mode | Count | % |");
174
235
  lines.push("|------|-------|---|");
175
- for (const mode of ALL_MODES) {
236
+ for (const mode of orderedSummaryKeys(report.summary)) {
176
237
  const count = report.summary[mode] ?? 0;
177
238
  if (count > 0) {
178
239
  const pct = report.totalJudgments > 0
@@ -203,7 +264,31 @@ export function formatFailureModesMarkdown(report) {
203
264
  // ---------------------------------------------------------------------------
204
265
  // Internal helpers
205
266
  // ---------------------------------------------------------------------------
206
- /** Classify by ceiling decomposition structural signals */
267
+ /**
268
+ * Read the grader's emitted failureMode as the open-set
269
+ * `FailureModeType` (string). Per-dimension extensions from Plan 03-02
270
+ * (`false-floor`, `spec-mismatch`, `tool-misuse`, `factual-error`, …)
271
+ * survive the report aggregation as their own buckets — narrowing them
272
+ * to `"unclassified"` (the pre-CR-02 behavior) silently dropped at
273
+ * least 11 documented legal modes from `report.summary` and
274
+ * `report.byArea[*].topMode`. An absent or empty `failureMode` still
275
+ * buckets as `"unclassified"` so consumers see a stable label rather
276
+ * than an empty key.
277
+ */
278
+ function readEmittedMode(judgment) {
279
+ const emitted = judgment.failureMode;
280
+ if (typeof emitted !== "string" || emitted.length === 0) {
281
+ return "unclassified";
282
+ }
283
+ return emitted;
284
+ }
285
+ /**
286
+ * Classify by ceiling-decomposition structural signals — preserved
287
+ * verbatim from the pre-Plan-03-03 implementation. The function itself
288
+ * does not change; only its CALLER (`validateFailureMode`) changes how
289
+ * the output is consumed (confidence stamp instead of parallel
290
+ * classification signal).
291
+ */
207
292
  function classifyByCeiling(score, ceilingScore, floorScore) {
208
293
  const docLift = ceilingScore - floorScore;
209
294
  // Negative Doc Lift: docs are actively harmful
@@ -228,55 +313,6 @@ function classifyByCeiling(score, ceilingScore, floorScore) {
228
313
  }
229
314
  return null;
230
315
  }
231
- /** Classify by keyword matching on the reason text */
232
- function classifyByKeyword(reason) {
233
- // API errors checked first — prevents timeout messages containing
234
- // "deprecated" from being misclassified as outdated-docs.
235
- if (API_ERROR_PATTERN.test(reason)) {
236
- return { confidence: "high", mode: "api-error", source: "keyword" };
237
- }
238
- if (OUTDATED_PATTERN.test(reason)) {
239
- return { confidence: "high", mode: "outdated-docs", source: "keyword" };
240
- }
241
- if (MISSING_PATTERN.test(reason)) {
242
- return { confidence: "high", mode: "missing-docs", source: "keyword" };
243
- }
244
- if (INCORRECT_PATTERN.test(reason)) {
245
- return { confidence: "medium", mode: "incorrect-docs", source: "keyword" };
246
- }
247
- if (POOR_STRUCTURE_PATTERN.test(reason)) {
248
- return { confidence: "medium", mode: "poor-structure", source: "keyword" };
249
- }
250
- return null;
251
- }
252
- /**
253
- * Combine keyword and ceiling classifications.
254
- *
255
- * Priority:
256
- * 1. If both agree on mode → high confidence, source = "keyword+ceiling"
257
- * 2. If keyword matched → use keyword result
258
- * 3. If only ceiling matched → use ceiling result (lower confidence)
259
- * 4. If neither matched → unclassified
260
- */
261
- function combineClassifications(keyword, ceiling) {
262
- if (keyword && ceiling) {
263
- if (keyword.mode === ceiling.mode) {
264
- // Both agree — boost confidence
265
- return {
266
- confidence: "high",
267
- mode: keyword.mode,
268
- source: "keyword+ceiling",
269
- };
270
- }
271
- // Disagree — prefer keyword (it has more signal)
272
- return keyword;
273
- }
274
- if (keyword)
275
- return keyword;
276
- if (ceiling)
277
- return ceiling;
278
- return { confidence: "low", mode: "unclassified", source: "keyword" };
279
- }
280
316
  /**
281
317
  * Resolve area name from a task ID or description.
282
318
  *
@@ -310,15 +346,22 @@ function resolveArea(taskId, scoreByArea) {
310
346
  }
311
347
  return undefined;
312
348
  }
313
- /** Find the most common failure mode */
349
+ /**
350
+ * Find the most common failure mode in the per-area `modes` record.
351
+ *
352
+ * Iterates every key in the record (CR-02 — record is open-set since
353
+ * Plan 03-02 introduced per-dimension extensions) and picks the
354
+ * highest-count classified mode. Falls back to "unclassified" when
355
+ * the area has no classified emissions at all.
356
+ */
314
357
  function findTopMode(modes) {
315
358
  let topMode = "unclassified";
316
359
  let topCount = 0;
317
- for (const mode of ALL_MODES) {
360
+ for (const [mode, count] of Object.entries(modes)) {
318
361
  if (mode === "unclassified")
319
362
  continue; // Prefer classified modes
320
- if ((modes[mode] ?? 0) > topCount) {
321
- topCount = modes[mode];
363
+ if (count > topCount) {
364
+ topCount = count;
322
365
  topMode = mode;
323
366
  }
324
367
  }
@@ -327,19 +370,33 @@ function findTopMode(modes) {
327
370
  return "unclassified";
328
371
  return topMode;
329
372
  }
330
- /** Initialize mode counts to zero */
373
+ /**
374
+ * Initialize the per-area / per-summary mode-count record.
375
+ *
376
+ * Pre-allocates buckets for the legacy literacy modes (CR-02 — keeps
377
+ * stable presence for downstream consumers like Studio columns) and
378
+ * leaves per-dimension extensions to be added on first emission.
379
+ */
331
380
  function initModeCounts() {
332
- return {
333
- "api-error": 0,
334
- "incorrect-docs": 0,
335
- "missing-docs": 0,
336
- "model-limitation": 0,
337
- "outdated-docs": 0,
338
- "poor-structure": 0,
339
- unclassified: 0,
340
- };
381
+ const counts = {};
382
+ for (const mode of LEGACY_FAILURE_MODES) {
383
+ counts[mode] = 0;
384
+ }
385
+ return counts;
386
+ }
387
+ /** Stable display order for the summary tables — legacy first, then extensions. */
388
+ function orderedSummaryKeys(summary) {
389
+ const legacy = LEGACY_FAILURE_MODES.filter((m) => m in summary);
390
+ const extensions = Object.keys(summary)
391
+ .filter((m) => !isLegacyMode(m))
392
+ .sort((a, b) => (summary[b] ?? 0) - (summary[a] ?? 0));
393
+ return [...legacy, ...extensions];
394
+ }
395
+ const LEGACY_MODE_SET = new Set(LEGACY_FAILURE_MODES);
396
+ function isLegacyMode(mode) {
397
+ return LEGACY_MODE_SET.has(mode);
341
398
  }
342
- /** Get icon for a failure mode */
399
+ /** Get icon for a failure mode — legacy modes have dedicated icons; extensions fall back to a neutral marker. */
343
400
  function modeIcon(mode) {
344
401
  switch (mode) {
345
402
  case "api-error":
@@ -356,5 +413,9 @@ function modeIcon(mode) {
356
413
  return "🏗️";
357
414
  case "unclassified":
358
415
  return "❓";
416
+ default:
417
+ // Per-dimension extensions (Plan 03-02) — neutral icon, the mode
418
+ // name in the table still identifies the family.
419
+ return "•";
359
420
  }
360
421
  }
@@ -54,6 +54,7 @@ export function mapRequestToConfig(request, rootDir) {
54
54
  noRemoteCache: request.noRemoteCache ?? false,
55
55
  graderContext: request.graderContext,
56
56
  graderReplications: request.graderReplications,
57
+ borderlineReplications: request.borderlineReplications,
57
58
  urls: request.urls,
58
59
  headers: request.headers,
59
60
  allowedOrigins: request.allowedOrigins,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "4.6.0",
3
+ "version": "6.0.0",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"
@@ -52,15 +52,16 @@
52
52
  "@types/js-yaml": "^4.0.9",
53
53
  "@types/node": "^22.13.1",
54
54
  "nock": "^14.0.13",
55
+ "simple-statistics": "7.8.9",
55
56
  "tsx": "^4.19.2",
56
57
  "typescript": "^5.7.3",
57
58
  "vitest": "^4.1.5",
58
- "@sanity/ailf-shared": "0.1.0",
59
- "@sanity/ailf-core": "0.1.0"
59
+ "@sanity/ailf-core": "0.1.0",
60
+ "@sanity/ailf-shared": "0.1.0"
60
61
  },
61
62
  "scripts": {
62
- "build": "tsc && tsx scripts/bundle-workspace-deps.ts",
63
- "generate-configs": "tsx src/cli.ts generate-configs",
63
+ "build": "tsc && tsc -p tsconfig.scripts.json && tsx scripts/bundle-workspace-deps.ts",
64
+ "generate-configs": "tsx src/cli.ts generate-configs && tsx scripts/generate-diagnosis-config.ts",
64
65
  "fetch-docs": "tsx src/cli.ts fetch-docs",
65
66
  "measure-retrieval": "tsx src/cli.ts measure-retrieval",
66
67
  "eval": "tsx src/cli.ts eval",
@@ -77,6 +78,7 @@
77
78
  "pipeline": "tsx src/cli.ts pipeline",
78
79
  "validate": "tsx src/cli.ts validate config",
79
80
  "test": "vitest run",
81
+ "test:compiler": "AILF_E2E=1 vitest run src/pipeline/compiler/__tests__",
80
82
  "test:e2e": "AILF_E2E=1 vitest run src/__tests__/e2e",
81
83
  "test:e2e:adapters": "AILF_E2E=1 vitest run src/adapters",
82
84
  "test:e2e:api": "AILF_E2E_API=1 vitest run src/__tests__/api-tier2-tenant-integration.test.ts src/__tests__/gcs-artifact-writer-roundtrip.test.ts",