@sanity/ailf 4.6.0 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. package/canonical/grader-references/agent-harness-tools.yaml +42 -0
  2. package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
  3. package/canonical/grader-references/mcp-server-spec.yaml +51 -0
  4. package/canonical/grader-references/portable-text.yaml +48 -0
  5. package/config/diagnosis-cards.ts +318 -0
  6. package/config/models.ts +12 -0
  7. package/config/rubrics.ts +38 -2
  8. package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
  9. package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
  10. package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
  11. package/dist/_vendor/ailf-core/examples/index.js +146 -47
  12. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
  13. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
  14. package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
  15. package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
  16. package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
  17. package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
  18. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
  19. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
  20. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
  21. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
  22. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
  23. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
  24. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  25. package/dist/_vendor/ailf-core/index.js +4 -0
  26. package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
  27. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
  28. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
  29. package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
  30. package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
  31. package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
  32. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  33. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
  34. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  35. package/dist/_vendor/ailf-core/schemas/index.js +9 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
  40. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
  41. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
  42. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
  43. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
  44. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
  45. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
  46. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
  47. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
  48. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
  49. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
  50. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
  51. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
  52. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
  53. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
  54. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
  55. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
  56. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
  57. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
  58. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
  59. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
  60. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
  61. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
  62. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
  63. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
  64. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
  65. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
  66. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
  67. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
  68. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
  69. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
  70. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
  71. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
  72. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
  73. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
  74. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
  75. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
  76. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
  77. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
  78. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
  79. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
  80. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +50 -0
  81. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +35 -0
  82. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +136 -0
  83. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +153 -0
  84. package/dist/_vendor/ailf-core/services/index.d.ts +6 -0
  85. package/dist/_vendor/ailf-core/services/index.js +18 -0
  86. package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
  87. package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
  88. package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
  89. package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
  90. package/dist/_vendor/ailf-core/types/attribution.js +18 -0
  91. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
  92. package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
  93. package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
  94. package/dist/_vendor/ailf-core/types/confidence.js +7 -0
  95. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +271 -0
  96. package/dist/_vendor/ailf-core/types/diagnosis.js +19 -0
  97. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
  98. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
  99. package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
  100. package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
  101. package/dist/_vendor/ailf-core/types/index.js +15 -1
  102. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
  103. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
  104. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  105. package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
  106. package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
  107. package/dist/adapters/api-client/build-request.d.ts +1 -0
  108. package/dist/adapters/api-client/build-request.js +3 -0
  109. package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
  110. package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
  111. package/dist/adapters/attribution/index.d.ts +9 -0
  112. package/dist/adapters/attribution/index.js +8 -0
  113. package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
  114. package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
  115. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  116. package/dist/adapters/grader-outputs/index.d.ts +10 -0
  117. package/dist/adapters/grader-outputs/index.js +8 -0
  118. package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
  119. package/dist/adapters/grader-outputs/legacy/index.js +10 -0
  120. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
  121. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
  122. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
  123. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
  124. package/dist/adapters/index.d.ts +3 -0
  125. package/dist/adapters/index.js +4 -0
  126. package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
  127. package/dist/adapters/llm/fake-llm-client.js +38 -1
  128. package/dist/adapters/llm/openai-llm-client.js +52 -3
  129. package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
  130. package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
  131. package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
  132. package/dist/adapters/task-sources/repo-schemas.js +19 -2
  133. package/dist/cli-program.js +3 -0
  134. package/dist/commands/calculate-scores.js +1 -1
  135. package/dist/commands/explain-handler.js +1 -1
  136. package/dist/commands/interpret.d.ts +50 -0
  137. package/dist/commands/interpret.js +212 -0
  138. package/dist/commands/lookup-doc.d.ts +1 -1
  139. package/dist/commands/lookup-doc.js +3 -3
  140. package/dist/commands/pipeline-action.d.ts +6 -0
  141. package/dist/commands/pipeline-action.js +2 -0
  142. package/dist/commands/remote-pipeline.js +1 -0
  143. package/dist/composition-root.d.ts +57 -23
  144. package/dist/composition-root.js +155 -41
  145. package/dist/config/diagnosis-cards.ts +318 -0
  146. package/dist/config/models.ts +12 -0
  147. package/dist/config/rubrics.ts +38 -2
  148. package/dist/grader/agent-harness.d.ts +9 -0
  149. package/dist/grader/agent-harness.js +9 -0
  150. package/dist/grader/common.d.ts +9 -0
  151. package/dist/grader/common.js +9 -0
  152. package/dist/grader/index.d.ts +24 -0
  153. package/dist/grader/index.js +24 -0
  154. package/dist/grader/knowledge-probe.d.ts +9 -0
  155. package/dist/grader/knowledge-probe.js +9 -0
  156. package/dist/grader/literacy.d.ts +9 -0
  157. package/dist/grader/literacy.js +9 -0
  158. package/dist/grader/mcp.d.ts +9 -0
  159. package/dist/grader/mcp.js +9 -0
  160. package/dist/orchestration/build-app-context.js +1 -0
  161. package/dist/orchestration/build-step-sequence.js +5 -0
  162. package/dist/orchestration/steps/calculate-scores-step.js +23 -1
  163. package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
  164. package/dist/orchestration/steps/compute-attribution-step.js +279 -0
  165. package/dist/orchestration/steps/gap-analysis-step.js +35 -7
  166. package/dist/orchestration/steps/index.d.ts +1 -0
  167. package/dist/orchestration/steps/index.js +1 -0
  168. package/dist/pipeline/attribution.d.ts +15 -0
  169. package/dist/pipeline/attribution.js +18 -9
  170. package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
  171. package/dist/pipeline/borderline-consensus-runner.js +124 -0
  172. package/dist/pipeline/borderline-detector.d.ts +24 -0
  173. package/dist/pipeline/borderline-detector.js +26 -0
  174. package/dist/pipeline/calculate-scores.d.ts +114 -3
  175. package/dist/pipeline/calculate-scores.js +426 -24
  176. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  177. package/dist/pipeline/compiler/literacy-bridge.js +35 -17
  178. package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
  179. package/dist/pipeline/compiler/rubric-resolution.js +9 -1
  180. package/dist/pipeline/compute-attribution.d.ts +80 -0
  181. package/dist/pipeline/compute-attribution.js +196 -0
  182. package/dist/pipeline/failure-modes.d.ts +52 -17
  183. package/dist/pipeline/failure-modes.js +178 -117
  184. package/dist/pipeline/map-request-to-config.js +1 -0
  185. package/package.json +7 -5
@@ -11,6 +11,15 @@
11
11
 
12
12
  import { defineRubrics } from "../_vendor/ailf-core/index.js"
13
13
 
14
+ // Plan 03-02 — per-dimension failure-mode taxonomies stamped onto each
15
+ // template entry below. Canonical data lives in @sanity/ailf-core (re-exported by packages/eval/src/grader/);
16
+ // the helper picks the right list by dimension family.
17
+ import { failureModesForDimension } from "../grader/index.js"
18
+ // Single source of truth for the wire-format version stamped into the
19
+ // grader-prompt footer (VER-01 D-02). Interpolated below so the
20
+ // announced version cannot drift from the schema's expected value.
21
+ import { graderJudgmentsVersion } from "../adapters/grader-outputs/index.js"
22
+
14
23
  export default defineRubrics({
15
24
  templates: {
16
25
  // ── Core literacy dimensions ────────────────────────────
@@ -25,6 +34,7 @@ export default defineRubrics({
25
34
  "100: Fully functional code — works as expected",
26
35
  ],
27
36
  criteria_label: "Must demonstrate:",
37
+ failureModes: failureModesForDimension("task-completion"),
28
38
  },
29
39
  "code-correctness": {
30
40
  dimension: "code-correctness",
@@ -37,6 +47,7 @@ export default defineRubrics({
37
47
  "100: Follows all best practices, idiomatic implementation",
38
48
  ],
39
49
  criteria_label: "Check for:",
50
+ failureModes: failureModesForDimension("code-correctness"),
40
51
  },
41
52
  "doc-coverage": {
42
53
  dimension: "doc-coverage",
@@ -48,6 +59,7 @@ export default defineRubrics({
48
59
  "80: Minor gaps — almost everything was documented",
49
60
  "100: Complete coverage — all necessary info was in docs",
50
61
  ],
62
+ failureModes: failureModesForDimension("doc-coverage"),
51
63
  },
52
64
 
53
65
  // ── MCP server dimensions ───────────────────────────────
@@ -62,6 +74,7 @@ export default defineRubrics({
62
74
  "100: Perfect tool inputs — all parameters correct and well-formed",
63
75
  ],
64
76
  criteria_label: "Evaluate:",
77
+ failureModes: failureModesForDimension("input-validation"),
65
78
  },
66
79
  "mcp-output-correctness": {
67
80
  dimension: "output-correctness",
@@ -74,6 +87,7 @@ export default defineRubrics({
74
87
  "100: Perfect output handling — correctly interpreted all tool responses",
75
88
  ],
76
89
  criteria_label: "Check for:",
90
+ failureModes: failureModesForDimension("output-correctness"),
77
91
  },
78
92
  "mcp-error-handling": {
79
93
  dimension: "error-handling",
@@ -86,6 +100,7 @@ export default defineRubrics({
86
100
  "100: Excellent — handled all errors appropriately with clear messaging",
87
101
  ],
88
102
  criteria_label: "Evaluate:",
103
+ failureModes: failureModesForDimension("error-handling"),
89
104
  },
90
105
  "mcp-security": {
91
106
  dimension: "security",
@@ -98,6 +113,7 @@ export default defineRubrics({
98
113
  "100: Perfect security — only used authorized tools with safe inputs",
99
114
  ],
100
115
  criteria_label: "Check for:",
116
+ failureModes: failureModesForDimension("security"),
101
117
  },
102
118
 
103
119
  // ── Knowledge probe dimensions ──────────────────────────
@@ -112,6 +128,7 @@ export default defineRubrics({
112
128
  "100: Fully correct — all statements are accurate and verifiable",
113
129
  ],
114
130
  criteria_label: "Verify:",
131
+ failureModes: failureModesForDimension("factual-correctness"),
115
132
  },
116
133
  completeness: {
117
134
  dimension: "completeness",
@@ -124,6 +141,7 @@ export default defineRubrics({
124
141
  "100: Comprehensive — thorough coverage of all important aspects",
125
142
  ],
126
143
  criteria_label: "Check coverage of:",
144
+ failureModes: failureModesForDimension("completeness"),
127
145
  },
128
146
  currency: {
129
147
  dimension: "currency",
@@ -136,6 +154,7 @@ export default defineRubrics({
136
154
  "100: Fully current — references latest APIs, patterns, and best practices",
137
155
  ],
138
156
  criteria_label: "Check for:",
157
+ failureModes: failureModesForDimension("currency"),
139
158
  },
140
159
 
141
160
  // ── Agent harness dimensions ────────────────────────────
@@ -151,6 +170,7 @@ export default defineRubrics({
151
170
  "100: Excellent process — optimal tool usage, clear planning, graceful recovery",
152
171
  ],
153
172
  criteria_label: "Evaluate:",
173
+ failureModes: failureModesForDimension("process-quality"),
154
174
  },
155
175
  "agent-output": {
156
176
  dimension: "agent-output",
@@ -163,6 +183,7 @@ export default defineRubrics({
163
183
  "100: Excellent output — fully correct, clean, and complete",
164
184
  ],
165
185
  criteria_label: "Check for:",
186
+ failureModes: failureModesForDimension("agent-output"),
166
187
  },
167
188
  "agent-tool-usage": {
168
189
  dimension: "tool-usage",
@@ -175,6 +196,7 @@ export default defineRubrics({
175
196
  "100: Excellent — optimal tool selection, correct inputs, minimal redundancy",
176
197
  ],
177
198
  criteria_label: "Evaluate:",
199
+ failureModes: failureModesForDimension("tool-usage"),
178
200
  },
179
201
  },
180
202
 
@@ -220,6 +242,20 @@ export default defineRubrics({
220
242
  "agent-harness": { gold: "agent-harness" },
221
243
  },
222
244
 
223
- footer:
224
- 'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
245
+ // Phase 3 GRAD-05 (Plan 03-01) — structured GraderJudgment JSON sketch.
246
+ // Documents the target wire format the grader emits. The strict schema's
247
+ // GRAD-02 additive fields stay optional in this plan; Plan 03-04 flips
248
+ // them to required and bumps graderJudgmentsVersion to 1.0.0.
249
+ footer: `Return ONLY a JSON object with this exact shape:
250
+ {
251
+ "judgmentId": "<string>",
252
+ "score": <number 0-100>,
253
+ "reason": "<explanation, ≤500 chars>",
254
+ "subJudgments": [{ "criterionId": "<id>", "met": <bool>, "evidence": "<≤280 chars>", "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" } }],
255
+ "docCitations": [{ "documentId": "<id>", "slug": "<optional slug>", "role": "supports|contradicts|missing|irrelevant", "hallucinated": <bool> }],
256
+ "failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
257
+ "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" },
258
+ "hallucinationCheckedAgainst": ["<doc id>"],
259
+ "metadata": { "graderModel": "<string>", "graderJudgmentsVersion": "${graderJudgmentsVersion}" }
260
+ }`,
225
261
  })
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Agent-harness failure modes — re-export shim (D-05).
3
+ *
4
+ * Canonical data relocated to @sanity/ailf-core.
5
+ * Existing callers of this file continue to work unchanged.
6
+ *
7
+ * @see packages/core/src/grader/failure-modes/agent-harness.ts
8
+ */
9
+ export { AGENT_FAILURE_MODES, type AgentFailureMode } from "../_vendor/ailf-core/index.d.ts";
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Agent-harness failure modes — re-export shim (D-05).
3
+ *
4
+ * Canonical data relocated to @sanity/ailf-core.
5
+ * Existing callers of this file continue to work unchanged.
6
+ *
7
+ * @see packages/core/src/grader/failure-modes/agent-harness.ts
8
+ */
9
+ export { AGENT_FAILURE_MODES } from "../_vendor/ailf-core/index.js";
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Cross-cutting failure modes — re-export shim (D-05).
3
+ *
4
+ * Canonical data relocated to @sanity/ailf-core.
5
+ * Existing callers of this file continue to work unchanged.
6
+ *
7
+ * @see packages/core/src/grader/failure-modes/common.ts
8
+ */
9
+ export { COMMON_FAILURE_MODES, type CommonFailureMode } from "../_vendor/ailf-core/index.d.ts";
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Cross-cutting failure modes — re-export shim (D-05).
3
+ *
4
+ * Canonical data relocated to @sanity/ailf-core.
5
+ * Existing callers of this file continue to work unchanged.
6
+ *
7
+ * @see packages/core/src/grader/failure-modes/common.ts
8
+ */
9
+ export { COMMON_FAILURE_MODES } from "../_vendor/ailf-core/index.js";
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Per-dimension failure-mode taxonomy barrel.
3
+ *
4
+ * D-05: taxonomy data relocated to @sanity/ailf-core so card files in
5
+ * packages/core/src/services/diagnosis/cards/ can import without violating
6
+ * the core→eval import direction rule.
7
+ *
8
+ * This file is now a re-export shim — all behavior lives in
9
+ * packages/core/src/grader/failure-modes/. Existing eval-side callers
10
+ * (rubrics.ts, rubric-resolution.ts, calibration.test.ts) continue to
11
+ * work with zero source changes.
12
+ *
13
+ * Named re-exports only (W0124 — never `export *`).
14
+ *
15
+ * Consumers:
16
+ * - `packages/eval/config/rubrics.ts` — calls `failureModesForDimension()`
17
+ * - `packages/eval/src/pipeline/compiler/rubric-resolution.ts`
18
+ * - `packages/eval/src/grader/__tests__/calibration.test.ts`
19
+ *
20
+ * @see packages/core/src/grader/failure-modes/index.ts — canonical location
21
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
22
+ * @see docs/decisions/D0005-grader-model-separation.md
23
+ */
24
+ export { AGENT_FAILURE_MODES, CANONICAL_DIMENSIONS, COMMON_FAILURE_MODES, KP_FAILURE_MODES, LITERACY_FAILURE_MODES, MCP_FAILURE_MODES, failureModesForDimension, isCanonicalFailureMode, type AgentFailureMode, type CommonFailureMode, type KPFailureMode, type LiteracyFailureMode, type MCPFailureMode, } from "../_vendor/ailf-core/index.d.ts";
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Per-dimension failure-mode taxonomy barrel.
3
+ *
4
+ * D-05: taxonomy data relocated to @sanity/ailf-core so card files in
5
+ * packages/core/src/services/diagnosis/cards/ can import without violating
6
+ * the core→eval import direction rule.
7
+ *
8
+ * This file is now a re-export shim — all behavior lives in
9
+ * packages/core/src/grader/failure-modes/. Existing eval-side callers
10
+ * (rubrics.ts, rubric-resolution.ts, calibration.test.ts) continue to
11
+ * work with zero source changes.
12
+ *
13
+ * Named re-exports only (W0124 — never `export *`).
14
+ *
15
+ * Consumers:
16
+ * - `packages/eval/config/rubrics.ts` — calls `failureModesForDimension()`
17
+ * - `packages/eval/src/pipeline/compiler/rubric-resolution.ts`
18
+ * - `packages/eval/src/grader/__tests__/calibration.test.ts`
19
+ *
20
+ * @see packages/core/src/grader/failure-modes/index.ts — canonical location
21
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
22
+ * @see docs/decisions/D0005-grader-model-separation.md
23
+ */
24
+ export { AGENT_FAILURE_MODES, CANONICAL_DIMENSIONS, COMMON_FAILURE_MODES, KP_FAILURE_MODES, LITERACY_FAILURE_MODES, MCP_FAILURE_MODES, failureModesForDimension, isCanonicalFailureMode, } from "../_vendor/ailf-core/index.js";
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Knowledge-probe failure modes — re-export shim (D-05).
3
+ *
4
+ * Canonical data relocated to @sanity/ailf-core.
5
+ * Existing callers of this file continue to work unchanged.
6
+ *
7
+ * @see packages/core/src/grader/failure-modes/knowledge-probe.ts
8
+ */
9
+ export { KP_FAILURE_MODES, type KPFailureMode } from "../_vendor/ailf-core/index.d.ts";
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Knowledge-probe failure modes — re-export shim (D-05).
3
+ *
4
+ * Canonical data relocated to @sanity/ailf-core.
5
+ * Existing callers of this file continue to work unchanged.
6
+ *
7
+ * @see packages/core/src/grader/failure-modes/knowledge-probe.ts
8
+ */
9
+ export { KP_FAILURE_MODES } from "../_vendor/ailf-core/index.js";
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Literacy failure modes — re-export shim (D-05).
3
+ *
4
+ * Canonical data relocated to @sanity/ailf-core.
5
+ * Existing callers of this file continue to work unchanged.
6
+ *
7
+ * @see packages/core/src/grader/failure-modes/literacy.ts
8
+ */
9
+ export { LITERACY_FAILURE_MODES, type LiteracyFailureMode, } from "../_vendor/ailf-core/index.d.ts";
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Literacy failure modes — re-export shim (D-05).
3
+ *
4
+ * Canonical data relocated to @sanity/ailf-core.
5
+ * Existing callers of this file continue to work unchanged.
6
+ *
7
+ * @see packages/core/src/grader/failure-modes/literacy.ts
8
+ */
9
+ export { LITERACY_FAILURE_MODES, } from "../_vendor/ailf-core/index.js";
@@ -0,0 +1,9 @@
1
+ /**
2
+ * MCP failure modes — re-export shim (D-05).
3
+ *
4
+ * Canonical data relocated to @sanity/ailf-core.
5
+ * Existing callers of this file continue to work unchanged.
6
+ *
7
+ * @see packages/core/src/grader/failure-modes/mcp.ts
8
+ */
9
+ export { MCP_FAILURE_MODES, type MCPFailureMode } from "../_vendor/ailf-core/index.d.ts";
@@ -0,0 +1,9 @@
1
+ /**
2
+ * MCP failure modes — re-export shim (D-05).
3
+ *
4
+ * Canonical data relocated to @sanity/ailf-core.
5
+ * Existing callers of this file continue to work unchanged.
6
+ *
7
+ * @see packages/core/src/grader/failure-modes/mcp.ts
8
+ */
9
+ export { MCP_FAILURE_MODES } from "../_vendor/ailf-core/index.js";
@@ -49,6 +49,7 @@ export function mapToResolvedConfig(opts, rootDir) {
49
49
  noCache: opts.noCache,
50
50
  noRemoteCache: opts.noRemoteCache,
51
51
  graderReplications: opts.graderReplications,
52
+ borderlineReplications: opts.borderlineReplications,
52
53
  graderContext: opts.graderContext,
53
54
  outputDir: opts.outputDir,
54
55
  outputPath: opts.outputPath,
@@ -8,6 +8,7 @@
8
8
  import { LiteracyVariant } from "../pipeline/normalize-mode.js";
9
9
  import { CallbackStep } from "./steps/callback-step.js";
10
10
  import { CalculateScoresStep } from "./steps/calculate-scores-step.js";
11
+ import { ComputeAttributionStep } from "./steps/compute-attribution-step.js";
11
12
  import { CompareStep } from "./steps/compare-step.js";
12
13
  import { FetchDocsStep } from "./steps/fetch-docs-step.js";
13
14
  import { FinalizeRunStep } from "./steps/finalize-run-step.js";
@@ -75,6 +76,10 @@ export function buildStepSequence(ctx, pipelineStart = Date.now()) {
75
76
  if (config.gapAnalysisEnabled) {
76
77
  steps.push(new GapAnalysisStep());
77
78
  }
79
+ // Step 4b2: Per-judgment attribution ensemble (default-on).
80
+ // Depends on documentManifest being enriched onto score-summary.json
81
+ // by gap-analysis. Skipped silently when upstream files are missing.
82
+ steps.push(new ComputeAttributionStep());
78
83
  // Step 4c: Finalize the run — write `runs/{runId}/manifest.json` with the
79
84
  // catalog of artifacts produced so far. Skipped silently when no
80
85
  // artifactWriter is wired (D0032).
@@ -14,6 +14,8 @@ import { buildCacheContext } from "../cache-context.js";
14
14
  import { calculateAndWriteScores } from "../../pipeline/calculate-scores.js";
15
15
  import { checkResultsExist, checkScoreSummaryValid, } from "../../pipeline/checks.js";
16
16
  import { resultsFileForMode } from "../../pipeline/eval-constants.js";
17
+ import { gradeOnce, loadGraderModel } from "../../pipeline/grader-api.js";
18
+ import { createBorderlineConsensusRunner } from "../../composition-root.js";
17
19
  import { loadPreflightScoring } from "../../pipeline/preflight/load-preflight-scoring.js";
18
20
  import { loadSource } from "../../sources.js";
19
21
  import { uploadTestOutputs } from "../../pipeline/upload-test-outputs.js";
@@ -85,10 +87,30 @@ export class CalculateScoresStep {
85
87
  ctx.logger.warn(`[warn] W0198 preflight: failed to load preflight-scoring config — ${err instanceof Error ? err.message : String(err)}`);
86
88
  return undefined;
87
89
  });
90
+ // CR-01 — wire the borderline-consensus runner end-to-end. The
91
+ // composition root owns the threshold + replication defaults; the
92
+ // orchestration step supplies the regrade entry point (gradeOnce
93
+ // against the configured grader model). The grader model is resolved
94
+ // up front, but regrading is deferred — when no judgments are extracted
95
+ // (or none land in the ±5 borderline band), the runner short-circuits without calling gradeOnce.
96
+ let borderlineRegradeOnce;
97
+ try {
98
+ const grader = loadGraderModel(ctx.config.rootDir);
99
+ borderlineRegradeOnce = (responseText, rubricText) => gradeOnce(grader.id, responseText, rubricText, ctx.logger);
100
+ }
101
+ catch (err) {
102
+ ctx.logger.warn(`[warn] borderline consensus skipped — grader model not loadable: ${err instanceof Error ? err.message : String(err)}`);
103
+ }
104
+ const borderlineConsensusRunner = createBorderlineConsensusRunner(ctx.config.borderlineReplications !== undefined
105
+ ? { borderlineReplications: ctx.config.borderlineReplications }
106
+ : {});
88
107
  let belowCritical = [];
89
108
  try {
90
- const result = calculateAndWriteScores({
109
+ const result = await calculateAndWriteScores({
91
110
  allowedOrigins: ctx.config.allowedOrigins,
111
+ ...(borderlineRegradeOnce
112
+ ? { borderlineConsensusRunner, borderlineRegradeOnce }
113
+ : {}),
92
114
  logger: ctx.logger,
93
115
  // Pass the variant for literacy (scoring uses it to decide
94
116
  // whether to read agentic results), or mode for other modes
@@ -0,0 +1,44 @@
1
+ /**
2
+ * Pipeline step: Per-judgment attribution ensemble (v0).
3
+ *
4
+ * Reads `grader-judgments.json` and `score-summary.json` from the latest
5
+ * results, calls the pure `computeJudgmentAttribution(...)` helper for each
6
+ * judgment, and emits:
7
+ *
8
+ * - One `perEntryAttribution` artifact per judgment at
9
+ * `runs/{runId}/attribution/{entryKey}.json`
10
+ * - One `attributionMeta` artifact at
11
+ * `runs/{runId}/attribution-meta.json`
12
+ *
13
+ * Additionally, when any hallucinated citations are detected, the step
14
+ * atomically rewrites `score-summary.json` to persist
15
+ * `graderReliability.hallucinationCount` (D-05 — only this one direct-
16
+ * mutation path uses the temp+rename pattern; all artifact emissions go
17
+ * through `ctx.artifactWriter.emit` which handles atomicity internally).
18
+ *
19
+ * This step is `optional: true` — it self-skips when either
20
+ * `grader-judgments.json` or `score-summary.json` is missing, so
21
+ * non-graded runs are unaffected.
22
+ *
23
+ * Task → judgment join (D-10): `judgment.taskId` is the promptfoo row
24
+ * description, which for literacy mode is `"${task.title} (gold|baseline)"`.
25
+ * The join strips the variant suffix and looks up in a triple-keyed cache
26
+ * by `task.title` (primary), `task.description`, and `task.id` (defensive
27
+ * fallbacks for non-literacy modes).
28
+ *
29
+ * Retrieved signal (D-11/D-12): `FeatureAgentBehavior.feature` is the join
30
+ * key — for literacy mode it equals `task.area` exactly (compiler propagates
31
+ * `task.area → __featureArea → ab.feature`).
32
+ *
33
+ * @see docs/decisions/D0033-unified-artifact-writer.md
34
+ * @see docs/decisions/D0049-shared-confidence-contract.md
35
+ * @see docs/decisions/D0050-per-entry-attribution-layout.md
36
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
37
+ */
38
+ import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
39
+ export declare class ComputeAttributionStep implements PipelineStep {
40
+ readonly name = "compute-attribution";
41
+ readonly optional = true;
42
+ check(ctx: AppContext): ValidationIssue[];
43
+ execute(ctx: AppContext, _state?: unknown): Promise<StepResult>;
44
+ }