@sanity/ailf 4.5.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/canonical/grader-references/agent-harness-tools.yaml +42 -0
  2. package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
  3. package/canonical/grader-references/mcp-server-spec.yaml +51 -0
  4. package/canonical/grader-references/portable-text.yaml +48 -0
  5. package/config/rubrics.ts +38 -2
  6. package/dist/_vendor/ailf-core/artifact-registry.d.ts +197 -2
  7. package/dist/_vendor/ailf-core/artifact-registry.js +419 -5
  8. package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
  9. package/dist/_vendor/ailf-core/examples/index.js +146 -47
  10. package/dist/_vendor/ailf-core/ports/context.d.ts +26 -0
  11. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  12. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  13. package/dist/_vendor/ailf-core/ports/llm-client.d.ts +112 -0
  14. package/dist/_vendor/ailf-core/ports/llm-client.js +68 -0
  15. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
  16. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
  17. package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
  18. package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
  19. package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
  20. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  21. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
  22. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  23. package/dist/_vendor/ailf-core/schemas/index.js +9 -0
  24. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  25. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
  26. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
  27. package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
  28. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
  29. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
  30. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
  31. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
  32. package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
  33. package/dist/_vendor/ailf-core/services/index.js +5 -0
  34. package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
  35. package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
  36. package/dist/_vendor/ailf-core/types/attribution.js +18 -0
  37. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
  38. package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
  39. package/dist/_vendor/ailf-core/types/confidence.d.ts +68 -0
  40. package/dist/_vendor/ailf-core/types/confidence.js +56 -0
  41. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
  42. package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
  43. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
  44. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
  45. package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
  46. package/dist/_vendor/ailf-core/types/index.d.ts +82 -29
  47. package/dist/_vendor/ailf-core/types/index.js +16 -1
  48. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
  49. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
  50. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  51. package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
  52. package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
  53. package/dist/adapters/api-client/build-request.d.ts +1 -0
  54. package/dist/adapters/api-client/build-request.js +3 -0
  55. package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
  56. package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
  57. package/dist/adapters/attribution/index.d.ts +9 -0
  58. package/dist/adapters/attribution/index.js +8 -0
  59. package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
  60. package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
  61. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  62. package/dist/adapters/grader-outputs/index.d.ts +10 -0
  63. package/dist/adapters/grader-outputs/index.js +8 -0
  64. package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
  65. package/dist/adapters/grader-outputs/legacy/index.js +10 -0
  66. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
  67. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
  68. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
  69. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
  70. package/dist/adapters/index.d.ts +3 -0
  71. package/dist/adapters/index.js +4 -0
  72. package/dist/adapters/llm/anthropic-llm-client.d.ts +48 -0
  73. package/dist/adapters/llm/anthropic-llm-client.js +205 -0
  74. package/dist/adapters/llm/fake-llm-client.d.ts +49 -0
  75. package/dist/adapters/llm/fake-llm-client.js +63 -0
  76. package/dist/adapters/llm/index.d.ts +9 -0
  77. package/dist/adapters/llm/index.js +4 -0
  78. package/dist/adapters/llm/openai-llm-client.d.ts +44 -0
  79. package/dist/adapters/llm/openai-llm-client.js +168 -0
  80. package/dist/adapters/llm/pricing.d.ts +12 -0
  81. package/dist/adapters/llm/pricing.js +8 -0
  82. package/dist/adapters/llm/retry.d.ts +56 -0
  83. package/dist/adapters/llm/retry.js +66 -0
  84. package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
  85. package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
  86. package/dist/adapters/task-sources/repo-schemas.d.ts +90 -22
  87. package/dist/adapters/task-sources/repo-schemas.js +19 -2
  88. package/dist/artifact-capture/api-gateway-artifact-writer.js +2 -1
  89. package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +2 -1
  90. package/dist/artifact-capture/gcs-artifact-writer.js +3 -1
  91. package/dist/artifact-capture/local-fs-artifact-writer.js +3 -1
  92. package/dist/commands/calculate-scores.js +1 -1
  93. package/dist/commands/explain-handler.js +1 -1
  94. package/dist/commands/lookup-doc.d.ts +1 -1
  95. package/dist/commands/lookup-doc.js +3 -3
  96. package/dist/commands/pipeline-action.d.ts +6 -0
  97. package/dist/commands/pipeline-action.js +2 -0
  98. package/dist/commands/remote-pipeline.js +1 -0
  99. package/dist/composition-root.d.ts +59 -1
  100. package/dist/composition-root.js +95 -0
  101. package/dist/config/rubrics.ts +38 -2
  102. package/dist/grader/agent-harness.d.ts +14 -0
  103. package/dist/grader/agent-harness.js +17 -0
  104. package/dist/grader/common.d.ts +17 -0
  105. package/dist/grader/common.js +21 -0
  106. package/dist/grader/index.d.ts +38 -0
  107. package/dist/grader/index.js +75 -0
  108. package/dist/grader/knowledge-probe.d.ts +14 -0
  109. package/dist/grader/knowledge-probe.js +18 -0
  110. package/dist/grader/literacy.d.ts +13 -0
  111. package/dist/grader/literacy.js +17 -0
  112. package/dist/grader/mcp.d.ts +14 -0
  113. package/dist/grader/mcp.js +18 -0
  114. package/dist/orchestration/build-app-context.js +1 -0
  115. package/dist/orchestration/build-step-sequence.js +5 -0
  116. package/dist/orchestration/steps/calculate-scores-step.js +23 -1
  117. package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
  118. package/dist/orchestration/steps/compute-attribution-step.js +279 -0
  119. package/dist/orchestration/steps/gap-analysis-step.js +35 -7
  120. package/dist/orchestration/steps/index.d.ts +1 -0
  121. package/dist/orchestration/steps/index.js +1 -0
  122. package/dist/pipeline/attribution.d.ts +15 -0
  123. package/dist/pipeline/attribution.js +18 -9
  124. package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
  125. package/dist/pipeline/borderline-consensus-runner.js +124 -0
  126. package/dist/pipeline/borderline-detector.d.ts +24 -0
  127. package/dist/pipeline/borderline-detector.js +26 -0
  128. package/dist/pipeline/calculate-scores.d.ts +114 -3
  129. package/dist/pipeline/calculate-scores.js +426 -24
  130. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  131. package/dist/pipeline/compiler/literacy-bridge.js +35 -17
  132. package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
  133. package/dist/pipeline/compiler/rubric-resolution.js +9 -1
  134. package/dist/pipeline/compute-attribution.d.ts +80 -0
  135. package/dist/pipeline/compute-attribution.js +196 -0
  136. package/dist/pipeline/failure-modes.d.ts +52 -17
  137. package/dist/pipeline/failure-modes.js +178 -117
  138. package/dist/pipeline/map-request-to-config.js +1 -0
  139. package/package.json +6 -4
@@ -26,6 +26,8 @@ import { LocalFilesystemArtifactWriter } from "./artifact-capture/local-fs-artif
26
26
  import { resolveUploadConcurrency, setDefaultUploadConcurrency, } from "./artifact-capture/parallel-emit.js";
27
27
  import { UploadMetrics } from "./artifact-capture/upload-metrics.js";
28
28
  import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
29
+ import { AnthropicLLMClient, OpenAILLMClient } from "./adapters/llm/index.js";
30
+ import { runBorderlineConsensus, } from "./pipeline/borderline-consensus-runner.js";
29
31
  import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
30
32
  import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
31
33
  import { PromptfooEvalAdapter } from "./adapters/eval-runners/promptfoo-eval-adapter.js";
@@ -91,12 +93,20 @@ export function createAppContext(config) {
91
93
  // from the context (D0032).
92
94
  const runId = generateRunId();
93
95
  logger.debug(`Pipeline runId: ${runId}`);
96
+ // LLM client (D0051) — wired when an API key is present. The grader path
97
+ // does NOT consume this; D0051 defers grader migration as a follow-up.
98
+ // Env mapping happens here so `createLLMClient` stays pure and testable.
99
+ const llmClient = createLLMClient(config, {
100
+ anthropicApiKey: process.env.ANTHROPIC_API_KEY,
101
+ openaiApiKey: process.env.OPENAI_API_KEY,
102
+ }, logger);
94
103
  return {
95
104
  artifactWriter,
96
105
  cache,
97
106
  config,
98
107
  docFetcher,
99
108
  evalRunner,
109
+ ...(llmClient ? { llmClient } : {}),
100
110
  logger,
101
111
  packageSurfaceResolver,
102
112
  progress,
@@ -107,6 +117,44 @@ export function createAppContext(config) {
107
117
  taskSource,
108
118
  };
109
119
  }
120
/**
 * Select the LLMClient adapter based on `config.llmProvider` and the
 * supplied API keys.
 *
 * Selection rules:
 * - An explicit `config.llmProvider` always wins; when its matching key is
 *   missing a warning is logged and nothing is wired (there is no silent
 *   fallback to the other provider).
 * - Without an explicit provider, Anthropic is preferred when both keys are
 *   present (matches the current grader's default model in
 *   `config/models.ts`), then OpenAI.
 * - A provider value other than "anthropic" / "openai" is reported with a
 *   warning instead of being treated as OpenAI.
 *
 * Returns `undefined` whenever no client can be built — `AppContext.llmClient`
 * stays unset and consumers handle that explicitly.
 *
 * Adapters never read `process.env` themselves (per
 * `.claude/rules/typescript.md`); env mapping happens at the call site
 * (typically `createAppContext`).
 *
 * Exported for unit-test access; not part of the public package API.
 *
 * @param {{ llmProvider?: string }} config - only `llmProvider` is read here.
 * @param {{ anthropicApiKey?: string, openaiApiKey?: string }} keys
 * @param {{ debug(msg: string): void, warn(msg: string): void }} logger
 * @returns {AnthropicLLMClient | OpenAILLMClient | undefined}
 */
export function createLLMClient(config, keys, logger) {
    const { anthropicApiKey, openaiApiKey } = keys;
    // Auto-select only applies when no provider was configured explicitly.
    const autoSelected = anthropicApiKey ? "anthropic" : openaiApiKey ? "openai" : undefined;
    const provider = config.llmProvider ?? autoSelected;
    if (!provider) {
        logger.debug("LLM client: not wired — no Anthropic or OpenAI API key supplied");
        return undefined;
    }
    if (provider === "anthropic") {
        if (!anthropicApiKey) {
            logger.warn('llmProvider="anthropic" but no Anthropic API key supplied — LLMClient not wired');
            return undefined;
        }
        logger.debug("LLM client: AnthropicLLMClient");
        return new AnthropicLLMClient({ apiKey: anthropicApiKey, logger });
    }
    if (provider === "openai") {
        if (!openaiApiKey) {
            logger.warn('llmProvider="openai" but no OpenAI API key supplied — LLMClient not wired');
            return undefined;
        }
        logger.debug("LLM client: OpenAILLMClient");
        return new OpenAILLMClient({ apiKey: openaiApiKey, logger });
    }
    // Previously any non-"anthropic" value fell through to the OpenAI branch.
    logger.warn(`Unsupported llmProvider="${provider}" — LLMClient not wired`);
    return undefined;
}
110
158
  // ---------------------------------------------------------------------------
111
159
  // Sub-factories (extracted to keep createAppContext readable)
112
160
  // ---------------------------------------------------------------------------
@@ -446,3 +494,50 @@ function createReportStore(config) {
446
494
  undefined,
447
495
  });
448
496
  }
497
+ // ---------------------------------------------------------------------------
498
+ // Borderline-consensus wiring (Plan 03-04 / GRAD-04)
499
+ // ---------------------------------------------------------------------------
500
/**
 * Severity boundaries from `packages/eval/config/thresholds.ts`
 * (severity.critical/warning/info `composite-below` at L50/54/58 — 30, 50,
 * 60). The borderline detector flags a judgment when its score is within
 * ±5 of any of these. Composition-root reads them ONCE and threads the
 * typed `readonly number[]` into `runBorderlineConsensus` rather than
 * re-deriving them at each call site (Pitfall 5 — single source of truth
 * for the scale).
 *
 * Frozen so the `readonly` contract also holds at runtime: the same array
 * instance is handed to every runner call, so an accidental in-place
 * mutation would otherwise change the scale for all later calls.
 */
export const BORDERLINE_SEVERITY_THRESHOLDS = Object.freeze([30, 50, 60]);
/**
 * Default replications per borderline judgment when the caller's
 * `RepoConfig.execution.borderlineReplications` is unset (locked answer
 * #4 in plan 03-04). Three replications + the original score = four
 * scores per consistency record, which is the minimum that produces a
 * non-degenerate stdDev / median split.
 */
export const DEFAULT_BORDERLINE_REPLICATIONS = 3;
520
/**
 * Factory for the borderline-consensus runner. Returns a function that
 * applies the severity-threshold and replication defaults from
 * composition-root, leaving the live grader entry point (the `regrade`
 * callback) and the candidate `judgments` array as runtime inputs.
 *
 * The pipeline-side caller (currently `pipeline/calculate-scores.ts`'s
 * post-extraction junction) supplies the `regrade` callback that maps a
 * `GraderJudgment` to a fresh score via the response/rubric text from
 * the original Promptfoo result. See the runner's header for the
 * rationale on injecting the regrader rather than calling `gradeOnce`
 * inline (Pitfall 6 — preserve the runner's purity wrt the existing
 * grader-comparison split).
 *
 * @param {{ borderlineReplications?: number }} [opts] - optional overrides;
 *   omitting the argument entirely is equivalent to passing `{}`.
 * @returns {(args: { judgments: unknown[], regrade: Function, logger?: object }) => unknown}
 *   a runner that forwards to `runBorderlineConsensus` and returns its result.
 */
export function createBorderlineConsensusRunner(opts = {}) {
    // `??` (not `||`) so an explicit 0 is passed through rather than replaced.
    const replications = opts.borderlineReplications ?? DEFAULT_BORDERLINE_REPLICATIONS;
    return (args) => runBorderlineConsensus({
        judgments: args.judgments,
        // Only forward `logger` when supplied so the key is absent, not undefined.
        ...(args.logger ? { logger: args.logger } : {}),
        regrade: args.regrade,
        replications,
        thresholds: BORDERLINE_SEVERITY_THRESHOLDS,
    });
}
@@ -11,6 +11,15 @@
11
11
 
12
12
  import { defineRubrics } from "../_vendor/ailf-core/index.js"
13
13
 
14
+ // Plan 03-02 — per-dimension failure-mode taxonomies stamped onto each
15
+ // template entry below. Source of truth lives in packages/eval/src/grader/;
16
+ // the helper picks the right list by dimension family.
17
+ import { failureModesForDimension } from "../grader/index.js"
18
+ // Single source of truth for the wire-format version stamped into the
19
+ // grader-prompt footer (VER-01 D-02). Interpolated below so the
20
+ // announced version cannot drift from the schema's expected value.
21
+ import { graderJudgmentsVersion } from "../adapters/grader-outputs/index.js"
22
+
14
23
  export default defineRubrics({
15
24
  templates: {
16
25
  // ── Core literacy dimensions ────────────────────────────
@@ -25,6 +34,7 @@ export default defineRubrics({
25
34
  "100: Fully functional code — works as expected",
26
35
  ],
27
36
  criteria_label: "Must demonstrate:",
37
+ failureModes: failureModesForDimension("task-completion"),
28
38
  },
29
39
  "code-correctness": {
30
40
  dimension: "code-correctness",
@@ -37,6 +47,7 @@ export default defineRubrics({
37
47
  "100: Follows all best practices, idiomatic implementation",
38
48
  ],
39
49
  criteria_label: "Check for:",
50
+ failureModes: failureModesForDimension("code-correctness"),
40
51
  },
41
52
  "doc-coverage": {
42
53
  dimension: "doc-coverage",
@@ -48,6 +59,7 @@ export default defineRubrics({
48
59
  "80: Minor gaps — almost everything was documented",
49
60
  "100: Complete coverage — all necessary info was in docs",
50
61
  ],
62
+ failureModes: failureModesForDimension("doc-coverage"),
51
63
  },
52
64
 
53
65
  // ── MCP server dimensions ───────────────────────────────
@@ -62,6 +74,7 @@ export default defineRubrics({
62
74
  "100: Perfect tool inputs — all parameters correct and well-formed",
63
75
  ],
64
76
  criteria_label: "Evaluate:",
77
+ failureModes: failureModesForDimension("input-validation"),
65
78
  },
66
79
  "mcp-output-correctness": {
67
80
  dimension: "output-correctness",
@@ -74,6 +87,7 @@ export default defineRubrics({
74
87
  "100: Perfect output handling — correctly interpreted all tool responses",
75
88
  ],
76
89
  criteria_label: "Check for:",
90
+ failureModes: failureModesForDimension("output-correctness"),
77
91
  },
78
92
  "mcp-error-handling": {
79
93
  dimension: "error-handling",
@@ -86,6 +100,7 @@ export default defineRubrics({
86
100
  "100: Excellent — handled all errors appropriately with clear messaging",
87
101
  ],
88
102
  criteria_label: "Evaluate:",
103
+ failureModes: failureModesForDimension("error-handling"),
89
104
  },
90
105
  "mcp-security": {
91
106
  dimension: "security",
@@ -98,6 +113,7 @@ export default defineRubrics({
98
113
  "100: Perfect security — only used authorized tools with safe inputs",
99
114
  ],
100
115
  criteria_label: "Check for:",
116
+ failureModes: failureModesForDimension("security"),
101
117
  },
102
118
 
103
119
  // ── Knowledge probe dimensions ──────────────────────────
@@ -112,6 +128,7 @@ export default defineRubrics({
112
128
  "100: Fully correct — all statements are accurate and verifiable",
113
129
  ],
114
130
  criteria_label: "Verify:",
131
+ failureModes: failureModesForDimension("factual-correctness"),
115
132
  },
116
133
  completeness: {
117
134
  dimension: "completeness",
@@ -124,6 +141,7 @@ export default defineRubrics({
124
141
  "100: Comprehensive — thorough coverage of all important aspects",
125
142
  ],
126
143
  criteria_label: "Check coverage of:",
144
+ failureModes: failureModesForDimension("completeness"),
127
145
  },
128
146
  currency: {
129
147
  dimension: "currency",
@@ -136,6 +154,7 @@ export default defineRubrics({
136
154
  "100: Fully current — references latest APIs, patterns, and best practices",
137
155
  ],
138
156
  criteria_label: "Check for:",
157
+ failureModes: failureModesForDimension("currency"),
139
158
  },
140
159
 
141
160
  // ── Agent harness dimensions ────────────────────────────
@@ -151,6 +170,7 @@ export default defineRubrics({
151
170
  "100: Excellent process — optimal tool usage, clear planning, graceful recovery",
152
171
  ],
153
172
  criteria_label: "Evaluate:",
173
+ failureModes: failureModesForDimension("process-quality"),
154
174
  },
155
175
  "agent-output": {
156
176
  dimension: "agent-output",
@@ -163,6 +183,7 @@ export default defineRubrics({
163
183
  "100: Excellent output — fully correct, clean, and complete",
164
184
  ],
165
185
  criteria_label: "Check for:",
186
+ failureModes: failureModesForDimension("agent-output"),
166
187
  },
167
188
  "agent-tool-usage": {
168
189
  dimension: "tool-usage",
@@ -175,6 +196,7 @@ export default defineRubrics({
175
196
  "100: Excellent — optimal tool selection, correct inputs, minimal redundancy",
176
197
  ],
177
198
  criteria_label: "Evaluate:",
199
+ failureModes: failureModesForDimension("tool-usage"),
178
200
  },
179
201
  },
180
202
 
@@ -220,6 +242,20 @@ export default defineRubrics({
220
242
  "agent-harness": { gold: "agent-harness" },
221
243
  },
222
244
 
223
- footer:
224
- 'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
245
+ // Phase 3 GRAD-05 (Plan 03-01) — structured GraderJudgment JSON sketch.
246
+ // Documents the target wire format the grader emits. The strict schema's
247
+ // GRAD-02 additive fields stay optional in this plan; Plan 03-04 flips
248
+ // them to required and bumps graderJudgmentsVersion to 1.0.0.
249
+ footer: `Return ONLY a JSON object with this exact shape:
250
+ {
251
+ "judgmentId": "<string>",
252
+ "score": <number 0-100>,
253
+ "reason": "<explanation, ≤500 chars>",
254
+ "subJudgments": [{ "criterionId": "<id>", "met": <bool>, "evidence": "<≤280 chars>", "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" } }],
255
+ "docCitations": [{ "documentId": "<id>", "slug": "<optional slug>", "role": "supports|contradicts|missing|irrelevant", "hallucinated": <bool> }],
256
+ "failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
257
+ "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" },
258
+ "hallucinationCheckedAgainst": ["<doc id>"],
259
+ "metadata": { "graderModel": "<string>", "graderJudgmentsVersion": "${graderJudgmentsVersion}" }
260
+ }`,
225
261
  })
@@ -0,0 +1,14 @@
1
+ /**
2
+ * Agent-harness failure modes — valid for the `agent-harness` dimension
3
+ * family (process-quality, agent-output, tool-usage).
4
+ *
5
+ * Phase 3 GRAD-03 (Plan 03-02). Agent-harness failures track how an agent
6
+ * uses tools and handles multi-step processes; the v0 modes are tool-misuse,
7
+ * chaotic-process (no plan), and missing-recovery (doesn't recover from
8
+ * tool errors).
9
+ *
10
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
11
+ * §"Per-dimension failure-mode taxonomies" (lines 239-283).
12
+ */
13
+ export declare const AGENT_FAILURE_MODES: readonly ["tool-misuse", "chaotic-process", "missing-recovery"];
14
+ export type AgentFailureMode = (typeof AGENT_FAILURE_MODES)[number];
@@ -0,0 +1,17 @@
1
/**
 * Agent-harness failure modes — valid for the `agent-harness` dimension
 * family (process-quality, agent-output, tool-usage).
 *
 * Phase 3 GRAD-03 (Plan 03-02). Agent-harness failures track how an agent
 * uses tools and handles multi-step processes; the v0 modes are tool-misuse,
 * chaotic-process (no plan), and missing-recovery (doesn't recover from
 * tool errors).
 *
 * Frozen so the list cannot be mutated in place at runtime, matching the
 * `readonly` tuple type published in the declaration file.
 *
 * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
 *   §"Per-dimension failure-mode taxonomies" (lines 239-283).
 */
export const AGENT_FAILURE_MODES = Object.freeze([
    "tool-misuse", // assistant calls tools incorrectly or with wrong args
    "chaotic-process", // assistant flails — undirected exploration, no plan
    "missing-recovery", // assistant doesn't recover from a tool error
]);
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Cross-cutting failure modes — valid for any dimension family.
3
+ *
4
+ * Phase 3 GRAD-03 (Plan 03-02). The four cross-cutting modes capture failures
5
+ * that aren't tied to a specific dimension family: infrastructure failures,
6
+ * model ceiling effects, false-floor (model already knew the answer; docs
7
+ * added no value), and the low-confidence fallback. The per-dimension
8
+ * taxonomies (literacy, MCP, knowledge-probe, agent-harness) extend this
9
+ * cross-cutting list.
10
+ *
11
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
12
+ * §"Per-dimension failure-mode taxonomies" (lines 239-283 — the v0 lists)
13
+ * @see docs/decisions/D0005-grader-model-separation.md — single grader model;
14
+ * taxonomies travel with the rubric prompt for reproducibility.
15
+ */
16
+ export declare const COMMON_FAILURE_MODES: readonly ["api-error", "model-limitation", "false-floor", "unclassified"];
17
+ export type CommonFailureMode = (typeof COMMON_FAILURE_MODES)[number];
@@ -0,0 +1,21 @@
1
/**
 * Cross-cutting failure modes — valid for any dimension family.
 *
 * Phase 3 GRAD-03 (Plan 03-02). The four cross-cutting modes capture failures
 * that aren't tied to a specific dimension family: infrastructure failures,
 * model ceiling effects, false-floor (model already knew the answer; docs
 * added no value), and the low-confidence fallback. The per-dimension
 * taxonomies (literacy, MCP, knowledge-probe, agent-harness) extend this
 * cross-cutting list.
 *
 * Frozen so the list cannot be mutated in place at runtime, matching the
 * `readonly` tuple type published in the declaration file.
 *
 * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
 *   §"Per-dimension failure-mode taxonomies" (lines 239-283 — the v0 lists)
 * @see docs/decisions/D0005-grader-model-separation.md — single grader model;
 *   taxonomies travel with the rubric prompt for reproducibility.
 */
export const COMMON_FAILURE_MODES = Object.freeze([
    "api-error", // infrastructure failure, not a docs problem
    "model-limitation", // high ceiling, model can't reach it
    "false-floor", // model already knew the answer; docs added no value
    "unclassified", // grader could not pick a mode (low-confidence fallback)
]);
@@ -0,0 +1,38 @@
1
+ /**
2
+ * Per-dimension failure-mode taxonomy barrel.
3
+ *
4
+ * Named re-exports only (W0124 — never `export *`).
5
+ *
6
+ * Consumers:
7
+ * - `packages/eval/config/rubrics.ts` — calls `failureModesForDimension()` to
8
+ * stamp a per-template legal-mode list onto every rubric template entry.
9
+ * - `packages/eval/src/pipeline/compiler/rubric-resolution.ts` — reads
10
+ * `template.failureModes` at prompt-assembly time and announces the legal
11
+ * modes to the grader before the structured-shape footer (Plan 03-01).
12
+ * - `packages/eval/src/grader/__tests__/calibration.test.ts` — fixture-driven
13
+ * ≥90% non-`unclassified` static calibration check (ROADMAP success
14
+ * criterion 1).
15
+ *
16
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
17
+ * §"Per-dimension failure-mode taxonomies" (lines 239-283).
18
+ * @see docs/decisions/D0005-grader-model-separation.md — single grader model;
19
+ * taxonomies travel with the rubric prompt for reproducibility.
20
+ */
21
+ export { COMMON_FAILURE_MODES, type CommonFailureMode } from "./common.js";
22
+ export { LITERACY_FAILURE_MODES, type LiteracyFailureMode } from "./literacy.js";
23
+ export { MCP_FAILURE_MODES, type MCPFailureMode } from "./mcp.js";
24
+ export { KP_FAILURE_MODES, type KPFailureMode } from "./knowledge-probe.js";
25
+ export { AGENT_FAILURE_MODES, type AgentFailureMode } from "./agent-harness.js";
26
+ /**
27
+ * Return the legal failure-mode list for a given rubric dimension.
28
+ *
29
+ * Accepts both family-level keys (`mcp-behavior`, `knowledge-probe`,
30
+ * `agent-harness`) and the per-template `dimension` strings used in
31
+ * `config/rubrics.ts` (`task-completion`, `input-validation`,
32
+ * `factual-correctness`, `process-quality`, …). The cross-cutting
33
+ * `COMMON_FAILURE_MODES` is always included.
34
+ *
35
+ * Unknown dimensions fall through to `COMMON_FAILURE_MODES` only — safe
36
+ * default, the grader can still pick `unclassified`.
37
+ */
38
+ export declare function failureModesForDimension(dimension: string): readonly string[];
@@ -0,0 +1,75 @@
1
+ /**
2
+ * Per-dimension failure-mode taxonomy barrel.
3
+ *
4
+ * Named re-exports only (W0124 — never `export *`).
5
+ *
6
+ * Consumers:
7
+ * - `packages/eval/config/rubrics.ts` — calls `failureModesForDimension()` to
8
+ * stamp a per-template legal-mode list onto every rubric template entry.
9
+ * - `packages/eval/src/pipeline/compiler/rubric-resolution.ts` — reads
10
+ * `template.failureModes` at prompt-assembly time and announces the legal
11
+ * modes to the grader before the structured-shape footer (Plan 03-01).
12
+ * - `packages/eval/src/grader/__tests__/calibration.test.ts` — fixture-driven
13
+ * ≥90% non-`unclassified` static calibration check (ROADMAP success
14
+ * criterion 1).
15
+ *
16
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
17
+ * §"Per-dimension failure-mode taxonomies" (lines 239-283).
18
+ * @see docs/decisions/D0005-grader-model-separation.md — single grader model;
19
+ * taxonomies travel with the rubric prompt for reproducibility.
20
+ */
21
+ export { COMMON_FAILURE_MODES } from "./common.js";
22
+ export { LITERACY_FAILURE_MODES } from "./literacy.js";
23
+ export { MCP_FAILURE_MODES } from "./mcp.js";
24
+ export { KP_FAILURE_MODES } from "./knowledge-probe.js";
25
+ export { AGENT_FAILURE_MODES } from "./agent-harness.js";
26
+ import { COMMON_FAILURE_MODES } from "./common.js";
27
+ import { LITERACY_FAILURE_MODES } from "./literacy.js";
28
+ import { MCP_FAILURE_MODES } from "./mcp.js";
29
+ import { KP_FAILURE_MODES } from "./knowledge-probe.js";
30
+ import { AGENT_FAILURE_MODES } from "./agent-harness.js";
31
/**
 * Return the legal failure-mode list for a given rubric dimension.
 *
 * Accepts both family-level keys (`mcp-behavior`, `knowledge-probe`,
 * `agent-harness`) and the per-template `dimension` strings used in
 * `config/rubrics.ts` (`task-completion`, `input-validation`,
 * `factual-correctness`, `process-quality`, …). The cross-cutting
 * `COMMON_FAILURE_MODES` always come first in the result.
 *
 * Unknown dimensions fall through to the common modes only — safe
 * default, the grader can still pick `unclassified`.
 *
 * Every call returns a fresh array, including the fallback, so a caller can
 * never alias (or mutate) the shared `COMMON_FAILURE_MODES` constant.
 *
 * @param {string} dimension - family key or per-template dimension string.
 * @returns {readonly string[]} common modes followed by the family's modes.
 */
export function failureModesForDimension(dimension) {
    switch (dimension) {
        // ── Literacy family ──────────────────────────────────────
        case "task-completion":
        case "code-correctness":
        case "doc-coverage":
            return [...COMMON_FAILURE_MODES, ...LITERACY_FAILURE_MODES];
        // ── MCP family ───────────────────────────────────────────
        // `mcp-behavior` is the family-level key (profile / depends-on
        // shorthand). The per-template `dimension` strings are the four
        // entries from config/rubrics.ts mcp-* templates.
        case "mcp-behavior":
        case "input-validation":
        case "output-correctness":
        case "error-handling":
        case "security":
            return [...COMMON_FAILURE_MODES, ...MCP_FAILURE_MODES];
        // ── Knowledge-probe family ───────────────────────────────
        case "knowledge-probe":
        case "factual-correctness":
        case "completeness":
        case "currency":
            return [...COMMON_FAILURE_MODES, ...KP_FAILURE_MODES];
        // ── Agent-harness family ─────────────────────────────────
        case "agent-harness":
        case "process-quality":
        case "agent-output":
        case "tool-usage":
            return [...COMMON_FAILURE_MODES, ...AGENT_FAILURE_MODES];
        default:
            // Copy rather than return the shared constant itself, consistent
            // with the fresh arrays returned by every other branch.
            return [...COMMON_FAILURE_MODES];
    }
}
@@ -0,0 +1,14 @@
1
+ /**
2
+ * Knowledge-probe failure modes — valid for the `knowledge-probe` dimension
3
+ * family (factual-correctness, completeness, currency).
4
+ *
5
+ * Phase 3 GRAD-03 (Plan 03-02). Knowledge-probe failures track the model's
6
+ * ability to recall facts about Sanity's surface area; the v0 modes
7
+ * differentiate factual errors from omissions, currency drift, and
8
+ * hallucination.
9
+ *
10
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
11
+ * §"Per-dimension failure-mode taxonomies" (lines 239-283).
12
+ */
13
+ export declare const KP_FAILURE_MODES: readonly ["factual-error", "incompleteness", "currency-violation", "hallucination"];
14
+ export type KPFailureMode = (typeof KP_FAILURE_MODES)[number];
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Knowledge-probe failure modes — valid for the `knowledge-probe` dimension
3
+ * family (factual-correctness, completeness, currency).
4
+ *
5
+ * Phase 3 GRAD-03 (Plan 03-02). Knowledge-probe failures track the model's
6
+ * ability to recall facts about Sanity's surface area; the v0 modes
7
+ * differentiate factual errors from omissions, currency drift, and
8
+ * hallucination.
9
+ *
10
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
11
+ * §"Per-dimension failure-mode taxonomies" (lines 239-283).
12
+ */
13
+ export const KP_FAILURE_MODES = [
14
+ "factual-error", // assistant asserts something demonstrably false
15
+ "incompleteness", // assistant covers part of the answer; misses key piece
16
+ "currency-violation", // assistant cites stale facts beyond doc currency horizon
17
+ "hallucination", // assistant invents details not present in any doc
18
+ ];
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Literacy failure modes — valid for `task-completion`, `code-correctness`,
3
+ * `doc-coverage` (the literacy dimension family).
4
+ *
5
+ * Phase 3 GRAD-03 (Plan 03-02). The v0 list is the four canonical
6
+ * documentation-quality failure modes. Conservative on purpose; calibration
7
+ * (Plan 03-02 Task 3) reveals whether expansion is needed in a follow-on.
8
+ *
9
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
10
+ * §"Per-dimension failure-mode taxonomies" (lines 239-283).
11
+ */
12
+ export declare const LITERACY_FAILURE_MODES: readonly ["missing-docs", "outdated-docs", "incorrect-docs", "poor-structure"];
13
+ export type LiteracyFailureMode = (typeof LITERACY_FAILURE_MODES)[number];
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Literacy failure modes — valid for `task-completion`, `code-correctness`,
3
+ * `doc-coverage` (the literacy dimension family).
4
+ *
5
+ * Phase 3 GRAD-03 (Plan 03-02). The v0 list is the four canonical
6
+ * documentation-quality failure modes. Conservative on purpose; calibration
7
+ * (Plan 03-02 Task 3) reveals whether expansion is needed in a follow-on.
8
+ *
9
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
10
+ * §"Per-dimension failure-mode taxonomies" (lines 239-283).
11
+ */
12
+ export const LITERACY_FAILURE_MODES = [
13
+ "missing-docs", // relevant doc didn't exist
14
+ "outdated-docs", // doc reflects an older API/version
15
+ "incorrect-docs", // doc states something factually wrong
16
+ "poor-structure", // doc exists but is hard to find or follow
17
+ ];
@@ -0,0 +1,14 @@
1
+ /**
2
+ * MCP failure modes — valid for the `mcp-behavior` dimension family
3
+ * (input-validation, output-correctness, error-handling, security).
4
+ *
5
+ * Phase 3 GRAD-03 (Plan 03-02). `missing-docs` deliberately repeats the
6
+ * literacy family's literal (listed here directly, not imported) — MCP
7
+ * server tasks frequently fail because the MCP spec itself is under-documented;
8
+ * that's a literacy failure even when surfaced through MCP grading.
9
+ *
10
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
11
+ * §"Per-dimension failure-mode taxonomies" (lines 239-283).
12
+ */
13
+ export declare const MCP_FAILURE_MODES: readonly ["spec-mismatch", "missing-error-handling", "over-privileged", "missing-docs"];
14
+ export type MCPFailureMode = (typeof MCP_FAILURE_MODES)[number];
@@ -0,0 +1,18 @@
1
+ /**
2
+ * MCP failure modes — valid for the `mcp-behavior` dimension family
3
+ * (input-validation, output-correctness, error-handling, security).
4
+ *
5
+ * Phase 3 GRAD-03 (Plan 03-02). `missing-docs` deliberately repeats the
6
+ * literacy family's literal (listed here directly, not imported) — MCP
7
+ * server tasks frequently fail because the MCP spec itself is under-documented;
8
+ * that's a literacy failure even when surfaced through MCP grading.
9
+ *
10
+ * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
11
+ * §"Per-dimension failure-mode taxonomies" (lines 239-283).
12
+ */
13
+ export const MCP_FAILURE_MODES = [
14
+ "spec-mismatch", // tool/server output doesn't match published MCP spec
15
+ "missing-error-handling", // tool failure path under-documented or absent
16
+ "over-privileged", // tool exposes operations the doc didn't sanction
17
+ "missing-docs", // same literal as the literacy family (cross-cutting)
18
+ ];
@@ -49,6 +49,7 @@ export function mapToResolvedConfig(opts, rootDir) {
49
49
  noCache: opts.noCache,
50
50
  noRemoteCache: opts.noRemoteCache,
51
51
  graderReplications: opts.graderReplications,
52
+ borderlineReplications: opts.borderlineReplications,
52
53
  graderContext: opts.graderContext,
53
54
  outputDir: opts.outputDir,
54
55
  outputPath: opts.outputPath,
@@ -8,6 +8,7 @@
8
8
  import { LiteracyVariant } from "../pipeline/normalize-mode.js";
9
9
  import { CallbackStep } from "./steps/callback-step.js";
10
10
  import { CalculateScoresStep } from "./steps/calculate-scores-step.js";
11
+ import { ComputeAttributionStep } from "./steps/compute-attribution-step.js";
11
12
  import { CompareStep } from "./steps/compare-step.js";
12
13
  import { FetchDocsStep } from "./steps/fetch-docs-step.js";
13
14
  import { FinalizeRunStep } from "./steps/finalize-run-step.js";
@@ -75,6 +76,10 @@ export function buildStepSequence(ctx, pipelineStart = Date.now()) {
75
76
  if (config.gapAnalysisEnabled) {
76
77
  steps.push(new GapAnalysisStep());
77
78
  }
79
+ // Step 4b2: Per-judgment attribution ensemble (default-on).
80
+ // Depends on documentManifest being enriched onto score-summary.json
81
+ // by gap-analysis. Skipped silently when upstream files are missing.
82
+ steps.push(new ComputeAttributionStep());
78
83
  // Step 4c: Finalize the run — write `runs/{runId}/manifest.json` with the
79
84
  // catalog of artifacts produced so far. Skipped silently when no
80
85
  // artifactWriter is wired (D0032).
@@ -14,6 +14,8 @@ import { buildCacheContext } from "../cache-context.js";
14
14
  import { calculateAndWriteScores } from "../../pipeline/calculate-scores.js";
15
15
  import { checkResultsExist, checkScoreSummaryValid, } from "../../pipeline/checks.js";
16
16
  import { resultsFileForMode } from "../../pipeline/eval-constants.js";
17
+ import { gradeOnce, loadGraderModel } from "../../pipeline/grader-api.js";
18
+ import { createBorderlineConsensusRunner } from "../../composition-root.js";
17
19
  import { loadPreflightScoring } from "../../pipeline/preflight/load-preflight-scoring.js";
18
20
  import { loadSource } from "../../sources.js";
19
21
  import { uploadTestOutputs } from "../../pipeline/upload-test-outputs.js";
@@ -85,10 +87,30 @@ export class CalculateScoresStep {
85
87
  ctx.logger.warn(`[warn] W0198 preflight: failed to load preflight-scoring config — ${err instanceof Error ? err.message : String(err)}`);
86
88
  return undefined;
87
89
  });
90
+ // CR-01 — wire the borderline-consensus runner end-to-end. The
91
+ // composition root owns the threshold + replication defaults; the
92
+ // orchestration step supplies the regrade entry point (gradeOnce
93
+ // against the configured grader model). The grader model is loaded
94
+ // eagerly below; when no judgments are extracted (or none land in the
95
+ // ±5 borderline band), the runner short-circuits before any regrade call.
96
+ let borderlineRegradeOnce;
97
+ try {
98
+ const grader = loadGraderModel(ctx.config.rootDir);
99
+ borderlineRegradeOnce = (responseText, rubricText) => gradeOnce(grader.id, responseText, rubricText, ctx.logger);
100
+ }
101
+ catch (err) {
102
+ ctx.logger.warn(`[warn] borderline consensus skipped — grader model not loadable: ${err instanceof Error ? err.message : String(err)}`);
103
+ }
104
+ const borderlineConsensusRunner = createBorderlineConsensusRunner(ctx.config.borderlineReplications !== undefined
105
+ ? { borderlineReplications: ctx.config.borderlineReplications }
106
+ : {});
88
107
  let belowCritical = [];
89
108
  try {
90
- const result = calculateAndWriteScores({
109
+ const result = await calculateAndWriteScores({
91
110
  allowedOrigins: ctx.config.allowedOrigins,
111
+ ...(borderlineRegradeOnce
112
+ ? { borderlineConsensusRunner, borderlineRegradeOnce }
113
+ : {}),
92
114
  logger: ctx.logger,
93
115
  // Pass the variant for literacy (scoring uses it to decide
94
116
  // whether to read agentic results), or mode for other modes