@sanity/ailf 4.6.0 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. package/canonical/grader-references/agent-harness-tools.yaml +42 -0
  2. package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
  3. package/canonical/grader-references/mcp-server-spec.yaml +51 -0
  4. package/canonical/grader-references/portable-text.yaml +48 -0
  5. package/config/diagnosis-cards.ts +318 -0
  6. package/config/models.ts +12 -0
  7. package/config/rubrics.ts +38 -2
  8. package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
  9. package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
  10. package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
  11. package/dist/_vendor/ailf-core/examples/index.js +146 -47
  12. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
  13. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
  14. package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
  15. package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
  16. package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
  17. package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
  18. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
  19. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
  20. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
  21. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
  22. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
  23. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
  24. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  25. package/dist/_vendor/ailf-core/index.js +4 -0
  26. package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
  27. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
  28. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
  29. package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
  30. package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
  31. package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
  32. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  33. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
  34. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  35. package/dist/_vendor/ailf-core/schemas/index.js +9 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
  40. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
  41. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
  42. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
  43. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
  44. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
  45. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
  46. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
  47. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
  48. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
  49. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
  50. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
  51. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
  52. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
  53. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
  54. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
  55. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
  56. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
  57. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
  58. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
  59. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
  60. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
  61. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
  62. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
  63. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
  64. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
  65. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
  66. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
  67. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
  68. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
  69. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
  70. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
  71. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
  72. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
  73. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
  74. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
  75. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
  76. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
  77. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
  78. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
  79. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
  80. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +50 -0
  81. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +35 -0
  82. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +136 -0
  83. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +153 -0
  84. package/dist/_vendor/ailf-core/services/index.d.ts +6 -0
  85. package/dist/_vendor/ailf-core/services/index.js +18 -0
  86. package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
  87. package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
  88. package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
  89. package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
  90. package/dist/_vendor/ailf-core/types/attribution.js +18 -0
  91. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
  92. package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
  93. package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
  94. package/dist/_vendor/ailf-core/types/confidence.js +7 -0
  95. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +271 -0
  96. package/dist/_vendor/ailf-core/types/diagnosis.js +19 -0
  97. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
  98. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
  99. package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
  100. package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
  101. package/dist/_vendor/ailf-core/types/index.js +15 -1
  102. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
  103. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
  104. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  105. package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
  106. package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
  107. package/dist/adapters/api-client/build-request.d.ts +1 -0
  108. package/dist/adapters/api-client/build-request.js +3 -0
  109. package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
  110. package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
  111. package/dist/adapters/attribution/index.d.ts +9 -0
  112. package/dist/adapters/attribution/index.js +8 -0
  113. package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
  114. package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
  115. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  116. package/dist/adapters/grader-outputs/index.d.ts +10 -0
  117. package/dist/adapters/grader-outputs/index.js +8 -0
  118. package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
  119. package/dist/adapters/grader-outputs/legacy/index.js +10 -0
  120. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
  121. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
  122. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
  123. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
  124. package/dist/adapters/index.d.ts +3 -0
  125. package/dist/adapters/index.js +4 -0
  126. package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
  127. package/dist/adapters/llm/fake-llm-client.js +38 -1
  128. package/dist/adapters/llm/openai-llm-client.js +52 -3
  129. package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
  130. package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
  131. package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
  132. package/dist/adapters/task-sources/repo-schemas.js +19 -2
  133. package/dist/cli-program.js +3 -0
  134. package/dist/commands/calculate-scores.js +1 -1
  135. package/dist/commands/explain-handler.js +1 -1
  136. package/dist/commands/interpret.d.ts +50 -0
  137. package/dist/commands/interpret.js +212 -0
  138. package/dist/commands/lookup-doc.d.ts +1 -1
  139. package/dist/commands/lookup-doc.js +3 -3
  140. package/dist/commands/pipeline-action.d.ts +6 -0
  141. package/dist/commands/pipeline-action.js +2 -0
  142. package/dist/commands/remote-pipeline.js +1 -0
  143. package/dist/composition-root.d.ts +57 -23
  144. package/dist/composition-root.js +155 -41
  145. package/dist/config/diagnosis-cards.ts +318 -0
  146. package/dist/config/models.ts +12 -0
  147. package/dist/config/rubrics.ts +38 -2
  148. package/dist/grader/agent-harness.d.ts +9 -0
  149. package/dist/grader/agent-harness.js +9 -0
  150. package/dist/grader/common.d.ts +9 -0
  151. package/dist/grader/common.js +9 -0
  152. package/dist/grader/index.d.ts +24 -0
  153. package/dist/grader/index.js +24 -0
  154. package/dist/grader/knowledge-probe.d.ts +9 -0
  155. package/dist/grader/knowledge-probe.js +9 -0
  156. package/dist/grader/literacy.d.ts +9 -0
  157. package/dist/grader/literacy.js +9 -0
  158. package/dist/grader/mcp.d.ts +9 -0
  159. package/dist/grader/mcp.js +9 -0
  160. package/dist/orchestration/build-app-context.js +1 -0
  161. package/dist/orchestration/build-step-sequence.js +5 -0
  162. package/dist/orchestration/steps/calculate-scores-step.js +23 -1
  163. package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
  164. package/dist/orchestration/steps/compute-attribution-step.js +279 -0
  165. package/dist/orchestration/steps/gap-analysis-step.js +35 -7
  166. package/dist/orchestration/steps/index.d.ts +1 -0
  167. package/dist/orchestration/steps/index.js +1 -0
  168. package/dist/pipeline/attribution.d.ts +15 -0
  169. package/dist/pipeline/attribution.js +18 -9
  170. package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
  171. package/dist/pipeline/borderline-consensus-runner.js +124 -0
  172. package/dist/pipeline/borderline-detector.d.ts +24 -0
  173. package/dist/pipeline/borderline-detector.js +26 -0
  174. package/dist/pipeline/calculate-scores.d.ts +114 -3
  175. package/dist/pipeline/calculate-scores.js +426 -24
  176. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  177. package/dist/pipeline/compiler/literacy-bridge.js +35 -17
  178. package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
  179. package/dist/pipeline/compiler/rubric-resolution.js +9 -1
  180. package/dist/pipeline/compute-attribution.d.ts +80 -0
  181. package/dist/pipeline/compute-attribution.js +196 -0
  182. package/dist/pipeline/failure-modes.d.ts +52 -17
  183. package/dist/pipeline/failure-modes.js +178 -117
  184. package/dist/pipeline/map-request-to-config.js +1 -0
  185. package/package.json +7 -5
@@ -0,0 +1,42 @@
1
+ # agent-harness-tools.yaml — Human-graded reference sample for agent-harness tasks
2
+ #
3
+ # Phase 3 GRAD-03 calibration fixture (Plan 03-02). Top-level
4
+ # `expectedFailureMode` + `dimension` are read by the calibration test in
5
+ # packages/eval/src/grader/__tests__/calibration.test.ts.
6
+
7
+ - taskId: agent-multi-step-schema-edit
8
+ area: agent-harness
9
+ dimension: agent-harness
10
+ expectedFailureMode: tool-misuse
11
+ response: |
12
+ To add a `coverImage` field to the `post` schema and migrate existing
13
+ documents, the agent ran the following tool calls:
14
+
15
+ 1. `read_file({ path: "schemas/post.ts" })` — read the schema.
16
+ 2. `write_file({ path: "schemas/post.ts", content: "<new schema>" })` — wrote the updated schema.
17
+ 3. `run_command({ command: "sanity migrate add-cover-image" })` — attempted to run a migration.
18
+
19
+ The `sanity migrate` command failed because no migration script exists at
20
+ that name. The agent then stopped without creating the migration file.
21
+ rubrics:
22
+ - dimension: tool-usage
23
+ rubricText: |
24
+ Score the agent's tool usage from 0 to 100:
25
+ - 0: Completely wrong tool usage — called wrong tools or with invalid inputs
26
+ - 25: Poor tool usage — correct tools but wrong parameters or sequencing
27
+ - 50: Adequate — correct tools and basic parameters, some inefficiency
28
+ - 75: Good — efficient tool usage with proper error handling
29
+ - 100: Excellent — optimal tool selection, correct inputs, minimal redundancy
30
+
31
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
32
+ humanScore: 25
33
+ notes: |
34
+ The agent invoked `run_command` with a migration name that did not
35
+ exist in the project, instead of first listing available migrations
36
+ with `list_migrations` (or equivalent) or scaffolding a new migration
37
+ file with the correct tool. The migration step was a tool-misuse:
38
+ the right tool exists, but the agent supplied invalid arguments
39
+ (a non-existent migration name) and then failed to recover by
40
+ scaffolding a new migration. Failure mode: tool-misuse.
41
+ gradedBy: "senior-engineer"
42
+ gradedAt: "2026-05-10"
@@ -0,0 +1,36 @@
1
+ # knowledge-probe-recall.yaml — Human-graded reference sample for KP tasks
2
+ #
3
+ # Phase 3 GRAD-03 calibration fixture (Plan 03-02). Top-level
4
+ # `expectedFailureMode` + `dimension` are read by the calibration test in
5
+ # packages/eval/src/grader/__tests__/calibration.test.ts.
6
+
7
+ - taskId: kp-default-dataset
8
+ area: knowledge-probe
9
+ dimension: knowledge-probe
10
+ expectedFailureMode: factual-error
11
+ response: |
12
+ The default dataset for new Sanity Studio projects is `production`. When
13
+ you run `npm create sanity@latest` and accept the defaults, the CLI
14
+ creates a dataset named `production`.
15
+ rubrics:
16
+ - dimension: factual-correctness
17
+ rubricText: |
18
+ Score factual correctness from 0 to 100:
19
+ - 0: Completely wrong — fabricated facts, hallucinated APIs, incorrect syntax
20
+ - 25: Mostly wrong — some real concepts but significant factual errors
21
+ - 50: Mixed — some correct facts alongside notable inaccuracies
22
+ - 75: Mostly correct — minor inaccuracies or imprecisions
23
+ - 100: Fully correct — all statements are accurate and verifiable
24
+
25
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
26
+ humanScore: 30
27
+ notes: |
28
+ Per current Sanity CLI docs the default dataset chosen by
29
+ `npm create sanity@latest` is selectable, with `production` being
30
+ the conventional default name surfaced in the prompt — but the
31
+ candidate phrases this as a deterministic default rather than a
32
+ prompt-driven choice. The factual claim "the default is production"
33
+ is wrong as stated; the CLI prompts for a dataset name and offers
34
+ `production` as a suggested value. Failure mode: factual-error.
35
+ gradedBy: "senior-engineer"
36
+ gradedAt: "2026-05-10"
@@ -0,0 +1,51 @@
1
+ # mcp-server-spec.yaml — Human-graded reference sample for MCP server tasks
2
+ #
3
+ # Phase 3 GRAD-03 calibration fixture (Plan 03-02). Top-level
4
+ # `expectedFailureMode` + `dimension` are read by the calibration test in
5
+ # packages/eval/src/grader/__tests__/calibration.test.ts.
6
+
7
+ - taskId: mcp-search-tool-pagination
8
+ area: mcp-server
9
+ dimension: mcp-behavior
10
+ expectedFailureMode: spec-mismatch
11
+ response: |
12
+ Here's an MCP `search` tool that returns paginated results:
13
+
14
+ ```typescript
15
+ import { Server } from "@modelcontextprotocol/sdk/server/index.js"
16
+
17
+ server.setRequestHandler({ method: "tools/call" }, async (request) => {
18
+ if (request.params.name === "search") {
19
+ const { query, page = 1, pageSize = 10 } = request.params.arguments
20
+ const results = await searchIndex(query, page, pageSize)
21
+ return {
22
+ content: [
23
+ { type: "text", text: JSON.stringify(results) },
24
+ ],
25
+ nextPageToken: page + 1,
26
+ }
27
+ }
28
+ })
29
+ ```
30
+ rubrics:
31
+ - dimension: output-correctness
32
+ rubricText: |
33
+ Score MCP tool output handling from 0 to 100:
34
+ - 0: Completely failed to use tool outputs
35
+ - 25: Used outputs but misinterpreted the data
36
+ - 50: Partially correct interpretation of tool outputs
37
+ - 75: Mostly correct — minor misuse of returned data
38
+ - 100: Perfect output handling — correctly interpreted all tool responses
39
+
40
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
41
+ humanScore: 30
42
+ notes: |
43
+ The candidate adds a `nextPageToken` field at the top level of the
44
+ tool-call response. The published MCP `tools/call` response spec only
45
+ defines `content`, `isError`, and `_meta` at the top level; arbitrary
46
+ top-level fields like `nextPageToken` are not part of the spec and
47
+ are ignored by spec-compliant clients. Pagination metadata belongs
48
+ inside the `content` block (e.g., as a structured JSON payload) or
49
+ in `_meta`. Failure mode: spec-mismatch.
50
+ gradedBy: "senior-engineer"
51
+ gradedAt: "2026-05-10"
@@ -0,0 +1,48 @@
1
+ # portable-text.yaml — Human-graded reference sample for Portable Text tasks
2
+ #
3
+ # Phase 3 GRAD-03 calibration fixture (Plan 03-02). Top-level
4
+ # `expectedFailureMode` + `dimension` are read by the calibration test in
5
+ # packages/eval/src/grader/__tests__/calibration.test.ts.
6
+
7
+ - taskId: portable-text-react-marks
8
+ area: portable-text
9
+ dimension: doc-coverage
10
+ expectedFailureMode: outdated-docs
11
+ response: |
12
+ To customize the marks renderer in @portabletext/react, you can pass a `serializers` prop:
13
+
14
+ ```tsx
15
+ import { PortableText } from "@portabletext/react"
16
+
17
+ const serializers = {
18
+ marks: {
19
+ link: ({ children, mark }) => (
20
+ <a href={mark.href} target="_blank" rel="noopener">{children}</a>
21
+ ),
22
+ },
23
+ }
24
+
25
+ export function Body({ value }: { value: any }) {
26
+ return <PortableText blocks={value} serializers={serializers} />
27
+ }
28
+ ```
29
+ rubrics:
30
+ - dimension: doc-coverage
31
+ rubricText: |
32
+ Score documentation coverage from 0 to 100:
33
+ - 0: Had to hallucinate/guess most implementation details
34
+ - 30: Significant gaps — filled with assumptions
35
+ - 50: Some gaps — inferred from partial information
36
+ - 80: Minor gaps — almost everything was documented
37
+ - 100: Complete coverage — all necessary info was in docs
38
+
39
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
40
+ humanScore: 25
41
+ notes: |
42
+ The candidate uses the v2 `serializers` prop and `blocks` prop, both of
43
+ which were renamed in @portabletext/react v3+ to `components` and
44
+ `value`. The doc the candidate is following is outdated — current
45
+ docs document the v3 component-based API. Failure mode:
46
+ outdated-docs (the doc reflects an older API).
47
+ gradedBy: "senior-engineer"
48
+ gradedAt: "2026-05-10"
@@ -0,0 +1,318 @@
1
+ /**
2
+ * diagnosis-cards.ts — Diagnosis eval matrix config.
3
+ *
4
+ * TS-first config (per .claude/rules/config.md) defining the 5 LLM card types
5
+ * × 3 first-class models eval matrix. Consumed by
6
+ * `scripts/generate-diagnosis-config.ts` to emit
7
+ * `promptfooconfig-diagnosis.yaml`. Never hand-edit the YAML — run
8
+ * `pnpm generate-configs` instead.
9
+ *
10
+ * Per AI-SPEC §5 and CONTEXT D-04 (path b: standalone generator entry point
11
+ * for the diagnosis config, additive — does not modify the existing literacy
12
+ * generate-configs pipeline).
13
+ *
14
+ * @see packages/eval/scripts/generate-diagnosis-config.ts — generator
15
+ * @see packages/eval/promptfooconfig-diagnosis.yaml — generated output
16
+ */
17
+
18
+ // ---------------------------------------------------------------------------
19
+ // Types
20
+ // ---------------------------------------------------------------------------
21
+
22
+ /**
23
+ * A first-class model entry in the diagnosis eval matrix.
24
+ * Mirrors the shape of model entries in `config/models.ts`.
25
+ */
26
+ export interface DiagnosisModelEntry {
27
+ /** Promptfoo provider string (e.g. "anthropic:messages:claude-opus-4-6") */
28
+ id: string
29
+ /** Human-readable label for reports */
30
+ label: string
31
+ /** Per-model config overrides (temperature, max_tokens, etc.) */
32
+ config?: Record<string, unknown>
33
+ }
34
+
35
+ /**
36
+ * The 5 LLM-driven card types under evaluation.
37
+ * Deterministic cards (area-summary, failure-mode-summary, no-issues) are
38
+ * tested via `fixture-matrix.test.ts` (vitest), not via the promptfoo matrix.
39
+ */
40
+ export type LLMCardType =
41
+ | "top-recommendations"
42
+ | "weakest-area"
43
+ | "low-confidence-attribution"
44
+ | "doc-attribution-spotlight"
45
+ | "regression-vs-baseline"
46
+
47
+ /**
48
+ * A single evaluation scenario: one fixture path × one expected outcome.
49
+ *
50
+ * The `fixturePath` is relative to `packages/eval/` so the promptfoo config
51
+ * can resolve it from any working directory. `expectedStatus` drives the
52
+ * pass/fail assertion in the generated YAML.
53
+ */
54
+ export interface DiagnosisScenario {
55
+ /** Short slug used in promptfoo `description` fields */
56
+ name: string
57
+ /** Path to the Report JSON fixture, relative to `packages/eval/` */
58
+ fixturePath: string
59
+ /**
60
+ * Card type this scenario exercises. The eval matrix runs all LLM cards
61
+ * per scenario; this field annotates which card type is the primary focus
62
+ * for the rubric.
63
+ */
64
+ primaryCard: LLMCardType
65
+ /** Expected card status when all LLM calls succeed */
66
+ expectedStatus: "ready" | "degraded" | "missing"
67
+ /** Optional: path to canned LLM response for adversarial scenarios */
68
+ cannedResponsePath?: string
69
+ /**
70
+ * Optional: cardId to key the canned response against (for FakeLLMClient
71
+ * keyedResponses in vitest; mirrored in the promptfoo scenario description
72
+ * for documentation).
73
+ */
74
+ cannedCardId?: LLMCardType
75
+ /** Free-text note about what this scenario tests */
76
+ note?: string
77
+ }
78
+
79
+ /**
80
+ * Top-level diagnosis eval matrix config.
81
+ * The config value of this type (`diagnosisCardsConfig`) is the default export of this file (mirrors models.ts convention).
82
+ */
83
+ export interface DiagnosisCardsConfig {
84
+ /** All LLM card evaluation scenarios */
85
+ scenarios: DiagnosisScenario[]
86
+ /** Models to run each scenario against */
87
+ models: DiagnosisModelEntry[]
88
+ /** Grader model for LLM-judge assertions */
89
+ grader: DiagnosisModelEntry
90
+ /** Eval budget in milliseconds (kill switch) */
91
+ evalBudgetMs: number
92
+ /** Max parallel API calls */
93
+ maxConcurrency: number
94
+ /** Default per-model config */
95
+ defaults: {
96
+ temperature: number
97
+ max_tokens: number
98
+ }
99
+ }
100
+
101
+ // ---------------------------------------------------------------------------
102
+ // Helper
103
+ // ---------------------------------------------------------------------------
104
+
105
+ export function defineDiagnosisCards(
106
+ config: DiagnosisCardsConfig
107
+ ): DiagnosisCardsConfig {
108
+ return config
109
+ }
110
+
111
+ // ---------------------------------------------------------------------------
112
+ // Config definition
113
+ // ---------------------------------------------------------------------------
114
+
115
+ const diagnosisCardsConfig: DiagnosisCardsConfig = defineDiagnosisCards({
116
+ // ── Models under evaluation ────────────────────────────────────────────────
117
+ models: [
118
+ {
119
+ id: "anthropic:messages:claude-opus-4-6",
120
+ label: "Claude Opus 4.6",
121
+ config: { temperature: 0.2, max_tokens: 4096 },
122
+ },
123
+ {
124
+ id: "anthropic:messages:claude-sonnet-4-6",
125
+ label: "Claude Sonnet 4.6",
126
+ config: { temperature: 0.2, max_tokens: 4096 },
127
+ },
128
+ {
129
+ id: "openai:chat:gpt-5.2",
130
+ label: "GPT 5.2",
131
+ config: { max_completion_tokens: 4096 },
132
+ },
133
+ ],
134
+
135
+ // ── Grader model ────────────────────────────────────────────────────────────
136
+ grader: {
137
+ id: "anthropic:messages:claude-opus-4-5-20251101",
138
+ label: "Claude Opus 4.5 (grader)",
139
+ },
140
+
141
+ // ── Eval budget ─────────────────────────────────────────────────────────────
142
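+ evalBudgetMs: 3_600_000, // 60 min — full matrix across 3 models × 17 logical scenarios (18 entries below; the grader-major-mismatch baseline/current pair is presumably counted as one)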
143
+ maxConcurrency: 8, // conservative for diagnosis (longer prompts than literacy)
144
+
145
+ // ── Default config ──────────────────────────────────────────────────────────
146
+ defaults: {
147
+ temperature: 0.2,
148
+ max_tokens: 4096,
149
+ },
150
+
151
+ // ── Scenarios (17 logical fixtures) ─────────────────────────────────────────
152
+ scenarios: [
153
+ // ── Critical-path: top-recommendations ──────────────────────────────────
154
+ {
155
+ name: "healthy-top-recommendations",
156
+ fixturePath:
157
+ "test-fixtures/diagnosis/reports/healthy-top-recommendations.json",
158
+ primaryCard: "top-recommendations",
159
+ expectedStatus: "ready",
160
+ note: "Healthy report (mean 91) — top-recommendations card should produce 2+ actionable suggestions with docSlug references from the manifest.",
161
+ },
162
+ {
163
+ name: "low-top-recommendations",
164
+ fixturePath:
165
+ "test-fixtures/diagnosis/reports/low-top-recommendations.json",
166
+ primaryCard: "top-recommendations",
167
+ expectedStatus: "ready",
168
+ note: "Low-scoring report (mean 42) — top-recommendations card should produce high-priority suggestions addressing the dominant failure modes (outdated-docs, missing-docs).",
169
+ },
170
+
171
+ // ── Critical-path: weakest-area ──────────────────────────────────────────
172
+ {
173
+ name: "healthy-weakest-area",
174
+ fixturePath: "test-fixtures/diagnosis/reports/healthy-weakest-area.json",
175
+ primaryCard: "weakest-area",
176
+ expectedStatus: "ready",
177
+ note: "Healthy report with clear weakest area (content-modeling at 82) — weakest-area card should identify the area and provide high-confidence analysis.",
178
+ },
179
+ {
180
+ name: "low-weakest-area",
181
+ fixturePath: "test-fixtures/diagnosis/reports/low-weakest-area.json",
182
+ primaryCard: "weakest-area",
183
+ expectedStatus: "ready",
184
+ note: "Low-scoring report with clear weakest area (content-modeling at 28) — weakest-area card should identify the most critical area with multiple failure modes.",
185
+ },
186
+
187
+ // ── Critical-path: low-confidence-attribution ────────────────────────────
188
+ {
189
+ name: "healthy-low-confidence-attribution",
190
+ fixturePath:
191
+ "test-fixtures/diagnosis/reports/healthy-low-confidence-attribution.json",
192
+ primaryCard: "low-confidence-attribution",
193
+ expectedStatus: "ready",
194
+ note: "Healthy report with small sample sizes (2-3 judgments per area) — low-confidence-attribution card should identify attribution uncertainty despite positive scores.",
195
+ },
196
+ {
197
+ name: "low-low-confidence-attribution",
198
+ fixturePath:
199
+ "test-fixtures/diagnosis/reports/low-low-confidence-attribution.json",
200
+ primaryCard: "low-confidence-attribution",
201
+ expectedStatus: "ready",
202
+ note: "Low-scoring report with small sample sizes (2 judgments per area) — low-confidence-attribution card should flag both score quality and attribution uncertainty.",
203
+ },
204
+
205
+ // ── Critical-path: doc-attribution-spotlight ─────────────────────────────
206
+ {
207
+ name: "healthy-doc-attribution-spotlight",
208
+ fixturePath:
209
+ "test-fixtures/diagnosis/reports/healthy-doc-attribution-spotlight.json",
210
+ primaryCard: "doc-attribution-spotlight",
211
+ expectedStatus: "ready",
212
+ note: "Healthy 5-area report — doc-attribution-spotlight card should identify the highest-impact document in the manifest.",
213
+ },
214
+ {
215
+ name: "low-doc-attribution-spotlight",
216
+ fixturePath:
217
+ "test-fixtures/diagnosis/reports/low-doc-attribution-spotlight.json",
218
+ primaryCard: "doc-attribution-spotlight",
219
+ expectedStatus: "ready",
220
+ note: "Low-scoring 5-area report with multiple failure modes — doc-attribution-spotlight card should identify the most critical document.",
221
+ },
222
+
223
+ // ── Edge cases ───────────────────────────────────────────────────────────
224
+ {
225
+ name: "empty-report",
226
+ fixturePath: "test-fixtures/diagnosis/reports/empty.json",
227
+ primaryCard: "top-recommendations",
228
+ expectedStatus: "missing",
229
+ note: "Edge case (a): zero-area report — all LLM cards should emit status: missing (no data to reason about).",
230
+ },
231
+ {
232
+ name: "single-judgment-per-area",
233
+ fixturePath:
234
+ "test-fixtures/diagnosis/reports/single-judgment-per-area.json",
235
+ primaryCard: "weakest-area",
236
+ expectedStatus: "ready",
237
+ note: "Edge case (b): single-judgment sample size — weakest-area card should reflect low-confidence calibration (sampleSize: 1).",
238
+ },
239
+ {
240
+ name: "all-areas-tied",
241
+ fixturePath: "test-fixtures/diagnosis/reports/all-areas-tied.json",
242
+ primaryCard: "weakest-area",
243
+ expectedStatus: "missing",
244
+ note: "Edge case (c): all areas scored identically (70) — weakest-area card should emit status: missing with reason: no-clear-weakest.",
245
+ },
246
+ {
247
+ name: "grader-major-mismatch-baseline",
248
+ fixturePath:
249
+ "test-fixtures/diagnosis/reports/grader-major-mismatch-baseline.json",
250
+ primaryCard: "regression-vs-baseline",
251
+ expectedStatus: "missing",
252
+ note: "Edge case (d): grader-major-version mismatch — regression-vs-baseline should emit missing with reason: grader-major-version-mismatch. Run as pair with grader-major-mismatch-current.",
253
+ },
254
+ {
255
+ name: "grader-major-mismatch-current",
256
+ fixturePath:
257
+ "test-fixtures/diagnosis/reports/grader-major-mismatch-current.json",
258
+ primaryCard: "regression-vs-baseline",
259
+ expectedStatus: "missing",
260
+ note: "Edge case (d) pair: current report with different graderModel — regression-vs-baseline mismatch guard triggers when paired with grader-major-mismatch-baseline.",
261
+ },
262
+ {
263
+ name: "near-deprecated-taxonomy",
264
+ fixturePath:
265
+ "test-fixtures/diagnosis/reports/near-deprecated-taxonomy.json",
266
+ primaryCard: "weakest-area",
267
+ expectedStatus: "ready",
268
+ note: "Edge case (e): report using unclassified failure mode (currently canonical but watch for taxonomy retirement). Zod refine() must accept canonical modes.",
269
+ },
270
+
271
+ // ── Adversarial canned responses ─────────────────────────────────────────
272
+ {
273
+ name: "adversarial-fabricated-delta",
274
+ fixturePath:
275
+ "test-fixtures/diagnosis/reports/grader-major-mismatch-current.json",
276
+ primaryCard: "regression-vs-baseline",
277
+ expectedStatus: "degraded",
278
+ cannedResponsePath:
279
+ "test-fixtures/diagnosis/canned-responses/fabricated-delta-regression.json",
280
+ cannedCardId: "regression-vs-baseline",
281
+ note: "Adversarial: fabricated delta (AI-SPEC §1b failure-mode #1). LLM claims -7.3 delta; direction-sign refine triggers degraded card.",
282
+ },
283
+ {
284
+ name: "adversarial-improve-introduction",
285
+ fixturePath:
286
+ "test-fixtures/diagnosis/reports/low-top-recommendations.json",
287
+ primaryCard: "top-recommendations",
288
+ expectedStatus: "degraded",
289
+ cannedResponsePath:
290
+ "test-fixtures/diagnosis/canned-responses/improve-introduction.json",
291
+ cannedCardId: "top-recommendations",
292
+ note: "Adversarial: generic anti-pattern recommendation (AI-SPEC §1b failure-mode #2). Actionability refine triggers degraded card.",
293
+ },
294
+ {
295
+ name: "adversarial-hallucinated-docslug",
296
+ fixturePath:
297
+ "test-fixtures/diagnosis/reports/low-top-recommendations.json",
298
+ primaryCard: "top-recommendations",
299
+ expectedStatus: "degraded",
300
+ cannedResponsePath:
301
+ "test-fixtures/diagnosis/canned-responses/hallucinated-docslug.json",
302
+ cannedCardId: "top-recommendations",
303
+ note: "Adversarial: hallucinated docSlug (AI-SPEC §1b failure-mode #3). Allow-list refine triggers degraded card.",
304
+ },
305
+ {
306
+ name: "adversarial-taxonomy-drift",
307
+ fixturePath: "test-fixtures/diagnosis/reports/low-weakest-area.json",
308
+ primaryCard: "weakest-area",
309
+ expectedStatus: "degraded",
310
+ cannedResponsePath:
311
+ "test-fixtures/diagnosis/canned-responses/taxonomy-drift.json",
312
+ cannedCardId: "weakest-area",
313
+ note: "Adversarial: taxonomy drift (AI-SPEC §1b failure-mode #4). Per-dimension failureMode refine triggers degraded card.",
314
+ },
315
+ ],
316
+ })
317
+
318
+ export default diagnosisCardsConfig
package/config/models.ts CHANGED
@@ -24,6 +24,18 @@ export default defineModels({
24
24
  // All literacy variants included by default (baseline, observed,
25
25
  // agentic-naive, agentic-optimized)
26
26
  },
27
+ {
28
+ // Phase 5 LLM card routing (D-07). AI-SPEC §4 routes 3 routine cards
29
+ // (top-recommendations, weakest-area, regression-vs-baseline) here.
30
+ // Pricing is already defined in AnthropicLLMClient; only the baseline literacy variant is enabled.
31
+ id: "anthropic:messages:claude-sonnet-4-6",
32
+ label: "Claude Sonnet 4.6",
33
+ config: { temperature: 0.2, max_tokens: 4096 },
34
+ modes: ["literacy"],
35
+ variants: {
36
+ literacy: ["baseline"],
37
+ },
38
+ },
27
39
 
28
40
  // ── Google ─────────────────────────────────────────────────
29
41
  // {
package/config/rubrics.ts CHANGED
@@ -11,6 +11,15 @@
11
11
 
12
12
  import { defineRubrics } from "@sanity/ailf-core"
13
13
 
14
+ // Plan 03-02 — per-dimension failure-mode taxonomies stamped onto each
15
+ // template entry below. Source of truth lives in packages/eval/src/grader/;
16
+ // the helper picks the right list by dimension family.
17
+ import { failureModesForDimension } from "../src/grader/index.js"
18
+ // Single source of truth for the wire-format version stamped into the
19
+ // grader-prompt footer (VER-01 D-02). Interpolated below so the
20
+ // announced version cannot drift from the schema's expected value.
21
+ import { graderJudgmentsVersion } from "../src/adapters/grader-outputs/index.js"
22
+
14
23
  export default defineRubrics({
15
24
  templates: {
16
25
  // ── Core literacy dimensions ────────────────────────────
@@ -25,6 +34,7 @@ export default defineRubrics({
25
34
  "100: Fully functional code — works as expected",
26
35
  ],
27
36
  criteria_label: "Must demonstrate:",
37
+ failureModes: failureModesForDimension("task-completion"),
28
38
  },
29
39
  "code-correctness": {
30
40
  dimension: "code-correctness",
@@ -37,6 +47,7 @@ export default defineRubrics({
37
47
  "100: Follows all best practices, idiomatic implementation",
38
48
  ],
39
49
  criteria_label: "Check for:",
50
+ failureModes: failureModesForDimension("code-correctness"),
40
51
  },
41
52
  "doc-coverage": {
42
53
  dimension: "doc-coverage",
@@ -48,6 +59,7 @@ export default defineRubrics({
48
59
  "80: Minor gaps — almost everything was documented",
49
60
  "100: Complete coverage — all necessary info was in docs",
50
61
  ],
62
+ failureModes: failureModesForDimension("doc-coverage"),
51
63
  },
52
64
 
53
65
  // ── MCP server dimensions ───────────────────────────────
@@ -62,6 +74,7 @@ export default defineRubrics({
62
74
  "100: Perfect tool inputs — all parameters correct and well-formed",
63
75
  ],
64
76
  criteria_label: "Evaluate:",
77
+ failureModes: failureModesForDimension("input-validation"),
65
78
  },
66
79
  "mcp-output-correctness": {
67
80
  dimension: "output-correctness",
@@ -74,6 +87,7 @@ export default defineRubrics({
74
87
  "100: Perfect output handling — correctly interpreted all tool responses",
75
88
  ],
76
89
  criteria_label: "Check for:",
90
+ failureModes: failureModesForDimension("output-correctness"),
77
91
  },
78
92
  "mcp-error-handling": {
79
93
  dimension: "error-handling",
@@ -86,6 +100,7 @@ export default defineRubrics({
86
100
  "100: Excellent — handled all errors appropriately with clear messaging",
87
101
  ],
88
102
  criteria_label: "Evaluate:",
103
+ failureModes: failureModesForDimension("error-handling"),
89
104
  },
90
105
  "mcp-security": {
91
106
  dimension: "security",
@@ -98,6 +113,7 @@ export default defineRubrics({
98
113
  "100: Perfect security — only used authorized tools with safe inputs",
99
114
  ],
100
115
  criteria_label: "Check for:",
116
+ failureModes: failureModesForDimension("security"),
101
117
  },
102
118
 
103
119
  // ── Knowledge probe dimensions ──────────────────────────
@@ -112,6 +128,7 @@ export default defineRubrics({
112
128
  "100: Fully correct — all statements are accurate and verifiable",
113
129
  ],
114
130
  criteria_label: "Verify:",
131
+ failureModes: failureModesForDimension("factual-correctness"),
115
132
  },
116
133
  completeness: {
117
134
  dimension: "completeness",
@@ -124,6 +141,7 @@ export default defineRubrics({
124
141
  "100: Comprehensive — thorough coverage of all important aspects",
125
142
  ],
126
143
  criteria_label: "Check coverage of:",
144
+ failureModes: failureModesForDimension("completeness"),
127
145
  },
128
146
  currency: {
129
147
  dimension: "currency",
@@ -136,6 +154,7 @@ export default defineRubrics({
136
154
  "100: Fully current — references latest APIs, patterns, and best practices",
137
155
  ],
138
156
  criteria_label: "Check for:",
157
+ failureModes: failureModesForDimension("currency"),
139
158
  },
140
159
 
141
160
  // ── Agent harness dimensions ────────────────────────────
@@ -151,6 +170,7 @@ export default defineRubrics({
151
170
  "100: Excellent process — optimal tool usage, clear planning, graceful recovery",
152
171
  ],
153
172
  criteria_label: "Evaluate:",
173
+ failureModes: failureModesForDimension("process-quality"),
154
174
  },
155
175
  "agent-output": {
156
176
  dimension: "agent-output",
@@ -163,6 +183,7 @@ export default defineRubrics({
163
183
  "100: Excellent output — fully correct, clean, and complete",
164
184
  ],
165
185
  criteria_label: "Check for:",
186
+ failureModes: failureModesForDimension("agent-output"),
166
187
  },
167
188
  "agent-tool-usage": {
168
189
  dimension: "tool-usage",
@@ -175,6 +196,7 @@ export default defineRubrics({
175
196
  "100: Excellent — optimal tool selection, correct inputs, minimal redundancy",
176
197
  ],
177
198
  criteria_label: "Evaluate:",
199
+ failureModes: failureModesForDimension("tool-usage"),
178
200
  },
179
201
  },
180
202
 
@@ -220,6 +242,20 @@ export default defineRubrics({
220
242
  "agent-harness": { gold: "agent-harness" },
221
243
  },
222
244
 
223
- footer:
224
- 'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
245
+ // Phase 3 GRAD-05 (Plan 03-01) — structured GraderJudgment JSON sketch.
246
+ // Documents the target wire format the grader emits. The strict schema's
247
+ // GRAD-02 additive fields stay optional in this plan; Plan 03-04 flips
248
+ // them to required and bumps graderJudgmentsVersion to 1.0.0.
249
+ footer: `Return ONLY a JSON object with this exact shape:
250
+ {
251
+ "judgmentId": "<string>",
252
+ "score": <number 0-100>,
253
+ "reason": "<explanation, ≤500 chars>",
254
+ "subJudgments": [{ "criterionId": "<id>", "met": <bool>, "evidence": "<≤280 chars>", "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" } }],
255
+ "docCitations": [{ "documentId": "<id>", "slug": "<optional slug>", "role": "supports|contradicts|missing|irrelevant", "hallucinated": <bool> }],
256
+ "failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
257
+ "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" },
258
+ "hallucinationCheckedAgainst": ["<doc id>"],
259
+ "metadata": { "graderModel": "<string>", "graderJudgmentsVersion": "${graderJudgmentsVersion}" }
260
+ }`,
225
261
  })