@sanity/ailf 4.5.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/canonical/grader-references/agent-harness-tools.yaml +42 -0
  2. package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
  3. package/canonical/grader-references/mcp-server-spec.yaml +51 -0
  4. package/canonical/grader-references/portable-text.yaml +48 -0
  5. package/config/rubrics.ts +38 -2
  6. package/dist/_vendor/ailf-core/artifact-registry.d.ts +197 -2
  7. package/dist/_vendor/ailf-core/artifact-registry.js +419 -5
  8. package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
  9. package/dist/_vendor/ailf-core/examples/index.js +146 -47
  10. package/dist/_vendor/ailf-core/ports/context.d.ts +26 -0
  11. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  12. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  13. package/dist/_vendor/ailf-core/ports/llm-client.d.ts +112 -0
  14. package/dist/_vendor/ailf-core/ports/llm-client.js +68 -0
  15. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
  16. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
  17. package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
  18. package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
  19. package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
  20. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  21. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
  22. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  23. package/dist/_vendor/ailf-core/schemas/index.js +9 -0
  24. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  25. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
  26. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
  27. package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
  28. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
  29. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
  30. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
  31. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
  32. package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
  33. package/dist/_vendor/ailf-core/services/index.js +5 -0
  34. package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
  35. package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
  36. package/dist/_vendor/ailf-core/types/attribution.js +18 -0
  37. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
  38. package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
  39. package/dist/_vendor/ailf-core/types/confidence.d.ts +68 -0
  40. package/dist/_vendor/ailf-core/types/confidence.js +56 -0
  41. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
  42. package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
  43. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
  44. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
  45. package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
  46. package/dist/_vendor/ailf-core/types/index.d.ts +82 -29
  47. package/dist/_vendor/ailf-core/types/index.js +16 -1
  48. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
  49. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
  50. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  51. package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
  52. package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
  53. package/dist/adapters/api-client/build-request.d.ts +1 -0
  54. package/dist/adapters/api-client/build-request.js +3 -0
  55. package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
  56. package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
  57. package/dist/adapters/attribution/index.d.ts +9 -0
  58. package/dist/adapters/attribution/index.js +8 -0
  59. package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
  60. package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
  61. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  62. package/dist/adapters/grader-outputs/index.d.ts +10 -0
  63. package/dist/adapters/grader-outputs/index.js +8 -0
  64. package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
  65. package/dist/adapters/grader-outputs/legacy/index.js +10 -0
  66. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
  67. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
  68. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
  69. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
  70. package/dist/adapters/index.d.ts +3 -0
  71. package/dist/adapters/index.js +4 -0
  72. package/dist/adapters/llm/anthropic-llm-client.d.ts +48 -0
  73. package/dist/adapters/llm/anthropic-llm-client.js +205 -0
  74. package/dist/adapters/llm/fake-llm-client.d.ts +49 -0
  75. package/dist/adapters/llm/fake-llm-client.js +63 -0
  76. package/dist/adapters/llm/index.d.ts +9 -0
  77. package/dist/adapters/llm/index.js +4 -0
  78. package/dist/adapters/llm/openai-llm-client.d.ts +44 -0
  79. package/dist/adapters/llm/openai-llm-client.js +168 -0
  80. package/dist/adapters/llm/pricing.d.ts +12 -0
  81. package/dist/adapters/llm/pricing.js +8 -0
  82. package/dist/adapters/llm/retry.d.ts +56 -0
  83. package/dist/adapters/llm/retry.js +66 -0
  84. package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
  85. package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
  86. package/dist/adapters/task-sources/repo-schemas.d.ts +90 -22
  87. package/dist/adapters/task-sources/repo-schemas.js +19 -2
  88. package/dist/artifact-capture/api-gateway-artifact-writer.js +2 -1
  89. package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +2 -1
  90. package/dist/artifact-capture/gcs-artifact-writer.js +3 -1
  91. package/dist/artifact-capture/local-fs-artifact-writer.js +3 -1
  92. package/dist/commands/calculate-scores.js +1 -1
  93. package/dist/commands/explain-handler.js +1 -1
  94. package/dist/commands/lookup-doc.d.ts +1 -1
  95. package/dist/commands/lookup-doc.js +3 -3
  96. package/dist/commands/pipeline-action.d.ts +6 -0
  97. package/dist/commands/pipeline-action.js +2 -0
  98. package/dist/commands/remote-pipeline.js +1 -0
  99. package/dist/composition-root.d.ts +59 -1
  100. package/dist/composition-root.js +95 -0
  101. package/dist/config/rubrics.ts +38 -2
  102. package/dist/grader/agent-harness.d.ts +14 -0
  103. package/dist/grader/agent-harness.js +17 -0
  104. package/dist/grader/common.d.ts +17 -0
  105. package/dist/grader/common.js +21 -0
  106. package/dist/grader/index.d.ts +38 -0
  107. package/dist/grader/index.js +75 -0
  108. package/dist/grader/knowledge-probe.d.ts +14 -0
  109. package/dist/grader/knowledge-probe.js +18 -0
  110. package/dist/grader/literacy.d.ts +13 -0
  111. package/dist/grader/literacy.js +17 -0
  112. package/dist/grader/mcp.d.ts +14 -0
  113. package/dist/grader/mcp.js +18 -0
  114. package/dist/orchestration/build-app-context.js +1 -0
  115. package/dist/orchestration/build-step-sequence.js +5 -0
  116. package/dist/orchestration/steps/calculate-scores-step.js +23 -1
  117. package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
  118. package/dist/orchestration/steps/compute-attribution-step.js +279 -0
  119. package/dist/orchestration/steps/gap-analysis-step.js +35 -7
  120. package/dist/orchestration/steps/index.d.ts +1 -0
  121. package/dist/orchestration/steps/index.js +1 -0
  122. package/dist/pipeline/attribution.d.ts +15 -0
  123. package/dist/pipeline/attribution.js +18 -9
  124. package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
  125. package/dist/pipeline/borderline-consensus-runner.js +124 -0
  126. package/dist/pipeline/borderline-detector.d.ts +24 -0
  127. package/dist/pipeline/borderline-detector.js +26 -0
  128. package/dist/pipeline/calculate-scores.d.ts +114 -3
  129. package/dist/pipeline/calculate-scores.js +426 -24
  130. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  131. package/dist/pipeline/compiler/literacy-bridge.js +35 -17
  132. package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
  133. package/dist/pipeline/compiler/rubric-resolution.js +9 -1
  134. package/dist/pipeline/compute-attribution.d.ts +80 -0
  135. package/dist/pipeline/compute-attribution.js +196 -0
  136. package/dist/pipeline/failure-modes.d.ts +52 -17
  137. package/dist/pipeline/failure-modes.js +178 -117
  138. package/dist/pipeline/map-request-to-config.js +1 -0
  139. package/package.json +6 -4
@@ -0,0 +1,42 @@
1
+ # agent-harness-tools.yaml — Human-graded reference sample for agent-harness tasks
2
+ #
3
+ # Phase 3 GRAD-03 calibration fixture (Plan 03-02). Top-level
4
+ # `expectedFailureMode` + `dimension` are read by the calibration test in
5
+ # packages/eval/src/grader/__tests__/calibration.test.ts.
6
+
7
+ - taskId: agent-multi-step-schema-edit
8
+ area: agent-harness
9
+ dimension: agent-harness
10
+ expectedFailureMode: tool-misuse
11
+ response: |
12
+ To add a `coverImage` field to the `post` schema and migrate existing
13
+ documents, the agent ran the following tool calls:
14
+
15
+ 1. `read_file({ path: "schemas/post.ts" })` — read the schema.
16
+ 2. `write_file({ path: "schemas/post.ts", content: "<new schema>" })` — wrote the updated schema.
17
+ 3. `run_command({ command: "sanity migrate add-cover-image" })` — attempted to run a migration.
18
+
19
+ The `sanity migrate` command failed because no migration script exists at
20
+ that name. The agent then stopped without creating the migration file.
21
+ rubrics:
22
+ - dimension: tool-usage
23
+ rubricText: |
24
+ Score the agent's tool usage from 0 to 100:
25
+ - 0: Completely wrong tool usage — called wrong tools or with invalid inputs
26
+ - 25: Poor tool usage — correct tools but wrong parameters or sequencing
27
+ - 50: Adequate — correct tools and basic parameters, some inefficiency
28
+ - 75: Good — efficient tool usage with proper error handling
29
+ - 100: Excellent — optimal tool selection, correct inputs, minimal redundancy
30
+
31
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
32
+ humanScore: 25
33
+ notes: |
34
+ The agent invoked `run_command` with a migration name that did not
35
+ exist in the project, instead of first listing available migrations
36
+ with `list_migrations` (or equivalent) or scaffolding a new migration
37
+ file with the correct tool. The migration step was a tool-misuse:
38
+ the right tool exists, but the agent supplied invalid arguments
39
+ (a non-existent migration name) and then failed to recover by
40
+ scaffolding a new migration. Failure mode: tool-misuse.
41
+ gradedBy: "senior-engineer"
42
+ gradedAt: "2026-05-10"
@@ -0,0 +1,36 @@
1
+ # knowledge-probe-recall.yaml — Human-graded reference sample for KP tasks
2
+ #
3
+ # Phase 3 GRAD-03 calibration fixture (Plan 03-02). Top-level
4
+ # `expectedFailureMode` + `dimension` are read by the calibration test in
5
+ # packages/eval/src/grader/__tests__/calibration.test.ts.
6
+
7
+ - taskId: kp-default-dataset
8
+ area: knowledge-probe
9
+ dimension: knowledge-probe
10
+ expectedFailureMode: factual-error
11
+ response: |
12
+ The default dataset for new Sanity Studio projects is `production`. When
13
+ you run `npm create sanity@latest` and accept the defaults, the CLI
14
+ creates a dataset named `production`.
15
+ rubrics:
16
+ - dimension: factual-correctness
17
+ rubricText: |
18
+ Score factual correctness from 0 to 100:
19
+ - 0: Completely wrong — fabricated facts, hallucinated APIs, incorrect syntax
20
+ - 25: Mostly wrong — some real concepts but significant factual errors
21
+ - 50: Mixed — some correct facts alongside notable inaccuracies
22
+ - 75: Mostly correct — minor inaccuracies or imprecisions
23
+ - 100: Fully correct — all statements are accurate and verifiable
24
+
25
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
26
+ humanScore: 30
27
+ notes: |
28
+ Per current Sanity CLI docs, the dataset created by
29
+ `npm create sanity@latest` is selectable, with `production` being
30
+ the conventional default name surfaced in the prompt — but the
31
+ candidate phrases this as a deterministic default rather than a
32
+ prompt-driven choice. The factual claim "the default is production"
33
+ is wrong as stated; the CLI prompts for a dataset name and offers
34
+ `production` as a suggested value. Failure mode: factual-error.
35
+ gradedBy: "senior-engineer"
36
+ gradedAt: "2026-05-10"
@@ -0,0 +1,51 @@
1
+ # mcp-server-spec.yaml — Human-graded reference sample for MCP server tasks
2
+ #
3
+ # Phase 3 GRAD-03 calibration fixture (Plan 03-02). Top-level
4
+ # `expectedFailureMode` + `dimension` are read by the calibration test in
5
+ # packages/eval/src/grader/__tests__/calibration.test.ts.
6
+
7
+ - taskId: mcp-search-tool-pagination
8
+ area: mcp-server
9
+ dimension: mcp-behavior
10
+ expectedFailureMode: spec-mismatch
11
+ response: |
12
+ Here's an MCP `search` tool that returns paginated results:
13
+
14
+ ```typescript
15
+ import { Server } from "@modelcontextprotocol/sdk/server/index.js"
16
+
17
+ server.setRequestHandler({ method: "tools/call" }, async (request) => {
18
+ if (request.params.name === "search") {
19
+ const { query, page = 1, pageSize = 10 } = request.params.arguments
20
+ const results = await searchIndex(query, page, pageSize)
21
+ return {
22
+ content: [
23
+ { type: "text", text: JSON.stringify(results) },
24
+ ],
25
+ nextPageToken: page + 1,
26
+ }
27
+ }
28
+ })
29
+ ```
30
+ rubrics:
31
+ - dimension: output-correctness
32
+ rubricText: |
33
+ Score MCP tool output handling from 0 to 100:
34
+ - 0: Completely failed to use tool outputs
35
+ - 25: Used outputs but misinterpreted the data
36
+ - 50: Partially correct interpretation of tool outputs
37
+ - 75: Mostly correct — minor misuse of returned data
38
+ - 100: Perfect output handling — correctly interpreted all tool responses
39
+
40
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
41
+ humanScore: 30
42
+ notes: |
43
+ The candidate adds a `nextPageToken` field at the top level of the
44
+ tool-call response. The published MCP `tools/call` response spec only
45
+ defines `content`, `isError`, and `_meta` at the top level; arbitrary
46
+ top-level fields like `nextPageToken` are not part of the spec and
47
+ are ignored by spec-compliant clients. Pagination metadata belongs
48
+ inside the `content` block (e.g., as a structured JSON payload) or
49
+ in `_meta`. Failure mode: spec-mismatch.
50
+ gradedBy: "senior-engineer"
51
+ gradedAt: "2026-05-10"
@@ -0,0 +1,48 @@
1
+ # portable-text.yaml — Human-graded reference sample for Portable Text tasks
2
+ #
3
+ # Phase 3 GRAD-03 calibration fixture (Plan 03-02). Top-level
4
+ # `expectedFailureMode` + `dimension` are read by the calibration test in
5
+ # packages/eval/src/grader/__tests__/calibration.test.ts.
6
+
7
+ - taskId: portable-text-react-marks
8
+ area: portable-text
9
+ dimension: doc-coverage
10
+ expectedFailureMode: outdated-docs
11
+ response: |
12
+ To customize the marks renderer in @portabletext/react, you can pass a `serializers` prop:
13
+
14
+ ```tsx
15
+ import { PortableText } from "@portabletext/react"
16
+
17
+ const serializers = {
18
+ marks: {
19
+ link: ({ children, mark }) => (
20
+ <a href={mark.href} target="_blank" rel="noopener">{children}</a>
21
+ ),
22
+ },
23
+ }
24
+
25
+ export function Body({ value }: { value: any }) {
26
+ return <PortableText blocks={value} serializers={serializers} />
27
+ }
28
+ ```
29
+ rubrics:
30
+ - dimension: doc-coverage
31
+ rubricText: |
32
+ Score documentation coverage from 0 to 100:
33
+ - 0: Had to hallucinate/guess most implementation details
34
+ - 30: Significant gaps — filled with assumptions
35
+ - 50: Some gaps — inferred from partial information
36
+ - 80: Minor gaps — almost everything was documented
37
+ - 100: Complete coverage — all necessary info was in docs
38
+
39
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
40
+ humanScore: 25
41
+ notes: |
42
+ The candidate uses the legacy `serializers` and `blocks` props, which
43
+ come from @sanity/block-content-to-react; @portabletext/react names
44
+ them `components` and `value`. The doc the candidate is following is
45
+ outdated — current docs document the component-based API. Failure
46
+ mode: outdated-docs (the doc reflects an older API).
47
+ gradedBy: "senior-engineer"
48
+ gradedAt: "2026-05-10"
package/config/rubrics.ts CHANGED
@@ -11,6 +11,15 @@
11
11
 
12
12
  import { defineRubrics } from "@sanity/ailf-core"
13
13
 
14
+ // Plan 03-02 — per-dimension failure-mode taxonomies stamped onto each
15
+ // template entry below. Source of truth lives in packages/eval/src/grader/;
16
+ // the helper picks the right list by dimension family.
17
+ import { failureModesForDimension } from "../src/grader/index.js"
18
+ // Single source of truth for the wire-format version stamped into the
19
+ // grader-prompt footer (VER-01 D-02). Interpolated below so the
20
+ // announced version cannot drift from the schema's expected value.
21
+ import { graderJudgmentsVersion } from "../src/adapters/grader-outputs/index.js"
22
+
14
23
  export default defineRubrics({
15
24
  templates: {
16
25
  // ── Core literacy dimensions ────────────────────────────
@@ -25,6 +34,7 @@ export default defineRubrics({
25
34
  "100: Fully functional code — works as expected",
26
35
  ],
27
36
  criteria_label: "Must demonstrate:",
37
+ failureModes: failureModesForDimension("task-completion"),
28
38
  },
29
39
  "code-correctness": {
30
40
  dimension: "code-correctness",
@@ -37,6 +47,7 @@ export default defineRubrics({
37
47
  "100: Follows all best practices, idiomatic implementation",
38
48
  ],
39
49
  criteria_label: "Check for:",
50
+ failureModes: failureModesForDimension("code-correctness"),
40
51
  },
41
52
  "doc-coverage": {
42
53
  dimension: "doc-coverage",
@@ -48,6 +59,7 @@ export default defineRubrics({
48
59
  "80: Minor gaps — almost everything was documented",
49
60
  "100: Complete coverage — all necessary info was in docs",
50
61
  ],
62
+ failureModes: failureModesForDimension("doc-coverage"),
51
63
  },
52
64
 
53
65
  // ── MCP server dimensions ───────────────────────────────
@@ -62,6 +74,7 @@ export default defineRubrics({
62
74
  "100: Perfect tool inputs — all parameters correct and well-formed",
63
75
  ],
64
76
  criteria_label: "Evaluate:",
77
+ failureModes: failureModesForDimension("input-validation"),
65
78
  },
66
79
  "mcp-output-correctness": {
67
80
  dimension: "output-correctness",
@@ -74,6 +87,7 @@ export default defineRubrics({
74
87
  "100: Perfect output handling — correctly interpreted all tool responses",
75
88
  ],
76
89
  criteria_label: "Check for:",
90
+ failureModes: failureModesForDimension("output-correctness"),
77
91
  },
78
92
  "mcp-error-handling": {
79
93
  dimension: "error-handling",
@@ -86,6 +100,7 @@ export default defineRubrics({
86
100
  "100: Excellent — handled all errors appropriately with clear messaging",
87
101
  ],
88
102
  criteria_label: "Evaluate:",
103
+ failureModes: failureModesForDimension("error-handling"),
89
104
  },
90
105
  "mcp-security": {
91
106
  dimension: "security",
@@ -98,6 +113,7 @@ export default defineRubrics({
98
113
  "100: Perfect security — only used authorized tools with safe inputs",
99
114
  ],
100
115
  criteria_label: "Check for:",
116
+ failureModes: failureModesForDimension("security"),
101
117
  },
102
118
 
103
119
  // ── Knowledge probe dimensions ──────────────────────────
@@ -112,6 +128,7 @@ export default defineRubrics({
112
128
  "100: Fully correct — all statements are accurate and verifiable",
113
129
  ],
114
130
  criteria_label: "Verify:",
131
+ failureModes: failureModesForDimension("factual-correctness"),
115
132
  },
116
133
  completeness: {
117
134
  dimension: "completeness",
@@ -124,6 +141,7 @@ export default defineRubrics({
124
141
  "100: Comprehensive — thorough coverage of all important aspects",
125
142
  ],
126
143
  criteria_label: "Check coverage of:",
144
+ failureModes: failureModesForDimension("completeness"),
127
145
  },
128
146
  currency: {
129
147
  dimension: "currency",
@@ -136,6 +154,7 @@ export default defineRubrics({
136
154
  "100: Fully current — references latest APIs, patterns, and best practices",
137
155
  ],
138
156
  criteria_label: "Check for:",
157
+ failureModes: failureModesForDimension("currency"),
139
158
  },
140
159
 
141
160
  // ── Agent harness dimensions ────────────────────────────
@@ -151,6 +170,7 @@ export default defineRubrics({
151
170
  "100: Excellent process — optimal tool usage, clear planning, graceful recovery",
152
171
  ],
153
172
  criteria_label: "Evaluate:",
173
+ failureModes: failureModesForDimension("process-quality"),
154
174
  },
155
175
  "agent-output": {
156
176
  dimension: "agent-output",
@@ -163,6 +183,7 @@ export default defineRubrics({
163
183
  "100: Excellent output — fully correct, clean, and complete",
164
184
  ],
165
185
  criteria_label: "Check for:",
186
+ failureModes: failureModesForDimension("agent-output"),
166
187
  },
167
188
  "agent-tool-usage": {
168
189
  dimension: "tool-usage",
@@ -175,6 +196,7 @@ export default defineRubrics({
175
196
  "100: Excellent — optimal tool selection, correct inputs, minimal redundancy",
176
197
  ],
177
198
  criteria_label: "Evaluate:",
199
+ failureModes: failureModesForDimension("tool-usage"),
178
200
  },
179
201
  },
180
202
 
@@ -220,6 +242,20 @@ export default defineRubrics({
220
242
  "agent-harness": { gold: "agent-harness" },
221
243
  },
222
244
 
223
- footer:
224
- 'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
245
+ // Phase 3 GRAD-05 (Plan 03-01) — structured GraderJudgment JSON sketch.
246
+ // Documents the target wire format the grader emits. The strict schema's
247
+ // GRAD-02 additive fields stay optional in this plan; Plan 03-04 flips
248
+ // them to required and bumps graderJudgmentsVersion to 1.0.0.
249
+ footer: `Return ONLY a JSON object with this exact shape:
250
+ {
251
+ "judgmentId": "<string>",
252
+ "score": <number 0-100>,
253
+ "reason": "<explanation, ≤500 chars>",
254
+ "subJudgments": [{ "criterionId": "<id>", "met": <bool>, "evidence": "<≤280 chars>", "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" } }],
255
+ "docCitations": [{ "documentId": "<id>", "slug": "<optional slug>", "role": "supports|contradicts|missing|irrelevant", "hallucinated": <bool> }],
256
+ "failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
257
+ "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" },
258
+ "hallucinationCheckedAgainst": ["<doc id>"],
259
+ "metadata": { "graderModel": "<string>", "graderJudgmentsVersion": "${graderJudgmentsVersion}" }
260
+ }`,
225
261
  })
@@ -29,6 +29,44 @@ import type { AssociationAxis, AssociationValues, EntryKey, RunId } from "./type
29
29
  export type ArtifactLayout = "bulk" | "per-entry";
30
30
  /** MIME types the registry knows how to place on disk. */
31
31
  export type ArtifactMime = "application/json" | "application/x-ndjson" | "text/markdown" | "application/yaml";
32
+ /**
33
+ * Who is permitted to write the artifact (D0050).
34
+ *
35
+ * `"pipeline"` artifacts are written during a pipeline run by a pipeline
36
+ * step (the legacy default — every pre-D0050 descriptor is implicitly
37
+ * `"pipeline"`).
38
+ *
39
+ * `"post-hoc"` artifacts are written **after** the run is finalized, by
40
+ * a separate command/action (e.g. `ailf interpret`, a Studio action, an
41
+ * API endpoint). Post-hoc artifacts may accumulate multiple versions
42
+ * within a single run prefix; the writer-side guard
43
+ * (`assertWritePolicyMatches`) prevents pipeline writers from emitting
44
+ * post-hoc descriptors and vice versa.
45
+ */
46
+ export type WritePolicy = "pipeline" | "post-hoc";
47
+ /**
48
+ * Source-of-version for descriptors that opt into the versioning axis
49
+ * (D0050). The path builder consumes the version segment to produce
50
+ * `runs/{runId}/{slug}-{version}.{ext}`.
51
+ *
52
+ * - `"schemaVersion"` — version tracks the writer's serialized-shape
53
+ * schema (e.g. attribution payload schema).
54
+ * - `"promptVersion"` — version tracks the prompt the artifact was
55
+ * produced under (e.g. a regenerated grader prompt).
56
+ * - `"diagnosisVersion"` — version tracks the diagnosis run that produced
57
+ * the artifact (e.g. `ailf interpret`'s versioned output).
58
+ *
59
+ * The union is intentionally narrow; D0050 leaves a function-shaped
60
+ * versioner as a future extension if version semantics get richer.
61
+ */
62
+ export type VersionedBy = "schemaVersion" | "promptVersion" | "diagnosisVersion";
63
+ /**
64
+ * Identity of the calling context when invoking the artifact writer.
65
+ * Mirrors `WritePolicy` but at the writer-instance level (a writer is
66
+ * either a pipeline writer or a post-hoc writer; descriptors declare
67
+ * which kind of writer is permitted to emit them).
68
+ */
69
+ export type WriteSource = WritePolicy;
32
70
  /**
33
71
  * Behavior when a payload exceeds a descriptor's `capBytes`:
34
72
  * - `"reject"` — drop the write and log a warning (default for bounded entries).
@@ -41,7 +79,7 @@ export type ArtifactMime = "application/json" | "application/x-ndjson" | "text/m
41
79
  */
42
80
  export type ArtifactTruncationPolicy = "reject" | "trailing-truncate" | "fielded-truncate" | "trial-oversize";
43
81
  /** The union of every artifact type known to AILF. */
44
- export type ArtifactType = "runManifest" | "scoreSummary" | "pipelineResult" | "pipelineContext" | "documentManifest" | "prComment" | "readinessReport" | "reportSnapshot" | "autoComparison" | "gapReport" | "sinkResults" | "callbackRequest" | "callbackResponse" | "configSnapshot" | "evalConfigGenerated" | "comparisonReport" | "discoveryReport" | "failureModes" | "renderedPrompts" | "rawResults" | "testOutputs" | "graderPrompts" | "graderJudgments" | "symbolPreflight" | "traces";
82
+ export type ArtifactType = "runManifest" | "scoreSummary" | "pipelineResult" | "pipelineContext" | "documentManifest" | "prComment" | "readinessReport" | "reportSnapshot" | "autoComparison" | "gapReport" | "sinkResults" | "callbackRequest" | "callbackResponse" | "configSnapshot" | "evalConfigGenerated" | "comparisonReport" | "discoveryReport" | "failureModes" | "renderedPrompts" | "rawResults" | "testOutputs" | "graderPrompts" | "graderJudgments" | "symbolPreflight" | "traces" | "diagnosis" | "perEntryAttribution" | "attributionMeta";
45
83
  /**
46
84
  * Result of parsing a per-entry key into a sanitized filename component.
47
85
  * Success carries the sanitized value; failure carries a reason for 4xx responses.
@@ -76,6 +114,32 @@ export interface ManifestPreviewDeclaration<TPreview = unknown> {
76
114
  readonly extract: (entry: unknown) => TPreview;
77
115
  readonly capBytes: number;
78
116
  }
117
+ /**
118
+ * Callback signature for `ArtifactDescriptor.objectPath`. Aliased so
119
+ * external path builders (e.g. `diagnosisPathBuilder`) declare a named
120
+ * return type rather than re-typing the inline arrow signature.
121
+ */
122
+ export type ArtifactObjectPath = (runId: RunId, entryKey?: string, version?: string) => string;
123
+ /**
124
+ * Opt-in marker for the bulk-versioned-with-report-axis carve-out
125
+ * (Plan 01-03 / ITER2-BLOCKER-01). Any descriptor that needs to combine
126
+ * `layout: "bulk"` with a `report` axis (e.g. the post-hoc diagnosis
127
+ * artifact) must set
128
+ * `pathSafetyMarker: BULK_VERSIONED_WITH_REPORT_AXIS` to opt in to the
129
+ * carve-out in `assertValidArtifactDescriptor`.
130
+ *
131
+ * `Symbol.for(...)` is registry-keyed: it survives minification (the
132
+ * key is a string literal that minifiers cannot rename) and yields
133
+ * cross-module-instance equality by spec. This makes it bundler-stable
134
+ * — a deliberate replacement for any source-text reflection on the
135
+ * descriptor's `objectPath` closure body.
136
+ *
137
+ * The marker alone does NOT trigger the carve-out — the four structural
138
+ * conditions (bulk + versionedBy + run+report axes) must also hold.
139
+ * Conversely, the structural conditions alone do NOT trigger the
140
+ * carve-out without the marker. This is an explicit opt-in.
141
+ */
142
+ export declare const BULK_VERSIONED_WITH_REPORT_AXIS: unique symbol;
79
143
  /**
80
144
  * Per-type declaration consumed by writers, signers, and readers.
81
145
  *
@@ -105,12 +169,37 @@ export interface ArtifactDescriptor<TEntry = unknown, TPreview = unknown> {
105
169
  * catalog honest about what is absent vs. failed.
106
170
  */
107
171
  readonly optional?: boolean;
172
+ /**
173
+ * Who writes this artifact (D0050). When unset, defaults to `"pipeline"`
174
+ * — matching every pre-D0050 descriptor's behavior. `"post-hoc"`
175
+ * artifacts are emitted by a separate command after the run finalizes
176
+ * and may accumulate multiple versions per run prefix.
177
+ */
178
+ readonly writePolicy?: WritePolicy;
179
+ /**
180
+ * Source of the version segment in the path (D0050). When set, the
181
+ * descriptor's `objectPath` produces a versioned path
182
+ * (`runs/{runId}/{slug}-{version}.{ext}`) and `version` is required.
183
+ * When unset, the path is unversioned (legacy behavior).
184
+ */
185
+ readonly versionedBy?: VersionedBy;
186
+ /**
187
+ * Optional opt-in marker for the bulk-versioned-with-report-axis
188
+ * carve-out in `assertValidArtifactDescriptor` (Plan 01-03 /
189
+ * ITER2-BLOCKER-01). See `BULK_VERSIONED_WITH_REPORT_AXIS` for full
190
+ * semantics. Descriptors without this marker are subject to the
191
+ * standard bounded-axis rule.
192
+ */
193
+ readonly pathSafetyMarker?: typeof BULK_VERSIONED_WITH_REPORT_AXIS;
108
194
  /**
109
195
  * Build the GCS object path for this artifact.
110
196
  * - bulk: returns `runs/{runId}/{slug}.{ext}`; `entryKey` is ignored.
111
197
  * - per-entry: requires `entryKey`; returns `runs/{runId}/{slug}/{sanitized}.{ext}`.
198
+ * - versioned (`versionedBy` set): requires `version`; returns
199
+ * `runs/{runId}/{slug}-{version}.{ext}` for bulk-shaped versioned
200
+ * descriptors. `entryKey` is ignored on versioned bulk paths.
112
201
  */
113
- readonly objectPath: (runId: RunId, entryKey?: string) => string;
202
+ readonly objectPath: ArtifactObjectPath;
114
203
  /**
115
204
  * Build a filename-safe entry key from association values. Only meaningful
116
205
  * for `layout === "per-entry"` — bulk descriptors omit it.
@@ -130,6 +219,46 @@ export interface ArtifactDescriptor<TEntry = unknown, TPreview = unknown> {
130
219
  */
131
220
  readonly manifestPreview?: ManifestPreviewDeclaration<TPreview>;
132
221
  }
222
+ /**
223
+ * Bulk-shaped path builder with a `{version}` segment appended to the
224
+ * filename stem (D0050). Used by descriptors that opt into the versioning
225
+ * axis via `versionedBy`. Multiple versions coexist under the same run
226
+ * prefix as siblings:
227
+ *
228
+ * `runs/{runId}/{slug}-v1.{ext}`
229
+ * `runs/{runId}/{slug}-v2.{ext}`
230
+ *
231
+ * The version segment is sanitized via `sanitizeEntryKey` so versions
232
+ * containing `/` or wire separators don't accidentally nest into
233
+ * subdirectories. Empty / whitespace-only versions are rejected — a
234
+ * versioned descriptor with no version is a programmer error and is never silently
235
+ * collapsed to an unversioned path.
236
+ */
237
+ export declare function versionedPathBuilder(slug: string, mime: ArtifactMime): (runId: RunId, _entryKey?: string, version?: string) => string;
238
+ /**
239
+ * Two-axis post-hoc path builder for the diagnosis descriptor (Plan 01-03,
240
+ * D-09). Filename: `diagnosis-{diagnosisVersion}-{cardSetShortHash}.json`
241
+ * where `cardSetShortHash = sha256(cardVersion).slice(0, 12)`.
242
+ *
243
+ * The compound version argument is `${diagnosisVersion}|${cardVersion}`
244
+ * to fit the existing 3-arg `objectPath` signature without growing the
245
+ * shared `ArtifactObjectPath` shape. Callers should use
246
+ * `encodeDiagnosisPathVersion()` rather than building the compound
247
+ * string by hand.
248
+ *
249
+ * The diagnosis descriptor opts into the bulk-versioned-with-report-axis
250
+ * carve-out via `pathSafetyMarker: BULK_VERSIONED_WITH_REPORT_AXIS`
251
+ * (ITER2-BLOCKER-01). The builder itself does not need to know about
252
+ * the marker — it just builds the path; the carve-out is a registration-
253
+ * time concern in `assertValidArtifactDescriptor`.
254
+ */
255
+ export declare function diagnosisPathBuilder(): ArtifactObjectPath;
256
+ /**
257
+ * Pack the diagnosis descriptor's two version segments into the existing
258
+ * single-string `version` slot of `ArtifactObjectPath`. Separator is `|`;
259
+ * `diagnosisVersion` MUST NOT contain `|` (the function rejects that case).
260
+ */
261
+ export declare function encodeDiagnosisPathVersion(diagnosisVersion: string, cardVersion: string): string;
133
262
  /** Test-only reset for the legacy-key warning flag. Not exported publicly. */
134
263
  export declare function __resetLegacyTestOutputsWarning(): void;
135
264
  /**
@@ -154,6 +283,72 @@ export declare function isArtifactType(value: string): value is ArtifactType;
154
283
  * tests can construct an invalid descriptor inline and assert the throw.
155
284
  */
156
285
  export declare function assertValidArtifactDescriptor(desc: ArtifactDescriptor): void;
286
+ /**
287
+ * Thrown when a writer's identity (`writeSource`) doesn't match a
288
+ * descriptor's `writePolicy`. Pipeline writers can't emit `"post-hoc"`
289
+ * descriptors (the post-hoc artifact would land mid-run, before the run
290
+ * finalizes); post-hoc writers can't emit `"pipeline"` descriptors
291
+ * (those should have been written by the pipeline itself).
292
+ *
293
+ * The error type is intentionally a dedicated subclass of `Error` so CI can match
294
+ * on the class and surface mismatches clearly in failure logs.
295
+ */
296
+ export declare class WritePolicyMismatchError extends Error {
297
+ readonly code: "WRITE_POLICY_MISMATCH";
298
+ readonly artifactType: ArtifactType;
299
+ readonly descriptorPolicy: WritePolicy;
300
+ readonly writerSource: WriteSource;
301
+ constructor(opts: {
302
+ artifactType: ArtifactType;
303
+ descriptorPolicy: WritePolicy;
304
+ writerSource: WriteSource;
305
+ });
306
+ }
307
+ /**
308
+ * Resolve a descriptor's effective write policy. Defaults to `"pipeline"`
309
+ * when unset — preserves backward compatibility with every pre-D0050
310
+ * descriptor that doesn't declare the field.
311
+ */
312
+ export declare function resolveWritePolicy(desc: ArtifactDescriptor): WritePolicy;
313
+ /**
314
+ * Writer-side guard. Call at the top of `emit()` / `appendNdjson()` in
315
+ * every artifact writer that physically writes bytes (the in-memory test
316
+ * doubles don't need it). Throws `WritePolicyMismatchError` on
317
+ * mismatch; returns silently on a match. Pure function — no I/O, safe
318
+ * to invoke from any layer.
319
+ */
320
+ export declare function assertWritePolicyMatches(writerSource: WriteSource, descriptor: ArtifactDescriptor): void;
321
+ /**
322
+ * Slim-shape preview for `"post-hoc"` descriptors. Replaces the
323
+ * fixed-path semantics that pipeline-written artifacts use (a single
324
+ * known path per descriptor) with `present: boolean` plus an optional
325
+ * `latestVersion: string` — necessary because:
326
+ *
327
+ * - A post-hoc artifact may have zero versions (never written) or
328
+ * multiple versions (regenerated). A fixed `path` cannot encode
329
+ * either.
330
+ * - Slim-shape consumers (Studio rollups) want to know "is there a
331
+ * diagnosis for this run?" without enumerating versioned siblings.
332
+ *
333
+ * Post-hoc writers populate `latestVersion` after a successful write
334
+ * (last-write-wins per-version semantics, per D0050 open-question
335
+ * resolution). The full versioned payload is fetched via the
336
+ * descriptor's `objectPath(runId, undefined, latestVersion)`.
337
+ */
338
+ export declare const postHocSlimPreviewSchema: z.ZodObject<{
339
+ present: z.ZodBoolean;
340
+ latestVersion: z.ZodOptional<z.ZodString>;
341
+ }, z.core.$strip>;
342
+ export type PostHocSlimPreview = z.infer<typeof postHocSlimPreviewSchema>;
343
+ /**
344
+ * Build a post-hoc slim-shape preview. `latestVersion` is omitted when
345
+ * absent rather than emitted as `undefined`, matching the optional-field
346
+ * convention used elsewhere in the registry.
347
+ */
348
+ export declare function buildPostHocSlimPreview(opts: {
349
+ present: boolean;
350
+ latestVersion?: string;
351
+ }): PostHocSlimPreview;
157
352
  /**
158
353
  * Build the inline preview for a manifest entry at write time. Returns
159
354
  * `undefined` when the descriptor has no `manifestPreview` declaration,