@sanity/ailf 4.6.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. package/canonical/grader-references/agent-harness-tools.yaml +42 -0
  2. package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
  3. package/canonical/grader-references/mcp-server-spec.yaml +51 -0
  4. package/canonical/grader-references/portable-text.yaml +48 -0
  5. package/config/rubrics.ts +38 -2
  6. package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
  7. package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
  8. package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
  9. package/dist/_vendor/ailf-core/examples/index.js +146 -47
  10. package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
  11. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
  12. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
  13. package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
  14. package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
  15. package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
  16. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  17. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
  18. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  19. package/dist/_vendor/ailf-core/schemas/index.js +9 -0
  20. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  21. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
  22. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
  23. package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
  24. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
  25. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
  26. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
  27. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
  28. package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
  29. package/dist/_vendor/ailf-core/services/index.js +5 -0
  30. package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
  31. package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
  32. package/dist/_vendor/ailf-core/types/attribution.js +18 -0
  33. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
  34. package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
  35. package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
  36. package/dist/_vendor/ailf-core/types/confidence.js +7 -0
  37. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
  38. package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
  39. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
  40. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
  41. package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
  42. package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
  43. package/dist/_vendor/ailf-core/types/index.js +15 -1
  44. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
  45. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
  46. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  47. package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
  48. package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
  49. package/dist/adapters/api-client/build-request.d.ts +1 -0
  50. package/dist/adapters/api-client/build-request.js +3 -0
  51. package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
  52. package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
  53. package/dist/adapters/attribution/index.d.ts +9 -0
  54. package/dist/adapters/attribution/index.js +8 -0
  55. package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
  56. package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
  57. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  58. package/dist/adapters/grader-outputs/index.d.ts +10 -0
  59. package/dist/adapters/grader-outputs/index.js +8 -0
  60. package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
  61. package/dist/adapters/grader-outputs/legacy/index.js +10 -0
  62. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
  63. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
  64. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
  65. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
  66. package/dist/adapters/index.d.ts +3 -0
  67. package/dist/adapters/index.js +4 -0
  68. package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
  69. package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
  70. package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
  71. package/dist/adapters/task-sources/repo-schemas.js +19 -2
  72. package/dist/commands/calculate-scores.js +1 -1
  73. package/dist/commands/explain-handler.js +1 -1
  74. package/dist/commands/lookup-doc.d.ts +1 -1
  75. package/dist/commands/lookup-doc.js +3 -3
  76. package/dist/commands/pipeline-action.d.ts +6 -0
  77. package/dist/commands/pipeline-action.js +2 -0
  78. package/dist/commands/remote-pipeline.js +1 -0
  79. package/dist/composition-root.d.ts +36 -0
  80. package/dist/composition-root.js +48 -0
  81. package/dist/config/rubrics.ts +38 -2
  82. package/dist/grader/agent-harness.d.ts +14 -0
  83. package/dist/grader/agent-harness.js +17 -0
  84. package/dist/grader/common.d.ts +17 -0
  85. package/dist/grader/common.js +21 -0
  86. package/dist/grader/index.d.ts +38 -0
  87. package/dist/grader/index.js +75 -0
  88. package/dist/grader/knowledge-probe.d.ts +14 -0
  89. package/dist/grader/knowledge-probe.js +18 -0
  90. package/dist/grader/literacy.d.ts +13 -0
  91. package/dist/grader/literacy.js +17 -0
  92. package/dist/grader/mcp.d.ts +14 -0
  93. package/dist/grader/mcp.js +18 -0
  94. package/dist/orchestration/build-app-context.js +1 -0
  95. package/dist/orchestration/build-step-sequence.js +5 -0
  96. package/dist/orchestration/steps/calculate-scores-step.js +23 -1
  97. package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
  98. package/dist/orchestration/steps/compute-attribution-step.js +279 -0
  99. package/dist/orchestration/steps/gap-analysis-step.js +35 -7
  100. package/dist/orchestration/steps/index.d.ts +1 -0
  101. package/dist/orchestration/steps/index.js +1 -0
  102. package/dist/pipeline/attribution.d.ts +15 -0
  103. package/dist/pipeline/attribution.js +18 -9
  104. package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
  105. package/dist/pipeline/borderline-consensus-runner.js +124 -0
  106. package/dist/pipeline/borderline-detector.d.ts +24 -0
  107. package/dist/pipeline/borderline-detector.js +26 -0
  108. package/dist/pipeline/calculate-scores.d.ts +114 -3
  109. package/dist/pipeline/calculate-scores.js +426 -24
  110. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  111. package/dist/pipeline/compiler/literacy-bridge.js +35 -17
  112. package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
  113. package/dist/pipeline/compiler/rubric-resolution.js +9 -1
  114. package/dist/pipeline/compute-attribution.d.ts +80 -0
  115. package/dist/pipeline/compute-attribution.js +196 -0
  116. package/dist/pipeline/failure-modes.d.ts +52 -17
  117. package/dist/pipeline/failure-modes.js +178 -117
  118. package/dist/pipeline/map-request-to-config.js +1 -0
  119. package/package.json +6 -4
@@ -0,0 +1,36 @@
1
+ /**
2
+ * confidence-schema.ts — shared Zod schema for the D0049 `Confidence` triple.
3
+ *
4
+ * Authored ONCE here so adapter schemas under
5
+ * `packages/eval/src/adapters/grader-outputs/` and
6
+ * `packages/eval/src/adapters/attribution/` import a single shared schema
7
+ * fragment instead of redeclaring the shape inline. The schema asserts
8
+ * `satisfies z.ZodType<Confidence>` against the domain type in
9
+ * `packages/core/src/types/confidence.ts` so drift is a build error.
10
+ *
11
+ * NON-BOUNDARY HELPER: this file lives outside the D0045 SCAN_ROOTS gate
12
+ * by intent — it is a reusable schema fragment, not a trust boundary.
13
+ * Consumers import via the pinned subpath export
14
+ * `@sanity/ailf-core/schemas` (declared in `packages/core/package.json`),
15
+ * NOT through the top-level barrel — that pin is the
16
+ * single legal access path so all adapter sites use the same specifier.
17
+ *
18
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
19
+ * @see docs/decisions/D0049-shared-confidence-contract.md
20
+ */
21
+ import { z } from "zod";
22
+ /**
23
+ * Shared schema for {@link Confidence}. The `derivation` field is the
24
+ * open `ConfidenceDerivation` tag; we accept any non-empty string so
25
+ * future emitters can mint their own identifiers without editing this
26
+ * package (matches `isConfidence`'s runtime guard).
27
+ */
28
+ export declare const ConfidenceSchema: z.ZodObject<{
29
+ level: z.ZodEnum<{
30
+ low: "low";
31
+ medium: "medium";
32
+ high: "high";
33
+ }>;
34
+ signalsPresent: z.ZodNumber;
35
+ derivation: z.ZodString;
36
+ }, z.core.$strip>;
@@ -0,0 +1,32 @@
1
+ /**
2
+ * confidence-schema.ts — shared Zod schema for the D0049 `Confidence` triple.
3
+ *
4
+ * Authored ONCE here so adapter schemas under
5
+ * `packages/eval/src/adapters/grader-outputs/` and
6
+ * `packages/eval/src/adapters/attribution/` import a single shared schema
7
+ * fragment instead of redeclaring the shape inline. The schema asserts
8
+ * `satisfies z.ZodType<Confidence>` against the domain type in
9
+ * `packages/core/src/types/confidence.ts` so drift is a build error.
10
+ *
11
+ * NON-BOUNDARY HELPER: this file lives outside the D0045 SCAN_ROOTS gate
12
+ * by intent — it is a reusable schema fragment, not a trust boundary.
13
+ * Consumers import via the pinned subpath export
14
+ * `@sanity/ailf-core/schemas` (declared in `packages/core/package.json`),
15
+ * NOT through the top-level barrel — that pin is the
16
+ * single legal access path so all adapter sites use the same specifier.
17
+ *
18
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
19
+ * @see docs/decisions/D0049-shared-confidence-contract.md
20
+ */
21
+ import { z } from "zod";
22
+ /**
23
+ * Shared schema for {@link Confidence}. The `derivation` field is the
24
+ * open `ConfidenceDerivation` tag; we accept any non-empty string so
25
+ * future emitters can mint their own identifiers without editing this
26
+ * package (matches `isConfidence`'s runtime guard).
27
+ */
28
+ export const ConfidenceSchema = z.object({
29
+ level: z.enum(["high", "medium", "low"]),
30
+ signalsPresent: z.number().int().nonnegative(),
31
+ derivation: z.string().min(1),
32
+ });
@@ -41,6 +41,7 @@ export declare const EvalConfigSchema: z.ZodObject<{
41
41
  execution: z.ZodOptional<z.ZodObject<{
42
42
  concurrency: z.ZodOptional<z.ZodNumber>;
43
43
  graderReplications: z.ZodOptional<z.ZodNumber>;
44
+ borderlineReplications: z.ZodOptional<z.ZodNumber>;
44
45
  gapAnalysis: z.ZodOptional<z.ZodBoolean>;
45
46
  apiUrl: z.ZodOptional<z.ZodString>;
46
47
  }, z.core.$strip>>;
@@ -85,15 +85,19 @@ export const EvalConfigSchema = z
85
85
  * `concurrency`, `gapAnalysis`, and `graderReplications` fields and adds
86
86
  * `apiUrl` to the same group.
87
87
  *
88
- * - `concurrency` — max parallel API calls
89
- * - `graderReplications` — grader consistency replications
90
- * - `gapAnalysis` enable failure-mode + impact analysis (default true)
91
- * - `apiUrl` — AILF API base URL (default https://ailf-api.sanity.build)
88
+ * - `concurrency` — max parallel API calls
89
+ * - `graderReplications` — grader consistency replications
90
+ * - `borderlineReplications` — replications per borderline judgment
91
+ * for the GRAD-04 intra-grader consensus
92
+ * pass (default 3 in composition-root)
93
+ * - `gapAnalysis` — enable failure-mode + impact analysis (default true)
94
+ * - `apiUrl` — AILF API base URL (default https://ailf-api.sanity.build)
92
95
  */
93
96
  execution: z
94
97
  .object({
95
98
  concurrency: z.number().int().positive().optional(),
96
99
  graderReplications: z.number().int().positive().optional(),
100
+ borderlineReplications: z.number().int().positive().optional(),
97
101
  gapAnalysis: z.boolean().optional(),
98
102
  apiUrl: z.string().url().optional(),
99
103
  })
@@ -19,3 +19,5 @@ export * from "./schedules.js";
19
19
  export * from "./sinks.js";
20
20
  export * from "./symbol-preflight-report.js";
21
21
  export * from "./test-budgets.js";
22
+ export { ConfidenceSchema } from "./confidence-schema.js";
23
+ export { brandedString } from "./branded-string.js";
@@ -19,3 +19,12 @@ export * from "./schedules.js";
19
19
  export * from "./sinks.js";
20
20
  export * from "./symbol-preflight-report.js";
21
21
  export * from "./test-budgets.js";
22
+ // Phase 1 Plan 02 — shared schema fragment for D0049 Confidence.
23
+ // Named re-export only (W0124 / D0045) and pinned-subpath access path
24
+ // `@sanity/ailf-core/schemas` for adapter consumers.
25
+ export { ConfidenceSchema } from "./confidence-schema.js";
26
+ // Phase 1 Plan 03 — single audited cast site for `Brand<string, T>`
27
+ // schemas. Adapters MUST route branded-field declarations through this
28
+ // helper instead of replicating `as unknown as z.ZodType<…>` at each
29
+ // schema author site (project rule: no `as` on `unknown`).
30
+ export { brandedString } from "./branded-string.js";
@@ -48,6 +48,7 @@ export declare const PipelineRequestSchema: z.ZodObject<{
48
48
  "with-docs": "with-docs";
49
49
  }>>;
50
50
  graderReplications: z.ZodOptional<z.ZodNumber>;
51
+ borderlineReplications: z.ZodOptional<z.ZodNumber>;
51
52
  headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
52
53
  inlineTasks: z.ZodOptional<z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
53
54
  jobId: z.ZodOptional<z.ZodString>;
@@ -114,6 +114,7 @@ export const PipelineRequestSchema = z.object({
114
114
  */
115
115
  graderContext: z.enum(["rubric-only", "with-docs"]).optional(),
116
116
  graderReplications: z.number().int().positive().optional(),
117
+ borderlineReplications: z.number().int().positive().optional(),
117
118
  headers: z.record(z.string(), z.string()).optional(),
118
119
  inlineTasks: z.array(z.record(z.string(), z.unknown())).optional(),
119
120
  jobId: z.string().optional(),
@@ -20,6 +20,7 @@ import { z } from "zod";
20
20
  export declare const RubricTemplateSchema: z.ZodObject<{
21
21
  criteria_label: z.ZodOptional<z.ZodNullable<z.ZodString>>;
22
22
  dimension: z.ZodOptional<z.ZodString>;
23
+ failureModes: z.ZodOptional<z.ZodArray<z.ZodString>>;
23
24
  header: z.ZodString;
24
25
  scale: z.ZodArray<z.ZodString>;
25
26
  }, z.core.$strip>;
@@ -52,6 +53,7 @@ export declare const RubricConfigSchema: z.ZodObject<{
52
53
  templates: z.ZodRecord<z.ZodString, z.ZodObject<{
53
54
  criteria_label: z.ZodOptional<z.ZodNullable<z.ZodString>>;
54
55
  dimension: z.ZodOptional<z.ZodString>;
56
+ failureModes: z.ZodOptional<z.ZodArray<z.ZodString>>;
55
57
  header: z.ZodString;
56
58
  scale: z.ZodArray<z.ZodString>;
57
59
  }, z.core.$strip>>;
@@ -112,7 +114,10 @@ export type FeatureRegistry = z.infer<typeof FeatureRegistrySchema>;
112
114
  * and provides task-specific criteria.
113
115
  */
114
116
  declare const TemplatedLlmRubricAssertSchema: z.ZodObject<{
115
- criteria: z.ZodArray<z.ZodString>;
117
+ criteria: z.ZodArray<z.ZodObject<{
118
+ id: z.ZodString;
119
+ text: z.ZodString;
120
+ }, z.core.$strip>>;
116
121
  template: z.ZodString;
117
122
  type: z.ZodLiteral<"llm-rubric">;
118
123
  weight: z.ZodOptional<z.ZodNumber>;
@@ -129,7 +134,10 @@ export type TemplatedLlmRubricAssert = z.infer<typeof TemplatedLlmRubricAssertSc
129
134
  * is gone, but union is more flexible for future additions).
130
135
  */
131
136
  export declare const AssertionSchema: z.ZodUnion<readonly [z.ZodObject<{
132
- criteria: z.ZodArray<z.ZodString>;
137
+ criteria: z.ZodArray<z.ZodObject<{
138
+ id: z.ZodString;
139
+ text: z.ZodString;
140
+ }, z.core.$strip>>;
133
141
  template: z.ZodString;
134
142
  type: z.ZodLiteral<"llm-rubric">;
135
143
  weight: z.ZodOptional<z.ZodNumber>;
@@ -174,7 +182,10 @@ export type CanonicalDoc = z.infer<typeof CanonicalDocSchema>;
174
182
  */
175
183
  export declare const SingleTaskSchema: z.ZodObject<{
176
184
  assert: z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
177
- criteria: z.ZodArray<z.ZodString>;
185
+ criteria: z.ZodArray<z.ZodObject<{
186
+ id: z.ZodString;
187
+ text: z.ZodString;
188
+ }, z.core.$strip>>;
178
189
  template: z.ZodString;
179
190
  type: z.ZodLiteral<"llm-rubric">;
180
191
  weight: z.ZodOptional<z.ZodNumber>;
@@ -233,7 +244,10 @@ export type SingleTask = z.infer<typeof SingleTaskSchema>;
233
244
  */
234
245
  export declare const LegacyTaskSchema: z.ZodObject<{
235
246
  assert: z.ZodOptional<z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
236
- criteria: z.ZodArray<z.ZodString>;
247
+ criteria: z.ZodArray<z.ZodObject<{
248
+ id: z.ZodString;
249
+ text: z.ZodString;
250
+ }, z.core.$strip>>;
237
251
  template: z.ZodString;
238
252
  type: z.ZodLiteral<"llm-rubric">;
239
253
  weight: z.ZodOptional<z.ZodNumber>;
@@ -269,7 +283,10 @@ export type LegacyTask = z.infer<typeof LegacyTaskSchema>;
269
283
  */
270
284
  export declare const TaskEntrySchema: z.ZodUnion<readonly [z.ZodObject<{
271
285
  assert: z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
272
- criteria: z.ZodArray<z.ZodString>;
286
+ criteria: z.ZodArray<z.ZodObject<{
287
+ id: z.ZodString;
288
+ text: z.ZodString;
289
+ }, z.core.$strip>>;
273
290
  template: z.ZodString;
274
291
  type: z.ZodLiteral<"llm-rubric">;
275
292
  weight: z.ZodOptional<z.ZodNumber>;
@@ -321,7 +338,10 @@ export declare const TaskEntrySchema: z.ZodUnion<readonly [z.ZodObject<{
321
338
  }, z.core.$loose>;
322
339
  }, z.core.$strip>, z.ZodObject<{
323
340
  assert: z.ZodOptional<z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
324
- criteria: z.ZodArray<z.ZodString>;
341
+ criteria: z.ZodArray<z.ZodObject<{
342
+ id: z.ZodString;
343
+ text: z.ZodString;
344
+ }, z.core.$strip>>;
325
345
  template: z.ZodString;
326
346
  type: z.ZodLiteral<"llm-rubric">;
327
347
  weight: z.ZodOptional<z.ZodNumber>;
@@ -355,7 +375,10 @@ export type TaskEntryParsed = z.infer<typeof TaskEntrySchema>;
355
375
  */
356
376
  export declare const TaskFileSchema: z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
357
377
  assert: z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
358
- criteria: z.ZodArray<z.ZodString>;
378
+ criteria: z.ZodArray<z.ZodObject<{
379
+ id: z.ZodString;
380
+ text: z.ZodString;
381
+ }, z.core.$strip>>;
359
382
  template: z.ZodString;
360
383
  type: z.ZodLiteral<"llm-rubric">;
361
384
  weight: z.ZodOptional<z.ZodNumber>;
@@ -407,7 +430,10 @@ export declare const TaskFileSchema: z.ZodArray<z.ZodUnion<readonly [z.ZodObject
407
430
  }, z.core.$loose>;
408
431
  }, z.core.$strip>, z.ZodObject<{
409
432
  assert: z.ZodOptional<z.ZodArray<z.ZodUnion<readonly [z.ZodObject<{
410
- criteria: z.ZodArray<z.ZodString>;
433
+ criteria: z.ZodArray<z.ZodObject<{
434
+ id: z.ZodString;
435
+ text: z.ZodString;
436
+ }, z.core.$strip>>;
411
437
  template: z.ZodString;
412
438
  type: z.ZodLiteral<"llm-rubric">;
413
439
  weight: z.ZodOptional<z.ZodNumber>;
@@ -26,6 +26,14 @@ export const RubricTemplateSchema = z.object({
26
26
  .min(1, "criteria_label must be a non-empty string")
27
27
  .nullish(),
28
28
  dimension: z.string().min(1).optional(),
29
+ /**
30
+ * Plan 03-02 — per-dimension legal failure-mode list. When present and
31
+ * non-empty, the runtime rubric assembler announces the legal modes to
32
+ * the grader before the structured-shape footer (Plan 03-01).
33
+ * Stamped at config-load time by `failureModesForDimension(dimension)`
34
+ * from `packages/eval/src/grader/index.ts`.
35
+ */
36
+ failureModes: z.array(z.string().min(1)).optional(),
29
37
  header: z.string().min(1, "header must be a non-empty string"),
30
38
  scale: z
31
39
  .array(z.string().min(1))
@@ -118,13 +126,27 @@ export const FeatureRegistrySchema = z.object({
118
126
  // ---------------------------------------------------------------------------
119
127
  // Assertion schemas — one per Promptfoo assertion type
120
128
  // ---------------------------------------------------------------------------
129
+ // TODO(GRAD-01 follow-up): This schema duplicates
130
+ // packages/eval/src/adapters/task-sources/repo-schemas.ts:TemplatedAssertionSchema.
131
+ // Retiring requires reverse-extracting the canonical schema into
132
+ // packages/core/src/schemas/ (D0048 prevents importing from packages/eval).
133
+ // Out of Phase 2 scope; tracked separately.
134
+ //
135
+ // The `satisfies z.ZodType<CriterionRef>` clause asserts this duplicate
136
+ // stays shape-compatible with the canonical domain type in
137
+ // `@sanity/ailf-core` (D0045). If a future edit adds a third field to one
138
+ // schema and not the other, this build error catches the drift.
139
+ const CriterionRefShape = z.object({
140
+ id: z.string().min(1, "id must be a non-empty slug"),
141
+ text: z.string().min(1, "text must be a non-empty string"),
142
+ });
121
143
  /**
122
144
  * Templated llm-rubric assertion — references a rubric template by key
123
145
  * and provides task-specific criteria.
124
146
  */
125
147
  const TemplatedLlmRubricAssertSchema = z.object({
126
148
  criteria: z
127
- .array(z.string().min(1))
149
+ .array(CriterionRefShape)
128
150
  .min(1, "criteria must have at least one entry"),
129
151
  template: z.string().min(1, "template must be a non-empty string"),
130
152
  type: z.literal("llm-rubric"),
@@ -0,0 +1,40 @@
1
+ /**
2
+ * Diagnosis card registry — placeholder home for Phase 5 cards.
3
+ *
4
+ * Phase 5 cards declare:
5
+ *
6
+ * export const card = {
7
+ * type, version, schema, generate
8
+ * } satisfies CardDefinition
9
+ *
10
+ * The compound `cardVersion` (VER-01 / D-02) is built from per-card
11
+ * `version` by sorting `${type}@${version}` ascending and joining with
12
+ * `,`. Phase 1 lands the empty registry; Phase 5 registers cards via
13
+ * the composition root, not by mutating this binding.
14
+ *
15
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
16
+ * @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
17
+ * @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02, D-08)
18
+ */
19
+ import type { z } from "zod";
20
+ import type { CardType, DiagnosisCard } from "../../types/diagnosis.js";
21
+ /**
22
+ * Per-card definition. `schema` is the per-card body parser; `generate`
23
+ * is the runner-invoked builder. Phase 5 fills in the
24
+ * `report+attribution+llm` parameter list when card files land — Phase 1
25
+ * keeps the signature minimal so the registry compiles before any cards
26
+ * exist.
27
+ */
28
+ export interface CardDefinition<TBody = unknown> {
29
+ readonly type: CardType;
30
+ readonly version: string;
31
+ readonly schema: z.ZodType<TBody>;
32
+ readonly generate: () => Promise<DiagnosisCard>;
33
+ }
34
+ /**
35
+ * Phase 1: empty entrypoint. Phase 5 cards register here through the
36
+ * composition root. The exported binding is a `ReadonlyMap` so
37
+ * downstream consumers cannot mutate it (mutation would re-introduce the
38
+ * vitest worker-leak hazard).
39
+ */
40
+ export declare const cardRegistry: ReadonlyMap<CardType, CardDefinition>;
@@ -0,0 +1,25 @@
1
+ /**
2
+ * Diagnosis card registry — placeholder home for Phase 5 cards.
3
+ *
4
+ * Phase 5 cards declare:
5
+ *
6
+ * export const card = {
7
+ * type, version, schema, generate
8
+ * } satisfies CardDefinition
9
+ *
10
+ * The compound `cardVersion` (VER-01 / D-02) is built from per-card
11
+ * `version` by sorting `${type}@${version}` ascending and joining with
12
+ * `,`. Phase 1 lands the empty registry; Phase 5 registers cards via
13
+ * the composition root, not by mutating this binding.
14
+ *
15
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
16
+ * @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
17
+ * @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02, D-08)
18
+ */
19
+ /**
20
+ * Phase 1: empty entrypoint. Phase 5 cards register here through the
21
+ * composition root. The exported binding is typed as a `ReadonlyMap` so
22
+ * downstream consumers cannot mutate it (mutation would re-introduce the
23
+ * vitest worker-leak hazard).
24
+ */
25
+ export const cardRegistry = new Map();
@@ -0,0 +1,19 @@
1
+ /**
2
+ * Diagnosis runner — engine entry point (D0048).
3
+ *
4
+ * Phase 1 lands the version constant only; the runner factory + cache
5
+ * lookup land in Phase 5.
6
+ *
7
+ * @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
8
+ * @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02)
9
+ */
10
+ /**
11
+ * Bumped when the runner's selection logic, prompt orchestration, or
12
+ * card-set composition changes in a way that should invalidate cached
13
+ * Diagnoses (VER-01 / D-02). Co-located here so the cache-invalidation
14
+ * contract test reads the canonical value.
15
+ *
16
+ * `export const` (never `export let`) — module-scope mutables leak
17
+ * across vitest workers (cross-cutting hazard #2).
18
+ */
19
+ export declare const diagnosisVersion = "0.1.0";
@@ -0,0 +1,19 @@
1
+ /**
2
+ * Diagnosis runner — engine entry point (D0048).
3
+ *
4
+ * Phase 1 lands the version constant only; the runner factory + cache
5
+ * lookup land in Phase 5.
6
+ *
7
+ * @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
8
+ * @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02)
9
+ */
10
+ /**
11
+ * Bumped when the runner's selection logic, prompt orchestration, or
12
+ * card-set composition changes in a way that should invalidate cached
13
+ * Diagnoses (VER-01 / D-02). Co-located here so the cache-invalidation
14
+ * contract test reads the canonical value.
15
+ *
16
+ * `export const` (never `export let`) — module-scope mutables leak
17
+ * across vitest workers (cross-cutting hazard #2).
18
+ */
19
+ export const diagnosisVersion = "0.1.0";
@@ -13,3 +13,5 @@ export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskS
13
13
  export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
14
14
  export { buildSlimReportSummary } from "./slim-report-summary.js";
15
15
  export { reportToMarkdown, type RenderableReport, } from "./report-to-markdown.js";
16
+ export { diagnosisVersion } from "./diagnosis-runner.js";
17
+ export { cardRegistry, type CardDefinition } from "./diagnosis/registry.js";
@@ -13,3 +13,8 @@ export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskS
13
13
  export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
14
14
  export { buildSlimReportSummary } from "./slim-report-summary.js";
15
15
  export { reportToMarkdown, } from "./report-to-markdown.js";
16
+ // ---------------------------------------------------------------------------
17
+ // Actionability ladder Phase 1 — diagnosis runner + card registry
18
+ // ---------------------------------------------------------------------------
19
+ export { diagnosisVersion } from "./diagnosis-runner.js";
20
+ export { cardRegistry } from "./diagnosis/registry.js";
@@ -493,8 +493,9 @@ function renderLowScoringJudgments(md, judgments) {
493
493
  .join("\n");
494
494
  md.line(reasonLines);
495
495
  md.blank();
496
- if (j.canonicalDocs && j.canonicalDocs.length > 0) {
497
- const docList = j.canonicalDocs.map((d) => `\`${d.slug}\``).join(", ");
496
+ const jDocs = j.contextDocs ?? j.canonicalDocs;
497
+ if (jDocs && jDocs.length > 0) {
498
+ const docList = jDocs.map((d) => `\`${d.slug}\``).join(", ");
498
499
  md.line(`*Expected docs: ${docList}*`);
499
500
  md.blank();
500
501
  }
@@ -0,0 +1,82 @@
1
+ /**
2
+ * Attribution core domain types — canonical shapes for the per-document
3
+ * attribution ensemble (Doc 04).
4
+ *
5
+ * Phase 1 lands the type carriers; Phase 4 lands the compute step. The
6
+ * Zod schemas in `packages/eval/src/adapters/attribution/` assert
7
+ * `satisfies z.ZodType<...>` against these types.
8
+ *
9
+ * Doc identity is referenced by `documentId` (D0052), not by `slug` —
10
+ * `slug` is retained as a human-readable annotation only. The
11
+ * resolvable-set check is carried as a separate
12
+ * `hallucinationCheckedAgainst: string[]` field (Pitfall #11).
13
+ *
14
+ * @see docs/decisions/D0049-shared-confidence-contract.md
15
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
16
+ * @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
17
+ */
18
+ import type { Confidence } from "./confidence.js";
19
+ /**
20
+ * Per-document attribution score for one judgment. The `signals` sub-record
21
+ * carries each ensemble member's contribution; the top-level `score` is
22
+ * the post-weighting composite.
23
+ *
24
+ * `documentId` is the canonical D0052 reference; `slug` is a
25
+ * human-readable annotation only and must not be relied on for identity.
26
+ */
27
+ export interface DocAttribution {
28
+ /** Canonical D0052 document ref (id, not slug). */
29
+ documentId: string;
30
+ /** Optional human-readable annotation. Never the identity. */
31
+ slug?: string;
32
+ /** Composite attribution score in [0, 1]. */
33
+ score: number;
34
+ /** Per-ensemble-member contributions before weighting. */
35
+ signals: {
36
+ citation?: number;
37
+ canonical?: number;
38
+ retrieved?: number;
39
+ };
40
+ /** Shared D0049 confidence triple. */
41
+ confidence: Confidence;
42
+ }
43
+ /**
44
+ * Per-judgment attribution carrier. Emitted by Phase 4's
45
+ * `ComputeAttributionStep`; persisted at
46
+ * `runs/{runId}/attribution/{entryKey}.json`.
47
+ *
48
+ * `hallucinationCheckedAgainst` is the resolvable-set used at compute
49
+ * time — required (not optional) so consumers can audit citation
50
+ * grounding without re-deriving the set. Per Pitfall #11 the canonical
51
+ * task field is `contextDocs`; do not invent `expectedDocs` /
52
+ * `usedDocs` synonyms.
53
+ */
54
+ export interface JudgmentAttribution {
55
+ /** D0052 granular ref to the underlying grader judgment. */
56
+ judgmentRef: string;
57
+ taskId: string;
58
+ modelId: string;
59
+ dimension: string;
60
+ attributions: DocAttribution[];
61
+ /** Resolvable-set used at compute time (Pitfall #11). */
62
+ hallucinationCheckedAgainst: string[];
63
+ }
64
+ /**
65
+ * Run-scoped attribution metadata. Persisted alongside the per-entry
66
+ * attribution objects so consumers can interpret signal-weighting and
67
+ * embedding choices without re-loading the calibration set.
68
+ *
69
+ * `embeddingModel` is REQUIRED (Pitfall #6) — silently downgrading to a
70
+ * default has caused regressions in adjacent codebases.
71
+ */
72
+ export interface AttributionMeta {
73
+ ensembleVersion: string;
74
+ /** Embedding model identifier — REQUIRED (Pitfall #6). */
75
+ embeddingModel: string;
76
+ calibrationSetVersion?: string;
77
+ weights: {
78
+ citation: number;
79
+ canonical: number;
80
+ retrieved: number;
81
+ };
82
+ }
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Attribution core domain types — canonical shapes for the per-document
3
+ * attribution ensemble (Doc 04).
4
+ *
5
+ * Phase 1 lands the type carriers; Phase 4 lands the compute step. The
6
+ * Zod schemas in `packages/eval/src/adapters/attribution/` assert
7
+ * `satisfies z.ZodType<...>` against these types.
8
+ *
9
+ * Doc identity is referenced by `documentId` (D0052), not by `slug` —
10
+ * `slug` is retained as a human-readable annotation only. The
11
+ * resolvable-set check is carried as a separate
12
+ * `hallucinationCheckedAgainst: string[]` field (Pitfall #11).
13
+ *
14
+ * @see docs/decisions/D0049-shared-confidence-contract.md
15
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
16
+ * @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
17
+ */
18
+ export {};
@@ -29,6 +29,8 @@ declare const __brand: unique symbol;
29
29
  export type Brand<T, B extends string> = T & {
30
30
  readonly [__brand]: B;
31
31
  };
32
+ /** Unique identifier for a grader judgment (D0052 granular). */
33
+ export type JudgmentId = Brand<string, "JudgmentId">;
32
34
  /** Unique identifier for an evaluation task */
33
35
  export type TaskId = Brand<string, "TaskId">;
34
36
  /** URL-safe slug for a task (derived from title) */
@@ -74,7 +76,7 @@ export type ArtifactId = Brand<string, "ArtifactId">;
74
76
  * per-mode (e.g. `failureModes`, one entry per classified failure category —
75
77
  * D0033 M7, W0051 Slice 2).
76
78
  */
77
- export type AssociationAxis = "run" | "mode" | "task" | "model" | "grader" | "trial" | "category";
79
+ export type AssociationAxis = "run" | "mode" | "task" | "model" | "grader" | "trial" | "category" | "report";
78
80
  /**
79
81
  * The sanitized, filename-safe identifier for a single per-entry artifact
80
82
  * object. Produced by `ArtifactDescriptor.formatEntryKey` and parsed by
@@ -178,4 +180,27 @@ export declare function providerId(raw: string): Result<ProviderId, IdValidation
178
180
  * Valid format: alphanumeric + hyphens, 1–128 characters.
179
181
  */
180
182
  export declare function fixtureId(raw: string): Result<FixtureId, IdValidationError>;
183
+ /**
184
+ * Parse a raw string into a `JudgmentId`.
185
+ *
186
+ * See `JUDGMENT_ID_RE` for the accepted formats.
187
+ */
188
+ export declare function judgmentId(raw: string): Result<JudgmentId, IdValidationError>;
189
+ /**
190
+ * Generate a deterministic `JudgmentId` for a synthesized fall-back
191
+ * judgment. Salting with `runId` (when supplied) makes the id unique
192
+ * per-run so consumers' `(taskId, modelId, dimension)` dedup key
193
+ * doesn't collide across re-runs of the same task — every run writes
194
+ * fresh ids that still encode the natural composite key.
195
+ *
196
+ * When `runId` is absent the salt collapses to `nosalt`, preserving the
197
+ * legacy "deterministic across runs" shape for callers that explicitly
198
+ * want it (e.g. unit tests that assert the exact id string).
199
+ */
200
+ export declare function generateJudgmentId(input: {
201
+ taskId: string;
202
+ modelId: string;
203
+ dimension: string;
204
+ runId?: RunId | string;
205
+ }): JudgmentId;
181
206
  export {};