@sanity/ailf 4.6.0 → 6.0.0

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry they were published to. It is provided for informational purposes only.
Files changed (185)
  1. package/canonical/grader-references/agent-harness-tools.yaml +42 -0
  2. package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
  3. package/canonical/grader-references/mcp-server-spec.yaml +51 -0
  4. package/canonical/grader-references/portable-text.yaml +48 -0
  5. package/config/diagnosis-cards.ts +318 -0
  6. package/config/models.ts +12 -0
  7. package/config/rubrics.ts +38 -2
  8. package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
  9. package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
  10. package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
  11. package/dist/_vendor/ailf-core/examples/index.js +146 -47
  12. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
  13. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
  14. package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
  15. package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
  16. package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
  17. package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
  18. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
  19. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
  20. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
  21. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
  22. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
  23. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
  24. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  25. package/dist/_vendor/ailf-core/index.js +4 -0
  26. package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
  27. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
  28. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
  29. package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
  30. package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
  31. package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
  32. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  33. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
  34. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  35. package/dist/_vendor/ailf-core/schemas/index.js +9 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
  40. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
  41. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
  42. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
  43. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
  44. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
  45. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
  46. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
  47. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
  48. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
  49. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
  50. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
  51. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
  52. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
  53. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
  54. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
  55. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
  56. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
  57. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
  58. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
  59. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
  60. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
  61. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
  62. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
  63. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
  64. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
  65. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
  66. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
  67. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
  68. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
  69. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
  70. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
  71. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
  72. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
  73. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
  74. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
  75. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
  76. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
  77. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
  78. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
  79. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
  80. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +50 -0
  81. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +35 -0
  82. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +136 -0
  83. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +153 -0
  84. package/dist/_vendor/ailf-core/services/index.d.ts +6 -0
  85. package/dist/_vendor/ailf-core/services/index.js +18 -0
  86. package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
  87. package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
  88. package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
  89. package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
  90. package/dist/_vendor/ailf-core/types/attribution.js +18 -0
  91. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
  92. package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
  93. package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
  94. package/dist/_vendor/ailf-core/types/confidence.js +7 -0
  95. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +271 -0
  96. package/dist/_vendor/ailf-core/types/diagnosis.js +19 -0
  97. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
  98. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
  99. package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
  100. package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
  101. package/dist/_vendor/ailf-core/types/index.js +15 -1
  102. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
  103. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
  104. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  105. package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
  106. package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
  107. package/dist/adapters/api-client/build-request.d.ts +1 -0
  108. package/dist/adapters/api-client/build-request.js +3 -0
  109. package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
  110. package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
  111. package/dist/adapters/attribution/index.d.ts +9 -0
  112. package/dist/adapters/attribution/index.js +8 -0
  113. package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
  114. package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
  115. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  116. package/dist/adapters/grader-outputs/index.d.ts +10 -0
  117. package/dist/adapters/grader-outputs/index.js +8 -0
  118. package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
  119. package/dist/adapters/grader-outputs/legacy/index.js +10 -0
  120. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
  121. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
  122. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
  123. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
  124. package/dist/adapters/index.d.ts +3 -0
  125. package/dist/adapters/index.js +4 -0
  126. package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
  127. package/dist/adapters/llm/fake-llm-client.js +38 -1
  128. package/dist/adapters/llm/openai-llm-client.js +52 -3
  129. package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
  130. package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
  131. package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
  132. package/dist/adapters/task-sources/repo-schemas.js +19 -2
  133. package/dist/cli-program.js +3 -0
  134. package/dist/commands/calculate-scores.js +1 -1
  135. package/dist/commands/explain-handler.js +1 -1
  136. package/dist/commands/interpret.d.ts +50 -0
  137. package/dist/commands/interpret.js +212 -0
  138. package/dist/commands/lookup-doc.d.ts +1 -1
  139. package/dist/commands/lookup-doc.js +3 -3
  140. package/dist/commands/pipeline-action.d.ts +6 -0
  141. package/dist/commands/pipeline-action.js +2 -0
  142. package/dist/commands/remote-pipeline.js +1 -0
  143. package/dist/composition-root.d.ts +57 -23
  144. package/dist/composition-root.js +155 -41
  145. package/dist/config/diagnosis-cards.ts +318 -0
  146. package/dist/config/models.ts +12 -0
  147. package/dist/config/rubrics.ts +38 -2
  148. package/dist/grader/agent-harness.d.ts +9 -0
  149. package/dist/grader/agent-harness.js +9 -0
  150. package/dist/grader/common.d.ts +9 -0
  151. package/dist/grader/common.js +9 -0
  152. package/dist/grader/index.d.ts +24 -0
  153. package/dist/grader/index.js +24 -0
  154. package/dist/grader/knowledge-probe.d.ts +9 -0
  155. package/dist/grader/knowledge-probe.js +9 -0
  156. package/dist/grader/literacy.d.ts +9 -0
  157. package/dist/grader/literacy.js +9 -0
  158. package/dist/grader/mcp.d.ts +9 -0
  159. package/dist/grader/mcp.js +9 -0
  160. package/dist/orchestration/build-app-context.js +1 -0
  161. package/dist/orchestration/build-step-sequence.js +5 -0
  162. package/dist/orchestration/steps/calculate-scores-step.js +23 -1
  163. package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
  164. package/dist/orchestration/steps/compute-attribution-step.js +279 -0
  165. package/dist/orchestration/steps/gap-analysis-step.js +35 -7
  166. package/dist/orchestration/steps/index.d.ts +1 -0
  167. package/dist/orchestration/steps/index.js +1 -0
  168. package/dist/pipeline/attribution.d.ts +15 -0
  169. package/dist/pipeline/attribution.js +18 -9
  170. package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
  171. package/dist/pipeline/borderline-consensus-runner.js +124 -0
  172. package/dist/pipeline/borderline-detector.d.ts +24 -0
  173. package/dist/pipeline/borderline-detector.js +26 -0
  174. package/dist/pipeline/calculate-scores.d.ts +114 -3
  175. package/dist/pipeline/calculate-scores.js +426 -24
  176. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  177. package/dist/pipeline/compiler/literacy-bridge.js +35 -17
  178. package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
  179. package/dist/pipeline/compiler/rubric-resolution.js +9 -1
  180. package/dist/pipeline/compute-attribution.d.ts +80 -0
  181. package/dist/pipeline/compute-attribution.js +196 -0
  182. package/dist/pipeline/failure-modes.d.ts +52 -17
  183. package/dist/pipeline/failure-modes.js +178 -117
  184. package/dist/pipeline/map-request-to-config.js +1 -0
  185. package/package.json +7 -5
@@ -32,6 +32,40 @@ export type CuratedAssertionType = (typeof CURATED_ASSERTION_TYPES)[number];
32
32
  */
33
33
  export declare const RUBRIC_TEMPLATE_NAMES: readonly ["task-completion", "code-correctness", "doc-coverage", "mcp-input-validation", "mcp-output-correctness", "mcp-error-handling", "mcp-security", "factual-correctness", "completeness", "currency", "process-quality", "agent-output", "agent-tool-usage"];
34
34
  export type RubricTemplateName = (typeof RUBRIC_TEMPLATE_NAMES)[number];
35
+ /**
36
+ * A single criterion within an llm-rubric assertion. Stable id-text pair.
37
+ */
38
+ export declare const CriterionRefSchema: z.ZodObject<{
39
+ id: z.ZodString;
40
+ text: z.ZodString;
41
+ }, z.core.$strip>;
42
+ /**
43
+ * A templated LLM-rubric assertion — uses one of the predefined rubric
44
+ * templates with author-supplied criteria.
45
+ */
46
+ export declare const TemplatedAssertionSchema: z.ZodObject<{
47
+ type: z.ZodLiteral<"llm-rubric">;
48
+ template: z.ZodEnum<{
49
+ "task-completion": "task-completion";
50
+ "code-correctness": "code-correctness";
51
+ "doc-coverage": "doc-coverage";
52
+ "mcp-input-validation": "mcp-input-validation";
53
+ "mcp-output-correctness": "mcp-output-correctness";
54
+ "mcp-error-handling": "mcp-error-handling";
55
+ "mcp-security": "mcp-security";
56
+ "factual-correctness": "factual-correctness";
57
+ completeness: "completeness";
58
+ currency: "currency";
59
+ "process-quality": "process-quality";
60
+ "agent-output": "agent-output";
61
+ "agent-tool-usage": "agent-tool-usage";
62
+ }>;
63
+ criteria: z.ZodArray<z.ZodObject<{
64
+ id: z.ZodString;
65
+ text: z.ZodString;
66
+ }, z.core.$strip>>;
67
+ weight: z.ZodOptional<z.ZodNumber>;
68
+ }, z.core.$strip>;
35
69
  /**
36
70
  * Zod schema for a single task definition — a mode-discriminated union
37
71
  * mirroring `GeneralizedTaskDefinition`.
@@ -73,7 +107,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
73
107
  "agent-output": "agent-output";
74
108
  "agent-tool-usage": "agent-tool-usage";
75
109
  }>;
76
- criteria: z.ZodArray<z.ZodString>;
110
+ criteria: z.ZodArray<z.ZodObject<{
111
+ id: z.ZodString;
112
+ text: z.ZodString;
113
+ }, z.core.$strip>>;
77
114
  weight: z.ZodOptional<z.ZodNumber>;
78
115
  }, z.core.$strip>, z.ZodObject<{
79
116
  type: z.ZodEnum<{
@@ -187,7 +224,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
187
224
  "agent-output": "agent-output";
188
225
  "agent-tool-usage": "agent-tool-usage";
189
226
  }>;
190
- criteria: z.ZodArray<z.ZodString>;
227
+ criteria: z.ZodArray<z.ZodObject<{
228
+ id: z.ZodString;
229
+ text: z.ZodString;
230
+ }, z.core.$strip>>;
191
231
  weight: z.ZodOptional<z.ZodNumber>;
192
232
  }, z.core.$strip>, z.ZodObject<{
193
233
  type: z.ZodEnum<{
@@ -341,7 +381,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
341
381
  "agent-output": "agent-output";
342
382
  "agent-tool-usage": "agent-tool-usage";
343
383
  }>;
344
- criteria: z.ZodArray<z.ZodString>;
384
+ criteria: z.ZodArray<z.ZodObject<{
385
+ id: z.ZodString;
386
+ text: z.ZodString;
387
+ }, z.core.$strip>>;
345
388
  weight: z.ZodOptional<z.ZodNumber>;
346
389
  }, z.core.$strip>, z.ZodObject<{
347
390
  type: z.ZodEnum<{
@@ -472,7 +515,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
472
515
  "agent-output": "agent-output";
473
516
  "agent-tool-usage": "agent-tool-usage";
474
517
  }>;
475
- criteria: z.ZodArray<z.ZodString>;
518
+ criteria: z.ZodArray<z.ZodObject<{
519
+ id: z.ZodString;
520
+ text: z.ZodString;
521
+ }, z.core.$strip>>;
476
522
  weight: z.ZodOptional<z.ZodNumber>;
477
523
  }, z.core.$strip>, z.ZodObject<{
478
524
  type: z.ZodEnum<{
@@ -591,7 +637,10 @@ export declare const CanonicalTaskSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
591
637
  "agent-output": "agent-output";
592
638
  "agent-tool-usage": "agent-tool-usage";
593
639
  }>;
594
- criteria: z.ZodArray<z.ZodString>;
640
+ criteria: z.ZodArray<z.ZodObject<{
641
+ id: z.ZodString;
642
+ text: z.ZodString;
643
+ }, z.core.$strip>>;
595
644
  weight: z.ZodOptional<z.ZodNumber>;
596
645
  }, z.core.$strip>, z.ZodObject<{
597
646
  type: z.ZodEnum<{
@@ -699,7 +748,10 @@ export declare const ContentLakeAuthorableTaskSchema: z.ZodObject<{
699
748
  "agent-output": "agent-output";
700
749
  "agent-tool-usage": "agent-tool-usage";
701
750
  }>;
702
- criteria: z.ZodArray<z.ZodString>;
751
+ criteria: z.ZodArray<z.ZodObject<{
752
+ id: z.ZodString;
753
+ text: z.ZodString;
754
+ }, z.core.$strip>>;
703
755
  weight: z.ZodOptional<z.ZodNumber>;
704
756
  }, z.core.$strip>, z.ZodObject<{
705
757
  type: z.ZodEnum<{
@@ -819,7 +871,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
819
871
  "agent-output": "agent-output";
820
872
  "agent-tool-usage": "agent-tool-usage";
821
873
  }>;
822
- criteria: z.ZodArray<z.ZodString>;
874
+ criteria: z.ZodArray<z.ZodObject<{
875
+ id: z.ZodString;
876
+ text: z.ZodString;
877
+ }, z.core.$strip>>;
823
878
  weight: z.ZodOptional<z.ZodNumber>;
824
879
  }, z.core.$strip>, z.ZodObject<{
825
880
  type: z.ZodEnum<{
@@ -933,7 +988,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
933
988
  "agent-output": "agent-output";
934
989
  "agent-tool-usage": "agent-tool-usage";
935
990
  }>;
936
- criteria: z.ZodArray<z.ZodString>;
991
+ criteria: z.ZodArray<z.ZodObject<{
992
+ id: z.ZodString;
993
+ text: z.ZodString;
994
+ }, z.core.$strip>>;
937
995
  weight: z.ZodOptional<z.ZodNumber>;
938
996
  }, z.core.$strip>, z.ZodObject<{
939
997
  type: z.ZodEnum<{
@@ -1087,7 +1145,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
1087
1145
  "agent-output": "agent-output";
1088
1146
  "agent-tool-usage": "agent-tool-usage";
1089
1147
  }>;
1090
- criteria: z.ZodArray<z.ZodString>;
1148
+ criteria: z.ZodArray<z.ZodObject<{
1149
+ id: z.ZodString;
1150
+ text: z.ZodString;
1151
+ }, z.core.$strip>>;
1091
1152
  weight: z.ZodOptional<z.ZodNumber>;
1092
1153
  }, z.core.$strip>, z.ZodObject<{
1093
1154
  type: z.ZodEnum<{
@@ -1218,7 +1279,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
1218
1279
  "agent-output": "agent-output";
1219
1280
  "agent-tool-usage": "agent-tool-usage";
1220
1281
  }>;
1221
- criteria: z.ZodArray<z.ZodString>;
1282
+ criteria: z.ZodArray<z.ZodObject<{
1283
+ id: z.ZodString;
1284
+ text: z.ZodString;
1285
+ }, z.core.$strip>>;
1222
1286
  weight: z.ZodOptional<z.ZodNumber>;
1223
1287
  }, z.core.$strip>, z.ZodObject<{
1224
1288
  type: z.ZodEnum<{
@@ -1337,7 +1401,10 @@ export declare const CanonicalTaskFileSchema: z.ZodArray<z.ZodDiscriminatedUnion
1337
1401
  "agent-output": "agent-output";
1338
1402
  "agent-tool-usage": "agent-tool-usage";
1339
1403
  }>;
1340
- criteria: z.ZodArray<z.ZodString>;
1404
+ criteria: z.ZodArray<z.ZodObject<{
1405
+ id: z.ZodString;
1406
+ text: z.ZodString;
1407
+ }, z.core.$strip>>;
1341
1408
  weight: z.ZodOptional<z.ZodNumber>;
1342
1409
  }, z.core.$strip>, z.ZodObject<{
1343
1410
  type: z.ZodEnum<{
@@ -1468,6 +1535,7 @@ export declare const RepoConfigSchema: z.ZodObject<{
1468
1535
  execution: z.ZodOptional<z.ZodObject<{
1469
1536
  concurrency: z.ZodOptional<z.ZodNumber>;
1470
1537
  graderReplications: z.ZodOptional<z.ZodNumber>;
1538
+ borderlineReplications: z.ZodOptional<z.ZodNumber>;
1471
1539
  gapAnalysis: z.ZodOptional<z.ZodBoolean>;
1472
1540
  apiUrl: z.ZodOptional<z.ZodString>;
1473
1541
  }, z.core.$strip>>;
@@ -111,14 +111,26 @@ const CanonicalDocRefSchema = z.union([
111
111
  // ---------------------------------------------------------------------------
112
112
  // Assertion schemas
113
113
  // ---------------------------------------------------------------------------
114
+ /**
115
+ * A single criterion within an llm-rubric assertion. Stable id-text pair.
116
+ */
117
+ export const CriterionRefSchema = z.object({
118
+ id: z
119
+ .string()
120
+ .min(1)
121
+ .regex(/^[a-z0-9][a-z0-9-]*$/, {
122
+ message: "criterion id must be lowercase alphanumeric with hyphens",
123
+ }),
124
+ text: z.string().min(1),
125
+ });
114
126
  /**
115
127
  * A templated LLM-rubric assertion — uses one of the predefined rubric
116
128
  * templates with author-supplied criteria.
117
129
  */
118
- const TemplatedAssertionSchema = z.object({
130
+ export const TemplatedAssertionSchema = z.object({
119
131
  type: z.literal("llm-rubric"),
120
132
  template: z.enum(RUBRIC_TEMPLATE_NAMES),
121
- criteria: z.array(z.string().min(1)).min(1),
133
+ criteria: z.array(CriterionRefSchema).min(1),
122
134
  weight: z.number().optional(),
123
135
  });
124
136
  /**
@@ -562,6 +574,11 @@ const ExecutionConfigSchema = z
562
574
  .object({
563
575
  concurrency: z.number().int().positive().optional(),
564
576
  graderReplications: z.number().int().positive().optional(),
577
+ /**
578
+ * Plan 03-04 GRAD-04 — replications per borderline judgment.
579
+ * Default 3 (composition-root). Positive integer.
580
+ */
581
+ borderlineReplications: z.number().int().positive().optional(),
565
582
  gapAnalysis: z.boolean().optional(),
566
583
  apiUrl: z.string().url().optional(),
567
584
  })
@@ -32,6 +32,7 @@ import { createFetchDocsCommand } from "./commands/fetch-docs.js";
32
32
  import { createGenerateConfigsCommand } from "./commands/generate-configs.js";
33
33
  import { createGraderCommand } from "./commands/grader/index.js";
34
34
  import { createInitCommand } from "./commands/init.js";
35
+ import { createInterpretCommand } from "./commands/interpret.js";
35
36
  import { createInteractiveCommand } from "./commands/interactive.js";
36
37
  import { createLookupDocCommand } from "./commands/lookup-doc.js";
37
38
  import { createMeasureRetrievalCommand } from "./commands/measure-retrieval.js";
@@ -110,6 +111,8 @@ export function buildCliProgram(opts) {
110
111
  .addCommand(createWeeklyDigestCommand())
111
112
  .addCommand(createCheckStalenessCommand());
112
113
  program.addCommand(reportCommand.helpGroup(CommandGroup.AnalysisReports));
114
+ // `ailf interpret <reportId>` — top-level (not nested under report) per AI-SPEC
115
+ program.addCommand(createInterpretCommand().helpGroup(CommandGroup.AnalysisReports));
113
116
  // ── Grader Reliability ────────────────────────────────────────────────
114
117
  program.addCommand(createGraderCommand().helpGroup(CommandGroup.GraderReliability));
115
118
  // ── Setup & Configuration ─────────────────────────────────────────────
@@ -38,7 +38,7 @@ export function createCalculateScoresCommand() {
38
38
  remote: false,
39
39
  apiUrl: "https://ailf-api.sanity.build",
40
40
  });
41
- const result = calculateAndWriteScores({
41
+ const result = await calculateAndWriteScores({
42
42
  resultsPath,
43
43
  rootDir: ctx.config.rootDir,
44
44
  source: opts.source,
@@ -298,7 +298,7 @@ const EXPLAIN_REGISTRY = {
298
298
  ],
299
299
  },
300
300
  "lookup-doc": {
301
- description: "Search Sanity for documentation articles by keyword (find slugs for canonicalDocs)",
301
+ description: "Search Sanity for documentation articles by keyword (find slugs for contextDocs)",
302
302
  steps: [
303
303
  {
304
304
  cacheStatus: "miss",
@@ -0,0 +1,50 @@
1
+ /**
2
+ * interpret command — generate a Diagnosis for a Report.
3
+ *
4
+ * Wraps `getDiagnosisRunner(ctx)` from the composition root in a Commander
5
+ * command for consistent CLI integration. Closest analog: compare.ts.
6
+ *
7
+ * Entry points:
8
+ * ailf interpret <reportId> — one-line-per-card summary
9
+ * ailf interpret <reportId> --json — full Diagnosis JSON
10
+ * ailf interpret latest — most recent report
11
+ * ailf interpret <id> --compare <ref> — DIAG-05 regression comparison
12
+ * ailf interpret <id> --refresh — bypass version-keyed cache
13
+ *
14
+ * @see packages/eval/src/commands/compare.ts — CLI factory analog
15
+ * @see packages/eval/src/composition-root.ts — getDiagnosisRunner
16
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §6
17
+ */
18
+ import { Command } from "commander";
19
+ import type { DiagnosisRunner, VersionedInputs } from "../_vendor/ailf-core/index.d.ts";
20
+ interface MinimalReportStore {
21
+ read(id: string): Promise<unknown | null>;
22
+ latest(): Promise<unknown | null>;
23
+ }
24
+ export interface InterpretCommandOptions {
25
+ /**
26
+ * Override the runner factory for tests. When omitted, the command
27
+ * imports `getDiagnosisRunner` from the composition root at action time.
28
+ */
29
+ readonly runnerFactory?: (ctx: unknown) => DiagnosisRunner;
30
+ /**
31
+ * Override the store factory for tests. When omitted, the command
32
+ * creates the app context and uses `ctx.reportStore` at action time.
33
+ */
34
+ readonly storeFactory?: () => MinimalReportStore | null;
35
+ /**
36
+ * Override the versions resolver for tests. Receives the stored report
37
+ * record and returns the `VersionedInputs` needed by the runner.
38
+ * When omitted, the command derives versions from the report's metadata.
39
+ */
40
+ readonly versionsFromReport?: (report: unknown) => VersionedInputs;
41
+ }
42
+ /**
43
+ * Create the `ailf interpret <reportId>` Commander command.
44
+ *
45
+ * Accepts optional `InterpretCommandOptions` for testability — tests can
46
+ * inject a fake runner factory and store factory without touching module
47
+ * mocks (preferred per testing.md).
48
+ */
49
+ export declare function createInterpretCommand(options?: InterpretCommandOptions): Command;
50
+ export {};
@@ -0,0 +1,212 @@
1
+ /**
2
+ * interpret command — generate a Diagnosis for a Report.
3
+ *
4
+ * Wraps `getDiagnosisRunner(ctx)` from the composition root in a Commander
5
+ * command for consistent CLI integration. Closest analog: compare.ts.
6
+ *
7
+ * Entry points:
8
+ * ailf interpret <reportId> — one-line-per-card summary
9
+ * ailf interpret <reportId> --json — full Diagnosis JSON
10
+ * ailf interpret latest — most recent report
11
+ * ailf interpret <id> --compare <ref> — DIAG-05 regression comparison
12
+ * ailf interpret <id> --refresh — bypass version-keyed cache
13
+ *
14
+ * @see packages/eval/src/commands/compare.ts — CLI factory analog
15
+ * @see packages/eval/src/composition-root.ts — getDiagnosisRunner
16
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §6
17
+ */
18
+ import { dirname, resolve } from "path";
19
+ import { fileURLToPath } from "url";
20
+ import { Command } from "commander";
21
+ import { addOutputDirOption } from "./shared/options.js";
22
+ import { resolveOutputDir } from "./shared/resolve-output-dir.js";
23
+ // ---------------------------------------------------------------------------
24
+ // Module-level root constant (same pattern as compare.ts)
25
+ // ---------------------------------------------------------------------------
26
+ const __dirname = dirname(fileURLToPath(import.meta.url));
27
+ const ROOT = resolve(__dirname, "..", "..");
28
+ // ---------------------------------------------------------------------------
29
+ // Card output formatting (AI-SPEC §6 graceful-degradation-visibility)
30
+ // ---------------------------------------------------------------------------
31
+ /**
32
+ * Visual status markers — locked visual contract per plan Test 7:
33
+ * ready: "✓", degraded: "⚠", missing: "—"
34
+ */
35
+ const STATUS_ICONS = {
36
+ ready: "✓",
37
+ degraded: "⚠",
38
+ missing: "—",
39
+ };
40
+ function getCardSummaryText(card) {
41
+ if (card.status === "ready") {
42
+ return card.body.summary;
43
+ }
44
+ if (card.status === "degraded") {
45
+ return card.reason;
46
+ }
47
+ // missing
48
+ return card.reason;
49
+ }
50
+ /**
51
+ * Format a single card as a one-line summary string.
52
+ *
53
+ * Format: `<icon> <cardType>: <summary>`
54
+ * Per AI-SPEC §6: distinct icons for ready / degraded / missing.
55
+ */
56
+ function formatCardSummaryLine(card) {
57
+ const icon = STATUS_ICONS[card.status];
58
+ const text = getCardSummaryText(card);
59
+ return `${icon} ${card.cardType}: ${text}`;
60
+ }
61
+ // ---------------------------------------------------------------------------
62
+ // Default versions resolver
63
+ // ---------------------------------------------------------------------------
64
+ /**
65
+ * Derive VersionedInputs from a stored report record.
66
+ *
67
+ * The four-version chain is carried in `report.summary.versions` per the
68
+ * Phase 5 schema, with `diagnosisVersion` sourced from the runner's const.
69
+ * Falls back to hard-coded defaults ("unknown" for the grader-judgments and ensemble versions, "0.1.0" for the diagnosis and card versions) when the fields are not present
70
+ * (legacy reports without version metadata).
71
+ */
72
+ function defaultVersionsFromReport(report) {
73
+ const rec = report;
74
+ const summary = rec.summary;
75
+ const versions = summary?.versions;
76
+ return {
77
+ graderJudgmentsVersion: typeof versions?.graderJudgmentsVersion === "string"
78
+ ? versions.graderJudgmentsVersion
79
+ : "unknown",
80
+ ensembleVersion: typeof versions?.ensembleVersion === "string"
81
+ ? versions.ensembleVersion
82
+ : "unknown",
83
+ diagnosisVersion: typeof versions?.diagnosisVersion === "string"
84
+ ? versions.diagnosisVersion
85
+ : "0.1.0",
86
+ cardVersion: typeof versions?.cardVersion === "string"
87
+ ? versions.cardVersion
88
+ : "0.1.0",
89
+ };
90
+ }
91
+ // ---------------------------------------------------------------------------
92
+ // Command factory
93
+ // ---------------------------------------------------------------------------
94
+ /**
95
+ * Create the `ailf interpret <reportId>` Commander command.
96
+ *
97
+ * Accepts optional `InterpretCommandOptions` for testability — tests can
98
+ * inject a fake runner factory and store factory without touching module
99
+ * mocks (preferred per testing.md).
100
+ */
101
+ export function createInterpretCommand(options = {}) {
102
+ const { runnerFactory, storeFactory, versionsFromReport } = options;
103
+ const cmd = new Command("interpret")
104
+ .description("Generate a Diagnosis for a Report — 8 typed cards explaining what's weak and what to do")
105
+ .argument("<reportId>", "Report ID (or 'latest' for the most recent)")
106
+ .option("-c, --compare <ref>", "Baseline report ID for regression-vs-baseline comparison")
107
+ .option("--refresh", "Bypass the version-keyed cache and recompute")
108
+ .option("--json", "Print full Diagnosis JSON instead of one-line-per-card summary")
109
+ .action(async (reportId, opts) => {
110
+ const outputDir = resolveOutputDir(opts.outputDir);
111
+ // ---------------------------------------------------------------------------
112
+ // Resolve store: injected factory (tests) or composition root (production)
113
+ // ---------------------------------------------------------------------------
114
+ let store;
115
+ let ctx;
116
+ if (storeFactory) {
117
+ store = storeFactory();
118
+ ctx = null;
119
+ }
120
+ else {
121
+ // Production path — lazy import to keep the module fast in tests
122
+ // Minimal config: report-read-only, no eval/fetch/publish.
123
+ const { createAppContext } = await import("../composition-root.js");
124
+ ctx = createAppContext({
125
+ compareEnabled: false,
126
+ gapAnalysisEnabled: false,
127
+ mode: "literacy",
128
+ noAutoScope: false,
129
+ noCache: true,
130
+ noRemoteCache: true,
131
+ outputDir,
132
+ publishEnabled: false,
133
+ rootDir: ROOT,
134
+ searchMode: "open",
135
+ skipEval: true,
136
+ skipFetch: true,
137
+ remote: false,
138
+ apiUrl: "https://ailf-api.sanity.build",
139
+ });
140
+ const prodCtx = ctx;
141
+ store = prodCtx.reportStore;
142
+ }
143
+ if (!store) {
144
+ process.stderr.write("Error: report store is not available\n");
145
+ process.exit(1);
146
+ }
147
+ // ---------------------------------------------------------------------------
148
+ // Resolve main report
149
+ // ---------------------------------------------------------------------------
150
+ const report = reportId === "latest"
151
+ ? await store.latest()
152
+ : await store.read(reportId);
153
+ if (!report) {
154
+ process.stderr.write(`Error: report not found: ${reportId}\n`);
155
+ process.exit(1);
156
+ }
157
+ // ---------------------------------------------------------------------------
158
+ // Optionally resolve baseline (DIAG-05)
159
+ // ---------------------------------------------------------------------------
160
+ let baseline;
161
+ if (opts.compare) {
162
+ baseline = await store.read(opts.compare);
163
+ if (!baseline) {
164
+ process.stderr.write(`Error: baseline report not found: ${opts.compare}\n`);
165
+ process.exit(1);
166
+ }
167
+ }
168
+ // ---------------------------------------------------------------------------
169
+ // Resolve versions
170
+ // ---------------------------------------------------------------------------
171
+ const versions = versionsFromReport
172
+ ? versionsFromReport(report)
173
+ : defaultVersionsFromReport(report);
174
+ // ---------------------------------------------------------------------------
175
+ // Build runner
176
+ // ---------------------------------------------------------------------------
177
+ let runner;
178
+ if (runnerFactory) {
179
+ runner = runnerFactory(ctx);
180
+ }
181
+ else {
182
+ const { getDiagnosisRunner } = await import("../composition-root.js");
183
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
184
+ runner = getDiagnosisRunner(ctx);
185
+ }
186
+ // ---------------------------------------------------------------------------
187
+ // Run diagnosis
188
+ // ---------------------------------------------------------------------------
189
+ const diagnosis = await runner.run({
190
+ // The report here is the eval's ReportStore record, which satisfies
191
+ // the Report interface for runner.run purposes (both carry id + provenance.runId).
192
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
193
+ report: report,
194
+ versions,
195
+ ...(baseline ? { baseline: baseline } : {}),
196
+ refresh: opts.refresh ?? false,
197
+ });
198
+ // ---------------------------------------------------------------------------
199
+ // Print output
200
+ // ---------------------------------------------------------------------------
201
+ if (opts.json) {
202
+ process.stdout.write(`${JSON.stringify(diagnosis, null, 2)}\n`);
203
+ }
204
+ else {
205
+ for (const card of diagnosis.cards) {
206
+ process.stdout.write(`${formatCardSummaryLine(card)}\n`);
207
+ }
208
+ }
209
+ });
210
+ addOutputDirOption(cmd);
211
+ return cmd;
212
+ }
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * lookup-doc command — search Sanity for documentation articles by keyword.
3
3
  *
4
- * Helps external contributors find the correct `slug` for canonicalDocs
4
+ * Helps external contributors find the correct `slug` for contextDocs
5
5
  * references without needing to browse the CMS or guess from URLs.
6
6
  *
7
7
  * Usage:
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * lookup-doc command — search Sanity for documentation articles by keyword.
3
3
  *
4
- * Helps external contributors find the correct `slug` for canonicalDocs
4
+ * Helps external contributors find the correct `slug` for contextDocs
5
5
  * references without needing to browse the CMS or guess from URLs.
6
6
  *
7
7
  * Usage:
@@ -14,7 +14,7 @@
14
14
  import { Command } from "commander";
15
15
  export function createLookupDocCommand() {
16
16
  return new Command("lookup-doc")
17
- .description("Search Sanity docs by keyword — find slugs for canonicalDocs references")
17
+ .description("Search Sanity docs by keyword — find slugs for contextDocs references")
18
18
  .argument("<keyword>", "Search keyword (matches title and slug)")
19
19
  .option("-l, --limit <n>", "Maximum results to show", parseInt, 10)
20
20
  .option("-s, --source <name>", "Documentation source (from sources.yaml)")
@@ -73,7 +73,7 @@ export function createLookupDocCommand() {
73
73
  console.log(` ${"".padEnd(maxSlugLen + 6)} │ Section: ${section}\n`);
74
74
  }
75
75
  console.log(" Usage in .ailf/tasks/*.yaml:\n");
76
- console.log(" canonicalDocs:");
76
+ console.log(" contextDocs:");
77
77
  console.log(` - slug: ${results[0].slug}`);
78
78
  console.log(` reason: "${results[0].title}"`);
79
79
  if (results[0].sectionSlug) {
@@ -27,6 +27,12 @@ export interface ResolvedOptions {
27
27
  dryRun: boolean;
28
28
  gapAnalysisEnabled: boolean;
29
29
  graderReplications?: number;
30
+ /**
31
+ * Replications per borderline judgment for the GRAD-04 intra-grader
32
+ * consensus pass. Sourced from `.ailf/config.yaml`'s
33
+ * `execution.borderlineReplications`.
34
+ */
35
+ borderlineReplications?: number;
30
36
  /** Grader context policy from `.ailf/config.yaml` `grader.context` */
31
37
  graderContext?: "rubric-only" | "with-docs";
32
38
  headerArgs: string[];
@@ -248,6 +248,7 @@ export function computeResolvedOptions(opts) {
248
248
  // env var (where one exists) > .ailf/config.yaml > built-in default
249
249
  const concurrency = repoConfig?.execution?.concurrency;
250
250
  const graderReplications = repoConfig?.execution?.graderReplications;
251
+ const borderlineReplications = repoConfig?.execution?.borderlineReplications;
251
252
  const gapAnalysisEnabled = repoConfig?.execution?.gapAnalysis ?? true;
252
253
  // Grader context policy. Cascade: env var > .ailf/config.yaml > unset
253
254
  // (defaults to rubric-only at the EvalConfig boundary). The env var is the
@@ -291,6 +292,7 @@ export function computeResolvedOptions(opts) {
291
292
  dryRun: opts.dryRun,
292
293
  gapAnalysisEnabled,
293
294
  graderReplications,
295
+ borderlineReplications,
294
296
  graderContext,
295
297
  headerArgs,
296
298
  impactSummary,
@@ -142,6 +142,7 @@ function toConfigSlice(opts) {
142
142
  perspectiveOverride: opts.perspectiveOverride,
143
143
  graderContext: opts.graderContext,
144
144
  graderReplications: opts.graderReplications,
145
+ borderlineReplications: opts.borderlineReplications,
145
146
  gapAnalysisEnabled: opts.gapAnalysisEnabled,
146
147
  noRemoteCache: opts.noRemoteCache,
147
148
  // D0037 / W0069 caller envelope overrides — flags override env vars