@sanity/ailf 4.6.0 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. package/canonical/grader-references/agent-harness-tools.yaml +42 -0
  2. package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
  3. package/canonical/grader-references/mcp-server-spec.yaml +51 -0
  4. package/canonical/grader-references/portable-text.yaml +48 -0
  5. package/config/diagnosis-cards.ts +318 -0
  6. package/config/models.ts +12 -0
  7. package/config/rubrics.ts +38 -2
  8. package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
  9. package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
  10. package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
  11. package/dist/_vendor/ailf-core/examples/index.js +146 -47
  12. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
  13. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
  14. package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
  15. package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
  16. package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
  17. package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
  18. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
  19. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
  20. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
  21. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
  22. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
  23. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
  24. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  25. package/dist/_vendor/ailf-core/index.js +4 -0
  26. package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
  27. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
  28. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
  29. package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
  30. package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
  31. package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
  32. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  33. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
  34. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  35. package/dist/_vendor/ailf-core/schemas/index.js +9 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
  40. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
  41. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
  42. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
  43. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
  44. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
  45. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
  46. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
  47. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
  48. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
  49. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
  50. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
  51. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
  52. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
  53. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
  54. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
  55. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
  56. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
  57. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
  58. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
  59. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
  60. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
  61. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
  62. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
  63. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
  64. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
  65. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
  66. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
  67. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
  68. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
  69. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
  70. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
  71. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
  72. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
  73. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
  74. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
  75. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
  76. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
  77. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
  78. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
  79. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
  80. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +50 -0
  81. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +35 -0
  82. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +136 -0
  83. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +153 -0
  84. package/dist/_vendor/ailf-core/services/index.d.ts +6 -0
  85. package/dist/_vendor/ailf-core/services/index.js +18 -0
  86. package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
  87. package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
  88. package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
  89. package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
  90. package/dist/_vendor/ailf-core/types/attribution.js +18 -0
  91. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
  92. package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
  93. package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
  94. package/dist/_vendor/ailf-core/types/confidence.js +7 -0
  95. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +271 -0
  96. package/dist/_vendor/ailf-core/types/diagnosis.js +19 -0
  97. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
  98. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
  99. package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
  100. package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
  101. package/dist/_vendor/ailf-core/types/index.js +15 -1
  102. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
  103. package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
  104. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  105. package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
  106. package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
  107. package/dist/adapters/api-client/build-request.d.ts +1 -0
  108. package/dist/adapters/api-client/build-request.js +3 -0
  109. package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
  110. package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
  111. package/dist/adapters/attribution/index.d.ts +9 -0
  112. package/dist/adapters/attribution/index.js +8 -0
  113. package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
  114. package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
  115. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  116. package/dist/adapters/grader-outputs/index.d.ts +10 -0
  117. package/dist/adapters/grader-outputs/index.js +8 -0
  118. package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
  119. package/dist/adapters/grader-outputs/legacy/index.js +10 -0
  120. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
  121. package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
  122. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
  123. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
  124. package/dist/adapters/index.d.ts +3 -0
  125. package/dist/adapters/index.js +4 -0
  126. package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
  127. package/dist/adapters/llm/fake-llm-client.js +38 -1
  128. package/dist/adapters/llm/openai-llm-client.js +52 -3
  129. package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
  130. package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
  131. package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
  132. package/dist/adapters/task-sources/repo-schemas.js +19 -2
  133. package/dist/cli-program.js +3 -0
  134. package/dist/commands/calculate-scores.js +1 -1
  135. package/dist/commands/explain-handler.js +1 -1
  136. package/dist/commands/interpret.d.ts +50 -0
  137. package/dist/commands/interpret.js +212 -0
  138. package/dist/commands/lookup-doc.d.ts +1 -1
  139. package/dist/commands/lookup-doc.js +3 -3
  140. package/dist/commands/pipeline-action.d.ts +6 -0
  141. package/dist/commands/pipeline-action.js +2 -0
  142. package/dist/commands/remote-pipeline.js +1 -0
  143. package/dist/composition-root.d.ts +57 -23
  144. package/dist/composition-root.js +155 -41
  145. package/dist/config/diagnosis-cards.ts +318 -0
  146. package/dist/config/models.ts +12 -0
  147. package/dist/config/rubrics.ts +38 -2
  148. package/dist/grader/agent-harness.d.ts +9 -0
  149. package/dist/grader/agent-harness.js +9 -0
  150. package/dist/grader/common.d.ts +9 -0
  151. package/dist/grader/common.js +9 -0
  152. package/dist/grader/index.d.ts +24 -0
  153. package/dist/grader/index.js +24 -0
  154. package/dist/grader/knowledge-probe.d.ts +9 -0
  155. package/dist/grader/knowledge-probe.js +9 -0
  156. package/dist/grader/literacy.d.ts +9 -0
  157. package/dist/grader/literacy.js +9 -0
  158. package/dist/grader/mcp.d.ts +9 -0
  159. package/dist/grader/mcp.js +9 -0
  160. package/dist/orchestration/build-app-context.js +1 -0
  161. package/dist/orchestration/build-step-sequence.js +5 -0
  162. package/dist/orchestration/steps/calculate-scores-step.js +23 -1
  163. package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
  164. package/dist/orchestration/steps/compute-attribution-step.js +279 -0
  165. package/dist/orchestration/steps/gap-analysis-step.js +35 -7
  166. package/dist/orchestration/steps/index.d.ts +1 -0
  167. package/dist/orchestration/steps/index.js +1 -0
  168. package/dist/pipeline/attribution.d.ts +15 -0
  169. package/dist/pipeline/attribution.js +18 -9
  170. package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
  171. package/dist/pipeline/borderline-consensus-runner.js +124 -0
  172. package/dist/pipeline/borderline-detector.d.ts +24 -0
  173. package/dist/pipeline/borderline-detector.js +26 -0
  174. package/dist/pipeline/calculate-scores.d.ts +114 -3
  175. package/dist/pipeline/calculate-scores.js +426 -24
  176. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  177. package/dist/pipeline/compiler/literacy-bridge.js +35 -17
  178. package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
  179. package/dist/pipeline/compiler/rubric-resolution.js +9 -1
  180. package/dist/pipeline/compute-attribution.d.ts +80 -0
  181. package/dist/pipeline/compute-attribution.js +196 -0
  182. package/dist/pipeline/failure-modes.d.ts +52 -17
  183. package/dist/pipeline/failure-modes.js +178 -117
  184. package/dist/pipeline/map-request-to-config.js +1 -0
  185. package/package.json +7 -5
@@ -0,0 +1,82 @@
1
+ /**
2
+ * Attribution core domain types — canonical shapes for the per-document
3
+ * attribution ensemble (Doc 04).
4
+ *
5
+ * Phase 1 lands the type carriers; Phase 4 lands the compute step. The
6
+ * Zod schemas in `packages/eval/src/adapters/attribution/` assert
7
+ * `satisfies z.ZodType<...>` against these types.
8
+ *
9
+ * Doc identity is referenced by `documentId` (D0052), not by `slug` —
10
+ * `slug` is retained as a human-readable annotation only. The
11
+ * resolvable-set check is carried as a separate
12
+ * `hallucinationCheckedAgainst: string[]` field (Pitfall #11).
13
+ *
14
+ * @see docs/decisions/D0049-shared-confidence-contract.md
15
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
16
+ * @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
17
+ */
18
+ import type { Confidence } from "./confidence.js";
19
+ /**
20
+ * Per-document attribution score for one judgment. The `signals` sub-record
21
+ * carries each ensemble member's contribution; the top-level `score` is
22
+ * the post-weighting composite.
23
+ *
24
+ * `documentId` is the canonical D0052 reference; `slug` is a
25
+ * human-readable annotation only and must not be relied on for identity.
26
+ */
27
+ export interface DocAttribution {
28
+ /** Canonical D0052 document ref (id, not slug). */
29
+ documentId: string;
30
+ /** Optional human-readable annotation. Never the identity. */
31
+ slug?: string;
32
+ /** Composite attribution score in [0, 1]. */
33
+ score: number;
34
+ /** Per-ensemble-member contributions before weighting. */
35
+ signals: {
36
+ citation?: number;
37
+ canonical?: number;
38
+ retrieved?: number;
39
+ };
40
+ /** Shared D0049 confidence triple. */
41
+ confidence: Confidence;
42
+ }
43
+ /**
44
+ * Per-judgment attribution carrier. Emitted by Phase 4's
45
+ * `ComputeAttributionStep`; persisted at
46
+ * `runs/{runId}/attribution/{entryKey}.json`.
47
+ *
48
+ * `hallucinationCheckedAgainst` is the resolvable-set used at compute
49
+ * time — required (not optional) so consumers can audit citation
50
+ * grounding without re-deriving the set. Per Pitfall #11 the canonical
51
+ * task field is `contextDocs`; do not invent `expectedDocs` /
52
+ * `usedDocs` synonyms.
53
+ */
54
+ export interface JudgmentAttribution {
55
+ /** D0052 granular ref to the underlying grader judgment. */
56
+ judgmentRef: string;
57
+ taskId: string;
58
+ modelId: string;
59
+ dimension: string;
60
+ attributions: DocAttribution[];
61
+ /** Resolvable-set used at compute time (Pitfall #11). */
62
+ hallucinationCheckedAgainst: string[];
63
+ }
64
+ /**
65
+ * Run-scoped attribution metadata. Persisted alongside the per-entry
66
+ * attribution objects so consumers can interpret signal-weighting and
67
+ * embedding choices without re-loading the calibration set.
68
+ *
69
+ * `embeddingModel` is REQUIRED (Pitfall #6) — silently downgrading to a
70
+ * default has caused regressions in adjacent codebases.
71
+ */
72
+ export interface AttributionMeta {
73
+ ensembleVersion: string;
74
+ /** Embedding model identifier — REQUIRED (Pitfall #6). */
75
+ embeddingModel: string;
76
+ calibrationSetVersion?: string;
77
+ weights: {
78
+ citation: number;
79
+ canonical: number;
80
+ retrieved: number;
81
+ };
82
+ }
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Attribution core domain types — canonical shapes for the per-document
3
+ * attribution ensemble (Doc 04).
4
+ *
5
+ * Phase 1 lands the type carriers; Phase 4 lands the compute step. The
6
+ * Zod schemas in `packages/eval/src/adapters/attribution/` assert
7
+ * `satisfies z.ZodType<...>` against these types.
8
+ *
9
+ * Doc identity is referenced by `documentId` (D0052), not by `slug` —
10
+ * `slug` is retained as a human-readable annotation only. The
11
+ * resolvable-set check is carried as a separate
12
+ * `hallucinationCheckedAgainst: string[]` field (Pitfall #11).
13
+ *
14
+ * @see docs/decisions/D0049-shared-confidence-contract.md
15
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
16
+ * @see docs/design-docs/actionability-ladder/04-per-document-attribution-ensemble.md
17
+ */
18
+ export {};
@@ -29,6 +29,8 @@ declare const __brand: unique symbol;
29
29
  export type Brand<T, B extends string> = T & {
30
30
  readonly [__brand]: B;
31
31
  };
32
+ /** Unique identifier for a grader judgment (D0052 granular). */
33
+ export type JudgmentId = Brand<string, "JudgmentId">;
32
34
  /** Unique identifier for an evaluation task */
33
35
  export type TaskId = Brand<string, "TaskId">;
34
36
  /** URL-safe slug for a task (derived from title) */
@@ -74,7 +76,7 @@ export type ArtifactId = Brand<string, "ArtifactId">;
74
76
  * per-mode (e.g. `failureModes`, one entry per classified failure category —
75
77
  * D0033 M7, W0051 Slice 2).
76
78
  */
77
- export type AssociationAxis = "run" | "mode" | "task" | "model" | "grader" | "trial" | "category";
79
+ export type AssociationAxis = "run" | "mode" | "task" | "model" | "grader" | "trial" | "category" | "report";
78
80
  /**
79
81
  * The sanitized, filename-safe identifier for a single per-entry artifact
80
82
  * object. Produced by `ArtifactDescriptor.formatEntryKey` and parsed by
@@ -178,4 +180,27 @@ export declare function providerId(raw: string): Result<ProviderId, IdValidation
178
180
  * Valid format: alphanumeric + hyphens, 1–128 characters.
179
181
  */
180
182
  export declare function fixtureId(raw: string): Result<FixtureId, IdValidationError>;
183
+ /**
184
+ * Parse a raw string into a `JudgmentId`.
185
+ *
186
+ * See `JUDGMENT_ID_RE` for the accepted formats.
187
+ */
188
+ export declare function judgmentId(raw: string): Result<JudgmentId, IdValidationError>;
189
+ /**
190
+ * Generate a deterministic `JudgmentId` for a synthesized fall-back
191
+ * judgment. Salting with `runId` (when supplied) makes the id unique
192
+ * per-run so consumers' `(taskId, modelId, dimension)` dedup key
193
+ * doesn't collide across re-runs of the same task — every run writes
194
+ * fresh ids that still encode the natural composite key.
195
+ *
196
+ * When `runId` is absent the salt collapses to `nosalt`, preserving the
197
+ * legacy "deterministic across runs" shape for callers that explicitly
198
+ * want it (e.g. unit tests that assert the exact id string).
199
+ */
200
+ export declare function generateJudgmentId(input: {
201
+ taskId: string;
202
+ modelId: string;
203
+ dimension: string;
204
+ runId?: RunId | string;
205
+ }): JudgmentId;
181
206
  export {};
@@ -84,11 +84,24 @@ export function generateRunId() {
84
84
  .toISOString()
85
85
  .replace(/[-:]/g, "")
86
86
  .replace(/\.\d{3}Z$/, "Z");
87
- const bytes = crypto.getRandomValues(new Uint8Array(8));
88
- let suffix = "";
89
- for (const b of bytes) {
90
- suffix += (b % 36).toString(36);
87
+ // Rejection-sample bytes against the largest multiple of 36 ≤ 256
88
+ // (252) before applying `% 36`. Naive `b % 36` over [0, 255] biases
89
+ // digits 0..3 (probability 8/256) over 4..35 (probability 7/256) by
90
+ // ~14% per character. Drawing fresh bytes whenever the buffer runs
91
+ // dry keeps the loop terminating with overwhelming probability
92
+ // (each byte is kept with probability 252/256 ≈ 98.4%).
93
+ const suffixChars = [];
94
+ while (suffixChars.length < 8) {
95
+ const buf = crypto.getRandomValues(new Uint8Array(8));
96
+ for (const b of buf) {
97
+ if (b >= 252)
98
+ continue; // reject biased range
99
+ suffixChars.push((b % 36).toString(36));
100
+ if (suffixChars.length === 8)
101
+ break;
102
+ }
91
103
  }
104
+ const suffix = suffixChars.join("");
92
105
  return `run_${ts}_${suffix}`;
93
106
  }
94
107
  /**
@@ -166,3 +179,66 @@ export function fixtureId(raw) {
166
179
  }
167
180
  return ok(raw);
168
181
  }
182
+ /**
183
+ * Canonical shape for a `JudgmentId`.
184
+ *
185
+ * Two accepted forms:
186
+ * - `judgment_<runId-suffix>_<sanitized-task>__<sanitized-model>__<dimension>`
187
+ * — minted by `generateJudgmentId` for synthesized fall-back judgments
188
+ * so dedup is per-run (a re-run of the same task produces a distinct id).
189
+ * - `j_<alphanumeric>` — short form used by test fixtures and any caller
190
+ * that wants a stable, opaque id without the structured composite.
191
+ *
192
+ * Task and model segments may carry alphanumerics, hyphens, underscores,
193
+ * dots, and colons (the provider id surface is colon-separated); the
194
+ * dimension segment is limited to alphanumerics and hyphens. The full string is bounded to 256 characters to keep the id index-friendly downstream.
195
+ */
196
+ const JUDGMENT_ID_RE = /^(?:judgment_[0-9a-z]{1,16}_[a-z0-9][a-z0-9.:_-]*__[a-z0-9][a-z0-9.:_-]*__[a-z0-9][a-z0-9-]*|j_[A-Za-z0-9_-]{4,})$/;
197
+ /**
198
+ * Parse a raw string into a `JudgmentId`.
199
+ *
200
+ * See `JUDGMENT_ID_RE` for the accepted formats.
201
+ */
202
+ export function judgmentId(raw) {
203
+ if (raw.length === 0 || raw.length > 256 || !JUDGMENT_ID_RE.test(raw)) {
204
+ return err({
205
+ code: "INVALID_JUDGMENT_ID",
206
+ raw,
207
+ message: `Invalid JudgmentId "${raw}": must match judgment_<runSalt>_<task>__<model>__<dimension> or j_<alnum>`,
208
+ });
209
+ }
210
+ return ok(raw);
211
+ }
212
+ /** Lowercase a value and reduce it to the `[a-z0-9.:_-]` alphabet, collapsing every other run of characters into a single hyphen. */
213
+ function sanitizeJudgmentSegment(value) {
214
+ // Lowercase + replace runs of characters outside [a-z0-9.:_-] with a single
215
+ // hyphen, trim leading/trailing hyphens. Keeps dots, colons, and underscores
216
+ // in `modelId`-like values (`openai:gpt-5.2`) since the regex permits them.
217
+ return value
218
+ .toLowerCase()
219
+ .replace(/[^a-z0-9.:_-]+/g, "-")
220
+ .replace(/^-+|-+$/g, "");
221
+ }
222
+ /**
223
+ * Generate a deterministic `JudgmentId` for a synthesized fall-back
224
+ * judgment. Salting with `runId` (when supplied) makes the id unique
225
+ * per-run so consumers' `(taskId, modelId, dimension)` dedup key
226
+ * doesn't collide across re-runs of the same task — every run writes
227
+ * fresh ids that still encode the natural composite key.
228
+ *
229
+ * When `runId` is absent the salt collapses to `nosalt`, preserving the
230
+ * legacy "deterministic across runs" shape for callers that explicitly
231
+ * want it (e.g. unit tests that assert the exact id string).
232
+ */
233
+ export function generateJudgmentId(input) {
234
+ // Take the trailing 8 chars of the runId (the random base36 suffix on
235
+ // the canonical shape) so the salt stays compact while still cycling
236
+ // every run. Falls back to a constant marker when runId isn't passed.
237
+ const runSalt = input.runId
238
+ ? sanitizeJudgmentSegment(String(input.runId).slice(-8)) || "nosalt"
239
+ : "nosalt";
240
+ const task = sanitizeJudgmentSegment(input.taskId);
241
+ const model = sanitizeJudgmentSegment(input.modelId);
242
+ const dimension = sanitizeJudgmentSegment(input.dimension);
243
+ return `judgment_${runSalt}_${task}__${model}__${dimension}`;
244
+ }
@@ -19,7 +19,7 @@
19
19
  * is an open tag (see `ConfidenceDerivation`). The list is the
20
20
  * recommended starting set, not the universe.
21
21
  */
22
- export declare const CONVENTIONAL_DERIVATIONS: readonly ["ensemble-stdev", "ceiling-cross-check", "regression-gate", "card-type-specific"];
22
+ export declare const CONVENTIONAL_DERIVATIONS: readonly ["ensemble-stdev", "ceiling-cross-check", "regression-gate", "card-type-specific", "synthesized-pre-cross-check"];
23
23
  /**
24
24
  * Tag identifying the formula used to derive `Confidence.level`.
25
25
  *
@@ -24,6 +24,13 @@ export const CONVENTIONAL_DERIVATIONS = [
24
24
  "ceiling-cross-check",
25
25
  "regression-gate",
26
26
  "card-type-specific",
27
+ // Sentinel placeholder used by the eval pipeline's
28
+ // `synthesizeUnparsedJudgment` fall-back. The validator
29
+ // (`validateGraderJudgmentsCalibration`) overwrites it with
30
+ // "ceiling-cross-check" before judgments leave the live pipeline,
31
+ // so it should not appear on stored reports — the literal is in
32
+ // this list so a leaked sentinel is greppable.
33
+ "synthesized-pre-cross-check",
27
34
  ];
28
35
  /**
29
36
  * Structural type guard for `Confidence`. Verifies the runtime shape
@@ -0,0 +1,271 @@
1
+ /**
2
+ * Diagnosis core domain types — canonical shapes for the post-run
3
+ * synthesis layer (Doc 05).
4
+ *
5
+ * `Diagnosis.inputs` is the four-version cache envelope (VER-01); any
6
+ * segment bump invalidates a cached Diagnosis. `DiagnosisCard` is an
7
+ * outer-`status` discriminated union with a nested `cardType`
8
+ * discriminator inside the `ready` variant.
9
+ *
10
+ * Phase 1 lands placeholder body shapes; Phase 5 enriches each per
11
+ * AI-SPEC §3 and CONTEXT D-05/D-07. The `DiagnosisCard` discriminated
12
+ * union surface (arms + `cardType` literals) is stable — only the
13
+ * `body: <BodyInterface>` references resolve to richer shapes.
14
+ *
15
+ * @see docs/decisions/D0049-shared-confidence-contract.md
16
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
17
+ * @see docs/decisions/D0050-artifact-registry-post-hoc-versioned-extension.md
18
+ */
19
+ import type { Confidence } from "./confidence.js";
20
+ import type { RunId } from "./branded-ids.js";
21
+ import type { ReportId } from "./index.js";
22
+ /**
23
+ * The four-version cache envelope. Every cached `Diagnosis` carries the
24
+ * versions of the inputs that produced it; any bump in any segment
25
+ * invalidates the cache (cross-package contract test asserts this).
26
+ *
27
+ * Strings everywhere; semver convention by humans where authored
28
+ * manually. No branding, no tuples, no content-hash typing — keeps the
29
+ * envelope trivially serializable + greppable.
30
+ */
31
+ export interface VersionedInputs {
32
+ graderJudgmentsVersion: string;
33
+ ensembleVersion: string;
34
+ diagnosisVersion: string;
35
+ cardVersion: string;
36
+ }
37
+ /**
38
+ * The 8 ready-card archetypes. Phase 5 cards register against these
39
+ * literals; the slim-shape boundary in Phase 7 reads them to render the
40
+ * Studio diagnosis renderer.
41
+ */
42
+ export type CardType = "area-summary" | "failure-mode-summary" | "no-issues" | "top-recommendations" | "weakest-area" | "low-confidence-attribution" | "doc-attribution-spotlight" | "regression-vs-baseline";
43
+ /**
44
+ * Per-card telemetry envelope. `cardVersion` here is the per-card
45
+ * version (e.g. `"area-summary@0.1.0"`), not the compound. Drives
46
+ * DIAG-06 cost telemetry in Phase 6.
47
+ */
48
+ export interface CardMeta {
49
+ cardVersion: string;
50
+ tokenUsage?: {
51
+ input: number;
52
+ output: number;
53
+ };
54
+ latencyMs?: number;
55
+ /** ISO 8601 UTC timestamp. */
56
+ generatedAt: string;
57
+ }
58
+ /**
59
+ * A single actionable suggestion surfaced by a recommendations card.
60
+ *
61
+ * Phase 5 adds `docSlug` (the canonical doc page to rewrite) and
62
+ * `sectionHeading` (null when the suggestion targets the whole page)
63
+ * per AI-SPEC actionability-specificity rubric + failure-mode #2
64
+ * mitigation.
65
+ */
66
+ export interface ActionSuggestion {
67
+ title: string;
68
+ body: string;
69
+ priority: "high" | "medium" | "low";
70
+ /** Canonical slug of the documentation page this suggestion targets. */
71
+ docSlug: string;
72
+ /**
73
+ * Heading within `docSlug` that should be revised, or `null` when the
74
+ * suggestion targets the page as a whole.
75
+ */
76
+ sectionHeading: string | null;
77
+ }
78
+ /**
79
+ * Minimal judgment reference per D0052 (taskId × modelId × dimension).
80
+ * Used by `LowConfidenceAttributionBody.judgmentRefs` to cite the
81
+ * specific judgments that drove a low-confidence finding.
82
+ */
83
+ export interface JudgmentRef {
84
+ taskId: string;
85
+ modelId: string;
86
+ dimension: string;
87
+ }
88
+ /**
89
+ * Phase 5 enriched body shapes. Each keeps `summary: string` (load-bearing
90
+ * for CLI default render per AI-SPEC §6) and adds fields the corresponding
91
+ * Zod schema needs (asserting `satisfies z.ZodType<T>` in the card file).
92
+ */
93
+ /** area-summary: deterministic — keep only summary (no behavioral claims). */
94
+ export interface AreaSummaryBody {
95
+ summary: string;
96
+ }
97
+ /**
98
+ * failure-mode-summary: deterministic + D-05 dimension/failureMode gate.
99
+ * `count` = frequency in the report; `sampleSize` = judgment count for the
100
+ * dimension (per AI-SPEC failure-mode #3 mitigation).
101
+ */
102
+ export interface FailureModeSummaryBody {
103
+ summary: string;
104
+ /** Rubric dimension this summary targets (e.g. "task-completion"). */
105
+ dimension: string;
106
+ /** Canonical failure mode within this dimension. */
107
+ failureMode: string;
108
+ /** Number of judgments in this report with this failure mode. */
109
+ count: number;
110
+ /** Total judgments for this dimension — calibration denominator. */
111
+ sampleSize: number;
112
+ }
113
+ /**
114
+ * no-issues: deterministic + AI-SPEC failure-mode #7 sycophancy guard.
115
+ * `thresholdScore` surfaces the threshold used to qualify as "no issues"
116
+ * so readers can see the criterion behind the positive assessment.
117
+ */
118
+ export interface NoIssuesBody {
119
+ summary: string;
120
+ /** Minimum composite score that qualified this area as "no issues". */
121
+ thresholdScore: number;
122
+ }
123
+ /**
124
+ * top-recommendations: LLM-driven. `suggestions` reuses the enriched
125
+ * `ActionSuggestion` shape (docSlug + sectionHeading per AI-SPEC
126
+ * actionability-specificity rubric + failure-mode #2 mitigation).
127
+ */
128
+ export interface TopRecommendationsBody {
129
+ summary: string;
130
+ suggestions: ActionSuggestion[];
131
+ }
132
+ /**
133
+ * weakest-area: LLM-driven. Adds area identification, dimension/failureMode
134
+ * context, and a small-sample calibration guard (AI-SPEC failure-mode #3).
135
+ */
136
+ export interface WeakestAreaBody {
137
+ summary: string;
138
+ /** Documentation area with the lowest composite score. */
139
+ area: string;
140
+ /** Primary dimension driving the low score. */
141
+ dimension: string;
142
+ /** Dominant failure mode in this area. */
143
+ failureMode: string;
144
+ /** Number of judgments sampled for this area — calibration denominator. */
145
+ sampleSize: number;
146
+ /** Calibrated confidence per D0049 (ensemble-stdev derivation). */
147
+ confidence: Confidence;
148
+ }
149
+ /**
150
+ * regression-vs-baseline: LLM-driven. `deltas` is the per-area diff
151
+ * (JS-computed pre-call, max 10 entries); `drivers` is LLM prose;
152
+ * `overallTrend` is a 4-bucket summary per AI-SPEC §3 lines 605-613.
153
+ */
154
+ export interface RegressionVsBaselineBody {
155
+ summary: string;
156
+ /**
157
+ * Per-area score deltas (max 10). `drivers` carries the LLM's prose
158
+ * reasoning about what caused the change.
159
+ */
160
+ deltas: {
161
+ area: string;
162
+ direction: "improved" | "regressed" | "unchanged";
163
+ pointsDelta: number;
164
+ drivers: string[];
165
+ }[];
166
+ /** 4-bucket aggregate trend across all deltas. */
167
+ overallTrend: "net-improved" | "net-regressed" | "mixed" | "stable";
168
+ }
169
+ /**
170
+ * low-confidence-attribution: LLM-driven. `judgmentRefs` cites the
171
+ * specific judgments (D0052 triple) that drove the low-confidence finding.
172
+ */
173
+ export interface LowConfidenceAttributionBody {
174
+ summary: string;
175
+ /** Judgment references (D0052) driving this low-confidence finding. */
176
+ judgmentRefs: JudgmentRef[];
177
+ }
178
+ /**
179
+ * doc-attribution-spotlight: LLM-driven. `docCitations` carries per-doc
180
+ * attribution roles and confidence calibration (AI-SPEC failure-mode #5).
181
+ */
182
+ export interface DocAttributionSpotlightBody {
183
+ summary: string;
184
+ /**
185
+ * Per-doc attribution records. `role` classifies how the doc contributed;
186
+ * `confidence` calibrates the attribution certainty (D0049).
187
+ */
188
+ docCitations: {
189
+ docSlug: string;
190
+ confidence: Confidence;
191
+ role: "supports" | "contradicts" | "missing" | "irrelevant";
192
+ }[];
193
+ }
194
+ /**
195
+ * Outer-`status` discriminated union: 8 ready variants (one per
196
+ * `cardType`, each carrying its per-cardType body), plus a `degraded`
197
+ * variant (parse failed or downgraded by the runner) and a `missing`
198
+ * variant (card not produced for this run).
199
+ *
200
+ * No `not-yet-generated` variant — old-report fallback is a Phase 7
201
+ * concern at the slim-shape boundary, handled at fetch-time, not in
202
+ * `DiagnosisCard` itself.
203
+ *
204
+ * D-07: only the `body: <BodyInterface>` references resolve to richer
205
+ * shapes. The union arms, status literals, and cardType literals are
206
+ * identical to Phase 1.
207
+ */
208
+ export type DiagnosisCard = {
209
+ status: "ready";
210
+ cardType: "area-summary";
211
+ body: AreaSummaryBody;
212
+ meta: CardMeta;
213
+ } | {
214
+ status: "ready";
215
+ cardType: "failure-mode-summary";
216
+ body: FailureModeSummaryBody;
217
+ meta: CardMeta;
218
+ } | {
219
+ status: "ready";
220
+ cardType: "no-issues";
221
+ body: NoIssuesBody;
222
+ meta: CardMeta;
223
+ } | {
224
+ status: "ready";
225
+ cardType: "top-recommendations";
226
+ body: TopRecommendationsBody;
227
+ meta: CardMeta;
228
+ } | {
229
+ status: "ready";
230
+ cardType: "weakest-area";
231
+ body: WeakestAreaBody;
232
+ meta: CardMeta;
233
+ } | {
234
+ status: "ready";
235
+ cardType: "low-confidence-attribution";
236
+ body: LowConfidenceAttributionBody;
237
+ meta: CardMeta;
238
+ } | {
239
+ status: "ready";
240
+ cardType: "doc-attribution-spotlight";
241
+ body: DocAttributionSpotlightBody;
242
+ meta: CardMeta;
243
+ } | {
244
+ status: "ready";
245
+ cardType: "regression-vs-baseline";
246
+ body: RegressionVsBaselineBody;
247
+ meta: CardMeta;
248
+ } | {
249
+ status: "degraded";
250
+ cardType: CardType;
251
+ reason: string;
252
+ parseFailed: boolean;
253
+ meta: CardMeta;
254
+ } | {
255
+ status: "missing";
256
+ cardType: CardType;
257
+ reason: string;
258
+ };
259
+ /**
260
+ * The post-run synthesis aggregate. Consumed by Phase 5 (runner +
261
+ * cards), Phase 6 (CLI) and Phase 7 (Studio). Phase 1 lands the
262
+ * declarative shape; runtime construction lands in Phase 5.
263
+ */
264
+ export interface Diagnosis {
265
+ runId: RunId;
266
+ reportId: ReportId;
267
+ inputs: VersionedInputs;
268
+ cards: DiagnosisCard[];
269
+ /** ISO 8601 UTC timestamp. */
270
+ generatedAt: string;
271
+ }
@@ -0,0 +1,19 @@
1
+ /**
2
+ * Diagnosis core domain types — canonical shapes for the post-run
3
+ * synthesis layer (Doc 05).
4
+ *
5
+ * `Diagnosis.inputs` is the four-version cache envelope (VER-01); any
6
+ * segment bump invalidates a cached Diagnosis. `DiagnosisCard` is an
7
+ * outer-`status` discriminated union with a nested `cardType`
8
+ * discriminator inside the `ready` variant.
9
+ *
10
+ * Phase 1 lands placeholder body shapes; Phase 5 enriches each per
11
+ * AI-SPEC §3 and CONTEXT D-05/D-07. The `DiagnosisCard` discriminated
12
+ * union surface (arms + `cardType` literals) is stable — only the
13
+ * `body: <BodyInterface>` references resolve to richer shapes.
14
+ *
15
+ * @see docs/decisions/D0049-shared-confidence-contract.md
16
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
17
+ * @see docs/decisions/D0050-artifact-registry-post-hoc-versioned-extension.md
18
+ */
19
+ export {};
@@ -53,11 +53,26 @@ export interface PerspectiveDocRef {
53
53
  perspective: string;
54
54
  reason?: string;
55
55
  }
56
+ /**
57
+ * A single criterion within a templated llm-rubric assertion.
58
+ *
59
+ * The `id` is a stable, slug-formatted identifier — auto-derived from
60
+ * `text` in Studio (via slug `options.source: "text"`), or backfilled
61
+ * from Sanity's `_key` for pre-migration documents. Survives criterion
62
+ * text edits; downstream judgments and diagnosis cards reference by
63
+ * `id` per D0052 (judgment-ref granularity).
64
+ */
65
+ export interface CriterionRef {
66
+ /** Stable per-criterion identifier — slug-format (`[a-z0-9][a-z0-9-]*`). */
67
+ id: string;
68
+ /** Author-facing criterion text (the original bullet). */
69
+ text: string;
70
+ }
56
71
  /** A templated assertion referencing a rubric template */
57
72
  export interface GeneralizedTemplatedAssertion {
58
73
  type: "llm-rubric";
59
74
  template: string;
60
- criteria: string[];
75
+ criteria: CriterionRef[];
61
76
  weight?: number;
62
77
  }
63
78
  /** A value-based assertion (contains, javascript, cost, latency, etc.) */