@sanity/ailf 6.0.0 → 6.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/config/airbyte/ai_literacy_framework.connector.yaml +276 -0
  2. package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql +42 -0
  3. package/dist/_vendor/ailf-core/artifact-registry.d.ts +17 -0
  4. package/dist/_vendor/ailf-core/artifact-registry.js +14 -0
  5. package/dist/_vendor/ailf-core/ports/context.d.ts +22 -0
  6. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -0
  7. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -0
  8. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +59 -0
  9. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +5 -1
  10. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +47 -3
  11. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +10 -0
  12. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +13 -0
  13. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +17 -1
  14. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +1 -1
  15. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +5 -1
  16. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +5 -1
  17. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +5 -1
  18. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +15 -2
  19. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +5 -3
  20. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +19 -31
  21. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  22. package/dist/_vendor/ailf-core/services/index.js +1 -1
  23. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +3 -0
  24. package/dist/_vendor/ailf-core/types/index.d.ts +7 -0
  25. package/dist/_vendor/ailf-core/types/repo-config.d.ts +16 -0
  26. package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts +101 -0
  27. package/dist/_vendor/ailf-core/types/synthesis-telemetry.js +18 -0
  28. package/dist/adapters/config-sources/file-config-adapter.js +8 -6
  29. package/dist/adapters/llm/index.d.ts +1 -1
  30. package/dist/adapters/llm/index.js +1 -1
  31. package/dist/adapters/llm/openai-llm-client.js +7 -2
  32. package/dist/adapters/llm/retry.d.ts +18 -0
  33. package/dist/adapters/llm/retry.js +21 -0
  34. package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts +49 -0
  35. package/dist/adapters/synthesis/synthesis-telemetry-schema.js +55 -0
  36. package/dist/adapters/task-sources/content-lake-task-source.js +10 -5
  37. package/dist/adapters/task-sources/repo-schemas.d.ts +7 -0
  38. package/dist/adapters/task-sources/repo-schemas.js +10 -0
  39. package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +11 -1
  40. package/dist/artifact-capture/api-gateway-artifact-writer.js +3 -1
  41. package/dist/artifact-capture/batching-api-gateway-artifact-writer.d.ts +11 -1
  42. package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +3 -1
  43. package/dist/artifact-capture/gcs-artifact-writer.d.ts +11 -1
  44. package/dist/artifact-capture/gcs-artifact-writer.js +6 -3
  45. package/dist/artifact-capture/local-fs-artifact-writer.d.ts +11 -1
  46. package/dist/artifact-capture/local-fs-artifact-writer.js +6 -3
  47. package/dist/commands/interpret.d.ts +21 -1
  48. package/dist/commands/interpret.js +13 -4
  49. package/dist/commands/pipeline-action.d.ts +44 -0
  50. package/dist/commands/pipeline-action.js +193 -1
  51. package/dist/commands/run.d.ts +2 -0
  52. package/dist/commands/run.js +2 -0
  53. package/dist/composition-root.d.ts +22 -5
  54. package/dist/composition-root.js +78 -8
  55. package/dist/orchestration/pipeline-orchestrator.js +3 -0
  56. package/dist/orchestration/steps/gap-analysis-step.js +0 -1
  57. package/dist/report-store.d.ts +40 -0
  58. package/dist/report-store.js +88 -0
  59. package/package.json +1 -1
@@ -225,6 +225,134 @@ definitions:
225
225
  schema:
226
226
  $ref: "#/schemas/area_scores"
227
227
 
228
+ # ------------------------------------------------------------------
229
+ # Stream 3: synthesis_summary — one row per report with synthesis telemetry
230
+ # ------------------------------------------------------------------
231
+ # GROQ projection emits cost, parse-failure counts, and rate from the
232
+ # summary.synthesis.diagnosis path written by the Phase-6 post-run hook.
233
+ # Rows are gated on defined(summary.synthesis.diagnosis) so reports that
234
+ # predate Phase 6 produce no rows (intent: a re-sync picks them up once
235
+ # backfilled — NOTE(review): the cursor is _createdAt, so confirm an incremental sync, not only a full refresh, sees them).
236
+ synthesis_summary:
237
+ type: DeclarativeStream
238
+ name: synthesis_summary
239
+ retriever:
240
+ type: SimpleRetriever
241
+ decoder:
242
+ type: JsonDecoder
243
+ requester:
244
+ $ref: "#/definitions/base_requester"
245
+ path: /v2026-03-12/data/query/{{ config['dataset'] }}
246
+ http_method: GET
247
+ request_parameters:
248
+ query: >-
249
+ *[_type=="ailf.report" && _createdAt > "{{
250
+ stream_interval.start_time or '1970-01-01T00:00:00Z' }}" &&
251
+ _createdAt <= "{{ stream_interval.end_time }}" &&
252
+ defined(summary.synthesis.diagnosis)]|order(_createdAt asc){
253
+ "report_id": reportId,
254
+ "completed_at": completedAt,
255
+ "mode": provenance.mode,
256
+ "source_name": provenance.source.name,
257
+ "grader_model": provenance.graderModel,
258
+ "synthesis_cost": summary.synthesis.diagnosis.cost,
259
+ "parse_failure_count":
260
+ summary.synthesis.diagnosis.parseFailureCount,
261
+ "parse_failure_rate":
262
+ summary.synthesis.diagnosis.parseFailureRate,
263
+ _createdAt
264
+ }
265
+ record_selector:
266
+ type: RecordSelector
267
+ extractor:
268
+ type: DpathExtractor
269
+ field_path:
270
+ - result
271
+ primary_key:
272
+ - report_id
273
+ incremental_sync:
274
+ type: DatetimeBasedCursor
275
+ cursor_field: _createdAt
276
+ cursor_datetime_formats:
277
+ - "%Y-%m-%dT%H:%M:%S.%fZ"
278
+ - "%Y-%m-%dT%H:%M:%SZ"
279
+ datetime_format: "%Y-%m-%dT%H:%M:%SZ"
280
+ start_datetime:
281
+ type: MinMaxDatetime
282
+ datetime: "{{ config.get('start_date', '2026-01-01T00:00:00Z') }}"
283
+ datetime_format: "%Y-%m-%dT%H:%M:%SZ"
284
+ step: P30D
285
+ cursor_granularity: PT1S
286
+ schema_loader:
287
+ type: InlineSchemaLoader
288
+ schema:
289
+ $ref: "#/schemas/synthesis_summary"
290
+
291
+ # ------------------------------------------------------------------
292
+ # Stream 4: synthesis_per_card — one row per report with per-card array
293
+ # ------------------------------------------------------------------
294
+ # GROQ projection emits the nested perCard array. GROQ cannot explode
295
+ # arrays into flat rows, so the nesting is preserved — BigQuery consumers
296
+ # should UNNEST(JSON_QUERY_ARRAY(per_card)) to get flat rows per card.
297
+ # primary_key is report_id only (not compound) for the same reason.
298
+ synthesis_per_card:
299
+ type: DeclarativeStream
300
+ name: synthesis_per_card
301
+ retriever:
302
+ type: SimpleRetriever
303
+ decoder:
304
+ type: JsonDecoder
305
+ requester:
306
+ $ref: "#/definitions/base_requester"
307
+ path: /v2026-03-12/data/query/{{ config['dataset'] }}
308
+ http_method: GET
309
+ request_parameters:
310
+ query: >-
311
+ *[_type=="ailf.report" && _createdAt > "{{
312
+ stream_interval.start_time or '1970-01-01T00:00:00Z' }}" &&
313
+ _createdAt <= "{{ stream_interval.end_time }}" &&
314
+ defined(summary.synthesis.diagnosis.perCard)]|order(_createdAt
315
+ asc){
316
+ "report_id": reportId,
317
+ "completed_at": completedAt,
318
+ "per_card": summary.synthesis.diagnosis.perCard[]{
319
+ "card_type": cardType,
320
+ "cost": cost,
321
+ "parse_failed": parseFailed,
322
+ "latency_ms": latencyMs,
323
+ "token_input": tokenInput,
324
+ "token_output": tokenOutput,
325
+ "card_version": cardVersion,
326
+ "generated_at": generatedAt
327
+ },
328
+ _createdAt
329
+ }
330
+ record_selector:
331
+ type: RecordSelector
332
+ extractor:
333
+ type: DpathExtractor
334
+ field_path:
335
+ - result
336
+ primary_key:
337
+ - report_id
338
+ incremental_sync:
339
+ type: DatetimeBasedCursor
340
+ cursor_field: _createdAt
341
+ cursor_datetime_formats:
342
+ - "%Y-%m-%dT%H:%M:%S.%fZ"
343
+ - "%Y-%m-%dT%H:%M:%SZ"
344
+ datetime_format: "%Y-%m-%dT%H:%M:%SZ"
345
+ start_datetime:
346
+ type: MinMaxDatetime
347
+ datetime: "{{ config.get('start_date', '2026-01-01T00:00:00Z') }}"
348
+ datetime_format: "%Y-%m-%dT%H:%M:%SZ"
349
+ step: P30D
350
+ cursor_granularity: PT1S
351
+ schema_loader:
352
+ type: InlineSchemaLoader
353
+ schema:
354
+ $ref: "#/schemas/synthesis_per_card"
355
+
228
356
  base_requester:
229
357
  type: HttpRequester
230
358
  url_base: https://{{ config['project_id'] }}.api.sanity.io
@@ -235,6 +363,8 @@ definitions:
235
363
  streams:
236
364
  - $ref: "#/definitions/streams/reports"
237
365
  - $ref: "#/definitions/streams/area_scores"
366
+ - $ref: "#/definitions/streams/synthesis_summary"
367
+ - $ref: "#/definitions/streams/synthesis_per_card"
238
368
 
239
369
  spec:
240
370
  type: Spec
@@ -299,9 +429,25 @@ metadata:
299
429
  primaryKeysAreUnique: true
300
430
  primaryKeysArePresent: true
301
431
  responsesAreSuccessful: true
432
+ synthesis_summary:
433
+ hasRecords: true
434
+ streamHash: null
435
+ hasResponse: true
436
+ primaryKeysAreUnique: true
437
+ primaryKeysArePresent: true
438
+ responsesAreSuccessful: true
439
+ synthesis_per_card:
440
+ hasRecords: true
441
+ streamHash: null
442
+ hasResponse: true
443
+ primaryKeysAreUnique: true
444
+ primaryKeysArePresent: true
445
+ responsesAreSuccessful: true
302
446
  autoImportSchema:
303
447
  reports: false
304
448
  area_scores: false
449
+ synthesis_summary: false
450
+ synthesis_per_card: false
305
451
 
306
452
  # ======================================================================
307
453
  # Inline schemas — manually defined to match the designed BigQuery tables.
@@ -757,3 +903,133 @@ schemas:
757
903
  - "null"
758
904
  description: Sanity document creation timestamp (incremental cursor)
759
905
  additionalProperties: true
906
+
907
+ # ------------------------------------------------------------------
908
+ # synthesis_summary schema — flat, one row per report with synthesis telemetry
909
+ # ------------------------------------------------------------------
910
+ synthesis_summary:
911
+ type: object
912
+ $schema: http://json-schema.org/schema#
913
+ required:
914
+ - report_id
915
+ properties:
916
+ report_id:
917
+ type: string
918
+ description: UUID v7 report identifier (primary key)
919
+ completed_at:
920
+ type:
921
+ - string
922
+ - "null"
923
+ description: ISO 8601 timestamp when the evaluation completed
924
+ mode:
925
+ type:
926
+ - string
927
+ - "null"
928
+ description: "Evaluation mode: baseline, observed, or agentic"
929
+ source_name:
930
+ type:
931
+ - string
932
+ - "null"
933
+ description: Documentation source name (e.g., "production")
934
+ grader_model:
935
+ type:
936
+ - string
937
+ - "null"
938
+ description: Model used for LLM grading (context for cost comparison)
939
+ synthesis_cost:
940
+ type:
941
+ - number
942
+ - "null"
943
+ description:
944
+ Total USD cost of the Diagnosis synthesis run (sum of all card costs)
945
+ parse_failure_count:
946
+ type:
947
+ - number
948
+ - "null"
949
+ description:
950
+ Number of cards that failed Zod schema parse during synthesis
951
+ parse_failure_rate:
952
+ type:
953
+ - number
954
+ - "null"
955
+ description:
956
+ Fraction of cards that failed parse (0–1); 0.0 = no failures
957
+ _createdAt:
958
+ type:
959
+ - string
960
+ - "null"
961
+ description:
962
+ Sanity document creation timestamp (used as incremental cursor)
963
+ additionalProperties: true
964
+
965
+ # ------------------------------------------------------------------
966
+ # synthesis_per_card schema — nested per-card array, one row per report
967
+ # ------------------------------------------------------------------
968
+ # BigQuery consumers should UNNEST(JSON_QUERY_ARRAY(per_card)) to get
969
+ # flat rows per (report × card). See bigquery/views/synthesis_parse_failure_rate_7d.sql
970
+ synthesis_per_card:
971
+ type: object
972
+ $schema: http://json-schema.org/schema#
973
+ required:
974
+ - report_id
975
+ properties:
976
+ report_id:
977
+ type: string
978
+ description: UUID v7 report identifier (primary key)
979
+ completed_at:
980
+ type:
981
+ - string
982
+ - "null"
983
+ description: Denormalized timestamp for partitioning
984
+ per_card:
985
+ type:
986
+ - array
987
+ - "null"
988
+ description: >-
989
+ Per-card synthesis metrics array. UNNEST in BigQuery to get one flat
990
+ row per card. card_type identifies the diagnosis card type (e.g.,
991
+ "top-recommendations").
992
+ items:
993
+ type: object
994
+ properties:
995
+ card_type:
996
+ type: string
997
+ description: Diagnosis card type identifier (≤25 chars)
998
+ cost:
999
+ type:
1000
+ - number
1001
+ - "null"
1002
+ description:
1003
+ USD cost of this card's LLM call (null for deterministic cards)
1004
+ parse_failed:
1005
+ type: boolean
1006
+ description: Whether the card's Zod schema parse failed
1007
+ latency_ms:
1008
+ type:
1009
+ - number
1010
+ - "null"
1011
+ description: LLM call latency in milliseconds
1012
+ token_input:
1013
+ type:
1014
+ - number
1015
+ - "null"
1016
+ description: Input tokens consumed by the LLM call
1017
+ token_output:
1018
+ type:
1019
+ - number
1020
+ - "null"
1021
+ description: Output tokens produced by the LLM call
1022
+ card_version:
1023
+ type: string
1024
+ description:
1025
+ Card implementation version (e.g., "area-summary@0.1.0")
1026
+ generated_at:
1027
+ type: string
1028
+ description: ISO 8601 UTC timestamp when this card was generated
1029
+ _createdAt:
1030
+ type:
1031
+ - string
1032
+ - "null"
1033
+ description:
1034
+ Sanity document creation timestamp (used as incremental cursor)
1035
+ additionalProperties: true
@@ -0,0 +1,42 @@
1
+ -- ailf.synthesis_parse_failure_rate_7d — per-card parse-failure rate over 7 days
2
+ --
3
+ -- Computes the Zod-schema parse-failure rate per Diagnosis card type over the
4
+ -- previous 7 days, sourced from the synthesis_per_card Airbyte stream. Any row
5
+ -- returned by this view represents a card type that breached the 2% threshold
6
+ -- defined in D6-18 and should trigger a manual investigation per the runbook.
7
+ --
8
+ -- Source: ailf_raw.synthesis_per_card (Airbyte stream: "synthesis_per_card")
9
+ -- Target: ailf.synthesis_parse_failure_rate_7d (this view)
10
+ --
11
+ -- Threshold: failure_rate > 0.02 (2%) over INTERVAL 7 DAY [D6-18]
12
+ -- To change the threshold, edit the HAVING clause (failure-rate cutoff) and/or the
13
+ -- WHERE clause (7-day window) below; these are the only edit points per D6-18 (not lifted to config).
14
+ --
15
+ -- Usage:
16
+ -- bq query --use_legacy_sql=false < views/synthesis_parse_failure_rate_7d.sql
17
+ --
18
+ -- @see docs/runbooks/diagnosis-parse-failure-watch.md — operator runbook
19
+ -- @see packages/eval/config/airbyte/ai_literacy_framework.connector.yaml — synthesis_per_card stream
20
+
21
+ CREATE OR REPLACE VIEW `data-platform-302218.ailf.synthesis_parse_failure_rate_7d` AS
22
+ SELECT
23
+ JSON_VALUE(card, '$.card_type') AS card_type,
24
+ COUNT(*) AS total_runs,
25
+ COUNTIF(SAFE_CAST(JSON_VALUE(card, '$.parse_failed') AS BOOL)) AS parse_failures,
26
+ ROUND(SAFE_DIVIDE(
27
+ COUNTIF(SAFE_CAST(JSON_VALUE(card, '$.parse_failed') AS BOOL)),
28
+ COUNT(*)
29
+ ), 4) AS failure_rate
30
+ FROM
31
+ `data-platform-302218.ailf_raw.synthesis_per_card` AS r,
32
+ UNNEST(JSON_QUERY_ARRAY(r.per_card)) AS card
33
+ WHERE
34
+ r.completed_at IS NOT NULL
35
+ AND TIMESTAMP(r.completed_at) >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY)
36
+ AND JSON_VALUE(card, '$.card_type') IS NOT NULL
37
+ GROUP BY
38
+ card_type
39
+ HAVING
40
+ failure_rate > 0.02
41
+ ORDER BY
42
+ failure_rate DESC
@@ -200,6 +200,23 @@ export interface ArtifactDescriptor<TEntry = unknown, TPreview = unknown> {
200
200
  * descriptors. `entryKey` is ignored on versioned bulk paths.
201
201
  */
202
202
  readonly objectPath: ArtifactObjectPath;
203
+ /**
204
+ * Extract the positional args (beyond `runId`) to pass to `objectPath`
205
+ * when this descriptor needs more than `runId` to build its path.
206
+ *
207
+ * The default plain-bulk path is `objectPath(runId)`. The default
208
+ * per-entry path is `objectPath(runId, entryKey)`. Descriptors whose
209
+ * path needs additional axes from `association` and/or fields from the
210
+ * payload (e.g. the bulk-versioned-with-report-axis carve-out used by
211
+ * the diagnosis descriptor — see `BULK_VERSIONED_WITH_REPORT_AXIS`)
212
+ * set this function. The writer calls it and spreads the result:
213
+ * `objectPath(runId, ...extractPathArgs(association, payload))`.
214
+ *
215
+ * Returning `[undefined]` (or any `undefined` element) is fine — the
216
+ * underlying builder is expected to throw a meaningful error in that
217
+ * case, which the writer surfaces via its existing try/catch.
218
+ */
219
+ readonly extractPathArgs?: (association: AssociationValues, payload: unknown) => readonly (string | undefined)[];
203
220
  /**
204
221
  * Build a filename-safe entry key from association values. Only meaningful
205
222
  * for `layout === "per-entry"` — bulk descriptors omit it.
@@ -674,6 +674,7 @@ function buildDescriptor(input) {
674
674
  versionedBy: input.versionedBy,
675
675
  pathSafetyMarker: input.pathSafetyMarker,
676
676
  objectPath,
677
+ extractPathArgs: input.extractPathArgs,
677
678
  formatEntryKey,
678
679
  parseEntryKey,
679
680
  manifestPreview: input.manifestPreview,
@@ -1185,6 +1186,19 @@ export const ARTIFACT_REGISTRY = {
1185
1186
  writePolicy: "post-hoc",
1186
1187
  versionedBy: "diagnosisVersion",
1187
1188
  objectPath: diagnosisPathBuilder(),
1189
+ // The diagnosis path builder takes (runId, reportId, version). The
1190
+ // reportId comes from association.report; the version is a compound
1191
+ // `diagnosisVersion|cardVersion` synthesized from the payload's
1192
+ // `inputs` field via `encodeDiagnosisPathVersion`. Writers spread
1193
+ // the returned args after `runId` when calling `objectPath`.
1194
+ extractPathArgs: (assoc, payload) => {
1195
+ const reportId = typeof assoc.report === "string" ? assoc.report : undefined;
1196
+ const diag = payload;
1197
+ const dv = diag?.inputs?.diagnosisVersion;
1198
+ const cv = diag?.inputs?.cardVersion;
1199
+ const version = dv && cv ? encodeDiagnosisPathVersion(dv, cv) : undefined;
1200
+ return [reportId, version];
1201
+ },
1188
1202
  // Defense-in-depth: this descriptor's axes (`run`, `report`) are both
1189
1203
  // bounded, so the `assertValidArtifactDescriptor` unbounded-axis rule
1190
1204
  // does not fire and the carve-out is never consulted at module load
@@ -237,6 +237,14 @@ export interface ResolvedConfig {
237
237
  * Sourced from AILF_ARTIFACT_UPLOAD env var or `artifactUpload` in ailf.config.ts.
238
238
  */
239
239
  artifactUpload?: boolean;
240
+ /**
241
+ * Post-run diagnosis summary policy (Phase 6 / DIAG-06). Carried from
242
+ * `.ailf/config.yaml` summary.onRun via RepoConfigSchema (CLI path) or
243
+ * EvalConfigSchema (file-config path). Precedence resolution is deferred to
244
+ * shouldRunPostSummary() at execution time — this field carries only the
245
+ * config-file signal.
246
+ */
247
+ summaryOnRun?: "auto" | "always" | "never";
240
248
  }
241
249
  /**
242
250
  * Application context — the complete dependency carrier.
@@ -331,6 +339,20 @@ export interface ReportStorePort {
331
339
  findComparableBaseline(query: unknown): Promise<null | unknown>;
332
340
  /** Write a report to the store */
333
341
  write(report: unknown): Promise<unknown>;
342
+ /** Read a report by its ID (used by the post-run diagnosis hook). */
343
+ read(id: string): Promise<null | unknown>;
344
+ /** Patch synthesis telemetry onto a published report (Phase 6 / DIAG-06). */
345
+ patchSynthesis(id: string, telemetry: unknown): Promise<void>;
346
+ /**
347
+ * Patch a single artifact-manifest entry onto a published report.
348
+ *
349
+ * Used by deferred commands (e.g. `ailf interpret`) whose post-hoc writer
350
+ * produces a new ArtifactRef after the doc was already published. The
351
+ * pipeline path lifts the full manifest at publish time
352
+ * (publish-report-step); this is the post-hoc equivalent for one slot.
353
+ * Non-fatal on Sanity failure — mirrors `patchSynthesis`.
354
+ */
355
+ patchArtifactManifest(id: string, slot: string, ref: unknown): Promise<void>;
334
356
  }
335
357
  /**
336
358
  * Minimal report sink interface used by AppContext.
@@ -88,6 +88,13 @@ export declare const EvalConfigSchema: z.ZodObject<{
88
88
  skipEval: z.ZodOptional<z.ZodBoolean>;
89
89
  skipFetch: z.ZodOptional<z.ZodBoolean>;
90
90
  source: z.ZodOptional<z.ZodString>;
91
+ summary: z.ZodOptional<z.ZodObject<{
92
+ onRun: z.ZodOptional<z.ZodEnum<{
93
+ never: "never";
94
+ always: "always";
95
+ auto: "auto";
96
+ }>>;
97
+ }, z.core.$strip>>;
91
98
  tasks: z.ZodOptional<z.ZodArray<z.ZodString>>;
92
99
  urls: z.ZodOptional<z.ZodArray<z.ZodString>>;
93
100
  presets: z.ZodOptional<z.ZodArray<z.ZodString>>;
@@ -175,6 +175,14 @@ export const EvalConfigSchema = z
175
175
  skipFetch: z.boolean().optional(),
176
176
  /** Documentation source name */
177
177
  source: z.string().optional(),
178
+ /**
179
+ * Post-run diagnosis summary policy (Phase 6 / DIAG-06). Mirrors
180
+ * `RepoConfigSchema`'s `summary` block for `--config <path>` parity.
181
+ * Precedence resolved at the CLI layer by `shouldRunPostSummary()`.
182
+ */
183
+ summary: z
184
+ .object({ onRun: z.enum(["auto", "always", "never"]).optional() })
185
+ .optional(),
178
186
  /** Task ID filter */
179
187
  tasks: z.array(z.string()).optional(),
180
188
  /** Doc source URL overrides */
@@ -169,3 +169,62 @@ describe("generateFailureModeSummary — empty failure modes (Test 9)", () => {
169
169
  }
170
170
  });
171
171
  });
172
+ describe("generateFailureModeSummary — cross-cutting mode resolution", () => {
173
+ // `missing-docs` appears in both LITERACY_FAILURE_MODES and MCP_FAILURE_MODES.
174
+ // Before the per-EvalMode preference table, the linear scan of CANONICAL_DIMENSIONS
175
+ // always resolved it to task-completion, mislabelling MCP-only runs.
176
+ function withMode(report, mode) {
177
+ return {
178
+ ...report,
179
+ provenance: { ...report.provenance, mode },
180
+ };
181
+ }
182
+ it("resolves 'missing-docs' to mcp-behavior on an mcp-server report", async () => {
183
+ const base = makeReport({
184
+ counts: { "mcp-behavior": 7 },
185
+ topTitles: [
186
+ {
187
+ id: "mcp-behavior::missing-docs",
188
+ category: "missing-docs",
189
+ severity: "high",
190
+ title: "missing-docs",
191
+ count: 7,
192
+ },
193
+ ],
194
+ totalJudgments: 20,
195
+ classificationRate: 0.35,
196
+ });
197
+ const report = withMode(base, "mcp-server");
198
+ const card = await generateFailureModeSummary(report, makeCtx());
199
+ expect(card.status).toBe("ready");
200
+ if (card.status === "ready") {
201
+ const body = card.body;
202
+ expect(body.dimension).toBe("mcp-behavior");
203
+ expect(body.failureMode).toBe("missing-docs");
204
+ }
205
+ });
206
+ it("resolves 'missing-docs' to task-completion on a literacy report", async () => {
207
+ const base = makeReport({
208
+ counts: { "task-completion": 7 },
209
+ topTitles: [
210
+ {
211
+ id: "task-completion::missing-docs",
212
+ category: "missing-docs",
213
+ severity: "high",
214
+ title: "missing-docs",
215
+ count: 7,
216
+ },
217
+ ],
218
+ totalJudgments: 20,
219
+ classificationRate: 0.35,
220
+ });
221
+ const report = withMode(base, "literacy");
222
+ const card = await generateFailureModeSummary(report, makeCtx());
223
+ expect(card.status).toBe("ready");
224
+ if (card.status === "ready") {
225
+ const body = card.body;
226
+ expect(body.dimension).toBe("task-completion");
227
+ expect(body.failureMode).toBe("missing-docs");
228
+ }
229
+ });
230
+ });
@@ -79,7 +79,9 @@ export const generateDocAttributionSpotlight = async (report, ctx) => {
79
79
  .max(5),
80
80
  });
81
81
  const prompt = buildDocAttributionSpotlightPrompt(report, ctx.judgmentAttributions);
82
- const { value, usage } = await ctx.llm.completeStructured({
82
+ // Destructure `cost` and `model` from the LLMClient return —
83
+ // already provided per llm-client.ts:139-144, previously discarded.
84
+ const { value, usage, cost, model } = await ctx.llm.completeStructured({
83
85
  model: CARD_MODEL,
84
86
  prompt: `${prompt.system}\n\n${prompt.user}`,
85
87
  schema: PerCallSchema,
@@ -99,6 +101,8 @@ export const generateDocAttributionSpotlight = async (report, ctx) => {
99
101
  cardVersion: "doc-attribution-spotlight@0.1.0",
100
102
  tokenUsage: { input: usage.promptTokens, output: usage.completionTokens },
101
103
  generatedAt: new Date().toISOString(),
104
+ cost,
105
+ model,
102
106
  },
103
107
  };
104
108
  };
@@ -38,10 +38,54 @@ export const FailureModeSummaryBodySchema = z
38
38
  // Private helper — find the dimension a failure mode belongs to
39
39
  // ---------------------------------------------------------------------------
40
40
  /**
41
- * Find the first canonical dimension whose taxonomy includes `mode`.
41
+ * Per-family dimension preference order.
42
+ *
43
+ * Some failure modes (e.g., `missing-docs`) appear in multiple dimension
44
+ * families. When we know the report's eval mode, we should resolve the failure mode
45
+ * to a dimension in the matching family first, falling back to the linear
46
+ * scan only when the mode-preferred family doesn't carry the failure mode.
47
+ */
48
+ const MODE_TO_PREFERRED_DIMENSIONS = {
49
+ literacy: ["task-completion", "code-correctness", "doc-coverage"],
50
+ "mcp-server": [
51
+ "mcp-behavior",
52
+ "input-validation",
53
+ "output-correctness",
54
+ "error-handling",
55
+ "security",
56
+ ],
57
+ "knowledge-probe": [
58
+ "knowledge-probe",
59
+ "factual-correctness",
60
+ "completeness",
61
+ "currency",
62
+ ],
63
+ "agent-harness": [
64
+ "agent-harness",
65
+ "process-quality",
66
+ "agent-output",
67
+ "tool-usage",
68
+ ],
69
+ custom: [],
70
+ };
71
+ /**
72
+ * Find a canonical dimension whose taxonomy includes `mode`. When a
73
+ * `preferredEvalMode` is supplied, prefer dimensions in the eval mode's
74
+ * family — e.g. a failure mode appearing in both literacy and MCP resolves to MCP on
75
+ * an MCP-only run. Falls back to the linear scan of CANONICAL_DIMENSIONS so
76
+ * cross-cutting modes (and modes from unknown eval modes) still resolve.
77
+ *
42
78
  * Returns `undefined` if the mode is not in any dimension's taxonomy.
43
79
  */
44
- function findDimensionForMode(mode) {
80
+ function findDimensionForMode(mode, preferredEvalMode) {
81
+ if (preferredEvalMode) {
82
+ const preferred = MODE_TO_PREFERRED_DIMENSIONS[preferredEvalMode] ?? [];
83
+ for (const dim of preferred) {
84
+ if (failureModesForDimension(dim).includes(mode)) {
85
+ return dim;
86
+ }
87
+ }
88
+ }
45
89
  for (const dim of CANONICAL_DIMENSIONS) {
46
90
  if (failureModesForDimension(dim).includes(mode)) {
47
91
  return dim;
@@ -67,7 +111,7 @@ export const generateFailureModeSummary = async (report) => {
67
111
  // Find the top entry — topTitles is already sorted by count descending
68
112
  const topEntry = slimFm.topTitles.reduce((best, entry) => (entry.count > best.count ? entry : best), slimFm.topTitles[0]);
69
113
  const failureMode = topEntry.category;
70
- const dimension = findDimensionForMode(failureMode);
114
+ const dimension = findDimensionForMode(failureMode, report.provenance?.mode);
71
115
  if (!dimension) {
72
116
  return {
73
117
  status: "missing",
@@ -25,6 +25,16 @@ import { generateDocAttributionSpotlight } from "./doc-attribution-spotlight.js"
25
25
  import { generateRegressionVsBaseline } from "./regression-vs-baseline.js";
26
26
  import type { CardGenerator } from "../../diagnosis-runner.js";
27
27
  import type { CardType } from "../../../types/diagnosis.js";
28
+ /**
29
+ * Canonical version of the card-registry surface — bumped whenever any card
30
+ * generator or body schema in this barrel changes. Used as the `cardVersion`
31
+ * fallback in version-resolver helpers (CLI `interpret`, API
32
+ * `versionsFromRecord`) so the four-version cache invalidation envelope stays
33
+ * in sync with the actual registry.
34
+ *
35
+ * Mirrors the pattern of `diagnosisVersion` exported from `diagnosis-runner.ts`.
36
+ */
37
+ export declare const CARD_REGISTRY_VERSION = "0.1.0";
28
38
  /**
29
39
  * The canonical card-generator registry for the diagnosis engine.
30
40
  *
@@ -24,6 +24,19 @@ import { generateLowConfidenceAttribution } from "./low-confidence-attribution.j
24
24
  import { generateDocAttributionSpotlight } from "./doc-attribution-spotlight.js";
25
25
  import { generateRegressionVsBaseline } from "./regression-vs-baseline.js";
26
26
  // ---------------------------------------------------------------------------
27
+ // Card registry version (cache invalidation segment)
28
+ // ---------------------------------------------------------------------------
29
+ /**
30
+ * Canonical version of the card-registry surface — bumped whenever any card
31
+ * generator or body schema in this barrel changes. Used as the `cardVersion`
32
+ * fallback in version-resolver helpers (CLI `interpret`, API
33
+ * `versionsFromRecord`) so the four-version cache invalidation envelope stays
34
+ * in sync with the actual registry.
35
+ *
36
+ * Mirrors the pattern of `diagnosisVersion` exported from `diagnosis-runner.ts`.
37
+ */
38
+ export const CARD_REGISTRY_VERSION = "0.1.0";
39
+ // ---------------------------------------------------------------------------
27
40
  // DIAGNOSIS_CARD_GENERATORS — full 8-card registry literal
28
41
  // ---------------------------------------------------------------------------
29
42
  /**