@sanity/ailf 5.0.0 → 6.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/config/airbyte/ai_literacy_framework.connector.yaml +276 -0
  2. package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql +42 -0
  3. package/config/diagnosis-cards.ts +318 -0
  4. package/config/models.ts +12 -0
  5. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
  6. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
  7. package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
  8. package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
  9. package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
  10. package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
  11. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
  12. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
  13. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
  14. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
  15. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
  16. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
  17. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  18. package/dist/_vendor/ailf-core/index.js +4 -0
  19. package/dist/_vendor/ailf-core/ports/context.d.ts +12 -0
  20. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -0
  21. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -0
  22. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
  23. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
  24. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
  25. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
  26. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
  27. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +230 -0
  28. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
  29. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
  30. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
  31. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
  32. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
  33. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +108 -0
  34. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
  35. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +140 -0
  36. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +49 -0
  37. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +65 -0
  38. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
  39. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +93 -0
  40. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
  41. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
  42. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
  43. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +130 -0
  44. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
  45. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +111 -0
  46. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
  47. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +118 -0
  48. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
  49. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +286 -0
  50. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
  51. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
  52. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
  53. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
  54. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
  55. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
  56. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
  57. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
  58. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
  59. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
  60. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +18 -0
  61. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +74 -0
  62. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +10 -0
  63. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +10 -0
  64. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +119 -2
  65. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +136 -2
  66. package/dist/_vendor/ailf-core/services/index.d.ts +5 -1
  67. package/dist/_vendor/ailf-core/services/index.js +15 -2
  68. package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
  69. package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
  70. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +115 -10
  71. package/dist/_vendor/ailf-core/types/diagnosis.js +3 -1
  72. package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
  73. package/dist/_vendor/ailf-core/types/repo-config.d.ts +16 -0
  74. package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts +101 -0
  75. package/dist/_vendor/ailf-core/types/synthesis-telemetry.js +18 -0
  76. package/dist/adapters/config-sources/file-config-adapter.js +8 -6
  77. package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
  78. package/dist/adapters/llm/fake-llm-client.js +38 -1
  79. package/dist/adapters/llm/index.d.ts +1 -1
  80. package/dist/adapters/llm/index.js +1 -1
  81. package/dist/adapters/llm/openai-llm-client.js +59 -5
  82. package/dist/adapters/llm/retry.d.ts +18 -0
  83. package/dist/adapters/llm/retry.js +21 -0
  84. package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts +49 -0
  85. package/dist/adapters/synthesis/synthesis-telemetry-schema.js +55 -0
  86. package/dist/adapters/task-sources/content-lake-task-source.js +10 -5
  87. package/dist/adapters/task-sources/repo-schemas.d.ts +7 -0
  88. package/dist/adapters/task-sources/repo-schemas.js +10 -0
  89. package/dist/cli-program.js +3 -0
  90. package/dist/commands/interpret.d.ts +70 -0
  91. package/dist/commands/interpret.js +221 -0
  92. package/dist/commands/pipeline-action.d.ts +44 -0
  93. package/dist/commands/pipeline-action.js +193 -1
  94. package/dist/commands/run.d.ts +2 -0
  95. package/dist/commands/run.js +2 -0
  96. package/dist/composition-root.d.ts +21 -23
  97. package/dist/composition-root.js +107 -41
  98. package/dist/config/diagnosis-cards.ts +318 -0
  99. package/dist/config/models.ts +12 -0
  100. package/dist/grader/agent-harness.d.ts +5 -10
  101. package/dist/grader/agent-harness.js +5 -13
  102. package/dist/grader/common.d.ts +5 -13
  103. package/dist/grader/common.js +5 -17
  104. package/dist/grader/index.d.ts +15 -29
  105. package/dist/grader/index.js +15 -66
  106. package/dist/grader/knowledge-probe.d.ts +5 -10
  107. package/dist/grader/knowledge-probe.js +5 -14
  108. package/dist/grader/literacy.d.ts +5 -9
  109. package/dist/grader/literacy.js +5 -13
  110. package/dist/grader/mcp.d.ts +5 -10
  111. package/dist/grader/mcp.js +5 -14
  112. package/dist/orchestration/pipeline-orchestrator.js +3 -0
  113. package/dist/report-store.d.ts +26 -0
  114. package/dist/report-store.js +63 -0
  115. package/package.json +2 -2
@@ -225,6 +225,134 @@ definitions:
225
225
  schema:
226
226
  $ref: "#/schemas/area_scores"
227
227
 
228
+ # ------------------------------------------------------------------
229
+ # Stream 3: synthesis_summary — one row per report with synthesis telemetry
230
+ # ------------------------------------------------------------------
231
+ # GROQ projection emits cost, parse-failure counts, and rate from the
232
+ # summary.synthesis.diagnosis path written by the Phase-6 post-run hook.
233
+ # Rows are gated on defined(summary.synthesis.diagnosis) so reports that
234
+ # predate Phase 6 produce no rows. Once backfilled they are picked up on
235
+ # re-sync. NOTE(review): the cursor field is _createdAt, so this presumably requires a cursor reset / full re-sync — confirm.
236
+ synthesis_summary:
237
+ type: DeclarativeStream
238
+ name: synthesis_summary
239
+ retriever:
240
+ type: SimpleRetriever
241
+ decoder:
242
+ type: JsonDecoder
243
+ requester:
244
+ $ref: "#/definitions/base_requester"
245
+ path: /v2026-03-12/data/query/{{ config['dataset'] }}
246
+ http_method: GET
247
+ request_parameters:
248
+ query: >-
249
+ *[_type=="ailf.report" && _createdAt > "{{
250
+ stream_interval.start_time or '1970-01-01T00:00:00Z' }}" &&
251
+ _createdAt <= "{{ stream_interval.end_time }}" &&
252
+ defined(summary.synthesis.diagnosis)]|order(_createdAt asc){
253
+ "report_id": reportId,
254
+ "completed_at": completedAt,
255
+ "mode": provenance.mode,
256
+ "source_name": provenance.source.name,
257
+ "grader_model": provenance.graderModel,
258
+ "synthesis_cost": summary.synthesis.diagnosis.cost,
259
+ "parse_failure_count":
260
+ summary.synthesis.diagnosis.parseFailureCount,
261
+ "parse_failure_rate":
262
+ summary.synthesis.diagnosis.parseFailureRate,
263
+ _createdAt
264
+ }
265
+ record_selector:
266
+ type: RecordSelector
267
+ extractor:
268
+ type: DpathExtractor
269
+ field_path:
270
+ - result
271
+ primary_key:
272
+ - report_id
273
+ incremental_sync:
274
+ type: DatetimeBasedCursor
275
+ cursor_field: _createdAt
276
+ cursor_datetime_formats:
277
+ - "%Y-%m-%dT%H:%M:%S.%fZ"
278
+ - "%Y-%m-%dT%H:%M:%SZ"
279
+ datetime_format: "%Y-%m-%dT%H:%M:%SZ"
280
+ start_datetime:
281
+ type: MinMaxDatetime
282
+ datetime: "{{ config.get('start_date', '2026-01-01T00:00:00Z') }}"
283
+ datetime_format: "%Y-%m-%dT%H:%M:%SZ"
284
+ step: P30D
285
+ cursor_granularity: PT1S
286
+ schema_loader:
287
+ type: InlineSchemaLoader
288
+ schema:
289
+ $ref: "#/schemas/synthesis_summary"
290
+
291
+ # ------------------------------------------------------------------
292
+ # Stream 4: synthesis_per_card — one row per report with per-card array
293
+ # ------------------------------------------------------------------
294
+ # GROQ projection emits the nested perCard array. GROQ cannot explode
295
+ # arrays into flat rows, so the nesting is preserved — BigQuery consumers
296
+ # should UNNEST(JSON_QUERY_ARRAY(per_card)) to get flat rows per card.
297
+ # primary_key is report_id only (not compound) for the same reason.
298
+ synthesis_per_card:
299
+ type: DeclarativeStream
300
+ name: synthesis_per_card
301
+ retriever:
302
+ type: SimpleRetriever
303
+ decoder:
304
+ type: JsonDecoder
305
+ requester:
306
+ $ref: "#/definitions/base_requester"
307
+ path: /v2026-03-12/data/query/{{ config['dataset'] }}
308
+ http_method: GET
309
+ request_parameters:
310
+ query: >-
311
+ *[_type=="ailf.report" && _createdAt > "{{
312
+ stream_interval.start_time or '1970-01-01T00:00:00Z' }}" &&
313
+ _createdAt <= "{{ stream_interval.end_time }}" &&
314
+ defined(summary.synthesis.diagnosis.perCard)]|order(_createdAt
315
+ asc){
316
+ "report_id": reportId,
317
+ "completed_at": completedAt,
318
+ "per_card": summary.synthesis.diagnosis.perCard[]{
319
+ "card_type": cardType,
320
+ "cost": cost,
321
+ "parse_failed": parseFailed,
322
+ "latency_ms": latencyMs,
323
+ "token_input": tokenInput,
324
+ "token_output": tokenOutput,
325
+ "card_version": cardVersion,
326
+ "generated_at": generatedAt
327
+ },
328
+ _createdAt
329
+ }
330
+ record_selector:
331
+ type: RecordSelector
332
+ extractor:
333
+ type: DpathExtractor
334
+ field_path:
335
+ - result
336
+ primary_key:
337
+ - report_id
338
+ incremental_sync:
339
+ type: DatetimeBasedCursor
340
+ cursor_field: _createdAt
341
+ cursor_datetime_formats:
342
+ - "%Y-%m-%dT%H:%M:%S.%fZ"
343
+ - "%Y-%m-%dT%H:%M:%SZ"
344
+ datetime_format: "%Y-%m-%dT%H:%M:%SZ"
345
+ start_datetime:
346
+ type: MinMaxDatetime
347
+ datetime: "{{ config.get('start_date', '2026-01-01T00:00:00Z') }}"
348
+ datetime_format: "%Y-%m-%dT%H:%M:%SZ"
349
+ step: P30D
350
+ cursor_granularity: PT1S
351
+ schema_loader:
352
+ type: InlineSchemaLoader
353
+ schema:
354
+ $ref: "#/schemas/synthesis_per_card"
355
+
228
356
  base_requester:
229
357
  type: HttpRequester
230
358
  url_base: https://{{ config['project_id'] }}.api.sanity.io
@@ -235,6 +363,8 @@ definitions:
235
363
  streams:
236
364
  - $ref: "#/definitions/streams/reports"
237
365
  - $ref: "#/definitions/streams/area_scores"
366
+ - $ref: "#/definitions/streams/synthesis_summary"
367
+ - $ref: "#/definitions/streams/synthesis_per_card"
238
368
 
239
369
  spec:
240
370
  type: Spec
@@ -299,9 +429,25 @@ metadata:
299
429
  primaryKeysAreUnique: true
300
430
  primaryKeysArePresent: true
301
431
  responsesAreSuccessful: true
432
+ synthesis_summary:
433
+ hasRecords: true
434
+ streamHash: null
435
+ hasResponse: true
436
+ primaryKeysAreUnique: true
437
+ primaryKeysArePresent: true
438
+ responsesAreSuccessful: true
439
+ synthesis_per_card:
440
+ hasRecords: true
441
+ streamHash: null
442
+ hasResponse: true
443
+ primaryKeysAreUnique: true
444
+ primaryKeysArePresent: true
445
+ responsesAreSuccessful: true
302
446
  autoImportSchema:
303
447
  reports: false
304
448
  area_scores: false
449
+ synthesis_summary: false
450
+ synthesis_per_card: false
305
451
 
306
452
  # ======================================================================
307
453
  # Inline schemas — manually defined to match the designed BigQuery tables.
@@ -757,3 +903,133 @@ schemas:
757
903
  - "null"
758
904
  description: Sanity document creation timestamp (incremental cursor)
759
905
  additionalProperties: true
906
+
907
+ # ------------------------------------------------------------------
908
+ # synthesis_summary schema — flat, one row per report with synthesis telemetry
909
+ # ------------------------------------------------------------------
910
+ synthesis_summary:
911
+ type: object
912
+ $schema: http://json-schema.org/schema#
913
+ required:
914
+ - report_id
915
+ properties:
916
+ report_id:
917
+ type: string
918
+ description: UUID v7 report identifier (primary key)
919
+ completed_at:
920
+ type:
921
+ - string
922
+ - "null"
923
+ description: ISO 8601 timestamp when the evaluation completed
924
+ mode:
925
+ type:
926
+ - string
927
+ - "null"
928
+ description: "Evaluation mode: baseline, observed, or agentic"
929
+ source_name:
930
+ type:
931
+ - string
932
+ - "null"
933
+ description: Documentation source name (e.g., "production")
934
+ grader_model:
935
+ type:
936
+ - string
937
+ - "null"
938
+ description: Model used for LLM grading (context for cost comparison)
939
+ synthesis_cost:
940
+ type:
941
+ - number
942
+ - "null"
943
+ description:
944
+ Total USD cost of the Diagnosis synthesis run (sum of all card costs)
945
+ parse_failure_count:
946
+ type:
947
+ - number
948
+ - "null"
949
+ description:
950
+ Number of cards that failed Zod schema parse during synthesis
951
+ parse_failure_rate:
952
+ type:
953
+ - number
954
+ - "null"
955
+ description:
956
+ Fraction of cards that failed parse (0–1); 0.0 = no failures
957
+ _createdAt:
958
+ type:
959
+ - string
960
+ - "null"
961
+ description:
962
+ Sanity document creation timestamp (used as incremental cursor)
963
+ additionalProperties: true
964
+
965
+ # ------------------------------------------------------------------
966
+ # synthesis_per_card schema — nested per-card array, one row per report
967
+ # ------------------------------------------------------------------
968
+ # BigQuery consumers should UNNEST(JSON_QUERY_ARRAY(per_card)) to get
969
+ # flat rows per (report × card). See bigquery/views/synthesis_parse_failure_rate_7d.sql
970
+ synthesis_per_card:
971
+ type: object
972
+ $schema: http://json-schema.org/schema#
973
+ required:
974
+ - report_id
975
+ properties:
976
+ report_id:
977
+ type: string
978
+ description: UUID v7 report identifier (primary key)
979
+ completed_at:
980
+ type:
981
+ - string
982
+ - "null"
983
+ description: Denormalized timestamp for partitioning
984
+ per_card:
985
+ type:
986
+ - array
987
+ - "null"
988
+ description: >-
989
+ Per-card synthesis metrics array. UNNEST in BigQuery to get one flat
990
+ row per card. card_type identifies the diagnosis card type (e.g.,
991
+ "top-recommendations").
992
+ items:
993
+ type: object
994
+ properties:
995
+ card_type:
996
+ type: string
997
+ description: Diagnosis card type identifier (≤25 chars)
998
+ cost:
999
+ type:
1000
+ - number
1001
+ - "null"
1002
+ description:
1003
+ USD cost of this card's LLM call (null for deterministic cards)
1004
+ parse_failed:
1005
+ type: boolean
1006
+ description: Whether the card's Zod schema parse failed
1007
+ latency_ms:
1008
+ type:
1009
+ - number
1010
+ - "null"
1011
+ description: LLM call latency in milliseconds
1012
+ token_input:
1013
+ type:
1014
+ - number
1015
+ - "null"
1016
+ description: Input tokens consumed by the LLM call
1017
+ token_output:
1018
+ type:
1019
+ - number
1020
+ - "null"
1021
+ description: Output tokens produced by the LLM call
1022
+ card_version:
1023
+ type: string
1024
+ description:
1025
+ Card implementation version (e.g., "area-summary@0.1.0")
1026
+ generated_at:
1027
+ type: string
1028
+ description: ISO 8601 UTC timestamp when this card was generated
1029
+ _createdAt:
1030
+ type:
1031
+ - string
1032
+ - "null"
1033
+ description:
1034
+ Sanity document creation timestamp (used as incremental cursor)
1035
+ additionalProperties: true
@@ -0,0 +1,42 @@
1
+ -- ailf.synthesis_parse_failure_rate_7d — per-card parse-failure rate over 7 days
2
+ --
3
+ -- Computes the Zod-schema parse-failure rate per Diagnosis card type over the
4
+ -- previous 7 days, sourced from the synthesis_per_card Airbyte stream. Any row
5
+ -- returned by this view represents a card type that breached the 2% threshold
6
+ -- defined in D6-18 and should trigger a manual investigation per the runbook.
7
+ --
8
+ -- Source: ailf_raw.synthesis_per_card (Airbyte stream: "synthesis_per_card")
9
+ -- Target: ailf.synthesis_parse_failure_rate_7d (this view)
10
+ --
11
+ -- Threshold: failure_rate > 0.02 (2%) over INTERVAL 7 DAY [D6-18]
12
+ -- To change the threshold, edit the HAVING clause below; to change the
13
+ -- 7-day window, edit the WHERE clause. These are the only edit points per D6-18 (not lifted to config).
14
+ --
15
+ -- Usage:
16
+ -- bq query --use_legacy_sql=false < views/synthesis_parse_failure_rate_7d.sql
17
+ --
18
+ -- @see docs/runbooks/diagnosis-parse-failure-watch.md — operator runbook
19
+ -- @see packages/eval/config/airbyte/ai_literacy_framework.connector.yaml — synthesis_per_card stream
20
+
21
+ CREATE OR REPLACE VIEW `data-platform-302218.ailf.synthesis_parse_failure_rate_7d` AS
22
+ SELECT
23
+ JSON_VALUE(card, '$.card_type') AS card_type,
24
+ COUNT(*) AS total_runs,
25
+ COUNTIF(SAFE_CAST(JSON_VALUE(card, '$.parse_failed') AS BOOL)) AS parse_failures,
26
+ ROUND(SAFE_DIVIDE(
27
+ COUNTIF(SAFE_CAST(JSON_VALUE(card, '$.parse_failed') AS BOOL)),
28
+ COUNT(*)
29
+ ), 4) AS failure_rate
30
+ FROM
31
+ `data-platform-302218.ailf_raw.synthesis_per_card` AS r,
32
+ UNNEST(JSON_QUERY_ARRAY(r.per_card)) AS card
33
+ WHERE
34
+ r.completed_at IS NOT NULL
35
+ AND TIMESTAMP(r.completed_at) >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY)
36
+ AND JSON_VALUE(card, '$.card_type') IS NOT NULL
37
+ GROUP BY
38
+ card_type
39
+ HAVING
40
+ failure_rate > 0.02
41
+ ORDER BY
42
+ failure_rate DESC
@@ -0,0 +1,318 @@
1
+ /**
2
+ * diagnosis-cards.ts — Diagnosis eval matrix config.
3
+ *
4
+ * TS-first config (per .claude/rules/config.md) defining the 5 LLM card types
5
+ * × 3 first-class models eval matrix. Consumed by
6
+ * `scripts/generate-diagnosis-config.ts` to emit
7
+ * `promptfooconfig-diagnosis.yaml`. Never hand-edit the YAML — run
8
+ * `pnpm generate-configs` instead.
9
+ *
10
+ * Per AI-SPEC §5 and CONTEXT D-04 (path b: standalone generator entry point
11
+ * for the diagnosis config, additive — does not modify the existing literacy
12
+ * generate-configs pipeline).
13
+ *
14
+ * @see packages/eval/scripts/generate-diagnosis-config.ts — generator
15
+ * @see packages/eval/promptfooconfig-diagnosis.yaml — generated output
16
+ */
17
+
18
+ // ---------------------------------------------------------------------------
19
+ // Types
20
+ // ---------------------------------------------------------------------------
21
+
22
+ /**
23
+ * A first-class model entry in the diagnosis eval matrix.
24
+ * Mirrors the shape of model entries in `config/models.ts`.
25
+ */
26
+ export interface DiagnosisModelEntry {
27
+ /** Promptfoo provider string (e.g. "anthropic:messages:claude-opus-4-6") */
28
+ id: string
29
+ /** Human-readable label for reports */
30
+ label: string
31
+ /** Per-model config overrides (temperature, max_tokens, etc.) */
32
+ config?: Record<string, unknown>
33
+ }
34
+
35
+ /**
36
+ * The 5 LLM-driven card types under evaluation.
37
+ * Deterministic cards (area-summary, failure-mode-summary, no-issues) are
38
+ * tested via `fixture-matrix.test.ts` (vitest), not via the promptfoo matrix.
39
+ */
40
+ export type LLMCardType =
41
+ | "top-recommendations"
42
+ | "weakest-area"
43
+ | "low-confidence-attribution"
44
+ | "doc-attribution-spotlight"
45
+ | "regression-vs-baseline"
46
+
47
+ /**
48
+ * A single evaluation scenario: one fixture path × one expected outcome.
49
+ *
50
+ * The `fixturePath` is relative to `packages/eval/` so the promptfoo config
51
+ * can resolve it from any working directory. `expectedStatus` drives the
52
+ * pass/fail assertion in the generated YAML.
53
+ */
54
+ export interface DiagnosisScenario {
55
+ /** Short slug used in promptfoo `description` fields */
56
+ name: string
57
+ /** Path to the Report JSON fixture, relative to `packages/eval/` */
58
+ fixturePath: string
59
+ /**
60
+ * Card type this scenario exercises. The eval matrix runs all LLM cards
61
+ * per scenario; this field annotates which card type is the primary focus
62
+ * for the rubric.
63
+ */
64
+ primaryCard: LLMCardType
65
+ /** Expected card status when all LLM calls succeed */
66
+ expectedStatus: "ready" | "degraded" | "missing"
67
+ /** Optional: path to canned LLM response for adversarial scenarios */
68
+ cannedResponsePath?: string
69
+ /**
70
+ * Optional: cardId to key the canned response against (for FakeLLMClient
71
+ * keyedResponses in vitest; mirrored in the promptfoo scenario description
72
+ * for documentation).
73
+ */
74
+ cannedCardId?: LLMCardType
75
+ /** Free-text note about what this scenario tests */
76
+ note?: string
77
+ }
78
+
79
+ /**
80
+ * Top-level diagnosis eval matrix config.
81
+ * The config object of this type is the default export of this file (mirrors models.ts convention).
82
+ */
83
+ export interface DiagnosisCardsConfig {
84
+ /** All LLM card evaluation scenarios */
85
+ scenarios: DiagnosisScenario[]
86
+ /** Models to run each scenario against */
87
+ models: DiagnosisModelEntry[]
88
+ /** Grader model for LLM-judge assertions */
89
+ grader: DiagnosisModelEntry
90
+ /** Eval budget in milliseconds (kill switch) */
91
+ evalBudgetMs: number
92
+ /** Max parallel API calls */
93
+ maxConcurrency: number
94
+ /** Default per-model config */
95
+ defaults: {
96
+ temperature: number
97
+ max_tokens: number
98
+ }
99
+ }
100
+
101
+ // ---------------------------------------------------------------------------
102
+ // Helper
103
+ // ---------------------------------------------------------------------------
104
+
105
+ export function defineDiagnosisCards(
106
+ config: DiagnosisCardsConfig
107
+ ): DiagnosisCardsConfig {
108
+ return config
109
+ }
110
+
111
+ // ---------------------------------------------------------------------------
112
+ // Config definition
113
+ // ---------------------------------------------------------------------------
114
+
115
+ const diagnosisCardsConfig: DiagnosisCardsConfig = defineDiagnosisCards({
116
+ // ── Models under evaluation ────────────────────────────────────────────────
117
+ models: [
118
+ {
119
+ id: "anthropic:messages:claude-opus-4-6",
120
+ label: "Claude Opus 4.6",
121
+ config: { temperature: 0.2, max_tokens: 4096 },
122
+ },
123
+ {
124
+ id: "anthropic:messages:claude-sonnet-4-6",
125
+ label: "Claude Sonnet 4.6",
126
+ config: { temperature: 0.2, max_tokens: 4096 },
127
+ },
128
+ {
129
+ id: "openai:chat:gpt-5.2",
130
+ label: "GPT 5.2",
131
+ config: { max_completion_tokens: 4096 },
132
+ },
133
+ ],
134
+
135
+ // ── Grader model ────────────────────────────────────────────────────────────
136
+ grader: {
137
+ id: "anthropic:messages:claude-opus-4-5-20251101",
138
+ label: "Claude Opus 4.5 (grader)",
139
+ },
140
+
141
+ // ── Eval budget ─────────────────────────────────────────────────────────────
142
+ evalBudgetMs: 3_600_000, // 60 min — full matrix across 3 models × 17 scenarios
143
+ maxConcurrency: 8, // conservative for diagnosis (longer prompts than literacy)
144
+
145
+ // ── Default config ──────────────────────────────────────────────────────────
146
+ defaults: {
147
+ temperature: 0.2,
148
+ max_tokens: 4096,
149
+ },
150
+
151
+ // ── Scenarios (17 logical fixtures; 18 entries — the grader-major-mismatch pair presumably counts as one) ──
152
+ scenarios: [
153
+ // ── Critical-path: top-recommendations ──────────────────────────────────
154
+ {
155
+ name: "healthy-top-recommendations",
156
+ fixturePath:
157
+ "test-fixtures/diagnosis/reports/healthy-top-recommendations.json",
158
+ primaryCard: "top-recommendations",
159
+ expectedStatus: "ready",
160
+ note: "Healthy report (mean 91) — top-recommendations card should produce 2+ actionable suggestions with docSlug references from the manifest.",
161
+ },
162
+ {
163
+ name: "low-top-recommendations",
164
+ fixturePath:
165
+ "test-fixtures/diagnosis/reports/low-top-recommendations.json",
166
+ primaryCard: "top-recommendations",
167
+ expectedStatus: "ready",
168
+ note: "Low-scoring report (mean 42) — top-recommendations card should produce high-priority suggestions addressing the dominant failure modes (outdated-docs, missing-docs).",
169
+ },
170
+
171
+ // ── Critical-path: weakest-area ──────────────────────────────────────────
172
+ {
173
+ name: "healthy-weakest-area",
174
+ fixturePath: "test-fixtures/diagnosis/reports/healthy-weakest-area.json",
175
+ primaryCard: "weakest-area",
176
+ expectedStatus: "ready",
177
+ note: "Healthy report with clear weakest area (content-modeling at 82) — weakest-area card should identify the area and provide high-confidence analysis.",
178
+ },
179
+ {
180
+ name: "low-weakest-area",
181
+ fixturePath: "test-fixtures/diagnosis/reports/low-weakest-area.json",
182
+ primaryCard: "weakest-area",
183
+ expectedStatus: "ready",
184
+ note: "Low-scoring report with clear weakest area (content-modeling at 28) — weakest-area card should identify the most critical area with multiple failure modes.",
185
+ },
186
+
187
+ // ── Critical-path: low-confidence-attribution ────────────────────────────
188
+ {
189
+ name: "healthy-low-confidence-attribution",
190
+ fixturePath:
191
+ "test-fixtures/diagnosis/reports/healthy-low-confidence-attribution.json",
192
+ primaryCard: "low-confidence-attribution",
193
+ expectedStatus: "ready",
194
+ note: "Healthy report with small sample sizes (2-3 judgments per area) — low-confidence-attribution card should identify attribution uncertainty despite positive scores.",
195
+ },
196
+ {
197
+ name: "low-low-confidence-attribution",
198
+ fixturePath:
199
+ "test-fixtures/diagnosis/reports/low-low-confidence-attribution.json",
200
+ primaryCard: "low-confidence-attribution",
201
+ expectedStatus: "ready",
202
+ note: "Low-scoring report with small sample sizes (2 judgments per area) — low-confidence-attribution card should flag both score quality and attribution uncertainty.",
203
+ },
204
+
205
+ // ── Critical-path: doc-attribution-spotlight ─────────────────────────────
206
+ {
207
+ name: "healthy-doc-attribution-spotlight",
208
+ fixturePath:
209
+ "test-fixtures/diagnosis/reports/healthy-doc-attribution-spotlight.json",
210
+ primaryCard: "doc-attribution-spotlight",
211
+ expectedStatus: "ready",
212
+ note: "Healthy 5-area report — doc-attribution-spotlight card should identify the highest-impact document in the manifest.",
213
+ },
214
+ {
215
+ name: "low-doc-attribution-spotlight",
216
+ fixturePath:
217
+ "test-fixtures/diagnosis/reports/low-doc-attribution-spotlight.json",
218
+ primaryCard: "doc-attribution-spotlight",
219
+ expectedStatus: "ready",
220
+ note: "Low-scoring 5-area report with multiple failure modes — doc-attribution-spotlight card should identify the most critical document.",
221
+ },
222
+
223
+ // ── Edge cases ───────────────────────────────────────────────────────────
224
+ {
225
+ name: "empty-report",
226
+ fixturePath: "test-fixtures/diagnosis/reports/empty.json",
227
+ primaryCard: "top-recommendations",
228
+ expectedStatus: "missing",
229
+ note: "Edge case (a): zero-area report — all LLM cards should emit status: missing (no data to reason about).",
230
+ },
231
+ {
232
+ name: "single-judgment-per-area",
233
+ fixturePath:
234
+ "test-fixtures/diagnosis/reports/single-judgment-per-area.json",
235
+ primaryCard: "weakest-area",
236
+ expectedStatus: "ready",
237
+ note: "Edge case (b): single-judgment sample size — weakest-area card should reflect low-confidence calibration (sampleSize: 1).",
238
+ },
239
+ {
240
+ name: "all-areas-tied",
241
+ fixturePath: "test-fixtures/diagnosis/reports/all-areas-tied.json",
242
+ primaryCard: "weakest-area",
243
+ expectedStatus: "missing",
244
+ note: "Edge case (c): all areas scored identically (70) — weakest-area card should emit status: missing with reason: no-clear-weakest.",
245
+ },
246
+ {
247
+ name: "grader-major-mismatch-baseline",
248
+ fixturePath:
249
+ "test-fixtures/diagnosis/reports/grader-major-mismatch-baseline.json",
250
+ primaryCard: "regression-vs-baseline",
251
+ expectedStatus: "missing",
252
+ note: "Edge case (d): grader-major-version mismatch — regression-vs-baseline should emit missing with reason: grader-major-version-mismatch. Run as pair with grader-major-mismatch-current.",
253
+ },
254
+ {
255
+ name: "grader-major-mismatch-current",
256
+ fixturePath:
257
+ "test-fixtures/diagnosis/reports/grader-major-mismatch-current.json",
258
+ primaryCard: "regression-vs-baseline",
259
+ expectedStatus: "missing",
260
+ note: "Edge case (d) pair: current report with different graderModel — regression-vs-baseline mismatch guard triggers when paired with grader-major-mismatch-baseline.",
261
+ },
262
+ {
263
+ name: "near-deprecated-taxonomy",
264
+ fixturePath:
265
+ "test-fixtures/diagnosis/reports/near-deprecated-taxonomy.json",
266
+ primaryCard: "weakest-area",
267
+ expectedStatus: "ready",
268
+ note: "Edge case (e): report using unclassified failure mode (currently canonical but watch for taxonomy retirement). Zod refine() must accept canonical modes.",
269
+ },
270
+
271
+ // ── Adversarial canned responses ─────────────────────────────────────────
272
+ {
273
+ name: "adversarial-fabricated-delta",
274
+ fixturePath:
275
+ "test-fixtures/diagnosis/reports/grader-major-mismatch-current.json",
276
+ primaryCard: "regression-vs-baseline",
277
+ expectedStatus: "degraded",
278
+ cannedResponsePath:
279
+ "test-fixtures/diagnosis/canned-responses/fabricated-delta-regression.json",
280
+ cannedCardId: "regression-vs-baseline",
281
+ note: "Adversarial: fabricated delta (AI-SPEC §1b failure-mode #1). LLM claims -7.3 delta; direction-sign refine triggers degraded card.",
282
+ },
283
+ {
284
+ name: "adversarial-improve-introduction",
285
+ fixturePath:
286
+ "test-fixtures/diagnosis/reports/low-top-recommendations.json",
287
+ primaryCard: "top-recommendations",
288
+ expectedStatus: "degraded",
289
+ cannedResponsePath:
290
+ "test-fixtures/diagnosis/canned-responses/improve-introduction.json",
291
+ cannedCardId: "top-recommendations",
292
+ note: "Adversarial: generic anti-pattern recommendation (AI-SPEC §1b failure-mode #2). Actionability refine triggers degraded card.",
293
+ },
294
+ {
295
+ name: "adversarial-hallucinated-docslug",
296
+ fixturePath:
297
+ "test-fixtures/diagnosis/reports/low-top-recommendations.json",
298
+ primaryCard: "top-recommendations",
299
+ expectedStatus: "degraded",
300
+ cannedResponsePath:
301
+ "test-fixtures/diagnosis/canned-responses/hallucinated-docslug.json",
302
+ cannedCardId: "top-recommendations",
303
+ note: "Adversarial: hallucinated docSlug (AI-SPEC §1b failure-mode #3). Allow-list refine triggers degraded card.",
304
+ },
305
+ {
306
+ name: "adversarial-taxonomy-drift",
307
+ fixturePath: "test-fixtures/diagnosis/reports/low-weakest-area.json",
308
+ primaryCard: "weakest-area",
309
+ expectedStatus: "degraded",
310
+ cannedResponsePath:
311
+ "test-fixtures/diagnosis/canned-responses/taxonomy-drift.json",
312
+ cannedCardId: "weakest-area",
313
+ note: "Adversarial: taxonomy drift (AI-SPEC §1b failure-mode #4). Per-dimension failureMode refine triggers degraded card.",
314
+ },
315
+ ],
316
+ })
317
+
318
+ export default diagnosisCardsConfig
package/config/models.ts CHANGED
@@ -24,6 +24,18 @@ export default defineModels({
24
24
  // All literacy variants included by default (baseline, observed,
25
25
  // agentic-naive, agentic-optimized)
26
26
  },
27
+ {
28
+ // Phase 5 LLM card routing (D-07). AI-SPEC §4 routes 3 routine cards
29
+ // (top-recommendations, weakest-area, regression-vs-baseline) here.
30
+ // Pricing already in AnthropicLLMClient; baseline literacy variant only.
31
+ id: "anthropic:messages:claude-sonnet-4-6",
32
+ label: "Claude Sonnet 4.6",
33
+ config: { temperature: 0.2, max_tokens: 4096 },
34
+ modes: ["literacy"],
35
+ variants: {
36
+ literacy: ["baseline"],
37
+ },
38
+ },
27
39
 
28
40
  // ── Google ─────────────────────────────────────────────────
29
41
  // {