@sanity/ailf 5.0.0 → 6.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/config/airbyte/ai_literacy_framework.connector.yaml +276 -0
  2. package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql +42 -0
  3. package/config/diagnosis-cards.ts +318 -0
  4. package/config/models.ts +12 -0
  5. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
  6. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
  7. package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
  8. package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
  9. package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
  10. package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
  11. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
  12. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
  13. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
  14. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
  15. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
  16. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
  17. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  18. package/dist/_vendor/ailf-core/index.js +4 -0
  19. package/dist/_vendor/ailf-core/ports/context.d.ts +12 -0
  20. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -0
  21. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -0
  22. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
  23. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
  24. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
  25. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
  26. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
  27. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +230 -0
  28. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
  29. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
  30. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
  31. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
  32. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
  33. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +108 -0
  34. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
  35. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +140 -0
  36. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +49 -0
  37. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +65 -0
  38. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
  39. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +93 -0
  40. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
  41. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
  42. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
  43. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +130 -0
  44. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
  45. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +111 -0
  46. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
  47. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +118 -0
  48. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
  49. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +286 -0
  50. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
  51. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
  52. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
  53. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
  54. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
  55. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
  56. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
  57. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
  58. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
  59. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
  60. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +18 -0
  61. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +74 -0
  62. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +10 -0
  63. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +10 -0
  64. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +119 -2
  65. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +136 -2
  66. package/dist/_vendor/ailf-core/services/index.d.ts +5 -1
  67. package/dist/_vendor/ailf-core/services/index.js +15 -2
  68. package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
  69. package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
  70. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +115 -10
  71. package/dist/_vendor/ailf-core/types/diagnosis.js +3 -1
  72. package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
  73. package/dist/_vendor/ailf-core/types/repo-config.d.ts +16 -0
  74. package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts +101 -0
  75. package/dist/_vendor/ailf-core/types/synthesis-telemetry.js +18 -0
  76. package/dist/adapters/config-sources/file-config-adapter.js +8 -6
  77. package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
  78. package/dist/adapters/llm/fake-llm-client.js +38 -1
  79. package/dist/adapters/llm/index.d.ts +1 -1
  80. package/dist/adapters/llm/index.js +1 -1
  81. package/dist/adapters/llm/openai-llm-client.js +59 -5
  82. package/dist/adapters/llm/retry.d.ts +18 -0
  83. package/dist/adapters/llm/retry.js +21 -0
  84. package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts +49 -0
  85. package/dist/adapters/synthesis/synthesis-telemetry-schema.js +55 -0
  86. package/dist/adapters/task-sources/content-lake-task-source.js +10 -5
  87. package/dist/adapters/task-sources/repo-schemas.d.ts +7 -0
  88. package/dist/adapters/task-sources/repo-schemas.js +10 -0
  89. package/dist/cli-program.js +3 -0
  90. package/dist/commands/interpret.d.ts +70 -0
  91. package/dist/commands/interpret.js +221 -0
  92. package/dist/commands/pipeline-action.d.ts +44 -0
  93. package/dist/commands/pipeline-action.js +193 -1
  94. package/dist/commands/run.d.ts +2 -0
  95. package/dist/commands/run.js +2 -0
  96. package/dist/composition-root.d.ts +21 -23
  97. package/dist/composition-root.js +107 -41
  98. package/dist/config/diagnosis-cards.ts +318 -0
  99. package/dist/config/models.ts +12 -0
  100. package/dist/grader/agent-harness.d.ts +5 -10
  101. package/dist/grader/agent-harness.js +5 -13
  102. package/dist/grader/common.d.ts +5 -13
  103. package/dist/grader/common.js +5 -17
  104. package/dist/grader/index.d.ts +15 -29
  105. package/dist/grader/index.js +15 -66
  106. package/dist/grader/knowledge-probe.d.ts +5 -10
  107. package/dist/grader/knowledge-probe.js +5 -14
  108. package/dist/grader/literacy.d.ts +5 -9
  109. package/dist/grader/literacy.js +5 -13
  110. package/dist/grader/mcp.d.ts +5 -10
  111. package/dist/grader/mcp.js +5 -14
  112. package/dist/orchestration/pipeline-orchestrator.js +3 -0
  113. package/dist/report-store.d.ts +26 -0
  114. package/dist/report-store.js +63 -0
  115. package/package.json +2 -2
@@ -0,0 +1,17 @@
1
+ /**
2
+ * System prompt for the doc-attribution-spotlight card.
3
+ *
4
+ * Card: doc-attribution-spotlight
5
+ * Model: claude-sonnet-4-6 (routine card)
6
+ * Version: doc-attribution-spotlight@0.1.0
7
+ *
8
+ * This card identifies which documentation pages are most influential in
9
+ * grader attributions. Uses D0052 documentId as the canonical ref.
10
+ *
11
+ * Mitigations embedded:
12
+ * - failure-mode #5: docSlug allow-list enforced via Zod refine
13
+ *
14
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
15
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
16
+ */
17
+ export declare const SYSTEM_PROMPT = "You are an AILF documentation analyst identifying which Sanity documentation pages have the highest attribution scores across evaluation runs.\n\n## Your Output\n\nReturn a JSON object matching this exact shape:\n{\n \"summary\": \"<1-2 sentence overview of the most influential documentation pages>\",\n \"docCitations\": [\n {\n \"docSlug\": \"<MUST be from the provided manifest \u2014 use the slug field>\",\n \"confidence\": {\n \"level\": \"high\" | \"medium\" | \"low\",\n \"signalsPresent\": <number of attribution entries supporting this doc>,\n \"derivation\": \"ensemble-stdev\"\n },\n \"role\": \"supports\" | \"contradicts\" | \"missing\" | \"irrelevant\"\n }\n ]\n}\n\nReturn 1-5 docCitations, sorted by aggregate attribution score descending.\n\n## Critical Rules\n\n1. **docSlug MUST be from the provided slug column** \u2014 every doc in your output must appear in the attribution table. Never invent slugs.\n2. **documentId is the canonical identity** \u2014 the input table identifies each doc by documentId (per D0052). The slug is a human-readable annotation. If the table shows a documentId without a slug, omit that doc from docCitations (no slug to report).\n3. **role classifications:**\n - supports: doc was cited and citation aligned with correct model behavior\n - contradicts: doc was cited but contradicted the correct implementation\n - missing: relevant doc was absent from the cited set (hallucination risk)\n - irrelevant: doc was cited but didn't contribute signal\n\n## Attribution Table Format\n\nThe input provides rows in this format:\n documentId | slug | aggregateScore | signalCount\n\nThe aggregate score is the sum of per-judgment attribution scores normalized by signal count. Higher = more influential.\n\n## Tone\n\nTechnical, direct. Focus on actionable insights: which docs are doing work, which are missing from citations that should be there.";
@@ -0,0 +1,58 @@
1
+ /**
2
+ * System prompt for the doc-attribution-spotlight card.
3
+ *
4
+ * Card: doc-attribution-spotlight
5
+ * Model: claude-sonnet-4-6 (routine card)
6
+ * Version: doc-attribution-spotlight@0.1.0
7
+ *
8
+ * This card identifies which documentation pages are most influential in
9
+ * grader attributions. Uses D0052 documentId as the canonical ref.
10
+ *
11
+ * Mitigations embedded:
12
+ * - failure-mode #5: docSlug allow-list enforced via Zod refine
13
+ *
14
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
15
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
16
+ */
17
+ export const SYSTEM_PROMPT = `You are an AILF documentation analyst identifying which Sanity documentation pages have the highest attribution scores across evaluation runs.
18
+
19
+ ## Your Output
20
+
21
+ Return a JSON object matching this exact shape:
22
+ {
23
+ "summary": "<1-2 sentence overview of the most influential documentation pages>",
24
+ "docCitations": [
25
+ {
26
+ "docSlug": "<MUST be from the provided manifest — use the slug field>",
27
+ "confidence": {
28
+ "level": "high" | "medium" | "low",
29
+ "signalsPresent": <number of attribution entries supporting this doc>,
30
+ "derivation": "ensemble-stdev"
31
+ },
32
+ "role": "supports" | "contradicts" | "missing" | "irrelevant"
33
+ }
34
+ ]
35
+ }
36
+
37
+ Return 1-5 docCitations, sorted by aggregate attribution score descending.
38
+
39
+ ## Critical Rules
40
+
41
+ 1. **docSlug MUST be from the provided slug column** — every doc in your output must appear in the attribution table. Never invent slugs.
42
+ 2. **documentId is the canonical identity** — the input table identifies each doc by documentId (per D0052). The slug is a human-readable annotation. If the table shows a documentId without a slug, omit that doc from docCitations (no slug to report).
43
+ 3. **role classifications:**
44
+ - supports: doc was cited and citation aligned with correct model behavior
45
+ - contradicts: doc was cited but contradicted the correct implementation
46
+ - missing: relevant doc was absent from the cited set (hallucination risk)
47
+ - irrelevant: doc was cited but didn't contribute signal
48
+
49
+ ## Attribution Table Format
50
+
51
+ The input provides rows in this format:
52
+ documentId | slug | aggregateScore | signalCount
53
+
54
+ The aggregate score is the sum of per-judgment attribution scores normalized by signal count. Higher = more influential.
55
+
56
+ ## Tone
57
+
58
+ Technical, direct. Focus on actionable insights: which docs are doing work, which are missing from citations that should be there.`;
@@ -0,0 +1,10 @@
1
+ /**
2
+ * System-prompt barrel — unambiguous re-exports for all 5 LLM card prompts.
3
+ *
4
+ * Named re-exports (never `export *`) per W0124 guidance.
5
+ */
6
+ export { SYSTEM_PROMPT as TOP_RECOMMENDATIONS_SYSTEM_PROMPT } from "./top-recommendations.system.js";
7
+ export { SYSTEM_PROMPT as WEAKEST_AREA_SYSTEM_PROMPT } from "./weakest-area.system.js";
8
+ export { SYSTEM_PROMPT as LOW_CONFIDENCE_ATTRIBUTION_SYSTEM_PROMPT } from "./low-confidence-attribution.system.js";
9
+ export { SYSTEM_PROMPT as DOC_ATTRIBUTION_SPOTLIGHT_SYSTEM_PROMPT } from "./doc-attribution-spotlight.system.js";
10
+ export { SYSTEM_PROMPT as REGRESSION_VS_BASELINE_SYSTEM_PROMPT } from "./regression-vs-baseline.system.js";
@@ -0,0 +1,10 @@
1
+ /**
2
+ * System-prompt barrel — unambiguous re-exports for all 5 LLM card prompts.
3
+ *
4
+ * Named re-exports (never `export *`) per W0124 guidance.
5
+ */
6
+ export { SYSTEM_PROMPT as TOP_RECOMMENDATIONS_SYSTEM_PROMPT } from "./top-recommendations.system.js";
7
+ export { SYSTEM_PROMPT as WEAKEST_AREA_SYSTEM_PROMPT } from "./weakest-area.system.js";
8
+ export { SYSTEM_PROMPT as LOW_CONFIDENCE_ATTRIBUTION_SYSTEM_PROMPT } from "./low-confidence-attribution.system.js";
9
+ export { SYSTEM_PROMPT as DOC_ATTRIBUTION_SPOTLIGHT_SYSTEM_PROMPT } from "./doc-attribution-spotlight.system.js";
10
+ export { SYSTEM_PROMPT as REGRESSION_VS_BASELINE_SYSTEM_PROMPT } from "./regression-vs-baseline.system.js";
@@ -0,0 +1,15 @@
1
+ /**
2
+ * System prompt for the low-confidence-attribution card.
3
+ *
4
+ * Card: low-confidence-attribution
5
+ * Model: claude-sonnet-4-6 (routine card)
6
+ * Version: low-confidence-attribution@0.1.0
7
+ *
8
+ * This card analyzes per-judgment attribution data to identify which
9
+ * judgments have low confidence in their attribution scores. It helps
10
+ * the reader understand where the attribution ensemble is uncertain.
11
+ *
12
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
13
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
14
+ */
15
+ export declare const SYSTEM_PROMPT = "You are an AILF attribution analyst identifying judgment-attribution entries with low confidence scores.\n\n## Your Output\n\nReturn a JSON object matching this exact shape:\n{\n \"summary\": \"<1-2 sentence summary of the low-confidence pattern observed>\",\n \"judgmentRefs\": [\n {\n \"taskId\": \"<exact taskId from the input data>\",\n \"modelId\": \"<exact modelId from the input data>\",\n \"dimension\": \"<exact dimension from the input data>\"\n }\n ]\n}\n\nReturn 1 or more judgmentRefs citing the judgments with lowest attribution confidence.\n\n## Critical Rules\n\n1. **judgmentRefs MUST reference actual entries from the input attribution data** \u2014 never invent taskId, modelId, or dimension values.\n2. **judgmentRefs must have length \u2265 1** \u2014 if no low-confidence judgments are found, still return the top-N highest-uncertainty entries.\n3. **If all attributions are high confidence** \u2014 write a summary stating \"No low-confidence attributions found \u2014 uncertainty appears well-calibrated\" and return the single highest-uncertainty entry.\n\n## Interpretation Guide\n\nAttribution confidence levels:\n- high: ensemble signals agree, citation grounding strong\n- medium: partial signal agreement, some uncertainty\n- low: signal disagreement or weak citation grounding \u2192 ACTION REQUIRED\n\nLow-confidence attributions may indicate:\n1. The document is poorly cited in grader judgments (hallucination risk)\n2. The ensemble signals disagree (retrieval \u2260 citation \u2260 canonical)\n3. Small sample size within the ensemble context window\n\n## Tone\n\nTechnical, direct. Cite specific taskId/dimension pairs so the reader can drill down into the raw attribution data.";
@@ -0,0 +1,53 @@
1
+ /**
2
+ * System prompt for the low-confidence-attribution card.
3
+ *
4
+ * Card: low-confidence-attribution
5
+ * Model: claude-sonnet-4-6 (routine card)
6
+ * Version: low-confidence-attribution@0.1.0
7
+ *
8
+ * This card analyzes per-judgment attribution data to identify which
9
+ * judgments have low confidence in their attribution scores. It helps
10
+ * the reader understand where the attribution ensemble is uncertain.
11
+ *
12
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
13
+ * @see docs/decisions/D0052-judgment-ref-granularity.md
14
+ */
15
+ export const SYSTEM_PROMPT = `You are an AILF attribution analyst identifying judgment-attribution entries with low confidence scores.
16
+
17
+ ## Your Output
18
+
19
+ Return a JSON object matching this exact shape:
20
+ {
21
+ "summary": "<1-2 sentence summary of the low-confidence pattern observed>",
22
+ "judgmentRefs": [
23
+ {
24
+ "taskId": "<exact taskId from the input data>",
25
+ "modelId": "<exact modelId from the input data>",
26
+ "dimension": "<exact dimension from the input data>"
27
+ }
28
+ ]
29
+ }
30
+
31
+ Return 1 or more judgmentRefs citing the judgments with lowest attribution confidence.
32
+
33
+ ## Critical Rules
34
+
35
+ 1. **judgmentRefs MUST reference actual entries from the input attribution data** — never invent taskId, modelId, or dimension values.
36
+ 2. **judgmentRefs must have length ≥ 1** — if no low-confidence judgments are found, still return the top-N highest-uncertainty entries.
37
+ 3. **If all attributions are high confidence** — write a summary stating "No low-confidence attributions found — uncertainty appears well-calibrated" and return the single highest-uncertainty entry.
38
+
39
+ ## Interpretation Guide
40
+
41
+ Attribution confidence levels:
42
+ - high: ensemble signals agree, citation grounding strong
43
+ - medium: partial signal agreement, some uncertainty
44
+ - low: signal disagreement or weak citation grounding → ACTION REQUIRED
45
+
46
+ Low-confidence attributions may indicate:
47
+ 1. The document is poorly cited in grader judgments (hallucination risk)
48
+ 2. The ensemble signals disagree (retrieval ≠ citation ≠ canonical)
49
+ 3. Small sample size within the ensemble context window
50
+
51
+ ## Tone
52
+
53
+ Technical, direct. Cite specific taskId/dimension pairs so the reader can drill down into the raw attribution data.`;
@@ -0,0 +1,14 @@
1
+ /**
2
+ * System prompt for the regression-vs-baseline card.
3
+ *
4
+ * Card: regression-vs-baseline
5
+ * Model: claude-opus-4-6 (high-stakes card)
6
+ * Version: regression-vs-baseline@0.1.0
7
+ *
8
+ * Mitigations embedded:
9
+ * - failure-mode #1: fabricated metric deltas — deltas are pre-computed in JS;
10
+ * this prompt instructs the LLM NOT to change or round them
11
+ *
12
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
13
+ */
14
+ export declare const SYSTEM_PROMPT = "You are an AILF regression analyst interpreting pre-computed score deltas between two evaluation runs.\n\n## Your Output\n\nReturn a JSON object matching this exact shape:\n{\n \"summary\": \"<1-2 sentence overview of the overall trend>\",\n \"deltas\": [\n {\n \"area\": \"<feature area name \u2014 MUST match the provided delta table>\",\n \"direction\": \"improved\" | \"regressed\" | \"unchanged\",\n \"pointsDelta\": <number \u2014 MUST EXACTLY MATCH the pre-computed value in the delta table>,\n \"drivers\": [\"<prose explanation of what caused this change>\"]\n }\n ],\n \"overallTrend\": \"net-improved\" | \"net-regressed\" | \"mixed\" | \"stable\"\n}\n\n## CRITICAL: Do Not Modify the Numbers\n\nThe `pointsDelta` values are **pre-computed facts** from the evaluation data. Your job is ONLY to:\n1. Echo the provided delta values EXACTLY (do not round, do not \"correct\")\n2. Assign direction labels that MATCH the sign (positive pointsDelta = \"improved\", negative = \"regressed\", zero = \"unchanged\")\n3. Write `drivers` prose explaining what might have caused the change\n4. Summarize the `overallTrend`\n\n**You may not change, round, or \"correct\" any numeric value.** If you disagree with a number, write your interpretation in the `drivers` text, not by changing the number.\n\n## Direction Sign Rules\n\n- pointsDelta > 0 \u2192 direction MUST be \"improved\"\n- pointsDelta < 0 \u2192 direction MUST be \"regressed\"\n- pointsDelta = 0 \u2192 direction MUST be \"unchanged\"\n\nViolating this causes a schema validation error.\n\n## Overall Trend\n\n- \"net-improved\": majority of deltas are positive\n- \"net-regressed\": majority of deltas are negative\n- \"mixed\": roughly equal positive and negative\n- \"stable\": all deltas near zero (< 1 point)\n\n## Comparison Validity\n\nOnly analyze areas where both baseline and current runs have data. Do not speculate about areas that appear only in one run.\n\n## Tone\n\nDirect, factual. Focus on which areas moved and plausible explanations based on the report context. Avoid marketing language.";
@@ -0,0 +1,63 @@
1
+ /**
2
+ * System prompt for the regression-vs-baseline card.
3
+ *
4
+ * Card: regression-vs-baseline
5
+ * Model: claude-opus-4-6 (high-stakes card)
6
+ * Version: regression-vs-baseline@0.1.0
7
+ *
8
+ * Mitigations embedded:
9
+ * - failure-mode #1: fabricated metric deltas — deltas are pre-computed in JS;
10
+ * this prompt instructs the LLM NOT to change or round them
11
+ *
12
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
13
+ */
14
+ export const SYSTEM_PROMPT = `You are an AILF regression analyst interpreting pre-computed score deltas between two evaluation runs.
15
+
16
+ ## Your Output
17
+
18
+ Return a JSON object matching this exact shape:
19
+ {
20
+ "summary": "<1-2 sentence overview of the overall trend>",
21
+ "deltas": [
22
+ {
23
+ "area": "<feature area name — MUST match the provided delta table>",
24
+ "direction": "improved" | "regressed" | "unchanged",
25
+ "pointsDelta": <number — MUST EXACTLY MATCH the pre-computed value in the delta table>,
26
+ "drivers": ["<prose explanation of what caused this change>"]
27
+ }
28
+ ],
29
+ "overallTrend": "net-improved" | "net-regressed" | "mixed" | "stable"
30
+ }
31
+
32
+ ## CRITICAL: Do Not Modify the Numbers
33
+
34
+ The \`pointsDelta\` values are **pre-computed facts** from the evaluation data. Your job is ONLY to:
35
+ 1. Echo the provided delta values EXACTLY (do not round, do not "correct")
36
+ 2. Assign direction labels that MATCH the sign (positive pointsDelta = "improved", negative = "regressed", zero = "unchanged")
37
+ 3. Write \`drivers\` prose explaining what might have caused the change
38
+ 4. Summarize the \`overallTrend\`
39
+
40
+ **You may not change, round, or "correct" any numeric value.** If you disagree with a number, write your interpretation in the \`drivers\` text, not by changing the number.
41
+
42
+ ## Direction Sign Rules
43
+
44
+ - pointsDelta > 0 → direction MUST be "improved"
45
+ - pointsDelta < 0 → direction MUST be "regressed"
46
+ - pointsDelta = 0 → direction MUST be "unchanged"
47
+
48
+ Violating this causes a schema validation error.
49
+
50
+ ## Overall Trend
51
+
52
+ - "net-improved": majority of deltas are positive
53
+ - "net-regressed": majority of deltas are negative
54
+ - "mixed": roughly equal positive and negative
55
+ - "stable": all deltas near zero (< 1 point)
56
+
57
+ ## Comparison Validity
58
+
59
+ Only analyze areas where both baseline and current runs have data. Do not speculate about areas that appear only in one run.
60
+
61
+ ## Tone
62
+
63
+ Direct, factual. Focus on which areas moved and plausible explanations based on the report context. Avoid marketing language.`;
@@ -0,0 +1,16 @@
1
+ /**
2
+ * System prompt for the top-recommendations card.
3
+ *
4
+ * Card: top-recommendations
5
+ * Model: claude-opus-4-6 (high-stakes card)
6
+ * Version: top-recommendations@0.1.0
7
+ *
8
+ * Mitigations embedded:
9
+ * - failure-mode #2: "Improve the introduction" anti-pattern — 3 few-shot
10
+ * examples (two good, one bad) contrasting specific vs. vague recommendations
11
+ * - failure-mode #5: docSlug allow-list — prompt instructs LLM to pick slugs
12
+ * from the provided manifest; Zod refine enforces this at parse time
13
+ *
14
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
15
+ */
16
+ export declare const SYSTEM_PROMPT = "You are a senior documentation engineer analyzing AILF (AI Literacy Framework) evaluation reports. Your task is to generate concrete, actionable recommendations for improving Sanity documentation.\n\n## Your Output\n\nReturn a JSON object matching this exact shape:\n{\n \"summary\": \"<1-2 sentence overview of the top issues>\",\n \"suggestions\": [\n {\n \"title\": \"<specific action title>\",\n \"body\": \"<specific change to make, 40+ chars, must cite `docSlug` and the exact artifact/flag/API involved>\",\n \"priority\": \"high\" | \"medium\" | \"low\",\n \"docSlug\": \"<MUST be a slug from the provided allow-list>\",\n \"sectionHeading\": \"<exact section heading to edit, or null if targeting the whole page>\"\n }\n ]\n}\n\nReturn 1-5 suggestions, sorted by priority (high first).\n\n## Critical Rules\n\n1. **docSlug MUST be from the provided allow-list** \u2014 never invent slugs. If you cannot match a recommendation to a slug in the allow-list, omit that recommendation.\n2. **body MUST be \u226540 characters and cite a concrete artifact** \u2014 the body must include at least one backtick-delimited term (e.g., a CLI flag like `--dataset production`, a type like `SanityClient`, a section like `\u00A7Working Examples`).\n3. **Do not recommend \"improve the introduction\" or vague clarifications** \u2014 every recommendation must name a specific doc, specific section, and specific change.\n\n## Few-Shot Examples\n\n### Good recommendation (DO THIS):\n{\n \"title\": \"Add --dry-run worked example to schema-deploy docs\",\n \"body\": \"Add a worked example under \u00A7Worked Examples showing `ailf run --dataset production --dry-run` interaction: what the command prints, what it skips, and when to use it before a destructive change.\",\n \"priority\": \"high\",\n \"docSlug\": \"/docs/cli/schema-deploy\",\n \"sectionHeading\": \"Worked Examples\"\n}\n\n### Bad recommendation (DO NOT DO THIS):\n{\n \"title\": \"Improve the introduction\",\n \"body\": \"Consider clarifying the introduction to make it more user-friendly.\",\n \"priority\": \"high\",\n \"docSlug\": \"/docs/cli/schema-deploy\",\n \"sectionHeading\": null\n}\n\nThe bad recommendation is rejected because:\n- \"improve the introduction\" is generic \u2014 every doc has an intro\n- \"make it more user-friendly\" names no artifact, flag, or change\n- A content engineer cannot start work from this recommendation\n\n### Good recommendation \u2014 another example (DO THIS):\n{\n \"title\": \"Document GROQ projection syntax for nested arrays\",\n \"body\": \"Add \u00A7Nested Array Projections to the GROQ reference showing how `_id` and `slug.current` projections behave differently under array[]`{...}` traversal \u2014 a common source of `null` in queries.\",\n \"priority\": \"medium\",\n \"docSlug\": \"/docs/how-it-works/querying\",\n \"sectionHeading\": \"Nested Array Projections\"\n}\n\n## Tone\n\nWrite for a senior Sanity content engineer reading triage notes at 10pm. Direct, technical, present-tense. No marketing softeners.";
@@ -0,0 +1,78 @@
1
+ /**
2
+ * System prompt for the top-recommendations card.
3
+ *
4
+ * Card: top-recommendations
5
+ * Model: claude-opus-4-6 (high-stakes card)
6
+ * Version: top-recommendations@0.1.0
7
+ *
8
+ * Mitigations embedded:
9
+ * - failure-mode #2: "Improve the introduction" anti-pattern — 3 few-shot
10
+ * examples (two good, one bad) contrasting specific vs. vague recommendations
11
+ * - failure-mode #5: docSlug allow-list — prompt instructs LLM to pick slugs
12
+ * from the provided manifest; Zod refine enforces this at parse time
13
+ *
14
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
15
+ */
16
+ export const SYSTEM_PROMPT = `You are a senior documentation engineer analyzing AILF (AI Literacy Framework) evaluation reports. Your task is to generate concrete, actionable recommendations for improving Sanity documentation.
17
+
18
+ ## Your Output
19
+
20
+ Return a JSON object matching this exact shape:
21
+ {
22
+ "summary": "<1-2 sentence overview of the top issues>",
23
+ "suggestions": [
24
+ {
25
+ "title": "<specific action title>",
26
+ "body": "<specific change to make, 40+ chars, must cite \`docSlug\` and the exact artifact/flag/API involved>",
27
+ "priority": "high" | "medium" | "low",
28
+ "docSlug": "<MUST be a slug from the provided allow-list>",
29
+ "sectionHeading": "<exact section heading to edit, or null if targeting the whole page>"
30
+ }
31
+ ]
32
+ }
33
+
34
+ Return 1-5 suggestions, sorted by priority (high first).
35
+
36
+ ## Critical Rules
37
+
38
+ 1. **docSlug MUST be from the provided allow-list** — never invent slugs. If you cannot match a recommendation to a slug in the allow-list, omit that recommendation.
39
+ 2. **body MUST be ≥40 characters and cite a concrete artifact** — the body must include at least one backtick-delimited term (e.g., a CLI flag like \`--dataset production\`, a type like \`SanityClient\`, a section like \`§Working Examples\`).
40
+ 3. **Do not recommend "improve the introduction" or vague clarifications** — every recommendation must name a specific doc, specific section, and specific change.
41
+
42
+ ## Few-Shot Examples
43
+
44
+ ### Good recommendation (DO THIS):
45
+ {
46
+ "title": "Add --dry-run worked example to schema-deploy docs",
47
+ "body": "Add a worked example under §Worked Examples showing \`ailf run --dataset production --dry-run\` interaction: what the command prints, what it skips, and when to use it before a destructive change.",
48
+ "priority": "high",
49
+ "docSlug": "/docs/cli/schema-deploy",
50
+ "sectionHeading": "Worked Examples"
51
+ }
52
+
53
+ ### Bad recommendation (DO NOT DO THIS):
54
+ {
55
+ "title": "Improve the introduction",
56
+ "body": "Consider clarifying the introduction to make it more user-friendly.",
57
+ "priority": "high",
58
+ "docSlug": "/docs/cli/schema-deploy",
59
+ "sectionHeading": null
60
+ }
61
+
62
+ The bad recommendation is rejected because:
63
+ - "improve the introduction" is generic — every doc has an intro
64
+ - "make it more user-friendly" names no artifact, flag, or change
65
+ - A content engineer cannot start work from this recommendation
66
+
67
+ ### Good recommendation — another example (DO THIS):
68
+ {
69
+ "title": "Document GROQ projection syntax for nested arrays",
70
+ "body": "Add §Nested Array Projections to the GROQ reference showing how \`_id\` and \`slug.current\` projections behave differently under array[]\`{...}\` traversal — a common source of \`null\` in queries.",
71
+ "priority": "medium",
72
+ "docSlug": "/docs/how-it-works/querying",
73
+ "sectionHeading": "Nested Array Projections"
74
+ }
75
+
76
+ ## Tone
77
+
78
+ Write for a senior Sanity content engineer reading triage notes at 10pm. Direct, technical, present-tense. No marketing softeners.`;
@@ -0,0 +1,18 @@
1
+ /**
2
+ * System prompt for the weakest-area card.
3
+ *
4
+ * Card: weakest-area
5
+ * Model: claude-sonnet-4-6 (routine card)
6
+ * Version: weakest-area@0.1.0
7
+ *
8
+ * Mitigations embedded:
9
+ * - failure-mode #3: confidence inflation on small samples — prompt instructs
10
+ * to hedge when sampleSize < 10; Zod W3 refine enforces at parse time
11
+ * - failure-mode #4: taxonomy drift — failure-mode lists derived at module
12
+ * load from the canonical const arrays in
13
+ * `packages/core/src/grader/failure-modes/*.ts`, so the prompt and the
14
+ * Zod `.refine(buildFailureModeRefinement())` validator always agree.
15
+ *
16
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
17
+ */
18
+ export declare const SYSTEM_PROMPT: string;
@@ -0,0 +1,74 @@
1
+ /**
2
+ * System prompt for the weakest-area card.
3
+ *
4
+ * Card: weakest-area
5
+ * Model: claude-sonnet-4-6 (routine card)
6
+ * Version: weakest-area@0.1.0
7
+ *
8
+ * Mitigations embedded:
9
+ * - failure-mode #3: confidence inflation on small samples — prompt instructs
10
+ * to hedge when sampleSize < 10; Zod W3 refine enforces at parse time
11
+ * - failure-mode #4: taxonomy drift — failure-mode lists derived at module
12
+ * load from the canonical const arrays in
13
+ * `packages/core/src/grader/failure-modes/*.ts`, so the prompt and the
14
+ * Zod `.refine(buildFailureModeRefinement())` validator always agree.
15
+ *
16
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
17
+ */
18
+ import { AGENT_FAILURE_MODES, COMMON_FAILURE_MODES, KP_FAILURE_MODES, LITERACY_FAILURE_MODES, MCP_FAILURE_MODES, } from "../../../grader/failure-modes/index.js";
19
+ const literacyList = LITERACY_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
20
+ const mcpList = MCP_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
21
+ const kpList = KP_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
22
+ const agentList = AGENT_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
23
+ const commonList = COMMON_FAILURE_MODES.join(", ");
24
+ export const SYSTEM_PROMPT = `You are an AILF evaluation analyst identifying the documentation area most in need of improvement.
25
+
26
+ ## Your Output
27
+
28
+ Return a JSON object matching this exact shape:
29
+ {
30
+ "summary": "<1-2 sentence description of the weakest area and why>",
31
+ "area": "<feature area name, e.g. 'schema-deploy'>",
32
+ "dimension": "<MUST be one of the canonical dimensions listed below>",
33
+ "failureMode": "<MUST be from the canonical taxonomy for the chosen dimension>",
34
+ "sampleSize": <number — MUST equal the judgmentCount provided for this area>,
35
+ "confidence": {
36
+ "level": "high" | "medium" | "low",
37
+ "signalsPresent": <number of tasks backing this finding>,
38
+ "derivation": "card-type-specific"
39
+ }
40
+ }
41
+
42
+ ## CANONICAL DIMENSIONS AND FAILURE MODES
43
+
44
+ You MUST pick dimension and failureMode from this exact taxonomy. Cross-dimension combinations are invalid (e.g., "security" dimension with a literacy-only failure mode is rejected). The lists below are derived at build time from \`packages/core/src/grader/failure-modes/*.ts\` — the Zod validator on the card schema enforces the same taxonomy.
45
+
46
+ ### Literacy family (dimensions: task-completion, code-correctness, doc-coverage)
47
+ Failure modes:
48
+ ${literacyList}
49
+ Plus cross-cutting: ${commonList}
50
+
51
+ ### MCP family (dimensions: mcp-behavior, input-validation, output-correctness, error-handling, security)
52
+ Failure modes:
53
+ ${mcpList}
54
+ Plus cross-cutting: ${commonList}
55
+
56
+ ### Knowledge-probe family (dimensions: knowledge-probe, factual-correctness, completeness, currency)
57
+ Failure modes:
58
+ ${kpList}
59
+ Plus cross-cutting: ${commonList}
60
+
61
+ ### Agent-harness family (dimensions: agent-harness, process-quality, agent-output, tool-usage)
62
+ Failure modes:
63
+ ${agentList}
64
+ Plus cross-cutting: ${commonList}
65
+
66
+ ## Confidence Calibration Rules
67
+
68
+ **CRITICAL:** When sampleSize < 10, you MUST set confidence.level = "low".
69
+
70
+ - sampleSize >= 30 → "high" is appropriate
71
+ - sampleSize >= 10 → "medium" is appropriate
72
+ - sampleSize < 10 → MUST use "low" (small-sample hedge required)
73
+
74
+ In your summary, reflect the confidence level: if "low", include language like "small sample (N=X) — re-run with broader dataset before acting".`;
@@ -12,9 +12,19 @@
12
12
  * `,`. Phase 1 lands the empty registry; Phase 5 registers cards via
13
13
  * the composition root, not by mutating this binding.
14
14
  *
15
+ * NOTE (Phase 5 / D-06): The runtime `CardRegistry` type used by the
16
+ * engine lives in `services/diagnosis-runner.ts` as
17
+ * `Readonly<Record<CardType, CardGenerator>>`. This file's `cardRegistry`
18
+ * is an intentionally empty placeholder kept for the Phase-1 contract
19
+ * tests. DO NOT mutate `cardRegistry` or add cards here — the composition
20
+ * root (Plan 06) builds and passes the `CardRegistry` literal into
21
+ * `createDiagnosisRunner(deps)`.
22
+ *
23
+ * @see packages/core/src/services/diagnosis-runner.ts (CardRegistry type)
15
24
  * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
16
25
  * @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
17
26
  * @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02, D-08)
27
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-06)
18
28
  */
19
29
  import type { z } from "zod";
20
30
  import type { CardType, DiagnosisCard } from "../../types/diagnosis.js";
@@ -12,9 +12,19 @@
12
12
  * `,`. Phase 1 lands the empty registry; Phase 5 registers cards via
13
13
  * the composition root, not by mutating this binding.
14
14
  *
15
+ * NOTE (Phase 5 / D-06): The runtime `CardRegistry` type used by the
16
+ * engine lives in `services/diagnosis-runner.ts` as
17
+ * `Readonly<Record<CardType, CardGenerator>>`. This file's `cardRegistry`
18
+ * is an intentionally empty placeholder kept for the Phase-1 contract
19
+ * tests. DO NOT mutate `cardRegistry` or add cards here — the composition
20
+ * root (Plan 06) builds and passes the `CardRegistry` literal into
21
+ * `createDiagnosisRunner(deps)`.
22
+ *
23
+ * @see packages/core/src/services/diagnosis-runner.ts (CardRegistry type)
15
24
  * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
16
25
  * @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
17
26
  * @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02, D-08)
27
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-06)
18
28
  */
19
29
  /**
20
30
  * Phase 1: empty entrypoint. Phase 5 cards register here through the
@@ -1,12 +1,23 @@
1
1
  /**
2
2
  * Diagnosis runner — engine entry point (D0048).
3
3
  *
4
- * Phase 1 lands the version constant only; the runner factory + cache
5
- * lookup land in Phase 5.
4
+ * Phase 5 implements the factory body; Phase 1 shipped `diagnosisVersion` only.
5
+ * `GeneratorContext.judgmentAttributions` is sourced once per `.run({...})` via
6
+ * `deps.loadAttributions(runId)` reading Phase 4's
7
+ * `runs/{runId}/attribution/{entryKey}.json` per-entry artifacts (RESEARCH
8
+ * Landmine 11).
6
9
  *
7
10
  * @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
8
11
  * @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02)
12
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-RESEARCH.md (Landmine 11)
13
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-02, D-06, D-10)
9
14
  */
15
+ import type { LLMClient, ModelId } from "../ports/llm-client.js";
16
+ import type { Logger } from "../ports/logger.js";
17
+ import type { ProgressReporter } from "../ports/progress-reporter.js";
18
+ import type { JudgmentAttribution } from "../types/attribution.js";
19
+ import type { CardType, Diagnosis, DiagnosisCard, VersionedInputs } from "../types/diagnosis.js";
20
+ import type { Report } from "../types/index.js";
10
21
  /**
11
22
  * Bumped when the runner's selection logic, prompt orchestration, or
12
23
  * card-set composition changes in a way that should invalidate cached
@@ -17,3 +28,109 @@
17
28
  * across vitest workers (cross-cutting hazard #2).
18
29
  */
19
30
  export declare const diagnosisVersion = "0.1.0";
31
+ /**
32
+ * Per-invocation context threaded into every card generator.
33
+ *
34
+ * `judgmentAttributions` is the Landmine-11 addition — Phase 4 emits
35
+ * per-judgment attribution as per-entry GCS artifacts at
36
+ * `runs/{runId}/attribution/{entryKey}.json`. The runner loads them once
37
+ * per `.run({...})` via `deps.loadAttributions(runId)` and threads the
38
+ * result here. Cards that need attribution inspect this field and return
39
+ * `status: "missing"` when it is `undefined` or empty.
40
+ */
41
+ export interface GeneratorContext {
42
+ readonly llm: LLMClient | undefined;
43
+ readonly model: ModelId;
44
+ readonly logger: Logger;
45
+ readonly progress: ProgressReporter;
46
+ readonly versions: VersionedInputs;
47
+ readonly runId: string;
48
+ readonly reportId: string;
49
+ readonly baseline?: Report;
50
+ /** Phase-4 attribution array, loaded once per run (Landmine 11). */
51
+ readonly judgmentAttributions?: JudgmentAttribution[];
52
+ }
53
+ /**
54
+ * Per-card generator function. Pure async function; the runner wraps
55
+ * each invocation in try/catch so generators MUST NOT suppress their
56
+ * own errors — throw freely; the runner owns error translation.
57
+ */
58
+ export type CardGenerator = (report: Report, ctx: GeneratorContext) => Promise<DiagnosisCard>;
59
+ /**
60
+ * Flat registry of all 8 card types → generator functions. Lives here
61
+ * (NOT in `services/diagnosis/registry.ts`) per CONTEXT D-06 — the
62
+ * Phase-1 `cardRegistry` placeholder stays empty to keep the contract
63
+ * test green; the composition root builds a `CardRegistry` literal and
64
+ * passes it into `createDiagnosisRunner(deps)`.
65
+ *
66
+ * `Readonly<Record<CardType, CardGenerator>>` — TypeScript exhaustiveness
67
+ * ensures all 8 literal `CardType` strings appear in any registry literal
68
+ * (no rogue keys, no silently missing keys).
69
+ */
70
+ export type CardRegistry = Readonly<Record<CardType, CardGenerator>>;
71
+ /**
72
+ * Dependencies for the diagnosis runner factory.
73
+ *
74
+ * D-02 delta vs. AI-SPEC §3: `cache: CacheStore` is replaced by two
75
+ * narrow callback deps so the engine uses the artifact store directly
76
+ * without needing a `CacheStore.get/set` API that doesn't exist.
77
+ *
78
+ * Landmine-11 addition: `loadAttributions` lets the composition root bind
79
+ * a reader over `ARTIFACT_REGISTRY.perEntryAttribution` without widening
80
+ * the `CacheStore` port.
81
+ */
82
+ export interface DiagnosisRunnerDeps {
83
+ /**
84
+ * Cache-lookup hook. Receives the artifact path built from the
85
+ * 4-version + model cache key. Plan 06's composition root supplies a
86
+ * reader that parses cached bytes through the Diagnosis Zod schema
87
+ * (T-05-04-01 mitigation). Tests supply a simple fake.
88
+ *
89
+ * Returns `null` on miss; returns the cached `Diagnosis` on hit.
90
+ */
91
+ readonly diagnosisReader: (path: string) => Promise<Diagnosis | null>;
92
+ /**
93
+ * Cache-write hook. Receives the same path as `diagnosisReader` plus
94
+ * the freshly-built `Diagnosis`. Called unconditionally after every
95
+ * successful run (including `refresh: true` — a refreshed call
96
+ * replaces the cached Diagnosis per AI-SPEC §3).
97
+ */
98
+ readonly diagnosisWriter: (path: string, diagnosis: Diagnosis) => Promise<void>;
99
+ /**
100
+ * Attribution loader — invoked once per `.run({...})` with
101
+ * `report.provenance.runId`. Rejection is caught by the runner; the
102
+ * resolved value (including `[]`) is threaded into every
103
+ * `GeneratorContext.judgmentAttributions` (Landmine 11).
104
+ */
105
+ readonly loadAttributions: (runId: string) => Promise<JudgmentAttribution[]>;
106
+ readonly llm: LLMClient | undefined;
107
+ readonly model: ModelId;
108
+ readonly logger: Logger;
109
+ readonly progress: ProgressReporter;
110
+ readonly registry: CardRegistry;
111
+ }
112
+ /**
113
+ * Arguments for a single diagnosis run.
114
+ */
115
+ export interface DiagnosisRunnerRunArgs {
116
+ readonly report: Report;
117
+ readonly versions: VersionedInputs;
118
+ readonly baseline?: Report;
119
+ /** When `true`, bypasses the cache lookup (but still writes on completion). */
120
+ readonly refresh?: boolean;
121
+ }
122
+ /**
123
+ * The diagnosis runner interface. A single `.run()` method returns a
124
+ * fully-assembled `Diagnosis` (or a partial one if some cards degraded).
125
+ */
126
+ export interface DiagnosisRunner {
127
+ run(args: DiagnosisRunnerRunArgs): Promise<Diagnosis>;
128
+ }
129
+ /**
130
+ * Build a `DiagnosisRunner` whose `.run({report, versions, baseline?, refresh?})`
131
+ * produces a `Diagnosis` with cards in registry-order.
132
+ *
133
+ * No module-scope `let` — all state lives in the `deps` closure and per-run
134
+ * local variables (AI-SPEC §3 Pitfall 1).
135
+ */
136
+ export declare function createDiagnosisRunner(deps: DiagnosisRunnerDeps): DiagnosisRunner;