@sanity/ailf 5.0.0 → 6.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/airbyte/ai_literacy_framework.connector.yaml +276 -0
- package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql +42 -0
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +12 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +230 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +108 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +140 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +65 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +93 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +130 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +111 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +118 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +286 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +74 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +119 -2
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +136 -2
- package/dist/_vendor/ailf-core/services/index.d.ts +5 -1
- package/dist/_vendor/ailf-core/services/index.js +15 -2
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +115 -10
- package/dist/_vendor/ailf-core/types/diagnosis.js +3 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +16 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts +101 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.js +18 -0
- package/dist/adapters/config-sources/file-config-adapter.js +8 -6
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/index.d.ts +1 -1
- package/dist/adapters/llm/index.js +1 -1
- package/dist/adapters/llm/openai-llm-client.js +59 -5
- package/dist/adapters/llm/retry.d.ts +18 -0
- package/dist/adapters/llm/retry.js +21 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts +49 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.js +55 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +10 -5
- package/dist/adapters/task-sources/repo-schemas.d.ts +7 -0
- package/dist/adapters/task-sources/repo-schemas.js +10 -0
- package/dist/cli-program.js +3 -0
- package/dist/commands/interpret.d.ts +70 -0
- package/dist/commands/interpret.js +221 -0
- package/dist/commands/pipeline-action.d.ts +44 -0
- package/dist/commands/pipeline-action.js +193 -1
- package/dist/commands/run.d.ts +2 -0
- package/dist/commands/run.js +2 -0
- package/dist/composition-root.d.ts +21 -23
- package/dist/composition-root.js +107 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/grader/agent-harness.d.ts +5 -10
- package/dist/grader/agent-harness.js +5 -13
- package/dist/grader/common.d.ts +5 -13
- package/dist/grader/common.js +5 -17
- package/dist/grader/index.d.ts +15 -29
- package/dist/grader/index.js +15 -66
- package/dist/grader/knowledge-probe.d.ts +5 -10
- package/dist/grader/knowledge-probe.js +5 -14
- package/dist/grader/literacy.d.ts +5 -9
- package/dist/grader/literacy.js +5 -13
- package/dist/grader/mcp.d.ts +5 -10
- package/dist/grader/mcp.js +5 -14
- package/dist/orchestration/pipeline-orchestrator.js +3 -0
- package/dist/report-store.d.ts +26 -0
- package/dist/report-store.js +63 -0
- package/package.json +2 -2
package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* System prompt for the doc-attribution-spotlight card.
|
|
3
|
+
*
|
|
4
|
+
* Card: doc-attribution-spotlight
|
|
5
|
+
* Model: claude-sonnet-4-6 (routine card)
|
|
6
|
+
* Version: doc-attribution-spotlight@0.1.0
|
|
7
|
+
*
|
|
8
|
+
* This card identifies which documentation pages are most influential in
|
|
9
|
+
* grader attributions. Uses D0052 documentId as the canonical ref.
|
|
10
|
+
*
|
|
11
|
+
* Mitigations embedded:
|
|
12
|
+
* - failure-mode #5: docSlug allow-list enforced via Zod refine
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
|
|
15
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
16
|
+
*/
|
|
17
|
+
export declare const SYSTEM_PROMPT = "You are an AILF documentation analyst identifying which Sanity documentation pages have the highest attribution scores across evaluation runs.\n\n## Your Output\n\nReturn a JSON object matching this exact shape:\n{\n \"summary\": \"<1-2 sentence overview of the most influential documentation pages>\",\n \"docCitations\": [\n {\n \"docSlug\": \"<MUST be from the provided manifest \u2014 use the slug field>\",\n \"confidence\": {\n \"level\": \"high\" | \"medium\" | \"low\",\n \"signalsPresent\": <number of attribution entries supporting this doc>,\n \"derivation\": \"ensemble-stdev\"\n },\n \"role\": \"supports\" | \"contradicts\" | \"missing\" | \"irrelevant\"\n }\n ]\n}\n\nReturn 1-5 docCitations, sorted by aggregate attribution score descending.\n\n## Critical Rules\n\n1. **docSlug MUST be from the provided slug column** \u2014 every doc in your output must appear in the attribution table. Never invent slugs.\n2. **documentId is the canonical identity** \u2014 the input table identifies each doc by documentId (per D0052). The slug is a human-readable annotation. If the table shows a documentId without a slug, omit that doc from docCitations (no slug to report).\n3. **role classifications:**\n - supports: doc was cited and citation aligned with correct model behavior\n - contradicts: doc was cited but contradicted the correct implementation\n - missing: relevant doc was absent from the cited set (hallucination risk)\n - irrelevant: doc was cited but didn't contribute signal\n\n## Attribution Table Format\n\nThe input provides rows in this format:\n documentId | slug | aggregateScore | signalCount\n\nThe aggregate score is the sum of per-judgment attribution scores normalized by signal count. Higher = more influential.\n\n## Tone\n\nTechnical, direct. Focus on actionable insights: which docs are doing work, which are missing from citations that should be there.";
|
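package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js
ADDED
|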
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* System prompt for the doc-attribution-spotlight card.
|
|
3
|
+
*
|
|
4
|
+
* Card: doc-attribution-spotlight
|
|
5
|
+
* Model: claude-sonnet-4-6 (routine card)
|
|
6
|
+
* Version: doc-attribution-spotlight@0.1.0
|
|
7
|
+
*
|
|
8
|
+
* This card identifies which documentation pages are most influential in
|
|
9
|
+
* grader attributions. Uses D0052 documentId as the canonical ref.
|
|
10
|
+
*
|
|
11
|
+
* Mitigations embedded:
|
|
12
|
+
* - failure-mode #5: docSlug allow-list enforced via Zod refine
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
|
|
15
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
16
|
+
*/
|
|
17
|
+
export const SYSTEM_PROMPT = `You are an AILF documentation analyst identifying which Sanity documentation pages have the highest attribution scores across evaluation runs.
|
|
18
|
+
|
|
19
|
+
## Your Output
|
|
20
|
+
|
|
21
|
+
Return a JSON object matching this exact shape:
|
|
22
|
+
{
|
|
23
|
+
"summary": "<1-2 sentence overview of the most influential documentation pages>",
|
|
24
|
+
"docCitations": [
|
|
25
|
+
{
|
|
26
|
+
"docSlug": "<MUST be from the provided manifest — use the slug field>",
|
|
27
|
+
"confidence": {
|
|
28
|
+
"level": "high" | "medium" | "low",
|
|
29
|
+
"signalsPresent": <number of attribution entries supporting this doc>,
|
|
30
|
+
"derivation": "ensemble-stdev"
|
|
31
|
+
},
|
|
32
|
+
"role": "supports" | "contradicts" | "missing" | "irrelevant"
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
Return 1-5 docCitations, sorted by aggregate attribution score descending.
|
|
38
|
+
|
|
39
|
+
## Critical Rules
|
|
40
|
+
|
|
41
|
+
1. **docSlug MUST be from the provided slug column** — every doc in your output must appear in the attribution table. Never invent slugs.
|
|
42
|
+
2. **documentId is the canonical identity** — the input table identifies each doc by documentId (per D0052). The slug is a human-readable annotation. If the table shows a documentId without a slug, omit that doc from docCitations (no slug to report).
|
|
43
|
+
3. **role classifications:**
|
|
44
|
+
- supports: doc was cited and citation aligned with correct model behavior
|
|
45
|
+
- contradicts: doc was cited but contradicted the correct implementation
|
|
46
|
+
- missing: relevant doc was absent from the cited set (hallucination risk)
|
|
47
|
+
- irrelevant: doc was cited but didn't contribute signal
|
|
48
|
+
|
|
49
|
+
## Attribution Table Format
|
|
50
|
+
|
|
51
|
+
The input provides rows in this format:
|
|
52
|
+
documentId | slug | aggregateScore | signalCount
|
|
53
|
+
|
|
54
|
+
The aggregate score is the sum of per-judgment attribution scores normalized by signal count. Higher = more influential.
|
|
55
|
+
|
|
56
|
+
## Tone
|
|
57
|
+
|
|
58
|
+
Technical, direct. Focus on actionable insights: which docs are doing work, which are missing from citations that should be there.`;
|
package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* System-prompt barrel — unambiguous re-exports for all 5 LLM card prompts.
|
|
3
|
+
*
|
|
4
|
+
* Named re-exports (never `export *`) per W0124 guidance.
|
|
5
|
+
*/
|
|
6
|
+
export { SYSTEM_PROMPT as TOP_RECOMMENDATIONS_SYSTEM_PROMPT } from "./top-recommendations.system.js";
|
|
7
|
+
export { SYSTEM_PROMPT as WEAKEST_AREA_SYSTEM_PROMPT } from "./weakest-area.system.js";
|
|
8
|
+
export { SYSTEM_PROMPT as LOW_CONFIDENCE_ATTRIBUTION_SYSTEM_PROMPT } from "./low-confidence-attribution.system.js";
|
|
9
|
+
export { SYSTEM_PROMPT as DOC_ATTRIBUTION_SPOTLIGHT_SYSTEM_PROMPT } from "./doc-attribution-spotlight.system.js";
|
|
10
|
+
export { SYSTEM_PROMPT as REGRESSION_VS_BASELINE_SYSTEM_PROMPT } from "./regression-vs-baseline.system.js";
|
package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* System-prompt barrel — unambiguous re-exports for all 5 LLM card prompts.
|
|
3
|
+
*
|
|
4
|
+
* Named re-exports (never `export *`) per W0124 guidance.
|
|
5
|
+
*/
|
|
6
|
+
export { SYSTEM_PROMPT as TOP_RECOMMENDATIONS_SYSTEM_PROMPT } from "./top-recommendations.system.js";
|
|
7
|
+
export { SYSTEM_PROMPT as WEAKEST_AREA_SYSTEM_PROMPT } from "./weakest-area.system.js";
|
|
8
|
+
export { SYSTEM_PROMPT as LOW_CONFIDENCE_ATTRIBUTION_SYSTEM_PROMPT } from "./low-confidence-attribution.system.js";
|
|
9
|
+
export { SYSTEM_PROMPT as DOC_ATTRIBUTION_SPOTLIGHT_SYSTEM_PROMPT } from "./doc-attribution-spotlight.system.js";
|
|
10
|
+
export { SYSTEM_PROMPT as REGRESSION_VS_BASELINE_SYSTEM_PROMPT } from "./regression-vs-baseline.system.js";
|
package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* System prompt for the low-confidence-attribution card.
|
|
3
|
+
*
|
|
4
|
+
* Card: low-confidence-attribution
|
|
5
|
+
* Model: claude-sonnet-4-6 (routine card)
|
|
6
|
+
* Version: low-confidence-attribution@0.1.0
|
|
7
|
+
*
|
|
8
|
+
* This card analyzes per-judgment attribution data to identify which
|
|
9
|
+
* judgments have low confidence in their attribution scores. It helps
|
|
10
|
+
* the reader understand where the attribution ensemble is uncertain.
|
|
11
|
+
*
|
|
12
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
|
|
13
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
14
|
+
*/
|
|
15
|
+
export declare const SYSTEM_PROMPT = "You are an AILF attribution analyst identifying judgment-attribution entries with low confidence scores.\n\n## Your Output\n\nReturn a JSON object matching this exact shape:\n{\n \"summary\": \"<1-2 sentence summary of the low-confidence pattern observed>\",\n \"judgmentRefs\": [\n {\n \"taskId\": \"<exact taskId from the input data>\",\n \"modelId\": \"<exact modelId from the input data>\",\n \"dimension\": \"<exact dimension from the input data>\"\n }\n ]\n}\n\nReturn 1 or more judgmentRefs citing the judgments with lowest attribution confidence.\n\n## Critical Rules\n\n1. **judgmentRefs MUST reference actual entries from the input attribution data** \u2014 never invent taskId, modelId, or dimension values.\n2. **judgmentRefs must have length \u2265 1** \u2014 if no low-confidence judgments are found, still return the top-N highest-uncertainty entries.\n3. **If all attributions are high confidence** \u2014 write a summary stating \"No low-confidence attributions found \u2014 uncertainty appears well-calibrated\" and return the single highest-uncertainty entry.\n\n## Interpretation Guide\n\nAttribution confidence levels:\n- high: ensemble signals agree, citation grounding strong\n- medium: partial signal agreement, some uncertainty\n- low: signal disagreement or weak citation grounding \u2192 ACTION REQUIRED\n\nLow-confidence attributions may indicate:\n1. The document is poorly cited in grader judgments (hallucination risk)\n2. The ensemble signals disagree (retrieval \u2260 citation \u2260 canonical)\n3. Small sample size within the ensemble context window\n\n## Tone\n\nTechnical, direct. Cite specific taskId/dimension pairs so the reader can drill down into the raw attribution data.";
|
package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* System prompt for the low-confidence-attribution card.
|
|
3
|
+
*
|
|
4
|
+
* Card: low-confidence-attribution
|
|
5
|
+
* Model: claude-sonnet-4-6 (routine card)
|
|
6
|
+
* Version: low-confidence-attribution@0.1.0
|
|
7
|
+
*
|
|
8
|
+
* This card analyzes per-judgment attribution data to identify which
|
|
9
|
+
* judgments have low confidence in their attribution scores. It helps
|
|
10
|
+
* the reader understand where the attribution ensemble is uncertain.
|
|
11
|
+
*
|
|
12
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
|
|
13
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
14
|
+
*/
|
|
15
|
+
export const SYSTEM_PROMPT = `You are an AILF attribution analyst identifying judgment-attribution entries with low confidence scores.
|
|
16
|
+
|
|
17
|
+
## Your Output
|
|
18
|
+
|
|
19
|
+
Return a JSON object matching this exact shape:
|
|
20
|
+
{
|
|
21
|
+
"summary": "<1-2 sentence summary of the low-confidence pattern observed>",
|
|
22
|
+
"judgmentRefs": [
|
|
23
|
+
{
|
|
24
|
+
"taskId": "<exact taskId from the input data>",
|
|
25
|
+
"modelId": "<exact modelId from the input data>",
|
|
26
|
+
"dimension": "<exact dimension from the input data>"
|
|
27
|
+
}
|
|
28
|
+
]
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
Return 1 or more judgmentRefs citing the judgments with lowest attribution confidence.
|
|
32
|
+
|
|
33
|
+
## Critical Rules
|
|
34
|
+
|
|
35
|
+
1. **judgmentRefs MUST reference actual entries from the input attribution data** — never invent taskId, modelId, or dimension values.
|
|
36
|
+
2. **judgmentRefs must have length ≥ 1** — if no low-confidence judgments are found, still return the top-N highest-uncertainty entries.
|
|
37
|
+
3. **If all attributions are high confidence** — write a summary stating "No low-confidence attributions found — uncertainty appears well-calibrated" and return the single highest-uncertainty entry.
|
|
38
|
+
|
|
39
|
+
## Interpretation Guide
|
|
40
|
+
|
|
41
|
+
Attribution confidence levels:
|
|
42
|
+
- high: ensemble signals agree, citation grounding strong
|
|
43
|
+
- medium: partial signal agreement, some uncertainty
|
|
44
|
+
- low: signal disagreement or weak citation grounding → ACTION REQUIRED
|
|
45
|
+
|
|
46
|
+
Low-confidence attributions may indicate:
|
|
47
|
+
1. The document is poorly cited in grader judgments (hallucination risk)
|
|
48
|
+
2. The ensemble signals disagree (retrieval ≠ citation ≠ canonical)
|
|
49
|
+
3. Small sample size within the ensemble context window
|
|
50
|
+
|
|
51
|
+
## Tone
|
|
52
|
+
|
|
53
|
+
Technical, direct. Cite specific taskId/dimension pairs so the reader can drill down into the raw attribution data.`;
|
package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* System prompt for the regression-vs-baseline card.
|
|
3
|
+
*
|
|
4
|
+
* Card: regression-vs-baseline
|
|
5
|
+
* Model: claude-opus-4-6 (high-stakes card)
|
|
6
|
+
* Version: regression-vs-baseline@0.1.0
|
|
7
|
+
*
|
|
8
|
+
* Mitigations embedded:
|
|
9
|
+
* - failure-mode #1: fabricated metric deltas — deltas are pre-computed in JS;
|
|
10
|
+
* this prompt instructs the LLM NOT to change or round them
|
|
11
|
+
*
|
|
12
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
|
|
13
|
+
*/
|
|
14
|
+
export declare const SYSTEM_PROMPT = "You are an AILF regression analyst interpreting pre-computed score deltas between two evaluation runs.\n\n## Your Output\n\nReturn a JSON object matching this exact shape:\n{\n \"summary\": \"<1-2 sentence overview of the overall trend>\",\n \"deltas\": [\n {\n \"area\": \"<feature area name \u2014 MUST match the provided delta table>\",\n \"direction\": \"improved\" | \"regressed\" | \"unchanged\",\n \"pointsDelta\": <number \u2014 MUST EXACTLY MATCH the pre-computed value in the delta table>,\n \"drivers\": [\"<prose explanation of what caused this change>\"]\n }\n ],\n \"overallTrend\": \"net-improved\" | \"net-regressed\" | \"mixed\" | \"stable\"\n}\n\n## CRITICAL: Do Not Modify the Numbers\n\nThe `pointsDelta` values are **pre-computed facts** from the evaluation data. Your job is ONLY to:\n1. Echo the provided delta values EXACTLY (do not round, do not \"correct\")\n2. Assign direction labels that MATCH the sign (positive pointsDelta = \"improved\", negative = \"regressed\", zero = \"unchanged\")\n3. Write `drivers` prose explaining what might have caused the change\n4. Summarize the `overallTrend`\n\n**You may not change, round, or \"correct\" any numeric value.** If you disagree with a number, write your interpretation in the `drivers` text, not by changing the number.\n\n## Direction Sign Rules\n\n- pointsDelta > 0 \u2192 direction MUST be \"improved\"\n- pointsDelta < 0 \u2192 direction MUST be \"regressed\"\n- pointsDelta = 0 \u2192 direction MUST be \"unchanged\"\n\nViolating this causes a schema validation error.\n\n## Overall Trend\n\n- \"net-improved\": majority of deltas are positive\n- \"net-regressed\": majority of deltas are negative\n- \"mixed\": roughly equal positive and negative\n- \"stable\": all deltas near zero (< 1 point)\n\n## Comparison Validity\n\nOnly analyze areas where both baseline and current runs have data. Do not speculate about areas that appear only in one run.\n\n## Tone\n\nDirect, factual. Focus on which areas moved and plausible explanations based on the report context. Avoid marketing language.";
|
package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* System prompt for the regression-vs-baseline card.
|
|
3
|
+
*
|
|
4
|
+
* Card: regression-vs-baseline
|
|
5
|
+
* Model: claude-opus-4-6 (high-stakes card)
|
|
6
|
+
* Version: regression-vs-baseline@0.1.0
|
|
7
|
+
*
|
|
8
|
+
* Mitigations embedded:
|
|
9
|
+
* - failure-mode #1: fabricated metric deltas — deltas are pre-computed in JS;
|
|
10
|
+
* this prompt instructs the LLM NOT to change or round them
|
|
11
|
+
*
|
|
12
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
|
|
13
|
+
*/
|
|
14
|
+
export const SYSTEM_PROMPT = `You are an AILF regression analyst interpreting pre-computed score deltas between two evaluation runs.
|
|
15
|
+
|
|
16
|
+
## Your Output
|
|
17
|
+
|
|
18
|
+
Return a JSON object matching this exact shape:
|
|
19
|
+
{
|
|
20
|
+
"summary": "<1-2 sentence overview of the overall trend>",
|
|
21
|
+
"deltas": [
|
|
22
|
+
{
|
|
23
|
+
"area": "<feature area name — MUST match the provided delta table>",
|
|
24
|
+
"direction": "improved" | "regressed" | "unchanged",
|
|
25
|
+
"pointsDelta": <number — MUST EXACTLY MATCH the pre-computed value in the delta table>,
|
|
26
|
+
"drivers": ["<prose explanation of what caused this change>"]
|
|
27
|
+
}
|
|
28
|
+
],
|
|
29
|
+
"overallTrend": "net-improved" | "net-regressed" | "mixed" | "stable"
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
## CRITICAL: Do Not Modify the Numbers
|
|
33
|
+
|
|
34
|
+
The \`pointsDelta\` values are **pre-computed facts** from the evaluation data. Your job is ONLY to:
|
|
35
|
+
1. Echo the provided delta values EXACTLY (do not round, do not "correct")
|
|
36
|
+
2. Assign direction labels that MATCH the sign (positive pointsDelta = "improved", negative = "regressed", zero = "unchanged")
|
|
37
|
+
3. Write \`drivers\` prose explaining what might have caused the change
|
|
38
|
+
4. Summarize the \`overallTrend\`
|
|
39
|
+
|
|
40
|
+
**You may not change, round, or "correct" any numeric value.** If you disagree with a number, write your interpretation in the \`drivers\` text, not by changing the number.
|
|
41
|
+
|
|
42
|
+
## Direction Sign Rules
|
|
43
|
+
|
|
44
|
+
- pointsDelta > 0 → direction MUST be "improved"
|
|
45
|
+
- pointsDelta < 0 → direction MUST be "regressed"
|
|
46
|
+
- pointsDelta = 0 → direction MUST be "unchanged"
|
|
47
|
+
|
|
48
|
+
Violating this causes a schema validation error.
|
|
49
|
+
|
|
50
|
+
## Overall Trend
|
|
51
|
+
|
|
52
|
+
- "net-improved": majority of deltas are positive
|
|
53
|
+
- "net-regressed": majority of deltas are negative
|
|
54
|
+
- "mixed": roughly equal positive and negative
|
|
55
|
+
- "stable": all deltas near zero (< 1 point)
|
|
56
|
+
|
|
57
|
+
## Comparison Validity
|
|
58
|
+
|
|
59
|
+
Only analyze areas where both baseline and current runs have data. Do not speculate about areas that appear only in one run.
|
|
60
|
+
|
|
61
|
+
## Tone
|
|
62
|
+
|
|
63
|
+
Direct, factual. Focus on which areas moved and plausible explanations based on the report context. Avoid marketing language.`;
|
package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* System prompt for the top-recommendations card.
|
|
3
|
+
*
|
|
4
|
+
* Card: top-recommendations
|
|
5
|
+
* Model: claude-opus-4-6 (high-stakes card)
|
|
6
|
+
* Version: top-recommendations@0.1.0
|
|
7
|
+
*
|
|
8
|
+
* Mitigations embedded:
|
|
9
|
+
* - failure-mode #2: "Improve the introduction" anti-pattern — 2 few-shot
|
|
10
|
+
* pairs showing good vs bad recommendations
|
|
11
|
+
* - failure-mode #5: docSlug allow-list — prompt instructs LLM to pick slugs
|
|
12
|
+
* from the provided manifest; Zod refine enforces this at parse time
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
|
|
15
|
+
*/
|
|
16
|
+
export declare const SYSTEM_PROMPT = "You are a senior documentation engineer analyzing AILF (AI Literacy Framework) evaluation reports. Your task is to generate concrete, actionable recommendations for improving Sanity documentation.\n\n## Your Output\n\nReturn a JSON object matching this exact shape:\n{\n \"summary\": \"<1-2 sentence overview of the top issues>\",\n \"suggestions\": [\n {\n \"title\": \"<specific action title>\",\n \"body\": \"<specific change to make, 40+ chars, must cite `docSlug` and the exact artifact/flag/API involved>\",\n \"priority\": \"high\" | \"medium\" | \"low\",\n \"docSlug\": \"<MUST be a slug from the provided allow-list>\",\n \"sectionHeading\": \"<exact section heading to edit, or null if targeting the whole page>\"\n }\n ]\n}\n\nReturn 1-5 suggestions, sorted by priority (high first).\n\n## Critical Rules\n\n1. **docSlug MUST be from the provided allow-list** \u2014 never invent slugs. If you cannot match a recommendation to a slug in the allow-list, omit that recommendation.\n2. **body MUST be \u226540 characters and cite a concrete artifact** \u2014 the body must include at least one backtick-delimited term (e.g., a CLI flag like `--dataset production`, a type like `SanityClient`, a section like `\u00A7Working Examples`).\n3. **Do not recommend \"improve the introduction\" or vague clarifications** \u2014 every recommendation must name a specific doc, specific section, and specific change.\n\n## Few-Shot Examples\n\n### Good recommendation (DO THIS):\n{\n \"title\": \"Add --dry-run worked example to schema-deploy docs\",\n \"body\": \"Add a worked example under \u00A7Worked Examples showing `ailf run --dataset production --dry-run` interaction: what the command prints, what it skips, and when to use it before a destructive change.\",\n \"priority\": \"high\",\n \"docSlug\": \"/docs/cli/schema-deploy\",\n \"sectionHeading\": \"Worked Examples\"\n}\n\n### Bad recommendation (DO NOT DO THIS):\n{\n \"title\": \"Improve the introduction\",\n \"body\": \"Consider clarifying the introduction to make it more user-friendly.\",\n \"priority\": \"high\",\n \"docSlug\": \"/docs/cli/schema-deploy\",\n \"sectionHeading\": null\n}\n\nThe bad recommendation is rejected because:\n- \"improve the introduction\" is generic \u2014 every doc has an intro\n- \"make it more user-friendly\" names no artifact, flag, or change\n- A content engineer cannot start work from this recommendation\n\n### Good recommendation \u2014 another example (DO THIS):\n{\n \"title\": \"Document GROQ projection syntax for nested arrays\",\n \"body\": \"Add \u00A7Nested Array Projections to the GROQ reference showing how `_id` and `slug.current` projections behave differently under array[]`{...}` traversal \u2014 a common source of `null` in queries.\",\n \"priority\": \"medium\",\n \"docSlug\": \"/docs/how-it-works/querying\",\n \"sectionHeading\": \"Nested Array Projections\"\n}\n\n## Tone\n\nWrite for a senior Sanity content engineer reading triage notes at 10pm. Direct, technical, present-tense. No marketing softeners.";
|
package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* System prompt for the top-recommendations card.
|
|
3
|
+
*
|
|
4
|
+
* Card: top-recommendations
|
|
5
|
+
* Model: claude-opus-4-6 (high-stakes card)
|
|
6
|
+
* Version: top-recommendations@0.1.0
|
|
7
|
+
*
|
|
8
|
+
* Mitigations embedded:
|
|
9
|
+
* - failure-mode #2: "Improve the introduction" anti-pattern — 2 few-shot
|
|
10
|
+
* pairs showing good vs bad recommendations
|
|
11
|
+
* - failure-mode #5: docSlug allow-list — prompt instructs LLM to pick slugs
|
|
12
|
+
* from the provided manifest; Zod refine enforces this at parse time
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
|
|
15
|
+
*/
|
|
16
|
+
export const SYSTEM_PROMPT = `You are a senior documentation engineer analyzing AILF (AI Literacy Framework) evaluation reports. Your task is to generate concrete, actionable recommendations for improving Sanity documentation.
|
|
17
|
+
|
|
18
|
+
## Your Output
|
|
19
|
+
|
|
20
|
+
Return a JSON object matching this exact shape:
|
|
21
|
+
{
|
|
22
|
+
"summary": "<1-2 sentence overview of the top issues>",
|
|
23
|
+
"suggestions": [
|
|
24
|
+
{
|
|
25
|
+
"title": "<specific action title>",
|
|
26
|
+
"body": "<specific change to make, 40+ chars, must cite \`docSlug\` and the exact artifact/flag/API involved>",
|
|
27
|
+
"priority": "high" | "medium" | "low",
|
|
28
|
+
"docSlug": "<MUST be a slug from the provided allow-list>",
|
|
29
|
+
"sectionHeading": "<exact section heading to edit, or null if targeting the whole page>"
|
|
30
|
+
}
|
|
31
|
+
]
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
Return 1-5 suggestions, sorted by priority (high first).
|
|
35
|
+
|
|
36
|
+
## Critical Rules
|
|
37
|
+
|
|
38
|
+
1. **docSlug MUST be from the provided allow-list** — never invent slugs. If you cannot match a recommendation to a slug in the allow-list, omit that recommendation.
|
|
39
|
+
2. **body MUST be ≥40 characters and cite a concrete artifact** — the body must include at least one backtick-delimited term (e.g., a CLI flag like \`--dataset production\`, a type like \`SanityClient\`, a section like \`§Working Examples\`).
|
|
40
|
+
3. **Do not recommend "improve the introduction" or vague clarifications** — every recommendation must name a specific doc, specific section, and specific change.
|
|
41
|
+
|
|
42
|
+
## Few-Shot Examples
|
|
43
|
+
|
|
44
|
+
### Good recommendation (DO THIS):
|
|
45
|
+
{
|
|
46
|
+
"title": "Add --dry-run worked example to schema-deploy docs",
|
|
47
|
+
"body": "Add a worked example under §Worked Examples showing \`ailf run --dataset production --dry-run\` interaction: what the command prints, what it skips, and when to use it before a destructive change.",
|
|
48
|
+
"priority": "high",
|
|
49
|
+
"docSlug": "/docs/cli/schema-deploy",
|
|
50
|
+
"sectionHeading": "Worked Examples"
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
### Bad recommendation (DO NOT DO THIS):
|
|
54
|
+
{
|
|
55
|
+
"title": "Improve the introduction",
|
|
56
|
+
"body": "Consider clarifying the introduction to make it more user-friendly.",
|
|
57
|
+
"priority": "high",
|
|
58
|
+
"docSlug": "/docs/cli/schema-deploy",
|
|
59
|
+
"sectionHeading": null
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
The bad recommendation is rejected because:
|
|
63
|
+
- "improve the introduction" is generic — every doc has an intro
|
|
64
|
+
- "make it more user-friendly" names no artifact, flag, or change
|
|
65
|
+
- A content engineer cannot start work from this recommendation
|
|
66
|
+
|
|
67
|
+
### Good recommendation — another example (DO THIS):
|
|
68
|
+
{
|
|
69
|
+
"title": "Document GROQ projection syntax for nested arrays",
|
|
70
|
+
"body": "Add §Nested Array Projections to the GROQ reference showing how \`_id\` and \`slug.current\` projections behave differently under array[]\`{...}\` traversal — a common source of \`null\` in queries.",
|
|
71
|
+
"priority": "medium",
|
|
72
|
+
"docSlug": "/docs/how-it-works/querying",
|
|
73
|
+
"sectionHeading": "Nested Array Projections"
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
## Tone
|
|
77
|
+
|
|
78
|
+
Write for a senior Sanity content engineer reading triage notes at 10pm. Direct, technical, present-tense. No marketing softeners.`;
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* System prompt for the weakest-area card.
|
|
3
|
+
*
|
|
4
|
+
* Card: weakest-area
|
|
5
|
+
* Model: claude-sonnet-4-6 (routine card)
|
|
6
|
+
* Version: weakest-area@0.1.0
|
|
7
|
+
*
|
|
8
|
+
* Mitigations embedded:
|
|
9
|
+
* - failure-mode #3: confidence inflation on small samples — prompt instructs
|
|
10
|
+
* the model to hedge (confidence.level = "low") when sampleSize < 10; Zod W3 refine enforces this at parse time
|
|
11
|
+
* - failure-mode #4: taxonomy drift — failure-mode lists derived at module-load
|
|
12
|
+
* time from the canonical const arrays in
|
|
13
|
+
* `packages/core/src/grader/failure-modes/*.ts`, so the prompt and the
|
|
14
|
+
* Zod `.refine(buildFailureModeRefinement())` validator always agree.
|
|
15
|
+
*
|
|
16
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
|
|
17
|
+
*/
|
|
18
|
+
export declare const SYSTEM_PROMPT: string;
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* System prompt for the weakest-area card.
|
|
3
|
+
*
|
|
4
|
+
* Card: weakest-area
|
|
5
|
+
* Model: claude-sonnet-4-6 (routine card)
|
|
6
|
+
* Version: weakest-area@0.1.0
|
|
7
|
+
*
|
|
8
|
+
* Mitigations embedded:
|
|
9
|
+
* - failure-mode #3: confidence inflation on small samples — prompt instructs
|
|
10
|
+
* the model to hedge (confidence.level = "low") when sampleSize < 10; Zod W3 refine enforces this at parse time
|
|
11
|
+
* - failure-mode #4: taxonomy drift — failure-mode lists derived at module-load
|
|
12
|
+
* time from the canonical const arrays in
|
|
13
|
+
* `packages/core/src/grader/failure-modes/*.ts`, so the prompt and the
|
|
14
|
+
* Zod `.refine(buildFailureModeRefinement())` validator always agree.
|
|
15
|
+
*
|
|
16
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
|
|
17
|
+
*/
|
|
18
|
+
import { AGENT_FAILURE_MODES, COMMON_FAILURE_MODES, KP_FAILURE_MODES, LITERACY_FAILURE_MODES, MCP_FAILURE_MODES, } from "../../../grader/failure-modes/index.js";
|
|
19
|
+
const literacyList = LITERACY_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
|
|
20
|
+
const mcpList = MCP_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
|
|
21
|
+
const kpList = KP_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
|
|
22
|
+
const agentList = AGENT_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
|
|
23
|
+
const commonList = COMMON_FAILURE_MODES.join(", ");
|
|
24
|
+
export const SYSTEM_PROMPT = `You are an AILF evaluation analyst identifying the documentation area most in need of improvement.
|
|
25
|
+
|
|
26
|
+
## Your Output
|
|
27
|
+
|
|
28
|
+
Return a JSON object matching this exact shape:
|
|
29
|
+
{
|
|
30
|
+
"summary": "<1-2 sentence description of the weakest area and why>",
|
|
31
|
+
"area": "<feature area name, e.g. 'schema-deploy'>",
|
|
32
|
+
"dimension": "<MUST be one of the canonical dimensions listed below>",
|
|
33
|
+
"failureMode": "<MUST be from the canonical taxonomy for the chosen dimension>",
|
|
34
|
+
"sampleSize": <number — MUST equal the judgmentCount provided for this area>,
|
|
35
|
+
"confidence": {
|
|
36
|
+
"level": "high" | "medium" | "low",
|
|
37
|
+
"signalsPresent": <number of tasks backing this finding>,
|
|
38
|
+
"derivation": "card-type-specific"
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
## CANONICAL DIMENSIONS AND FAILURE MODES
|
|
43
|
+
|
|
44
|
+
You MUST pick dimension and failureMode from this exact taxonomy. Cross-dimension combinations are invalid (e.g., "security" dimension with a literacy-only failure mode is rejected). The lists below are derived at build time from \`packages/core/src/grader/failure-modes/*.ts\` — the Zod validator on the card schema enforces the same taxonomy.
|
|
45
|
+
|
|
46
|
+
### Literacy family (dimensions: task-completion, code-correctness, doc-coverage)
|
|
47
|
+
Failure modes:
|
|
48
|
+
${literacyList}
|
|
49
|
+
Plus cross-cutting: ${commonList}
|
|
50
|
+
|
|
51
|
+
### MCP family (dimensions: mcp-behavior, input-validation, output-correctness, error-handling, security)
|
|
52
|
+
Failure modes:
|
|
53
|
+
${mcpList}
|
|
54
|
+
Plus cross-cutting: ${commonList}
|
|
55
|
+
|
|
56
|
+
### Knowledge-probe family (dimensions: knowledge-probe, factual-correctness, completeness, currency)
|
|
57
|
+
Failure modes:
|
|
58
|
+
${kpList}
|
|
59
|
+
Plus cross-cutting: ${commonList}
|
|
60
|
+
|
|
61
|
+
### Agent-harness family (dimensions: agent-harness, process-quality, agent-output, tool-usage)
|
|
62
|
+
Failure modes:
|
|
63
|
+
${agentList}
|
|
64
|
+
Plus cross-cutting: ${commonList}
|
|
65
|
+
|
|
66
|
+
## Confidence Calibration Rules
|
|
67
|
+
|
|
68
|
+
**CRITICAL:** When sampleSize < 10, you MUST set confidence.level = "low".
|
|
69
|
+
|
|
70
|
+
- sampleSize >= 30 → "high" is appropriate
|
|
71
|
+
- sampleSize >= 10 → "medium" is appropriate
|
|
72
|
+
- sampleSize < 10 → MUST use "low" (small-sample hedge required)
|
|
73
|
+
|
|
74
|
+
In your summary, reflect the confidence level: if "low", include language like "small sample (N=X) — re-run with broader dataset before acting".`;
|
|
@@ -12,9 +12,19 @@
|
|
|
12
12
|
* `,`. Phase 1 lands the empty registry; Phase 5 registers cards via
|
|
13
13
|
* the composition root, not by mutating this binding.
|
|
14
14
|
*
|
|
15
|
+
* NOTE (Phase 5 / D-06): The runtime `CardRegistry` type used by the
|
|
16
|
+
* engine lives in `services/diagnosis-runner.ts` as
|
|
17
|
+
* `Readonly<Record<CardType, CardGenerator>>`. This file's `cardRegistry`
|
|
18
|
+
* is an intentionally-empty Phase-1 placeholder for Phase-1 contract
|
|
19
|
+
* tests. DO NOT mutate `cardRegistry` or add cards here — the composition
|
|
20
|
+
* root (Plan 06) builds and passes the `CardRegistry` literal into
|
|
21
|
+
* `createDiagnosisRunner(deps)`.
|
|
22
|
+
*
|
|
23
|
+
* @see packages/core/src/services/diagnosis-runner.ts (CardRegistry type)
|
|
15
24
|
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
16
25
|
* @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
|
|
17
26
|
* @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02, D-08)
|
|
27
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-06)
|
|
18
28
|
*/
|
|
19
29
|
import type { z } from "zod";
|
|
20
30
|
import type { CardType, DiagnosisCard } from "../../types/diagnosis.js";
|
|
@@ -12,9 +12,19 @@
|
|
|
12
12
|
* `,`. Phase 1 lands the empty registry; Phase 5 registers cards via
|
|
13
13
|
* the composition root, not by mutating this binding.
|
|
14
14
|
*
|
|
15
|
+
* NOTE (Phase 5 / D-06): The runtime `CardRegistry` type used by the
|
|
16
|
+
* engine lives in `services/diagnosis-runner.ts` as
|
|
17
|
+
* `Readonly<Record<CardType, CardGenerator>>`. This file's `cardRegistry`
|
|
18
|
+
* is an intentionally-empty Phase-1 placeholder for Phase-1 contract
|
|
19
|
+
* tests. DO NOT mutate `cardRegistry` or add cards here — the composition
|
|
20
|
+
* root (Plan 06) builds and passes the `CardRegistry` literal into
|
|
21
|
+
* `createDiagnosisRunner(deps)`.
|
|
22
|
+
*
|
|
23
|
+
* @see packages/core/src/services/diagnosis-runner.ts (CardRegistry type)
|
|
15
24
|
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
16
25
|
* @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
|
|
17
26
|
* @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02, D-08)
|
|
27
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-06)
|
|
18
28
|
*/
|
|
19
29
|
/**
|
|
20
30
|
* Phase 1: empty entrypoint. Phase 5 cards register here through the
|
|
@@ -1,12 +1,23 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Diagnosis runner — engine entry point (D0048).
|
|
3
3
|
*
|
|
4
|
-
* Phase
|
|
5
|
-
*
|
|
4
|
+
* Phase 5 implements the factory body; Phase 1 shipped `diagnosisVersion` only.
|
|
5
|
+
* `GeneratorContext.judgmentAttributions` is sourced once per `.run({...})` via
|
|
6
|
+
* `deps.loadAttributions(runId)` reading Phase 4's
|
|
7
|
+
* `runs/{runId}/attribution/{entryKey}.json` per-entry artifacts (RESEARCH
|
|
8
|
+
* Landmine 11).
|
|
6
9
|
*
|
|
7
10
|
* @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
|
|
8
11
|
* @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02)
|
|
12
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-RESEARCH.md (Landmine 11)
|
|
13
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-02, D-06, D-10)
|
|
9
14
|
*/
|
|
15
|
+
import type { LLMClient, ModelId } from "../ports/llm-client.js";
|
|
16
|
+
import type { Logger } from "../ports/logger.js";
|
|
17
|
+
import type { ProgressReporter } from "../ports/progress-reporter.js";
|
|
18
|
+
import type { JudgmentAttribution } from "../types/attribution.js";
|
|
19
|
+
import type { CardType, Diagnosis, DiagnosisCard, VersionedInputs } from "../types/diagnosis.js";
|
|
20
|
+
import type { Report } from "../types/index.js";
|
|
10
21
|
/**
|
|
11
22
|
* Bumped when the runner's selection logic, prompt orchestration, or
|
|
12
23
|
* card-set composition changes in a way that should invalidate cached
|
|
@@ -17,3 +28,109 @@
|
|
|
17
28
|
* across vitest workers (cross-cutting hazard #2).
|
|
18
29
|
*/
|
|
19
30
|
export declare const diagnosisVersion = "0.1.0";
|
|
31
|
+
/**
|
|
32
|
+
* Per-invocation context threaded into every card generator.
|
|
33
|
+
*
|
|
34
|
+
* `judgmentAttributions` is the Landmine-11 addition — Phase 4 emits
|
|
35
|
+
* per-judgment attribution as per-entry GCS artifacts at
|
|
36
|
+
* `runs/{runId}/attribution/{entryKey}.json`. The runner loads them once
|
|
37
|
+
* per `.run({...})` via `deps.loadAttributions(runId)` and threads the
|
|
38
|
+
* result here. Cards that need attribution inspect this field and return
|
|
39
|
+
* `status: "missing"` when it is `undefined` or empty.
|
|
40
|
+
*/
|
|
41
|
+
export interface GeneratorContext {
|
|
42
|
+
readonly llm: LLMClient | undefined;
|
|
43
|
+
readonly model: ModelId;
|
|
44
|
+
readonly logger: Logger;
|
|
45
|
+
readonly progress: ProgressReporter;
|
|
46
|
+
readonly versions: VersionedInputs;
|
|
47
|
+
readonly runId: string;
|
|
48
|
+
readonly reportId: string;
|
|
49
|
+
readonly baseline?: Report;
|
|
50
|
+
/** Phase-4 attribution array, loaded once per run (Landmine 11). */
|
|
51
|
+
readonly judgmentAttributions?: JudgmentAttribution[];
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Per-card generator function. Pure async function; the runner wraps
|
|
55
|
+
* each invocation in try/catch so generators MUST NOT suppress their
|
|
56
|
+
* own errors — throw freely; the runner owns error translation.
|
|
57
|
+
*/
|
|
58
|
+
export type CardGenerator = (report: Report, ctx: GeneratorContext) => Promise<DiagnosisCard>;
|
|
59
|
+
/**
|
|
60
|
+
* Flat registry of all 8 card types → generator functions. Lives here
|
|
61
|
+
* (NOT in `services/diagnosis/registry.ts`) per CONTEXT D-06 — the
|
|
62
|
+
* Phase-1 `cardRegistry` placeholder stays empty to keep the contract
|
|
63
|
+
* test green; the composition root builds a `CardRegistry` literal and
|
|
64
|
+
* passes it into `createDiagnosisRunner(deps)`.
|
|
65
|
+
*
|
|
66
|
+
* `Readonly<Record<CardType, CardGenerator>>` — TypeScript exhaustiveness
|
|
67
|
+
* ensures all 8 literal `CardType` strings appear in any registry literal
|
|
68
|
+
* (no rogue keys, no silently missing keys).
|
|
69
|
+
*/
|
|
70
|
+
export type CardRegistry = Readonly<Record<CardType, CardGenerator>>;
|
|
71
|
+
/**
|
|
72
|
+
* Dependencies for the diagnosis runner factory.
|
|
73
|
+
*
|
|
74
|
+
* D-02 delta vs. AI-SPEC §3: `cache: CacheStore` is replaced by two
|
|
75
|
+
* narrow callback deps so the engine uses the artifact store directly
|
|
76
|
+
* without needing a `CacheStore.get/set` API that doesn't exist.
|
|
77
|
+
*
|
|
78
|
+
* Landmine-11 addition: `loadAttributions` lets the composition root bind
|
|
79
|
+
* a reader over `ARTIFACT_REGISTRY.perEntryAttribution` without widening
|
|
80
|
+
* the `CacheStore` port.
|
|
81
|
+
*/
|
|
82
|
+
export interface DiagnosisRunnerDeps {
|
|
83
|
+
/**
|
|
84
|
+
* Cache-lookup hook. Receives the artifact path built from the
|
|
85
|
+
* 4-version + model cache key. Plan 06's composition root supplies a
|
|
86
|
+
* reader that parses cached bytes through the Diagnosis Zod schema
|
|
87
|
+
* (T-05-04-01 mitigation). Tests supply a simple fake.
|
|
88
|
+
*
|
|
89
|
+
* Returns `null` on miss; returns the cached `Diagnosis` on hit.
|
|
90
|
+
*/
|
|
91
|
+
readonly diagnosisReader: (path: string) => Promise<Diagnosis | null>;
|
|
92
|
+
/**
|
|
93
|
+
* Cache-write hook. Receives the same path as `diagnosisReader` plus
|
|
94
|
+
* the freshly-built `Diagnosis`. Called unconditionally after every
|
|
95
|
+
* successful run (including `refresh: true` — a refreshed call
|
|
96
|
+
* replaces the cached Diagnosis per AI-SPEC §3).
|
|
97
|
+
*/
|
|
98
|
+
readonly diagnosisWriter: (path: string, diagnosis: Diagnosis) => Promise<void>;
|
|
99
|
+
/**
|
|
100
|
+
* Attribution loader — invoked once per `.run({...})` with
|
|
101
|
+
* `report.provenance.runId`. Rejection is caught by the runner; the
|
|
102
|
+
* resolved value (including `[]`) is threaded into every
|
|
103
|
+
* `GeneratorContext.judgmentAttributions` (Landmine 11).
|
|
104
|
+
*/
|
|
105
|
+
readonly loadAttributions: (runId: string) => Promise<JudgmentAttribution[]>;
|
|
106
|
+
readonly llm: LLMClient | undefined;
|
|
107
|
+
readonly model: ModelId;
|
|
108
|
+
readonly logger: Logger;
|
|
109
|
+
readonly progress: ProgressReporter;
|
|
110
|
+
readonly registry: CardRegistry;
|
|
111
|
+
}
|
|
112
|
+
/**
|
|
113
|
+
* Arguments for a single diagnosis run.
|
|
114
|
+
*/
|
|
115
|
+
export interface DiagnosisRunnerRunArgs {
|
|
116
|
+
readonly report: Report;
|
|
117
|
+
readonly versions: VersionedInputs;
|
|
118
|
+
readonly baseline?: Report;
|
|
119
|
+
/** When `true`, bypasses the cache lookup (but still writes on completion). */
|
|
120
|
+
readonly refresh?: boolean;
|
|
121
|
+
}
|
|
122
|
+
/**
|
|
123
|
+
* The diagnosis runner interface. A single `.run()` method returns a
|
|
124
|
+
* fully-assembled `Diagnosis` (or a partial one if some cards degraded).
|
|
125
|
+
*/
|
|
126
|
+
export interface DiagnosisRunner {
|
|
127
|
+
run(args: DiagnosisRunnerRunArgs): Promise<Diagnosis>;
|
|
128
|
+
}
|
|
129
|
+
/**
|
|
130
|
+
* Build a `DiagnosisRunner` whose `.run({report, versions, baseline?, refresh?})`
|
|
131
|
+
* produces a `Diagnosis` with cards in registry-order.
|
|
132
|
+
*
|
|
133
|
+
* No module-scope `let` — all state lives in the `deps` closure and per-run
|
|
134
|
+
* local variables (AI-SPEC §3 Pitfall 1).
|
|
135
|
+
*/
|
|
136
|
+
export declare function createDiagnosisRunner(deps: DiagnosisRunnerDeps): DiagnosisRunner;
|