@sanity/ailf 4.6.0 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +50 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +35 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +136 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +153 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +6 -0
- package/dist/_vendor/ailf-core/services/index.js +18 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
- package/dist/_vendor/ailf-core/types/confidence.js +7 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +271 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +19 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
- package/dist/_vendor/ailf-core/types/index.js +15 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/openai-llm-client.js +52 -3
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/cli-program.js +3 -0
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/interpret.d.ts +50 -0
- package/dist/commands/interpret.js +212 -0
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +57 -23
- package/dist/composition-root.js +155 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +9 -0
- package/dist/grader/agent-harness.js +9 -0
- package/dist/grader/common.d.ts +9 -0
- package/dist/grader/common.js +9 -0
- package/dist/grader/index.d.ts +24 -0
- package/dist/grader/index.js +24 -0
- package/dist/grader/knowledge-probe.d.ts +9 -0
- package/dist/grader/knowledge-probe.js +9 -0
- package/dist/grader/literacy.d.ts +9 -0
- package/dist/grader/literacy.js +9 -0
- package/dist/grader/mcp.d.ts +9 -0
- package/dist/grader/mcp.js +9 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +7 -5
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* System prompt for the weakest-area card.
|
|
3
|
+
*
|
|
4
|
+
* Card: weakest-area
|
|
5
|
+
* Model: claude-sonnet-4-6 (routine card)
|
|
6
|
+
* Version: weakest-area@0.1.0
|
|
7
|
+
*
|
|
8
|
+
* Mitigations embedded:
|
|
9
|
+
* - failure-mode #3: confidence inflation on small samples — prompt instructs
|
|
10
|
+
* to hedge when sampleSize < 10; Zod W3 refine enforces at parse time
|
|
11
|
+
* - failure-mode #4: taxonomy drift — full canonical taxonomy enumerated
|
|
12
|
+
* verbatim in this prompt so the LLM picks from a known list
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
|
|
15
|
+
*/
|
|
16
|
+
export const SYSTEM_PROMPT = `You are an AILF evaluation analyst identifying the documentation area most in need of improvement.
|
|
17
|
+
|
|
18
|
+
## Your Output
|
|
19
|
+
|
|
20
|
+
Return a JSON object matching this exact shape:
|
|
21
|
+
{
|
|
22
|
+
"summary": "<1-2 sentence description of the weakest area and why>",
|
|
23
|
+
"area": "<feature area name, e.g. 'schema-deploy'>",
|
|
24
|
+
"dimension": "<MUST be one of the canonical dimensions listed below>",
|
|
25
|
+
"failureMode": "<MUST be from the canonical taxonomy for the chosen dimension>",
|
|
26
|
+
"sampleSize": <number — MUST equal the judgmentCount provided for this area>,
|
|
27
|
+
"confidence": {
|
|
28
|
+
"level": "high" | "medium" | "low",
|
|
29
|
+
"signalsPresent": <number of tasks backing this finding>,
|
|
30
|
+
"derivation": "card-type-specific"
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
## CANONICAL DIMENSIONS AND FAILURE MODES
|
|
35
|
+
|
|
36
|
+
You MUST pick dimension and failureMode from this exact taxonomy. Cross-dimension combinations are invalid (e.g., "security" dimension with "missing-docs" failure mode is rejected).
|
|
37
|
+
|
|
38
|
+
### Literacy family (dimensions: task-completion, code-correctness, doc-coverage)
|
|
39
|
+
Failure modes:
|
|
40
|
+
- missing-docs — relevant doc didn't exist
|
|
41
|
+
- outdated-docs — doc reflects an older API/version
|
|
42
|
+
- incorrect-docs — doc states something factually wrong
|
|
43
|
+
- poor-structure — doc exists but is hard to find or follow
|
|
44
|
+
Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
|
|
45
|
+
|
|
46
|
+
### MCP family (dimensions: mcp-behavior, input-validation, output-correctness, error-handling, security)
|
|
47
|
+
Failure modes:
|
|
48
|
+
- invalid-tool-call — model called tool with wrong args
|
|
49
|
+
- missing-required-param — required parameter omitted
|
|
50
|
+
- extra-param — unexpected extra parameter sent
|
|
51
|
+
- wrong-tool-selected — chose wrong tool for task
|
|
52
|
+
- tool-call-order — tools called in wrong sequence
|
|
53
|
+
- no-tool-call — should have used a tool but didn't
|
|
54
|
+
- schema-mismatch — response did not match expected schema
|
|
55
|
+
- unsafe-operation — operation could cause data loss
|
|
56
|
+
- auth-bypass — security check skipped
|
|
57
|
+
Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
|
|
58
|
+
|
|
59
|
+
### Knowledge-probe family (dimensions: knowledge-probe, factual-correctness, completeness, currency)
|
|
60
|
+
Failure modes:
|
|
61
|
+
- factual-error — stated an incorrect fact
|
|
62
|
+
- out-of-date — used deprecated API or old syntax
|
|
63
|
+
- missing-step — omitted a required step
|
|
64
|
+
- hallucinated-api — invented an API that does not exist
|
|
65
|
+
- wrong-version — used v1 API when v2 was required
|
|
66
|
+
- incomplete-coverage — missed important edge case
|
|
67
|
+
Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
|
|
68
|
+
|
|
69
|
+
### Agent-harness family (dimensions: agent-harness, process-quality, agent-output, tool-usage)
|
|
70
|
+
Failure modes:
|
|
71
|
+
- excessive-loops — agent looped unnecessarily
|
|
72
|
+
- premature-stop — stopped before completing the task
|
|
73
|
+
- incorrect-output — output was wrong or incomplete
|
|
74
|
+
- inefficient-path — completed task but via unnecessary steps
|
|
75
|
+
- assertion-failure — failed a structural assertion check
|
|
76
|
+
Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
|
|
77
|
+
|
|
78
|
+
## Confidence Calibration Rules
|
|
79
|
+
|
|
80
|
+
**CRITICAL:** When sampleSize < 10, you MUST set confidence.level = "low".
|
|
81
|
+
|
|
82
|
+
- sampleSize >= 30 → "high" is appropriate
|
|
83
|
+
- sampleSize >= 10 → "medium" is appropriate
|
|
84
|
+
- sampleSize < 10 → MUST use "low" (small-sample hedge required)
|
|
85
|
+
|
|
86
|
+
In your summary, reflect the confidence level: if "low", include language like "small sample (N=X) — re-run with broader dataset before acting".`;
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Diagnosis card registry — placeholder home for Phase 5 cards.
|
|
3
|
+
*
|
|
4
|
+
* Phase 5 cards declare:
|
|
5
|
+
*
|
|
6
|
+
* export const card = {
|
|
7
|
+
* type, version, schema, generate
|
|
8
|
+
* } satisfies CardDefinition
|
|
9
|
+
*
|
|
10
|
+
* The compound `cardVersion` (VER-01 / D-02) is built from per-card
|
|
11
|
+
* `version` by sorting `${type}@${version}` ascending and joining with
|
|
12
|
+
* `,`. Phase 1 lands the empty registry; Phase 5 registers cards via
|
|
13
|
+
* the composition root, not by mutating this binding.
|
|
14
|
+
*
|
|
15
|
+
* NOTE (Phase 5 / D-06): The runtime `CardRegistry` type used by the
|
|
16
|
+
* engine lives on `services/diagnosis-runner.ts` as
|
|
17
|
+
* `Readonly<Record<CardType, CardGenerator>>`. This file's `cardRegistry`
|
|
18
|
+
* is an intentionally-empty Phase-1 placeholder for Phase-1 contract
|
|
19
|
+
* tests. DO NOT mutate `cardRegistry` or add cards here — the composition
|
|
20
|
+
* root (Plan 06) builds and passes the `CardRegistry` literal into
|
|
21
|
+
* `createDiagnosisRunner(deps)`.
|
|
22
|
+
*
|
|
23
|
+
* @see packages/core/src/services/diagnosis-runner.ts (CardRegistry type)
|
|
24
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
25
|
+
* @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
|
|
26
|
+
* @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02, D-08)
|
|
27
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-06)
|
|
28
|
+
*/
|
|
29
|
+
import type { z } from "zod";
|
|
30
|
+
import type { CardType, DiagnosisCard } from "../../types/diagnosis.js";
|
|
31
|
+
/**
|
|
32
|
+
* Per-card definition. `schema` is the per-card body parser; `generate`
|
|
33
|
+
* is the runner-invoked builder. Phase 5 fills in the
|
|
34
|
+
* `report+attribution+llm` parameter list when card files land — Phase 1
|
|
35
|
+
* keeps the signature minimal so the registry compiles before any cards
|
|
36
|
+
* exist.
|
|
37
|
+
*/
|
|
38
|
+
export interface CardDefinition<TBody = unknown> {
|
|
39
|
+
readonly type: CardType;
|
|
40
|
+
readonly version: string;
|
|
41
|
+
readonly schema: z.ZodType<TBody>;
|
|
42
|
+
readonly generate: () => Promise<DiagnosisCard>;
|
|
43
|
+
}
|
|
44
|
+
/**
|
|
45
|
+
* Phase 1: empty entrypoint. Phase 5 cards do NOT register here; they go through the
|
|
46
|
+
* composition root. The exported binding is a `ReadonlyMap` so
|
|
47
|
+
* downstream consumers cannot mutate it (would re-introduce the vitest
|
|
48
|
+
* worker-leak hazard).
|
|
49
|
+
*/
|
|
50
|
+
// Compile-time typing only: `ReadonlyMap` hides the mutating methods from
// TypeScript callers; the JavaScript module creates the value with `new Map()`.
export declare const cardRegistry: ReadonlyMap<CardType, CardDefinition>;
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Diagnosis card registry — placeholder home for Phase 5 cards.
|
|
3
|
+
*
|
|
4
|
+
* Phase 5 cards declare:
|
|
5
|
+
*
|
|
6
|
+
* export const card = {
|
|
7
|
+
* type, version, schema, generate
|
|
8
|
+
* } satisfies CardDefinition
|
|
9
|
+
*
|
|
10
|
+
* The compound `cardVersion` (VER-01 / D-02) is built from per-card
|
|
11
|
+
* `version` by sorting `${type}@${version}` ascending and joining with
|
|
12
|
+
* `,`. Phase 1 lands the empty registry; Phase 5 registers cards via
|
|
13
|
+
* the composition root, not by mutating this binding.
|
|
14
|
+
*
|
|
15
|
+
* NOTE (Phase 5 / D-06): The runtime `CardRegistry` type used by the
|
|
16
|
+
* engine lives on `services/diagnosis-runner.ts` as
|
|
17
|
+
* `Readonly<Record<CardType, CardGenerator>>`. This file's `cardRegistry`
|
|
18
|
+
* is an intentionally-empty Phase-1 placeholder for Phase-1 contract
|
|
19
|
+
* tests. DO NOT mutate `cardRegistry` or add cards here — the composition
|
|
20
|
+
* root (Plan 06) builds and passes the `CardRegistry` literal into
|
|
21
|
+
* `createDiagnosisRunner(deps)`.
|
|
22
|
+
*
|
|
23
|
+
* @see packages/core/src/services/diagnosis-runner.ts (CardRegistry type)
|
|
24
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
25
|
+
* @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
|
|
26
|
+
* @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02, D-08)
|
|
27
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-06)
|
|
28
|
+
*/
|
|
29
|
+
/**
|
|
30
|
+
* Phase 1: empty entrypoint. Phase 5 cards do NOT register here; they go through the
|
|
31
|
+
* composition root. The exported binding is a `ReadonlyMap` so
|
|
32
|
+
* downstream consumers cannot mutate it (would re-introduce the vitest
|
|
33
|
+
* worker-leak hazard).
|
|
34
|
+
*/
|
|
35
|
+
// NOTE: at runtime this is an ordinary, mutable `Map`. The `ReadonlyMap`
// restriction exists only in the accompanying `.d.ts` typing, so plain
// JavaScript callers are not prevented from calling `.set()` / `.delete()`.
export const cardRegistry = new Map();
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Diagnosis runner — engine entry point (D0048).
|
|
3
|
+
*
|
|
4
|
+
* Phase 5 implements the factory body; Phase 1 shipped `diagnosisVersion` only.
|
|
5
|
+
* `GeneratorContext.judgmentAttributions` is sourced once per `.run({...})` via
|
|
6
|
+
* `deps.loadAttributions(runId)` reading Phase 4's
|
|
7
|
+
* `runs/{runId}/attribution/{entryKey}.json` per-entry artifacts (RESEARCH
|
|
8
|
+
* Landmine 11).
|
|
9
|
+
*
|
|
10
|
+
* @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
|
|
11
|
+
* @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02)
|
|
12
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-RESEARCH.md (Landmine 11)
|
|
13
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-02, D-06, D-10)
|
|
14
|
+
*/
|
|
15
|
+
import type { LLMClient, ModelId } from "../ports/llm-client.js";
|
|
16
|
+
import type { Logger } from "../ports/logger.js";
|
|
17
|
+
import type { ProgressReporter } from "../ports/progress-reporter.js";
|
|
18
|
+
import type { JudgmentAttribution } from "../types/attribution.js";
|
|
19
|
+
import type { CardType, Diagnosis, DiagnosisCard, VersionedInputs } from "../types/diagnosis.js";
|
|
20
|
+
import type { Report } from "../types/index.js";
|
|
21
|
+
/**
|
|
22
|
+
* Bumped when the runner's selection logic, prompt orchestration, or
|
|
23
|
+
* card-set composition changes in a way that should invalidate cached
|
|
24
|
+
* Diagnoses (VER-01 / D-02). Co-located here so the cache-invalidation
|
|
25
|
+
* contract test reads the canonical value.
|
|
26
|
+
*
|
|
27
|
+
* `export const` (never `export let`) — module-scope mutables leak
|
|
28
|
+
* across vitest workers (cross-cutting hazard #2).
|
|
29
|
+
*/
|
|
30
|
+
export declare const diagnosisVersion = "0.1.0";
|
|
31
|
+
/**
|
|
32
|
+
* Per-invocation context threaded into every card generator.
|
|
33
|
+
*
|
|
34
|
+
* `judgmentAttributions` is the Landmine-11 addition — Phase 4 emits
|
|
35
|
+
* per-judgment attribution as per-entry GCS artifacts at
|
|
36
|
+
* `runs/{runId}/attribution/{entryKey}.json`. The runner loads them once
|
|
37
|
+
* per `.run({...})` via `deps.loadAttributions(runId)` and threads the
|
|
38
|
+
* result here. Cards that need attribution inspect this field and return
|
|
39
|
+
* `status: "missing"` when it is `undefined` or empty.
|
|
40
|
+
*/
|
|
41
|
+
export interface GeneratorContext {
|
|
42
|
+
readonly llm: LLMClient | undefined;
|
|
43
|
+
readonly model: ModelId;
|
|
44
|
+
readonly logger: Logger;
|
|
45
|
+
readonly progress: ProgressReporter;
|
|
46
|
+
readonly versions: VersionedInputs;
|
|
47
|
+
readonly runId: string;
|
|
48
|
+
readonly reportId: string;
|
|
49
|
+
readonly baseline?: Report;
|
|
50
|
+
/** Phase-4 attribution array, loaded once per run (Landmine 11). */
|
|
51
|
+
readonly judgmentAttributions?: JudgmentAttribution[];
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Per-card generator function. Pure async function; the runner wraps
|
|
55
|
+
* each invocation in try/catch so generators MUST NOT suppress their
|
|
56
|
+
* own errors — throw freely; the runner owns error translation.
|
|
57
|
+
*/
|
|
58
|
+
export type CardGenerator = (report: Report, ctx: GeneratorContext) => Promise<DiagnosisCard>;
|
|
59
|
+
/**
|
|
60
|
+
* Flat registry of all 8 card types → generator functions. Lives here
|
|
61
|
+
* (NOT in `services/diagnosis/registry.ts`) per CONTEXT D-06 — the
|
|
62
|
+
* Phase-1 `cardRegistry` placeholder stays empty to keep the contract
|
|
63
|
+
* test green; the composition root builds a `CardRegistry` literal and
|
|
64
|
+
* passes it into `createDiagnosisRunner(deps)`.
|
|
65
|
+
*
|
|
66
|
+
* `Readonly<Record<CardType, CardGenerator>>` — TypeScript exhaustiveness
|
|
67
|
+
* ensures all 8 literal `CardType` strings appear in any registry literal
|
|
68
|
+
* (no rogue keys, no silently missing keys).
|
|
69
|
+
*/
|
|
70
|
+
export type CardRegistry = Readonly<Record<CardType, CardGenerator>>;
|
|
71
|
+
/**
|
|
72
|
+
* Dependencies for the diagnosis runner factory.
|
|
73
|
+
*
|
|
74
|
+
* D-02 delta vs. AI-SPEC §3: `cache: CacheStore` is replaced by two
|
|
75
|
+
* narrow callback deps so the engine uses the artifact store directly
|
|
76
|
+
* without needing a `CacheStore.get/set` API that doesn't exist.
|
|
77
|
+
*
|
|
78
|
+
* Landmine-11 addition: `loadAttributions` lets the composition root bind
|
|
79
|
+
* a reader over `ARTIFACT_REGISTRY.perEntryAttribution` without widening
|
|
80
|
+
* the `CacheStore` port.
|
|
81
|
+
*/
|
|
82
|
+
export interface DiagnosisRunnerDeps {
|
|
83
|
+
/**
|
|
84
|
+
* Cache-lookup hook. Receives the artifact path built from the
|
|
85
|
+
* 4-version + model cache key. Plan 06's composition root supplies a
|
|
86
|
+
* reader that parses cached bytes through the Diagnosis Zod schema
|
|
87
|
+
* (T-05-04-01 mitigation). Tests supply a simple fake.
|
|
88
|
+
*
|
|
89
|
+
* Returns `null` on miss; returns the cached `Diagnosis` on hit.
|
|
90
|
+
*/
|
|
91
|
+
readonly diagnosisReader: (path: string) => Promise<Diagnosis | null>;
|
|
92
|
+
/**
|
|
93
|
+
* Cache-write hook. Receives the same path as `diagnosisReader` plus
|
|
94
|
+
* the freshly-built `Diagnosis`. Called unconditionally after every
|
|
95
|
+
* successful run (including `refresh: true` — a refreshed call
|
|
96
|
+
* replaces the cached Diagnosis per AI-SPEC §3).
|
|
97
|
+
*/
|
|
98
|
+
readonly diagnosisWriter: (path: string, diagnosis: Diagnosis) => Promise<void>;
|
|
99
|
+
/**
|
|
100
|
+
* Attribution loader — invoked once per `.run({...})` with
|
|
101
|
+
* `report.provenance.runId`. Rejection is caught by the runner; the
|
|
102
|
+
* resolved value (including `[]`) is threaded into every
|
|
103
|
+
* `GeneratorContext.judgmentAttributions` (Landmine 11).
|
|
104
|
+
*/
|
|
105
|
+
readonly loadAttributions: (runId: string) => Promise<JudgmentAttribution[]>;
|
|
106
|
+
readonly llm: LLMClient | undefined;
|
|
107
|
+
readonly model: ModelId;
|
|
108
|
+
readonly logger: Logger;
|
|
109
|
+
readonly progress: ProgressReporter;
|
|
110
|
+
readonly registry: CardRegistry;
|
|
111
|
+
}
|
|
112
|
+
/**
|
|
113
|
+
* Arguments for a single diagnosis run.
|
|
114
|
+
*/
|
|
115
|
+
export interface DiagnosisRunnerRunArgs {
|
|
116
|
+
readonly report: Report;
|
|
117
|
+
readonly versions: VersionedInputs;
|
|
118
|
+
readonly baseline?: Report;
|
|
119
|
+
/** When `true`, bypasses the cache lookup (but still writes on completion). */
|
|
120
|
+
readonly refresh?: boolean;
|
|
121
|
+
}
|
|
122
|
+
/**
|
|
123
|
+
* The diagnosis runner interface. A single `.run()` method returns a
|
|
124
|
+
* fully-assembled `Diagnosis` (or a partial one if some cards degraded).
|
|
125
|
+
*/
|
|
126
|
+
export interface DiagnosisRunner {
|
|
127
|
+
run(args: DiagnosisRunnerRunArgs): Promise<Diagnosis>;
|
|
128
|
+
}
|
|
129
|
+
/**
|
|
130
|
+
* Build a `DiagnosisRunner` whose `.run({report, versions, baseline?, refresh?})`
|
|
131
|
+
* produces a `Diagnosis` with cards in registry-order.
|
|
132
|
+
*
|
|
133
|
+
* No module-scope `let` — all state lives in the `deps` closure and per-run
|
|
134
|
+
* local variables (AI-SPEC §3 Pitfall 1).
|
|
135
|
+
*/
|
|
136
|
+
export declare function createDiagnosisRunner(deps: DiagnosisRunnerDeps): DiagnosisRunner;
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Diagnosis runner — engine entry point (D0048).
|
|
3
|
+
*
|
|
4
|
+
* Phase 5 implements the factory body; Phase 1 shipped `diagnosisVersion` only.
|
|
5
|
+
* `GeneratorContext.judgmentAttributions` is sourced once per `.run({...})` via
|
|
6
|
+
* `deps.loadAttributions(runId)` reading Phase 4's
|
|
7
|
+
* `runs/{runId}/attribution/{entryKey}.json` per-entry artifacts (RESEARCH
|
|
8
|
+
* Landmine 11).
|
|
9
|
+
*
|
|
10
|
+
* @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
|
|
11
|
+
* @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02)
|
|
12
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-RESEARCH.md (Landmine 11)
|
|
13
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-02, D-06, D-10)
|
|
14
|
+
*/
|
|
15
|
+
import { z } from "zod";
|
|
16
|
+
import { ARTIFACT_REGISTRY, encodeDiagnosisPathVersion, } from "../artifact-registry.js";
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Version constant (Phase 1 / VER-01 / D-02)
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
/**
|
|
21
|
+
* Bumped when the runner's selection logic, prompt orchestration, or
|
|
22
|
+
* card-set composition changes in a way that should invalidate cached
|
|
23
|
+
* Diagnoses (VER-01 / D-02). Co-located here so the cache-invalidation
|
|
24
|
+
* contract test reads the canonical value.
|
|
25
|
+
*
|
|
26
|
+
* `export const` (never `export let`) — module-scope mutables leak
|
|
27
|
+
* across vitest workers (cross-cutting hazard #2).
|
|
28
|
+
*/
|
|
29
|
+
export const diagnosisVersion = "0.1.0";
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
// Private helpers
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
/**
 * Compose the deterministic cache key for one diagnosis run.
 *
 * The key starts with the artifact path returned by
 * `ARTIFACT_REGISTRY.diagnosis.objectPath(runId, reportId, pathVersion)`,
 * where `pathVersion` is `encodeDiagnosisPathVersion(diagnosisVersion,
 * cardVersion)`. Three `::`-separated segments are then appended:
 * `grader=<graderJudgmentsVersion>`, `ensemble=<ensembleVersion>` and
 * `model=<model>`. A change to any of the four version values or to the
 * model id therefore yields a different key (AI-SPEC §3 + D-02).
 *
 * @param report - report; `report.provenance.runId` and `report.id` are read.
 * @param versions - supplies `diagnosisVersion`, `cardVersion`,
 *   `graderJudgmentsVersion` and `ensembleVersion`.
 * @param model - model id interpolated into the final segment.
 * @returns the composed key string.
 */
function buildCacheKey(report, versions, model) {
    const runId = report.provenance.runId;
    const reportId = report.id;
    // diagnosisVersion + cardVersion travel inside the artifact path itself.
    const pathVersion = encodeDiagnosisPathVersion(versions.diagnosisVersion, versions.cardVersion);
    const basePath = ARTIFACT_REGISTRY.diagnosis.objectPath(runId, reportId, pathVersion);
    // The remaining two version axes are carried as explicit key segments.
    const versionSuffix = `grader=${versions.graderJudgmentsVersion}::ensemble=${versions.ensembleVersion}`;
    return `${basePath}::${versionSuffix}::model=${model}`;
}
|
|
49
|
+
/**
 * Invoke a single card generator and always resolve with a card.
 *
 * A successful generator call resolves with whatever the generator
 * produced. Anything the generator throws (or rejects with) is converted
 * into a `status: "degraded"` card instead of propagating:
 *   - `reason` is the error message for `Error` instances, otherwise the
 *     stringified thrown value;
 *   - `parseFailed` is true only when the thrown value is a `z.ZodError`;
 *   - `meta.cardVersion` is `"<cardType>@unknown"` and `meta.generatedAt`
 *     is the current time as an ISO string.
 *
 * @param generator Card generator, called as `generator(report, ctx)`.
 * @param report    Report handed through to the generator.
 * @param ctx       Generator context handed through to the generator.
 * @param cardType  Card type name stamped onto the degraded card.
 * @returns The generator's card, or a degraded card on failure.
 */
async function runOne(generator, report, ctx, cardType) {
    let outcome;
    try {
        outcome = await generator(report, ctx);
    }
    catch (failure) {
        // The real card version is unknown here because the generator never
        // got far enough to report one.
        const stamp = {
            cardVersion: `${cardType}@unknown`,
            generatedAt: new Date().toISOString(),
        };
        const parseFailed = failure instanceof z.ZodError;
        let reason;
        if (failure instanceof Error) {
            reason = failure.message;
        }
        else {
            reason = String(failure);
        }
        outcome = {
            status: "degraded",
            cardType,
            reason,
            parseFailed,
            meta: stamp,
        };
    }
    return outcome;
}
|
|
72
|
+
// ---------------------------------------------------------------------------
|
|
73
|
+
// Factory (AI-SPEC §3 lines 458-523 + D-02 / Landmine-11 deltas)
|
|
74
|
+
// ---------------------------------------------------------------------------
|
|
75
|
+
/**
 * Create a diagnosis runner bound to the supplied dependencies.
 *
 * The returned object exposes `run({ report, versions, baseline?, refresh? })`,
 * which:
 *   1. derives the cache key via `buildCacheKey(report, versions, deps.model)`;
 *   2. unless `refresh` is truthy, returns the value from
 *      `deps.diagnosisReader(key)` when it is not `null`;
 *   3. loads judgment attributions once via `deps.loadAttributions(runId)`;
 *      a failure is logged as a warning and the attributions stay `undefined`;
 *   4. runs every generator in `deps.registry`, in `Object.keys` order,
 *      through `runOne`;
 *   5. keeps the first parse-failed degraded card as-is and replaces every
 *      later one with a `status: "missing"` card
 *      (`reason: "degraded-budget-exceeded"`);
 *   6. writes the assembled Diagnosis via `deps.diagnosisWriter(key, diagnosis)`
 *      and returns it.
 *
 * All state is held in the `deps` closure or in per-run locals; nothing is
 * stored at module scope.
 *
 * @param deps Dependencies read by the runner: `model`, `llm`, `logger`,
 *             `progress`, `registry`, `diagnosisReader`, `diagnosisWriter`
 *             and `loadAttributions`.
 * @returns An object with a single async `run` method.
 */
export function createDiagnosisRunner(deps) {
    /**
     * Best-effort attribution load: a failure is downgraded to a warning and
     * `undefined` so that card generation can still proceed.
     */
    const loadAttributionsSafely = async (report) => {
        try {
            return await deps.loadAttributions(report.provenance.runId);
        }
        catch (failure) {
            deps.logger.warn("diagnosis-runner: loadAttributions failed", {
                runId: report.provenance.runId,
                error: failure instanceof Error ? failure.message : String(failure),
            });
            return undefined;
        }
    };
    const run = async ({ report, versions, baseline, refresh }) => {
        const cacheKey = buildCacheKey(report, versions, deps.model);
        // A refresh skips the read but still overwrites the entry below.
        if (!refresh) {
            const hit = await deps.diagnosisReader(cacheKey);
            if (hit !== null) {
                return hit;
            }
        }
        // Attributions are loaded once per run and shared by every generator.
        const judgmentAttributions = await loadAttributionsSafely(report);
        const ctx = {
            llm: deps.llm,
            model: deps.model,
            logger: deps.logger,
            progress: deps.progress,
            versions,
            // The run id comes from report.provenance, not from the report root.
            runId: report.provenance.runId,
            reportId: report.id,
            judgmentAttributions,
            ...(baseline ? { baseline } : {}),
        };
        const cards = [];
        // Budget of one parse failure per run: the first parse-failed card is
        // kept as "degraded", every later one is demoted to "missing". The
        // generator is always run first; the budget is checked on its result.
        let parseBudgetSpent = false;
        for (const cardType of Object.keys(deps.registry)) {
            const card = await runOne(deps.registry[cardType], report, ctx, cardType);
            const isParseFailure = card.status === "degraded" && card.parseFailed;
            if (!isParseFailure) {
                cards.push(card);
                continue;
            }
            if (!parseBudgetSpent) {
                parseBudgetSpent = true;
                cards.push(card);
                continue;
            }
            deps.logger.warn(`diagnosis-runner: parse-failure budget exceeded for card "${cardType}"; demoting to missing`, { reportId: report.id });
            cards.push({
                status: "missing",
                cardType,
                reason: "degraded-budget-exceeded",
            });
        }
        const diagnosis = {
            runId: report.provenance.runId,
            reportId: report.id,
            inputs: versions,
            cards,
            generatedAt: new Date().toISOString(),
        };
        // Always written, so a refreshed run replaces the cached Diagnosis.
        await deps.diagnosisWriter(cacheKey, diagnosis);
        return diagnosis;
    };
    return { run };
}
|
|
@@ -13,3 +13,9 @@ export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskS
|
|
|
13
13
|
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
|
|
14
14
|
export { buildSlimReportSummary } from "./slim-report-summary.js";
|
|
15
15
|
export { reportToMarkdown, type RenderableReport, } from "./report-to-markdown.js";
|
|
16
|
+
export { createDiagnosisRunner, diagnosisVersion, type CardGenerator, type CardRegistry, type DiagnosisRunner, type DiagnosisRunnerDeps, type DiagnosisRunnerRunArgs, type GeneratorContext, } from "./diagnosis-runner.js";
|
|
17
|
+
export { cardRegistry, type CardDefinition } from "./diagnosis/registry.js";
|
|
18
|
+
export { createLLMClient, type LLMClientAdapters, type LLMClientFactoryConfig, type LLMClientKeys, } from "./llm-client-factory.js";
|
|
19
|
+
export { buildFailureModeRefinement, isFailureModeInDimensionTaxonomy, } from "./diagnosis/card-validators.js";
|
|
20
|
+
export { DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
|
|
21
|
+
export { buildTopRecommendationsPrompt, buildWeakestAreaPrompt, buildLowConfidenceAttributionPrompt, buildDocAttributionSpotlightPrompt, buildRegressionVsBaselinePrompt, buildDocSlugAllowList, } from "./diagnosis/prompt-builders.js";
|
|
@@ -13,3 +13,21 @@ export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskS
|
|
|
13
13
|
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
|
|
14
14
|
export { buildSlimReportSummary } from "./slim-report-summary.js";
|
|
15
15
|
export { reportToMarkdown, } from "./report-to-markdown.js";
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
// Actionability ladder Phase 1 + Phase 5 — diagnosis runner + card registry
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
export { createDiagnosisRunner, diagnosisVersion, } from "./diagnosis-runner.js";
|
|
20
|
+
export { cardRegistry } from "./diagnosis/registry.js";
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Phase 5 — LLM client factory (D-01 hoist)
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
export { createLLMClient, } from "./llm-client-factory.js";
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
// Phase 5 — card validators (D-05 refine helpers)
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
export { buildFailureModeRefinement, isFailureModeInDimensionTaxonomy, } from "./diagnosis/card-validators.js";
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
// Phase 5 Plan 05 — card generators barrel + prompt builders
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
export { DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
|
|
33
|
+
export { buildTopRecommendationsPrompt, buildWeakestAreaPrompt, buildLowConfidenceAttributionPrompt, buildDocAttributionSpotlightPrompt, buildRegressionVsBaselinePrompt, buildDocSlugAllowList, } from "./diagnosis/prompt-builders.js";
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM client factory — hoisted from packages/eval/src/composition-root.ts
|
|
3
|
+
* so packages/api can build a DiagnosisRunner without importing eval (D-01).
|
|
4
|
+
*
|
|
5
|
+
* Adapter CLASSES stay in packages/eval/src/adapters/llm/. Only the factory
|
|
6
|
+
* function lives here. Adapter constructors are injected via `LLMClientAdapters`
|
|
7
|
+
* so core never static-imports vendor SDK code (D0051 invariant / T-05-01-01).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/decisions/D0051-llm-client-port.md
|
|
10
|
+
* @see packages/eval/src/composition-root.ts — call site (updated to use this)
|
|
11
|
+
*/
|
|
12
|
+
import type { LLMClient } from "../ports/llm-client.js";
|
|
13
|
+
import type { Logger } from "../ports/logger.js";
|
|
14
|
+
/**
|
|
15
|
+
* Narrow config slice consumed by the LLM client factory.
|
|
16
|
+
* Does NOT depend on `ResolvedConfig` from packages/eval — only the
|
|
17
|
+
* llmProvider field is needed here.
|
|
18
|
+
*/
|
|
19
|
+
export interface LLMClientFactoryConfig {
|
|
20
|
+
readonly llmProvider?: "anthropic" | "openai";
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Typed key bag passed to `createLLMClient`. The composition root reads
|
|
24
|
+
* env once and supplies values here; the factory stays pure so tests don't
|
|
25
|
+
* have to mutate `process.env`.
|
|
26
|
+
*/
|
|
27
|
+
export interface LLMClientKeys {
|
|
28
|
+
readonly anthropicApiKey?: string;
|
|
29
|
+
readonly openaiApiKey?: string;
|
|
30
|
+
}
|
|
31
|
+
/**
|
|
32
|
+
* Constructor callbacks for adapter classes that live in packages/eval.
|
|
33
|
+
* The eval composition root passes real constructors; tests pass spies.
|
|
34
|
+
*
|
|
35
|
+
* This pattern satisfies T-05-01-01: core never static-imports
|
|
36
|
+
* openai / @anthropic-ai/sdk. The vendor code stays in eval.
|
|
37
|
+
*/
|
|
38
|
+
export interface LLMClientAdapters {
|
|
39
|
+
readonly newAnthropicClient: (opts: {
|
|
40
|
+
apiKey: string;
|
|
41
|
+
logger: Logger;
|
|
42
|
+
}) => LLMClient;
|
|
43
|
+
readonly newOpenAIClient: (opts: {
|
|
44
|
+
apiKey: string;
|
|
45
|
+
logger: Logger;
|
|
46
|
+
}) => LLMClient;
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* Select the LLMClient adapter based on `config.llmProvider` and the
|
|
50
|
+
* supplied API keys. Returns `undefined` when no usable credential is
|
|
51
|
+
* present — `AppContext.llmClient` stays unset and consumers handle that
|
|
52
|
+
* explicitly.
|
|
53
|
+
*
|
|
54
|
+
* Adapters never read `process.env` themselves (per
|
|
55
|
+
* `.claude/rules/typescript.md`); env mapping happens at the call site
|
|
56
|
+
* (typically `createAppContext`).
|
|
57
|
+
*
|
|
58
|
+
* Adapter classes stay in packages/eval; they are passed in via `deps.adapters`
|
|
59
|
+
* so this factory has zero eval imports (D-01 / T-05-01-01).
|
|
60
|
+
*/
|
|
61
|
+
export declare function createLLMClient(config: LLMClientFactoryConfig, keys: LLMClientKeys, deps: {
|
|
62
|
+
logger: Logger;
|
|
63
|
+
adapters: LLMClientAdapters;
|
|
64
|
+
}): LLMClient | undefined;
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM client factory — hoisted from packages/eval/src/composition-root.ts
|
|
3
|
+
* so packages/api can build a DiagnosisRunner without importing eval (D-01).
|
|
4
|
+
*
|
|
5
|
+
* Adapter CLASSES stay in packages/eval/src/adapters/llm/. Only the factory
|
|
6
|
+
* function lives here. Adapter constructors are injected via `LLMClientAdapters`
|
|
7
|
+
* so core never static-imports vendor SDK code (D0051 invariant / T-05-01-01).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/decisions/D0051-llm-client-port.md
|
|
10
|
+
* @see packages/eval/src/composition-root.ts — call site (updated to use this)
|
|
11
|
+
*/
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
// Factory function
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
/**
 * Pick and construct an LLM client from the configured provider and the
 * supplied API keys.
 *
 * Provider resolution:
 *   - `config.llmProvider`, when set, wins;
 *   - otherwise Anthropic is chosen if an Anthropic key is present, then
 *     OpenAI if an OpenAI key is present.
 *
 * Returns `undefined` (after logging) when no provider can be resolved, or
 * when the resolved provider has no matching API key. The function reads
 * no environment variables itself; keys arrive through `keys`, and the
 * adapter constructors arrive through `deps.adapters`.
 *
 * @param config Factory config; only `llmProvider` is read.
 * @param keys   Key bag; `anthropicApiKey` and `openaiApiKey` are read.
 * @param deps   `logger` for diagnostics and `adapters` holding the
 *               `newAnthropicClient` / `newOpenAIClient` constructors.
 * @returns The constructed client, or `undefined` when nothing is wired.
 */
export function createLLMClient(config, keys, deps) {
    const { logger, adapters } = deps;
    const { anthropicApiKey, openaiApiKey } = keys;
    let provider = config.llmProvider;
    if (provider === undefined || provider === null) {
        // Auto-select, preferring Anthropic when both keys are available.
        if (anthropicApiKey) {
            provider = "anthropic";
        }
        else if (openaiApiKey) {
            provider = "openai";
        }
    }
    if (!provider) {
        logger.debug("LLM client: not wired — no Anthropic or OpenAI API key supplied");
        return undefined;
    }
    // Any provider other than "anthropic" is handled by the OpenAI wiring.
    const wiring = provider === "anthropic"
        ? {
            apiKey: anthropicApiKey,
            missingKeyWarning: 'llmProvider="anthropic" but no Anthropic API key supplied — LLMClient not wired',
            selectedMessage: "LLM client: AnthropicLLMClient",
            build: (opts) => adapters.newAnthropicClient(opts),
        }
        : {
            apiKey: openaiApiKey,
            missingKeyWarning: 'llmProvider="openai" but no OpenAI API key supplied — LLMClient not wired',
            selectedMessage: "LLM client: OpenAILLMClient",
            build: (opts) => adapters.newOpenAIClient(opts),
        };
    if (!wiring.apiKey) {
        logger.warn(wiring.missingKeyWarning);
        return undefined;
    }
    logger.debug(wiring.selectedMessage);
    return wiring.build({ apiKey: wiring.apiKey, logger });
}
|
|
@@ -493,8 +493,9 @@ function renderLowScoringJudgments(md, judgments) {
|
|
|
493
493
|
.join("\n");
|
|
494
494
|
md.line(reasonLines);
|
|
495
495
|
md.blank();
|
|
496
|
-
|
|
497
|
-
|
|
496
|
+
const jDocs = j.contextDocs ?? j.canonicalDocs;
|
|
497
|
+
if (jDocs && jDocs.length > 0) {
|
|
498
|
+
const docList = jDocs.map((d) => `\`${d.slug}\``).join(", ");
|
|
498
499
|
md.line(`*Expected docs: ${docList}*`);
|
|
499
500
|
md.blank();
|
|
500
501
|
}
|