@sanity/ailf 5.0.0 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. package/config/diagnosis-cards.ts +318 -0
  2. package/config/models.ts +12 -0
  3. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
  4. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
  5. package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
  6. package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
  7. package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
  8. package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
  9. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
  10. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
  11. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
  12. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
  13. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
  14. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
  15. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  16. package/dist/_vendor/ailf-core/index.js +4 -0
  17. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
  18. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
  19. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
  20. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
  21. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
  22. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
  23. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
  24. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
  25. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
  26. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
  27. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
  28. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
  29. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
  30. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
  31. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
  32. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
  33. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
  34. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
  35. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
  36. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
  37. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
  38. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
  39. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
  40. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
  41. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
  42. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
  43. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
  44. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
  45. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
  46. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
  47. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
  48. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
  49. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
  50. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
  51. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
  52. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
  53. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
  54. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
  55. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
  56. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
  57. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +10 -0
  58. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +10 -0
  59. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +119 -2
  60. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +136 -2
  61. package/dist/_vendor/ailf-core/services/index.d.ts +5 -1
  62. package/dist/_vendor/ailf-core/services/index.js +15 -2
  63. package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
  64. package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
  65. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +112 -10
  66. package/dist/_vendor/ailf-core/types/diagnosis.js +3 -1
  67. package/dist/_vendor/ailf-core/types/index.d.ts +1 -1
  68. package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
  69. package/dist/adapters/llm/fake-llm-client.js +38 -1
  70. package/dist/adapters/llm/openai-llm-client.js +52 -3
  71. package/dist/cli-program.js +3 -0
  72. package/dist/commands/interpret.d.ts +50 -0
  73. package/dist/commands/interpret.js +212 -0
  74. package/dist/composition-root.d.ts +21 -23
  75. package/dist/composition-root.js +107 -41
  76. package/dist/config/diagnosis-cards.ts +318 -0
  77. package/dist/config/models.ts +12 -0
  78. package/dist/grader/agent-harness.d.ts +5 -10
  79. package/dist/grader/agent-harness.js +5 -13
  80. package/dist/grader/common.d.ts +5 -13
  81. package/dist/grader/common.js +5 -17
  82. package/dist/grader/index.d.ts +15 -29
  83. package/dist/grader/index.js +15 -66
  84. package/dist/grader/knowledge-probe.d.ts +5 -10
  85. package/dist/grader/knowledge-probe.js +5 -14
  86. package/dist/grader/literacy.d.ts +5 -9
  87. package/dist/grader/literacy.js +5 -13
  88. package/dist/grader/mcp.d.ts +5 -10
  89. package/dist/grader/mcp.js +5 -14
  90. package/package.json +2 -2
@@ -0,0 +1,86 @@
1
+ /**
2
+ * System prompt for the weakest-area card.
3
+ *
4
+ * Card: weakest-area
5
+ * Model: claude-sonnet-4-6 (routine card)
6
+ * Version: weakest-area@0.1.0
7
+ *
8
+ * Mitigations embedded:
9
+ * - failure-mode #3: confidence inflation on small samples — prompt instructs
10
+ * to hedge when sampleSize < 10; Zod W3 refine enforces at parse time
11
+ * - failure-mode #4: taxonomy drift — full canonical taxonomy enumerated
12
+ * verbatim in this prompt so the LLM picks from a known list
13
+ *
14
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
15
+ */
16
+ export const SYSTEM_PROMPT = `You are an AILF evaluation analyst identifying the documentation area most in need of improvement.
17
+
18
+ ## Your Output
19
+
20
+ Return a JSON object matching this exact shape:
21
+ {
22
+ "summary": "<1-2 sentence description of the weakest area and why>",
23
+ "area": "<feature area name, e.g. 'schema-deploy'>",
24
+ "dimension": "<MUST be one of the canonical dimensions listed below>",
25
+ "failureMode": "<MUST be from the canonical taxonomy for the chosen dimension>",
26
+ "sampleSize": <number — MUST equal the judgmentCount provided for this area>,
27
+ "confidence": {
28
+ "level": "high" | "medium" | "low",
29
+ "signalsPresent": <number of tasks backing this finding>,
30
+ "derivation": "card-type-specific"
31
+ }
32
+ }
33
+
34
+ ## CANONICAL DIMENSIONS AND FAILURE MODES
35
+
36
+ You MUST pick dimension and failureMode from this exact taxonomy. Cross-dimension combinations are invalid (e.g., "security" dimension with "missing-docs" failure mode is rejected).
37
+
38
+ ### Literacy family (dimensions: task-completion, code-correctness, doc-coverage)
39
+ Failure modes:
40
+ - missing-docs — relevant doc didn't exist
41
+ - outdated-docs — doc reflects an older API/version
42
+ - incorrect-docs — doc states something factually wrong
43
+ - poor-structure — doc exists but is hard to find or follow
44
+ Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
45
+
46
+ ### MCP family (dimensions: mcp-behavior, input-validation, output-correctness, error-handling, security)
47
+ Failure modes:
48
+ - invalid-tool-call — model called tool with wrong args
49
+ - missing-required-param — required parameter omitted
50
+ - extra-param — unexpected extra parameter sent
51
+ - wrong-tool-selected — chose wrong tool for task
52
+ - tool-call-order — tools called in wrong sequence
53
+ - no-tool-call — should have used a tool but didn't
54
+ - schema-mismatch — response did not match expected schema
55
+ - unsafe-operation — operation could cause data loss
56
+ - auth-bypass — security check skipped
57
+ Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
58
+
59
+ ### Knowledge-probe family (dimensions: knowledge-probe, factual-correctness, completeness, currency)
60
+ Failure modes:
61
+ - factual-error — stated an incorrect fact
62
+ - out-of-date — used deprecated API or old syntax
63
+ - missing-step — omitted a required step
64
+ - hallucinated-api — invented an API that does not exist
65
+ - wrong-version — used v1 API when v2 was required
66
+ - incomplete-coverage — missed important edge case
67
+ Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
68
+
69
+ ### Agent-harness family (dimensions: agent-harness, process-quality, agent-output, tool-usage)
70
+ Failure modes:
71
+ - excessive-loops — agent looped unnecessarily
72
+ - premature-stop — stopped before completing the task
73
+ - incorrect-output — output was wrong or incomplete
74
+ - inefficient-path — completed task but via unnecessary steps
75
+ - assertion-failure — failed a structural assertion check
76
+ Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
77
+
78
+ ## Confidence Calibration Rules
79
+
80
+ **CRITICAL:** When sampleSize < 10, you MUST set confidence.level = "low".
81
+
82
+ - sampleSize >= 30 → "high" is appropriate
83
+ - sampleSize >= 10 → "medium" is appropriate
84
+ - sampleSize < 10 → MUST use "low" (small-sample hedge required)
85
+
86
+ In your summary, reflect the confidence level: if "low", include language like "small sample (N=X) — re-run with broader dataset before acting".`;
@@ -12,9 +12,19 @@
12
12
  * `,`. Phase 1 lands the empty registry; Phase 5 registers cards via
13
13
  * the composition root, not by mutating this binding.
14
14
  *
15
+ * NOTE (Phase 5 / D-06): The runtime `CardRegistry` type used by the
16
+ * engine lives in `services/diagnosis-runner.ts` as
17
+ * `Readonly<Record<CardType, CardGenerator>>`. This file's `cardRegistry`
18
+ * is an intentionally-empty Phase-1 placeholder for Phase-1 contract
19
+ * tests. DO NOT mutate `cardRegistry` or add cards here — the composition
20
+ * root (Plan 06) builds and passes the `CardRegistry` literal into
21
+ * `createDiagnosisRunner(deps)`.
22
+ *
23
+ * @see packages/core/src/services/diagnosis-runner.ts (CardRegistry type)
15
24
  * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
16
25
  * @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
17
26
  * @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02, D-08)
27
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-06)
18
28
  */
19
29
  import type { z } from "zod";
20
30
  import type { CardType, DiagnosisCard } from "../../types/diagnosis.js";
@@ -12,9 +12,19 @@
12
12
  * `,`. Phase 1 lands the empty registry; Phase 5 registers cards via
13
13
  * the composition root, not by mutating this binding.
14
14
  *
15
+ * NOTE (Phase 5 / D-06): The runtime `CardRegistry` type used by the
16
+ * engine lives in `services/diagnosis-runner.ts` as
17
+ * `Readonly<Record<CardType, CardGenerator>>`. This file's `cardRegistry`
18
+ * is an intentionally-empty Phase-1 placeholder for Phase-1 contract
19
+ * tests. DO NOT mutate `cardRegistry` or add cards here — the composition
20
+ * root (Plan 06) builds and passes the `CardRegistry` literal into
21
+ * `createDiagnosisRunner(deps)`.
22
+ *
23
+ * @see packages/core/src/services/diagnosis-runner.ts (CardRegistry type)
15
24
  * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
16
25
  * @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
17
26
  * @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02, D-08)
27
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-06)
18
28
  */
19
29
  /**
20
30
  * Phase 1: empty entrypoint. Phase 5 cards register here through the
@@ -1,12 +1,23 @@
1
1
  /**
2
2
  * Diagnosis runner — engine entry point (D0048).
3
3
  *
4
- * Phase 1 lands the version constant only; the runner factory + cache
5
- * lookup land in Phase 5.
4
+ * Phase 5 implements the factory body; Phase 1 shipped `diagnosisVersion` only.
5
+ * `GeneratorContext.judgmentAttributions` is sourced once per `.run({...})` via
6
+ * `deps.loadAttributions(runId)` reading Phase 4's
7
+ * `runs/{runId}/attribution/{entryKey}.json` per-entry artifacts (RESEARCH
8
+ * Landmine 11).
6
9
  *
7
10
  * @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
8
11
  * @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02)
12
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-RESEARCH.md (Landmine 11)
13
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-02, D-06, D-10)
9
14
  */
15
+ import type { LLMClient, ModelId } from "../ports/llm-client.js";
16
+ import type { Logger } from "../ports/logger.js";
17
+ import type { ProgressReporter } from "../ports/progress-reporter.js";
18
+ import type { JudgmentAttribution } from "../types/attribution.js";
19
+ import type { CardType, Diagnosis, DiagnosisCard, VersionedInputs } from "../types/diagnosis.js";
20
+ import type { Report } from "../types/index.js";
10
21
  /**
11
22
  * Bumped when the runner's selection logic, prompt orchestration, or
12
23
  * card-set composition changes in a way that should invalidate cached
@@ -17,3 +28,109 @@
17
28
  * across vitest workers (cross-cutting hazard #2).
18
29
  */
19
30
  export declare const diagnosisVersion = "0.1.0";
31
+ /**
32
+ * Per-invocation context threaded into every card generator.
33
+ *
34
+ * `judgmentAttributions` is the Landmine-11 addition — Phase 4 emits
35
+ * per-judgment attribution as per-entry GCS artifacts at
36
+ * `runs/{runId}/attribution/{entryKey}.json`. The runner loads them once
37
+ * per `.run({...})` via `deps.loadAttributions(runId)` and threads the
38
+ * result here. Cards that need attribution inspect this field and return
39
+ * `status: "missing"` when it is `undefined` or empty.
40
+ */
41
+ export interface GeneratorContext {
42
+ readonly llm: LLMClient | undefined;
43
+ readonly model: ModelId;
44
+ readonly logger: Logger;
45
+ readonly progress: ProgressReporter;
46
+ readonly versions: VersionedInputs;
47
+ readonly runId: string;
48
+ readonly reportId: string;
49
+ readonly baseline?: Report;
50
+ /** Phase-4 attribution array, loaded once per run (Landmine 11). */
51
+ readonly judgmentAttributions?: JudgmentAttribution[];
52
+ }
53
+ /**
54
+ * Per-card generator function. Pure async function; the runner wraps
55
+ * each invocation in try/catch so generators MUST NOT suppress their
56
+ * own errors — throw freely; the runner owns error translation.
57
+ */
58
+ export type CardGenerator = (report: Report, ctx: GeneratorContext) => Promise<DiagnosisCard>;
59
+ /**
60
+ * Flat registry of all 8 card types → generator functions. Lives here
61
+ * (NOT in `services/diagnosis/registry.ts`) per CONTEXT D-06 — the
62
+ * Phase-1 `cardRegistry` placeholder stays empty to keep the contract
63
+ * test green; the composition root builds a `CardRegistry` literal and
64
+ * passes it into `createDiagnosisRunner(deps)`.
65
+ *
66
+ * `Readonly<Record<CardType, CardGenerator>>` — TypeScript exhaustiveness
67
+ * ensures all 8 literal `CardType` strings appear in any registry literal
68
+ * (no rogue keys, no silently missing keys).
69
+ */
70
+ export type CardRegistry = Readonly<Record<CardType, CardGenerator>>;
71
+ /**
72
+ * Dependencies for the diagnosis runner factory.
73
+ *
74
+ * D-02 delta vs. AI-SPEC §3: `cache: CacheStore` is replaced by two
75
+ * narrow callback deps so the engine uses the artifact store directly
76
+ * without needing a `CacheStore.get/set` API that doesn't exist.
77
+ *
78
+ * Landmine-11 addition: `loadAttributions` lets the composition root bind
79
+ * a reader over `ARTIFACT_REGISTRY.perEntryAttribution` without widening
80
+ * the `CacheStore` port.
81
+ */
82
+ export interface DiagnosisRunnerDeps {
83
+ /**
84
+ * Cache-lookup hook. Receives the artifact path built from the
85
+ * 4-version + model cache key. Plan 06's composition root supplies a
86
+ * reader that parses cached bytes through the Diagnosis Zod schema
87
+ * (T-05-04-01 mitigation). Tests supply a simple fake.
88
+ *
89
+ * Returns `null` on miss; returns the cached `Diagnosis` on hit.
90
+ */
91
+ readonly diagnosisReader: (path: string) => Promise<Diagnosis | null>;
92
+ /**
93
+ * Cache-write hook. Receives the same path as `diagnosisReader` plus
94
+ * the freshly-built `Diagnosis`. Called unconditionally after every
95
+ * successful run (including `refresh: true` — a refreshed call
96
+ * replaces the cached Diagnosis per AI-SPEC §3).
97
+ */
98
+ readonly diagnosisWriter: (path: string, diagnosis: Diagnosis) => Promise<void>;
99
+ /**
100
+ * Attribution loader — invoked once per `.run({...})` with
101
+ * `report.provenance.runId`. Rejection is caught by the runner; the
102
+ * resolved value (including `[]`) is threaded into every
103
+ * `GeneratorContext.judgmentAttributions` (Landmine 11).
104
+ */
105
+ readonly loadAttributions: (runId: string) => Promise<JudgmentAttribution[]>;
106
+ readonly llm: LLMClient | undefined;
107
+ readonly model: ModelId;
108
+ readonly logger: Logger;
109
+ readonly progress: ProgressReporter;
110
+ readonly registry: CardRegistry;
111
+ }
112
+ /**
113
+ * Arguments for a single diagnosis run.
114
+ */
115
+ export interface DiagnosisRunnerRunArgs {
116
+ readonly report: Report;
117
+ readonly versions: VersionedInputs;
118
+ readonly baseline?: Report;
119
+ /** When `true`, bypasses the cache lookup (but still writes on completion). */
120
+ readonly refresh?: boolean;
121
+ }
122
+ /**
123
+ * The diagnosis runner interface. A single `.run()` method returns a
124
+ * fully-assembled `Diagnosis` (or a partial one if some cards degraded).
125
+ */
126
+ export interface DiagnosisRunner {
127
+ run(args: DiagnosisRunnerRunArgs): Promise<Diagnosis>;
128
+ }
129
+ /**
130
+ * Build a `DiagnosisRunner` whose `.run({report, versions, baseline?, refresh?})`
131
+ * produces a `Diagnosis` with cards in registry-order.
132
+ *
133
+ * No module-scope `let` — all state lives in the `deps` closure and per-run
134
+ * local variables (AI-SPEC §3 Pitfall 1).
135
+ */
136
+ export declare function createDiagnosisRunner(deps: DiagnosisRunnerDeps): DiagnosisRunner;
@@ -1,12 +1,22 @@
1
1
  /**
2
2
  * Diagnosis runner — engine entry point (D0048).
3
3
  *
4
- * Phase 1 lands the version constant only; the runner factory + cache
5
- * lookup land in Phase 5.
4
+ * Phase 5 implements the factory body; Phase 1 shipped `diagnosisVersion` only.
5
+ * `GeneratorContext.judgmentAttributions` is sourced once per `.run({...})` via
6
+ * `deps.loadAttributions(runId)` reading Phase 4's
7
+ * `runs/{runId}/attribution/{entryKey}.json` per-entry artifacts (RESEARCH
8
+ * Landmine 11).
6
9
  *
7
10
  * @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
8
11
  * @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02)
12
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-RESEARCH.md (Landmine 11)
13
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-02, D-06, D-10)
9
14
  */
15
+ import { z } from "zod";
16
+ import { ARTIFACT_REGISTRY, encodeDiagnosisPathVersion, } from "../artifact-registry.js";
17
+ // ---------------------------------------------------------------------------
18
+ // Version constant (Phase 1 / VER-01 / D-02)
19
+ // ---------------------------------------------------------------------------
10
20
  /**
11
21
  * Bumped when the runner's selection logic, prompt orchestration, or
12
22
  * card-set composition changes in a way that should invalidate cached
@@ -17,3 +27,127 @@
17
27
  * across vitest workers (cross-cutting hazard #2).
18
28
  */
19
29
  export const diagnosisVersion = "0.1.0";
30
+ // ---------------------------------------------------------------------------
31
+ // Private helpers
32
+ // ---------------------------------------------------------------------------
33
/**
 * Derive the deterministic cache key for one diagnosis.
 *
 * The key starts with the artifact path returned by
 * `ARTIFACT_REGISTRY.diagnosis.objectPath(...)`, which is built from the run
 * id, the report id and the encoded `diagnosisVersion` + `cardVersion` pair.
 * The grader-judgments version, the ensemble version and the model id are
 * then appended as `::`-separated `name=value` segments, so changing any one
 * of those inputs yields a different key.
 *
 * @param report   Report being diagnosed; `report.provenance.runId` and
 *                 `report.id` are read.
 * @param versions Version bag; `diagnosisVersion`, `cardVersion`,
 *                 `graderJudgmentsVersion` and `ensembleVersion` are read.
 * @param model    Model identifier embedded in the trailing segment.
 * @returns The composed cache key string.
 */
function buildCacheKey(report, versions, model) {
    const runId = report.provenance.runId;
    const reportId = report.id;
    const pathVersion = encodeDiagnosisPathVersion(versions.diagnosisVersion, versions.cardVersion);
    const basePath = ARTIFACT_REGISTRY.diagnosis.objectPath(runId, reportId, pathVersion);
    // The base path covers two version axes; the tail covers the other two
    // plus the model identity.
    const versionTail = `grader=${versions.graderJudgmentsVersion}::ensemble=${versions.ensembleVersion}::model=${model}`;
    return `${basePath}::${versionTail}`;
}
49
/**
 * Invoke a single card generator and convert any failure into a card.
 *
 * On success the generator's resolved card is returned unchanged. If the
 * generator throws or rejects, a card with `status: "degraded"` is returned
 * instead of propagating the error:
 *   - `reason` is the error's `message`, or `String(value)` for non-Error throws;
 *   - `parseFailed` is `true` only when the thrown value is a `z.ZodError`;
 *   - `meta.cardVersion` is `"<cardType>@unknown"` and `meta.generatedAt` is
 *     the current time as an ISO-8601 string.
 *
 * @param generator Card generator, called as `generator(report, ctx)`.
 * @param report    Report forwarded to the generator.
 * @param ctx       Generator context forwarded to the generator.
 * @param cardType  Card type recorded on the degraded card.
 * @returns The generated card, or a degraded card describing the failure.
 */
async function runOne(generator, report, ctx, cardType) {
    try {
        const card = await generator(report, ctx);
        return card;
    }
    catch (thrown) {
        const generatedAt = new Date().toISOString();
        const parseFailed = thrown instanceof z.ZodError;
        const reason = thrown instanceof Error ? thrown.message : String(thrown);
        return {
            status: "degraded",
            cardType,
            reason,
            parseFailed,
            meta: {
                cardVersion: `${cardType}@unknown`,
                generatedAt,
            },
        };
    }
}
72
+ // ---------------------------------------------------------------------------
73
+ // Factory (AI-SPEC §3 lines 458-523 + D-02 / Landmine-11 deltas)
74
+ // ---------------------------------------------------------------------------
75
/**
 * Create a diagnosis runner bound to `deps`.
 *
 * The returned object exposes a single `run({ report, versions, baseline?, refresh? })`
 * method which:
 *   1. builds the cache key and, unless `refresh` is truthy, returns the value
 *      from `deps.diagnosisReader` when it is not `null`;
 *   2. calls `deps.loadAttributions(runId)` once; a rejection is logged as a
 *      warning and the attributions are passed on as `undefined`;
 *   3. runs every generator in `deps.registry` sequentially, in key order;
 *   4. applies the parse-failure budget to each result (see below);
 *   5. writes the assembled diagnosis through `deps.diagnosisWriter` and
 *      returns it.
 *
 * Parse-failure budget: the first card that comes back `degraded` with a
 * truthy `parseFailed` is kept as-is. Every later such card is replaced by a
 * `missing` card with reason `"degraded-budget-exceeded"`. The generator is
 * always executed; the budget is applied to its result, never beforehand.
 *
 * All state is held in `deps` and in per-run locals; nothing is kept at
 * module scope.
 *
 * @param deps Runner dependencies: `diagnosisReader`, `diagnosisWriter`,
 *             `loadAttributions`, `llm`, `model`, `logger`, `progress`,
 *             `registry`.
 * @returns An object with an async `run` method resolving to the diagnosis.
 */
export function createDiagnosisRunner(deps) {
    const run = async ({ report, versions, baseline, refresh }) => {
        const cachePath = buildCacheKey(report, versions, deps.model);

        // A refresh skips the lookup but still overwrites the entry at the end.
        if (!refresh) {
            const hit = await deps.diagnosisReader(cachePath);
            if (hit !== null) {
                return hit;
            }
        }

        // Attributions are loaded once per run and shared by all generators.
        let judgmentAttributions;
        try {
            judgmentAttributions = await deps.loadAttributions(report.provenance.runId);
        }
        catch (loadError) {
            deps.logger.warn("diagnosis-runner: loadAttributions failed", {
                runId: report.provenance.runId,
                error: loadError instanceof Error ? loadError.message : String(loadError),
            });
            judgmentAttributions = undefined;
        }

        const ctx = {
            llm: deps.llm,
            model: deps.model,
            logger: deps.logger,
            progress: deps.progress,
            versions,
            runId: report.provenance.runId, // taken from provenance, not report.runId
            reportId: report.id,
            judgmentAttributions,
        };
        if (baseline) {
            ctx.baseline = baseline;
        }

        const cards = [];
        let budgetSpent = false;
        for (const cardType of Object.keys(deps.registry)) {
            const card = await runOne(deps.registry[cardType], report, ctx, cardType);
            const isParseFailure = card.status === "degraded" && card.parseFailed;

            if (!isParseFailure) {
                cards.push(card);
                continue;
            }
            if (!budgetSpent) {
                // First parse failure: keep the degraded card and spend the budget.
                budgetSpent = true;
                cards.push(card);
                continue;
            }
            // Any further parse failure is demoted to a missing card.
            deps.logger.warn(`diagnosis-runner: parse-failure budget exceeded for card "${cardType}"; demoting to missing`, { reportId: report.id });
            cards.push({
                status: "missing",
                cardType,
                reason: "degraded-budget-exceeded",
            });
        }

        const diagnosis = {
            runId: report.provenance.runId,
            reportId: report.id,
            inputs: versions,
            cards,
            generatedAt: new Date().toISOString(),
        };
        // Written on every completed run, including refreshed ones.
        await deps.diagnosisWriter(cachePath, diagnosis);
        return diagnosis;
    };
    return { run };
}
@@ -13,5 +13,9 @@ export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskS
13
13
  export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
14
14
  export { buildSlimReportSummary } from "./slim-report-summary.js";
15
15
  export { reportToMarkdown, type RenderableReport, } from "./report-to-markdown.js";
16
- export { diagnosisVersion } from "./diagnosis-runner.js";
16
+ export { createDiagnosisRunner, diagnosisVersion, type CardGenerator, type CardRegistry, type DiagnosisRunner, type DiagnosisRunnerDeps, type DiagnosisRunnerRunArgs, type GeneratorContext, } from "./diagnosis-runner.js";
17
17
  export { cardRegistry, type CardDefinition } from "./diagnosis/registry.js";
18
+ export { createLLMClient, type LLMClientAdapters, type LLMClientFactoryConfig, type LLMClientKeys, } from "./llm-client-factory.js";
19
+ export { buildFailureModeRefinement, isFailureModeInDimensionTaxonomy, } from "./diagnosis/card-validators.js";
20
+ export { DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
21
+ export { buildTopRecommendationsPrompt, buildWeakestAreaPrompt, buildLowConfidenceAttributionPrompt, buildDocAttributionSpotlightPrompt, buildRegressionVsBaselinePrompt, buildDocSlugAllowList, } from "./diagnosis/prompt-builders.js";
@@ -14,7 +14,20 @@ export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resol
14
14
  export { buildSlimReportSummary } from "./slim-report-summary.js";
15
15
  export { reportToMarkdown, } from "./report-to-markdown.js";
16
16
  // ---------------------------------------------------------------------------
17
- // Actionability ladder Phase 1 — diagnosis runner + card registry
17
+ // Actionability ladder Phase 1 + Phase 5 — diagnosis runner + card registry
18
18
  // ---------------------------------------------------------------------------
19
- export { diagnosisVersion } from "./diagnosis-runner.js";
19
+ export { createDiagnosisRunner, diagnosisVersion, } from "./diagnosis-runner.js";
20
20
  export { cardRegistry } from "./diagnosis/registry.js";
21
+ // ---------------------------------------------------------------------------
22
+ // Phase 5 — LLM client factory (D-01 hoist)
23
+ // ---------------------------------------------------------------------------
24
+ export { createLLMClient, } from "./llm-client-factory.js";
25
+ // ---------------------------------------------------------------------------
26
+ // Phase 5 — card validators (D-05 refine helpers)
27
+ // ---------------------------------------------------------------------------
28
+ export { buildFailureModeRefinement, isFailureModeInDimensionTaxonomy, } from "./diagnosis/card-validators.js";
29
+ // ---------------------------------------------------------------------------
30
+ // Phase 5 Plan 05 — card generators barrel + prompt builders
31
+ // ---------------------------------------------------------------------------
32
+ export { DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
33
+ export { buildTopRecommendationsPrompt, buildWeakestAreaPrompt, buildLowConfidenceAttributionPrompt, buildDocAttributionSpotlightPrompt, buildRegressionVsBaselinePrompt, buildDocSlugAllowList, } from "./diagnosis/prompt-builders.js";
@@ -0,0 +1,64 @@
1
+ /**
2
+ * LLM client factory — hoisted from packages/eval/src/composition-root.ts
3
+ * so packages/api can build a DiagnosisRunner without importing eval (D-01).
4
+ *
5
+ * Adapter CLASSES stay in packages/eval/src/adapters/llm/. Only the factory
6
+ * function lives here. Adapter constructors are injected via `LLMClientAdapters`
7
+ * so core never static-imports vendor SDK code (D0051 invariant / T-05-01-01).
8
+ *
9
+ * @see docs/decisions/D0051-llm-client-port.md
10
+ * @see packages/eval/src/composition-root.ts — call site (updated to use this)
11
+ */
12
+ import type { LLMClient } from "../ports/llm-client.js";
13
+ import type { Logger } from "../ports/logger.js";
14
+ /**
15
+ * Narrow config slice consumed by the LLM client factory.
16
+ * Does NOT depend on `ResolvedConfig` from packages/eval — only the
17
+ * llmProvider field is needed here.
18
+ */
19
+ export interface LLMClientFactoryConfig {
20
+ readonly llmProvider?: "anthropic" | "openai";
21
+ }
22
+ /**
23
+ * Typed key bag passed to `createLLMClient`. The composition root reads
24
+ * env once and supplies values here; the factory stays pure so tests don't
25
+ * have to mutate `process.env`.
26
+ */
27
+ export interface LLMClientKeys {
28
+ readonly anthropicApiKey?: string;
29
+ readonly openaiApiKey?: string;
30
+ }
31
+ /**
32
+ * Constructor callbacks for adapter classes that live in packages/eval.
33
+ * The eval composition root passes real constructors; tests pass spies.
34
+ *
35
+ * This pattern satisfies T-05-01-01: core never static-imports
36
+ * openai / @anthropic-ai/sdk. The vendor code stays in eval.
37
+ */
38
+ export interface LLMClientAdapters {
39
+ readonly newAnthropicClient: (opts: {
40
+ apiKey: string;
41
+ logger: Logger;
42
+ }) => LLMClient;
43
+ readonly newOpenAIClient: (opts: {
44
+ apiKey: string;
45
+ logger: Logger;
46
+ }) => LLMClient;
47
+ }
48
+ /**
49
+ * Select the LLMClient adapter based on `config.llmProvider` and the
50
+ * supplied API keys. Returns `undefined` when no usable credential is
51
+ * present — `AppContext.llmClient` stays unset and consumers handle that
52
+ * explicitly.
53
+ *
54
+ * Adapters never read `process.env` themselves (per
55
+ * `.claude/rules/typescript.md`); env mapping happens at the call site
56
+ * (typically `createAppContext`).
57
+ *
58
+ * Adapter classes stay in packages/eval; they are passed in via `deps.adapters`
59
+ * so this factory has zero eval imports (D-01 / T-05-01-01).
60
+ */
61
+ export declare function createLLMClient(config: LLMClientFactoryConfig, keys: LLMClientKeys, deps: {
62
+ logger: Logger;
63
+ adapters: LLMClientAdapters;
64
+ }): LLMClient | undefined;
@@ -0,0 +1,54 @@
1
+ /**
2
+ * LLM client factory — hoisted from packages/eval/src/composition-root.ts
3
+ * so packages/api can build a DiagnosisRunner without importing eval (D-01).
4
+ *
5
+ * Adapter CLASSES stay in packages/eval/src/adapters/llm/. Only the factory
6
+ * function lives here. Adapter constructors are injected via `LLMClientAdapters`
7
+ * so core never static-imports vendor SDK code (D0051 invariant / T-05-01-01).
8
+ *
9
+ * @see docs/decisions/D0051-llm-client-port.md
10
+ * @see packages/eval/src/composition-root.ts — call site (updated to use this)
11
+ */
12
+ // ---------------------------------------------------------------------------
13
+ // Factory function
14
+ // ---------------------------------------------------------------------------
15
/**
 * Build an LLM client from the configured provider and the supplied keys.
 *
 * Provider resolution: `config.llmProvider` wins when it is not nullish.
 * Otherwise Anthropic is chosen if an Anthropic key is present, then OpenAI
 * if an OpenAI key is present. When no provider can be resolved, a debug
 * message is logged and `undefined` is returned. When the resolved provider
 * has no matching key, a warning is logged and `undefined` is returned.
 *
 * A resolved provider of `"anthropic"` uses `adapters.newAnthropicClient`;
 * any other resolved provider takes the OpenAI path.
 *
 * This function reads no environment variables; keys and adapter
 * constructors are passed in by the caller.
 *
 * @param config Object whose optional `llmProvider` selects the provider.
 * @param keys   Object with optional `anthropicApiKey` / `openaiApiKey`.
 * @param deps   `{ logger, adapters }`, where `adapters` supplies
 *               `newAnthropicClient` and `newOpenAIClient`.
 * @returns The adapter-created client, or `undefined` when none is wired.
 */
export function createLLMClient(config, keys, deps) {
    const { logger, adapters } = deps;
    const { llmProvider } = config;
    const { anthropicApiKey, openaiApiKey } = keys;

    // With no explicit provider, Anthropic takes precedence when both keys exist.
    let fallbackProvider;
    if (anthropicApiKey) {
        fallbackProvider = "anthropic";
    }
    else if (openaiApiKey) {
        fallbackProvider = "openai";
    }
    const provider = llmProvider ?? fallbackProvider;
    if (!provider) {
        logger.debug("LLM client: not wired — no Anthropic or OpenAI API key supplied");
        return undefined;
    }

    const useAnthropic = provider === "anthropic";
    const apiKey = useAnthropic ? anthropicApiKey : openaiApiKey;
    if (!apiKey) {
        logger.warn(useAnthropic
            ? 'llmProvider="anthropic" but no Anthropic API key supplied — LLMClient not wired'
            : 'llmProvider="openai" but no OpenAI API key supplied — LLMClient not wired');
        return undefined;
    }

    logger.debug(useAnthropic ? "LLM client: AnthropicLLMClient" : "LLM client: OpenAILLMClient");
    return useAnthropic
        ? adapters.newAnthropicClient({ apiKey, logger })
        : adapters.newOpenAIClient({ apiKey, logger });
}