@sanity/ailf 4.6.0 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +50 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +35 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +136 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +153 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +6 -0
- package/dist/_vendor/ailf-core/services/index.js +18 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
- package/dist/_vendor/ailf-core/types/confidence.js +7 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +271 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +19 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
- package/dist/_vendor/ailf-core/types/index.js +15 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/openai-llm-client.js +52 -3
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/cli-program.js +3 -0
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/interpret.d.ts +50 -0
- package/dist/commands/interpret.js +212 -0
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +57 -23
- package/dist/composition-root.js +155 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +9 -0
- package/dist/grader/agent-harness.js +9 -0
- package/dist/grader/common.d.ts +9 -0
- package/dist/grader/common.js +9 -0
- package/dist/grader/index.d.ts +24 -0
- package/dist/grader/index.js +24 -0
- package/dist/grader/knowledge-probe.d.ts +9 -0
- package/dist/grader/knowledge-probe.js +9 -0
- package/dist/grader/literacy.d.ts +9 -0
- package/dist/grader/literacy.js +9 -0
- package/dist/grader/mcp.d.ts +9 -0
- package/dist/grader/mcp.js +9 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +7 -5
|
@@ -15,7 +15,9 @@
|
|
|
15
15
|
* @see packages/core/src/ports/context.ts — AppContext interface
|
|
16
16
|
* @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
|
|
17
17
|
*/
|
|
18
|
-
import { type AppContext, type ArtifactWriter, type ArtifactWriterProgressOptions, type AssertionRegistration, type
|
|
18
|
+
import { type AppContext, type ArtifactWriter, type ArtifactWriterProgressOptions, type AssertionRegistration, type CardRegistry, type DiagnosisRunner, type Logger, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
|
|
19
|
+
export type { LLMClientKeys } from "./_vendor/ailf-core/index.d.ts";
|
|
20
|
+
import { type BorderlineConsensusOptions, type BorderlineConsensusResult } from "./pipeline/borderline-consensus-runner.js";
|
|
19
21
|
import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource } from "./adapters/task-sources/index.js";
|
|
20
22
|
/**
|
|
21
23
|
* Create a fully wired AppContext from resolved configuration.
|
|
@@ -24,28 +26,6 @@ import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource } from "./ad
|
|
|
24
26
|
* Swapping an adapter is a one-line change in this function.
|
|
25
27
|
*/
|
|
26
28
|
export declare function createAppContext(config: ResolvedConfig): AppContext;
|
|
27
|
-
/**
|
|
28
|
-
* Typed key bag passed to `createLLMClient`. The composition root reads
|
|
29
|
-
* env once and supplies values here; the factory stays pure so tests don't
|
|
30
|
-
* have to mutate `process.env`.
|
|
31
|
-
*/
|
|
32
|
-
export interface LLMClientKeys {
|
|
33
|
-
anthropicApiKey?: string;
|
|
34
|
-
openaiApiKey?: string;
|
|
35
|
-
}
|
|
36
|
-
/**
|
|
37
|
-
* Select the LLMClient adapter based on `config.llmProvider` and the
|
|
38
|
-
* supplied API keys. Returns `undefined` when no usable credential is
|
|
39
|
-
* present — `AppContext.llmClient` stays unset and consumers handle that
|
|
40
|
-
* explicitly.
|
|
41
|
-
*
|
|
42
|
-
* Adapters never read `process.env` themselves (per
|
|
43
|
-
* `.claude/rules/typescript.md`); env mapping happens at the call site
|
|
44
|
-
* (typically `createAppContext`).
|
|
45
|
-
*
|
|
46
|
-
* Exported for unit-test access; not part of the public package API.
|
|
47
|
-
*/
|
|
48
|
-
export declare function createLLMClient(config: ResolvedConfig, keys: LLMClientKeys, logger: Logger): LLMClient | undefined;
|
|
49
29
|
/**
|
|
50
30
|
* Selects the `ArtifactWriter` wiring per D0033 M4:
|
|
51
31
|
*
|
|
@@ -83,3 +63,57 @@ export declare function createTaskSource(config: ResolvedConfig): CompositeTaskS
|
|
|
83
63
|
* explicit mode whitelists.
|
|
84
64
|
*/
|
|
85
65
|
export declare const FRAMEWORK_ASSERTIONS: AssertionRegistration[];
|
|
66
|
+
/**
|
|
67
|
+
* Severity boundaries from `packages/eval/config/thresholds.ts`
|
|
68
|
+
* (severity.critical/warning/info `composite-below` at L50/54/58 — 30, 50,
|
|
69
|
+
* 60). The borderline detector flags a judgment when its score is within
|
|
70
|
+
* ±5 of any of these. Composition-root reads them ONCE and threads the
|
|
71
|
+
* typed `readonly number[]` into `runBorderlineConsensus` rather than
|
|
72
|
+
* re-deriving them at each call site (Pitfall 5 — single source of truth
|
|
73
|
+
* for the scale).
|
|
74
|
+
*/
|
|
75
|
+
export declare const BORDERLINE_SEVERITY_THRESHOLDS: readonly number[];
|
|
76
|
+
/**
|
|
77
|
+
* Default replications per borderline judgment when the caller's
|
|
78
|
+
* `RepoConfig.execution.borderlineReplications` is unset (locked answer
|
|
79
|
+
* #4 in plan 03-04). Three replications + the original score = four
|
|
80
|
+
* scores per consistency record, which is the minimum that produces a
|
|
81
|
+
* non-degenerate stdDev / median split.
|
|
82
|
+
*/
|
|
83
|
+
export declare const DEFAULT_BORDERLINE_REPLICATIONS = 3;
|
|
84
|
+
/**
|
|
85
|
+
* Factory for the borderline-consensus runner. Returns a function that
|
|
86
|
+
* applies the severity-threshold and replication defaults from
|
|
87
|
+
* composition-root, leaving the live grader entry point (the `regrade`
|
|
88
|
+
* callback) and the candidate `judgments` array as runtime inputs.
|
|
89
|
+
*
|
|
90
|
+
* The pipeline-side caller (currently `pipeline/calculate-scores.ts`'s
|
|
91
|
+
* post-extraction junction) supplies the `regrade` callback that maps a
|
|
92
|
+
* `GraderJudgment` to a fresh score via the response/rubric text from
|
|
93
|
+
* the original Promptfoo result. See the runner's header for the
|
|
94
|
+
* rationale on injecting the regrader rather than calling `gradeOnce`
|
|
95
|
+
* inline (Pitfall 6 — preserve the runner's purity wrt the existing
|
|
96
|
+
* grader-comparison split).
|
|
97
|
+
*/
|
|
98
|
+
export declare function createBorderlineConsensusRunner(opts: {
|
|
99
|
+
borderlineReplications?: number;
|
|
100
|
+
}): (args: Pick<BorderlineConsensusOptions, "judgments" | "logger" | "regrade">) => Promise<BorderlineConsensusResult>;
|
|
101
|
+
/**
|
|
102
|
+
* Returns the full 8-card `CardRegistry` backed by `DIAGNOSIS_CARD_GENERATORS`
|
|
103
|
+
* from `@sanity/ailf-core`. Exposed as a function (not a module-level const)
|
|
104
|
+
* so the composition root remains the single-seam factory and tests can assert
|
|
105
|
+
* the call site (AI-SPEC §3 Pitfall 1 — no module-scope mutables).
|
|
106
|
+
*/
|
|
107
|
+
export declare function buildDiagnosisRegistry(): CardRegistry;
|
|
108
|
+
/**
|
|
109
|
+
* Build a fully-wired `DiagnosisRunner` from an `AppContext`.
|
|
110
|
+
*
|
|
111
|
+
* Wires the full 8-card registry, `loadAttributions` bound to the local
|
|
112
|
+
* filesystem (Phase-4 per-entry attribution objects at
|
|
113
|
+
* `{artifactsDir}/runs/{runId}/attribution/*.json`), and no-op cache
|
|
114
|
+
* reader/writer (Plan-06 CLI command will wire the real cache seam).
|
|
115
|
+
*
|
|
116
|
+
* Plan-06 API/CLI consumers import this function from the composition root
|
|
117
|
+
* and pass `ctx` from `createAppContext(config)`.
|
|
118
|
+
*/
|
|
119
|
+
export declare function getDiagnosisRunner(ctx: AppContext): DiagnosisRunner;
|
package/dist/composition-root.js
CHANGED
|
@@ -15,7 +15,10 @@
|
|
|
15
15
|
* @see packages/core/src/ports/context.ts — AppContext interface
|
|
16
16
|
* @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
|
|
17
17
|
*/
|
|
18
|
-
import {
|
|
18
|
+
import { promises as fs } from "node:fs";
|
|
19
|
+
import path from "node:path";
|
|
20
|
+
import { ARTIFACT_EXPORT_PHASE_ID, DIAGNOSIS_CARD_GENERATORS, InMemoryPluginRegistry, NoOpArtifactWriter, NoOpProgressReporter, createDiagnosisRunner, createLLMClient, generateRunId, isArtifactType, modelId, } from "./_vendor/ailf-core/index.js";
|
|
21
|
+
import { JudgmentAttributionSchema } from "./adapters/attribution/per-entry-attribution-writer.js";
|
|
19
22
|
import { AccumulatingArtifactWriter } from "./artifact-capture/accumulating-artifact-writer.js";
|
|
20
23
|
import { ApiGatewayArtifactWriter } from "./artifact-capture/api-gateway-artifact-writer.js";
|
|
21
24
|
import { BatchingApiGatewayArtifactWriter } from "./artifact-capture/batching-api-gateway-artifact-writer.js";
|
|
@@ -27,6 +30,7 @@ import { resolveUploadConcurrency, setDefaultUploadConcurrency, } from "./artifa
|
|
|
27
30
|
import { UploadMetrics } from "./artifact-capture/upload-metrics.js";
|
|
28
31
|
import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
|
|
29
32
|
import { AnthropicLLMClient, OpenAILLMClient } from "./adapters/llm/index.js";
|
|
33
|
+
import { runBorderlineConsensus, } from "./pipeline/borderline-consensus-runner.js";
|
|
30
34
|
import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
|
|
31
35
|
import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
|
|
32
36
|
import { PromptfooEvalAdapter } from "./adapters/eval-runners/promptfoo-eval-adapter.js";
|
|
@@ -95,10 +99,19 @@ export function createAppContext(config) {
|
|
|
95
99
|
// LLM client (D0051) — wired when an API key is present. The grader path
|
|
96
100
|
// does NOT consume this; D0051 defers grader migration as a follow-up.
|
|
97
101
|
// Env mapping happens here so `createLLMClient` stays pure and testable.
|
|
98
|
-
|
|
102
|
+
// D-01: factory hoisted to @sanity/ailf-core; adapter ctors injected here.
|
|
103
|
+
const llmAdapters = {
|
|
104
|
+
newAnthropicClient: (opts) => new AnthropicLLMClient(opts),
|
|
105
|
+
newOpenAIClient: (opts) => new OpenAILLMClient(opts),
|
|
106
|
+
};
|
|
107
|
+
const llmKeys = {
|
|
99
108
|
anthropicApiKey: process.env.ANTHROPIC_API_KEY,
|
|
100
109
|
openaiApiKey: process.env.OPENAI_API_KEY,
|
|
101
|
-
}
|
|
110
|
+
};
|
|
111
|
+
const llmClient = createLLMClient(config, llmKeys, {
|
|
112
|
+
logger,
|
|
113
|
+
adapters: llmAdapters,
|
|
114
|
+
});
|
|
102
115
|
return {
|
|
103
116
|
artifactWriter,
|
|
104
117
|
cache,
|
|
@@ -116,44 +129,6 @@ export function createAppContext(config) {
|
|
|
116
129
|
taskSource,
|
|
117
130
|
};
|
|
118
131
|
}
|
|
119
|
-
/**
|
|
120
|
-
* Select the LLMClient adapter based on `config.llmProvider` and the
|
|
121
|
-
* supplied API keys. Returns `undefined` when no usable credential is
|
|
122
|
-
* present — `AppContext.llmClient` stays unset and consumers handle that
|
|
123
|
-
* explicitly.
|
|
124
|
-
*
|
|
125
|
-
* Adapters never read `process.env` themselves (per
|
|
126
|
-
* `.claude/rules/typescript.md`); env mapping happens at the call site
|
|
127
|
-
* (typically `createAppContext`).
|
|
128
|
-
*
|
|
129
|
-
* Exported for unit-test access; not part of the public package API.
|
|
130
|
-
*/
|
|
131
|
-
export function createLLMClient(config, keys, logger) {
|
|
132
|
-
const explicit = config.llmProvider;
|
|
133
|
-
const anthropicKey = keys.anthropicApiKey;
|
|
134
|
-
const openaiKey = keys.openaiApiKey;
|
|
135
|
-
// Auto-select: prefer Anthropic when both are present (matches the
|
|
136
|
-
// current grader's default model in `config/models.ts`).
|
|
137
|
-
const provider = explicit ?? (anthropicKey ? "anthropic" : openaiKey ? "openai" : undefined);
|
|
138
|
-
if (!provider) {
|
|
139
|
-
logger.debug("LLM client: not wired — no Anthropic or OpenAI API key supplied");
|
|
140
|
-
return undefined;
|
|
141
|
-
}
|
|
142
|
-
if (provider === "anthropic") {
|
|
143
|
-
if (!anthropicKey) {
|
|
144
|
-
logger.warn('llmProvider="anthropic" but no Anthropic API key supplied — LLMClient not wired');
|
|
145
|
-
return undefined;
|
|
146
|
-
}
|
|
147
|
-
logger.debug("LLM client: AnthropicLLMClient");
|
|
148
|
-
return new AnthropicLLMClient({ apiKey: anthropicKey, logger });
|
|
149
|
-
}
|
|
150
|
-
if (!openaiKey) {
|
|
151
|
-
logger.warn('llmProvider="openai" but no OpenAI API key supplied — LLMClient not wired');
|
|
152
|
-
return undefined;
|
|
153
|
-
}
|
|
154
|
-
logger.debug("LLM client: OpenAILLMClient");
|
|
155
|
-
return new OpenAILLMClient({ apiKey: openaiKey, logger });
|
|
156
|
-
}
|
|
157
132
|
// ---------------------------------------------------------------------------
|
|
158
133
|
// Sub-factories (extracted to keep createAppContext readable)
|
|
159
134
|
// ---------------------------------------------------------------------------
|
|
@@ -493,3 +468,142 @@ function createReportStore(config) {
|
|
|
493
468
|
undefined,
|
|
494
469
|
});
|
|
495
470
|
}
|
|
471
|
+
// ---------------------------------------------------------------------------
|
|
472
|
+
// Borderline-consensus wiring (Plan 03-04 / GRAD-04)
|
|
473
|
+
// ---------------------------------------------------------------------------
|
|
474
|
+
/**
|
|
475
|
+
* Severity boundaries from `packages/eval/config/thresholds.ts`
|
|
476
|
+
* (severity.critical/warning/info `composite-below` at L50/54/58 — 30, 50,
|
|
477
|
+
* 60). The borderline detector flags a judgment when its score is within
|
|
478
|
+
* ±5 of any of these. Composition-root reads them ONCE and threads the
|
|
479
|
+
* typed `readonly number[]` into `runBorderlineConsensus` rather than
|
|
480
|
+
* re-deriving them at each call site (Pitfall 5 — single source of truth
|
|
481
|
+
* for the scale).
|
|
482
|
+
*/
|
|
483
|
+
export const BORDERLINE_SEVERITY_THRESHOLDS = [
|
|
484
|
+
30, 50, 60,
|
|
485
|
+
];
|
|
486
|
+
/**
|
|
487
|
+
* Default replications per borderline judgment when the caller's
|
|
488
|
+
* `RepoConfig.execution.borderlineReplications` is unset (locked answer
|
|
489
|
+
* #4 in plan 03-04). Three replications + the original score = four
|
|
490
|
+
* scores per consistency record, which is the minimum that produces a
|
|
491
|
+
* non-degenerate stdDev / median split.
|
|
492
|
+
*/
|
|
493
|
+
export const DEFAULT_BORDERLINE_REPLICATIONS = 3;
|
|
494
|
+
/**
|
|
495
|
+
* Factory for the borderline-consensus runner. Returns a function that
|
|
496
|
+
* applies the severity-threshold and replication defaults from
|
|
497
|
+
* composition-root, leaving the live grader entry point (the `regrade`
|
|
498
|
+
* callback) and the candidate `judgments` array as runtime inputs.
|
|
499
|
+
*
|
|
500
|
+
* The pipeline-side caller (currently `pipeline/calculate-scores.ts`'s
|
|
501
|
+
* post-extraction junction) supplies the `regrade` callback that maps a
|
|
502
|
+
* `GraderJudgment` to a fresh score via the response/rubric text from
|
|
503
|
+
* the original Promptfoo result. See the runner's header for the
|
|
504
|
+
* rationale on injecting the regrader rather than calling `gradeOnce`
|
|
505
|
+
* inline (Pitfall 6 — preserve the runner's purity wrt the existing
|
|
506
|
+
* grader-comparison split).
|
|
507
|
+
*/
|
|
508
|
+
export function createBorderlineConsensusRunner(opts) {
|
|
509
|
+
const replications = opts.borderlineReplications ?? DEFAULT_BORDERLINE_REPLICATIONS;
|
|
510
|
+
return (args) => runBorderlineConsensus({
|
|
511
|
+
judgments: args.judgments,
|
|
512
|
+
...(args.logger ? { logger: args.logger } : {}),
|
|
513
|
+
regrade: args.regrade,
|
|
514
|
+
replications,
|
|
515
|
+
thresholds: BORDERLINE_SEVERITY_THRESHOLDS,
|
|
516
|
+
});
|
|
517
|
+
}
|
|
518
|
+
// ---------------------------------------------------------------------------
|
|
519
|
+
// Diagnosis runner wiring (Plan 05-05 / D0048)
|
|
520
|
+
// ---------------------------------------------------------------------------
|
|
521
|
+
/**
|
|
522
|
+
* Returns the full 8-card `CardRegistry` backed by `DIAGNOSIS_CARD_GENERATORS`
|
|
523
|
+
* from `@sanity/ailf-core`. Exposed as a function (not a module-level const)
|
|
524
|
+
* so the composition root remains the single-seam factory and tests can assert
|
|
525
|
+
* the call site (AI-SPEC §3 Pitfall 1 — no module-scope mutables).
|
|
526
|
+
*/
|
|
527
|
+
export function buildDiagnosisRegistry() {
|
|
528
|
+
return DIAGNOSIS_CARD_GENERATORS;
|
|
529
|
+
}
|
|
530
|
+
/**
|
|
531
|
+
* Default local artifacts root (mirrors `createArtifactWriter` default above).
|
|
532
|
+
*/
|
|
533
|
+
const DIAGNOSIS_LOCAL_ARTIFACTS_DIR = ".ailf/results/captures";
|
|
534
|
+
/**
|
|
535
|
+
* Load all per-entry attribution objects for a given `runId` from the local
|
|
536
|
+
* filesystem at `{artifactsDir}/runs/{runId}/attribution/*.json`.
|
|
537
|
+
*
|
|
538
|
+
* Reads every `.json` file in the run's `attribution/` directory, JSON-parses
|
|
539
|
+
* it, and Zod-validates each entry through `JudgmentAttributionSchema` (Phase-4
|
|
540
|
+
* canonical schema — D0045). Malformed entries are skipped with a warning.
|
|
541
|
+
*
|
|
542
|
+
* Returns an empty array when the directory does not exist (expected on runs
|
|
543
|
+
* without Phase-4 attribution data — the runner treats this as Landmine-11
|
|
544
|
+
* "no data" and both attribution cards return `status: "missing"`).
|
|
545
|
+
*/
|
|
546
|
+
async function loadAttributionsFromLocalFs(runId, artifactsDir, logger) {
|
|
547
|
+
const attrDir = path.join(artifactsDir, "runs", runId, "attribution");
|
|
548
|
+
let entries;
|
|
549
|
+
try {
|
|
550
|
+
entries = (await fs.readdir(attrDir)).filter((f) => f.endsWith(".json"));
|
|
551
|
+
}
|
|
552
|
+
catch {
|
|
553
|
+
// Directory missing — no attribution data for this run.
|
|
554
|
+
return [];
|
|
555
|
+
}
|
|
556
|
+
const results = [];
|
|
557
|
+
for (const filename of entries) {
|
|
558
|
+
const filePath = path.join(attrDir, filename);
|
|
559
|
+
let raw;
|
|
560
|
+
try {
|
|
561
|
+
const bytes = await fs.readFile(filePath, "utf8");
|
|
562
|
+
raw = JSON.parse(bytes);
|
|
563
|
+
}
|
|
564
|
+
catch (err) {
|
|
565
|
+
logger?.warn("loadAttributions: failed to read/parse attribution file", {
|
|
566
|
+
file: filePath,
|
|
567
|
+
error: err instanceof Error ? err.message : String(err),
|
|
568
|
+
});
|
|
569
|
+
continue;
|
|
570
|
+
}
|
|
571
|
+
const parsed = JudgmentAttributionSchema.safeParse(raw);
|
|
572
|
+
if (!parsed.success) {
|
|
573
|
+
logger?.warn("loadAttributions: attribution file failed schema validation", {
|
|
574
|
+
file: filePath,
|
|
575
|
+
errors: parsed.error.flatten(),
|
|
576
|
+
});
|
|
577
|
+
continue;
|
|
578
|
+
}
|
|
579
|
+
results.push(parsed.data);
|
|
580
|
+
}
|
|
581
|
+
return results;
|
|
582
|
+
}
|
|
583
|
+
/**
|
|
584
|
+
* Build a fully-wired `DiagnosisRunner` from an `AppContext`.
|
|
585
|
+
*
|
|
586
|
+
* Wires the full 8-card registry, `loadAttributions` bound to the local
|
|
587
|
+
* filesystem (Phase-4 per-entry attribution objects at
|
|
588
|
+
* `{artifactsDir}/runs/{runId}/attribution/*.json`), and no-op cache
|
|
589
|
+
* reader/writer (Plan-06 CLI command will wire the real cache seam).
|
|
590
|
+
*
|
|
591
|
+
* Plan-06 API/CLI consumers import this function from the composition root
|
|
592
|
+
* and pass `ctx` from `createAppContext(config)`.
|
|
593
|
+
*/
|
|
594
|
+
export function getDiagnosisRunner(ctx) {
|
|
595
|
+
const artifactsDir = ctx.config.artifactsDir ?? DIAGNOSIS_LOCAL_ARTIFACTS_DIR;
|
|
596
|
+
// No-op cache shims — Plan 06 wires the real cache.
|
|
597
|
+
const diagnosisReader = async (_path) => null;
|
|
598
|
+
const diagnosisWriter = async (_path, _diagnosis) => { };
|
|
599
|
+
return createDiagnosisRunner({
|
|
600
|
+
llm: ctx.llmClient,
|
|
601
|
+
model: modelId("anthropic:claude-opus-4-6"),
|
|
602
|
+
logger: ctx.logger,
|
|
603
|
+
progress: ctx.progress,
|
|
604
|
+
registry: buildDiagnosisRegistry(),
|
|
605
|
+
diagnosisReader,
|
|
606
|
+
diagnosisWriter,
|
|
607
|
+
loadAttributions: (runId) => loadAttributionsFromLocalFs(runId, artifactsDir, ctx.logger),
|
|
608
|
+
});
|
|
609
|
+
}
|
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* diagnosis-cards.ts — Diagnosis eval matrix config.
|
|
3
|
+
*
|
|
4
|
+
* TS-first config (per .claude/rules/config.md) defining the 5 LLM card types
|
|
5
|
+
* × 3 first-class models eval matrix. Consumed by
|
|
6
|
+
* `scripts/generate-diagnosis-config.ts` to emit
|
|
7
|
+
* `promptfooconfig-diagnosis.yaml`. Never hand-edit the YAML — run
|
|
8
|
+
* `pnpm generate-configs` instead.
|
|
9
|
+
*
|
|
10
|
+
* Per AI-SPEC §5 and CONTEXT D-04 (path b: standalone generator entry point
|
|
11
|
+
* for the diagnosis config, additive — does not modify the existing literacy
|
|
12
|
+
* generate-configs pipeline).
|
|
13
|
+
*
|
|
14
|
+
* @see packages/eval/scripts/generate-diagnosis-config.ts — generator
|
|
15
|
+
* @see packages/eval/promptfooconfig-diagnosis.yaml — generated output
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
// Types
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
/**
 * One model participating in the diagnosis eval matrix.
 * The entry shape mirrors the model entries in `config/models.ts`.
 */
export interface DiagnosisModelEntry {
  /** Provider string handed to promptfoo, e.g. "anthropic:messages:claude-opus-4-6". */
  id: string
  /** Display name used in reports. */
  label: string
  /** Optional model-specific overrides such as temperature or max_tokens. */
  config?: Record<string, unknown>
}
|
|
34
|
+
|
|
35
|
+
/**
 * Identifiers of the five card types whose content is produced by an LLM
 * and is therefore evaluated through the promptfoo matrix.
 *
 * The deterministic cards (area-summary, failure-mode-summary, no-issues)
 * are not listed here; they are exercised by `fixture-matrix.test.ts`
 * under vitest instead.
 */
export type LLMCardType =
  | "top-recommendations"
  | "weakest-area"
  | "low-confidence-attribution"
  | "doc-attribution-spotlight"
  | "regression-vs-baseline"
|
|
46
|
+
|
|
47
|
+
/** Card statuses a scenario can expect. */
type DiagnosisExpectedStatus = "ready" | "degraded" | "missing"

/**
 * One evaluation scenario: a single report fixture paired with the outcome
 * it is expected to produce.
 *
 * `fixturePath` is written relative to `packages/eval/` so the promptfoo
 * config can resolve it regardless of the working directory.
 * `expectedStatus` is what the pass/fail assertion in the generated YAML
 * is built from.
 */
export interface DiagnosisScenario {
  /** Short slug that appears in promptfoo `description` fields. */
  name: string
  /** Location of the Report JSON fixture, relative to `packages/eval/`. */
  fixturePath: string
  /**
   * The card type this scenario is mainly about. Every LLM card is run for
   * every scenario; this field only marks which one the rubric focuses on.
   */
  primaryCard: LLMCardType
  /** Card status expected when all LLM calls succeed. */
  expectedStatus: DiagnosisExpectedStatus
  /** Optional path to a canned LLM response, used by adversarial scenarios. */
  cannedResponsePath?: string
  /**
   * Optional cardId the canned response is keyed against (FakeLLMClient
   * keyedResponses in vitest). It is also mirrored into the promptfoo
   * scenario description for documentation.
   */
  cannedCardId?: LLMCardType
  /** Free-text description of what the scenario is meant to test. */
  note?: string
}
|
|
78
|
+
|
|
79
|
+
/** Baseline per-model settings carried by the matrix config. */
interface DiagnosisModelDefaults {
  temperature: number
  max_tokens: number
}

/**
 * Shape of the whole diagnosis eval matrix.
 * A value of this type is the file's default export, following the
 * convention used by models.ts.
 */
export interface DiagnosisCardsConfig {
  /** Every LLM card evaluation scenario. */
  scenarios: DiagnosisScenario[]
  /** The models each scenario is run against. */
  models: DiagnosisModelEntry[]
  /** Model used as the judge for LLM-judge assertions. */
  grader: DiagnosisModelEntry
  /** Eval budget in milliseconds; acts as a kill switch. */
  evalBudgetMs: number
  /** Upper bound on parallel API calls. */
  maxConcurrency: number
  /** Default per-model config. */
  defaults: DiagnosisModelDefaults
}
|
|
100
|
+
|
|
101
|
+
// ---------------------------------------------------------------------------
|
|
102
|
+
// Helper
|
|
103
|
+
// ---------------------------------------------------------------------------
|
|
104
|
+
|
|
105
|
+
/**
 * Identity helper for authoring the diagnosis matrix.
 *
 * @param config - The matrix definition.
 * @returns The very same `config` object, unchanged.
 */
export function defineDiagnosisCards(
  config: DiagnosisCardsConfig
): DiagnosisCardsConfig {
  // Nothing is validated or copied at runtime; the parameter type is the
  // only check applied to the literal passed in.
  return config
}
|
|
110
|
+
|
|
111
|
+
// ---------------------------------------------------------------------------
|
|
112
|
+
// Config definition
|
|
113
|
+
// ---------------------------------------------------------------------------
|
|
114
|
+
|
|
115
|
+
// ── Models under evaluation ──────────────────────────────────────────────────
const evaluatedModels: DiagnosisModelEntry[] = [
  {
    id: "anthropic:messages:claude-opus-4-6",
    label: "Claude Opus 4.6",
    config: { temperature: 0.2, max_tokens: 4096 },
  },
  {
    id: "anthropic:messages:claude-sonnet-4-6",
    label: "Claude Sonnet 4.6",
    config: { temperature: 0.2, max_tokens: 4096 },
  },
  {
    id: "openai:chat:gpt-5.2",
    label: "GPT 5.2",
    config: { max_completion_tokens: 4096 },
  },
]

// ── Grader model ─────────────────────────────────────────────────────────────
const graderModel: DiagnosisModelEntry = {
  id: "anthropic:messages:claude-opus-4-5-20251101",
  label: "Claude Opus 4.5 (grader)",
}

// ── Critical-path scenarios ──────────────────────────────────────────────────
// One healthy and one low-scoring report for each of four card types.
const criticalPathScenarios: DiagnosisScenario[] = [
  // top-recommendations
  {
    name: "healthy-top-recommendations",
    fixturePath:
      "test-fixtures/diagnosis/reports/healthy-top-recommendations.json",
    primaryCard: "top-recommendations",
    expectedStatus: "ready",
    note: "Healthy report (mean 91) — top-recommendations card should produce 2+ actionable suggestions with docSlug references from the manifest.",
  },
  {
    name: "low-top-recommendations",
    fixturePath:
      "test-fixtures/diagnosis/reports/low-top-recommendations.json",
    primaryCard: "top-recommendations",
    expectedStatus: "ready",
    note: "Low-scoring report (mean 42) — top-recommendations card should produce high-priority suggestions addressing the dominant failure modes (outdated-docs, missing-docs).",
  },

  // weakest-area
  {
    name: "healthy-weakest-area",
    fixturePath: "test-fixtures/diagnosis/reports/healthy-weakest-area.json",
    primaryCard: "weakest-area",
    expectedStatus: "ready",
    note: "Healthy report with clear weakest area (content-modeling at 82) — weakest-area card should identify the area and provide high-confidence analysis.",
  },
  {
    name: "low-weakest-area",
    fixturePath: "test-fixtures/diagnosis/reports/low-weakest-area.json",
    primaryCard: "weakest-area",
    expectedStatus: "ready",
    note: "Low-scoring report with clear weakest area (content-modeling at 28) — weakest-area card should identify the most critical area with multiple failure modes.",
  },

  // low-confidence-attribution
  {
    name: "healthy-low-confidence-attribution",
    fixturePath:
      "test-fixtures/diagnosis/reports/healthy-low-confidence-attribution.json",
    primaryCard: "low-confidence-attribution",
    expectedStatus: "ready",
    note: "Healthy report with small sample sizes (2-3 judgments per area) — low-confidence-attribution card should identify attribution uncertainty despite positive scores.",
  },
  {
    name: "low-low-confidence-attribution",
    fixturePath:
      "test-fixtures/diagnosis/reports/low-low-confidence-attribution.json",
    primaryCard: "low-confidence-attribution",
    expectedStatus: "ready",
    note: "Low-scoring report with small sample sizes (2 judgments per area) — low-confidence-attribution card should flag both score quality and attribution uncertainty.",
  },

  // doc-attribution-spotlight
  {
    name: "healthy-doc-attribution-spotlight",
    fixturePath:
      "test-fixtures/diagnosis/reports/healthy-doc-attribution-spotlight.json",
    primaryCard: "doc-attribution-spotlight",
    expectedStatus: "ready",
    note: "Healthy 5-area report — doc-attribution-spotlight card should identify the highest-impact document in the manifest.",
  },
  {
    name: "low-doc-attribution-spotlight",
    fixturePath:
      "test-fixtures/diagnosis/reports/low-doc-attribution-spotlight.json",
    primaryCard: "doc-attribution-spotlight",
    expectedStatus: "ready",
    note: "Low-scoring 5-area report with multiple failure modes — doc-attribution-spotlight card should identify the most critical document.",
  },
]

// ── Edge-case scenarios ──────────────────────────────────────────────────────
const edgeCaseScenarios: DiagnosisScenario[] = [
  {
    name: "empty-report",
    fixturePath: "test-fixtures/diagnosis/reports/empty.json",
    primaryCard: "top-recommendations",
    expectedStatus: "missing",
    note: "Edge case (a): zero-area report — all LLM cards should emit status: missing (no data to reason about).",
  },
  {
    name: "single-judgment-per-area",
    fixturePath:
      "test-fixtures/diagnosis/reports/single-judgment-per-area.json",
    primaryCard: "weakest-area",
    expectedStatus: "ready",
    note: "Edge case (b): single-judgment sample size — weakest-area card should reflect low-confidence calibration (sampleSize: 1).",
  },
  {
    name: "all-areas-tied",
    fixturePath: "test-fixtures/diagnosis/reports/all-areas-tied.json",
    primaryCard: "weakest-area",
    expectedStatus: "missing",
    note: "Edge case (c): all areas scored identically (70) — weakest-area card should emit status: missing with reason: no-clear-weakest.",
  },
  // The next two entries form a pair (baseline + current).
  {
    name: "grader-major-mismatch-baseline",
    fixturePath:
      "test-fixtures/diagnosis/reports/grader-major-mismatch-baseline.json",
    primaryCard: "regression-vs-baseline",
    expectedStatus: "missing",
    note: "Edge case (d): grader-major-version mismatch — regression-vs-baseline should emit missing with reason: grader-major-version-mismatch. Run as pair with grader-major-mismatch-current.",
  },
  {
    name: "grader-major-mismatch-current",
    fixturePath:
      "test-fixtures/diagnosis/reports/grader-major-mismatch-current.json",
    primaryCard: "regression-vs-baseline",
    expectedStatus: "missing",
    note: "Edge case (d) pair: current report with different graderModel — regression-vs-baseline mismatch guard triggers when paired with grader-major-mismatch-baseline.",
  },
  {
    name: "near-deprecated-taxonomy",
    fixturePath:
      "test-fixtures/diagnosis/reports/near-deprecated-taxonomy.json",
    primaryCard: "weakest-area",
    expectedStatus: "ready",
    note: "Edge case (e): report using unclassified failure mode (currently canonical but watch for taxonomy retirement). Zod refine() must accept canonical modes.",
  },
]

// ── Adversarial scenarios ────────────────────────────────────────────────────
// Each one reuses an existing report fixture and supplies a canned LLM response.
const adversarialScenarios: DiagnosisScenario[] = [
  {
    name: "adversarial-fabricated-delta",
    fixturePath:
      "test-fixtures/diagnosis/reports/grader-major-mismatch-current.json",
    primaryCard: "regression-vs-baseline",
    expectedStatus: "degraded",
    cannedResponsePath:
      "test-fixtures/diagnosis/canned-responses/fabricated-delta-regression.json",
    cannedCardId: "regression-vs-baseline",
    note: "Adversarial: fabricated delta (AI-SPEC §1b failure-mode #1). LLM claims -7.3 delta; direction-sign refine triggers degraded card.",
  },
  {
    name: "adversarial-improve-introduction",
    fixturePath:
      "test-fixtures/diagnosis/reports/low-top-recommendations.json",
    primaryCard: "top-recommendations",
    expectedStatus: "degraded",
    cannedResponsePath:
      "test-fixtures/diagnosis/canned-responses/improve-introduction.json",
    cannedCardId: "top-recommendations",
    note: "Adversarial: generic anti-pattern recommendation (AI-SPEC §1b failure-mode #2). Actionability refine triggers degraded card.",
  },
  {
    name: "adversarial-hallucinated-docslug",
    fixturePath:
      "test-fixtures/diagnosis/reports/low-top-recommendations.json",
    primaryCard: "top-recommendations",
    expectedStatus: "degraded",
    cannedResponsePath:
      "test-fixtures/diagnosis/canned-responses/hallucinated-docslug.json",
    cannedCardId: "top-recommendations",
    note: "Adversarial: hallucinated docSlug (AI-SPEC §1b failure-mode #3). Allow-list refine triggers degraded card.",
  },
  {
    name: "adversarial-taxonomy-drift",
    fixturePath: "test-fixtures/diagnosis/reports/low-weakest-area.json",
    primaryCard: "weakest-area",
    expectedStatus: "degraded",
    cannedResponsePath:
      "test-fixtures/diagnosis/canned-responses/taxonomy-drift.json",
    cannedCardId: "weakest-area",
    note: "Adversarial: taxonomy drift (AI-SPEC §1b failure-mode #4). Per-dimension failureMode refine triggers degraded card.",
  },
]

// ── Assembled config ─────────────────────────────────────────────────────────
const diagnosisCardsConfig: DiagnosisCardsConfig = defineDiagnosisCards({
  models: evaluatedModels,
  grader: graderModel,

  // 60 min budget for the full matrix (3 models × every scenario below).
  evalBudgetMs: 3_600_000,
  // Conservative for diagnosis (longer prompts than literacy).
  maxConcurrency: 8,

  defaults: {
    temperature: 0.2,
    max_tokens: 4096,
  },

  // 18 entries in total: 8 critical-path, 6 edge-case, 4 adversarial.
  // Order is critical-path, then edge cases, then adversarial.
  scenarios: [
    ...criticalPathScenarios,
    ...edgeCaseScenarios,
    ...adversarialScenarios,
  ],
})
|
|
317
|
+
|
|
318
|
+
export default diagnosisCardsConfig
|
package/dist/config/models.ts
CHANGED
|
@@ -24,6 +24,18 @@ export default defineModels({
|
|
|
24
24
|
// All literacy variants included by default (baseline, observed,
|
|
25
25
|
// agentic-naive, agentic-optimized)
|
|
26
26
|
},
|
|
27
|
+
{
|
|
28
|
+
// Phase 5 LLM card routing (D-07). AI-SPEC §4 routes 3 routine cards
|
|
29
|
+
// (top-recommendations, weakest-area, regression-vs-baseline) here.
|
|
30
|
+
// Pricing already in AnthropicLLMClient; baseline literacy variant only.
|
|
31
|
+
id: "anthropic:messages:claude-sonnet-4-6",
|
|
32
|
+
label: "Claude Sonnet 4.6",
|
|
33
|
+
config: { temperature: 0.2, max_tokens: 4096 },
|
|
34
|
+
modes: ["literacy"],
|
|
35
|
+
variants: {
|
|
36
|
+
literacy: ["baseline"],
|
|
37
|
+
},
|
|
38
|
+
},
|
|
27
39
|
|
|
28
40
|
// ── Google ─────────────────────────────────────────────────
|
|
29
41
|
// {
|