@sanity/ailf 5.0.0 → 6.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/airbyte/ai_literacy_framework.connector.yaml +276 -0
- package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql +42 -0
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +12 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +230 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +108 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +140 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +65 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +93 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +130 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +111 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +118 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +286 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +74 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +119 -2
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +136 -2
- package/dist/_vendor/ailf-core/services/index.d.ts +5 -1
- package/dist/_vendor/ailf-core/services/index.js +15 -2
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +115 -10
- package/dist/_vendor/ailf-core/types/diagnosis.js +3 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +16 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts +101 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.js +18 -0
- package/dist/adapters/config-sources/file-config-adapter.js +8 -6
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/index.d.ts +1 -1
- package/dist/adapters/llm/index.js +1 -1
- package/dist/adapters/llm/openai-llm-client.js +59 -5
- package/dist/adapters/llm/retry.d.ts +18 -0
- package/dist/adapters/llm/retry.js +21 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts +49 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.js +55 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +10 -5
- package/dist/adapters/task-sources/repo-schemas.d.ts +7 -0
- package/dist/adapters/task-sources/repo-schemas.js +10 -0
- package/dist/cli-program.js +3 -0
- package/dist/commands/interpret.d.ts +70 -0
- package/dist/commands/interpret.js +221 -0
- package/dist/commands/pipeline-action.d.ts +44 -0
- package/dist/commands/pipeline-action.js +193 -1
- package/dist/commands/run.d.ts +2 -0
- package/dist/commands/run.js +2 -0
- package/dist/composition-root.d.ts +21 -23
- package/dist/composition-root.js +107 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/grader/agent-harness.d.ts +5 -10
- package/dist/grader/agent-harness.js +5 -13
- package/dist/grader/common.d.ts +5 -13
- package/dist/grader/common.js +5 -17
- package/dist/grader/index.d.ts +15 -29
- package/dist/grader/index.js +15 -66
- package/dist/grader/knowledge-probe.d.ts +5 -10
- package/dist/grader/knowledge-probe.js +5 -14
- package/dist/grader/literacy.d.ts +5 -9
- package/dist/grader/literacy.js +5 -13
- package/dist/grader/mcp.d.ts +5 -10
- package/dist/grader/mcp.js +5 -14
- package/dist/orchestration/pipeline-orchestrator.js +3 -0
- package/dist/report-store.d.ts +26 -0
- package/dist/report-store.js +63 -0
- package/package.json +2 -2
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* failure-mode-summary card — deterministic projection over Report.summary.failureModes.
|
|
3
|
+
*
|
|
4
|
+
* Pure computation, no LLM call. Identifies the dominant failure mode
|
|
5
|
+
* across all dimensions in the report's slim failure-mode summary.
|
|
6
|
+
*
|
|
7
|
+
* D-05: `.refine(buildFailureModeRefinement())` rejects cross-dimension
|
|
8
|
+
* (dimension, failureMode) pairs that the schema otherwise would accept —
|
|
9
|
+
* turning a "Zod-passes, semantically wrong" LLM output into a
|
|
10
|
+
* `parseFailed: true` degraded card. For this deterministic card, the
|
|
11
|
+
* refinement also defends against bad Report data.
|
|
12
|
+
*
|
|
13
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
14
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
15
|
+
*
|
|
16
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-05)
|
|
17
|
+
* @see packages/core/src/services/diagnosis/card-validators.ts
|
|
18
|
+
*/
|
|
19
|
+
import { z } from "zod";
|
|
20
|
+
import { CANONICAL_DIMENSIONS, failureModesForDimension, } from "../../../grader/failure-modes/index.js";
|
|
21
|
+
import { buildFailureModeRefinement } from "../card-validators.js";
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
// Body schema (D0045 trust boundary — satisfies required; D-05 refine)
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
// Field validators shared by the failure-mode-summary body below.
const failureModeSummaryText = z.string().min(1);
const failureModeSummaryTally = z.number().int().nonnegative();
// Plain object shape, before the cross-field taxonomy check is attached.
const failureModeSummaryFields = z.object({
    // Human-readable sentence, 1–800 characters.
    summary: z.string().min(1).max(800),
    // Dimension name and failure-mode name; both must be non-empty.
    dimension: failureModeSummaryText,
    failureMode: failureModeSummaryText,
    // Non-negative integer tallies.
    count: failureModeSummaryTally,
    sampleSize: failureModeSummaryTally,
});
/**
 * Body schema for the failure-mode-summary card.
 *
 * On top of the per-field checks, the predicate returned by
 * `buildFailureModeRefinement()` is applied to the whole object; when it
 * fails, the issue is reported on the `failureMode` path with the message
 * below.
 */
export const FailureModeSummaryBodySchema = failureModeSummaryFields.refine(buildFailureModeRefinement(), {
    message: "failureMode is not in the canonical taxonomy for this dimension",
    path: ["failureMode"],
});
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
// Private helper — find the dimension a failure mode belongs to
|
|
39
|
+
// ---------------------------------------------------------------------------
|
|
40
|
+
/**
 * Preferred dimension lookup order, keyed by eval-mode family.
 *
 * A failure mode (e.g. `missing-docs`) can appear in the taxonomy of more
 * than one dimension family. When the report's eval mode is known, the
 * dimensions listed for that family are tried first, in the order given
 * here; the linear scan over all canonical dimensions is only the fallback
 * for when none of the preferred dimensions carries the failure mode.
 */
const MODE_TO_PREFERRED_DIMENSIONS = {
    // Literacy family.
    literacy: ["task-completion", "code-correctness", "doc-coverage"],
    // MCP-server family.
    "mcp-server": ["mcp-behavior", "input-validation", "output-correctness", "error-handling", "security"],
    // Knowledge-probe family.
    "knowledge-probe": ["knowledge-probe", "factual-correctness", "completeness", "currency"],
    // Agent-harness family.
    "agent-harness": ["agent-harness", "process-quality", "agent-output", "tool-usage"],
    // Custom evals have no preferred dimensions; resolution always falls back.
    custom: [],
};
|
|
71
|
+
/**
 * Find a canonical dimension whose failure-mode taxonomy includes `mode`.
 *
 * When `preferredEvalMode` names a family in `MODE_TO_PREFERRED_DIMENSIONS`,
 * that family's dimensions are tried first, in listed order — e.g. a mode
 * appearing in both literacy and MCP resolves to MCP on an MCP-only run.
 * If the eval mode is absent, unknown, or none of its preferred dimensions
 * carries the mode, `CANONICAL_DIMENSIONS` is scanned in order so
 * cross-cutting modes still resolve.
 *
 * @param {string} mode - Failure-mode name to look up.
 * @param {string} [preferredEvalMode] - Eval mode of the report, if known.
 * @returns {string | undefined} The first matching dimension, or `undefined`
 *   if the mode is not in any dimension's taxonomy.
 */
function findDimensionForMode(mode, preferredEvalMode) {
    const carriesMode = (dim) => failureModesForDimension(dim).includes(mode);
    // Own-property check: the eval mode comes from report data, and a plain
    // object lookup with a key such as "constructor" or "__proto__" would
    // return an inherited, non-iterable value that slips past a `?? []`
    // fallback and makes the `for...of` below throw.
    const knownFamily = Object.prototype.hasOwnProperty.call(MODE_TO_PREFERRED_DIMENSIONS, preferredEvalMode);
    const preferred = preferredEvalMode && knownFamily
        ? MODE_TO_PREFERRED_DIMENSIONS[preferredEvalMode]
        : [];
    for (const dim of preferred) {
        if (carriesMode(dim)) {
            return dim;
        }
    }
    // Fallback: linear scan of every canonical dimension.
    for (const dim of CANONICAL_DIMENSIONS) {
        if (carriesMode(dim)) {
            return dim;
        }
    }
    return undefined;
}
|
|
96
|
+
// ---------------------------------------------------------------------------
|
|
97
|
+
// Generator
|
|
98
|
+
// ---------------------------------------------------------------------------
|
|
99
|
+
/**
 * Build the failure-mode-summary card from `report.summary.failureModes`.
 *
 * No LLM is involved. The entry of `topTitles` with the highest `count`
 * (earliest entry on ties) is reported as the dominant failure mode, and its
 * `category` is resolved to a dimension via `findDimensionForMode`, passing
 * `report.provenance?.mode` as the preferred eval mode.
 *
 * @param report - Report to summarise; `summary.failureModes` and
 *   `provenance?.mode` are read.
 * @returns A `status: "missing"` card when there is no failure-mode data
 *   (no summary, no/empty `topTitles`, or `totalJudgments === 0`) or when the
 *   dominant mode resolves to no dimension; otherwise a `status: "ready"`
 *   card whose body has been parsed by `FailureModeSummaryBodySchema`.
 */
export const generateFailureModeSummary = async (report) => {
    const cardType = "failure-mode-summary";
    const missing = (reason) => ({ status: "missing", cardType, reason });
    const failureModes = report.summary.failureModes;
    const titles = failureModes ? failureModes.topTitles : undefined;
    if (!titles || titles.length === 0 || failureModes.totalJudgments === 0) {
        return missing("report has no failure modes");
    }
    // Scan for the highest count instead of trusting the list order; the
    // strict `>` keeps the earliest entry when counts tie.
    const keepHigher = (leader, candidate) => candidate.count > leader.count ? candidate : leader;
    const dominant = titles.reduce(keepHigher, titles[0]);
    const { category: failureMode, count } = dominant;
    const dimension = findDimensionForMode(failureMode, report.provenance?.mode);
    if (!dimension) {
        return missing(`failure mode "${failureMode}" is not in the canonical taxonomy`);
    }
    const sampleSize = failureModes.totalJudgments;
    // `parse` throws if the assembled body violates the schema.
    const body = FailureModeSummaryBodySchema.parse({
        summary: `The most frequent failure mode is "${failureMode}" (${count} of ${sampleSize} judgments in dimension "${dimension}").`,
        dimension,
        failureMode,
        count,
        sampleSize,
    });
    const generatedAt = new Date().toISOString();
    return {
        status: "ready",
        cardType,
        body,
        meta: { cardVersion: "failure-mode-summary@0.1.0", generatedAt },
    };
};
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Card-generator barrel — exports all 8 generators + DIAGNOSIS_CARD_GENERATORS.
|
|
3
|
+
*
|
|
4
|
+
* This barrel lives in @sanity/ailf-core so the Plan-06 API route can import
|
|
5
|
+
* `DIAGNOSIS_CARD_GENERATORS` via `import { DIAGNOSIS_CARD_GENERATORS } from
|
|
6
|
+
* "@sanity/ailf-core"` without depending on `@sanity/ailf` (D-01 boundary).
|
|
7
|
+
*
|
|
8
|
+
* This is a CARD-GENERATOR barrel only — no vendor SDK classes, no adapter
|
|
9
|
+
* implementations. Cards speak the `LLMClient` port exclusively (D0051).
|
|
10
|
+
*
|
|
11
|
+
* TypeScript exhaustiveness: `CardType` is an 8-element literal union; the
|
|
12
|
+
* `Record<CardType, CardGenerator>` annotation causes a build error if any
|
|
13
|
+
* key is missing or extra.
|
|
14
|
+
*
|
|
15
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-01)
|
|
16
|
+
* @see docs/decisions/D0051-llm-client-port.md
|
|
17
|
+
*/
|
|
18
|
+
import { generateAreaSummary } from "./area-summary.js";
|
|
19
|
+
import { generateFailureModeSummary } from "./failure-mode-summary.js";
|
|
20
|
+
import { generateNoIssues } from "./no-issues.js";
|
|
21
|
+
import { generateTopRecommendations } from "./top-recommendations.js";
|
|
22
|
+
import { generateWeakestArea } from "./weakest-area.js";
|
|
23
|
+
import { generateLowConfidenceAttribution } from "./low-confidence-attribution.js";
|
|
24
|
+
import { generateDocAttributionSpotlight } from "./doc-attribution-spotlight.js";
|
|
25
|
+
import { generateRegressionVsBaseline } from "./regression-vs-baseline.js";
|
|
26
|
+
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
27
|
+
import type { CardType } from "../../../types/diagnosis.js";
|
|
28
|
+
/**
|
|
29
|
+
* Canonical version of the card-registry surface — bumped whenever any card
|
|
30
|
+
* generator or body schema in this barrel changes. Used as the `cardVersion`
|
|
31
|
+
* fallback in version-resolver helpers (CLI `interpret`, API
|
|
32
|
+
* `versionsFromRecord`) so the four-version cache invalidation envelope stays
|
|
33
|
+
* in sync with the actual registry.
|
|
34
|
+
*
|
|
35
|
+
* Mirrors the pattern of `diagnosisVersion` exported from `diagnosis-runner.ts`.
|
|
36
|
+
*/
|
|
37
|
+
export declare const CARD_REGISTRY_VERSION = "0.1.0";
|
|
38
|
+
/**
|
|
39
|
+
* The canonical card-generator registry for the diagnosis engine.
|
|
40
|
+
*
|
|
41
|
+
* `Readonly<Record<CardType, CardGenerator>>` — TypeScript exhaustiveness
|
|
42
|
+
* ensures all 8 `CardType` strings appear (no rogue keys, no missing keys).
|
|
43
|
+
* The composition root (`packages/eval/src/composition-root.ts`) passes this
|
|
44
|
+
* directly into `createDiagnosisRunner(deps)`.
|
|
45
|
+
*
|
|
46
|
+
* Also consumed by the Plan-06 API route, which imports via `@sanity/ailf-core`.
|
|
47
|
+
*/
|
|
48
|
+
export declare const DIAGNOSIS_CARD_GENERATORS: Readonly<Record<CardType, CardGenerator>>;
|
|
49
|
+
export { generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, };
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Card-generator barrel — exports all 8 generators + DIAGNOSIS_CARD_GENERATORS.
|
|
3
|
+
*
|
|
4
|
+
* This barrel lives in @sanity/ailf-core so the Plan-06 API route can import
|
|
5
|
+
* `DIAGNOSIS_CARD_GENERATORS` via `import { DIAGNOSIS_CARD_GENERATORS } from
|
|
6
|
+
* "@sanity/ailf-core"` without depending on `@sanity/ailf` (D-01 boundary).
|
|
7
|
+
*
|
|
8
|
+
* This is a CARD-GENERATOR barrel only — no vendor SDK classes, no adapter
|
|
9
|
+
* implementations. Cards speak the `LLMClient` port exclusively (D0051).
|
|
10
|
+
*
|
|
11
|
+
* TypeScript exhaustiveness: `CardType` is an 8-element literal union; the
|
|
12
|
+
* `Record<CardType, CardGenerator>` annotation causes a build error if any
|
|
13
|
+
* key is missing or extra.
|
|
14
|
+
*
|
|
15
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-01)
|
|
16
|
+
* @see docs/decisions/D0051-llm-client-port.md
|
|
17
|
+
*/
|
|
18
|
+
import { generateAreaSummary } from "./area-summary.js";
|
|
19
|
+
import { generateFailureModeSummary } from "./failure-mode-summary.js";
|
|
20
|
+
import { generateNoIssues } from "./no-issues.js";
|
|
21
|
+
import { generateTopRecommendations } from "./top-recommendations.js";
|
|
22
|
+
import { generateWeakestArea } from "./weakest-area.js";
|
|
23
|
+
import { generateLowConfidenceAttribution } from "./low-confidence-attribution.js";
|
|
24
|
+
import { generateDocAttributionSpotlight } from "./doc-attribution-spotlight.js";
|
|
25
|
+
import { generateRegressionVsBaseline } from "./regression-vs-baseline.js";
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
// Card registry version (cache invalidation segment)
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
/**
|
|
30
|
+
* Canonical version of the card-registry surface — bumped whenever any card
|
|
31
|
+
* generator or body schema in this barrel changes. Used as the `cardVersion`
|
|
32
|
+
* fallback in version-resolver helpers (CLI `interpret`, API
|
|
33
|
+
* `versionsFromRecord`) so the four-version cache invalidation envelope stays
|
|
34
|
+
* in sync with the actual registry.
|
|
35
|
+
*
|
|
36
|
+
* Mirrors the pattern of `diagnosisVersion` exported from `diagnosis-runner.ts`.
|
|
37
|
+
*/
|
|
38
|
+
export const CARD_REGISTRY_VERSION = "0.1.0";
|
|
39
|
+
// ---------------------------------------------------------------------------
|
|
40
|
+
// DIAGNOSIS_CARD_GENERATORS — full 8-card registry literal
|
|
41
|
+
// ---------------------------------------------------------------------------
|
|
42
|
+
/**
 * The canonical card-generator registry for the diagnosis engine.
 *
 * `Readonly<Record<CardType, CardGenerator>>` — TypeScript exhaustiveness
 * ensures all 8 `CardType` strings appear (no rogue keys, no missing keys).
 * The composition root (`packages/eval/src/composition-root.ts`) passes this
 * directly into `createDiagnosisRunner(deps)`.
 *
 * Also consumed by the Plan-06 API route, which imports via `@sanity/ailf-core`.
 *
 * The object is frozen so the `Readonly` contract also holds at runtime:
 * this single instance is shared by every importer, and an accidental
 * assignment in one consumer would otherwise change the registry for all of
 * them. Callers that need a different set of generators should build their
 * own object (e.g. `{ ...DIAGNOSIS_CARD_GENERATORS, ... }`).
 */
export const DIAGNOSIS_CARD_GENERATORS = Object.freeze({
    "area-summary": generateAreaSummary,
    "failure-mode-summary": generateFailureModeSummary,
    "no-issues": generateNoIssues,
    "top-recommendations": generateTopRecommendations,
    "weakest-area": generateWeakestArea,
    "low-confidence-attribution": generateLowConfidenceAttribution,
    "doc-attribution-spotlight": generateDocAttributionSpotlight,
    "regression-vs-baseline": generateRegressionVsBaseline,
});
|
|
62
|
+
// ---------------------------------------------------------------------------
|
|
63
|
+
// Individual re-exports (for callers that want a single generator)
|
|
64
|
+
// ---------------------------------------------------------------------------
|
|
65
|
+
export { generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, };
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* low-confidence-attribution card — LLM-driven uncertain-attribution finder.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-sonnet-4-6 (routine per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: low-confidence-attribution@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* Landmine 11: this card reads `ctx.judgmentAttributions` (NOT Report.summary).
|
|
8
|
+
* Returns `status: "missing"` when attributions are undefined or empty BEFORE
|
|
9
|
+
* calling the LLM — this is a structural seam, not an error path.
|
|
10
|
+
*
|
|
11
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
12
|
+
* mandatory.
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-RESEARCH.md (Landmine 11)
|
|
15
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
16
|
+
*/
|
|
17
|
+
import { z } from "zod";
|
|
18
|
+
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
19
|
+
export declare const LowConfidenceAttributionBodySchema: z.ZodObject<{
|
|
20
|
+
summary: z.ZodString;
|
|
21
|
+
judgmentRefs: z.ZodArray<z.ZodObject<{
|
|
22
|
+
taskId: z.ZodString;
|
|
23
|
+
modelId: z.ZodString;
|
|
24
|
+
dimension: z.ZodString;
|
|
25
|
+
}, z.core.$strip>>;
|
|
26
|
+
}, z.core.$strip>;
|
|
27
|
+
export declare const generateLowConfidenceAttribution: CardGenerator;
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* low-confidence-attribution card — LLM-driven uncertain-attribution finder.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-sonnet-4-6 (routine per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: low-confidence-attribution@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* Landmine 11: this card reads `ctx.judgmentAttributions` (NOT Report.summary).
|
|
8
|
+
* Returns `status: "missing"` when attributions are undefined or empty BEFORE
|
|
9
|
+
* calling the LLM — this is a structural seam, not an error path.
|
|
10
|
+
*
|
|
11
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
12
|
+
* mandatory.
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-RESEARCH.md (Landmine 11)
|
|
15
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
16
|
+
*/
|
|
17
|
+
import { z } from "zod";
|
|
18
|
+
import { modelId as mkModelId } from "../../../ports/llm-client.js";
|
|
19
|
+
import { buildLowConfidenceAttributionPrompt } from "../prompt-builders.js";
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
// Body schema (D0045 trust boundary — satisfies required)
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
// One judgment reference: three required, non-empty string identifiers.
const lowConfidenceJudgmentRef = z.object({
    taskId: z.string().min(1),
    modelId: z.string().min(1),
    dimension: z.string().min(1),
});
/**
 * Body schema for the low-confidence-attribution card: a 1–800 character
 * `summary` plus at least one judgment reference in `judgmentRefs`.
 */
export const LowConfidenceAttributionBodySchema = z.object({
    summary: z.string().min(1).max(800),
    judgmentRefs: z.array(lowConfidenceJudgmentRef).min(1),
});
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
// Generator
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
const CARD_MODEL = mkModelId("anthropic:claude-sonnet-4-6");
|
|
37
|
+
/**
 * Generate the low-confidence-attribution card via the LLM client.
 *
 * Three guards return a `status: "missing"` card before any LLM call:
 * no `ctx.llm`; `ctx.judgmentAttributions` undefined or empty; and every
 * entry having an empty `attributions` array. Otherwise the prompt from
 * `buildLowConfidenceAttributionPrompt` is sent through
 * `ctx.llm.completeStructured` with `LowConfidenceAttributionBodySchema`,
 * and the structured value becomes the card body.
 *
 * @param report - Report forwarded to the prompt builder.
 * @param ctx - Generator context; `llm`, `judgmentAttributions` and `runId`
 *   are read.
 * @returns A "missing" card (with a reason) or a "ready" card whose meta
 *   carries token usage, cost and model as returned by the LLM client.
 */
export const generateLowConfidenceAttribution = async (report, ctx) => {
    const cardType = "low-confidence-attribution";
    const missing = (reason) => ({ status: "missing", cardType, reason });
    // C1: nothing to call without an LLM client.
    if (!ctx.llm) {
        return missing("no LLMClient wired");
    }
    // Landmine 11: bail out before the LLM call when there is no attribution data.
    const judgmentAttributions = ctx.judgmentAttributions;
    if (!judgmentAttributions || judgmentAttributions.length === 0) {
        return missing("no attribution data for this run");
    }
    // The body schema demands at least one judgment ref, so if every entry
    // is empty the LLM would have nothing to cite — degrade here rather than
    // send an empty-table prompt and fail parsing afterwards.
    const everyEntryEmpty = judgmentAttributions.every((ja) => !(ja.attributions.length > 0));
    if (everyEntryEmpty) {
        return missing("attribution entries exist but all are empty");
    }
    const { system, user } = buildLowConfidenceAttributionPrompt(report, judgmentAttributions);
    const request = {
        model: CARD_MODEL,
        prompt: `${system}\n\n${user}`,
        schema: LowConfidenceAttributionBodySchema,
        temperature: 0.1,
        maxTokens: 2000,
        context: {
            feature: "diagnosis",
            runId: ctx.runId,
            cardId: cardType,
        },
    };
    // `cost` and `model` come back from the client alongside the value and
    // are surfaced in the card meta.
    const completion = await ctx.llm.completeStructured(request);
    const tokenUsage = {
        input: completion.usage.promptTokens,
        output: completion.usage.completionTokens,
    };
    return {
        status: "ready",
        cardType,
        body: completion.value,
        meta: {
            cardVersion: "low-confidence-attribution@0.1.0",
            tokenUsage,
            generatedAt: new Date().toISOString(),
            cost: completion.cost,
            model: completion.model,
        },
    };
};
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* no-issues card — deterministic guard against sycophantic "all is well" reports.
|
|
3
|
+
*
|
|
4
|
+
* Only fires when ALL areas in the report scored at or above
|
|
5
|
+
* `NO_ISSUES_THRESHOLD`. The threshold is calibrated to keep the firing rate
|
|
6
|
+
* ≤30% per AI-SPEC §1b failure-mode #7 (sycophantic no-issues).
|
|
7
|
+
*
|
|
8
|
+
* Pure computation, no LLM call. Uses `report.summary.scores` — each
|
|
9
|
+
* `FeatureScore.totalScore` is the composite (0–100 scale).
|
|
10
|
+
*
|
|
11
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
12
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §1b failure-mode #7
|
|
15
|
+
*/
|
|
16
|
+
import { z } from "zod";
|
|
17
|
+
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
18
|
+
/**
|
|
19
|
+
* Threshold at or above which a Report area is considered "no issues" per the
|
|
20
|
+
* team's action threshold (AI-SPEC §1b failure-mode #7). Tuned against
|
|
21
|
+
* the fixture set in no-issues.test.ts to keep firing rate ≤30%.
|
|
22
|
+
*
|
|
23
|
+
* At 85, only reports where every area scores ≥85 trigger this card.
|
|
24
|
+
* On the 10-report fixture spanning 0–99, only reports at [90,92],
|
|
25
|
+
* [95,97], [98,99] qualify — a 3/10 = 30% firing rate (right at the cap).
|
|
26
|
+
*/
|
|
27
|
+
export declare const NO_ISSUES_THRESHOLD = 85;
|
|
28
|
+
export declare const NoIssuesBodySchema: z.ZodObject<{
|
|
29
|
+
summary: z.ZodString;
|
|
30
|
+
thresholdScore: z.ZodNumber;
|
|
31
|
+
}, z.core.$strip>;
|
|
32
|
+
export declare const generateNoIssues: CardGenerator;
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* no-issues card — deterministic guard against sycophantic "all is well" reports.
|
|
3
|
+
*
|
|
4
|
+
* Only fires when ALL areas in the report scored at or above
|
|
5
|
+
* `NO_ISSUES_THRESHOLD`. The threshold is calibrated to keep the firing rate
|
|
6
|
+
* ≤30% per AI-SPEC §1b failure-mode #7 (sycophantic no-issues).
|
|
7
|
+
*
|
|
8
|
+
* Pure computation, no LLM call. Uses `report.summary.scores` — each
|
|
9
|
+
* `FeatureScore.totalScore` is the composite (0–100 scale).
|
|
10
|
+
*
|
|
11
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
12
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §1b failure-mode #7
|
|
15
|
+
*/
|
|
16
|
+
import { z } from "zod";
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Threshold constant (calibration)
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
/**
|
|
21
|
+
* Threshold at or above which a Report area is considered "no issues" per the
|
|
22
|
+
* team's action threshold (AI-SPEC §1b failure-mode #7). Tuned against
|
|
23
|
+
* the fixture set in no-issues.test.ts to keep firing rate ≤30%.
|
|
24
|
+
*
|
|
25
|
+
* At 85, only reports where every area scores ≥85 trigger this card.
|
|
26
|
+
* On the 10-report fixture spanning 0–99, only reports at [90,92],
|
|
27
|
+
* [95,97], [98,99] qualify — a 3/10 = 30% firing rate (right at the cap).
|
|
28
|
+
*/
|
|
29
|
+
export const NO_ISSUES_THRESHOLD = 85;
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
// Body schema (D0045 trust boundary — satisfies required)
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
// Field validators for the no-issues body.
const noIssuesBodyShape = {
    // Human-readable sentence, 1–500 characters.
    summary: z.string().min(1).max(500),
    // The threshold the areas were compared against; any number is accepted.
    thresholdScore: z.number(),
};
/**
 * Body schema for the no-issues card: a short `summary` and the numeric
 * `thresholdScore` that was applied.
 */
export const NoIssuesBodySchema = z.object(noIssuesBodyShape);
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
// Generator
|
|
39
|
+
// ---------------------------------------------------------------------------
|
|
40
|
+
/**
 * Build the "no-issues" diagnosis card from a report's per-area scores.
 *
 * Pure computation: no LLM call is made.
 *
 * @param {object} report - Report whose `summary.scores` entries each expose a
 *   numeric `totalScore`.
 * @returns {Promise<object>} A `status: "missing"` result (with a `reason`) when
 *   the report has no scores or any area is not at or above
 *   `NO_ISSUES_THRESHOLD`; otherwise a `status: "ready"` card whose body has been
 *   parsed by `NoIssuesBodySchema`.
 */
export const generateNoIssues = async (report) => {
    // Every non-firing outcome shares the same envelope; only the reason varies.
    const notApplicable = (reason) => ({
        status: "missing",
        cardType: "no-issues",
        reason,
    });
    const areaScores = report.summary.scores;
    if (!areaScores || areaScores.length === 0) {
        return notApplicable("report has no areas");
    }
    // Negated `>=` (rather than `<`) so NaN/undefined scores count as "not passing".
    const hasFailingArea = areaScores.some((area) => !(area.totalScore >= NO_ISSUES_THRESHOLD));
    if (hasFailingArea) {
        return notApplicable("at least one area scored below threshold");
    }
    const areaCount = areaScores.length;
    return {
        status: "ready",
        cardType: "no-issues",
        body: NoIssuesBodySchema.parse({
            summary: `All ${areaCount} areas scored ≥${NO_ISSUES_THRESHOLD} — no action required.`,
            thresholdScore: NO_ISSUES_THRESHOLD,
        }),
        meta: {
            cardVersion: "no-issues@0.1.0",
            generatedAt: new Date().toISOString(),
        },
    };
};
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* regression-vs-baseline card — LLM-driven run comparison card.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-opus-4-6 (high-stakes per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: regression-vs-baseline@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* DIAG-05: emits ONLY when `ctx.baseline` is set. When baseline is absent,
|
|
8
|
+
* returns `status: "missing", reason: "no --compare baseline supplied"`.
|
|
9
|
+
*
|
|
10
|
+
* Mitigations:
|
|
11
|
+
* - failure-mode #1: `buildRegressionVsBaselinePrompt` computes deltas in JS
|
|
12
|
+
* BEFORE the LLM call; schema refine asserts sign-consistency (R3)
|
|
13
|
+
*
|
|
14
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
15
|
+
* mandatory.
|
|
16
|
+
*
|
|
17
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3 lines 603-664
|
|
18
|
+
*/
|
|
19
|
+
import { z } from "zod";
|
|
20
|
+
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
21
|
+
/**
|
|
22
|
+
* Module-level static shape only. Per-call adds:
|
|
23
|
+
* - sign-consistency refine (R3): Math.sign(pointsDelta) === directionSign(direction)
|
|
24
|
+
*/
|
|
25
|
+
export declare const RegressionVsBaselineBodySchema: z.ZodObject<{
|
|
26
|
+
summary: z.ZodString;
|
|
27
|
+
deltas: z.ZodArray<z.ZodObject<{
|
|
28
|
+
area: z.ZodString;
|
|
29
|
+
direction: z.ZodEnum<{
|
|
30
|
+
improved: "improved";
|
|
31
|
+
regressed: "regressed";
|
|
32
|
+
unchanged: "unchanged";
|
|
33
|
+
}>;
|
|
34
|
+
pointsDelta: z.ZodNumber;
|
|
35
|
+
drivers: z.ZodArray<z.ZodString>;
|
|
36
|
+
}, z.core.$strip>>;
|
|
37
|
+
overallTrend: z.ZodEnum<{
|
|
38
|
+
"net-improved": "net-improved";
|
|
39
|
+
"net-regressed": "net-regressed";
|
|
40
|
+
mixed: "mixed";
|
|
41
|
+
stable: "stable";
|
|
42
|
+
}>;
|
|
43
|
+
}, z.core.$strip>;
|
|
44
|
+
/**
 * Card generator for the "regression-vs-baseline" card.
 *
 * Resolves to a `status: "missing"` result when `ctx.llm` is not set
 * ("no LLMClient wired") or when `ctx.baseline` is not set
 * ("no --compare baseline supplied"). Otherwise it calls
 * `ctx.llm.completeStructured` and resolves to a `status: "ready"` card whose
 * `meta` carries the card version, token usage, cost, model and timestamp.
 */
export declare const generateRegressionVsBaseline: CardGenerator;
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* regression-vs-baseline card — LLM-driven run comparison card.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-opus-4-6 (high-stakes per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: regression-vs-baseline@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* DIAG-05: emits ONLY when `ctx.baseline` is set. When baseline is absent,
|
|
8
|
+
* returns `status: "missing", reason: "no --compare baseline supplied"`.
|
|
9
|
+
*
|
|
10
|
+
* Mitigations:
|
|
11
|
+
* - failure-mode #1: `buildRegressionVsBaselinePrompt` computes deltas in JS
|
|
12
|
+
* BEFORE the LLM call; schema refine asserts sign-consistency (R3)
|
|
13
|
+
*
|
|
14
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
15
|
+
* mandatory.
|
|
16
|
+
*
|
|
17
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3 lines 603-664
|
|
18
|
+
*/
|
|
19
|
+
import { z } from "zod";
|
|
20
|
+
import { modelId as mkModelId } from "../../../ports/llm-client.js";
|
|
21
|
+
import { buildRegressionVsBaselinePrompt } from "../prompt-builders.js";
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
// Helper: direction sign check
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
/**
 * Translate a delta direction label into the sign its `pointsDelta` is
 * expected to carry.
 *
 * @param {string} direction - Direction label to translate.
 * @returns {number} `1` for "improved", `-1` for "regressed", and `0` for any
 *   other value (including "unchanged").
 */
function directionSign(direction) {
    switch (direction) {
        case "improved":
            return 1;
        case "regressed":
            return -1;
        default:
            return 0;
    }
}
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
// Body schema (D0045 trust boundary — satisfies required)
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
// One per-area comparison entry: area name, direction label, signed point
// delta, and the list of driver strings.
const regressionDeltaEntrySchema = z.object({
    area: z.string().min(1),
    direction: z.enum(["improved", "regressed", "unchanged"]),
    pointsDelta: z.number(),
    drivers: z.array(z.string()),
});
/**
 * Module-level static shape of the regression-vs-baseline card body:
 * a 1–800 character summary, at most 10 per-area deltas, and an overall trend.
 *
 * This schema does not check sign consistency. The generator validates LLM
 * output against this same shape plus a per-call refine (R3):
 * Math.sign(pointsDelta) === directionSign(direction)
 */
export const RegressionVsBaselineBodySchema = z.object({
    summary: z.string().min(1).max(800),
    deltas: z.array(regressionDeltaEntrySchema).max(10),
    overallTrend: z.enum(["net-improved", "net-regressed", "mixed", "stable"]),
});
|
|
55
|
+
// ---------------------------------------------------------------------------
|
|
56
|
+
// Generator
|
|
57
|
+
// ---------------------------------------------------------------------------
|
|
58
|
+
const CARD_MODEL = mkModelId("anthropic:claude-opus-4-6");
|
|
59
|
+
/**
 * Generate the "regression-vs-baseline" card by asking the LLM to narrate
 * deltas between `report` and `ctx.baseline`.
 *
 * @param {object} report - Current report; passed to `buildRegressionVsBaselinePrompt`.
 * @param {object} ctx - Generator context. Reads `ctx.llm` (must provide
 *   `completeStructured`), `ctx.baseline`, and `ctx.runId`.
 * @returns {Promise<object>} `status: "missing"` with a `reason` when `ctx.llm`
 *   or `ctx.baseline` is absent; otherwise a `status: "ready"` card whose body is
 *   the structured LLM value and whose `meta` records card version, token usage,
 *   timestamp, cost and model.
 */
export const generateRegressionVsBaseline = async (report, ctx) => {
    // C1: no LLM → missing
    if (!ctx.llm) {
        return {
            status: "missing",
            cardType: "regression-vs-baseline",
            reason: "no LLMClient wired",
        };
    }
    // R1: DIAG-05 no-auto-comparison — only emits when baseline is supplied
    if (!ctx.baseline) {
        return {
            status: "missing",
            cardType: "regression-vs-baseline",
            reason: "no --compare baseline supplied",
        };
    }
    // Compute deltas in JS BEFORE the LLM call (failure-mode #1 mitigation).
    // The prompt embeds the precomputed numbers so the LLM cannot fabricate deltas.
    const { system, user } = buildRegressionVsBaselinePrompt(report, ctx.baseline);
    // Per-call schema: additive R3 sign-consistency refine
    // (AI-SPEC §3 Pitfall 1 — schema local to generator, not module scope).
    // Derived from the module-level static shape rather than re-declared, so
    // the exported schema and the schema handed to the LLM cannot drift apart.
    const PerCallSchema = RegressionVsBaselineBodySchema
        // R3: sign-consistency guardrail — LLM must not invert direction vs delta
        .refine((body) => body.deltas.every((d) => Math.sign(d.pointsDelta) === directionSign(d.direction)), {
            message: "direction label must match sign of pointsDelta (improved=positive, regressed=negative, unchanged=zero)",
            path: ["deltas"],
        });
    // `cost` and `model` come back from the LLMClient alongside the value and
    // usage; forward them into the card meta instead of discarding them.
    const { value, usage, cost, model } = await ctx.llm.completeStructured({
        model: CARD_MODEL,
        prompt: `${system}\n\n${user}`,
        schema: PerCallSchema,
        temperature: 0.1,
        maxTokens: 2000,
        context: {
            feature: "diagnosis",
            runId: ctx.runId,
            cardId: "regression-vs-baseline",
        },
    });
    return {
        status: "ready",
        cardType: "regression-vs-baseline",
        body: value,
        meta: {
            cardVersion: "regression-vs-baseline@0.1.0",
            tokenUsage: { input: usage.promptTokens, output: usage.completionTokens },
            generatedAt: new Date().toISOString(),
            cost,
            model,
        },
    };
};
|