@sanity/ailf 5.0.0 → 6.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/airbyte/ai_literacy_framework.connector.yaml +276 -0
- package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql +42 -0
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +12 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +230 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +108 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +140 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +65 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +93 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +130 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +111 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +118 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +286 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +74 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +119 -2
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +136 -2
- package/dist/_vendor/ailf-core/services/index.d.ts +5 -1
- package/dist/_vendor/ailf-core/services/index.js +15 -2
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +115 -10
- package/dist/_vendor/ailf-core/types/diagnosis.js +3 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +16 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts +101 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.js +18 -0
- package/dist/adapters/config-sources/file-config-adapter.js +8 -6
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/index.d.ts +1 -1
- package/dist/adapters/llm/index.js +1 -1
- package/dist/adapters/llm/openai-llm-client.js +59 -5
- package/dist/adapters/llm/retry.d.ts +18 -0
- package/dist/adapters/llm/retry.js +21 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts +49 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.js +55 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +10 -5
- package/dist/adapters/task-sources/repo-schemas.d.ts +7 -0
- package/dist/adapters/task-sources/repo-schemas.js +10 -0
- package/dist/cli-program.js +3 -0
- package/dist/commands/interpret.d.ts +70 -0
- package/dist/commands/interpret.js +221 -0
- package/dist/commands/pipeline-action.d.ts +44 -0
- package/dist/commands/pipeline-action.js +193 -1
- package/dist/commands/run.d.ts +2 -0
- package/dist/commands/run.js +2 -0
- package/dist/composition-root.d.ts +21 -23
- package/dist/composition-root.js +107 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/grader/agent-harness.d.ts +5 -10
- package/dist/grader/agent-harness.js +5 -13
- package/dist/grader/common.d.ts +5 -13
- package/dist/grader/common.js +5 -17
- package/dist/grader/index.d.ts +15 -29
- package/dist/grader/index.js +15 -66
- package/dist/grader/knowledge-probe.d.ts +5 -10
- package/dist/grader/knowledge-probe.js +5 -14
- package/dist/grader/literacy.d.ts +5 -9
- package/dist/grader/literacy.js +5 -13
- package/dist/grader/mcp.d.ts +5 -10
- package/dist/grader/mcp.js +5 -14
- package/dist/orchestration/pipeline-orchestrator.js +3 -0
- package/dist/report-store.d.ts +26 -0
- package/dist/report-store.js +63 -0
- package/package.json +2 -2
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* interpret command — generate a Diagnosis for a Report.
|
|
3
|
+
*
|
|
4
|
+
* Wraps `getDiagnosisRunner(ctx)` from the composition root in a Commander
|
|
5
|
+
* command for consistent CLI integration. Closest analog: compare.ts.
|
|
6
|
+
*
|
|
7
|
+
* Entry points:
|
|
8
|
+
* ailf interpret <reportId> — one-line-per-card summary
|
|
9
|
+
* ailf interpret <reportId> --json — full Diagnosis JSON
|
|
10
|
+
* ailf interpret latest — most recent report
|
|
11
|
+
* ailf interpret <id> --compare <ref> — DIAG-05 regression comparison
|
|
12
|
+
* ailf interpret <id> --refresh — bypass version-keyed cache
|
|
13
|
+
*
|
|
14
|
+
* @see packages/eval/src/commands/compare.ts — CLI factory analog
|
|
15
|
+
* @see packages/eval/src/composition-root.ts — getDiagnosisRunner
|
|
16
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §6
|
|
17
|
+
*/
|
|
18
|
+
import { dirname, resolve } from "path";
|
|
19
|
+
import { fileURLToPath } from "url";
|
|
20
|
+
import { Command } from "commander";
|
|
21
|
+
import { CARD_REGISTRY_VERSION, diagnosisVersion, } from "../_vendor/ailf-core/index.js";
|
|
22
|
+
import { addOutputDirOption } from "./shared/options.js";
|
|
23
|
+
import { resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
// Module-level root constant (same pattern as compare.ts)
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
28
|
+
const ROOT = resolve(__dirname, "..", "..");
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
// Card output formatting (AI-SPEC §6 graceful-degradation-visibility)
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
/**
 * Visual status markers — locked visual contract per plan Test 7:
 *   ready: "✓", degraded: "⚠", missing: "—"
 *
 * Exported so Plan 06-04's post-run hook imports the SAME object and
 * D6-04's "single formatter, single visual contract" is physically
 * enforced — no copy/paste drift possible.
 *
 * The object is frozen: it is shared by every importer, so a stray
 * assignment in one module would otherwise silently change the icons
 * printed by all of them.
 */
export const STATUS_ICONS = Object.freeze({
    ready: "✓",
    degraded: "⚠",
    missing: "—",
});
|
|
45
|
+
/**
 * Pick the human-readable text shown for a card.
 *
 * A "ready" card contributes `body.summary`; every other status
 * ("degraded", "missing") contributes its `reason`.
 */
function getCardSummaryText(card) {
    return card.status === "ready" ? card.body.summary : card.reason;
}
|
|
55
|
+
/**
 * Render one diagnosis card as a single summary line.
 *
 * Shape: `<icon> <cardType>: <text>`, where the icon is looked up in
 * STATUS_ICONS by the card's status and the text comes from
 * getCardSummaryText (AI-SPEC §6: distinct icons for ready / degraded /
 * missing).
 *
 * Exported so Plan 06-04's post-run hook reuses this exact formatter
 * (D6-04 "single formatter, single visual contract").
 */
export function formatCardSummaryLine(card) {
    const { status, cardType } = card;
    return `${STATUS_ICONS[status]} ${cardType}: ${getCardSummaryText(card)}`;
}
|
|
70
|
+
// ---------------------------------------------------------------------------
|
|
71
|
+
// Default versions resolver
|
|
72
|
+
// ---------------------------------------------------------------------------
|
|
73
|
+
/**
 * Derive the four-field versions object from a stored report record.
 *
 * Each field is read from `report.summary.versions` and used only when it
 * is a string. Otherwise the fallback is:
 *   - graderJudgmentsVersion, ensembleVersion → the literal "unknown"
 *   - diagnosisVersion → the imported `diagnosisVersion` constant
 *   - cardVersion      → the imported `CARD_REGISTRY_VERSION` constant
 *
 * This covers legacy reports that carry no version metadata.
 */
function defaultVersionsFromReport(report) {
    const stored = report.summary?.versions;
    // Return the stored value for `field` when it is a string, else `fallback`.
    const stringOr = (field, fallback) => {
        const candidate = stored?.[field];
        return typeof candidate === "string" ? candidate : fallback;
    };
    return {
        graderJudgmentsVersion: stringOr("graderJudgmentsVersion", "unknown"),
        ensembleVersion: stringOr("ensembleVersion", "unknown"),
        diagnosisVersion: stringOr("diagnosisVersion", diagnosisVersion),
        cardVersion: stringOr("cardVersion", CARD_REGISTRY_VERSION),
    };
}
|
|
100
|
+
// ---------------------------------------------------------------------------
|
|
101
|
+
// Command factory
|
|
102
|
+
// ---------------------------------------------------------------------------
|
|
103
|
+
/**
 * Build the Commander command behind `ailf interpret <reportId>`.
 *
 * `options` exists for testability: when supplied, `storeFactory`,
 * `runnerFactory` and `versionsFromReport` replace the composition-root
 * wiring and the default version resolver, so tests need no module mocks.
 *
 * Action flow: resolve the report store, load the report ("latest" or by
 * id), optionally load the `--compare` baseline, derive the versions, run
 * the diagnosis runner, then print either the full JSON (`--json`) or one
 * formatted line per card. A missing store, report or baseline writes an
 * "Error: …" line to stderr and calls `process.exit(1)`.
 */
export function createInterpretCommand(options = {}) {
    const { runnerFactory, storeFactory, versionsFromReport } = options;
    // Write one line to stderr, then terminate with exit code 1.
    const abort = (line) => {
        process.stderr.write(line);
        process.exit(1);
    };
    const interpret = new Command("interpret");
    interpret.description("Generate a Diagnosis for a Report — 8 typed cards explaining what's weak and what to do");
    interpret.argument("<reportId>", "Report ID (or 'latest' for the most recent)");
    interpret.option("-c, --compare <ref>", "Baseline report ID for regression-vs-baseline comparison");
    interpret.option("--refresh", "Bypass the version-keyed cache and recompute");
    interpret.option("--json", "Print full Diagnosis JSON instead of one-line-per-card summary");
    interpret.action(async (reportId, opts) => {
        const outputDir = resolveOutputDir(opts.outputDir);
        // Store and context: an injected factory (tests) leaves the context
        // null; otherwise both come from the composition root.
        let ctx = null;
        let store;
        if (!storeFactory) {
            // Imported lazily so the module stays cheap to load in tests.
            // The config is read-only: no eval, fetch, publish or caching.
            const compositionRoot = await import("../composition-root.js");
            ctx = compositionRoot.createAppContext({
                compareEnabled: false,
                gapAnalysisEnabled: false,
                mode: "literacy",
                noAutoScope: false,
                noCache: true,
                noRemoteCache: true,
                outputDir,
                publishEnabled: false,
                rootDir: ROOT,
                searchMode: "open",
                skipEval: true,
                skipFetch: true,
                remote: false,
                apiUrl: "https://ailf-api.sanity.build",
            });
            store = ctx.reportStore;
        }
        else {
            store = storeFactory();
        }
        if (!store) {
            abort("Error: report store is not available\n");
        }
        // Main report: the literal id "latest" selects the most recent one.
        const report = await (reportId === "latest"
            ? store.latest()
            : store.read(reportId));
        if (!report) {
            abort(`Error: report not found: ${reportId}\n`);
        }
        // Optional baseline for the regression comparison (DIAG-05).
        const baselineRef = opts.compare;
        let baseline;
        if (baselineRef) {
            baseline = await store.read(baselineRef);
            if (!baseline) {
                abort(`Error: baseline report not found: ${baselineRef}\n`);
            }
        }
        // Versions: injected resolver when given, module default otherwise.
        const resolveVersions = versionsFromReport || defaultVersionsFromReport;
        const versions = resolveVersions(report);
        // Runner: injected factory when given, composition root otherwise.
        let buildRunner = runnerFactory;
        if (!buildRunner) {
            const { getDiagnosisRunner } = await import("../composition-root.js");
            buildRunner = getDiagnosisRunner;
        }
        const runner = buildRunner(ctx);
        // `baseline` is only included in the request when one was loaded.
        const diagnosis = await runner.run({
            report,
            versions,
            ...(baseline ? { baseline } : {}),
            refresh: opts.refresh ?? false,
        });
        if (opts.json) {
            process.stdout.write(`${JSON.stringify(diagnosis, null, 2)}\n`);
            return;
        }
        for (const card of diagnosis.cards) {
            process.stdout.write(`${formatCardSummaryLine(card)}\n`);
        }
    });
    addOutputDirOption(interpret);
    return interpret;
}
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
*/
|
|
13
13
|
import { type ImpactSummary } from "../pipeline/reverse-mapping.js";
|
|
14
14
|
import type { DebugOptions, EvalMode } from "../pipeline/types.js";
|
|
15
|
+
import { type Diagnosis, type ReportStorePort, type SynthesisCostTelemetry } from "../_vendor/ailf-core/index.d.ts";
|
|
15
16
|
import type { PipelineCliOptions } from "./run.js";
|
|
16
17
|
export interface ResolvedOptions {
|
|
17
18
|
allowedOriginArgs: string[];
|
|
@@ -63,6 +64,12 @@ export interface ResolvedOptions {
|
|
|
63
64
|
studioOriginOverride?: string;
|
|
64
65
|
remote: boolean;
|
|
65
66
|
repoTasksPath?: string;
|
|
67
|
+
/** Phase 6 / DIAG-06: post-run diagnosis summary policy. Precedence
|
|
68
|
+
* resolution (CLI flag > env > config > auto) lives in
|
|
69
|
+
* shouldRunPostSummary() — this field carries only the config-file
|
|
70
|
+
* signal so the helper has a single typed input.
|
|
71
|
+
*/
|
|
72
|
+
summaryOnRun?: "auto" | "always" | "never";
|
|
66
73
|
taskOption?: string;
|
|
67
74
|
tagOption?: string[];
|
|
68
75
|
taskSourceType?: "content-lake" | "repo";
|
|
@@ -88,6 +95,43 @@ export interface ResolvedOptions {
|
|
|
88
95
|
* Exported so the plan builder can call it independently.
|
|
89
96
|
*/
|
|
90
97
|
export declare function computeResolvedOptions(opts: PipelineCliOptions): ResolvedOptions;
|
|
98
|
+
/**
|
|
99
|
+
* Determine whether the post-run diagnosis summary hook should fire.
|
|
100
|
+
*
|
|
101
|
+
* 4-level precedence chain (D6-20):
|
|
102
|
+
* Level 1 — CLI flag (absolute): if `cliOpts.summary` is boolean, use it.
|
|
103
|
+
* Level 2 — AILF_INTERPRET_ON_RUN env var (absolute): strict "1"/"0" parse;
|
|
104
|
+
* anything else falls through (T-06-11 spoofing mitigation).
|
|
105
|
+
* Level 3 — config `summary.onRun` (absolute): "always" → true; "never" → false;
|
|
106
|
+
* "auto" or absent falls through to level 4.
|
|
107
|
+
* Level 4 — default auto: TTY && !CI (SC1 default-off in CI).
|
|
108
|
+
*/
|
|
109
|
+
export declare function shouldRunPostSummary(cliOpts: PipelineCliOptions, resolvedOnRun: "auto" | "always" | "never" | undefined): boolean;
|
|
110
|
+
export declare function buildSynthesisTelemetry(diagnosis: Diagnosis): SynthesisCostTelemetry;
|
|
111
|
+
/**
|
|
112
|
+
* Run post-pipeline hooks after the pipeline completes.
|
|
113
|
+
*
|
|
114
|
+
* Fires after orchestratePipeline() + writePipelineResult() (D6-02).
|
|
115
|
+
* Hook failure prints to stderr but does NOT change exit code (D6-03).
|
|
116
|
+
* CI default-off: fires only when shouldRunPostSummary returns true (D6-20).
|
|
117
|
+
*
|
|
118
|
+
* @param ctx - App context (composition root wiring)
|
|
119
|
+
* @param result - Pipeline result (includes reportId when published)
|
|
120
|
+
* @param args - Hook options (cliOpts, summaryOnRun from config, optional runnerFactory for tests)
|
|
121
|
+
*/
|
|
122
|
+
export declare function runPostPipelineHooks(ctx: {
|
|
123
|
+
reportStore?: ReportStorePort;
|
|
124
|
+
}, result: {
|
|
125
|
+
success: boolean;
|
|
126
|
+
reportId?: string;
|
|
127
|
+
}, args: {
|
|
128
|
+
cliOpts: PipelineCliOptions;
|
|
129
|
+
summaryOnRun?: "auto" | "always" | "never";
|
|
130
|
+
/** Override runner factory for tests — avoids vi.mock of composition root */
|
|
131
|
+
runnerFactory?: (ctx: unknown) => {
|
|
132
|
+
run(opts: unknown): Promise<Diagnosis>;
|
|
133
|
+
};
|
|
134
|
+
}): Promise<void>;
|
|
91
135
|
/**
|
|
92
136
|
* Execute the evaluation pipeline.
|
|
93
137
|
*
|
|
@@ -20,9 +20,13 @@ import { buildAppContext, parseArtifactUploadEnv, } from "../orchestration/build
|
|
|
20
20
|
import { buildStepSequence } from "../orchestration/build-step-sequence.js";
|
|
21
21
|
import { orchestratePipeline } from "../orchestration/pipeline-orchestrator.js";
|
|
22
22
|
import { load } from "js-yaml";
|
|
23
|
-
import { PLACEHOLDER_OWNER_TEAM } from "../_vendor/ailf-core/index.js";
|
|
23
|
+
import { CARD_REGISTRY_VERSION, PLACEHOLDER_OWNER_TEAM, diagnosisVersion, } from "../_vendor/ailf-core/index.js";
|
|
24
24
|
import { parseRepoConfig, } from "../adapters/task-sources/repo-schemas.js";
|
|
25
25
|
import { getCallerCwd, resolveOutputDir } from "./shared/resolve-output-dir.js";
|
|
26
|
+
// Phase 6 / DIAG-06 — single formatter, single visual contract (D6-04).
|
|
27
|
+
// Import statically so bundlers and type-checkers can verify the export
|
|
28
|
+
// exists at build time rather than deferring to runtime dynamic import.
|
|
29
|
+
import { formatCardSummaryLine } from "./interpret.js";
|
|
26
30
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
27
31
|
const ROOT = resolve(__dirname, "..", "..");
|
|
28
32
|
// ---------------------------------------------------------------------------
|
|
@@ -250,6 +254,10 @@ export function computeResolvedOptions(opts) {
|
|
|
250
254
|
const graderReplications = repoConfig?.execution?.graderReplications;
|
|
251
255
|
const borderlineReplications = repoConfig?.execution?.borderlineReplications;
|
|
252
256
|
const gapAnalysisEnabled = repoConfig?.execution?.gapAnalysis ?? true;
|
|
257
|
+
// Phase 6 / DIAG-06 — post-run diagnosis summary policy from .ailf/config.yaml.
|
|
258
|
+
// Precedence resolution (CLI flag > env > config > auto) lives in
|
|
259
|
+
// shouldRunPostSummary(); this only carries the config-file signal.
|
|
260
|
+
const summaryOnRun = repoConfig?.summary?.onRun;
|
|
253
261
|
// Grader context policy. Cascade: env var > .ailf/config.yaml > unset
|
|
254
262
|
// (defaults to rubric-only at the EvalConfig boundary). The env var is the
|
|
255
263
|
// operational lever for one-shot comparison runs without editing the config file.
|
|
@@ -348,6 +356,7 @@ export function computeResolvedOptions(opts) {
|
|
|
348
356
|
undefined,
|
|
349
357
|
purposeOption: opts.purpose?.trim() || undefined,
|
|
350
358
|
labelOptions: opts.label ?? [],
|
|
359
|
+
summaryOnRun,
|
|
351
360
|
};
|
|
352
361
|
}
|
|
353
362
|
const PUBLISH_AUTO_VALUES = ["always", "full-runs", "never"];
|
|
@@ -373,6 +382,179 @@ function resolvePublishAuto(repoValue) {
|
|
|
373
382
|
}
|
|
374
383
|
return "full-runs";
|
|
375
384
|
}
|
|
385
|
+
// ---------------------------------------------------------------------------
|
|
386
|
+
// Phase 6 / DIAG-06 — post-run diagnosis summary helpers
|
|
387
|
+
// ---------------------------------------------------------------------------
|
|
388
|
+
/**
 * Decide whether the post-run diagnosis summary hook should fire.
 *
 * The first level that gives a definite answer wins (D6-20):
 *   1. CLI flag — `cliOpts.summary` when it is exactly `true` or `false`.
 *   2. `AILF_INTERPRET_ON_RUN` env var — only the exact strings "1" (on)
 *      and "0" (off) count; any other value, or unset, is ignored
 *      (T-06-11 spoofing mitigation).
 *   3. Config `summary.onRun` — "always" is on, "never" is off; "auto" or
 *      undefined defers to level 4.
 *   4. Auto — on only when stdout is a TTY and `CI` is not the string
 *      "true" (SC1 default-off in CI).
 *
 * @param cliOpts - Parsed CLI options; only `summary` is read.
 * @param resolvedOnRun - The config-file value of `summary.onRun`, if any.
 * @returns `true` when the hook should run.
 */
export function shouldRunPostSummary(cliOpts, resolvedOnRun) {
    // Level 1: an explicit boolean from the CLI is final.
    const cliFlag = cliOpts.summary;
    if (typeof cliFlag === "boolean") {
        return cliFlag;
    }
    // Level 2: strict env parse — only "1" and "0" are recognised.
    const envFlag = process.env.AILF_INTERPRET_ON_RUN;
    if (envFlag === "1" || envFlag === "0") {
        return envFlag === "1";
    }
    // Level 3: config policy; "auto" and undefined fall out of the switch.
    switch (resolvedOnRun) {
        case "always":
            return true;
        case "never":
            return false;
        default:
            break;
    }
    // Level 4: interactive terminal outside CI.
    const interactive = Boolean(process.stdout.isTTY);
    const inCi = process.env.CI === "true";
    return interactive && !inCi;
}
|
|
421
|
+
/**
 * Build a SynthesisCostTelemetry payload from a completed Diagnosis.
 *
 * Aggregates:
 *   - cost: sum of meta.cost across all cards (undefined treated as 0)
 *   - parseFailureCount: number of cards whose status is "degraded"
 *   - parseFailureRate: parseFailureCount divided by the larger of
 *     CARD_REGISTRY_SIZE and the number of cards, so it always lies in [0, 1]
 *   - perCard: one row per card built from structured metadata only
 *
 * Deliberately does NOT read card.body — only structured meta fields are
 * persisted (T-06-14 PII guard per threat model).
 *
 * @param diagnosis - Completed diagnosis; only `diagnosis.cards` is read.
 * @returns `{ cost, parseFailureCount, parseFailureRate, perCard }`.
 */
// D6-09: the denominator is the fixed card-registry size rather than
// cards.length, so a diagnosis produced from a subset registry is still
// rated against the full registry. A fixed denominator alone would let the
// rate exceed 1.0 as soon as more than CARD_REGISTRY_SIZE cards are degraded,
// which violates the SynthesisCostTelemetrySchema min(0).max(1) constraint —
// hence the Math.max with cards.length below.
const CARD_REGISTRY_SIZE = 8;
export function buildSynthesisTelemetry(diagnosis) {
    const cards = diagnosis.cards;
    let totalCost = 0;
    let parseFailureCount = 0;
    const perCard = cards.map((card) => {
        // "missing" cards have no `meta` — narrow with status guard
        const meta = card.status !== "missing" ? card.meta : undefined;
        const cost = meta?.cost ?? 0;
        totalCost += cost;
        // Every "degraded" card is counted as a parse failure. A "missing"
        // card is absence, not failure. If a future code path can produce
        // status="missing" from a parse failure, this line must be updated
        // and the parseFailed contract re-evaluated.
        const parseFailed = card.status === "degraded";
        if (parseFailed) {
            parseFailureCount++;
        }
        // Optional fields are added only when present, so absent values do
        // not appear as explicit `undefined` keys in the persisted row.
        return {
            cardType: card.cardType,
            parseFailed,
            cardVersion: meta?.cardVersion ?? "unknown",
            generatedAt: meta?.generatedAt ?? new Date().toISOString(),
            ...(cost > 0 ? { cost } : {}),
            ...(meta?.latencyMs !== undefined ? { latencyMs: meta.latencyMs } : {}),
            ...(meta?.tokenUsage?.input !== undefined
                ? { tokenInput: meta.tokenUsage.input }
                : {}),
            ...(meta?.tokenUsage?.output !== undefined
                ? { tokenOutput: meta.tokenUsage.output }
                : {}),
        };
    });
    // Never smaller than the number of cards actually rated, so the rate
    // cannot exceed 1 even if the registry grows past CARD_REGISTRY_SIZE.
    const denominator = Math.max(CARD_REGISTRY_SIZE, cards.length);
    return {
        cost: totalCost,
        parseFailureCount,
        parseFailureRate: parseFailureCount / denominator,
        perCard,
    };
}
|
|
477
|
+
/**
 * Run post-pipeline hooks after the pipeline completes.
 *
 * Fires after orchestratePipeline() + writePipelineResult() (D6-02).
 * Hook failure prints to stderr but does NOT change exit code (D6-03):
 * every failure path here writes a message and returns normally.
 * CI default-off: fires only when shouldRunPostSummary returns true (D6-20).
 *
 * Steps: build the diagnosis runner, read the stored report, derive the
 * versions, run the diagnosis, print one formatted line per card to
 * stdout, then patch the synthesis telemetry onto the stored report.
 *
 * @param ctx - App context (composition root wiring); `reportStore` is read
 * @param result - Pipeline result (includes reportId when published)
 * @param args - Hook options (cliOpts, summaryOnRun from config, optional runnerFactory for tests)
 */
export async function runPostPipelineHooks(ctx, result, args) {
    if (!shouldRunPostSummary(args.cliOpts, args.summaryOnRun))
        return;
    const reportId = result.reportId;
    if (!reportId) {
        process.stderr.write("ℹ️ No report published — skipping post-summary.\n");
        return;
    }
    // Without a store the report can be neither read nor patched. Say so
    // up front instead of reporting a misleading "Report not found".
    const store = ctx.reportStore;
    if (!store) {
        process.stderr.write("ℹ️ No reportStore configured — skipping post-summary.\n");
        return;
    }
    let diagnosis;
    try {
        // Build the runner — use injected factory (tests) or composition root (production)
        let runner;
        if (args.runnerFactory) {
            runner = args.runnerFactory(ctx);
        }
        else {
            const { getDiagnosisRunner } = await import("../composition-root.js");
            runner = getDiagnosisRunner(ctx);
        }
        // Read the stored report — needed by the runner for version metadata
        const report = await store.read(reportId);
        if (!report) {
            process.stderr.write(`ℹ️ Report not found: ${reportId} — skipping post-summary.\n`);
            return;
        }
        // Derive version metadata from the stored report with the same
        // fallbacks as the `interpret` command, so both entry points pass
        // identical versions to the runner for the same report.
        const stored = report.summary?.versions;
        const stringOr = (field, fallback) => {
            const candidate = stored?.[field];
            return typeof candidate === "string" ? candidate : fallback;
        };
        const versionedInputs = {
            graderJudgmentsVersion: stringOr("graderJudgmentsVersion", "unknown"),
            ensembleVersion: stringOr("ensembleVersion", "unknown"),
            diagnosisVersion: stringOr("diagnosisVersion", diagnosisVersion),
            cardVersion: stringOr("cardVersion", CARD_REGISTRY_VERSION),
        };
        // Run the diagnosis
        diagnosis = await runner.run({
            report: report,
            versions: versionedInputs,
            refresh: false,
        });
        // Print per-card summary lines to stdout (D6-04 single formatter)
        for (const card of diagnosis.cards) {
            process.stdout.write(`${formatCardSummaryLine(card)}\n`);
        }
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        process.stderr.write(`⚠️ Diagnosis failed: ${msg}. Run \`ailf interpret ${reportId}\` to retry.\n`);
        return;
    }
    // Build and write synthesis telemetry back to the report doc (D6-08).
    // Handled separately: at this point the diagnosis succeeded and was
    // printed, so a write failure must not be reported as a diagnosis failure.
    try {
        const telemetry = buildSynthesisTelemetry(diagnosis);
        await store.patchSynthesis(reportId, telemetry);
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        process.stderr.write(`⚠️ Synthesis telemetry not written: ${msg}.\n`);
    }
}
|
|
376
558
|
/** Resolve and validate the --task-source flag value. */
|
|
377
559
|
function resolveTaskSourceType(raw) {
|
|
378
560
|
if (!raw || raw === "content-lake")
|
|
@@ -471,6 +653,11 @@ export async function executePipeline(cliOpts) {
|
|
|
471
653
|
const steps = buildStepSequence(ctx, pipelineStart);
|
|
472
654
|
const result = await orchestratePipeline(ctx, steps);
|
|
473
655
|
writePipelineResult(result, config.outputDir);
|
|
656
|
+
// Phase 6 / DIAG-06: post-run hook fires after artifacts are written (D6-02)
|
|
657
|
+
await runPostPipelineHooks(ctx, result, {
|
|
658
|
+
cliOpts,
|
|
659
|
+
summaryOnRun: config.summaryOnRun,
|
|
660
|
+
});
|
|
474
661
|
process.exit(result.success ? 0 : 1);
|
|
475
662
|
}
|
|
476
663
|
const o = resolveOptions(cliOpts);
|
|
@@ -510,6 +697,11 @@ export async function executePipeline(cliOpts) {
|
|
|
510
697
|
const steps = buildStepSequence(ctx, pipelineStart);
|
|
511
698
|
const result = await orchestratePipeline(ctx, steps);
|
|
512
699
|
writePipelineResult(result, o.outputDir);
|
|
700
|
+
// Phase 6 / DIAG-06: post-run hook fires after artifacts are written (D6-02)
|
|
701
|
+
await runPostPipelineHooks(ctx, result, {
|
|
702
|
+
cliOpts,
|
|
703
|
+
summaryOnRun: o.summaryOnRun,
|
|
704
|
+
});
|
|
513
705
|
process.exit(result.success ? 0 : 1);
|
|
514
706
|
}
|
|
515
707
|
// ---------------------------------------------------------------------------
|
package/dist/commands/run.d.ts
CHANGED
|
@@ -47,6 +47,8 @@ export interface PipelineCliOptions {
|
|
|
47
47
|
publish?: boolean;
|
|
48
48
|
publishTag?: string;
|
|
49
49
|
remoteCache?: boolean;
|
|
50
|
+
/** Phase 6 / DIAG-06: post-run diagnosis summary toggle. Undefined when neither flag is passed. */
|
|
51
|
+
summary?: boolean;
|
|
50
52
|
sanityDocument: string[];
|
|
51
53
|
sanityPerspective?: string;
|
|
52
54
|
search?: string;
|
package/dist/commands/run.js
CHANGED
|
@@ -43,6 +43,8 @@ export function createRunCommand() {
|
|
|
43
43
|
.option("-p, --publish", "Write report to Sanity + fan out to sinks (auto-enabled for full runs when report store is configured)")
|
|
44
44
|
.option("--no-publish", "Suppress auto-publishing")
|
|
45
45
|
.option("--publish-tag <tag>", "Label for published report")
|
|
46
|
+
.option("--summary", "Force post-run diagnosis summary (overrides config and CI default-off)")
|
|
47
|
+
.option("--no-summary", "Suppress post-run diagnosis summary")
|
|
46
48
|
.option("--config <path>", "Load pipeline config from a TS/JS/YAML/JSON file (overrides most CLI flags)")
|
|
47
49
|
.option("-o, --output <path>", "Write PR comment markdown to file")
|
|
48
50
|
.option("--promptfoo-url <url>", "Promptfoo share URL for report")
|
|
@@ -15,7 +15,8 @@
|
|
|
15
15
|
* @see packages/core/src/ports/context.ts — AppContext interface
|
|
16
16
|
* @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
|
|
17
17
|
*/
|
|
18
|
-
import { type AppContext, type ArtifactWriter, type ArtifactWriterProgressOptions, type AssertionRegistration, type
|
|
18
|
+
import { type AppContext, type ArtifactWriter, type ArtifactWriterProgressOptions, type AssertionRegistration, type CardRegistry, type DiagnosisRunner, type Logger, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
|
|
19
|
+
export type { LLMClientKeys } from "./_vendor/ailf-core/index.d.ts";
|
|
19
20
|
import { type BorderlineConsensusOptions, type BorderlineConsensusResult } from "./pipeline/borderline-consensus-runner.js";
|
|
20
21
|
import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource } from "./adapters/task-sources/index.js";
|
|
21
22
|
/**
|
|
@@ -25,28 +26,6 @@ import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource } from "./ad
|
|
|
25
26
|
* Swapping an adapter is a one-line change in this function.
|
|
26
27
|
*/
|
|
27
28
|
export declare function createAppContext(config: ResolvedConfig): AppContext;
|
|
28
|
-
/**
|
|
29
|
-
* Typed key bag passed to `createLLMClient`. The composition root reads
|
|
30
|
-
* env once and supplies values here; the factory stays pure so tests don't
|
|
31
|
-
* have to mutate `process.env`.
|
|
32
|
-
*/
|
|
33
|
-
export interface LLMClientKeys {
|
|
34
|
-
anthropicApiKey?: string;
|
|
35
|
-
openaiApiKey?: string;
|
|
36
|
-
}
|
|
37
|
-
/**
|
|
38
|
-
* Select the LLMClient adapter based on `config.llmProvider` and the
|
|
39
|
-
* supplied API keys. Returns `undefined` when no usable credential is
|
|
40
|
-
* present — `AppContext.llmClient` stays unset and consumers handle that
|
|
41
|
-
* explicitly.
|
|
42
|
-
*
|
|
43
|
-
* Adapters never read `process.env` themselves (per
|
|
44
|
-
* `.claude/rules/typescript.md`); env mapping happens at the call site
|
|
45
|
-
* (typically `createAppContext`).
|
|
46
|
-
*
|
|
47
|
-
* Exported for unit-test access; not part of the public package API.
|
|
48
|
-
*/
|
|
49
|
-
export declare function createLLMClient(config: ResolvedConfig, keys: LLMClientKeys, logger: Logger): LLMClient | undefined;
|
|
50
29
|
/**
|
|
51
30
|
* Selects the `ArtifactWriter` wiring per D0033 M4:
|
|
52
31
|
*
|
|
@@ -119,3 +98,22 @@ export declare const DEFAULT_BORDERLINE_REPLICATIONS = 3;
|
|
|
119
98
|
export declare function createBorderlineConsensusRunner(opts: {
|
|
120
99
|
borderlineReplications?: number;
|
|
121
100
|
}): (args: Pick<BorderlineConsensusOptions, "judgments" | "logger" | "regrade">) => Promise<BorderlineConsensusResult>;
|
|
101
|
+
/**
 * Returns the full 8-card `CardRegistry` backed by `DIAGNOSIS_CARD_GENERATORS`
 * from `@sanity/ailf-core`. Exposed as a function (not a module-level const)
 * so the composition root remains the single-seam factory and tests can assert
 * the call site (AI-SPEC §3 Pitfall 1 — no module-scope mutables).
 *
 * @returns The `CardRegistry` holding the diagnosis card generators.
 */
export declare function buildDiagnosisRegistry(): CardRegistry;
|
|
108
|
+
/**
 * Build a fully-wired `DiagnosisRunner` from an `AppContext`.
 *
 * Wires the full 8-card registry, `loadAttributions` bound to the local
 * filesystem (Phase-4 per-entry attribution objects at
 * `{artifactsDir}/runs/{runId}/attribution/*.json`), and no-op cache
 * reader/writer (Plan-06 CLI command will wire the real cache seam).
 *
 * Plan-06 API/CLI consumers import this function from the composition root
 * and pass `ctx` from `createAppContext(config)`.
 *
 * @param ctx - App context, typically the value returned by `createAppContext(config)`.
 * @returns The wired `DiagnosisRunner`.
 */
export declare function getDiagnosisRunner(ctx: AppContext): DiagnosisRunner;
|