@sanity/ailf 4.6.0 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +50 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +35 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +136 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +153 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +6 -0
- package/dist/_vendor/ailf-core/services/index.js +18 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
- package/dist/_vendor/ailf-core/types/confidence.js +7 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +271 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +19 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
- package/dist/_vendor/ailf-core/types/index.js +15 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/openai-llm-client.js +52 -3
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/cli-program.js +3 -0
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/interpret.d.ts +50 -0
- package/dist/commands/interpret.js +212 -0
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +57 -23
- package/dist/composition-root.js +155 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +9 -0
- package/dist/grader/agent-harness.js +9 -0
- package/dist/grader/common.d.ts +9 -0
- package/dist/grader/common.js +9 -0
- package/dist/grader/index.d.ts +24 -0
- package/dist/grader/index.js +24 -0
- package/dist/grader/knowledge-probe.d.ts +9 -0
- package/dist/grader/knowledge-probe.js +9 -0
- package/dist/grader/literacy.d.ts +9 -0
- package/dist/grader/literacy.js +9 -0
- package/dist/grader/mcp.d.ts +9 -0
- package/dist/grader/mcp.js +9 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +7 -5
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline step: Per-judgment attribution ensemble (v0).
|
|
3
|
+
*
|
|
4
|
+
* Reads `grader-judgments.json` and `score-summary.json` from the latest
|
|
5
|
+
* results, calls the pure `computeJudgmentAttribution(...)` helper for each
|
|
6
|
+
* judgment, and emits:
|
|
7
|
+
*
|
|
8
|
+
* - One `perEntryAttribution` artifact per judgment at
|
|
9
|
+
* `runs/{runId}/attribution/{entryKey}.json`
|
|
10
|
+
* - One `attributionMeta` artifact at
|
|
11
|
+
* `runs/{runId}/attribution-meta.json`
|
|
12
|
+
*
|
|
13
|
+
* Additionally, when any hallucinated citations are detected, the step
|
|
14
|
+
* atomically rewrites `score-summary.json` to persist
|
|
15
|
+
* `graderReliability.hallucinationCount` (D-05 — only this one direct-
|
|
16
|
+
* mutation path uses the temp+rename pattern; all artifact emissions go
|
|
17
|
+
* through `ctx.artifactWriter.emit` which handles atomicity internally).
|
|
18
|
+
*
|
|
19
|
+
* This step is `optional: true` — it self-skips when either
|
|
20
|
+
* `grader-judgments.json` or `score-summary.json` is missing, so
|
|
21
|
+
* non-graded runs are unaffected.
|
|
22
|
+
*
|
|
23
|
+
* Task → judgment join (D-10): `judgment.taskId` is the promptfoo row
|
|
24
|
+
* description, which for literacy mode is `"${task.title} (gold|baseline)"`.
|
|
25
|
+
* The join strips the variant suffix and looks up in a triple-keyed cache
|
|
26
|
+
* by `task.title` (primary), `task.description`, and `task.id` (defensive
|
|
27
|
+
* fallbacks for non-literacy modes).
|
|
28
|
+
*
|
|
29
|
+
* Retrieved signal (D-11/D-12): `FeatureAgentBehavior.feature` is the join
|
|
30
|
+
* key — for literacy mode it equals `task.area` exactly (compiler propagates
|
|
31
|
+
* `task.area → __featureArea → ab.feature`).
|
|
32
|
+
*
|
|
33
|
+
* @see docs/decisions/D0033-unified-artifact-writer.md
|
|
34
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
35
|
+
* @see docs/decisions/D0050-per-entry-attribution-layout.md
|
|
36
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
37
|
+
*/
|
|
38
|
+
import { existsSync, readFileSync, renameSync, unlinkSync, writeFileSync, } from "node:fs";
|
|
39
|
+
import { resolve } from "node:path";
|
|
40
|
+
import { isSlugRef } from "../../_vendor/ailf-core/index.js";
|
|
41
|
+
import { calibrationSetVersion, embeddingModel, ensembleVersion, } from "../../pipeline/attribution.js";
|
|
42
|
+
import { V0_WEIGHTS, computeJudgmentAttribution, } from "../../pipeline/compute-attribution.js";
|
|
43
|
+
// ---------------------------------------------------------------------------
|
|
44
|
+
// Step implementation
|
|
45
|
+
// ---------------------------------------------------------------------------
|
|
46
|
+
/**
 * Pipeline step that computes a per-judgment attribution for the latest run.
 *
 * Inputs (both under `<rootDir>/results/latest/`):
 *   - `grader-judgments.json` — array of grader judgments
 *   - `score-summary.json`    — run summary (documentManifest, agentBehavior,
 *                               graderReliability)
 *
 * Outputs:
 *   - one `perEntryAttribution` artifact per judgment
 *   - one `attributionMeta` artifact per run
 *   - when hallucinated citations were counted, `score-summary.json` is
 *     rewritten (temp file + rename) with an updated
 *     `graderReliability.hallucinationCount`
 *
 * The step is optional: it reports `skipped` when either input file is
 * missing and converts any thrown error into a `failed` result.
 */
export class ComputeAttributionStep {
    /** Step identifier; also used as the `source` of issues raised by `check`. */
    name = "compute-attribution";
    /** The step skips itself when its input files are absent. */
    optional = true;
    /**
     * Pre-flight check. Never throws; returns a list of warning issues.
     *
     * Warns when either input file is missing (execute() will skip) and when
     * `score-summary.json` has no / an empty `documentManifest`.
     *
     * @param ctx step context; only `ctx.config.rootDir` is read here
     * @returns array of `{ message, severity, source }` issues (possibly empty)
     */
    check(ctx) {
        const issues = [];
        const judgmentsPath = resolve(ctx.config.rootDir, "results", "latest", "grader-judgments.json");
        const summaryPath = resolve(ctx.config.rootDir, "results", "latest", "score-summary.json");
        if (!existsSync(judgmentsPath)) {
            issues.push({
                message: "No grader-judgments.json — attribution computation will skip",
                severity: "warning",
                source: "compute-attribution",
            });
            return issues;
        }
        // execute() also skips when score-summary.json is missing, so surface
        // that here too instead of staying silent.
        if (!existsSync(summaryPath)) {
            issues.push({
                message: "No score-summary.json — attribution computation will skip",
                severity: "warning",
                source: "compute-attribution",
            });
            return issues;
        }
        // WARN 5: documentManifest must be present and non-empty (gap-analysis
        // enriches it). Without it, the canonical signal is permanently 0 and
        // the three-signal ensemble silently degrades to citation-only.
        try {
            const summary = JSON.parse(readFileSync(summaryPath, "utf-8"));
            const dm = summary.documentManifest;
            if (!dm || dm.length === 0) {
                issues.push({
                    message: "documentManifest is empty — attribution canonical signal will be permanently 0. Ensure gap-analysis runs before compute-attribution.",
                    severity: "warning",
                    source: "compute-attribution",
                });
            }
        }
        catch {
            // Unreadable / unparsable summary: surfaces at execute() with a
            // failed StepResult.
        }
        return issues;
    }
    /**
     * Run the attribution computation.
     *
     * @param ctx step context; reads `config.rootDir`, `taskSource` (optional),
     *   `logger`, `artifactWriter` and `runId`
     * @param _state unused
     * @returns a result object with `status` of `"skipped"` (an input file is
     *   missing), `"success"` or `"failed"` (any error thrown while processing;
     *   the message is placed in `error`)
     */
    async execute(ctx, _state) {
        const start = Date.now();
        const root = ctx.config.rootDir;
        const judgmentsPath = resolve(root, "results", "latest", "grader-judgments.json");
        const summaryPath = resolve(root, "results", "latest", "score-summary.json");
        if (!existsSync(judgmentsPath)) {
            return { status: "skipped", reason: "No grader-judgments.json" };
        }
        if (!existsSync(summaryPath)) {
            return { status: "skipped", reason: "No score-summary.json" };
        }
        try {
            const judgments = JSON.parse(readFileSync(judgmentsPath, "utf-8"));
            const summary = JSON.parse(readFileSync(summaryPath, "utf-8"));
            // Fail with an explicit message rather than a cryptic
            // "judgments is not iterable" further down.
            if (!Array.isArray(judgments)) {
                throw new Error("grader-judgments.json must contain a JSON array of judgments");
            }
            // D-10: judgment.taskId is the promptfoo row description, sometimes
            // suffixed "(gold)" / "(baseline)" by the literacy compiler.
            // Build a triple-keyed task cache:
            //   - task.title (primary for literacy)
            //   - task.description (defensive fallback)
            //   - task.id (defensive fallback)
            // NOTE(review): only tasks with mode === "literacy" are indexed, so
            // the fallbacks never match tasks of other modes — confirm intended.
            const tasksByKey = new Map();
            if (ctx.taskSource) {
                try {
                    const tasks = (await ctx.taskSource.loadTasks()).filter((t) => t.mode === "literacy");
                    for (const t of tasks) {
                        if (t.title)
                            tasksByKey.set(t.title, t);
                        if (t.description)
                            tasksByKey.set(t.description, t);
                        tasksByKey.set(t.id, t);
                    }
                }
                catch (err) {
                    // Surface the failure so operators see why the canonical signal
                    // is permanently 0 for this run. We degrade to an empty
                    // tasksByKey rather than failing the step.
                    ctx.logger.warn(`[compute-attribution] taskSource.loadTasks() failed; canonical signal will be 0 for all judgments. ${err instanceof Error ? err.message : String(err)}`);
                }
            }
            // D-12: FeatureAgentBehavior.feature is the join key (NOT .area).
            // D-11: for literacy mode, ab.feature === task.area.
            const visitedByFeature = new Map();
            for (const ab of summary.agentBehavior ?? []) {
                visitedByFeature.set(ab.feature, new Set(ab.docSlugsVisited ?? []));
            }
            // slug → DocumentRef map from documentManifest for candidate resolution.
            const manifestBySlug = new Map();
            for (const ref of summary.documentManifest ?? []) {
                if (ref.slug)
                    manifestBySlug.set(ref.slug, ref);
            }
            // Shared counter object handed to computeJudgmentAttribution for
            // every judgment; read back after the loop.
            const reliability = { hallucinationCount: 0 };
            for (const j of judgments) {
                // D-10: strip variant suffix before lookup; fall back to the raw id.
                const baseDesc = stripVariantSuffix(j.taskId);
                const task = tasksByKey.get(baseDesc) ?? tasksByKey.get(j.taskId);
                // Resolve context docs → DocumentRef[] via manifest lookup;
                // slugs absent from the manifest are dropped.
                const contextSlugs = extractContextSlugs(task);
                const contextDocs = contextSlugs
                    .map((s) => manifestBySlug.get(s))
                    .filter((r) => r !== undefined);
                const contextDocIds = new Set(contextDocs.map((r) => r.documentId));
                // D-11: retrieved signal keys by task.area (= ab.feature in literacy mode).
                // When agentBehavior is absent (baseline run), drop the retrieved signal
                // (pass undefined — Pitfall #4 / locked D-04).
                const area = task?.area;
                const retrievedSlugs = summary.agentBehavior === undefined || summary.agentBehavior === null
                    ? undefined
                    : area === undefined
                        ? new Set()
                        : (visitedByFeature.get(area) ?? new Set());
                // Candidate set: contextDocs ∪ manifest docs that the judgment
                // cites or that were retrieved for this area.
                const citedDocIds = new Set(j.docCitations.map((c) => c.documentId));
                const candidates = [
                    ...contextDocs,
                    ...Array.from(manifestBySlug.values()).filter((r) => !contextDocIds.has(r.documentId) &&
                        (citedDocIds.has(r.documentId) ||
                            (r.slug !== undefined &&
                                (retrievedSlugs?.has(r.slug) ?? false)))),
                ];
                const judgmentAttribution = computeJudgmentAttribution(j, candidates, contextDocIds, retrievedSlugs, reliability, V0_WEIGHTS);
                // D-06: pass { run, name } where name is the entry key.
                // formatKeyFromAxes requires assoc.name for per-entry descriptors —
                // forgetting it causes a hard throw at emit time.
                const entryKey = `${j.taskId}--${j.modelId}--${j.dimension}`;
                await ctx.artifactWriter.emit("perEntryAttribution", { run: ctx.runId, name: entryKey }, judgmentAttribution);
            }
            await ctx.artifactWriter.emit("attributionMeta", { run: ctx.runId }, {
                ensembleVersion,
                embeddingModel,
                weights: V0_WEIGHTS,
                calibrationSetVersion,
            });
            // Persist reliability.hallucinationCount onto score-summary.json via
            // temp file + rename (D-05: this is the only direct-mutation path;
            // all other writes go through ctx.artifactWriter.emit).
            if (reliability.hallucinationCount > 0) {
                // graderReliability.graderModel is a required field — spread to
                // preserve all existing fields before updating hallucinationCount.
                // NOTE(review): the new count is ADDED to any existing count, so
                // re-running this step against the same score-summary.json
                // increases the stored value again — confirm that is intended.
                const existingReliability = summary.graderReliability;
                const updatedReliability = existingReliability
                    ? {
                        ...existingReliability,
                        hallucinationCount: (existingReliability.hallucinationCount ?? 0) +
                            reliability.hallucinationCount,
                    }
                    : {
                        // No existing graderReliability: the required graderModel
                        // is not available here, so persist a minimal shape.
                        graderModel: "unknown",
                        hallucinationCount: reliability.hallucinationCount,
                    };
                const updated = {
                    ...summary,
                    graderReliability: updatedReliability,
                };
                const tmp = `${summaryPath}.tmp`;
                try {
                    writeFileSync(tmp, JSON.stringify(updated, null, 2));
                    renameSync(tmp, summaryPath);
                }
                catch (err) {
                    // Best-effort cleanup of the .tmp file so a stale leftover does
                    // not confuse subsequent runs. Re-throw so the outer catch
                    // reports a failed StepResult.
                    try {
                        unlinkSync(tmp);
                    }
                    catch {
                        /* best-effort cleanup */
                    }
                    throw err;
                }
            }
            return {
                durationMs: Date.now() - start,
                status: "success",
                summary: `Attribution computed for ${judgments.length} judgments (${reliability.hallucinationCount} hallucinated citations)`,
            };
        }
        catch (err) {
            return {
                durationMs: Date.now() - start,
                error: err instanceof Error ? err.message : String(err),
                status: "failed",
            };
        }
    }
}
|
|
238
|
+
// ---------------------------------------------------------------------------
|
|
239
|
+
// Internal helpers
|
|
240
|
+
// ---------------------------------------------------------------------------
|
|
241
|
+
/**
 * Remove a trailing "(gold)" or "(baseline)" variant marker — together with
 * any whitespace around it — from a row description. Strings without such a
 * trailing marker are returned unchanged.
 *
 * Mirrors the canonical Phase-3 strip site at
 * packages/eval/src/pipeline/calculate-scores.ts:696 (D-10).
 *
 * @param taskId row description, e.g. "Some title (gold)"
 * @returns the description without the variant suffix
 */
function stripVariantSuffix(taskId) {
    // Anchored at end-of-string: a marker in the middle of the text is kept.
    const trailingVariant = /\s*\((gold|baseline)\)\s*$/;
    const withoutVariant = taskId.replace(trailingVariant, "");
    return withoutVariant;
}
|
|
249
|
+
/**
 * True for any ref whose shape carries a string-valued `slug` field.
 *
 * Returns false for `null`, `undefined` and primitives: the `in` operator
 * throws a TypeError when its right-hand operand is not an object, so the
 * operand is checked first.
 *
 * @param ref candidate document reference (any value)
 * @returns whether `ref` is an object with a string `slug` property
 */
function hasSlugField(ref) {
    const isObjectLike = ref !== null && (typeof ref === "object" || typeof ref === "function");
    return isObjectLike && "slug" in ref && typeof ref.slug === "string";
}
|
|
253
|
+
/**
 * Collect slug strings from a task's `context.docs` array (D-09).
 *
 * A ref contributes its `slug` when `isSlugRef` accepts it (SlugDocRef) or
 * when it otherwise carries a string `slug` field (the informational slug
 * annotation on IdDocRef). Refs with neither — PathDocRef, the `path`
 * annotation on IdDocRef, PerspectiveDocRef — are intentionally skipped:
 * paths are filesystem-style and never match manifestBySlug, which is keyed
 * by article slug, so they contribute zero canonical signal rather than
 * polluting the slug lookup with stale keys.
 *
 * @param task task object, possibly undefined
 * @returns slugs in the order the refs appear; empty when the task has no
 *   `context.docs`
 */
function extractContextSlugs(task) {
    const docs = task?.context?.docs;
    if (!docs)
        return [];
    const slugs = [];
    for (const ref of docs) {
        // isSlugRef is tried first; hasSlugField is only the fallback.
        if (isSlugRef(ref) || hasSlugField(ref)) {
            slugs.push(ref.slug);
        }
    }
    return slugs;
}
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
*
|
|
15
15
|
* This is an optional step — failure doesn't stop the pipeline.
|
|
16
16
|
*/
|
|
17
|
-
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
17
|
+
import { existsSync, mkdirSync, readFileSync, renameSync, writeFileSync, } from "fs";
|
|
18
18
|
import { join, resolve } from "path";
|
|
19
19
|
import { assoc, isSlugRef } from "../../_vendor/ailf-core/index.js";
|
|
20
20
|
import { emitFileContents } from "../../artifact-capture/emit-file.js";
|
|
@@ -151,19 +151,44 @@ export class GapAnalysisStep {
|
|
|
151
151
|
...s,
|
|
152
152
|
documents: areaToDocRefs.get(s.feature),
|
|
153
153
|
}));
|
|
154
|
+
// Pitfall #11 hallucination cross-check (GRAD-05). The required
|
|
155
|
+
// `hallucinationCheckedAgainst` and per-citation `hallucinated`
|
|
156
|
+
// fields are populated here because `extractGraderJudgments`
|
|
157
|
+
// (the upstream emitter) does not have access to either the
|
|
158
|
+
// task contextDocs map or the run's document manifest. The
|
|
159
|
+
// populator mutates `judgments` in place and the rewrite below
|
|
160
|
+
// persists the enrichment back to disk so downstream consumers
|
|
161
|
+
// (Studio, gap-analysis followups) see populated fields.
|
|
162
|
+
const taskDocSlugs = new Map();
|
|
163
|
+
for (const [desc, refs] of descToDocRefs) {
|
|
164
|
+
taskDocSlugs.set(desc, refs
|
|
165
|
+
.map((r) => r.slug)
|
|
166
|
+
.filter((s) => typeof s === "string" && s.length > 0));
|
|
167
|
+
}
|
|
168
|
+
const manifestSlugs = (documentManifest ?? [])
|
|
169
|
+
.map((d) => d.slug)
|
|
170
|
+
.filter((s) => typeof s === "string" && s.length > 0);
|
|
171
|
+
const { populateHallucinationFields } = await import("../../pipeline/calculate-scores.js");
|
|
172
|
+
populateHallucinationFields(judgments, taskDocSlugs, manifestSlugs);
|
|
173
|
+
// Atomic write — POSIX rename is atomic on the same filesystem,
|
|
174
|
+
// so a mid-write failure leaves either the prior file or the new
|
|
175
|
+
// file intact, never a half-written JSON document.
|
|
176
|
+
const tmpPath = `${judgmentsPath}.tmp`;
|
|
177
|
+
writeFileSync(tmpPath, JSON.stringify(judgments, null, 2));
|
|
178
|
+
renameSync(tmpPath, judgmentsPath);
|
|
154
179
|
}
|
|
155
180
|
// ── Per-test results (D0029: model output + metadata) ──────
|
|
156
181
|
const testResultsPath = resolve(root, "results", "latest", "test-results.json");
|
|
157
182
|
let testResults;
|
|
158
183
|
if (existsSync(testResultsPath)) {
|
|
159
184
|
const rawTestResults = JSON.parse(readFileSync(testResultsPath, "utf-8"));
|
|
160
|
-
// Enrich with
|
|
185
|
+
// Enrich with context docs (literacy mode only)
|
|
161
186
|
testResults = rawTestResults.map((tr) => {
|
|
162
187
|
if (!isLiteracyMode)
|
|
163
188
|
return tr;
|
|
164
189
|
const baseDesc = tr.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
|
|
165
|
-
const
|
|
166
|
-
return
|
|
190
|
+
const contextDocs = descToDocRefs.get(baseDesc);
|
|
191
|
+
return contextDocs ? { ...tr, contextDocs } : tr;
|
|
167
192
|
});
|
|
168
193
|
}
|
|
169
194
|
// ── Low-scoring judgments ────────────────────────────────────
|
|
@@ -182,8 +207,8 @@ export class GapAnalysisStep {
|
|
|
182
207
|
return j;
|
|
183
208
|
// Judgment taskId is the description with "(gold)" or "(baseline)" suffix
|
|
184
209
|
const baseDesc = j.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
|
|
185
|
-
const
|
|
186
|
-
return
|
|
210
|
+
const contextDocs = descToDocRefs.get(baseDesc);
|
|
211
|
+
return contextDocs ? { ...j, contextDocs } : j;
|
|
187
212
|
});
|
|
188
213
|
const enrichedSummary = {
|
|
189
214
|
...scoreSummary,
|
|
@@ -194,7 +219,10 @@ export class GapAnalysisStep {
|
|
|
194
219
|
scores: enrichedScores,
|
|
195
220
|
...(testResults !== undefined && { testResults }),
|
|
196
221
|
};
|
|
197
|
-
|
|
222
|
+
// Atomic write — see judgmentsPath above for rationale.
|
|
223
|
+
const scoreSummaryTmpPath = `${scoreSummaryPath}.tmp`;
|
|
224
|
+
writeFileSync(scoreSummaryTmpPath, JSON.stringify(enrichedSummary, null, 2));
|
|
225
|
+
renameSync(scoreSummaryTmpPath, scoreSummaryPath);
|
|
198
226
|
// W0051 Slice 2 — failureModes is per-entry keyed by {mode, category};
|
|
199
227
|
// one entry per classified FailureModeType. Zero-count categories are
|
|
200
228
|
// skipped to keep the manifest honest about what the run surfaced.
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
* AppContext instead of positional parameters.
|
|
6
6
|
*/
|
|
7
7
|
export { CalculateScoresStep } from "./calculate-scores-step.js";
|
|
8
|
+
export { ComputeAttributionStep } from "./compute-attribution-step.js";
|
|
8
9
|
export { CompareStep } from "./compare-step.js";
|
|
9
10
|
export { FetchDocsStep } from "./fetch-docs-step.js";
|
|
10
11
|
export { GapAnalysisStep } from "./gap-analysis-step.js";
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
* AppContext instead of positional parameters.
|
|
6
6
|
*/
|
|
7
7
|
export { CalculateScoresStep } from "./calculate-scores-step.js";
|
|
8
|
+
export { ComputeAttributionStep } from "./compute-attribution-step.js";
|
|
8
9
|
export { CompareStep } from "./compare-step.js";
|
|
9
10
|
export { FetchDocsStep } from "./fetch-docs-step.js";
|
|
10
11
|
export { GapAnalysisStep } from "./gap-analysis-step.js";
|
|
@@ -17,6 +17,21 @@
|
|
|
17
17
|
*/
|
|
18
18
|
import type { AttributionReport, ComparisonReport } from "./types.js";
|
|
19
19
|
import type { ResolvedMappings } from "./resolve-mappings.js";
|
|
20
|
+
/** v0 sentinel — no embedding call in v0; v1.2's flip to a real
|
|
21
|
+
* model name (e.g., "text-embedding-3-small") mechanically forces
|
|
22
|
+
* ensembleVersion's right segment to change, invalidating cached
|
|
23
|
+
* weights downstream. */
|
|
24
|
+
export declare const embeddingModel = "none";
|
|
25
|
+
/** VER-01 D-02 — co-located ensemble version. Compound semver-ish
|
|
26
|
+
* shape: "{algorithmVersion}+{embeddingModel}" (per D-02 of Phase
|
|
27
|
+
* 4 CONTEXT.md). Phase 1 landed the constant; Phase 4 wires the
|
|
28
|
+
* compound shape and the bump-by-calibration discipline
|
|
29
|
+
* (calibrate-attribution.ts is the ONLY allowed bumper). */
|
|
30
|
+
export declare const ensembleVersion: "0.1.0+none";
|
|
31
|
+
/** Version tag for the calibration set fixture co-located in the repo.
|
|
32
|
+
* Bump when the fixture structure changes (e.g. when v1 moves the
|
|
33
|
+
* calibration set to Content Lake or expands to ~30 stratified rows). */
|
|
34
|
+
export declare const calibrationSetVersion = "v0-fixture";
|
|
20
35
|
/**
|
|
21
36
|
* Attribute score changes to individual documents.
|
|
22
37
|
*
|
|
@@ -15,6 +15,21 @@
|
|
|
15
15
|
* @see docs/design-docs/scenario-matrix/per-document-attribution.md
|
|
16
16
|
* @see docs/archive/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
|
|
17
17
|
*/
|
|
18
|
+
/** v0 sentinel — no embedding call in v0; v1.2's flip to a real
|
|
19
|
+
* model name (e.g., "text-embedding-3-small") mechanically forces
|
|
20
|
+
* ensembleVersion's right segment to change, invalidating cached
|
|
21
|
+
* weights downstream. */
|
|
22
|
+
export const embeddingModel = "none";
|
|
23
|
+
/** VER-01 D-02 — co-located ensemble version. Compound semver-ish
|
|
24
|
+
* shape: "{algorithmVersion}+{embeddingModel}" (per D-02 of Phase
|
|
25
|
+
* 4 CONTEXT.md). Phase 1 landed the constant; Phase 4 wires the
|
|
26
|
+
* compound shape and the bump-by-calibration discipline
|
|
27
|
+
* (calibrate-attribution.ts is the ONLY allowed bumper). */
|
|
28
|
+
export const ensembleVersion = `0.1.0+${embeddingModel}`;
|
|
29
|
+
/** Version tag for the calibration set fixture co-located in the repo.
|
|
30
|
+
* Bump when the fixture structure changes (e.g. when v1 moves the
|
|
31
|
+
* calibration set to Content Lake or expands to ~30 stratified rows). */
|
|
32
|
+
export const calibrationSetVersion = "v0-fixture";
|
|
18
33
|
// ---------------------------------------------------------------------------
|
|
19
34
|
// Public API
|
|
20
35
|
// ---------------------------------------------------------------------------
|
|
@@ -59,11 +74,9 @@ export function attributeChanges(comparison, changedSlugs, mappings, noiseThresh
|
|
|
59
74
|
// (shouldn't happen in practice, but handle gracefully)
|
|
60
75
|
continue;
|
|
61
76
|
}
|
|
62
|
-
//
|
|
63
|
-
//
|
|
64
|
-
|
|
65
|
-
const matchingSlugs = changedSlugs.filter((s) => areaCanonicalSlugs.has(s));
|
|
66
|
-
// Classify each task
|
|
77
|
+
// Classify each task. When no task in this area has a matching
|
|
78
|
+
// changed doc, every per-task classification will fall through to
|
|
79
|
+
// `uncorrelated` here — no separate area-level record is needed.
|
|
67
80
|
for (const [taskId, taskInfo] of areaTasks) {
|
|
68
81
|
const taskMatchingSlugs = taskInfo.slugs.filter((s) => changedSet.has(s));
|
|
69
82
|
const classification = classifyAttribution(taskMatchingSlugs.length);
|
|
@@ -76,10 +89,6 @@ export function attributeChanges(comparison, changedSlugs, mappings, noiseThresh
|
|
|
76
89
|
withinNoiseFloor: Math.abs(areaDelta.delta) <= noiseThreshold,
|
|
77
90
|
});
|
|
78
91
|
}
|
|
79
|
-
// If no task-level matches but area has a delta, record area-level
|
|
80
|
-
if (areaTasks.length > 0 && matchingSlugs.length === 0) {
|
|
81
|
-
// All tasks in this area are uncorrelated — already handled above
|
|
82
|
-
}
|
|
83
92
|
}
|
|
84
93
|
// Find untracked documents: changed slugs not in ANY task's canonical docs
|
|
85
94
|
const allTrackedSlugs = new Set([...taskCanonicalDocs.values()].flatMap((info) => info.slugs));
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/borderline-consensus-runner.ts
|
|
3
|
+
*
|
|
4
|
+
* GRAD-04 borderline-only intra-grader consensus runner. Thin sibling
|
|
5
|
+
* of runGraderConsistency — re-grades ONLY judgments where
|
|
6
|
+
* `isBorderline(score, thresholds)` returns true. Non-borderline
|
|
7
|
+
* judgments pass through unchanged.
|
|
8
|
+
*
|
|
9
|
+
* Per D0005 (grader-model separation), replicates the SAME pinned
|
|
10
|
+
* grader N times (default 3, configurable via
|
|
11
|
+
* RepoConfig.execution.borderlineReplications); NOT the inter-grader
|
|
12
|
+
* ensemble path. Doc 03 §"Multi-grader consensus" + the GRAD-04 "Doc 03
|
|
13
|
+
* source bug" callout pinned this to intra-grader replication only.
|
|
14
|
+
*
|
|
15
|
+
* The re-grade hook is supplied by the caller as a `regrade` callback.
|
|
16
|
+
* The composition root wires it to `gradeOnce` from grader-api.js with
|
|
17
|
+
* the response/rubric text drawn from the original Promptfoo result.
|
|
18
|
+
* The runner imports `gradeOnce` only to re-export it; `regrade` is a
|
|
19
|
+
* required option, so the runner never falls back to a default regrader.
|
|
20
|
+
*
|
|
21
|
+
* @see docs/decisions/D0005-grader-model-separation.md
|
|
22
|
+
* @see ./borderline-detector.ts — pure predicate
|
|
23
|
+
* @see ./grader-consistency.ts — JudgmentConsistency shape we emit
|
|
24
|
+
*/
|
|
25
|
+
import type { GraderJudgment, Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
26
|
+
import { gradeOnce } from "./grader-api.js";
|
|
27
|
+
import { type JudgmentConsistency } from "./grader-consistency.js";
|
|
28
|
+
/**
|
|
29
|
+
* Re-export `gradeOnce` so callers that need to wire the default regrader
|
|
30
|
+
* (composition-root, integration tests) can import the grader entry point
|
|
31
|
+
* from this module rather than rediscovering grader-api.js. The runner
|
|
32
|
+
* itself does not invoke `gradeOnce` — the caller-supplied `regrade`
|
|
33
|
+
* callback owns the live grader call (Pitfall 6 — runner stays pure wrt
|
|
34
|
+
* provider config).
|
|
35
|
+
*/
|
|
36
|
+
export { gradeOnce };
|
|
37
|
+
/**
 * Inputs accepted by `runBorderlineConsensus`.
 */
export interface BorderlineConsensusOptions {
    /** Judgments to inspect; only the borderline ones are re-graded. */
    judgments: Array<GraderJudgment>;
    /** Optional logger; receives a warning for each replication that fails. */
    logger?: Logger;
    /** Re-grades one judgment a single time and resolves with the new score. */
    regrade: (judgment: GraderJudgment) => Promise<number>;
    /** Re-grades per borderline judgment. Default 3 — see RepoConfig.execution.borderlineReplications. */
    replications: number;
    /** Severity boundaries from config/thresholds.ts (default [30, 50, 60]). */
    thresholds: ReadonlyArray<number>;
}
|
|
47
|
+
/**
 * Output of `runBorderlineConsensus`.
 */
export interface BorderlineConsensusResult {
    /**
     * One consistency record per re-graded judgment, keyed by
     * `${taskId}::${dimension}::${modelId}`. Empty when nothing was borderline.
     */
    consistencyByJudgment: Map<string, JudgmentConsistency>;
    /** Judgments in input order; borderline entries carry the consensus score. */
    judgments: Array<GraderJudgment>;
}
|
|
51
|
+
/**
 * Run intra-grader consensus on the borderline subset of `judgments`.
 *
 * - Borderline (per `isBorderline(score, thresholds)`): re-grade
 *   `replications` times via `regrade`; emit a `JudgmentConsistency`
 *   keyed by `${taskId}::${dimension}::${modelId}`; replace the
 *   judgment's `score` with the median of the original score plus every
 *   replica score that resolved. Replicas that reject are reported via
 *   `logger.warn` and left out of the median.
 * - Non-borderline: pass through unchanged. Output array length == input.
 * - No borderline judgments at all: the input array itself is returned
 *   together with an empty consistency map.
 *
 * The function is order-preserving — the returned `judgments` array
 * keeps the same element order as the input.
 *
 * @param options - Judgments, re-grade callback, replication count,
 *   thresholds and optional logger; see `BorderlineConsensusOptions`.
 * @returns The (possibly re-scored) judgments plus the per-judgment
 *   consistency records for the re-graded ones.
 */
|
|
63
|
+
export declare function runBorderlineConsensus(options: BorderlineConsensusOptions): Promise<BorderlineConsensusResult>;
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/borderline-consensus-runner.ts
|
|
3
|
+
*
|
|
4
|
+
* GRAD-04 borderline-only intra-grader consensus runner. Thin sibling
|
|
5
|
+
* of runGraderConsistency — re-grades ONLY judgments where
|
|
6
|
+
* `isBorderline(score, thresholds)` returns true. Non-borderline
|
|
7
|
+
* judgments pass through unchanged.
|
|
8
|
+
*
|
|
9
|
+
* Per D0005 (grader-model separation), replicates the SAME pinned
|
|
10
|
+
* grader N times (default 3, configurable via
|
|
11
|
+
* RepoConfig.execution.borderlineReplications); NOT the inter-grader
|
|
12
|
+
* ensemble path. Doc 03 §"Multi-grader consensus" + the GRAD-04 "Doc 03
|
|
13
|
+
* source bug" callout pinned this to intra-grader replication only.
|
|
14
|
+
*
|
|
15
|
+
* The re-grade hook is supplied by the caller as a `regrade` callback.
|
|
16
|
+
* The composition root wires it to `gradeOnce` from grader-api.js with
|
|
17
|
+
* the response/rubric text drawn from the original Promptfoo result.
|
|
18
|
+
* The runner imports `gradeOnce` only to re-export it; `regrade` is a
|
|
19
|
+
* required option, so the runner never falls back to a default regrader.
|
|
20
|
+
*
|
|
21
|
+
* @see docs/decisions/D0005-grader-model-separation.md
|
|
22
|
+
* @see ./borderline-detector.ts — pure predicate
|
|
23
|
+
* @see ./grader-consistency.ts — JudgmentConsistency shape we emit
|
|
24
|
+
*/
|
|
25
|
+
import { isBorderline } from "./borderline-detector.js";
|
|
26
|
+
// Imported solely so it can be re-exported from this module.
|
|
27
|
+
// The runner never invokes gradeOnce itself; `regrade` owns every grader call.
|
|
28
|
+
// Keeping the import on the public surface preserves the architectural
|
|
29
|
+
// rule that the runner's grader entry point lives in grader-api.js
|
|
30
|
+
// (Pitfall 6 — the inter-grader ensemble module is intentionally NOT
|
|
31
|
+
// reached for on this path).
|
|
32
|
+
import { gradeOnce } from "./grader-api.js";
|
|
33
|
+
import { analyzeJudgment, } from "./grader-consistency.js";
|
|
34
|
+
/**
|
|
35
|
+
* Re-export `gradeOnce` so callers that need to wire the default regrader
|
|
36
|
+
* (composition-root, integration tests) can import the grader entry point
|
|
37
|
+
* from this module rather than rediscovering grader-api.js. The runner
|
|
38
|
+
* itself does not invoke `gradeOnce` — the caller-supplied `regrade`
|
|
39
|
+
* callback owns the live grader call (Pitfall 6 — runner stays pure wrt
|
|
40
|
+
* provider config).
|
|
41
|
+
*/
|
|
42
|
+
export { gradeOnce };
|
|
43
|
+
/**
 * Build the string under which a judgment's consistency record is stored:
 * the judgment's taskId, dimension and modelId joined by "::".
 */
function consistencyKey(j) {
    const parts = [j.taskId, j.dimension, j.modelId];
    // String() mirrors template-literal coercion, so a missing modelId
    // still renders as the text "undefined".
    return parts.map((part) => String(part)).join("::");
}
|
|
47
|
+
/**
 * Run intra-grader consensus on the borderline subset of `judgments`.
 *
 * - Borderline (per `isBorderline(score, thresholds)`): re-grade
 *   `replications` times via `regrade`; emit a `JudgmentConsistency`
 *   keyed by `${taskId}::${dimension}::${modelId}`; replace the
 *   judgment's `score` with the median of the original score plus every
 *   usable replica score.
 * - Non-borderline: pass through unchanged. Output array length == input.
 * - No borderline judgments at all: the input array itself is returned
 *   together with an empty consistency map.
 *
 * A replica is dropped (with a `logger.warn`) when `regrade` rejects,
 * throws synchronously, or resolves with something that is not a finite
 * number; the remaining replicas still contribute to the median.
 *
 * The function is order-preserving — the returned `judgments` array
 * keeps the same element order as the input.
 *
 * @param options - See `BorderlineConsensusOptions`.
 * @returns The (possibly re-scored) judgments and the consistency
 *   records of the re-graded ones.
 */
export async function runBorderlineConsensus(options) {
    const { judgments, logger, regrade, replications, thresholds } = options;
    const consistencyByJudgment = new Map();
    // Collect the keys of every borderline judgment first so the common
    // "nothing to re-grade" case can return without touching the grader.
    const borderlineKeys = new Set();
    for (const j of judgments) {
        if (isBorderline(j.score, thresholds)) {
            borderlineKeys.add(consistencyKey(j));
        }
    }
    if (borderlineKeys.size === 0) {
        return { consistencyByJudgment, judgments };
    }
    const out = [];
    for (const j of judgments) {
        const key = consistencyKey(j);
        if (!borderlineKeys.has(key)) {
            out.push(j); // non-borderline — single replica
            continue;
        }
        // Re-grade `replications` times via the same pinned grader. The
        // replicas are network-bound, so they are started together and
        // collected with `Promise.allSettled`. Each call is wrapped in an
        // async arrow so that a synchronous throw from `regrade` becomes a
        // rejected replica instead of aborting the whole run.
        const scores = [j.score];
        const settled = await Promise.allSettled(Array.from({ length: replications }, async () => regrade(j)));
        settled.forEach((outcome, i) => {
            if (outcome.status === "rejected") {
                const err = outcome.reason;
                logger?.warn(`Borderline replication ${i + 1}/${replications} failed for ${key}: ` +
                    (err instanceof Error ? err.message : String(err)));
                return;
            }
            const value = outcome.value;
            if (typeof value !== "number" || !Number.isFinite(value)) {
                // A NaN/Infinity/non-numeric replica would corrupt the numeric
                // sort inside `median`, so treat it like a failed replica.
                logger?.warn(`Borderline replication ${i + 1}/${replications} returned a non-finite score for ${key}: ` +
                    String(value));
                return;
            }
            scores.push(value);
        });
        // Shape expected by analyzeJudgment; `area` is left empty here and
        // `providerId` is only set when the judgment carries a modelId.
        const grading = {
            area: "",
            dimension: j.dimension,
            ...(j.modelId ? { providerId: j.modelId } : {}),
            scores,
            taskId: j.taskId,
        };
        const consistency = analyzeJudgment(grading);
        consistencyByJudgment.set(key, consistency);
        // Merge consensus (median across replicas) into the canonical judgment.
        out.push({ ...j, score: median(scores) });
    }
    return { consistencyByJudgment, judgments: out };
}
|
|
111
|
+
/**
 * Median of a list of numbers; an empty list yields 0.
 *
 * The caller's array is not reordered — sorting happens on a copy. For an
 * even count the two central values are averaged. Median rather than mean
 * is used so one outlier replica cannot pull the consensus score across
 * a severity threshold.
 */
function median(values) {
    const count = values.length;
    if (count === 0) {
        return 0;
    }
    const ascending = Array.from(values).sort((lhs, rhs) => lhs - rhs);
    const upper = Math.floor(count / 2);
    if (count % 2 !== 0) {
        return ascending[upper];
    }
    return (ascending[upper - 1] + ascending[upper]) / 2;
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/borderline-detector.ts
|
|
3
|
+
*
|
|
4
|
+
* GRAD-04 borderline-band predicate. Pure computation; no I/O.
|
|
5
|
+
*
|
|
6
|
+
* A judgment is "borderline" when its score lies within ±5 of any of
|
|
7
|
+
* the three rubric thresholds (severity boundaries 30 / 50 / 60 from
|
|
8
|
+
* packages/eval/config/thresholds.ts:50/54/58 — critical / warning /
|
|
9
|
+
* info edges).
|
|
10
|
+
*
|
|
11
|
+
* Per D0005 (grader-model separation), borderline judgments trigger
|
|
12
|
+
* intra-grader consensus replication of the SAME pinned grader rather
|
|
13
|
+
* than inter-grader ensemble — preserving D0005's reproducibility
|
|
14
|
+
* posture.
|
|
15
|
+
*
|
|
16
|
+
* @see docs/decisions/D0005-grader-model-separation.md
|
|
17
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
18
|
+
*/
|
|
19
|
+
export declare const BORDERLINE_BAND = 5;
|
|
20
|
+
/**
 * Returns true when `score` lies within ±BORDERLINE_BAND of any
 * configured threshold. Pure function — safe to call N×.
 *
 * @param score - Judgment score to test.
 * @param thresholds - Boundaries the score is compared against.
 * @returns Whether the score falls inside the band around at least one
 *   threshold.
 *
 * NOTE(review): whether the band edges themselves count as borderline is
 * not visible from this declaration — confirm against the implementation.
 */
|
|
24
|
+
export declare function isBorderline(score: number, thresholds: readonly number[]): boolean;
|