@sanity/ailf 4.5.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +197 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +419 -5
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/ports/context.d.ts +26 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/llm-client.d.ts +112 -0
- package/dist/_vendor/ailf-core/ports/llm-client.js +68 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +25 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +19 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +19 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/services/index.js +5 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +68 -0
- package/dist/_vendor/ailf-core/types/confidence.js +56 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +169 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +17 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +82 -29
- package/dist/_vendor/ailf-core/types/index.js +16 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/llm/anthropic-llm-client.d.ts +48 -0
- package/dist/adapters/llm/anthropic-llm-client.js +205 -0
- package/dist/adapters/llm/fake-llm-client.d.ts +49 -0
- package/dist/adapters/llm/fake-llm-client.js +63 -0
- package/dist/adapters/llm/index.d.ts +9 -0
- package/dist/adapters/llm/index.js +4 -0
- package/dist/adapters/llm/openai-llm-client.d.ts +44 -0
- package/dist/adapters/llm/openai-llm-client.js +168 -0
- package/dist/adapters/llm/pricing.d.ts +12 -0
- package/dist/adapters/llm/pricing.js +8 -0
- package/dist/adapters/llm/retry.d.ts +56 -0
- package/dist/adapters/llm/retry.js +66 -0
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +90 -22
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/artifact-capture/api-gateway-artifact-writer.js +2 -1
- package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +2 -1
- package/dist/artifact-capture/gcs-artifact-writer.js +3 -1
- package/dist/artifact-capture/local-fs-artifact-writer.js +3 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +59 -1
- package/dist/composition-root.js +95 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +14 -0
- package/dist/grader/agent-harness.js +17 -0
- package/dist/grader/common.d.ts +17 -0
- package/dist/grader/common.js +21 -0
- package/dist/grader/index.d.ts +38 -0
- package/dist/grader/index.js +75 -0
- package/dist/grader/knowledge-probe.d.ts +14 -0
- package/dist/grader/knowledge-probe.js +18 -0
- package/dist/grader/literacy.d.ts +13 -0
- package/dist/grader/literacy.js +17 -0
- package/dist/grader/mcp.d.ts +14 -0
- package/dist/grader/mcp.js +18 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +6 -4
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/borderline-consensus-runner.ts
|
|
3
|
+
*
|
|
4
|
+
* GRAD-04 borderline-only intra-grader consensus runner. Thin sibling
|
|
5
|
+
* of runGraderConsistency — re-grades ONLY judgments where
|
|
6
|
+
* `isBorderline(score, thresholds)` returns true. Non-borderline
|
|
7
|
+
* judgments pass through unchanged.
|
|
8
|
+
*
|
|
9
|
+
* Per D0005 (grader-model separation), replicates the SAME pinned
|
|
10
|
+
* grader N times (default 3, configurable via
|
|
11
|
+
* RepoConfig.execution.borderlineReplications); NOT the inter-grader
|
|
12
|
+
* ensemble path. Doc 03 §"Multi-grader consensus" + the GRAD-04 "Doc 03
|
|
13
|
+
* source bug" callout pinned this to intra-grader replication only.
|
|
14
|
+
*
|
|
15
|
+
* The re-grade hook is supplied by the caller as a `regrade` callback.
|
|
16
|
+
* The composition root wires it to `gradeOnce` from grader-api.js with
|
|
17
|
+
* the response/rubric text drawn from the original Promptfoo result.
|
|
18
|
+
* The runner itself imports `gradeOnce` only to re-export it (no default
|
|
19
|
+
* regrader fallback is implemented), so callers and unit tests inject `regrade`.
|
|
20
|
+
*
|
|
21
|
+
* @see docs/decisions/D0005-grader-model-separation.md
|
|
22
|
+
* @see ./borderline-detector.ts — pure predicate
|
|
23
|
+
* @see ./grader-consistency.ts — JudgmentConsistency shape we emit
|
|
24
|
+
*/
|
|
25
|
+
import { isBorderline } from "./borderline-detector.js";
|
|
26
|
+
// Imported only so it can be re-exported below as the grader entry point.
|
|
27
|
+
// The runner never invokes gradeOnce itself; the caller must supply `regrade`.
|
|
28
|
+
// Keeping the import on the public surface preserves the architectural
|
|
29
|
+
// rule that the runner's grader entry point lives in grader-api.js
|
|
30
|
+
// (Pitfall 6 — the inter-grader ensemble module is intentionally NOT
|
|
31
|
+
// reached for on this path).
|
|
32
|
+
import { gradeOnce } from "./grader-api.js";
|
|
33
|
+
import { analyzeJudgment, } from "./grader-consistency.js";
|
|
34
|
+
/**
|
|
35
|
+
* Re-export `gradeOnce` so callers that need to wire the default regrader
|
|
36
|
+
* (composition-root, integration tests) can import the grader entry point
|
|
37
|
+
* from this module rather than rediscovering grader-api.js. The runner
|
|
38
|
+
* itself does not invoke `gradeOnce` — the caller-supplied `regrade`
|
|
39
|
+
* callback owns the live grader call (Pitfall 6 — runner stays pure wrt
|
|
40
|
+
* provider config).
|
|
41
|
+
*/
|
|
42
|
+
export { gradeOnce };
|
|
43
|
+
/**
 * Build the lookup key under which a judgment's consistency record is
 * stored: `taskId`, `dimension` and `modelId` joined by `::`.
 */
function consistencyKey(j) {
    const { taskId, dimension, modelId } = j;
    return `${taskId}::${dimension}::${modelId}`;
}
|
|
47
|
+
/**
 * Run intra-grader consensus on the borderline subset of `judgments`.
 *
 * - Borderline (per `isBorderline(score, thresholds)`): re-grade
 *   `replications` times via `regrade`; emit a `JudgmentConsistency`
 *   keyed by `${taskId}::${dimension}::${modelId}`; merge the consensus
 *   median back into the canonical judgment's `score`.
 * - Non-borderline: pass through unchanged. Output array length == input.
 *
 * Borderline status is decided per judgment (not per consistency key), so
 * a non-borderline judgment that happens to share a key with a borderline
 * one is still passed through untouched.
 *
 * The function is order-preserving — the returned `judgments` array
 * keeps the same element order as the input. When no judgment is
 * borderline the input array itself is returned.
 *
 * @param options - `{ judgments, logger, regrade, replications, thresholds }`.
 *   `regrade(judgment)` must return a promise of a numeric score; it is
 *   called `replications` times per borderline judgment. `logger` is
 *   optional and only used for warnings.
 * @returns `{ consistencyByJudgment, judgments }` — a Map from consistency
 *   key to the `analyzeJudgment` result, plus the (possibly re-scored)
 *   judgments.
 */
export async function runBorderlineConsensus(options) {
    const { judgments, logger, regrade, replications, thresholds } = options;
    const consistencyByJudgment = new Map();
    // Evaluate the predicate once per element; bypass entirely if nothing
    // is borderline.
    const borderlineFlags = judgments.map((j) => isBorderline(j.score, thresholds));
    if (!borderlineFlags.includes(true)) {
        return { consistencyByJudgment, judgments };
    }
    const out = [];
    for (let index = 0; index < judgments.length; index += 1) {
        const j = judgments[index];
        if (!borderlineFlags[index]) {
            out.push(j); // non-borderline — single replica
            continue;
        }
        const key = consistencyKey(j);
        // Re-grade `replications` times via the same pinned grader. The
        // replications are independent async calls, so they are started
        // together; `Promise.allSettled` lets a failed replica be logged
        // and dropped while the surviving replicas still contribute to
        // the consensus median.
        const scores = [j.score];
        const settled = await Promise.allSettled(Array.from({ length: replications }, () => regrade(j)));
        settled.forEach((outcome, i) => {
            if (outcome.status === "fulfilled") {
                const value = outcome.value;
                // A replica that resolves to null/NaN/non-number would
                // corrupt the numeric sort inside `median`; drop it the
                // same way a rejected replica is dropped.
                if (typeof value === "number" && Number.isFinite(value)) {
                    scores.push(value);
                }
                else {
                    logger?.warn(`Borderline replication ${i + 1}/${replications} returned a non-numeric score for ${key}: ` +
                        String(value));
                }
            }
            else {
                const err = outcome.reason;
                logger?.warn(`Borderline replication ${i + 1}/${replications} failed for ${key}: ` +
                    (err instanceof Error ? err.message : String(err)));
            }
        });
        const grading = {
            area: "",
            dimension: j.dimension,
            ...(j.modelId ? { providerId: j.modelId } : {}),
            scores,
            taskId: j.taskId,
        };
        const consistency = analyzeJudgment(grading);
        consistencyByJudgment.set(key, consistency);
        // Merge consensus (median across replicas) into the canonical judgment.
        out.push({ ...j, score: median(scores) });
    }
    return { consistencyByJudgment, judgments: out };
}
|
|
111
|
+
/**
 * Median of a list of numbers; an empty list yields 0. The input is not
 * mutated — sorting happens on a copy. Median (rather than mean) keeps a
 * single outlier replica from dragging the consensus score across a
 * severity threshold.
 */
function median(values) {
    const count = values.length;
    if (count === 0) {
        return 0;
    }
    const ordered = values.slice().sort((x, y) => x - y);
    const upper = Math.floor(count / 2);
    if (count % 2 !== 0) {
        return ordered[upper];
    }
    return (ordered[upper - 1] + ordered[upper]) / 2;
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/borderline-detector.ts
|
|
3
|
+
*
|
|
4
|
+
* GRAD-04 borderline-band predicate. Pure computation; no I/O.
|
|
5
|
+
*
|
|
6
|
+
* A judgment is "borderline" when its score lies within ±5 of any of
|
|
7
|
+
* the three rubric thresholds (severity boundaries 30 / 50 / 60 from
|
|
8
|
+
* packages/eval/config/thresholds.ts:50/54/58 — critical / warning /
|
|
9
|
+
* info edges).
|
|
10
|
+
*
|
|
11
|
+
* Per D0005 (grader-model separation), borderline judgments trigger
|
|
12
|
+
* intra-grader consensus replication of the SAME pinned grader rather
|
|
13
|
+
* than inter-grader ensemble — preserving D0005's reproducibility
|
|
14
|
+
* posture.
|
|
15
|
+
*
|
|
16
|
+
* @see docs/decisions/D0005-grader-model-separation.md
|
|
17
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
18
|
+
*/
|
|
19
|
+
export declare const BORDERLINE_BAND = 5;
|
|
20
|
+
/**
|
|
21
|
+
* Returns true when `score` lies within ±BORDERLINE_BAND of any
|
|
22
|
+
* configured threshold. Pure function — safe to call N×.
|
|
23
|
+
*/
|
|
24
|
+
export declare function isBorderline(score: number, thresholds: readonly number[]): boolean;
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/borderline-detector.ts
|
|
3
|
+
*
|
|
4
|
+
* GRAD-04 borderline-band predicate. Pure computation; no I/O.
|
|
5
|
+
*
|
|
6
|
+
* A judgment is "borderline" when its score lies within ±5 of any of
|
|
7
|
+
* the three rubric thresholds (severity boundaries 30 / 50 / 60 from
|
|
8
|
+
* packages/eval/config/thresholds.ts:50/54/58 — critical / warning /
|
|
9
|
+
* info edges).
|
|
10
|
+
*
|
|
11
|
+
* Per D0005 (grader-model separation), borderline judgments trigger
|
|
12
|
+
* intra-grader consensus replication of the SAME pinned grader rather
|
|
13
|
+
* than inter-grader ensemble — preserving D0005's reproducibility
|
|
14
|
+
* posture.
|
|
15
|
+
*
|
|
16
|
+
* @see docs/decisions/D0005-grader-model-separation.md
|
|
17
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
18
|
+
*/
|
|
19
|
+
/** Half-width of the borderline band around each threshold. */
export const BORDERLINE_BAND = 5;
/**
 * Tell whether `score` is at most BORDERLINE_BAND away (inclusive, in
 * either direction) from at least one entry of `thresholds`. Performs no
 * I/O and keeps no state; an empty `thresholds` list yields false.
 */
export function isBorderline(score, thresholds) {
    for (const threshold of thresholds) {
        const distance = Math.abs(score - threshold);
        if (distance <= BORDERLINE_BAND) {
            return true;
        }
    }
    return false;
}
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type SymbolPreflightReport, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
1
|
+
import { type ActualScoreEntry, type ComponentResult, type GraderReliability, type Logger, type StoredTestResult, type SymbolPreflightReport, type TestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
2
|
+
import type { JudgmentConsistency } from "./grader-consistency.js";
|
|
2
3
|
import { type ResolvedSourceConfig } from "../sources.js";
|
|
3
4
|
import type { FeatureScore, GraderJudgment, PerModelEntry } from "./types.js";
|
|
4
5
|
import { type ScoreTestGroupOptions } from "./compiler/scoring-bridge.js";
|
|
@@ -108,8 +109,84 @@ export declare function calculateScoresPerModel(resultsPath: string, goldProfile
|
|
|
108
109
|
* assertion produces one GraderJudgment entry.
|
|
109
110
|
*
|
|
110
111
|
* Phase 3a prerequisite: structured judgment data for failure mode extraction.
|
|
112
|
+
*
|
|
113
|
+
* @param resultsPath - Path to the Promptfoo results JSON file.
|
|
114
|
+
* @param telemetry - Optional reliability counter (Plan 03-03). When passed,
|
|
115
|
+
* `parseFailures` is incremented on every strict-schema rejection so the
|
|
116
|
+
* live pipeline can surface schema drift over time.
|
|
117
|
+
* `runId` (when supplied) is threaded into synthesized fall-back judgment
|
|
118
|
+
* ids so dedup keys are unique per-run.
|
|
119
|
+
*/
|
|
120
|
+
export declare function extractGraderJudgments(resultsPath: string, telemetry?: {
|
|
121
|
+
reliability: GraderReliability;
|
|
122
|
+
runId?: string;
|
|
123
|
+
}): GraderJudgment[];
|
|
124
|
+
/**
|
|
125
|
+
* Stamp every grader judgment with a D0049 ceiling-cross-check confidence
|
|
126
|
+
* triple and increment `GraderReliability.failureModeCalibration` whenever
|
|
127
|
+
* the grader's emitted `failureMode` disagrees with the
|
|
128
|
+
* ceiling-decomposition mode.
|
|
129
|
+
*
|
|
130
|
+
* Plan 03-03 — the grader's emitted `failureMode` is the source of truth
|
|
131
|
+
* for the mode itself (Plan 03-02 per-dimension taxonomies); this pass
|
|
132
|
+
* stamps confidence based on whether the structural ceiling signal agrees
|
|
133
|
+
* and surfaces calibration drift as a counter on `GraderReliability`.
|
|
134
|
+
*
|
|
135
|
+
* The function mutates `judgments` in place — it overlays
|
|
136
|
+
* `judgment.confidence` with the ceiling-cross-check stamp. If a judgment
|
|
137
|
+
* already carries a confidence from the strict-schema parse (Plan 03-01),
|
|
138
|
+
* the ceiling-cross-check stamp REPLACES it because the validator's
|
|
139
|
+
* derivation tag is the live-pipeline contract; the parsed-shape
|
|
140
|
+
* confidence (if emitted by the grader) is preserved on the original
|
|
141
|
+
* `parsedJudgment` upstream of this site.
|
|
142
|
+
*
|
|
143
|
+
* `hallucinationCheckedAgainst` is NOT populated here — `extractGraderJudgments`
|
|
144
|
+
* does not have access to `task.contextDocs ∪ run.documentManifest` at this
|
|
145
|
+
* site (the union travels through a separate path in
|
|
146
|
+
* `gap-analysis-step.ts`'s document-enrichment flow). Plan 03-04 will
|
|
147
|
+
* couple the doc-union population at the strict-schema flip site so the
|
|
148
|
+
* field is populated alongside the required-flip.
|
|
149
|
+
*
|
|
150
|
+
* @param judgments - Grader judgments produced by `extractGraderJudgments`.
|
|
151
|
+
* @param scores - Per-area feature scores; `ceilingScore` and `floorScore`
|
|
152
|
+
* come from this lookup. Missing areas default to ceiling 100, floor 0
|
|
153
|
+
* (preserves the pre-Plan-03-03 fall-back from `buildFailureModeReport`).
|
|
154
|
+
* @param reliability - `GraderReliability` sink whose
|
|
155
|
+
* `failureModeCalibration` counter is incremented on disagreement.
|
|
156
|
+
*
|
|
157
|
+
* @see docs/decisions/D0005-grader-model-separation.md
|
|
158
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
111
159
|
*/
|
|
112
|
-
export declare function
|
|
160
|
+
export declare function validateGraderJudgmentsCalibration(judgments: GraderJudgment[], scores: FeatureScore[], reliability: GraderReliability): void;
|
|
161
|
+
/**
|
|
162
|
+
* Populate Pitfall #11 hallucination cross-check fields on grader
|
|
163
|
+
* judgments (Plan 03-04 GRAD-05).
|
|
164
|
+
*
|
|
165
|
+
* For each judgment, sets `hallucinationCheckedAgainst` to the union of
|
|
166
|
+
* (a) the slugs of docs the task declared in `context.docs` and (b) the
|
|
167
|
+
* run's full document manifest. For each entry in `judgment.docCitations`
|
|
168
|
+
* that carries a `slug`, sets `hallucinated: !union.has(slug)` — a slug
|
|
169
|
+
* that does not appear in either set is a fabrication, not a real
|
|
170
|
+
* resolvable doc.
|
|
171
|
+
*
|
|
172
|
+
* Mutates `judgments` in place. Slug-less citations are left with
|
|
173
|
+
* `hallucinated` undefined since the lookup key is the slug per the
|
|
174
|
+
* GraderJudgment domain doc-comment ("slug does not resolve against the
|
|
175
|
+
* task's contextDocs set").
|
|
176
|
+
*
|
|
177
|
+
* Called from `gap-analysis-step` (literacy mode only) after
|
|
178
|
+
* `descToDocRefs` and `documentManifest` are built — both inputs are
|
|
179
|
+
* unavailable at extract time. Non-literacy modes skip this step
|
|
180
|
+
* entirely (the runtime contract has no canonical doc set to check
|
|
181
|
+
* against).
|
|
182
|
+
*
|
|
183
|
+
* @param judgments - Grader judgments to enrich. Mutated in place.
|
|
184
|
+
* @param taskDocSlugs - Map from base task description (the form judgment
|
|
185
|
+
* `taskId` carries after stripping the `(gold)` / `(baseline)` suffix)
|
|
186
|
+
* to the slugs declared in that task's `context.docs`.
|
|
187
|
+
* @param manifestSlugs - All slugs in the run's document manifest.
|
|
188
|
+
*/
|
|
189
|
+
export declare function populateHallucinationFields(judgments: GraderJudgment[], taskDocSlugs: Map<string, string[]>, manifestSlugs: Iterable<string>): void;
|
|
113
190
|
/**
|
|
114
191
|
* Extract per-test results with model output from evaluation results.
|
|
115
192
|
*
|
|
@@ -165,9 +242,43 @@ export declare function scoreAgenticResults(resultsPath: string, profile: Record
|
|
|
165
242
|
*/
|
|
166
243
|
export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>, preflightOptions?: ScoreTestGroupOptions): Record<string, Record<string, ActualScoreEntry>>;
|
|
167
244
|
/** Options for the calculate-scores main() function. */
|
|
245
|
+
/**
|
|
246
|
+
* Pre-built runner closure for the GRAD-04 borderline-consensus pass.
|
|
247
|
+
*
|
|
248
|
+
* The composition root produces one of these via
|
|
249
|
+
* `createBorderlineConsensusRunner` so the threshold + replication
|
|
250
|
+
* defaults stay co-located with the rest of the pipeline wiring.
|
|
251
|
+
* `calculateAndWriteScores` invokes it after `extractGraderJudgments`
|
|
252
|
+
* and before persisting `grader-judgments.json` so the judgments file
|
|
253
|
+
* carries the consensus-merged scores rather than the original
|
|
254
|
+
* single-replica grader output (CR-01).
|
|
255
|
+
*/
|
|
256
|
+
export type BorderlineConsensusRunner = (args: {
|
|
257
|
+
judgments: GraderJudgment[];
|
|
258
|
+
logger?: Logger;
|
|
259
|
+
regrade: (judgment: GraderJudgment) => Promise<number>;
|
|
260
|
+
}) => Promise<{
|
|
261
|
+
consistencyByJudgment: Map<string, JudgmentConsistency>;
|
|
262
|
+
judgments: GraderJudgment[];
|
|
263
|
+
}>;
|
|
168
264
|
export interface CalculateScoresOptions {
|
|
169
265
|
/** Allowed origins for source isolation reporting */
|
|
170
266
|
allowedOrigins?: string[];
|
|
267
|
+
/**
|
|
268
|
+
* Pre-built borderline-consensus runner (CR-01). When provided AND
|
|
269
|
+
* non-zero `borderlineReplications`, runs after extraction and
|
|
270
|
+
* persists `borderline-consistency.json` alongside
|
|
271
|
+
* `grader-judgments.json`. When omitted, the pipeline keeps the
|
|
272
|
+
* single-replica scores — preserving Phase 2 behavior for callers
|
|
273
|
+
* that haven't opted in.
|
|
274
|
+
*/
|
|
275
|
+
borderlineConsensusRunner?: BorderlineConsensusRunner;
|
|
276
|
+
/**
|
|
277
|
+
* Optional regrade entry point used by the borderline runner. Wired
|
|
278
|
+
* in by the orchestration step from grader-api.ts; when absent, the
|
|
279
|
+
* runner is skipped because there's nothing to call.
|
|
280
|
+
*/
|
|
281
|
+
borderlineRegradeOnce?: (responseText: string, rubricText: string) => Promise<null | number>;
|
|
171
282
|
/** Logger instance (defaults to ConsoleLogger if not provided) */
|
|
172
283
|
logger?: Logger;
|
|
173
284
|
/** Evaluation mode (controls which result files are read) */
|
|
@@ -209,4 +320,4 @@ export interface CalculateScoresResult {
|
|
|
209
320
|
/** Summary of test execution outcomes (total, passed, failed, errored). */
|
|
210
321
|
testSummary?: TestSummary;
|
|
211
322
|
}
|
|
212
|
-
export declare function calculateAndWriteScores(options: CalculateScoresOptions): CalculateScoresResult
|
|
323
|
+
export declare function calculateAndWriteScores(options: CalculateScoresOptions): Promise<CalculateScoresResult>;
|