@sanity/ailf 4.6.0 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +50 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +35 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +136 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +153 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +6 -0
- package/dist/_vendor/ailf-core/services/index.js +18 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
- package/dist/_vendor/ailf-core/types/confidence.js +7 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +271 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +19 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
- package/dist/_vendor/ailf-core/types/index.js +15 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/openai-llm-client.js +52 -3
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/cli-program.js +3 -0
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/interpret.d.ts +50 -0
- package/dist/commands/interpret.js +212 -0
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +57 -23
- package/dist/composition-root.js +155 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +9 -0
- package/dist/grader/agent-harness.js +9 -0
- package/dist/grader/common.d.ts +9 -0
- package/dist/grader/common.js +9 -0
- package/dist/grader/index.d.ts +24 -0
- package/dist/grader/index.js +24 -0
- package/dist/grader/knowledge-probe.d.ts +9 -0
- package/dist/grader/knowledge-probe.js +9 -0
- package/dist/grader/literacy.d.ts +9 -0
- package/dist/grader/literacy.js +9 -0
- package/dist/grader/mcp.d.ts +9 -0
- package/dist/grader/mcp.js +9 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +7 -5
|
@@ -1,52 +1,47 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* pipeline/failure-modes.ts
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* cross-referenced with ceiling decomposition data.
|
|
4
|
+
* Ceiling-cross-check failure-mode validator + report assembly.
|
|
6
5
|
*
|
|
7
|
-
*
|
|
6
|
+
* The grader emits `failureMode` directly under the per-dimension taxonomy
|
|
7
|
+
* (Plan 03-02 — `packages/eval/src/grader/`). This module consumes the
|
|
8
|
+
* grader's emission as the source of truth and uses the surviving ceiling
|
|
9
|
+
* decomposition (`classifyByCeiling`) only as a CONFIDENCE VALIDATOR — it
|
|
10
|
+
* cross-checks the emitted mode against structural score signals and emits
|
|
11
|
+
* a D0049 `Confidence` triple stamped with `derivation: "ceiling-cross-check"`.
|
|
8
12
|
*
|
|
9
|
-
* The classifier
|
|
10
|
-
*
|
|
11
|
-
*
|
|
13
|
+
* The legacy keyword-pattern classifier (and its five regex pattern
|
|
14
|
+
* constants) was deleted in Plan 03-03 — its production coverage was ~1%
|
|
15
|
+
* (Doc 03 §"Where classification lives"), and resurrecting it as a fallback
|
|
16
|
+
* is explicitly out of scope.
|
|
12
17
|
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
18
|
+
* @see docs/decisions/D0005-grader-model-separation.md — single grader emits
|
|
19
|
+
* failureMode under the per-dimension taxonomy
|
|
20
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md — Confidence triple
|
|
21
|
+
* shape and `ceiling-cross-check` derivation tag
|
|
17
22
|
*/
|
|
18
|
-
import { detectFeatureArea } from "../_vendor/ailf-core/index.js";
|
|
23
|
+
import { LEGACY_FAILURE_MODES, detectFeatureArea } from "../_vendor/ailf-core/index.js";
|
|
19
24
|
// ---------------------------------------------------------------------------
|
|
20
25
|
// Constants
|
|
21
26
|
// ---------------------------------------------------------------------------
|
|
22
27
|
/** Only classify judgments with scores below this threshold */
|
|
23
28
|
const CLASSIFICATION_THRESHOLD = 60;
|
|
24
|
-
/** All failure mode types for initializing empty counts */
|
|
25
|
-
const ALL_MODES = [
|
|
26
|
-
"api-error",
|
|
27
|
-
"incorrect-docs",
|
|
28
|
-
"missing-docs",
|
|
29
|
-
"model-limitation",
|
|
30
|
-
"outdated-docs",
|
|
31
|
-
"poor-structure",
|
|
32
|
-
"unclassified",
|
|
33
|
-
];
|
|
34
|
-
// ---------------------------------------------------------------------------
|
|
35
|
-
// Keyword patterns
|
|
36
|
-
// ---------------------------------------------------------------------------
|
|
37
|
-
/** API error pattern — checked FIRST to prevent timeout errors containing
|
|
38
|
-
* "deprecated" from being misclassified as outdated-docs. */
|
|
39
|
-
const API_ERROR_PATTERN = /\[api-error\]|timeout|timed out|rate limit|429|503|ECONNRESET|ETIMEDOUT|socket hang up|fetch failed/i;
|
|
40
|
-
const OUTDATED_PATTERN = /deprecated|old api|v[0-9]+ syntax|no longer supported|legacy|previous version|outdated|superseded|replaced by/i;
|
|
41
|
-
const MISSING_PATTERN = /no documentation|not covered|had to guess|not found|missing.*doc|no.*information|undocumented|couldn't find|without.*documentation/i;
|
|
42
|
-
const INCORRECT_PATTERN = /contradicts|incorrect.*doc|doc.*incorrect|wrong.*doc|doc.*wrong|documentation says.*but|factual error|inaccurate|misleading.*doc/i;
|
|
43
|
-
const POOR_STRUCTURE_PATTERN = /unclear|ambiguous|couldn't determine|conflicting|confusing|hard to follow|poorly organized|scattered|fragmented/i;
|
|
44
29
|
// ---------------------------------------------------------------------------
|
|
45
30
|
// Public API
|
|
46
31
|
// ---------------------------------------------------------------------------
|
|
47
32
|
/**
|
|
48
33
|
* Build a complete failure mode report from grader judgments and scores.
|
|
49
34
|
*
|
|
35
|
+
* The grader-emitted `judgment.failureMode` is the source of truth (Plan
|
|
36
|
+
* 03-03 — keyword classifier deleted). `validateFailureMode` cross-checks
|
|
37
|
+
* the emission against ceiling decomposition and stamps a D0049 confidence.
|
|
38
|
+
*
|
|
39
|
+
* The `FailureMode` triple shape (`mode`, `confidence`, `source`) is
|
|
40
|
+
* preserved for backward compatibility with downstream consumers
|
|
41
|
+
* (gap-analysis, manifest emission) — the bucketed `confidence` enum maps
|
|
42
|
+
* 1:1 from `Confidence.level`, and `source` is always `"ceiling"` now that
|
|
43
|
+
* the keyword path is gone.
|
|
44
|
+
*
|
|
50
45
|
* @param judgments - All grader judgments from the evaluation
|
|
51
46
|
* @param scores - Per-area feature scores (for ceiling decomposition)
|
|
52
47
|
* @returns Failure mode report with per-area breakdowns
|
|
@@ -66,9 +61,23 @@ export function buildFailureModeReport(judgments, scores) {
|
|
|
66
61
|
const areaScore = area ? scoreByArea.get(area) : undefined;
|
|
67
62
|
const ceilingScore = areaScore?.ceilingScore ?? 100;
|
|
68
63
|
const floorScore = areaScore?.floorScore ?? 0;
|
|
69
|
-
|
|
64
|
+
// Source the failure mode from the grader's emission. CR-02:
|
|
65
|
+
// FailureModeType is open-set (`string`) since Plan 03-02
|
|
66
|
+
// introduced per-dimension extensions (`false-floor`,
|
|
67
|
+
// `spec-mismatch`, `tool-misuse`, `factual-error`, …). The report
|
|
68
|
+
// surfaces the emission directly so downstream consumers see the
|
|
69
|
+
// grader's actual taxonomy choice rather than a collapsed
|
|
70
|
+
// `"unclassified"` bucket.
|
|
71
|
+
const emittedMode = readEmittedMode(judgment);
|
|
72
|
+
// Cross-check the grader's emission against ceiling decomposition.
|
|
73
|
+
const stamp = validateFailureMode(judgment, ceilingScore, floorScore);
|
|
74
|
+
const classification = {
|
|
75
|
+
confidence: stamp.level,
|
|
76
|
+
mode: emittedMode,
|
|
77
|
+
source: "ceiling",
|
|
78
|
+
};
|
|
70
79
|
classifiedJudgments.push({ classification, judgment });
|
|
71
|
-
summary[classification.mode]
|
|
80
|
+
summary[classification.mode] = (summary[classification.mode] ?? 0) + 1;
|
|
72
81
|
// Per-area tracking
|
|
73
82
|
if (area) {
|
|
74
83
|
if (!byArea[area]) {
|
|
@@ -79,7 +88,8 @@ export function buildFailureModeReport(judgments, scores) {
|
|
|
79
88
|
totalJudgments: 0,
|
|
80
89
|
};
|
|
81
90
|
}
|
|
82
|
-
byArea[area].modes[classification.mode]
|
|
91
|
+
byArea[area].modes[classification.mode] =
|
|
92
|
+
(byArea[area].modes[classification.mode] ?? 0) + 1;
|
|
83
93
|
byArea[area].totalJudgments++;
|
|
84
94
|
}
|
|
85
95
|
}
|
|
@@ -99,28 +109,74 @@ export function buildFailureModeReport(judgments, scores) {
|
|
|
99
109
|
};
|
|
100
110
|
}
|
|
101
111
|
/**
|
|
102
|
-
*
|
|
112
|
+
* Cross-check a grader-emitted `failureMode` against ceiling decomposition
|
|
113
|
+
* and emit a D0049 `Confidence` triple stamped with
|
|
114
|
+
* `derivation: "ceiling-cross-check"`.
|
|
103
115
|
*
|
|
104
|
-
*
|
|
105
|
-
*
|
|
116
|
+
* Replaces the deleted keyword-pattern + ceiling combine classifier — the
|
|
117
|
+
* grader's emission is now the source of truth for the mode itself; this
|
|
118
|
+
* function only stamps confidence based on whether the structural ceiling
|
|
119
|
+
* signal agrees.
|
|
106
120
|
*
|
|
107
|
-
*
|
|
121
|
+
* - `level: "high"` (`signalsPresent: 2`) — grader emission and ceiling
|
|
122
|
+
* decomposition agree on the same mode.
|
|
123
|
+
* - `level: "medium"` (`signalsPresent: 2`) — both signals present but
|
|
124
|
+
* disagree. The live pipeline increments
|
|
125
|
+
* `GraderReliability.failureModeCalibration` ONLY on this branch — a
|
|
126
|
+
* true calibration miss requires both signals to be present.
|
|
127
|
+
* - `level: "low"` (`signalsPresent: 1`) — only the grader's emission;
|
|
128
|
+
* ceiling decomposition produced no structural signal. Not a
|
|
129
|
+
* calibration miss (we have nothing to cross-check against).
|
|
130
|
+
* - `level: "low"` (`signalsPresent: 0`) — passing scores
|
|
131
|
+
* (`>= CLASSIFICATION_THRESHOLD`) are not classified; the stamp records no signals.
|
|
132
|
+
*
|
|
133
|
+
* @param judgment - The grader judgment carrying `failureMode` + `score`
|
|
108
134
|
* @param ceilingScore - The area's ceiling score (with-docs best case)
|
|
109
135
|
* @param floorScore - The area's floor score (no-docs baseline)
|
|
110
|
-
* @returns
|
|
136
|
+
* @returns D0049 Confidence triple stamped `derivation: "ceiling-cross-check"`
|
|
137
|
+
*
|
|
138
|
+
* @see docs/decisions/D0005-grader-model-separation.md
|
|
139
|
+
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
111
140
|
*/
|
|
112
|
-
export function
|
|
113
|
-
// Passing scores don't
|
|
141
|
+
export function validateFailureMode(judgment, ceilingScore, floorScore) {
|
|
142
|
+
// Passing scores are not classified — return a low-confidence stamp with no signals present.
|
|
114
143
|
if (judgment.score >= CLASSIFICATION_THRESHOLD) {
|
|
115
|
-
return {
|
|
144
|
+
return {
|
|
145
|
+
level: "low",
|
|
146
|
+
signalsPresent: 0,
|
|
147
|
+
derivation: "ceiling-cross-check",
|
|
148
|
+
};
|
|
116
149
|
}
|
|
117
|
-
const reason = judgment.reason.toLowerCase();
|
|
118
|
-
// Step 1: Keyword-based classification
|
|
119
|
-
const keywordMode = classifyByKeyword(reason);
|
|
120
|
-
// Step 2: Ceiling-based structural classification
|
|
121
150
|
const ceilingMode = classifyByCeiling(judgment.score, ceilingScore, floorScore);
|
|
122
|
-
|
|
123
|
-
|
|
151
|
+
if (!ceilingMode) {
|
|
152
|
+
// No structural ceiling signal — the grader's emission stands but
|
|
153
|
+
// there's nothing to cross-check against. Surface as low-confidence
|
|
154
|
+
// (signalsPresent: 1) so the caller can distinguish "we have one
|
|
155
|
+
// signal, not two" from "the two signals disagree" — and leave
|
|
156
|
+
// failureModeCalibration alone (folding this case in over-counts
|
|
157
|
+
// the reliability metric, see CR-04 in the Phase 3 review).
|
|
158
|
+
return {
|
|
159
|
+
level: "low",
|
|
160
|
+
signalsPresent: 1,
|
|
161
|
+
derivation: "ceiling-cross-check",
|
|
162
|
+
};
|
|
163
|
+
}
|
|
164
|
+
if (ceilingMode.mode === judgment.failureMode) {
|
|
165
|
+
// Both signals agree → high confidence stamp.
|
|
166
|
+
return {
|
|
167
|
+
level: "high",
|
|
168
|
+
signalsPresent: 2,
|
|
169
|
+
derivation: "ceiling-cross-check",
|
|
170
|
+
};
|
|
171
|
+
}
|
|
172
|
+
// Both signals present and disagree — the actual calibration-miss
|
|
173
|
+
// branch. The caller increments GraderReliability.failureModeCalibration
|
|
174
|
+
// only when signalsPresent === 2 here.
|
|
175
|
+
return {
|
|
176
|
+
level: "medium",
|
|
177
|
+
signalsPresent: 2,
|
|
178
|
+
derivation: "ceiling-cross-check",
|
|
179
|
+
};
|
|
124
180
|
}
|
|
125
181
|
// ---------------------------------------------------------------------------
|
|
126
182
|
// Formatting
|
|
@@ -134,10 +190,13 @@ export function formatFailureModesConsole(report) {
|
|
|
134
190
|
lines.push("");
|
|
135
191
|
lines.push(` ${report.totalJudgments} judgments analyzed, ${report.classificationRate.toFixed(0)}% classified`);
|
|
136
192
|
lines.push("");
|
|
137
|
-
// Summary table
|
|
193
|
+
// Summary table — legacy modes first (in canonical order), then any
|
|
194
|
+
// per-dimension extensions present in the run, sorted by descending count.
|
|
195
|
+
// CR-02: extensions are no longer narrowed to "unclassified"; the
|
|
196
|
+
// formatter now surfaces them rather than dropping the signal.
|
|
138
197
|
lines.push(" Mode Count");
|
|
139
198
|
lines.push(" ────────────────── ─────");
|
|
140
|
-
for (const mode of
|
|
199
|
+
for (const mode of orderedSummaryKeys(report.summary)) {
|
|
141
200
|
const count = report.summary[mode] ?? 0;
|
|
142
201
|
if (count > 0) {
|
|
143
202
|
const icon = modeIcon(mode);
|
|
@@ -169,10 +228,12 @@ export function formatFailureModesMarkdown(report) {
|
|
|
169
228
|
}
|
|
170
229
|
lines.push(`**${report.totalJudgments} judgments** analyzed, **${report.classificationRate.toFixed(0)}%** classified`);
|
|
171
230
|
lines.push("");
|
|
172
|
-
// Summary table
|
|
231
|
+
// Summary table — legacy modes first, per-dimension extensions after
|
|
232
|
+
// (CR-02 — emission is now visible in aggregation rather than being
|
|
233
|
+
// collapsed to 'unclassified').
|
|
173
234
|
lines.push("| Mode | Count | % |");
|
|
174
235
|
lines.push("|------|-------|---|");
|
|
175
|
-
for (const mode of
|
|
236
|
+
for (const mode of orderedSummaryKeys(report.summary)) {
|
|
176
237
|
const count = report.summary[mode] ?? 0;
|
|
177
238
|
if (count > 0) {
|
|
178
239
|
const pct = report.totalJudgments > 0
|
|
@@ -203,7 +264,31 @@ export function formatFailureModesMarkdown(report) {
|
|
|
203
264
|
// ---------------------------------------------------------------------------
|
|
204
265
|
// Internal helpers
|
|
205
266
|
// ---------------------------------------------------------------------------
|
|
206
|
-
/**
|
|
267
|
+
/**
|
|
268
|
+
* Read the grader's emitted failureMode as the open-set
|
|
269
|
+
* `FailureModeType` (string). Per-dimension extensions from Plan 03-02
|
|
270
|
+
* (`false-floor`, `spec-mismatch`, `tool-misuse`, `factual-error`, …)
|
|
271
|
+
* survive the report aggregation as their own buckets — narrowing them
|
|
272
|
+
* to `"unclassified"` (the pre-CR-02 behavior) silently dropped at
|
|
273
|
+
* least 11 documented legal modes from `report.summary` and
|
|
274
|
+
* `report.byArea[*].topMode`. An absent or empty `failureMode` still
|
|
275
|
+
* buckets as `"unclassified"` so consumers see a stable label rather
|
|
276
|
+
* than an empty key.
|
|
277
|
+
*/
|
|
278
|
+
function readEmittedMode(judgment) {
|
|
279
|
+
const emitted = judgment.failureMode;
|
|
280
|
+
if (typeof emitted !== "string" || emitted.length === 0) {
|
|
281
|
+
return "unclassified";
|
|
282
|
+
}
|
|
283
|
+
return emitted;
|
|
284
|
+
}
|
|
285
|
+
/**
|
|
286
|
+
* Classify by ceiling-decomposition structural signals — preserved
|
|
287
|
+
* verbatim from the pre-Plan-03-03 implementation. The function itself
|
|
288
|
+
* does not change; only its CALLER (`validateFailureMode`) changes how
|
|
289
|
+
* the output is consumed (confidence stamp instead of parallel
|
|
290
|
+
* classification signal).
|
|
291
|
+
*/
|
|
207
292
|
function classifyByCeiling(score, ceilingScore, floorScore) {
|
|
208
293
|
const docLift = ceilingScore - floorScore;
|
|
209
294
|
// Negative Doc Lift: docs are actively harmful
|
|
@@ -228,55 +313,6 @@ function classifyByCeiling(score, ceilingScore, floorScore) {
|
|
|
228
313
|
}
|
|
229
314
|
return null;
|
|
230
315
|
}
|
|
231
|
-
/** Classify by keyword matching on the reason text */
|
|
232
|
-
function classifyByKeyword(reason) {
|
|
233
|
-
// API errors checked first — prevents timeout messages containing
|
|
234
|
-
// "deprecated" from being misclassified as outdated-docs.
|
|
235
|
-
if (API_ERROR_PATTERN.test(reason)) {
|
|
236
|
-
return { confidence: "high", mode: "api-error", source: "keyword" };
|
|
237
|
-
}
|
|
238
|
-
if (OUTDATED_PATTERN.test(reason)) {
|
|
239
|
-
return { confidence: "high", mode: "outdated-docs", source: "keyword" };
|
|
240
|
-
}
|
|
241
|
-
if (MISSING_PATTERN.test(reason)) {
|
|
242
|
-
return { confidence: "high", mode: "missing-docs", source: "keyword" };
|
|
243
|
-
}
|
|
244
|
-
if (INCORRECT_PATTERN.test(reason)) {
|
|
245
|
-
return { confidence: "medium", mode: "incorrect-docs", source: "keyword" };
|
|
246
|
-
}
|
|
247
|
-
if (POOR_STRUCTURE_PATTERN.test(reason)) {
|
|
248
|
-
return { confidence: "medium", mode: "poor-structure", source: "keyword" };
|
|
249
|
-
}
|
|
250
|
-
return null;
|
|
251
|
-
}
|
|
252
|
-
/**
|
|
253
|
-
* Combine keyword and ceiling classifications.
|
|
254
|
-
*
|
|
255
|
-
* Priority:
|
|
256
|
-
* 1. If both agree on mode → high confidence, source = "keyword+ceiling"
|
|
257
|
-
* 2. If keyword matched → use keyword result
|
|
258
|
-
* 3. If only ceiling matched → use ceiling result (lower confidence)
|
|
259
|
-
* 4. If neither matched → unclassified
|
|
260
|
-
*/
|
|
261
|
-
function combineClassifications(keyword, ceiling) {
|
|
262
|
-
if (keyword && ceiling) {
|
|
263
|
-
if (keyword.mode === ceiling.mode) {
|
|
264
|
-
// Both agree — boost confidence
|
|
265
|
-
return {
|
|
266
|
-
confidence: "high",
|
|
267
|
-
mode: keyword.mode,
|
|
268
|
-
source: "keyword+ceiling",
|
|
269
|
-
};
|
|
270
|
-
}
|
|
271
|
-
// Disagree — prefer keyword (it has more signal)
|
|
272
|
-
return keyword;
|
|
273
|
-
}
|
|
274
|
-
if (keyword)
|
|
275
|
-
return keyword;
|
|
276
|
-
if (ceiling)
|
|
277
|
-
return ceiling;
|
|
278
|
-
return { confidence: "low", mode: "unclassified", source: "keyword" };
|
|
279
|
-
}
|
|
280
316
|
/**
|
|
281
317
|
* Resolve area name from a task ID or description.
|
|
282
318
|
*
|
|
@@ -310,15 +346,22 @@ function resolveArea(taskId, scoreByArea) {
|
|
|
310
346
|
}
|
|
311
347
|
return undefined;
|
|
312
348
|
}
|
|
313
|
-
/**
|
|
349
|
+
/**
|
|
350
|
+
* Find the most common failure mode in the per-area `modes` record.
|
|
351
|
+
*
|
|
352
|
+
* Iterates every key in the record (CR-02 — record is open-set since
|
|
353
|
+
* Plan 03-02 introduced per-dimension extensions) and picks the
|
|
354
|
+
* highest-count classified mode. Falls back to "unclassified" when
|
|
355
|
+
* the area has no classified emissions at all.
|
|
356
|
+
*/
|
|
314
357
|
function findTopMode(modes) {
|
|
315
358
|
let topMode = "unclassified";
|
|
316
359
|
let topCount = 0;
|
|
317
|
-
for (const mode of
|
|
360
|
+
for (const [mode, count] of Object.entries(modes)) {
|
|
318
361
|
if (mode === "unclassified")
|
|
319
362
|
continue; // Prefer classified modes
|
|
320
|
-
if (
|
|
321
|
-
topCount =
|
|
363
|
+
if (count > topCount) {
|
|
364
|
+
topCount = count;
|
|
322
365
|
topMode = mode;
|
|
323
366
|
}
|
|
324
367
|
}
|
|
@@ -327,19 +370,33 @@ function findTopMode(modes) {
|
|
|
327
370
|
return "unclassified";
|
|
328
371
|
return topMode;
|
|
329
372
|
}
|
|
330
|
-
/**
|
|
373
|
+
/**
|
|
374
|
+
* Initialize the per-area / per-summary mode-count record.
|
|
375
|
+
*
|
|
376
|
+
* Pre-allocates buckets for the legacy literacy modes (CR-02 — keeps
|
|
377
|
+
* stable presence for downstream consumers like Studio columns) and
|
|
378
|
+
* leaves per-dimension extensions to be added on first emission.
|
|
379
|
+
*/
|
|
331
380
|
function initModeCounts() {
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
381
|
+
const counts = {};
|
|
382
|
+
for (const mode of LEGACY_FAILURE_MODES) {
|
|
383
|
+
counts[mode] = 0;
|
|
384
|
+
}
|
|
385
|
+
return counts;
|
|
386
|
+
}
|
|
387
|
+
/** Stable display order for the summary tables — legacy first, then extensions. */
|
|
388
|
+
function orderedSummaryKeys(summary) {
|
|
389
|
+
const legacy = LEGACY_FAILURE_MODES.filter((m) => m in summary);
|
|
390
|
+
const extensions = Object.keys(summary)
|
|
391
|
+
.filter((m) => !isLegacyMode(m))
|
|
392
|
+
.sort((a, b) => (summary[b] ?? 0) - (summary[a] ?? 0));
|
|
393
|
+
return [...legacy, ...extensions];
|
|
394
|
+
}
|
|
395
|
+
const LEGACY_MODE_SET = new Set(LEGACY_FAILURE_MODES);
|
|
396
|
+
function isLegacyMode(mode) {
|
|
397
|
+
return LEGACY_MODE_SET.has(mode);
|
|
341
398
|
}
|
|
342
|
-
/** Get icon for a failure mode */
|
|
399
|
+
/** Get icon for a failure mode — legacy modes have dedicated icons; extensions fall back to a neutral marker. */
|
|
343
400
|
function modeIcon(mode) {
|
|
344
401
|
switch (mode) {
|
|
345
402
|
case "api-error":
|
|
@@ -356,5 +413,9 @@ function modeIcon(mode) {
|
|
|
356
413
|
return "🏗️";
|
|
357
414
|
case "unclassified":
|
|
358
415
|
return "❓";
|
|
416
|
+
default:
|
|
417
|
+
// Per-dimension extensions (Plan 03-02) — neutral icon, the mode
|
|
418
|
+
// name in the table still identifies the family.
|
|
419
|
+
return "•";
|
|
359
420
|
}
|
|
360
421
|
}
|
|
@@ -54,6 +54,7 @@ export function mapRequestToConfig(request, rootDir) {
|
|
|
54
54
|
noRemoteCache: request.noRemoteCache ?? false,
|
|
55
55
|
graderContext: request.graderContext,
|
|
56
56
|
graderReplications: request.graderReplications,
|
|
57
|
+
borderlineReplications: request.borderlineReplications,
|
|
57
58
|
urls: request.urls,
|
|
58
59
|
headers: request.headers,
|
|
59
60
|
allowedOrigins: request.allowedOrigins,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sanity/ailf",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "6.0.0",
|
|
4
4
|
"private": false,
|
|
5
5
|
"publishConfig": {
|
|
6
6
|
"access": "public"
|
|
@@ -52,15 +52,16 @@
|
|
|
52
52
|
"@types/js-yaml": "^4.0.9",
|
|
53
53
|
"@types/node": "^22.13.1",
|
|
54
54
|
"nock": "^14.0.13",
|
|
55
|
+
"simple-statistics": "7.8.9",
|
|
55
56
|
"tsx": "^4.19.2",
|
|
56
57
|
"typescript": "^5.7.3",
|
|
57
58
|
"vitest": "^4.1.5",
|
|
58
|
-
"@sanity/ailf-
|
|
59
|
-
"@sanity/ailf-
|
|
59
|
+
"@sanity/ailf-core": "0.1.0",
|
|
60
|
+
"@sanity/ailf-shared": "0.1.0"
|
|
60
61
|
},
|
|
61
62
|
"scripts": {
|
|
62
|
-
"build": "tsc && tsx scripts/bundle-workspace-deps.ts",
|
|
63
|
-
"generate-configs": "tsx src/cli.ts generate-configs",
|
|
63
|
+
"build": "tsc && tsc -p tsconfig.scripts.json && tsx scripts/bundle-workspace-deps.ts",
|
|
64
|
+
"generate-configs": "tsx src/cli.ts generate-configs && tsx scripts/generate-diagnosis-config.ts",
|
|
64
65
|
"fetch-docs": "tsx src/cli.ts fetch-docs",
|
|
65
66
|
"measure-retrieval": "tsx src/cli.ts measure-retrieval",
|
|
66
67
|
"eval": "tsx src/cli.ts eval",
|
|
@@ -77,6 +78,7 @@
|
|
|
77
78
|
"pipeline": "tsx src/cli.ts pipeline",
|
|
78
79
|
"validate": "tsx src/cli.ts validate config",
|
|
79
80
|
"test": "vitest run",
|
|
81
|
+
"test:compiler": "AILF_E2E=1 vitest run src/pipeline/compiler/__tests__",
|
|
80
82
|
"test:e2e": "AILF_E2E=1 vitest run src/__tests__/e2e",
|
|
81
83
|
"test:e2e:adapters": "AILF_E2E=1 vitest run src/adapters",
|
|
82
84
|
"test:e2e:api": "AILF_E2E_API=1 vitest run src/__tests__/api-tier2-tenant-integration.test.ts src/__tests__/gcs-artifact-writer-roundtrip.test.ts",
|