@sanity/ailf 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/cli.js +0 -0
- package/dist/orchestration/steps/run-eval-step.js +1 -1
- package/dist/pipeline/checks.d.ts +8 -3
- package/dist/pipeline/checks.js +23 -3
- package/package.json +25 -25
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
- package/dist/adapters/task-sources/yaml-task-source.js +0 -139
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
- package/dist/commands/update-quality-scores.d.ts +0 -5
- package/dist/commands/update-quality-scores.js +0 -20
- package/dist/lib/agent-behavior-report.d.ts +0 -8
- package/dist/lib/agent-behavior-report.js +0 -185
- package/dist/lib/baseline.d.ts +0 -19
- package/dist/lib/baseline.js +0 -153
- package/dist/lib/calculate-scores.d.ts +0 -23
- package/dist/lib/calculate-scores.js +0 -42
- package/dist/lib/compare.d.ts +0 -18
- package/dist/lib/compare.js +0 -170
- package/dist/lib/coverage-audit.d.ts +0 -4
- package/dist/lib/coverage-audit.js +0 -42
- package/dist/lib/discovery-report.d.ts +0 -13
- package/dist/lib/discovery-report.js +0 -57
- package/dist/lib/fetch-docs.d.ts +0 -30
- package/dist/lib/fetch-docs.js +0 -171
- package/dist/lib/generate-configs.d.ts +0 -25
- package/dist/lib/generate-configs.js +0 -42
- package/dist/lib/grader-api.d.ts +0 -21
- package/dist/lib/grader-api.js +0 -34
- package/dist/lib/grader-compare.d.ts +0 -19
- package/dist/lib/grader-compare.js +0 -91
- package/dist/lib/grader-consistency.d.ts +0 -27
- package/dist/lib/grader-consistency.js +0 -79
- package/dist/lib/grader-sensitivity.d.ts +0 -19
- package/dist/lib/grader-sensitivity.js +0 -75
- package/dist/lib/grader-validate.d.ts +0 -19
- package/dist/lib/grader-validate.js +0 -78
- package/dist/lib/measure-retrieval.d.ts +0 -14
- package/dist/lib/measure-retrieval.js +0 -71
- package/dist/lib/pr-comment.d.ts +0 -16
- package/dist/lib/pr-comment.js +0 -28
- package/dist/lib/readiness-report.d.ts +0 -13
- package/dist/lib/readiness-report.js +0 -108
- package/dist/lib/webhook-server.d.ts +0 -11
- package/dist/lib/webhook-server.js +0 -24
- package/dist/lib/weekly-digest.d.ts +0 -24
- package/dist/lib/weekly-digest.js +0 -148
- package/dist/orchestration/env-bridge.d.ts +0 -21
- package/dist/orchestration/env-bridge.js +0 -66
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
- package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
- package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
- package/dist/pipeline/compiler/task-bridge.js +0 -92
- package/dist/pipeline/expand-tasks.d.ts +0 -232
- package/dist/pipeline/expand-tasks.js +0 -467
- package/dist/pipeline/generate-configs.d.ts +0 -92
- package/dist/pipeline/generate-configs.js +0 -445
- package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/calculate-scores-step.js +0 -89
- package/dist/pipeline/steps/compare-step.d.ts +0 -18
- package/dist/pipeline/steps/compare-step.js +0 -90
- package/dist/pipeline/steps/eval-step.d.ts +0 -53
- package/dist/pipeline/steps/eval-step.js +0 -347
- package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
- package/dist/pipeline/steps/fetch-docs-step.js +0 -84
- package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
- package/dist/pipeline/steps/generate-configs-step.js +0 -98
- package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
- package/dist/pipeline/steps/grader-consistency-step.js +0 -74
- package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
- package/dist/pipeline/steps/publish-report-step.js +0 -243
- package/dist/pipeline/steps/report-step.d.ts +0 -13
- package/dist/pipeline/steps/report-step.js +0 -56
- package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/update-scores-step.js +0 -42
- package/dist/scripts/agent-behavior-report.d.ts +0 -19
- package/dist/scripts/agent-behavior-report.js +0 -315
- package/dist/scripts/baseline.d.ts +0 -43
- package/dist/scripts/baseline.js +0 -267
- package/dist/scripts/calculate-scores.d.ts +0 -166
- package/dist/scripts/calculate-scores.js +0 -1296
- package/dist/scripts/compare.d.ts +0 -22
- package/dist/scripts/compare.js +0 -334
- package/dist/scripts/coverage-audit.d.ts +0 -44
- package/dist/scripts/coverage-audit.js +0 -209
- package/dist/scripts/debug-eval.d.ts +0 -19
- package/dist/scripts/debug-eval.js +0 -73
- package/dist/scripts/discovery-report.d.ts +0 -58
- package/dist/scripts/discovery-report.js +0 -250
- package/dist/scripts/fetch-docs.d.ts +0 -35
- package/dist/scripts/fetch-docs.js +0 -472
- package/dist/scripts/generate-configs.d.ts +0 -66
- package/dist/scripts/generate-configs.js +0 -459
- package/dist/scripts/grader-api.d.ts +0 -27
- package/dist/scripts/grader-api.js +0 -206
- package/dist/scripts/grader-compare.d.ts +0 -22
- package/dist/scripts/grader-compare.js +0 -368
- package/dist/scripts/grader-consistency.d.ts +0 -20
- package/dist/scripts/grader-consistency.js +0 -313
- package/dist/scripts/grader-sensitivity.d.ts +0 -22
- package/dist/scripts/grader-sensitivity.js +0 -354
- package/dist/scripts/grader-validate.d.ts +0 -19
- package/dist/scripts/grader-validate.js +0 -267
- package/dist/scripts/measure-retrieval.d.ts +0 -10
- package/dist/scripts/measure-retrieval.js +0 -145
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
- package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
- package/dist/scripts/pipeline.d.ts +0 -76
- package/dist/scripts/pipeline.js +0 -1031
- package/dist/scripts/pr-comment.d.ts +0 -10
- package/dist/scripts/pr-comment.js +0 -510
- package/dist/scripts/readiness-report.d.ts +0 -88
- package/dist/scripts/readiness-report.js +0 -342
- package/dist/scripts/update-quality-scores.d.ts +0 -15
- package/dist/scripts/update-quality-scores.js +0 -184
- package/dist/scripts/validate-task-sources.d.ts +0 -21
- package/dist/scripts/validate-task-sources.js +0 -210
- package/dist/scripts/validate.d.ts +0 -13
- package/dist/scripts/validate.js +0 -79
- package/dist/scripts/webhook-server.d.ts +0 -26
- package/dist/scripts/webhook-server.js +0 -147
- package/dist/scripts/weekly-digest.d.ts +0 -24
- package/dist/scripts/weekly-digest.js +0 -144
- package/dist/sinks/format-slack.d.ts +0 -64
- package/dist/sinks/format-slack.js +0 -306
- package/dist/sinks/slack-sink.d.ts +0 -27
- package/dist/sinks/slack-sink.js +0 -78
- package/dist/sinks/webhook-sink.d.ts +0 -19
- package/dist/sinks/webhook-sink.js +0 -50
- package/tasks/.expanded.agentic.yaml +0 -280
- package/tasks/.expanded.yaml +0 -565
|
@@ -1,1296 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Calculate-scores.ts
|
|
3
|
-
*
|
|
4
|
-
* Reads Promptfoo evaluation output and computes the AI Literacy Score
|
|
5
|
-
* for each feature area. Each dimension is scored on a uniform 0–100 scale:
|
|
6
|
-
*
|
|
7
|
-
* Task Completion (0–100) — Can the LLM implement the feature?
|
|
8
|
-
* Code Correctness (0–100) — Is the code idiomatic and correct?
|
|
9
|
-
* Doc Coverage (0–100) — Did docs provide the needed info?
|
|
10
|
-
*
|
|
11
|
-
* Dimensions are combined into a weighted composite (0–100) using weights
|
|
12
|
-
* from config/rubrics.yaml (default: Task×0.50 + Code×0.25 + Docs×0.25).
|
|
13
|
-
*
|
|
14
|
-
* Additionally compares with-docs vs without-docs scores to calculate
|
|
15
|
-
* the "Doc Lift" — how much documentation helps vs parametric knowledge.
|
|
16
|
-
*
|
|
17
|
-
* When tests are run with the InstrumentedProvider (agent-observer),
|
|
18
|
-
* this script also aggregates and reports agent behavior data: which
|
|
19
|
-
* documentation pages were visited, what searches were performed, and
|
|
20
|
-
* overall network activity patterns.
|
|
21
|
-
*/
|
|
22
|
-
// oxlint-disable-next-line import/no-unassigned-import -- side-effect: loads .env into process.env
|
|
23
|
-
import "dotenv/config";
|
|
24
|
-
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
25
|
-
import { dirname, join } from "path";
|
|
26
|
-
import { calculateCost } from "../agent-observer/pricing.js";
|
|
27
|
-
import { checkResultsExist } from "../pipeline/checks.js";
|
|
28
|
-
import { loadRubricTemplates } from "../pipeline/expand-tasks.js";
|
|
29
|
-
import { loadSource } from "../sources.js";
|
|
30
|
-
import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
|
|
31
|
-
/**
 * Calculate scores grouped by model. Each model gets its own FeatureScore[]
 * and model-level aggregates.
 *
 * Models are identified by the provider id from the Promptfoo results,
 * falling back to the provider label, then "unknown" if neither is available.
 *
 * @param resultsPath Path to the Promptfoo results file.
 * @param weights Dimension weights, passed through to scoreResults.
 * @returns Array with one `{ label, modelId, overall, scores }` entry per
 *   model (in first-seen order), or null if at most one model was used
 *   (per-model breakdown is redundant when there's only one model).
 */
export function calculateScoresPerModel(resultsPath, weights) {
  const results = readAndNormalizeResults(resultsPath);
  // Group results by provider. A Map is used instead of a plain object so a
  // provider id such as "constructor" or "__proto__" cannot collide with
  // members inherited from Object.prototype.
  const byModel = new Map();
  for (const result of results) {
    const modelId = result.providerId ?? result.providerLabel ?? "unknown";
    let group = byModel.get(modelId);
    if (!group) {
      group = {
        label: result.providerLabel ?? result.providerId ?? "unknown",
        results: [],
      };
      byModel.set(modelId, group);
    }
    group.results.push(result);
  }
  // Skip per-model breakdown if there's only one model — it would be
  // identical to the overall scores and adds no information.
  if (byModel.size <= 1) {
    return null;
  }
  const perModel = [];
  for (const [modelId, { label, results: modelResults }] of byModel) {
    const scores = scoreResults(modelResults, weights, modelId);
    const sum = (pick) => scores.reduce((total, sc) => total + pick(sc), 0);
    // Averages are over feature scores; an empty score list averages to 0.
    const average = (pick) => (scores.length > 0 ? sum(pick) / scores.length : 0);
    const totalCost = sum((sc) => sc.totalCost);
    perModel.push({
      label,
      modelId,
      overall: {
        avgDocLift: average((sc) => sc.docLift),
        avgScore: average((sc) => sc.totalScore),
        // A zero total is reported as "no cost data" rather than 0.
        cost: totalCost > 0 ? totalCost : undefined,
        testCount: sum((sc) => sc.testCount),
      },
      scores,
    });
  }
  return perModel;
}
|
|
84
|
-
/**
 * Classify a graded assertion component into one of the scoring dimensions.
 *
 * Structured `assertion.metadata.dimension` takes precedence when present;
 * an unrecognized dimension yields null (no heuristic fallback). Without
 * metadata, the rubric text in `assertion.value` is matched against known
 * phrases for backward compatibility.
 *
 * @param component Component result carrying an optional `assertion`.
 * @returns "taskCompletion", "codeCorrectness", "docCoverage", or null when
 *   the component cannot be classified.
 */
export function classifyRubric(component) {
  const assertion = component?.assertion;
  // Prefer structured metadata (Approach 5) over heuristic matching
  const dimension = assertion?.metadata?.dimension;
  if (dimension) {
    switch (dimension) {
      case "code-correctness":
        return "codeCorrectness";
      case "doc-coverage":
        return "docCoverage";
      case "task-completion":
        return "taskCompletion";
      default:
        return null;
    }
  }
  // Fallback: heuristic name matching (for backward compatibility).
  // Only string values can be matched; anything else (object, array, number)
  // is unclassifiable rather than a crash on `.toLowerCase()`.
  const rawValue = assertion?.value;
  if (typeof rawValue !== "string") {
    return null;
  }
  const value = rawValue.toLowerCase();
  if (value.includes("task completion")) {
    return "taskCompletion";
  }
  if (value.includes("code correctness")) {
    return "codeCorrectness";
  }
  if (value.includes("documentation coverage") || value.includes("hallucinate")) {
    return "docCoverage";
  }
  return null;
}
|
|
113
|
-
/**
 * Map a test description to a feature-area slug by keyword matching.
 *
 * Rules are evaluated in order and the first match wins, so a description
 * mentioning several keywords is assigned to the earliest rule. Matching is
 * case-insensitive. Descriptions matching no rule are reported as "other".
 *
 * @param description Test description text.
 * @returns One of "studio-setup", "visual-editing", "functions", "groq",
 *   "nextjs-live", "frameworks", or "other".
 */
export function detectFeatureArea(description) {
  const text = description.toLowerCase();
  const mentionsAny = (...keywords) => keywords.some((keyword) => text.includes(keyword));
  const rules = [
    ["studio-setup", () => mentionsAny("studio")],
    ["visual-editing", () => mentionsAny("visual", "presentation", "live preview")],
    ["functions", () => mentionsAny("function", "webhook")],
    // GROQ is only recognized as a leading prefix, not anywhere in the text.
    ["groq", () => text.startsWith("groq")],
    ["nextjs-live", () => mentionsAny("next", "app router")],
    ["frameworks", () => mentionsAny("remix", "nuxt", "svelte")],
  ];
  const matched = rules.find(([, applies]) => applies());
  return matched ? matched[0] : "other";
}
|
|
139
|
-
// ---------------------------------------------------------------------------
|
|
140
|
-
// URL extraction from assertion metadata
|
|
141
|
-
// ---------------------------------------------------------------------------
|
|
142
|
-
/**
 * Extract grader judgments (reason text + scores) from evaluation results.
 *
 * This preserves the grader's natural language reasoning for downstream
 * analysis (failure mode classification, gap analysis). Each llm-rubric
 * assertion that classifyRubric can assign to a dimension produces one
 * GraderJudgment entry; other assertions are ignored.
 *
 * Phase 3a prerequisite: structured judgment data for failure mode extraction.
 *
 * @param resultsPath Path to the Promptfoo results file.
 * @returns Array of `{ dimension, modelId, reason, score, taskId }`.
 */
export function extractGraderJudgments(resultsPath) {
  // Map internal dimension names to hyphenated form (built once, not per
  // component).
  const dimensionMap = {
    codeCorrectness: "code-correctness",
    docCoverage: "doc-coverage",
    taskCompletion: "task-completion",
  };
  const judgments = [];
  for (const result of readAndNormalizeResults(resultsPath)) {
    const taskId = result.description;
    const modelId = result.providerId ?? result.providerLabel ?? "unknown";
    // A grading result without a per-assertion breakdown contributes no
    // judgments instead of crashing the whole extraction.
    for (const comp of result.gradingResult?.componentResults ?? []) {
      if (comp.assertion?.type !== "llm-rubric") {
        continue;
      }
      const kind = classifyRubric(comp);
      if (!kind) {
        continue;
      }
      const score = parseRubricScore(comp);
      // Extract the reason text — the grader's reasoning. When the reason is
      // a JSON document with a string `reason` field, that field is used.
      let reason = comp.reason ?? "";
      if (reason) {
        try {
          const parsed = JSON.parse(reason);
          if (typeof parsed?.reason === "string") {
            reason = parsed.reason;
          }
        }
        catch {
          // Not JSON — use raw reason string
        }
      }
      judgments.push({
        dimension: dimensionMap[kind] ?? kind,
        modelId,
        reason,
        score,
        taskId,
      });
    }
  }
  return judgments;
}
|
|
199
|
-
/**
 * Finds the URL-extraction assertion result in a test's componentResults
 * and parses the structured JSON from its `reason` field.
 *
 * The URL-extraction assertion is recognized by shape: a "javascript"
 * assertion whose reason is JSON with an array `sanityUrls` and a numeric
 * `totalUrlCount`. The first such component wins.
 *
 * @param test Test result with an optional `gradingResult.componentResults`.
 * @returns The parsed metadata object, or null when no component matches
 *   (including when the test has no component results at all).
 */
export function extractUrlMetadata(test) {
  for (const comp of test.gradingResult?.componentResults ?? []) {
    if (comp.assertion?.type !== "javascript" || !comp.reason) {
      continue;
    }
    let parsed;
    try {
      parsed = JSON.parse(comp.reason);
    }
    catch {
      // Not the URL-extraction assertion - skip
      continue;
    }
    // `parsed` may be null or a scalar (both are valid JSON), hence `?.`.
    if (Array.isArray(parsed?.sanityUrls) && typeof parsed.totalUrlCount === "number") {
      return parsed;
    }
  }
  return null;
}
|
|
225
|
-
// ---------------------------------------------------------------------------
|
|
226
|
-
// Per-model scoring (Approach 3 from evaluation roadmap)
|
|
227
|
-
// ---------------------------------------------------------------------------
|
|
228
|
-
/**
 * Extract a numeric score from a graded assertion component.
 *
 * Resolution order:
 *  1. A numeric `component.score` is returned as-is.
 *  2. A `component.reason` that is valid JSON yields its numeric `score`
 *     field, or the value itself when the whole reason is a bare number
 *     (e.g. "85"). Any other valid JSON yields 0.
 *  3. A non-JSON reason yields the first number found in the text
 *     (decimals included).
 *
 * @param component Component result with optional `score` and `reason`.
 * @returns The extracted score, or 0 when none can be found.
 */
export function parseRubricScore(component) {
  // Direct score field
  if (typeof component.score === "number") {
    return component.score;
  }
  const { reason } = component;
  if (!reason) {
    return 0;
  }
  // Try to extract from reason (LLM rubric returns JSON)
  let parsed;
  try {
    parsed = JSON.parse(reason);
  }
  catch {
    // Not JSON — look for the first number in the free text. The fraction
    // is part of the match so "7.5" is not truncated to 7.
    const match = String(reason).match(/\d+(?:\.\d+)?/);
    return match ? Number.parseFloat(match[0]) : 0;
  }
  // A bare number is valid JSON, so it never reaches the text fallback above
  // and must be handled here.
  if (typeof parsed === "number") {
    return parsed;
  }
  // `parsed` may be null (also valid JSON), hence `?.`.
  return typeof parsed?.score === "number" ? parsed.score : 0;
}
|
|
252
|
-
/**
 * Summarizes agent behavior per feature area across every test result.
 *
 * Results without a behavior summary are ignored. For each feature area the
 * numeric fields are averaged over the tests that carried behavior data and
 * the list fields are merged with duplicates removed. Entries are sorted by
 * feature name.
 *
 * @param resultsPath Path to the Promptfoo results file.
 * @returns Array of per-feature summaries, or null if no result carried
 *   behavior data.
 */
function aggregateAgentBehavior(resultsPath) {
  const grouped = new Map();
  for (const result of readAndNormalizeResults(resultsPath)) {
    const feature = detectFeatureArea(result.description);
    const behavior = extractAgentBehavior(result);
    if (behavior) {
      if (grouped.has(feature)) {
        grouped.get(feature).push(behavior);
      }
      else {
        grouped.set(feature, [behavior]);
      }
    }
  }
  if (grouped.size === 0) {
    return null;
  }
  const summaries = [];
  for (const [feature, behaviors] of grouped) {
    const divisor = behaviors.length || 1;
    const mean = (pick) => behaviors.reduce((sum, item) => sum + pick(item), 0) / divisor;
    const distinct = (pick) => [...new Set(behaviors.flatMap(pick))];
    summaries.push({
      avgDocPagesVisited: mean((item) => item.docPagesVisited),
      avgNetworkTimeMs: mean((item) => item.totalNetworkMs),
      avgSearchesPerformed: mean((item) => item.searchesPerformed),
      docSlugsVisited: distinct((item) => item.docSlugsVisited),
      externalDomains: distinct((item) => item.externalDomains),
      feature,
      searchQueries: distinct((item) => item.uniqueSearchQueries),
      tasksWithBehaviorData: behaviors.length,
    });
  }
  summaries.sort((left, right) => left.feature.localeCompare(right.feature));
  return summaries;
}
|
|
297
|
-
/**
 * Computes aggregate source isolation metrics from agentic eval results.
 *
 * The allowed origins come from the comma-separated DOC_ALLOWED_ORIGINS
 * environment variable (set by pipeline.ts); blank entries are dropped.
 * Every doc page visit recorded under `metadata.agentBehavior.docPageVisits`
 * is collected (URL only) and handed to analyzeSourceIsolation.
 *
 * @param resultsPath Path to the Promptfoo results file.
 * @returns The analyzeSourceIsolation result, or null if no origin
 *   sandboxing was configured or no doc page visits were recorded.
 */
function aggregateSourceIsolation(resultsPath) {
  // Unset and empty are equivalent: both split down to zero usable origins.
  const allowedOrigins = (process.env.DOC_ALLOWED_ORIGINS || "")
    .split(",")
    .map((origin) => origin.trim())
    .filter((origin) => origin.length > 0);
  if (allowedOrigins.length === 0) {
    return null;
  }
  const docVisits = [];
  for (const result of readAndNormalizeResults(resultsPath)) {
    const visits = result.metadata?.agentBehavior?.docPageVisits;
    if (visits) {
      for (const visit of visits) {
        docVisits.push({ url: visit.url });
      }
    }
  }
  return docVisits.length > 0
    ? analyzeSourceIsolation(docVisits, allowedOrigins)
    : null;
}
|
|
335
|
-
// ---------------------------------------------------------------------------
|
|
336
|
-
// Feature area detection
|
|
337
|
-
// ---------------------------------------------------------------------------
|
|
338
|
-
/**
 * Tallies referenced URLs per feature area, split into the "gold" variant
 * (test ran with non-blank `vars.docs`) and the "baseline" variant (no docs).
 *
 * Every feature area seen in the results gets an entry, even when none of
 * its tests produced URL metadata; only tests with URL metadata count toward
 * `testCount`. Entries are sorted by feature name.
 *
 * @param resultsPath Path to the Promptfoo results file.
 * @returns Array of `{ feature, baseline, gold }`, where each variant is
 *   `{ testCount, urls }` and `urls` maps a URL to its occurrence count.
 */
function aggregateUrlReferences(resultsPath) {
  const perFeature = new Map();
  for (const result of readAndNormalizeResults(resultsPath)) {
    const feature = detectFeatureArea(result.description);
    let variants = perFeature.get(feature);
    if (!variants) {
      variants = {
        baseline: { testCount: 0, urls: {} },
        gold: { testCount: 0, urls: {} },
      };
      perFeature.set(feature, variants);
    }
    const { docs } = result.vars;
    const variant = docs && docs.trim().length > 0 ? "gold" : "baseline";
    const meta = extractUrlMetadata(result);
    if (meta) {
      const tally = variants[variant];
      tally.testCount += 1;
      for (const url of meta.sanityUrls) {
        tally.urls[url] = (tally.urls[url] || 0) + 1;
      }
    }
  }
  const rows = [];
  for (const [feature, variants] of perFeature) {
    rows.push({ feature, ...variants });
  }
  return rows.sort((left, right) => left.feature.localeCompare(right.feature));
}
|
|
370
|
-
// ---------------------------------------------------------------------------
|
|
371
|
-
// Score calculation
|
|
372
|
-
// ---------------------------------------------------------------------------
|
|
373
|
-
/**
 * Build source verification data for the score summary.
 *
 * Combines pipeline configuration read from the environment (EVAL_MODE,
 * DOC_BASE_URL, EVAL_SEARCH_MODE, DOC_ALLOWED_ORIGINS) with the URL fetch
 * metadata stored at `<root>/contexts/url-fetch.json` (written by
 * fetch-docs.ts) to produce a unified verification report.
 *
 * The URL fetch file is optional and best-effort: a missing, unreadable,
 * unparseable, or wrongly-shaped file simply omits (or empties) the
 * corresponding part of the report instead of failing the run.
 *
 * @param root Directory that contains the `contexts` folder.
 * @param source Optional source descriptor; its `baseUrl` takes precedence
 *   over DOC_BASE_URL.
 * @returns `{ mode, source }` plus `allowedOrigins`, `searchMode` and
 *   `urlFetch` when the respective data is available.
 */
function buildSourceVerification(root, source) {
  // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means unset
  const mode = process.env.EVAL_MODE || "baseline";
  // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means unset
  const sourceUrl = source?.baseUrl || process.env.DOC_BASE_URL || "default";
  // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means unset
  const searchMode = process.env.EVAL_SEARCH_MODE || undefined;
  const allowedOriginsEnv = process.env.DOC_ALLOWED_ORIGINS;
  const allowedOrigins = allowedOriginsEnv
    ? allowedOriginsEnv
      .split(",")
      .map((o) => o.trim())
      .filter(Boolean)
    : undefined;
  // Read URL fetch metadata if it exists (written by fetch-docs.ts)
  let urlFetch;
  const urlFetchPath = join(root, "contexts", "url-fetch.json");
  if (existsSync(urlFetchPath)) {
    try {
      const parsed = JSON.parse(readFileSync(urlFetchPath, "utf-8"));
      // Valid JSON can still have the wrong shape (null, array, scalar);
      // only a plain object is usable as fetch metadata.
      if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
        urlFetch = parsed;
      }
    }
    catch {
      // Malformed JSON — skip
    }
  }
  // A metadata object without a proper `fetchedUrls` array reports none
  // rather than crashing on `.map`.
  const fetchedUrls = Array.isArray(urlFetch?.fetchedUrls)
    ? urlFetch.fetchedUrls
    : [];
  return {
    ...(allowedOrigins && { allowedOrigins }),
    mode,
    ...(searchMode && { searchMode }),
    source: sourceUrl,
    ...(urlFetch && {
      urlFetch: {
        failures: urlFetch.failures,
        // Only the identifying fields of each fetch are kept in the report.
        fetchedUrls: fetchedUrls.map((f) => ({
          method: f.method,
          status: f.status,
          url: f.url,
        })),
        totalFailed: urlFetch.totalFailed,
        totalFetched: urlFetch.totalFetched,
      },
    }),
  };
}
|
|
424
|
-
// ---------------------------------------------------------------------------
|
|
425
|
-
// Agent behavior aggregation
|
|
426
|
-
// ---------------------------------------------------------------------------
|
|
427
|
-
/**
 * Overall scoring entry point: scores every valid result in the file
 * together, without splitting by model. This is the original
 * (backward compatible) scoring path.
 *
 * @param resultsPath Path to the Promptfoo results file.
 * @param weights Dimension weights, passed through to scoreResults.
 * @returns Whatever scoreResults produces for the full result set.
 */
function calculateScores(resultsPath, weights) {
  return scoreResults(readAndNormalizeResults(resultsPath), weights);
}
|
|
435
|
-
/**
 * Reads the agent behavior summary attached to a test result's metadata.
 *
 * @param test Test result with optional `metadata.agentBehaviorSummary`.
 * @returns The summary, or null when the result carries none (i.e. the test
 *   was not run with the instrumented provider).
 */
function extractAgentBehavior(test) {
  const summary = test.metadata?.agentBehaviorSummary;
  return summary ? summary : null;
}
|
|
446
|
-
/**
 * Extracts grader (assertion) cost from the raw Promptfoo results file.
 *
 * Assertion token usage is read from `results.stats.tokenUsage.assertions`.
 * The grader model is read from `config.defaultTest.options.rubricProvider`,
 * falling back to `config.defaultTest.options.provider`.
 *
 * @param resultsPath Path to the Promptfoo results file.
 * @returns `{ completionTokens, cost, model, promptTokens, totalTokens }`,
 *   or null when no assertion token usage was recorded. `cost` is 0 and
 *   `model` is undefined when the grader model cannot be determined.
 */
function extractGraderCost(resultsPath) {
  const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
  const assertions = file.results?.stats?.tokenUsage?.assertions;
  if (!assertions || assertions.total === 0) {
    return null;
  }
  // Extract grader model from config. The configured value is not guaranteed
  // to be a plain id string; an options object carrying a string `id` is
  // accepted as well, and anything else counts as "no model".
  const options = file.config?.defaultTest?.options;
  const configured = options?.rubricProvider ?? options?.provider;
  let graderModel;
  if (typeof configured === "string") {
    graderModel = configured;
  }
  else if (typeof configured?.id === "string") {
    graderModel = configured.id;
  }
  // Extract just the model name from "openai:gpt-5-2025-08-07" format
  const modelName = graderModel?.replace(/^[^:]+:/, "");
  const cost = modelName
    ? calculateCost(modelName, assertions.prompt, assertions.completion)
    : 0;
  return {
    completionTokens: assertions.completion,
    cost,
    model: graderModel,
    promptTokens: assertions.prompt,
    totalTokens: assertions.total,
  };
}
|
|
474
|
-
/**
 * Writes the agent behavior section of the report to stdout: a summary
 * table (one row per feature area), the doc pages visited per feature, and —
 * only when there is something to show — the search queries per feature and
 * the combined list of external domains.
 *
 * @param agentBehavior Per-feature behavior summaries to print.
 */
function printAgentBehaviorReport(agentBehavior) {
  const rule = "-".repeat(80);
  console.log(rule);
  console.log("AGENT BEHAVIOR OBSERVATION");
  console.log(rule);
  console.log();

  // Summary table
  console.log("| Feature Area | Tests | Doc Pages | Searches | Net (ms) |");
  console.log("|---------------------|-------|-----------|----------|----------|");
  for (const entry of agentBehavior) {
    const cells = [
      entry.feature.padEnd(19),
      entry.tasksWithBehaviorData.toString().padStart(5),
      entry.avgDocPagesVisited.toFixed(1).padStart(9),
      entry.avgSearchesPerformed.toFixed(1).padStart(8),
      Math.round(entry.avgNetworkTimeMs).toString().padStart(8),
    ];
    console.log(`| ${cells.join(" | ")} |`);
  }
  console.log();

  // Doc pages visited (features without any are listed as "(none)")
  console.log(" Doc pages visited:");
  for (const { feature, docSlugsVisited } of agentBehavior) {
    if (docSlugsVisited.length > 0) {
      console.log(` ${feature}:`);
      for (const slug of docSlugsVisited) {
        console.log(` - /docs/${slug}`);
      }
    }
    else {
      console.log(` ${feature}: (none)`);
    }
  }
  console.log();

  // Search queries (section omitted entirely when no feature has any)
  const withSearches = agentBehavior.filter((entry) => entry.searchQueries.length > 0);
  if (withSearches.length > 0) {
    console.log(" Search queries:");
    for (const { feature, searchQueries } of withSearches) {
      console.log(` ${feature}:`);
      for (const query of searchQueries) {
        console.log(` - "${query}"`);
      }
    }
    console.log();
  }

  // External domains, de-duplicated across all features
  const externalDomains = new Set(agentBehavior.flatMap((entry) => entry.externalDomains));
  if (externalDomains.size > 0) {
    console.log(" External domains contacted:");
    for (const domain of externalDomains) {
      console.log(` - ${domain}`);
    }
    console.log();
  }
}
|
|
536
|
-
// ---------------------------------------------------------------------------
|
|
537
|
-
// Report
|
|
538
|
-
// ---------------------------------------------------------------------------
|
|
539
|
-
// ---------------------------------------------------------------------------
|
|
540
|
-
// Grader cost extraction
|
|
541
|
-
// ---------------------------------------------------------------------------
|
|
542
|
-
/**
 * Reads the raw Promptfoo output file and normalizes each result so that
 * `description` is always a top-level field (taken from
 * `testCase.description`, "unknown" when absent) and `vars` is always an
 * object.
 *
 * The result rows are accepted in three layouts: nested
 * (`{ results: { results: [...] } }`), flat (`{ results: [...] }`), or a
 * bare array.
 *
 * Results without a grading result (errored/timed-out tests) are dropped,
 * with a warning listing each one.
 *
 * @param resultsPath Path to the Promptfoo results file.
 * @returns Normalized results that have a grading result.
 * @throws {TypeError} When no results array can be located in the file.
 */
function readAndNormalizeResults(resultsPath) {
  const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
  const container = file.results ?? file;
  const rows = Array.isArray(container) ? container : container.results;
  if (!Array.isArray(rows)) {
    throw new TypeError(`No results array found in ${resultsPath}`);
  }
  const all = rows.map((r) => ({
    cost: r.cost ?? 0,
    description: r.testCase?.description ?? "unknown",
    error: r.error,
    gradingResult: r.gradingResult,
    metadata: r.metadata,
    provider: r.provider?.label ?? r.provider?.id,
    providerId: r.provider?.id,
    providerLabel: r.provider?.label,
    response: r.response,
    vars: r.vars ?? r.testCase?.vars ?? {},
  }));
  // Filter out results without a gradingResult (errored/timed-out tests).
  // Promptfoo sets gradingResult to null when a test errors before grading;
  // a missing field is treated the same way, since consumers dereference
  // gradingResult unconditionally.
  const valid = [];
  const errored = [];
  for (const r of all) {
    (r.gradingResult == null ? errored : valid).push(r);
  }
  if (errored.length > 0) {
    console.warn(` ⚠ Skipping ${errored.length} of ${all.length} result(s) with null gradingResult (errored tests):`);
    for (const r of errored) {
      const providerLabel = r.provider ? `[${r.provider}] ` : "";
      let errorMsg = "unknown error (no error field in result)";
      if (r.error) {
        // The error is not guaranteed to be a string; serialize anything
        // else so it can still be truncated and printed.
        const text = typeof r.error === "string" ? r.error : JSON.stringify(r.error);
        errorMsg = text.slice(0, 150);
      }
      console.warn(` ✗ ${providerLabel}"${r.description}" — ${errorMsg}`);
    }
  }
  return valid;
}
|
|
579
|
-
/**
 * Shared scoring core for the overall and per-model paths: converts a list of
 * already-filtered test results into one FeatureScore per feature area.
 *
 * Within an area, tests whose `vars.docs` is a non-blank string form the
 * "with docs" group (ceiling); all others form the baseline group (floor).
 * Only `llm-rubric` assertion components contribute to either score.
 *
 * @param results Pre-filtered (valid) test results
 * @param weights Dimension weights; missing keys default to 0.5 / 0.25 / 0.25
 * @param modelId Optional model identifier copied onto each FeatureScore
 * @returns FeatureScore[] ordered by feature name
 */
function scoreResults(results, weights, modelId) {
    const taskWeight = weights["task-completion"] ?? 0.5;
    const codeWeight = weights["code-correctness"] ?? 0.25;
    const docWeight = weights["doc-coverage"] ?? 0.25;
    const isRubric = (component) => component.assertion?.type === "llm-rubric";

    // Bucket each result under its feature area, split by docs / no docs.
    const buckets = {};
    for (const result of results) {
        const area = detectFeatureArea(result.description);
        const bucket = buckets[area] || (buckets[area] = { withDocs: [], withoutDocs: [] });
        const docs = result.vars.docs;
        const docsInjected = docs && docs.trim().length > 0;
        (docsInjected ? bucket.withDocs : bucket.withoutDocs).push(result);
    }

    const featureScores = Object.entries(buckets).map(([feature, { withDocs, withoutDocs }]) => {
        let cost = 0;

        // Ceiling: per-dimension sums over the with-docs tests.
        let taskSum = 0;
        let codeSum = 0;
        let docSum = 0;
        for (const test of withDocs) {
            cost += test.cost;
            for (const component of test.gradingResult.componentResults) {
                if (!isRubric(component)) {
                    continue;
                }
                const points = parseRubricScore(component);
                switch (classifyRubric(component)) {
                    case "taskCompletion":
                        taskSum += points;
                        break;
                    case "codeCorrectness":
                        codeSum += points;
                        break;
                    case "docCoverage":
                        docSum += points;
                        break;
                    default:
                        break;
                }
            }
        }
        // Averages are per test (not per rubric); an empty group divides by 1.
        const divisor = withDocs.length || 1;
        const taskAvg = taskSum / divisor;
        const codeAvg = codeSum / divisor;
        const docAvg = docSum / divisor;
        const ceilingScore = Math.round(taskAvg * taskWeight + codeAvg * codeWeight + docAvg * docWeight);

        // Floor: plain mean over every rubric score of the no-docs tests.
        let baselineSum = 0;
        let baselineRubrics = 0;
        for (const test of withoutDocs) {
            cost += test.cost;
            for (const component of test.gradingResult.componentResults) {
                if (isRubric(component)) {
                    baselineSum += parseRubricScore(component);
                    baselineRubrics += 1;
                }
            }
        }
        const floorScore = Math.round(baselineRubrics > 0 ? baselineSum / baselineRubrics : 0);
        const docLift = ceilingScore - floorScore;

        return {
            ceilingScore,
            codeCorrectness: Math.round(codeAvg),
            docCoverage: Math.round(docAvg),
            docLift,
            docQualityGap: 100 - ceilingScore,
            feature,
            floorScore,
            ...(modelId ? { modelId } : {}),
            negativeDocLift: docLift < 0,
            taskCompletion: Math.round(taskAvg),
            testCount: withDocs.length,
            totalCost: cost,
            totalScore: ceilingScore,
        };
    });

    return featureScores.sort((left, right) => left.feature.localeCompare(right.feature));
}
|
|
675
|
-
/**
 * Scores an agentic evaluation run: loads the results file, groups the results
 * by feature area and computes the weighted "actual" score for each area.
 *
 * Only `llm-rubric` assertion components contribute. Dimension averages are
 * taken per test in the area, then combined with the dimension weights.
 *
 * @param resultsPath Path to the agentic results file
 * @param weights Dimension weights; missing keys default to 0.5 / 0.25 / 0.25
 * @returns Object keyed by feature area holding actualScore, the three
 *   rounded dimension averages, testCount and totalCost
 */
export function scoreAgenticResults(resultsPath, weights) {
    const results = readAndNormalizeResults(resultsPath);
    const taskWeight = weights["task-completion"] ?? 0.5;
    const codeWeight = weights["code-correctness"] ?? 0.25;
    const docWeight = weights["doc-coverage"] ?? 0.25;

    // Collect the results of each feature area.
    const grouped = {};
    for (const result of results) {
        const area = detectFeatureArea(result.description);
        (grouped[area] || (grouped[area] = [])).push(result);
    }

    // Reduce one area's tests to its score entry.
    const summarize = (tests) => {
        let cost = 0;
        let taskSum = 0;
        let codeSum = 0;
        let docSum = 0;
        for (const test of tests) {
            cost += test.cost;
            for (const component of test.gradingResult.componentResults) {
                if (component.assertion?.type !== "llm-rubric") {
                    continue;
                }
                const points = parseRubricScore(component);
                switch (classifyRubric(component)) {
                    case "taskCompletion":
                        taskSum += points;
                        break;
                    case "codeCorrectness":
                        codeSum += points;
                        break;
                    case "docCoverage":
                        docSum += points;
                        break;
                    default:
                        break;
                }
            }
        }
        const divisor = tests.length || 1;
        const taskAvg = taskSum / divisor;
        const codeAvg = codeSum / divisor;
        const docAvg = docSum / divisor;
        return {
            actualScore: Math.round(taskAvg * taskWeight + codeAvg * codeWeight + docAvg * docWeight),
            codeCorrectness: Math.round(codeAvg),
            docCoverage: Math.round(docAvg),
            taskCompletion: Math.round(taskAvg),
            testCount: tests.length,
            totalCost: cost,
        };
    };

    const entries = {};
    for (const [feature, tests] of Object.entries(grouped)) {
        entries[feature] = summarize(tests);
    }
    return entries;
}
|
|
726
|
-
// ---------------------------------------------------------------------------
|
|
727
|
-
// Score merging — combine baseline floor/ceiling with agentic actual
|
|
728
|
-
// ---------------------------------------------------------------------------
|
|
729
|
-
/**
 * Merge baseline FeatureScore[] with agentic actual scores to produce the
 * three-layer (floor / ceiling / actual) decomposition, one row per feature.
 *
 * Per feature area:
 * - baseline and agentic data: the baseline row plus `actualScore`,
 *   `retrievalGap` (ceiling - actual), `infrastructureEfficiency`
 *   (actual / ceiling, or `null` when the ceiling is not positive or Doc Lift
 *   is negative), `invertedRetrievalGap` (only set when `true`) and the
 *   combined `totalCost`;
 * - baseline only: a shallow copy of the baseline row;
 * - agentic only: a row whose floor/ceiling/docLift are 0, `docQualityGap`
 *   is 100 and `totalScore` equals the actual score.
 *
 * Agentic entries are looked up as own properties only, so a feature that
 * happens to be called e.g. "toString" or "constructor" is not mistaken for
 * agentic data inherited from `Object.prototype`.
 *
 * @param baselineScores Floor/ceiling scores from baseline evaluation (may be empty)
 * @param agenticScores Actual scores from agentic evaluation, keyed by feature (may be empty)
 * @returns Merged rows sorted by feature name
 */
export function mergeScores(baselineScores, agenticScores) {
    // Later duplicates of a feature overwrite earlier ones.
    const baselineByFeature = new Map();
    for (const score of baselineScores) {
        baselineByFeature.set(score.feature, score);
    }
    const agenticFor = (feature) => Object.prototype.hasOwnProperty.call(agenticScores, feature)
        ? agenticScores[feature]
        : undefined;
    const allFeatures = new Set([
        ...baselineByFeature.keys(),
        ...Object.keys(agenticScores),
    ]);
    const merged = [];
    for (const feature of allFeatures) {
        const baseline = baselineByFeature.get(feature);
        const agentic = agenticFor(feature);
        if (baseline && agentic) {
            // Both data sources: full decomposition.
            const retrievalGap = baseline.ceilingScore - agentic.actualScore;
            const negativeDocLift = baseline.docLift < 0;
            // actual / ceiling is meaningless when docs hurt performance or
            // there is no positive ceiling to compare against.
            const infrastructureEfficiency = !negativeDocLift && baseline.ceilingScore > 0
                ? agentic.actualScore / baseline.ceilingScore
                : null;
            // Inverted retrieval gap: Doc Lift is negative AND actual > ceiling.
            const invertedRetrievalGap = negativeDocLift && retrievalGap < 0;
            merged.push({
                ...baseline,
                actualScore: agentic.actualScore,
                infrastructureEfficiency,
                invertedRetrievalGap: invertedRetrievalGap || undefined,
                retrievalGap,
                totalCost: baseline.totalCost + agentic.totalCost,
            });
        }
        else if (baseline) {
            // Baseline only: no agentic data (partial summary).
            merged.push({ ...baseline });
        }
        else if (agentic) {
            // Agentic only: no baseline data (partial summary).
            merged.push({
                actualScore: agentic.actualScore,
                ceilingScore: 0,
                codeCorrectness: agentic.codeCorrectness,
                docCoverage: agentic.docCoverage,
                docLift: 0,
                docQualityGap: 100,
                feature,
                floorScore: 0,
                negativeDocLift: false,
                taskCompletion: agentic.taskCompletion,
                testCount: agentic.testCount,
                totalCost: agentic.totalCost,
                totalScore: agentic.actualScore,
            });
        }
    }
    return merged.sort((a, b) => a.feature.localeCompare(b.feature));
}
|
|
805
|
-
/**
 * Feature areas whose total score is strictly below this value are flagged
 * ("!!") in the printed report and listed as KR1 failures; when any area is
 * below it, the CLI exits with a non-zero status.
 */
const CRITICAL_THRESHOLD = 40;
|
|
806
|
-
/**
 * CLI entry point: reads the evaluation result files, computes and prints the
 * score report, and persists `score-summary.json` (plus
 * `grader-judgments.json` when there are judgments) under `results/latest`.
 *
 * Arguments:
 * - `--source <name>`: source configuration to load (optional);
 * - first positional argument: baseline results file, defaulting to
 *   `results/latest/eval-results.json`.
 *
 * Environment: `EVAL_MODE` selects the mode (an empty value means "baseline").
 * In "full" mode the agentic results file is merged in when it exists.
 *
 * Exits with status 1 when the baseline results fail validation or when any
 * feature area is below the critical threshold.
 */
function main() {
    // URL#pathname is percent-encoded; decode it so a checkout whose path
    // contains spaces or other escaped characters still resolves.
    const ROOT = join(dirname(decodeURIComponent(new URL(import.meta.url).pathname)), "..", "..");
    const args = process.argv.slice(2);
    // Parse --source <name> argument
    const sourceIdx = args.indexOf("--source");
    const sourceName = sourceIdx !== -1 ? args[sourceIdx + 1] : undefined;
    // Always load source config so environment info is included in score summary.
    // When no --source is specified, defaults to production.
    let source;
    try {
        source = loadSource(sourceName);
    }
    catch (err) {
        // Best effort: the report is still useful without source metadata,
        // but say why the source could not be loaded.
        const reason = err instanceof Error ? err.message : String(err);
        console.warn(` [warn] Could not load source "${sourceName}", proceeding without source metadata (${reason})`);
    }
    // Determine mode — controls which result files are read
    // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means unset
    const mode = process.env.EVAL_MODE || "baseline";
    // First positional arg (not a flag, not the value of --source) is the
    // baseline results path. The callback index is used rather than indexOf,
    // which would find an earlier argument with the same text.
    const baselineResultsPath = args.find((arg, i) => !arg.startsWith("--") && args[i - 1] !== "--source") ?? join(ROOT, "results", "latest", "eval-results.json");
    // Agentic results path (only used in full mode)
    const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
    // Validate baseline results file
    const resultsIssues = checkResultsExist(ROOT, baselineResultsPath);
    const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
    if (resultsErrors.length > 0) {
        console.error("❌ Results validation failed:");
        for (const e of resultsErrors) {
            console.error(` ERROR: ${e.message}`);
            if (e.path) {
                console.error(` at ${e.path}`);
            }
        }
        console.error("\nRun 'pnpm eval' first to generate results, then 'pnpm calculate-scores'.");
        process.exit(1);
    }
    console.log(`Reading results from: ${baselineResultsPath}`);
    if (source) {
        console.log(`Source: ${sourceName} (${source.baseUrl})`);
    }
    // Load dimension weights from rubrics.yaml
    const rubricConfig = loadRubricTemplates(ROOT);
    const baselineScores = calculateScores(baselineResultsPath, rubricConfig.weights);
    const perModel = calculateScoresPerModel(baselineResultsPath, rubricConfig.weights);
    const urlRefs = aggregateUrlReferences(baselineResultsPath);
    const sourceVerification = buildSourceVerification(ROOT, source);
    let graderCost = extractGraderCost(baselineResultsPath);
    const hasAgenticResults = mode === "full" && existsSync(agenticResultsPath);
    if (mode === "full" && !hasAgenticResults) {
        console.warn(` [warn] EVAL_MODE=full but ${agenticResultsPath} does not exist; reporting baseline results only`);
    }
    let scores;
    let agentBehavior = null;
    let sourceIsolation = null;
    let evaluationMode;
    if (hasAgenticResults) {
        // Full mode: merge baseline floor/ceiling with agentic actual scores
        console.log(`\nReading agentic results from: ${agenticResultsPath}`);
        const agenticScores = scoreAgenticResults(agenticResultsPath, rubricConfig.weights);
        scores = mergeScores(baselineScores, agenticScores);
        evaluationMode = "full";
        // Aggregate agent behavior and source isolation from agentic results
        agentBehavior = aggregateAgentBehavior(agenticResultsPath);
        sourceIsolation = aggregateSourceIsolation(agenticResultsPath);
        // Merge grader costs from both files
        const agenticGraderCost = extractGraderCost(agenticResultsPath);
        if (graderCost && agenticGraderCost) {
            graderCost.cost += agenticGraderCost.cost;
            graderCost.totalTokens += agenticGraderCost.totalTokens;
            graderCost.promptTokens += agenticGraderCost.promptTokens;
            graderCost.completionTokens += agenticGraderCost.completionTokens;
        }
        else if (agenticGraderCost) {
            // Baseline file had no grader cost: keep the agentic one instead
            // of dropping it.
            graderCost = agenticGraderCost;
        }
    }
    else {
        // Every other mode reports straight from the baseline results file;
        // "full" without an agentic file is labelled "baseline".
        scores = baselineScores;
        agentBehavior = aggregateAgentBehavior(baselineResultsPath);
        sourceIsolation = aggregateSourceIsolation(baselineResultsPath);
        evaluationMode = mode === "agentic" || mode === "observed" ? mode : "baseline";
    }
    const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode);
    // Persist
    const outDir = join(ROOT, "results", "latest");
    mkdirSync(outDir, { recursive: true });
    writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
    console.log("Score summary written to results/latest/score-summary.json");
    // Extract and persist grader judgments (Phase 3a: failure mode extraction)
    const judgments = extractGraderJudgments(baselineResultsPath);
    // In full mode, also extract judgments from agentic results
    if (hasAgenticResults) {
        judgments.push(...extractGraderJudgments(agenticResultsPath));
    }
    if (judgments.length > 0) {
        writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
        console.log(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
    }
    // Exit with non-zero if any area below critical threshold
    if (summary.belowCritical.length > 0) {
        process.exit(1);
    }
}
|
|
909
|
-
/**
 * Prints the per-model section of the report to stdout: a summary table
 * (best average score first), a per-feature table for each model, and a
 * cost-per-quality-point list for the models that report a positive cost.
 *
 * A model is shown by its `label`, falling back to `modelId` when the label
 * is empty or missing, in all three parts of the section.
 *
 * @param perModel Entries with `modelId`, optional `label`, `overall`
 *   (avgScore, avgDocLift, testCount, cost) and `scores` (per-feature rows).
 *   The array itself is not reordered.
 */
function printPerModelReport(perModel) {
    const nameOf = (entry) => entry.label || entry.modelId;
    const signed = (value, text) => (value >= 0 ? `+${text}` : text);

    console.log("-".repeat(80));
    console.log("PER-MODEL BREAKDOWN");
    console.log("-".repeat(80));
    console.log();
    // Model summary table
    console.log("| Model | Avg Score | Avg Lift | Tests | Cost |");
    console.log("|--------------------------------|-----------|----------|-------|----------|");
    // Sort a copy so the caller's array keeps its order.
    const sorted = [...perModel].sort((a, b) => b.overall.avgScore - a.overall.avgScore);
    for (const entry of sorted) {
        const { overall } = entry;
        const costStr = overall.cost ? `$${overall.cost.toFixed(4)}` : "—";
        const liftStr = signed(overall.avgDocLift, overall.avgDocLift.toFixed(1));
        console.log(`| ${nameOf(entry).padEnd(30)} | ` +
            `${overall.avgScore.toFixed(1).padStart(9)} | ` +
            `${liftStr.padStart(8)} | ` +
            `${overall.testCount.toString().padStart(5)} | ` +
            `${costStr.padStart(8)} |`);
    }
    console.log();
    // Per-model × per-area breakdown
    for (const entry of sorted) {
        console.log(` ${nameOf(entry)} (${entry.modelId}):`);
        console.log(" | Feature Area | Task | Code | Docs | Total | Lift |");
        console.log(" |---------------------|------|------|------|-------|------|");
        for (const s of entry.scores) {
            const lift = signed(s.docLift, `${s.docLift}`);
            console.log(` | ${s.feature.padEnd(19)} | ` +
                `${s.taskCompletion.toString().padStart(4)} | ` +
                `${s.codeCorrectness.toString().padStart(4)} | ` +
                `${s.docCoverage.toString().padStart(4)} | ` +
                `${s.totalScore.toString().padStart(5)} | ` +
                `${lift.padStart(4)} |`);
        }
        console.log();
    }
    // Cost-per-quality-point
    const modelsWithCost = sorted.filter((e) => e.overall.cost && e.overall.cost > 0);
    if (modelsWithCost.length > 0) {
        console.log(" Cost per quality point:");
        for (const entry of modelsWithCost) {
            const { avgScore, cost } = entry.overall;
            // A non-positive average would make the ratio meaningless; report 0.
            const costPerPoint = avgScore > 0 ? cost / avgScore : 0;
            console.log(` ${nameOf(entry)}: $${costPerPoint.toFixed(6)}/point (score: ${avgScore.toFixed(1)}, cost: $${cost.toFixed(4)})`);
        }
        console.log();
    }
}
|
|
968
|
-
// ---------------------------------------------------------------------------
|
|
969
|
-
// Report
|
|
970
|
-
// ---------------------------------------------------------------------------
|
|
971
|
-
function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode) {
|
|
972
|
-
console.log("\n" + "=".repeat(80));
|
|
973
|
-
console.log(" SANITY AI LITERACY SCORE REPORT");
|
|
974
|
-
console.log("=".repeat(80));
|
|
975
|
-
console.log();
|
|
976
|
-
// Table header
|
|
977
|
-
const h = "| Feature Area | Task | Code | Docs | Total | w/o Docs | Doc Lift |";
|
|
978
|
-
const sep = "|---------------------|------|------|------|-------|----------|----------|";
|
|
979
|
-
console.log(h);
|
|
980
|
-
console.log(sep);
|
|
981
|
-
for (const s of scores) {
|
|
982
|
-
const status = s.totalScore < CRITICAL_THRESHOLD ? "!!" : "ok";
|
|
983
|
-
const lift = s.docLift > 0 ? `+${s.docLift}` : `${s.docLift}`;
|
|
984
|
-
console.log(`| ${status} ${s.feature.padEnd(17)} | ` +
|
|
985
|
-
`${s.taskCompletion.toString().padStart(4)} | ` +
|
|
986
|
-
`${s.codeCorrectness.toString().padStart(4)} | ` +
|
|
987
|
-
`${s.docCoverage.toString().padStart(4)} | ` +
|
|
988
|
-
`${s.totalScore.toString().padStart(5)} | ` +
|
|
989
|
-
`${s.floorScore.toString().padStart(8)} | ` +
|
|
990
|
-
`${lift.padStart(8)} |`);
|
|
991
|
-
}
|
|
992
|
-
console.log();
|
|
993
|
-
// OKR status
|
|
994
|
-
const belowCritical = scores.filter((s) => s.totalScore < CRITICAL_THRESHOLD);
|
|
995
|
-
const lowestScore = scores.reduce((min, s) => s.totalScore < min.totalScore ? s : min);
|
|
996
|
-
const avgScore = scores.reduce((sum, s) => sum + s.totalScore, 0) / scores.length;
|
|
997
|
-
const avgLift = scores.reduce((sum, s) => sum + s.docLift, 0) / scores.length;
|
|
998
|
-
const avgCeilingScore = scores.reduce((sum, s) => sum + s.ceilingScore, 0) / scores.length;
|
|
999
|
-
const avgFloorScore = scores.reduce((sum, s) => sum + s.floorScore, 0) / scores.length;
|
|
1000
|
-
const avgDocQualityGap = scores.reduce((sum, s) => sum + s.docQualityGap, 0) / scores.length;
|
|
1001
|
-
const negativeDocLiftScores = scores.filter((s) => s.negativeDocLift);
|
|
1002
|
-
const negativeDocLiftAreas = negativeDocLiftScores.map((s) => ({
|
|
1003
|
-
area: s.feature,
|
|
1004
|
-
docLift: s.docLift,
|
|
1005
|
-
}));
|
|
1006
|
-
console.log("-".repeat(80));
|
|
1007
|
-
console.log("OKR STATUS");
|
|
1008
|
-
console.log("-".repeat(80));
|
|
1009
|
-
console.log();
|
|
1010
|
-
if (belowCritical.length === 0) {
|
|
1011
|
-
console.log(" KR1: PASS -- All areas above critical threshold (>=40)");
|
|
1012
|
-
}
|
|
1013
|
-
else {
|
|
1014
|
-
console.log(" KR1: FAIL -- Areas below critical threshold:");
|
|
1015
|
-
belowCritical.forEach((s) => console.log(` - ${s.feature}: ${s.totalScore}`));
|
|
1016
|
-
}
|
|
1017
|
-
console.log();
|
|
1018
|
-
console.log(` Lowest area: ${lowestScore.feature} (${lowestScore.totalScore})`);
|
|
1019
|
-
console.log(` Target: +15 points improvement`);
|
|
1020
|
-
console.log();
|
|
1021
|
-
console.log(` Avg score: ${avgScore.toFixed(1)}`);
|
|
1022
|
-
console.log(` Avg doc lift: +${avgLift.toFixed(1)} points`);
|
|
1023
|
-
console.log(` (Doc lift = how much docs help vs parametric knowledge alone)`);
|
|
1024
|
-
console.log();
|
|
1025
|
-
// Ceiling decomposition
|
|
1026
|
-
console.log("-".repeat(80));
|
|
1027
|
-
console.log("CEILING DECOMPOSITION");
|
|
1028
|
-
console.log("-".repeat(80));
|
|
1029
|
-
console.log();
|
|
1030
|
-
const ceilH = "| Feature Area | Floor | Ceiling | Doc Lift | Quality Gap |";
|
|
1031
|
-
const ceilSep = "|---------------------|-------|---------|----------|-------------|";
|
|
1032
|
-
console.log(ceilH);
|
|
1033
|
-
console.log(ceilSep);
|
|
1034
|
-
for (const s of scores) {
|
|
1035
|
-
const liftStr = s.docLift >= 0 ? `+${s.docLift}` : `${s.docLift}`;
|
|
1036
|
-
const liftFlag = s.negativeDocLift ? " 🚨" : "";
|
|
1037
|
-
console.log(`| ${s.feature.padEnd(19)} | ` +
|
|
1038
|
-
`${s.floorScore.toString().padStart(5)} | ` +
|
|
1039
|
-
`${s.ceilingScore.toString().padStart(7)} | ` +
|
|
1040
|
-
`${liftStr.padStart(8)}${liftFlag} | ` +
|
|
1041
|
-
`${s.docQualityGap.toString().padStart(11)} |`);
|
|
1042
|
-
}
|
|
1043
|
-
console.log();
|
|
1044
|
-
if (negativeDocLiftAreas.length > 0) {
|
|
1045
|
-
console.log(" 🚨 NEGATIVE DOC LIFT DETECTED:");
|
|
1046
|
-
for (const { area, docLift } of negativeDocLiftAreas) {
|
|
1047
|
-
const s = scores.find((sc) => sc.feature === area);
|
|
1048
|
-
console.log(` ${area}: Doc Lift = ${docLift} (floor: ${s.floorScore}, ceiling: ${s.ceilingScore})`);
|
|
1049
|
-
}
|
|
1050
|
-
console.log(" Documentation is HURTING model performance for these areas.");
|
|
1051
|
-
console.log(" See docs/design-docs/scenario-matrix/evaluation-ceiling.md");
|
|
1052
|
-
console.log();
|
|
1053
|
-
}
|
|
1054
|
-
else {
|
|
1055
|
-
console.log(" ✅ No areas with negative Doc Lift detected.");
|
|
1056
|
-
console.log();
|
|
1057
|
-
}
|
|
1058
|
-
// Three-layer decomposition (only when actual scores are present)
|
|
1059
|
-
const hasActualScores = scores.some((s) => s.actualScore !== undefined);
|
|
1060
|
-
if (hasActualScores) {
|
|
1061
|
-
console.log("-".repeat(80));
|
|
1062
|
-
console.log("THREE-LAYER DECOMPOSITION (floor → ceiling → actual)");
|
|
1063
|
-
console.log("-".repeat(80));
|
|
1064
|
-
console.log();
|
|
1065
|
-
const decompH = "| Feature Area | Floor | Ceiling | Actual | Doc Lift | Ret. Gap | Infra % |";
|
|
1066
|
-
const decompSep = "|---------------------|-------|---------|--------|----------|----------|---------|";
|
|
1067
|
-
console.log(decompH);
|
|
1068
|
-
console.log(decompSep);
|
|
1069
|
-
for (const s of scores) {
|
|
1070
|
-
const liftStr = s.docLift >= 0 ? `+${s.docLift}` : `${s.docLift}`;
|
|
1071
|
-
const actualStr = s.actualScore !== undefined ? s.actualScore.toString() : "—";
|
|
1072
|
-
const gapStr = s.retrievalGap !== undefined
|
|
1073
|
-
? s.retrievalGap >= 0
|
|
1074
|
-
? `+${s.retrievalGap}`
|
|
1075
|
-
: `${s.retrievalGap}`
|
|
1076
|
-
: "—";
|
|
1077
|
-
const infraStr = s.infrastructureEfficiency != null
|
|
1078
|
-
? `${Math.round(s.infrastructureEfficiency * 100)}%`
|
|
1079
|
-
: "—";
|
|
1080
|
-
const flag = s.invertedRetrievalGap ? " 🔄" : "";
|
|
1081
|
-
console.log(`| ${s.feature.padEnd(19)} | ` +
|
|
1082
|
-
`${s.floorScore.toString().padStart(5)} | ` +
|
|
1083
|
-
`${s.ceilingScore.toString().padStart(7)} | ` +
|
|
1084
|
-
`${actualStr.padStart(6)} | ` +
|
|
1085
|
-
`${liftStr.padStart(8)} | ` +
|
|
1086
|
-
`${(gapStr + flag).padStart(8)} | ` +
|
|
1087
|
-
`${infraStr.padStart(7)} |`);
|
|
1088
|
-
}
|
|
1089
|
-
console.log();
|
|
1090
|
-
console.log(" Doc Lift = ceiling − floor | Ret. Gap = ceiling − actual | Infra = actual / ceiling");
|
|
1091
|
-
console.log(" 🔄 = inverted retrieval gap (agents avoid bad docs → higher actual than ceiling)");
|
|
1092
|
-
console.log();
|
|
1093
|
-
}
|
|
1094
|
-
// Cost summary
|
|
1095
|
-
const totalCost = scores.reduce((sum, s) => sum + s.totalCost, 0);
|
|
1096
|
-
const totalTests = scores.reduce((sum, s) => sum + s.testCount, 0);
|
|
1097
|
-
const graderCostTotal = graderCost?.cost ?? 0;
|
|
1098
|
-
const combinedCost = totalCost + graderCostTotal;
|
|
1099
|
-
if (totalCost > 0 || graderCostTotal > 0) {
|
|
1100
|
-
console.log("-".repeat(80));
|
|
1101
|
-
console.log("COST SUMMARY");
|
|
1102
|
-
console.log("-".repeat(80));
|
|
1103
|
-
console.log();
|
|
1104
|
-
console.log(` Provider cost: $${totalCost.toFixed(4)}`);
|
|
1105
|
-
if (graderCostTotal > 0) {
|
|
1106
|
-
const graderLabel = graderCost?.model ?? "unknown";
|
|
1107
|
-
console.log(` Grader cost: $${graderCostTotal.toFixed(4)} (${graderLabel}, ${(graderCost?.totalTokens ?? 0).toLocaleString()} tokens)`);
|
|
1108
|
-
}
|
|
1109
|
-
console.log(` Total cost: $${combinedCost.toFixed(4)}`);
|
|
1110
|
-
console.log(` Avg cost per test: $${(combinedCost / (totalTests || 1)).toFixed(4)}`);
|
|
1111
|
-
console.log();
|
|
1112
|
-
const costHeader = "| Feature Area | Tests | Cost | Avg/Test |";
|
|
1113
|
-
const costSep = "|---------------------|-------|----------|----------|";
|
|
1114
|
-
console.log(costHeader);
|
|
1115
|
-
console.log(costSep);
|
|
1116
|
-
for (const s of scores) {
|
|
1117
|
-
const avgCost = s.testCount > 0 ? s.totalCost / s.testCount : 0;
|
|
1118
|
-
console.log(`| ${s.feature.padEnd(19)} | ` +
|
|
1119
|
-
`${s.testCount.toString().padStart(5)} | ` +
|
|
1120
|
-
`$${s.totalCost.toFixed(4).padStart(7)} | ` +
|
|
1121
|
-
`$${avgCost.toFixed(4).padStart(7)} |`);
|
|
1122
|
-
}
|
|
1123
|
-
console.log();
|
|
1124
|
-
}
|
|
1125
|
-
// Per-model breakdown
|
|
1126
|
-
if (perModel) {
|
|
1127
|
-
printPerModelReport(perModel);
|
|
1128
|
-
}
|
|
1129
|
-
// URL References
|
|
1130
|
-
printUrlReport(urlRefs);
|
|
1131
|
-
// Agent Behavior (only present when run with instrumented provider)
|
|
1132
|
-
if (agentBehavior && agentBehavior.length > 0) {
|
|
1133
|
-
printAgentBehaviorReport(agentBehavior);
|
|
1134
|
-
}
|
|
1135
|
-
// Source verification (unified report for all modes)
|
|
1136
|
-
if (sourceVerification || sourceIsolation) {
|
|
1137
|
-
console.log("-".repeat(80));
|
|
1138
|
-
console.log("📋 SOURCE VERIFICATION");
|
|
1139
|
-
console.log("-".repeat(80));
|
|
1140
|
-
if (sourceVerification) {
|
|
1141
|
-
console.log(` Source: ${sourceVerification.source}`);
|
|
1142
|
-
console.log(` Mode: ${sourceVerification.mode}`);
|
|
1143
|
-
if (sourceVerification.allowedOrigins) {
|
|
1144
|
-
console.log(` Sandbox: ${sourceVerification.allowedOrigins.join(", ")}`);
|
|
1145
|
-
}
|
|
1146
|
-
if (sourceVerification.searchMode) {
|
|
1147
|
-
console.log(` Search: ${sourceVerification.searchMode}`);
|
|
1148
|
-
}
|
|
1149
|
-
// URL fetch results (baseline mode with direct URLs)
|
|
1150
|
-
if (sourceVerification.urlFetch) {
|
|
1151
|
-
const uf = sourceVerification.urlFetch;
|
|
1152
|
-
console.log();
|
|
1153
|
-
console.log(` URL fetch: ${uf.totalFetched} fetched, ${uf.totalFailed} failed`);
|
|
1154
|
-
for (const f of uf.fetchedUrls) {
|
|
1155
|
-
console.log(` ✅ ${f.url} (via ${f.method})`);
|
|
1156
|
-
}
|
|
1157
|
-
for (const f of uf.failures) {
|
|
1158
|
-
// oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means no error info
|
|
1159
|
-
console.log(` ⚠️ ${f.url}: ${f.error || "unknown error"}`);
|
|
1160
|
-
}
|
|
1161
|
-
}
|
|
1162
|
-
}
|
|
1163
|
-
// Agentic isolation score
|
|
1164
|
-
if (sourceIsolation) {
|
|
1165
|
-
const pct = Math.round(sourceIsolation.isolationScore * 100);
|
|
1166
|
-
const icon = sourceIsolation.offOrigin === 0 ? "✅" : "⚠️";
|
|
1167
|
-
console.log();
|
|
1168
|
-
console.log(` Agent isolation: ${icon} ${pct}% (${sourceIsolation.onOrigin}/${sourceIsolation.total} on-origin)`);
|
|
1169
|
-
if (sourceIsolation.offOrigin > 0) {
|
|
1170
|
-
console.log(` Off-origin fetches: ${sourceIsolation.offOrigin}`);
|
|
1171
|
-
for (const url of sourceIsolation.offOriginUrls.slice(0, 10)) {
|
|
1172
|
-
console.log(` • ${url}`);
|
|
1173
|
-
}
|
|
1174
|
-
}
|
|
1175
|
-
if (Object.keys(sourceIsolation.originBreakdown).length > 0) {
|
|
1176
|
-
console.log(" Origin breakdown:");
|
|
1177
|
-
for (const [origin, count] of Object.entries(sourceIsolation.originBreakdown).sort((a, b) => b[1] - a[1])) {
|
|
1178
|
-
console.log(` ${origin}: ${count}`);
|
|
1179
|
-
}
|
|
1180
|
-
}
|
|
1181
|
-
}
|
|
1182
|
-
console.log();
|
|
1183
|
-
}
|
|
1184
|
-
// Build overall agent behavior stats for summary
|
|
1185
|
-
const overallAgentBehavior = agentBehavior && agentBehavior.length > 0
|
|
1186
|
-
? {
|
|
1187
|
-
avgDocPagesVisited: agentBehavior.reduce((s, ab) => s + ab.avgDocPagesVisited, 0) /
|
|
1188
|
-
agentBehavior.length,
|
|
1189
|
-
avgNetworkTimeMs: agentBehavior.reduce((s, ab) => s + ab.avgNetworkTimeMs, 0) /
|
|
1190
|
-
agentBehavior.length,
|
|
1191
|
-
avgSearchesPerformed: agentBehavior.reduce((s, ab) => s + ab.avgSearchesPerformed, 0) /
|
|
1192
|
-
agentBehavior.length,
|
|
1193
|
-
testsWithBehaviorData: agentBehavior.reduce((s, ab) => s + ab.tasksWithBehaviorData, 0),
|
|
1194
|
-
totalUniqueDocSlugs: [
|
|
1195
|
-
...new Set(agentBehavior.flatMap((ab) => ab.docSlugsVisited)),
|
|
1196
|
-
].length,
|
|
1197
|
-
totalUniqueSearchQueries: [
|
|
1198
|
-
...new Set(agentBehavior.flatMap((ab) => ab.searchQueries)),
|
|
1199
|
-
].length,
|
|
1200
|
-
}
|
|
1201
|
-
: undefined;
|
|
1202
|
-
// Compute aggregate metrics from actual scores (when agentic data present)
|
|
1203
|
-
const scoresWithActual = scores.filter((s) => s.actualScore !== undefined);
|
|
1204
|
-
const avgActualScore = scoresWithActual.length > 0
|
|
1205
|
-
? scoresWithActual.reduce((sum, s) => sum + (s.actualScore ?? 0), 0) /
|
|
1206
|
-
scoresWithActual.length
|
|
1207
|
-
: undefined;
|
|
1208
|
-
const scoresWithGap = scores.filter((s) => s.retrievalGap !== undefined);
|
|
1209
|
-
const avgRetrievalGap = scoresWithGap.length > 0
|
|
1210
|
-
? scoresWithGap.reduce((sum, s) => sum + (s.retrievalGap ?? 0), 0) /
|
|
1211
|
-
scoresWithGap.length
|
|
1212
|
-
: undefined;
|
|
1213
|
-
const scoresWithInfra = scores.filter((s) => s.infrastructureEfficiency != null);
|
|
1214
|
-
const avgInfrastructureEfficiency = scoresWithInfra.length > 0
|
|
1215
|
-
? scoresWithInfra.reduce((sum, s) => sum + (s.infrastructureEfficiency ?? 0), 0) / scoresWithInfra.length
|
|
1216
|
-
: undefined;
|
|
1217
|
-
return {
|
|
1218
|
-
agentBehavior: agentBehavior ?? undefined,
|
|
1219
|
-
belowCritical: belowCritical.map((s) => s.feature),
|
|
1220
|
-
...(evaluationMode && { evaluationMode }),
|
|
1221
|
-
lowestArea: lowestScore.feature,
|
|
1222
|
-
lowestScore: lowestScore.totalScore,
|
|
1223
|
-
...(negativeDocLiftAreas.length > 0 && { negativeDocLiftAreas }),
|
|
1224
|
-
overall: {
|
|
1225
|
-
agentBehavior: overallAgentBehavior,
|
|
1226
|
-
...(avgActualScore !== undefined && { avgActualScore }),
|
|
1227
|
-
avgCeilingScore: avgCeilingScore,
|
|
1228
|
-
avgDocLift: avgLift,
|
|
1229
|
-
avgDocQualityGap: avgDocQualityGap,
|
|
1230
|
-
avgFloorScore: avgFloorScore,
|
|
1231
|
-
...(avgInfrastructureEfficiency !== undefined && {
|
|
1232
|
-
avgInfrastructureEfficiency,
|
|
1233
|
-
}),
|
|
1234
|
-
...(avgRetrievalGap !== undefined && { avgRetrievalGap }),
|
|
1235
|
-
avgScore,
|
|
1236
|
-
cost: totalCost > 0 || graderCostTotal > 0
|
|
1237
|
-
? {
|
|
1238
|
-
graderModel: graderCost?.model,
|
|
1239
|
-
graderTotal: graderCostTotal,
|
|
1240
|
-
perTest: combinedCost / (totalTests || 1),
|
|
1241
|
-
total: combinedCost,
|
|
1242
|
-
totalTokens: graderCost?.totalTokens ?? 0,
|
|
1243
|
-
}
|
|
1244
|
-
: undefined,
|
|
1245
|
-
negativeDocLiftCount: negativeDocLiftAreas.length,
|
|
1246
|
-
},
|
|
1247
|
-
scores,
|
|
1248
|
-
source: source
|
|
1249
|
-
? {
|
|
1250
|
-
baseUrl: source.baseUrl,
|
|
1251
|
-
dataset: source.dataset,
|
|
1252
|
-
name: source.name ?? "default",
|
|
1253
|
-
perspective: source.perspective,
|
|
1254
|
-
projectId: source.projectId,
|
|
1255
|
-
}
|
|
1256
|
-
: undefined,
|
|
1257
|
-
...(perModel && { perModel }),
|
|
1258
|
-
...(sourceIsolation && { sourceIsolation }),
|
|
1259
|
-
...(sourceVerification && { sourceVerification }),
|
|
1260
|
-
timestamp: new Date().toISOString(),
|
|
1261
|
-
urlReferences: urlRefs,
|
|
1262
|
-
};
|
|
1263
|
-
}
|
|
1264
|
-
/**
 * Print the "URL REFERENCES" section of the report to stdout.
 *
 * For every entry, the URLs recorded in `gold.urls` and `baseline.urls`
 * (maps of URL -> count) are listed with the highest count first. A count
 * above 1 is shown as "(N tests)"; baseline URLs are additionally tagged
 * "[parametric]". Entries with no URLs in either map get a single
 * "no URLs referenced" line.
 *
 * A nullish `urlRefs`, or an entry missing `gold`, `baseline`, or their
 * `urls` map, is treated as empty instead of throwing.
 *
 * @param {Array<{feature: string, gold?: {urls?: Object<string, number>}, baseline?: {urls?: Object<string, number>}}>} urlRefs
 *   Per-feature URL reference counts.
 * @returns {void}
 */
function printUrlReport(urlRefs) {
    // [url, count] pairs ordered by descending count; a missing map is empty.
    const byCountDesc = (urls) => Object.entries(urls ?? {}).sort((a, b) => b[1] - a[1]);
    // Shared renderer for the gold and baseline groups; prints nothing when empty.
    const printGroup = (feature, label, entries, tag) => {
        if (entries.length === 0) {
            return;
        }
        console.log(` ${feature} (${label}):`);
        for (const [url, count] of entries) {
            const suffix = count > 1 ? ` (${count} tests)` : "";
            console.log(` ${url}${suffix}${tag}`);
        }
    };
    console.log("-".repeat(80));
    console.log("URL REFERENCES");
    console.log("-".repeat(80));
    console.log();
    for (const ref of urlRefs ?? []) {
        const goldUrls = byCountDesc(ref.gold?.urls);
        const baselineUrls = byCountDesc(ref.baseline?.urls);
        printGroup(ref.feature, "gold", goldUrls, "");
        printGroup(ref.feature, "baseline", baselineUrls, " [parametric]");
        if (goldUrls.length === 0 && baselineUrls.length === 0) {
            console.log(` ${ref.feature}: no URLs referenced`);
        }
        // Blank line between features.
        console.log();
    }
}
|
|
1292
|
-
// Entry-point guard: call main() only when the script path in process.argv[1]
// ends with this module's .ts or .js file name, so that importing the module
// (e.g. from tests) does not trigger a run.
if (["calculate-scores.ts", "calculate-scores.js"].some((scriptName) => process.argv[1]?.endsWith(scriptName))) {
    main();
}
|