@sanity/ailf 2.0.2 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
- package/dist/_vendor/ailf-core/examples/index.js +66 -1
- package/dist/agent-harness/assertions-runtime.d.ts +49 -0
- package/dist/agent-harness/assertions-runtime.js +138 -0
- package/dist/agent-harness/provider.d.ts +58 -0
- package/dist/agent-harness/provider.js +104 -0
- package/dist/cli.js +0 -0
- package/dist/commands/init.js +3 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
- package/dist/orchestration/steps/generate-configs-step.js +35 -2
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +39 -25
- package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +28 -85
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
- package/package.json +25 -24
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
- package/dist/adapters/task-sources/yaml-task-source.js +0 -139
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
- package/dist/commands/update-quality-scores.d.ts +0 -5
- package/dist/commands/update-quality-scores.js +0 -20
- package/dist/lib/agent-behavior-report.d.ts +0 -8
- package/dist/lib/agent-behavior-report.js +0 -185
- package/dist/lib/baseline.d.ts +0 -19
- package/dist/lib/baseline.js +0 -153
- package/dist/lib/calculate-scores.d.ts +0 -23
- package/dist/lib/calculate-scores.js +0 -42
- package/dist/lib/compare.d.ts +0 -18
- package/dist/lib/compare.js +0 -170
- package/dist/lib/coverage-audit.d.ts +0 -4
- package/dist/lib/coverage-audit.js +0 -42
- package/dist/lib/discovery-report.d.ts +0 -13
- package/dist/lib/discovery-report.js +0 -57
- package/dist/lib/fetch-docs.d.ts +0 -30
- package/dist/lib/fetch-docs.js +0 -171
- package/dist/lib/generate-configs.d.ts +0 -25
- package/dist/lib/generate-configs.js +0 -42
- package/dist/lib/grader-api.d.ts +0 -21
- package/dist/lib/grader-api.js +0 -34
- package/dist/lib/grader-compare.d.ts +0 -19
- package/dist/lib/grader-compare.js +0 -91
- package/dist/lib/grader-consistency.d.ts +0 -27
- package/dist/lib/grader-consistency.js +0 -79
- package/dist/lib/grader-sensitivity.d.ts +0 -19
- package/dist/lib/grader-sensitivity.js +0 -75
- package/dist/lib/grader-validate.d.ts +0 -19
- package/dist/lib/grader-validate.js +0 -78
- package/dist/lib/measure-retrieval.d.ts +0 -14
- package/dist/lib/measure-retrieval.js +0 -71
- package/dist/lib/pr-comment.d.ts +0 -16
- package/dist/lib/pr-comment.js +0 -28
- package/dist/lib/readiness-report.d.ts +0 -13
- package/dist/lib/readiness-report.js +0 -108
- package/dist/lib/webhook-server.d.ts +0 -11
- package/dist/lib/webhook-server.js +0 -24
- package/dist/lib/weekly-digest.d.ts +0 -24
- package/dist/lib/weekly-digest.js +0 -148
- package/dist/orchestration/env-bridge.d.ts +0 -21
- package/dist/orchestration/env-bridge.js +0 -66
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
- package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
- package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
- package/dist/pipeline/compiler/task-bridge.js +0 -92
- package/dist/pipeline/expand-tasks.d.ts +0 -232
- package/dist/pipeline/expand-tasks.js +0 -467
- package/dist/pipeline/generate-configs.d.ts +0 -92
- package/dist/pipeline/generate-configs.js +0 -445
- package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/calculate-scores-step.js +0 -89
- package/dist/pipeline/steps/compare-step.d.ts +0 -18
- package/dist/pipeline/steps/compare-step.js +0 -90
- package/dist/pipeline/steps/eval-step.d.ts +0 -53
- package/dist/pipeline/steps/eval-step.js +0 -347
- package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
- package/dist/pipeline/steps/fetch-docs-step.js +0 -84
- package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
- package/dist/pipeline/steps/generate-configs-step.js +0 -98
- package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
- package/dist/pipeline/steps/grader-consistency-step.js +0 -74
- package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
- package/dist/pipeline/steps/publish-report-step.js +0 -243
- package/dist/pipeline/steps/report-step.d.ts +0 -13
- package/dist/pipeline/steps/report-step.js +0 -56
- package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/update-scores-step.js +0 -42
- package/dist/scripts/agent-behavior-report.d.ts +0 -19
- package/dist/scripts/agent-behavior-report.js +0 -315
- package/dist/scripts/baseline.d.ts +0 -43
- package/dist/scripts/baseline.js +0 -267
- package/dist/scripts/calculate-scores.d.ts +0 -166
- package/dist/scripts/calculate-scores.js +0 -1296
- package/dist/scripts/compare.d.ts +0 -22
- package/dist/scripts/compare.js +0 -334
- package/dist/scripts/coverage-audit.d.ts +0 -44
- package/dist/scripts/coverage-audit.js +0 -209
- package/dist/scripts/debug-eval.d.ts +0 -19
- package/dist/scripts/debug-eval.js +0 -73
- package/dist/scripts/discovery-report.d.ts +0 -58
- package/dist/scripts/discovery-report.js +0 -250
- package/dist/scripts/fetch-docs.d.ts +0 -35
- package/dist/scripts/fetch-docs.js +0 -472
- package/dist/scripts/generate-configs.d.ts +0 -66
- package/dist/scripts/generate-configs.js +0 -459
- package/dist/scripts/grader-api.d.ts +0 -27
- package/dist/scripts/grader-api.js +0 -206
- package/dist/scripts/grader-compare.d.ts +0 -22
- package/dist/scripts/grader-compare.js +0 -368
- package/dist/scripts/grader-consistency.d.ts +0 -20
- package/dist/scripts/grader-consistency.js +0 -313
- package/dist/scripts/grader-sensitivity.d.ts +0 -22
- package/dist/scripts/grader-sensitivity.js +0 -354
- package/dist/scripts/grader-validate.d.ts +0 -19
- package/dist/scripts/grader-validate.js +0 -267
- package/dist/scripts/measure-retrieval.d.ts +0 -10
- package/dist/scripts/measure-retrieval.js +0 -145
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
- package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
- package/dist/scripts/pipeline.d.ts +0 -76
- package/dist/scripts/pipeline.js +0 -1031
- package/dist/scripts/pr-comment.d.ts +0 -10
- package/dist/scripts/pr-comment.js +0 -510
- package/dist/scripts/readiness-report.d.ts +0 -88
- package/dist/scripts/readiness-report.js +0 -342
- package/dist/scripts/update-quality-scores.d.ts +0 -15
- package/dist/scripts/update-quality-scores.js +0 -184
- package/dist/scripts/validate-task-sources.d.ts +0 -21
- package/dist/scripts/validate-task-sources.js +0 -210
- package/dist/scripts/validate.d.ts +0 -13
- package/dist/scripts/validate.js +0 -79
- package/dist/scripts/webhook-server.d.ts +0 -26
- package/dist/scripts/webhook-server.js +0 -147
- package/dist/scripts/weekly-digest.d.ts +0 -24
- package/dist/scripts/weekly-digest.js +0 -144
- package/dist/sinks/format-slack.d.ts +0 -64
- package/dist/sinks/format-slack.js +0 -306
- package/dist/sinks/slack-sink.d.ts +0 -27
- package/dist/sinks/slack-sink.js +0 -78
- package/dist/sinks/webhook-sink.d.ts +0 -19
- package/dist/sinks/webhook-sink.js +0 -50
- package/tasks/.expanded.agentic.yaml +0 -280
- package/tasks/.expanded.yaml +0 -565
|
@@ -1,313 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* grader-consistency.ts
|
|
3
|
-
*
|
|
4
|
-
* CLI script for measuring grader consistency (Phase 1 of grader reliability).
|
|
5
|
-
*
|
|
6
|
-
* Reads existing eval results, re-runs ONLY the grading assertions N additional
|
|
7
|
-
* times with the configured grader model, and analyzes score variance.
|
|
8
|
-
*
|
|
9
|
-
* This does NOT re-run the models under test — it only re-grades the same
|
|
10
|
-
* responses. Cost is low: ~$0.005 per grading call × N replications.
|
|
11
|
-
*
|
|
12
|
-
* Usage:
|
|
13
|
-
* pnpm grader-consistency # 5 replications (default)
|
|
14
|
-
* pnpm grader-consistency --replications 3 # custom count
|
|
15
|
-
* pnpm grader-consistency --results <path> # custom results file
|
|
16
|
-
*
|
|
17
|
-
* Reads: results/latest/eval-results.json (default)
|
|
18
|
-
* Writes: results/latest/grader-consistency.json
|
|
19
|
-
*/
|
|
20
|
-
// oxlint-disable-next-line import/no-unassigned-import -- side-effect: loads .env into process.env
|
|
21
|
-
import "dotenv/config";
|
|
22
|
-
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
23
|
-
import { dirname, join, resolve } from "path";
|
|
24
|
-
import { fileURLToPath } from "url";
|
|
25
|
-
import { analyzeConsistency, } from "../pipeline/grader-consistency.js";
|
|
26
|
-
import { gradeOnce } from "./grader-api.js";
|
|
27
|
-
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
28
|
-
const ROOT = resolve(__dirname, "..", "..");
|
|
29
|
-
// ---------------------------------------------------------------------------
|
|
30
|
-
// CLI argument parsing
|
|
31
|
-
// ---------------------------------------------------------------------------
|
|
32
|
-
const args = process.argv.slice(2);
|
|
33
|
-
/**
 * Report whether a boolean CLI flag was passed.
 *
 * Every name matches its long form (`--name`). Single-character names also
 * match the short form (`-n`), so `getFlag("h")` honors the `-h` spelling
 * advertised in the help text instead of only the unusual `--h`.
 *
 * @param {string} name - Flag name without leading dashes.
 * @returns {boolean} True when the flag is present in `args`.
 */
function getFlag(name) {
  if (args.includes(`--${name}`)) {
    return true;
  }
  // Short flags are only meaningful for one-letter names.
  return name.length === 1 && args.includes(`-${name}`);
}
|
|
36
|
-
/**
 * Look up the value that follows a `--name` option on the command line.
 *
 * @param {string} name - Option name without the leading dashes.
 * @returns {string | undefined} The argument right after the first `--name`,
 *   or `undefined` when the option is absent or is the last argument.
 */
function getOption(name) {
  const position = args.indexOf(`--${name}`);
  if (position === -1) {
    return undefined;
  }
  const valuePosition = position + 1;
  if (valuePosition >= args.length) {
    return undefined;
  }
  return args[valuePosition];
}
|
|
40
|
-
// Parsed CLI options. `replications` is NaN when --replications is not numeric.
const replicationsStr = getOption("replications") ?? "5";
const replications = Number.parseInt(replicationsStr, 10);
const resultsPath =
  getOption("results") ?? join(ROOT, "results", "latest", "eval-results.json");
const showHelp = getFlag("help") || getFlag("h");
if (showHelp) {
  // Leading and trailing empty entries reproduce the newlines that wrap the help text.
  const helpLines = [
    "",
    "Usage: pnpm grader-consistency [options]",
    "",
    "Measure grader consistency by re-grading existing eval responses N times.",
    "",
    "Options:",
    "--replications <n> Number of additional grading replications (default: 5)",
    "--results <path> Path to eval-results.json (default: results/latest/eval-results.json)",
    "--help, -h Show this help",
    "",
    "Examples:",
    "pnpm grader-consistency # 5 replications",
    "pnpm grader-consistency --replications 3 # fewer replications (faster)",
    "pnpm grader-consistency --results results/latest/eval-results-agentic.json",
    "",
  ];
  console.log(helpLines.join("\n"));
  process.exit(0);
}
|
|
62
|
-
// RawPromptfooFile and RawTestResult imported from calculate-scores.ts
|
|
63
|
-
// ---------------------------------------------------------------------------
|
|
64
|
-
// Rubric dimension classification (similar to calculate-scores)
|
|
65
|
-
// ---------------------------------------------------------------------------
|
|
66
|
-
/**
 * Map a graded assertion component to one of the rubric dimensions.
 *
 * Structured metadata wins: when `assertion.metadata.dimension` is set it is
 * translated directly, and an unrecognized value yields `null` without
 * falling back to the text heuristic. Without metadata, the rubric text is
 * searched (case-insensitively) for well-known phrases.
 *
 * @param {object} component - Component result; only `component.assertion` is read.
 * @returns {"taskCompletion" | "codeCorrectness" | "docCoverage" | null}
 *   The dimension key, or `null` when the component cannot be classified.
 */
function classifyDimension(component) {
  const assertion = component.assertion;
  // Prefer structured metadata
  const declared = assertion?.metadata?.dimension;
  if (declared) {
    switch (declared) {
      case "code-correctness":
        return "codeCorrectness";
      case "doc-coverage":
        return "docCoverage";
      case "task-completion":
        return "taskCompletion";
      default:
        return null;
    }
  }
  // Fallback: heuristic name matching. The rubric value is not guaranteed to
  // be a string, so bail out instead of calling toLowerCase() on an object
  // or number.
  const rawValue = assertion?.value;
  if (typeof rawValue !== "string") {
    return null;
  }
  const value = rawValue.toLowerCase();
  if (value.includes("task completion")) {
    return "taskCompletion";
  }
  if (value.includes("code correctness")) {
    return "codeCorrectness";
  }
  if (value.includes("documentation coverage") || value.includes("hallucinate")) {
    return "docCoverage";
  }
  return null;
}
|
|
91
|
-
// ---------------------------------------------------------------------------
|
|
92
|
-
// Grading judgment extraction
|
|
93
|
-
// ---------------------------------------------------------------------------
|
|
94
|
-
/**
 * Guess the feature area of a test from keywords in its description.
 *
 * Rules are evaluated in order and the first match wins, so a description
 * mentioning both "function" and "groq" is classified as "functions".
 *
 * @param {string} description - Test case description.
 * @returns {string} One of "studio-setup", "visual-editing", "functions",
 *   "groq", "nextjs-live", "frameworks", or "other" when nothing matches.
 */
function detectFeatureArea(description) {
  const text = description.toLowerCase();
  const mentions = (...needles) => needles.some((needle) => text.includes(needle));
  // Ordered [area, predicate] pairs; order is significant.
  const rules = [
    ["studio-setup", () => mentions("studio")],
    ["visual-editing", () => mentions("visual", "presentation", "live preview")],
    ["functions", () => mentions("function", "webhook")],
    ["groq", () => text.startsWith("groq")],
    ["nextjs-live", () => mentions("next", "app router")],
    ["frameworks", () => mentions("remix", "nuxt", "svelte")],
  ];
  const match = rules.find(([, applies]) => applies());
  return match ? match[0] : "other";
}
|
|
114
|
-
/**
 * Collect every classifiable llm-rubric judgment from an eval results file.
 *
 * Results without a grading result, and results whose `vars.docs` is missing
 * or blank, are skipped: only gold (with-docs) tests are analyzed. Within a
 * result, components that are not `llm-rubric`, cannot be mapped to a
 * dimension, or carry no string rubric text are skipped as well.
 *
 * @param {object} file - Parsed eval results; `file.results.results` is iterated.
 * @returns {Array<object>} One entry per judgment with `area`, `description`,
 *   `dimension`, `originalScore` (0 when the stored score is not a number),
 *   `providerId`, `responseText` and `rubricText`.
 */
function extractGradingJudgments(file) {
  const collected = [];
  for (const entry of file.results.results) {
    const grading = entry.gradingResult;
    if (!grading) {
      continue;
    }
    const description = entry.testCase?.description ?? "unknown";
    // Only grade "gold" (with-docs) tests — baseline tests have abbreviated rubrics
    const docs = entry.vars?.docs;
    const hasDocs = docs && docs.trim().length > 0;
    if (!hasDocs) {
      continue;
    }
    const area = detectFeatureArea(description);
    const providerId = entry.provider?.id;
    for (const component of grading.componentResults) {
      const assertion = component.assertion;
      if (assertion?.type !== "llm-rubric") {
        continue;
      }
      const dimension = classifyDimension(component);
      if (!dimension) {
        continue;
      }
      const rubricText = typeof assertion.value === "string" ? assertion.value : "";
      if (!rubricText) {
        continue;
      }
      const originalScore = typeof component.score === "number" ? component.score : 0;
      const responseText = entry.response?.output ?? "";
      collected.push({
        area,
        description,
        dimension,
        originalScore,
        providerId,
        responseText,
        rubricText,
      });
    }
  }
  return collected;
}
|
|
152
|
-
// ---------------------------------------------------------------------------
|
|
153
|
-
// Main
|
|
154
|
-
// ---------------------------------------------------------------------------
|
|
155
|
-
/**
 * CLI entry point: re-grade each extracted judgment `replications` times and
 * report how much the grader's scores vary.
 *
 * Steps: validate the inputs, load the eval results file, read the grader
 * model from its config, extract the judgments, call `gradeOnce` for every
 * judgment/replication pair (sequentially), hand the collected scores to
 * `analyzeConsistency`, print the report, and write the analysis to
 * `results/latest/grader-consistency.json` under ROOT.
 *
 * Exits the process with code 1 when the results file is missing, the
 * replication count is not an integer of at least 2, the grader model cannot
 * be determined, or no judgments are found.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log("=== Grader Consistency Analysis ===\n");
  // Validate inputs
  if (!existsSync(resultsPath)) {
    console.error(`❌ Results file not found: ${resultsPath}`);
    console.error("Run 'pnpm eval' first to generate results.");
    process.exit(1);
  }
  // parseInt yields NaN for non-numeric input and `NaN < 2` is false, so the
  // range check alone would let an invalid --replications value through and
  // produce a report based on zero re-gradings.
  if (!Number.isInteger(replications) || replications < 2) {
    console.error("❌ Need at least 2 replications for meaningful analysis.");
    process.exit(1);
  }
  // Load eval results
  console.log(` Results: ${resultsPath}`);
  console.log(` Replications: ${replications}`);
  const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
  // Extract grader model
  const graderModel =
    file.config?.defaultTest?.options?.rubricProvider ??
    file.config?.defaultTest?.options?.provider;
  if (!graderModel) {
    console.error("❌ Could not determine grader model from eval results config.");
    process.exit(1);
  }
  console.log(` Grader: ${graderModel}`);
  // Extract judgments
  const judgments = extractGradingJudgments(file);
  console.log(` Judgments: ${judgments.length} (gold tests × rubric dimensions)`);
  if (judgments.length === 0) {
    console.error("❌ No gradable judgments found in results.");
    process.exit(1);
  }
  const totalCalls = judgments.length * replications;
  const estimatedCost = totalCalls * 0.005;
  console.log(` API calls: ${totalCalls} (${judgments.length} × ${replications})`);
  console.log(` Est. cost: ~$${estimatedCost.toFixed(2)}`);
  console.log();
  // Re-grade each judgment N times
  console.log(` Running ${replications} replications per judgment...`);
  const gradings = [];
  let completed = 0;
  let failed = 0;
  for (const judgment of judgments) {
    const scores = [judgment.originalScore]; // Include the original as replication #0
    for (let i = 0; i < replications; i++) {
      const score = await gradeOnce(graderModel, judgment.responseText, judgment.rubricText);
      if (score !== null) {
        scores.push(score);
      } else {
        // A null score is counted as a failed call and left out of `scores`.
        failed++;
      }
    }
    completed++;
    // Refresh the progress line every 10 judgments and on the last one.
    if (completed % 10 === 0 || completed === judgments.length) {
      const pct = Math.round((completed / judgments.length) * 100);
      process.stdout.write(`\r Progress: ${completed}/${judgments.length} (${pct}%)`);
    }
    gradings.push({
      area: judgment.area,
      dimension: judgment.dimension,
      providerId: judgment.providerId,
      scores,
      taskId: judgment.description,
    });
  }
  console.log(); // newline after progress
  if (failed > 0) {
    console.log(` ⚠ ${failed} grading calls failed (excluded from analysis)`);
  }
  console.log();
  // Analyze consistency
  const result = analyzeConsistency(gradings);
  // Print report
  printReport(result, graderModel);
  // Write output
  const outDir = join(ROOT, "results", "latest");
  mkdirSync(outDir, { recursive: true });
  const outPath = join(outDir, "grader-consistency.json");
  writeFileSync(outPath, JSON.stringify(result, null, 2));
  console.log(`\n 📄 Results written to ${outPath}`);
}
|
|
236
|
-
// ---------------------------------------------------------------------------
|
|
237
|
-
// Report formatting
|
|
238
|
-
// ---------------------------------------------------------------------------
|
|
239
|
-
/**
 * Print the grader consistency report to stdout.
 *
 * Sections, in order: header, overall statistics, per-dimension table,
 * noise-threshold recommendation, and up to five entries taken from the
 * front of `result.judgments`.
 *
 * @param {object} result - Analysis result; reads `replications`,
 *   `totalJudgments`, `avgStdDev`, `maxStdDev`, `avgRange`, `perDimension`,
 *   `recommendedThreshold` and `judgments`.
 * @param {string} graderModel - Grader model identifier shown in the header.
 * @returns {void}
 */
function printReport(result, graderModel) {
  const heavyRule = "=".repeat(80);
  const lightRule = "-".repeat(80);
  // Prints a section title between two rules, followed by a blank line.
  const banner = (title) => {
    console.log(lightRule);
    console.log(title);
    console.log(lightRule);
    console.log();
  };
  const cell = (value, width) => String(value).padStart(width);

  console.log(heavyRule);
  console.log(" GRADER CONSISTENCY REPORT");
  console.log(heavyRule);
  console.log();
  console.log(` Grader model: ${graderModel}`);
  console.log(` Replications: ${result.replications} (incl. original)`);
  console.log(` Judgments: ${result.totalJudgments}`);
  console.log();

  // Overall stats
  banner("OVERALL");
  console.log(` Avg σ: ${result.avgStdDev}`);
  console.log(` Max σ: ${result.maxStdDev}`);
  console.log(` Avg range: ${result.avgRange} points`);
  console.log();

  // Per-dimension table
  banner("PER-DIMENSION CONSISTENCY");
  console.log("| Dimension | Avg σ | Max σ | Avg Range | Judgments |");
  console.log("|------------------|-------|-------|-----------|-----------|");
  const dimensionRows = [
    ["Task Completion", "taskCompletion"],
    ["Code Correctness", "codeCorrectness"],
    ["Doc Coverage", "docCoverage"],
  ];
  for (const [label, key] of dimensionRows) {
    const stats = result.perDimension[key];
    const columns = [
      label.padEnd(16),
      cell(stats.avgStdDev, 5),
      cell(stats.maxStdDev, 5),
      cell(stats.avgRange, 9),
      cell(stats.judgmentCount, 9),
    ];
    console.log(`| ${columns.join(" | ")} |`);
  }
  console.log();

  // Noise threshold recommendation
  banner("NOISE THRESHOLD RECOMMENDATION");
  console.log(` Current default: ±2 (DEFAULT_NOISE_THRESHOLD)`);
  console.log(` Recommended: ±${result.recommendedThreshold} (based on 2× max dimension σ)`);
  const thresholdTooLow = result.recommendedThreshold > 2;
  if (!thresholdTooLow) {
    console.log(` ✅ Current threshold is adequate for this grader's consistency.`);
  } else {
    console.log(` ⚠ Current threshold may be too low — comparison deltas within ±${result.recommendedThreshold}`);
    console.log(` should be treated as noise, not real changes.`);
  }
  console.log();

  // Top 5 noisiest judgments
  const topN = Math.min(5, result.judgments.length);
  if (topN <= 0) {
    return;
  }
  banner(`TOP ${topN} NOISIEST JUDGMENTS`);
  for (const [index, judgment] of result.judgments.slice(0, topN).entries()) {
    const provider = judgment.providerId ? ` [${judgment.providerId}]` : "";
    console.log(` ${index + 1}. ${judgment.taskId}${provider}`);
    console.log(` ${judgment.dimension}: σ=${judgment.stdDev}, range=${judgment.range} (${judgment.min}–${judgment.max}), mean=${judgment.mean}`);
  }
  console.log();
}
|
|
306
|
-
// Only run when invoked directly
|
|
307
|
-
// The script path must end with the source (.ts) or compiled (.js) file name.
if (
  ["grader-consistency.ts", "grader-consistency.js"].some((suffix) =>
    process.argv[1]?.endsWith(suffix),
  )
) {
  main().catch((fatal) => {
    console.error("❌ Fatal error:", fatal);
    process.exit(1);
  });
}
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* grader-sensitivity.ts
|
|
3
|
-
*
|
|
4
|
-
* CLI for grader sensitivity testing (Phase 4 of grader reliability).
|
|
5
|
-
*
|
|
6
|
-
* Loads reference solutions, programmatically degrades them, grades both
|
|
7
|
-
* the original and degraded versions with the grader model, then measures
|
|
8
|
-
* whether the grader can distinguish quality levels.
|
|
9
|
-
*
|
|
10
|
-
* Usage:
|
|
11
|
-
* pnpm grader-sensitivity # test all reference solutions
|
|
12
|
-
* pnpm grader-sensitivity --area groq # test only groq area
|
|
13
|
-
* pnpm grader-sensitivity --format json # machine-readable output
|
|
14
|
-
*
|
|
15
|
-
* Reads: canonical/reference-solutions/**\/*.ts
|
|
16
|
-
* Reads: config/models.yaml (grader model)
|
|
17
|
-
* Reads: config/rubrics.yaml (dimension rubric templates)
|
|
18
|
-
* Writes: results/latest/grader-sensitivity.json
|
|
19
|
-
*
|
|
20
|
-
* @see docs/exec-plans/completed/grader-reliability.md — Phase 4
|
|
21
|
-
*/
|
|
22
|
-
export {};
|
|
@@ -1,354 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* grader-sensitivity.ts
|
|
3
|
-
*
|
|
4
|
-
* CLI for grader sensitivity testing (Phase 4 of grader reliability).
|
|
5
|
-
*
|
|
6
|
-
* Loads reference solutions, programmatically degrades them, grades both
|
|
7
|
-
* the original and degraded versions with the grader model, then measures
|
|
8
|
-
* whether the grader can distinguish quality levels.
|
|
9
|
-
*
|
|
10
|
-
* Usage:
|
|
11
|
-
* pnpm grader-sensitivity # test all reference solutions
|
|
12
|
-
* pnpm grader-sensitivity --area groq # test only groq area
|
|
13
|
-
* pnpm grader-sensitivity --format json # machine-readable output
|
|
14
|
-
*
|
|
15
|
-
* Reads: canonical/reference-solutions/**\/*.ts
|
|
16
|
-
* Reads: config/models.yaml (grader model)
|
|
17
|
-
* Reads: config/rubrics.yaml (dimension rubric templates)
|
|
18
|
-
* Writes: results/latest/grader-sensitivity.json
|
|
19
|
-
*
|
|
20
|
-
* @see docs/exec-plans/completed/grader-reliability.md — Phase 4
|
|
21
|
-
*/
|
|
22
|
-
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
|
|
23
|
-
import { dirname, join, resolve } from "path";
|
|
24
|
-
import { fileURLToPath } from "url";
|
|
25
|
-
import { load } from "js-yaml";
|
|
26
|
-
import { DEGRADATION_STRATEGIES, } from "../pipeline/degradations.js";
|
|
27
|
-
import { analyzeSensitivity, } from "../pipeline/grader-sensitivity.js";
|
|
28
|
-
import { gradeOnce, loadGraderModel } from "./grader-api.js";
|
|
29
|
-
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
30
|
-
const ROOT = resolve(__dirname, "..", "..");
|
|
31
|
-
// ---------------------------------------------------------------------------
|
|
32
|
-
// CLI argument parsing
|
|
33
|
-
// ---------------------------------------------------------------------------
|
|
34
|
-
const args = process.argv.slice(2);
|
|
35
|
-
/**
 * Report whether a boolean CLI flag was passed.
 *
 * Every name matches its long form (`--name`). Single-character names also
 * match the short form (`-n`), so `getFlag("h")` honors the `-h` spelling
 * advertised in the help text instead of only the unusual `--h`.
 *
 * @param {string} name - Flag name without leading dashes.
 * @returns {boolean} True when the flag is present in `args`.
 */
function getFlag(name) {
  if (args.includes(`--${name}`)) {
    return true;
  }
  // Short flags are only meaningful for one-letter names.
  return name.length === 1 && args.includes(`-${name}`);
}
|
|
38
|
-
function getOption(name) {
|
|
39
|
-
const idx = args.indexOf(`--${name}`);
|
|
40
|
-
return idx !== -1 && idx + 1 < args.length ? args[idx + 1] : undefined;
|
|
41
|
-
}
|
|
42
|
-
// Parsed CLI options; `format` falls back to "table" when --format is not given.
const areaFilter = getOption("area");
const format = getOption("format") ?? "table";
const outputPath = getOption("output");
const showHelp = getFlag("help") || getFlag("h");
if (showHelp) {
  // Leading and trailing empty entries reproduce the newlines that wrap the help text.
  const helpLines = [
    "",
    "Usage: pnpm grader-sensitivity [options]",
    "",
    "Test grader discrimination power using programmatic code degradation.",
    "",
    "Options:",
    "--area <name> Test only reference solutions in this area (e.g., groq)",
    "--format <fmt> Output format: table (default) or json",
    "--output <path> Write JSON report to file",
    "--help, -h Show this help",
    "",
    "Examples:",
    "pnpm grader-sensitivity # test all reference solutions",
    "pnpm grader-sensitivity --area groq # test only GROQ area",
    "pnpm grader-sensitivity --format json # machine-readable output",
    "",
  ];
  console.log(helpLines.join("\n"));
  process.exit(0);
}
|
|
65
|
-
// ---------------------------------------------------------------------------
|
|
66
|
-
// Types
|
|
67
|
-
// ---------------------------------------------------------------------------
|
|
68
|
-
// DimensionName imported from pipeline/types.ts
|
|
69
|
-
// Rubric dimension keys, in the order task completion, code correctness, doc coverage.
const DIMENSION_NAMES = ["taskCompletion", "codeCorrectness", "docCoverage"];
|
|
74
|
-
// ---------------------------------------------------------------------------
|
|
75
|
-
// Rubric prompt construction and reference solution discovery
|
|
76
|
-
// ---------------------------------------------------------------------------
|
|
77
|
-
/**
 * Assemble the grading prompt for one rubric dimension.
 *
 * The prompt is the template header, a blank line, one "- " bullet per scale
 * entry, another blank line, and the shared footer, joined with newlines.
 *
 * @param {object} rubrics - Rubric config; reads `rubrics.templates` and `rubrics.footer`.
 * @param {string} dimension - "taskCompletion" or "codeCorrectness"; any other
 *   value selects the "doc-coverage" template.
 * @returns {string} The assembled prompt text.
 * @throws {Error} When the selected template is missing from `rubrics.templates`.
 */
function buildRubricPrompt(rubrics, dimension) {
  let templateKey;
  switch (dimension) {
    case "taskCompletion":
      templateKey = "task-completion";
      break;
    case "codeCorrectness":
      templateKey = "code-correctness";
      break;
    default:
      templateKey = "doc-coverage";
  }
  const template = rubrics.templates[templateKey];
  if (!template) {
    throw new Error(`No rubric template for ${dimension}`);
  }
  const scaleBullets = template.scale.map((step) => `- ${step}`);
  return [template.header, "", ...scaleBullets, "", rubrics.footer].join("\n");
}
|
|
97
|
-
/**
 * Load the reference solutions found under `canonical/reference-solutions`.
 *
 * Each immediate subdirectory is an area; when `areaFilter` is set only the
 * matching area is scanned. Within an area, every `.ts` / `.tsx` file is read
 * as UTF-8. Subdirectories of an area are not descended into. Exits the
 * process with code 1 when the reference directory does not exist.
 *
 * @returns {Array<{area: string, content: string, relativePath: string}>}
 *   One entry per solution file; `relativePath` is `<area>/<file>`.
 */
function discoverReferenceSolutions() {
  const refDir = join(ROOT, "canonical", "reference-solutions");
  if (!existsSync(refDir)) {
    console.error(`❌ Reference solutions directory not found: ${refDir}`);
    process.exit(1);
  }
  const isSolutionFile = (fileName) => fileName.endsWith(".ts") || fileName.endsWith(".tsx");
  return readdirSync(refDir, { withFileTypes: true })
    .filter((entry) => entry.isDirectory())
    .map((entry) => entry.name)
    .filter((area) => !areaFilter || area === areaFilter)
    .flatMap((area) => {
      const areaDir = join(refDir, area);
      return readdirSync(areaDir)
        .filter(isSolutionFile)
        .map((file) => ({
          area,
          content: readFileSync(join(areaDir, file), "utf-8"),
          relativePath: `${area}/${file}`,
        }));
    });
}
|
|
123
|
-
// ---------------------------------------------------------------------------
|
|
124
|
-
// Report formatting
|
|
125
|
-
// ---------------------------------------------------------------------------
|
|
126
|
-
/**
 * Prints a human-readable grader sensitivity report to stdout.
 *
 * Sections, in order: overall figures, a per-dimension table, on-target vs
 * off-target comparison, a per-degradation table (only when
 * `result.byDegradation` is non-empty), up to five entries from
 * `result.failedPairs`, and a verdict based on `result.concordanceRate`
 * (>= 90 excellent, >= 75 acceptable, otherwise poor).
 *
 * @param {object} result Analysis result. Fields read here: `graderModel`,
 *   `totalPairs`, `concordanceRate`, `avgSeparation`, `failedPairs`,
 *   `perDimension.{taskCompletion,codeCorrectness,docCoverage}`,
 *   `crossDimension.{onTarget,offTarget}` and `byDegradation`.
 * @returns {void}
 */
function formatSensitivityReport(result) {
    const rule = "-".repeat(80);
    const printSection = (title) => {
        console.log(rule);
        console.log(title);
        console.log(rule);
        console.log();
    };
    // Header and separator are generated from the same widths the data rows
    // pad to, so the three can never drift out of alignment.
    const printTableHead = (columns) => {
        console.log(`| ${columns.map(([label, width]) => label.padEnd(width)).join(" | ")} |`);
        console.log(`|${columns.map(([, width]) => "-".repeat(width + 2)).join("|")}|`);
    };

    printSection("OVERALL SENSITIVITY");
    console.log(` Grader: ${result.graderModel}`);
    console.log(` Total pairs: ${result.totalPairs}`);
    console.log(` Concordance rate: ${result.concordanceRate}%`);
    console.log(` Avg separation: ${result.avgSeparation} points`);
    console.log(` Failed pairs: ${result.failedPairs.length}`);
    console.log();

    // Per-dimension table
    printSection("PER-DIMENSION SENSITIVITY");
    printTableHead([
        ["Dimension", 16],
        ["Concordance", 11],
        ["Avg Sep", 7],
        ["Tied", 5],
        ["Pairs", 5],
    ]);
    const dims = [
        { data: result.perDimension.taskCompletion, name: "Task Completion" },
        { data: result.perDimension.codeCorrectness, name: "Code Correctness" },
        { data: result.perDimension.docCoverage, name: "Doc Coverage" },
    ];
    for (const { data, name } of dims) {
        console.log(`| ${name.padEnd(16)} | ${`${data.concordanceRate}%`.padStart(11)} | ${String(data.avgSeparation).padStart(7)} | ${`${data.tiedRate}%`.padStart(5)} | ${String(data.pairCount).padStart(5)} |`);
    }
    console.log();

    // Cross-dimension analysis
    const { onTarget, offTarget } = result.crossDimension;
    printSection("CROSS-DIMENSION ANALYSIS");
    console.log(` On-target: ${onTarget.concordanceRate}% concordance, ` +
        `${onTarget.avgSeparation} avg sep (${onTarget.pairCount} pairs)`);
    console.log(` Off-target: ${offTarget.concordanceRate}% concordance, ` +
        `${offTarget.avgSeparation} avg sep (${offTarget.pairCount} pairs)`);
    console.log();
    if (onTarget.concordanceRate > offTarget.concordanceRate) {
        console.log(" ✅ Grader is more sensitive to targeted degradations (expected)");
    }
    else {
        console.log(" ⚠ Grader is equally or less sensitive to targeted degradations");
    }
    console.log();

    // Per-degradation breakdown
    if (result.byDegradation.length > 0) {
        printSection("PER-DEGRADATION SENSITIVITY (worst first)");
        printTableHead([
            ["Degradation", 46],
            ["Target", 10],
            ["Conc", 5],
            ["Sep", 5],
            ["N", 2],
        ]);
        for (const d of result.byDegradation) {
            // Any target other than the two named ones is labelled "Docs".
            const dimLabel = d.targetDimension === "taskCompletion"
                ? "Task"
                : d.targetDimension === "codeCorrectness"
                    ? "Code"
                    : "Docs";
            console.log(`| ${d.description.slice(0, 46).padEnd(46)} | ${dimLabel.padEnd(10)} | ${`${d.concordanceRate}%`.padStart(5)} | ${String(d.avgSeparation).padStart(5)} | ${String(d.pairCount).padStart(2)} |`);
        }
        console.log();
    }

    // Top failures
    const topN = Math.min(5, result.failedPairs.length);
    if (topN > 0) {
        printSection(`TOP ${topN} DISCRIMINATION FAILURES (grader preferred degraded)`);
        for (const [index, f] of result.failedPairs.slice(0, topN).entries()) {
            const delta = f.degradedScore - f.originalScore;
            // Only prefix "+" for non-negative deltas so a negative value is
            // never rendered as "+-N".
            const signedDelta = delta >= 0 ? `+${delta}` : String(delta);
            console.log(` ${index + 1}. ${f.sourcePath} — ${f.dimension}`);
            console.log(` Original=${f.originalScore}, Degraded=${f.degradedScore} (${signedDelta} for degraded)`);
            console.log(` Degradation: ${f.degradationDescription}`);
        }
        console.log();
    }

    // Verdict
    printSection("VERDICT");
    if (result.concordanceRate >= 90) {
        console.log(` ✅ EXCELLENT: Grader correctly discriminates ${result.concordanceRate}% of pairs`);
    }
    else if (result.concordanceRate >= 75) {
        console.log(` ⚠️ ACCEPTABLE: Grader correctly discriminates ${result.concordanceRate}% of pairs`);
    }
    else {
        console.log(` ❌ POOR: Grader only discriminates ${result.concordanceRate}% of pairs`);
    }
    console.log();
}
|
|
225
|
-
/**
 * Builds the original/degraded comparison pairs for a set of solutions.
 *
 * Every strategy in `DEGRADATION_STRATEGIES` is applied to every solution's
 * content. A pair is emitted only when the strategy's output differs from the
 * input, i.e. the degradation actually changed the code.
 *
 * @param {Iterable<{content: string, relativePath: string}>} solutions
 * @returns {{degradation: object, degraded: string, original: string, sourcePath: string}[]}
 */
function generateDegradedPairs(solutions) {
    const collected = [];
    for (const { content, relativePath } of solutions) {
        for (const strategy of DEGRADATION_STRATEGIES) {
            const mutated = strategy.apply(content);
            // A strategy that left the code untouched yields nothing to compare.
            if (mutated === content) {
                continue;
            }
            collected.push({
                degradation: strategy,
                degraded: mutated,
                original: content,
                sourcePath: relativePath,
            });
        }
    }
    return collected;
}
|
|
243
|
-
// ---------------------------------------------------------------------------
|
|
244
|
-
// Load rubric configuration
|
|
245
|
-
// ---------------------------------------------------------------------------
|
|
246
|
-
/**
 * Reads `<ROOT>/config/rubrics.yaml` as UTF-8 and returns whatever `load`
 * produces for that text.
 *
 * @returns {*} The value returned by `load` for the rubrics file.
 */
function loadRubrics() {
    const yamlText = readFileSync(join(ROOT, "config", "rubrics.yaml"), "utf-8");
    return load(yamlText);
}
|
|
251
|
-
// ---------------------------------------------------------------------------
|
|
252
|
-
// Main execution
|
|
253
|
-
// ---------------------------------------------------------------------------
|
|
254
|
-
/**
 * CLI entry point for grader sensitivity testing.
 *
 * Steps:
 *  1. Load the grader model and rubrics and print a banner.
 *  2. Discover reference solutions; exit with status 1 if none are found.
 *  3. Generate degraded variants of each solution.
 *  4. For every pair and every name in `DIMENSION_NAMES`, grade the original
 *     and then the degraded code, one call at a time. A pair/dimension is
 *     recorded only when both scores are non-null; otherwise it is counted
 *     as failed.
 *  5. Analyze the recorded pairs, then emit either JSON (to `outputPath` or
 *     stdout) or the formatted text report, depending on `format`.
 *  6. Best-effort: also save the JSON result to
 *     `<ROOT>/results/latest/grader-sensitivity.json`.
 *
 * @returns {Promise<void>}
 */
async function main() {
    console.log("=".repeat(80));
    console.log(" GRADER SENSITIVITY TESTING");
    console.log("=".repeat(80));
    console.log();
    // Load config
    const grader = loadGraderModel();
    const rubrics = loadRubrics();
    console.log(` Grader: ${grader.id} (${grader.label})`);
    if (areaFilter) {
        console.log(` Area filter: ${areaFilter}`);
    }
    console.log();
    // Discover reference solutions
    const solutions = discoverReferenceSolutions();
    console.log(` Reference solutions: ${solutions.length}`);
    if (solutions.length === 0) {
        console.error("❌ No reference solutions found.");
        process.exit(1);
    }
    for (const s of solutions) {
        console.log(` ${s.relativePath} (${s.content.split("\n").length} lines)`);
    }
    console.log();
    // Generate degraded pairs
    const degradedPairs = generateDegradedPairs(solutions);
    // Each pair is graded twice (original + degraded) on every dimension.
    const totalGradings = degradedPairs.length * DIMENSION_NAMES.length * 2;
    console.log(` Degraded pairs: ${degradedPairs.length}`);
    console.log(` Total gradings: ${totalGradings} (${degradedPairs.length} pairs × ${DIMENSION_NAMES.length} dimensions × 2 versions)`);
    console.log();
    // Grade each pair on each dimension
    const sensitivityPairs = [];
    let completed = 0;
    let failed = 0;
    for (const pair of degradedPairs) {
        // sourcePath has the form "<area>/<file>".
        const area = pair.sourcePath.split("/")[0];
        for (const dimension of DIMENSION_NAMES) {
            const rubricText = buildRubricPrompt(rubrics, dimension);
            // Grade original
            const originalScore = await gradeOnce(grader.id, pair.original, rubricText);
            completed++;
            // Grade degraded
            const degradedScore = await gradeOnce(grader.id, pair.degraded, rubricText);
            completed++;
            if (originalScore !== null && degradedScore !== null) {
                sensitivityPairs.push({
                    area,
                    degradationDescription: pair.degradation.description,
                    degradedScore,
                    dimension,
                    originalScore,
                    sourcePath: pair.sourcePath,
                    targetDimension: pair.degradation.targetDimension,
                });
            }
            else {
                failed++;
            }
            // "\r" rewrites the same terminal line instead of scrolling.
            if (completed % 20 === 0 || completed === totalGradings) {
                process.stdout.write(`\r Progress: ${completed}/${totalGradings} gradings${failed > 0 ? ` (${failed} failed)` : ""}`);
            }
        }
    }
    console.log();
    console.log();
    // Analyze
    const result = analyzeSensitivity(sensitivityPairs, grader.id);
    // Output
    if (format === "json") {
        const json = JSON.stringify(result, null, 2);
        if (outputPath) {
            writeFileSync(outputPath, json);
            console.log(` ✅ Report written to ${outputPath}`);
        }
        else {
            console.log(json);
        }
    }
    else {
        formatSensitivityReport(result);
    }
    // Write to results/latest/
    const resultsDir = join(ROOT, "results", "latest");
    try {
        mkdirSync(resultsDir, { recursive: true });
        const resultFilePath = join(resultsDir, "grader-sensitivity.json");
        writeFileSync(resultFilePath, JSON.stringify(result, null, 2));
        console.log(` 📄 Report saved: ${resultFilePath}`);
    }
    catch (err) {
        // Saving this copy is best-effort and must not fail the run, but the
        // user should still learn that it was not written. The warning goes
        // to stderr so JSON printed on stdout stays clean.
        const reason = err instanceof Error ? err.message : String(err);
        console.warn(` ⚠ Could not save report under ${resultsDir}: ${reason}`);
    }
}
|
|
347
|
-
// Only run when invoked directly
|
|
348
|
-
// Start the CLI only when the script path given to Node ends with this
// module's own file name (TypeScript source or compiled JavaScript).
if (["grader-sensitivity.ts", "grader-sensitivity.js"].some((fileName) => process.argv[1]?.endsWith(fileName))) {
    main().catch((err) => {
        console.error("❌ Fatal error:", err);
        process.exit(1);
    });
}
|