@sanity/ailf 2.0.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. package/LICENSE +21 -0
  2. package/dist/cli.js +0 -0
  3. package/package.json +24 -24
  4. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  5. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  6. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  7. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  8. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  9. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  10. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  11. package/dist/_vendor/ailf-tasks/index.js +0 -16
  12. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  13. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  14. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  15. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  16. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  17. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  18. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  19. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  20. package/dist/agent-observer/test-imports.d.ts +0 -7
  21. package/dist/agent-observer/test-imports.js +0 -185
  22. package/dist/commands/update-quality-scores.d.ts +0 -5
  23. package/dist/commands/update-quality-scores.js +0 -20
  24. package/dist/lib/agent-behavior-report.d.ts +0 -8
  25. package/dist/lib/agent-behavior-report.js +0 -185
  26. package/dist/lib/baseline.d.ts +0 -19
  27. package/dist/lib/baseline.js +0 -153
  28. package/dist/lib/calculate-scores.d.ts +0 -23
  29. package/dist/lib/calculate-scores.js +0 -42
  30. package/dist/lib/compare.d.ts +0 -18
  31. package/dist/lib/compare.js +0 -170
  32. package/dist/lib/coverage-audit.d.ts +0 -4
  33. package/dist/lib/coverage-audit.js +0 -42
  34. package/dist/lib/discovery-report.d.ts +0 -13
  35. package/dist/lib/discovery-report.js +0 -57
  36. package/dist/lib/fetch-docs.d.ts +0 -30
  37. package/dist/lib/fetch-docs.js +0 -171
  38. package/dist/lib/generate-configs.d.ts +0 -25
  39. package/dist/lib/generate-configs.js +0 -42
  40. package/dist/lib/grader-api.d.ts +0 -21
  41. package/dist/lib/grader-api.js +0 -34
  42. package/dist/lib/grader-compare.d.ts +0 -19
  43. package/dist/lib/grader-compare.js +0 -91
  44. package/dist/lib/grader-consistency.d.ts +0 -27
  45. package/dist/lib/grader-consistency.js +0 -79
  46. package/dist/lib/grader-sensitivity.d.ts +0 -19
  47. package/dist/lib/grader-sensitivity.js +0 -75
  48. package/dist/lib/grader-validate.d.ts +0 -19
  49. package/dist/lib/grader-validate.js +0 -78
  50. package/dist/lib/measure-retrieval.d.ts +0 -14
  51. package/dist/lib/measure-retrieval.js +0 -71
  52. package/dist/lib/pr-comment.d.ts +0 -16
  53. package/dist/lib/pr-comment.js +0 -28
  54. package/dist/lib/readiness-report.d.ts +0 -13
  55. package/dist/lib/readiness-report.js +0 -108
  56. package/dist/lib/webhook-server.d.ts +0 -11
  57. package/dist/lib/webhook-server.js +0 -24
  58. package/dist/lib/weekly-digest.d.ts +0 -24
  59. package/dist/lib/weekly-digest.js +0 -148
  60. package/dist/orchestration/env-bridge.d.ts +0 -21
  61. package/dist/orchestration/env-bridge.js +0 -66
  62. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  63. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  64. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  65. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  66. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  67. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  68. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  69. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  70. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  71. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  72. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  73. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  74. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  75. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  76. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  77. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  78. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  79. package/dist/pipeline/compiler/task-bridge.js +0 -92
  80. package/dist/pipeline/expand-tasks.d.ts +0 -232
  81. package/dist/pipeline/expand-tasks.js +0 -467
  82. package/dist/pipeline/generate-configs.d.ts +0 -92
  83. package/dist/pipeline/generate-configs.js +0 -445
  84. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  85. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  86. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  87. package/dist/pipeline/steps/compare-step.js +0 -90
  88. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  89. package/dist/pipeline/steps/eval-step.js +0 -347
  90. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  91. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  92. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  93. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  94. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  95. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  96. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  97. package/dist/pipeline/steps/publish-report-step.js +0 -243
  98. package/dist/pipeline/steps/report-step.d.ts +0 -13
  99. package/dist/pipeline/steps/report-step.js +0 -56
  100. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  101. package/dist/pipeline/steps/update-scores-step.js +0 -42
  102. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  103. package/dist/scripts/agent-behavior-report.js +0 -315
  104. package/dist/scripts/baseline.d.ts +0 -43
  105. package/dist/scripts/baseline.js +0 -267
  106. package/dist/scripts/calculate-scores.d.ts +0 -166
  107. package/dist/scripts/calculate-scores.js +0 -1296
  108. package/dist/scripts/compare.d.ts +0 -22
  109. package/dist/scripts/compare.js +0 -334
  110. package/dist/scripts/coverage-audit.d.ts +0 -44
  111. package/dist/scripts/coverage-audit.js +0 -209
  112. package/dist/scripts/debug-eval.d.ts +0 -19
  113. package/dist/scripts/debug-eval.js +0 -73
  114. package/dist/scripts/discovery-report.d.ts +0 -58
  115. package/dist/scripts/discovery-report.js +0 -250
  116. package/dist/scripts/fetch-docs.d.ts +0 -35
  117. package/dist/scripts/fetch-docs.js +0 -472
  118. package/dist/scripts/generate-configs.d.ts +0 -66
  119. package/dist/scripts/generate-configs.js +0 -459
  120. package/dist/scripts/grader-api.d.ts +0 -27
  121. package/dist/scripts/grader-api.js +0 -206
  122. package/dist/scripts/grader-compare.d.ts +0 -22
  123. package/dist/scripts/grader-compare.js +0 -368
  124. package/dist/scripts/grader-consistency.d.ts +0 -20
  125. package/dist/scripts/grader-consistency.js +0 -313
  126. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  127. package/dist/scripts/grader-sensitivity.js +0 -354
  128. package/dist/scripts/grader-validate.d.ts +0 -19
  129. package/dist/scripts/grader-validate.js +0 -267
  130. package/dist/scripts/measure-retrieval.d.ts +0 -10
  131. package/dist/scripts/measure-retrieval.js +0 -145
  132. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  133. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  134. package/dist/scripts/pipeline.d.ts +0 -76
  135. package/dist/scripts/pipeline.js +0 -1031
  136. package/dist/scripts/pr-comment.d.ts +0 -10
  137. package/dist/scripts/pr-comment.js +0 -510
  138. package/dist/scripts/readiness-report.d.ts +0 -88
  139. package/dist/scripts/readiness-report.js +0 -342
  140. package/dist/scripts/update-quality-scores.d.ts +0 -15
  141. package/dist/scripts/update-quality-scores.js +0 -184
  142. package/dist/scripts/validate-task-sources.d.ts +0 -21
  143. package/dist/scripts/validate-task-sources.js +0 -210
  144. package/dist/scripts/validate.d.ts +0 -13
  145. package/dist/scripts/validate.js +0 -79
  146. package/dist/scripts/webhook-server.d.ts +0 -26
  147. package/dist/scripts/webhook-server.js +0 -147
  148. package/dist/scripts/weekly-digest.d.ts +0 -24
  149. package/dist/scripts/weekly-digest.js +0 -144
  150. package/dist/sinks/format-slack.d.ts +0 -64
  151. package/dist/sinks/format-slack.js +0 -306
  152. package/dist/sinks/slack-sink.d.ts +0 -27
  153. package/dist/sinks/slack-sink.js +0 -78
  154. package/dist/sinks/webhook-sink.d.ts +0 -19
  155. package/dist/sinks/webhook-sink.js +0 -50
  156. package/tasks/.expanded.agentic.yaml +0 -280
  157. package/tasks/.expanded.yaml +0 -565
@@ -1,22 +0,0 @@
1
- /**
2
- * grader-compare.ts
3
- *
4
- * CLI for inter-grader comparison (Phase 3 of grader reliability).
5
- *
6
- * Re-runs grading assertions on existing eval results using candidate grader
7
- * models, then compares the resulting scores against the baseline grader.
8
- *
9
- * Usage:
10
- * pnpm grader-compare # compare vs configured candidates
11
- * pnpm grader-compare --candidate openai:gpt-5.5-preview
12
- * pnpm grader-compare --candidate openai:gpt-5.5-preview --candidate anthropic:claude-4-opus
13
- * pnpm grader-compare --results eval-results.json
14
- * pnpm grader-compare --format json # machine-readable output
15
- *
16
- * Reads: results/latest/eval-results.json (model responses to re-grade)
17
- * Reads: config/models.yaml (baseline grader + optional candidate list)
18
- * Writes: results/latest/grader-comparison.json
19
- *
20
- * @see docs/exec-plans/completed/grader-reliability.md — Phase 3
21
- */
22
- export {};
@@ -1,368 +0,0 @@
1
- /**
2
- * grader-compare.ts
3
- *
4
- * CLI for inter-grader comparison (Phase 3 of grader reliability).
5
- *
6
- * Re-runs grading assertions on existing eval results using candidate grader
7
- * models, then compares the resulting scores against the baseline grader.
8
- *
9
- * Usage:
10
- * pnpm grader-compare # compare vs configured candidates
11
- * pnpm grader-compare --candidate openai:gpt-5.5-preview
12
- * pnpm grader-compare --candidate openai:gpt-5.5-preview --candidate anthropic:claude-4-opus
13
- * pnpm grader-compare --results eval-results.json
14
- * pnpm grader-compare --format json # machine-readable output
15
- *
16
- * Reads: results/latest/eval-results.json (model responses to re-grade)
17
- * Reads: config/models.yaml (baseline grader + optional candidate list)
18
- * Writes: results/latest/grader-comparison.json
19
- *
20
- * @see docs/exec-plans/completed/grader-reliability.md — Phase 3
21
- */
22
- import { existsSync, readFileSync, writeFileSync } from "fs";
23
- import { dirname, join, resolve } from "path";
24
- import { fileURLToPath } from "url";
25
- import { load } from "js-yaml";
26
- import { compareGraders, } from "../pipeline/grader-comparison.js";
27
- import { classifyCorrelation } from "../pipeline/grader-validation.js";
28
- import { gradeOnce } from "./grader-api.js";
29
- const __dirname = dirname(fileURLToPath(import.meta.url));
30
- const ROOT = resolve(__dirname, "..", "..");
31
- // ---------------------------------------------------------------------------
32
- // CLI argument parsing
33
- // ---------------------------------------------------------------------------
34
- const args = process.argv.slice(2);
35
- function getAllOptions(name) {
36
- const results = [];
37
- const flag = `--${name}`;
38
- for (let i = 0; i < args.length; i++) {
39
- if (args[i] === flag && i + 1 < args.length) {
40
- results.push(args[i + 1]);
41
- }
42
- }
43
- return results;
44
- }
45
- function getFlag(name) {
46
- return args.includes(`--${name}`);
47
- }
48
- function getOption(name) {
49
- const idx = args.indexOf(`--${name}`);
50
- return idx !== -1 && idx + 1 < args.length ? args[idx + 1] : undefined;
51
- }
52
- const candidateArgs = getAllOptions("candidate");
53
- const resultsPath = getOption("results") ?? "results/latest/eval-results.json";
54
- const format = getOption("format") ?? "table";
55
- const outputPath = getOption("output");
56
- const showHelp = getFlag("help") || getFlag("h");
57
- if (showHelp) {
58
- console.log(`
59
- Usage: pnpm grader-compare [options]
60
-
61
- Compare multiple grader models on the same evaluation responses.
62
-
63
- Options:
64
- --candidate <model> Candidate grader model ID (repeatable)
65
- e.g., --candidate openai:gpt-5.5-preview
66
- --results <path> Path to eval results (default: results/latest/eval-results.json)
67
- --format <fmt> Output format: table (default) or json
68
- --output <path> Write JSON report to file
69
- --help, -h Show this help
70
-
71
- If no --candidate flags are provided, reads grader-candidates from config/models.yaml.
72
-
73
- Examples:
74
- pnpm grader-compare --candidate openai:gpt-5.5-preview
75
- pnpm grader-compare --candidate openai:gpt-5.5-preview --candidate anthropic:claude-4-opus
76
- pnpm grader-compare --format json
77
- `);
78
- process.exit(0);
79
- }
80
- // ---------------------------------------------------------------------------
81
- // Dimension classification
82
- // ---------------------------------------------------------------------------
83
- // DimensionName imported from pipeline/types.ts
84
- const DIMENSION_PATTERNS = [
85
- { dimension: "taskCompletion", pattern: /task[_-]?completion/i },
86
- { dimension: "codeCorrectness", pattern: /code[_-]?correct/i },
87
- { dimension: "docCoverage", pattern: /doc[_-]?coverage/i },
88
- ];
89
- /** Classify a component result into a dimension based on rubric content or metric */
90
- function classifyDimension(comp) {
91
- // Check the metric name first (structured dimensions)
92
- const metric = comp.assertion?.metric ?? "";
93
- for (const { dimension, pattern } of DIMENSION_PATTERNS) {
94
- if (pattern.test(metric))
95
- return dimension;
96
- }
97
- // Fall back to rubric text analysis
98
- const rubric = typeof comp.assertion?.value === "string" ? comp.assertion.value : "";
99
- for (const { dimension, pattern } of DIMENSION_PATTERNS) {
100
- if (pattern.test(rubric))
101
- return dimension;
102
- }
103
- return null;
104
- }
105
- // ---------------------------------------------------------------------------
106
- // Judgment extraction (same pattern as grader-consistency.ts)
107
- // ---------------------------------------------------------------------------
108
- /** Detect feature area from test description */
109
- function detectFeatureArea(description) {
110
- // Pattern: "[gold] Area Name — Task Description" or "Area Name — Task Description"
111
- const cleaned = description.replace(/^\[(?:gold|baseline)\]\s*/i, "");
112
- const parts = cleaned.split("—");
113
- if (parts.length >= 2) {
114
- return parts[0].trim().toLowerCase().replace(/\s+/g, "-");
115
- }
116
- return "unknown";
117
- }
118
- /** Detect task ID from test description */
119
- function detectTaskId(description) {
120
- // Description format: "[gold] Area Name — Task Description"
121
- const cleaned = description.replace(/^\[(?:gold|baseline)\]\s*/i, "");
122
- return cleaned
123
- .toLowerCase()
124
- .replace(/\s+/g, "-")
125
- .replace(/[^a-z0-9-]/g, "")
126
- .slice(0, 60);
127
- }
128
- // ---------------------------------------------------------------------------
129
- // OpenAI grading API call (reuses pattern from grader-consistency.ts)
130
- // ---------------------------------------------------------------------------
131
- function extractJudgments(evalResults) {
132
- const judgments = [];
133
- const results = evalResults.results?.results ?? [];
134
- for (const result of results) {
135
- const description = result.testCase?.description ?? result.description ?? "";
136
- // Only process gold tests (with-docs), skip baseline tests
137
- if (!description.toLowerCase().includes("[gold]"))
138
- continue;
139
- const area = detectFeatureArea(description);
140
- const taskId = detectTaskId(description);
141
- const providerId = result.provider?.id;
142
- const components = result.gradingResult?.componentResults ?? [];
143
- for (const comp of components) {
144
- if (comp.assertion?.type !== "llm-rubric")
145
- continue;
146
- const dimension = classifyDimension(comp);
147
- if (!dimension)
148
- continue;
149
- const rubricText = typeof comp.assertion.value === "string" ? comp.assertion.value : "";
150
- if (!rubricText)
151
- continue;
152
- judgments.push({
153
- area,
154
- dimension,
155
- originalScore: typeof comp.score === "number" ? comp.score : 0,
156
- providerId,
157
- responseText: result.response?.output ?? "",
158
- rubricText,
159
- taskId,
160
- });
161
- }
162
- }
163
- return judgments;
164
- }
165
- // ---------------------------------------------------------------------------
166
- // Config loading
167
- // ---------------------------------------------------------------------------
168
- function formatComparisonReport(result) {
169
- console.log("-".repeat(80));
170
- console.log("COMPARISON RESULTS");
171
- console.log("-".repeat(80));
172
- console.log();
173
- console.log(` Baseline grader: ${result.baselineGrader}`);
174
- console.log(` Candidates: ${result.candidateGraders.join(", ")}`);
175
- console.log();
176
- for (const pair of result.pairwise) {
177
- console.log("-".repeat(80));
178
- console.log(` ${pair.graderA} vs ${pair.graderB}`);
179
- console.log("-".repeat(80));
180
- console.log();
181
- console.log(` Overall:`);
182
- console.log(` Correlation: r=${pair.correlation} (${classifyCorrelation(pair.correlation)})`);
183
- console.log(` Bias: ${pair.bias > 0 ? "+" : ""}${pair.bias} (${pair.bias > 0 ? "candidate grades higher" : pair.bias < 0 ? "candidate grades lower" : "no systematic bias"})`);
184
- console.log(` Mean Abs Diff: ${pair.meanAbsDiff} points`);
185
- console.log();
186
- // Per-dimension table
187
- const h = "| Dimension | Correlation | Quality | Bias | MAD | Count |";
188
- const sep = "|------------------|-------------|-----------|--------|-------|-------|";
189
- console.log(h);
190
- console.log(sep);
191
- const dims = [
192
- { data: pair.perDimension.taskCompletion, name: "Task Completion" },
193
- { data: pair.perDimension.codeCorrectness, name: "Code Correctness" },
194
- { data: pair.perDimension.docCoverage, name: "Doc Coverage" },
195
- ];
196
- for (const { data, name } of dims) {
197
- const quality = classifyCorrelation(data.correlation);
198
- const biasStr = data.bias > 0 ? `+${data.bias}` : `${data.bias}`;
199
- console.log(`| ${name.padEnd(16)} | r=${String(data.correlation).padStart(9)} | ${quality.padEnd(9)} | ${biasStr.padStart(6)} | ${String(data.meanAbsDiff).padStart(5)} | ${String(data.count).padStart(5)} |`);
200
- }
201
- console.log();
202
- }
203
- // Recommendations
204
- if (result.recommendations.length > 0) {
205
- console.log("-".repeat(80));
206
- console.log("RECOMMENDATIONS");
207
- console.log("-".repeat(80));
208
- console.log();
209
- for (const rec of result.recommendations) {
210
- const icon = rec.recommendation === "comparable"
211
- ? "✅"
212
- : rec.recommendation === "divergent"
213
- ? "⚠️"
214
- : "❌";
215
- console.log(` ${icon} ${rec.modelId}: ${rec.recommendation}`);
216
- console.log(` ${rec.reason}`);
217
- }
218
- console.log();
219
- }
220
- }
221
- // ---------------------------------------------------------------------------
222
- // Main execution
223
- // ---------------------------------------------------------------------------
224
- function loadConfig() {
225
- const modelsPath = join(ROOT, "config", "models.yaml");
226
- if (!existsSync(modelsPath)) {
227
- console.error("❌ config/models.yaml not found");
228
- process.exit(1);
229
- }
230
- const raw = readFileSync(modelsPath, "utf-8");
231
- const data = load(raw);
232
- const grader = {
233
- id: data?.grader?.id ?? "openai:gpt-5",
234
- label: data?.grader?.label ?? "GPT-5 (grader)",
235
- };
236
- // CLI candidates override config candidates
237
- let candidates;
238
- if (candidateArgs.length > 0) {
239
- candidates = candidateArgs.map((id) => ({
240
- id,
241
- label: id.split(":").pop() ?? id,
242
- }));
243
- }
244
- else {
245
- const configCandidates = data?.["grader-candidates"] ?? [];
246
- candidates = configCandidates.map((c) => ({
247
- id: c.id,
248
- label: c.label ?? c.id.split(":").pop() ?? c.id,
249
- }));
250
- }
251
- return { baselineGrader: grader, candidates };
252
- }
253
- // ---------------------------------------------------------------------------
254
- // Formatted output
255
- // ---------------------------------------------------------------------------
256
- async function main() {
257
- console.log("=".repeat(80));
258
- console.log(" INTER-GRADER COMPARISON");
259
- console.log("=".repeat(80));
260
- console.log();
261
- // Load config
262
- const { baselineGrader, candidates } = loadConfig();
263
- if (candidates.length === 0) {
264
- console.error("❌ No candidate graders specified. Use --candidate <model> or add grader-candidates to config/models.yaml.");
265
- process.exit(1);
266
- }
267
- console.log(` Baseline grader: ${baselineGrader.id} (${baselineGrader.label})`);
268
- for (const c of candidates) {
269
- console.log(` Candidate: ${c.id} (${c.label})`);
270
- }
271
- console.log();
272
- // Load eval results
273
- const evalResultsPath = resolve(ROOT, resultsPath);
274
- if (!existsSync(evalResultsPath)) {
275
- console.error(`❌ Eval results not found: ${evalResultsPath}`);
276
- console.error(" Run the evaluation pipeline first: pnpm pipeline");
277
- process.exit(1);
278
- }
279
- const evalResultsRaw = readFileSync(evalResultsPath, "utf-8");
280
- const evalResults = JSON.parse(evalResultsRaw);
281
- const evalData = evalResults;
282
- // Extract judgments
283
- const judgments = extractJudgments(evalData);
284
- console.log(` Judgments found: ${judgments.length}`);
285
- if (judgments.length === 0) {
286
- console.error("❌ No gold-test judgments found in eval results.");
287
- process.exit(1);
288
- }
289
- // Build baseline scores from original eval results
290
- const baselineScores = judgments.map((j) => ({
291
- area: j.area,
292
- dimension: j.dimension,
293
- score: Math.round(j.originalScore * 100),
294
- taskId: j.taskId,
295
- }));
296
- const baselineScoreSet = {
297
- label: baselineGrader.label,
298
- modelId: baselineGrader.id,
299
- scores: baselineScores,
300
- };
301
- // Grade with each candidate
302
- const candidateScoreSets = [];
303
- for (const candidate of candidates) {
304
- console.log();
305
- console.log(` Grading with ${candidate.id}...`);
306
- const candidateScores = [];
307
- let completed = 0;
308
- let failed = 0;
309
- for (const j of judgments) {
310
- const score = await gradeOnce(candidate.id, j.responseText, j.rubricText);
311
- completed++;
312
- if (score !== null) {
313
- candidateScores.push({
314
- area: j.area,
315
- dimension: j.dimension,
316
- score,
317
- taskId: j.taskId,
318
- });
319
- }
320
- else {
321
- failed++;
322
- }
323
- if (completed % 10 === 0 || completed === judgments.length) {
324
- process.stdout.write(`\r Progress: ${completed}/${judgments.length}${failed > 0 ? ` (${failed} failed)` : ""}`);
325
- }
326
- }
327
- console.log();
328
- candidateScoreSets.push({
329
- label: candidate.label,
330
- modelId: candidate.id,
331
- scores: candidateScores,
332
- });
333
- }
334
- console.log();
335
- // Run comparison
336
- const comparison = compareGraders(baselineScoreSet, candidateScoreSets);
337
- // Output
338
- if (format === "json") {
339
- const json = JSON.stringify(comparison, null, 2);
340
- if (outputPath) {
341
- writeFileSync(outputPath, json);
342
- console.log(` ✅ Report written to ${outputPath}`);
343
- }
344
- else {
345
- console.log(json);
346
- }
347
- }
348
- else {
349
- formatComparisonReport(comparison);
350
- }
351
- // Write to results/latest/
352
- const resultFilePath = join(ROOT, "results", "latest", "grader-comparison.json");
353
- try {
354
- writeFileSync(resultFilePath, JSON.stringify(comparison, null, 2));
355
- console.log(` 📄 Report saved: ${resultFilePath}`);
356
- }
357
- catch {
358
- // results/latest/ may not exist yet
359
- }
360
- }
361
- // Only run when invoked directly
362
- if (process.argv[1]?.endsWith("grader-compare.ts") ||
363
- process.argv[1]?.endsWith("grader-compare.js")) {
364
- main().catch((err) => {
365
- console.error("❌ Fatal error:", err);
366
- process.exit(1);
367
- });
368
- }
@@ -1,20 +0,0 @@
1
- /**
2
- * grader-consistency.ts
3
- *
4
- * CLI script for measuring grader consistency (Phase 1 of grader reliability).
5
- *
6
- * Reads existing eval results, re-runs ONLY the grading assertions N additional
7
- * times with the configured grader model, and analyzes score variance.
8
- *
9
- * This does NOT re-run the models under test — it only re-grades the same
10
- * responses. Cost is low: ~$0.005 per grading call × N replications.
11
- *
12
- * Usage:
13
- * pnpm grader-consistency # 5 replications (default)
14
- * pnpm grader-consistency --replications 3 # custom count
15
- * pnpm grader-consistency --results <path> # custom results file
16
- *
17
- * Reads: results/latest/eval-results.json (default)
18
- * Writes: results/latest/grader-consistency.json
19
- */
20
- import "dotenv/config";