@sanity/ailf 2.0.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. package/LICENSE +21 -0
  2. package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
  3. package/dist/_vendor/ailf-core/examples/index.js +66 -1
  4. package/dist/agent-harness/assertions-runtime.d.ts +49 -0
  5. package/dist/agent-harness/assertions-runtime.js +138 -0
  6. package/dist/agent-harness/provider.d.ts +58 -0
  7. package/dist/agent-harness/provider.js +104 -0
  8. package/dist/cli.js +0 -0
  9. package/dist/commands/init.js +3 -0
  10. package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
  11. package/dist/orchestration/steps/generate-configs-step.js +35 -2
  12. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +39 -25
  13. package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
  14. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +9 -0
  15. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +28 -85
  16. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
  17. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
  18. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
  19. package/package.json +25 -24
  20. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  21. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  22. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  23. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  24. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  25. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  26. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  27. package/dist/_vendor/ailf-tasks/index.js +0 -16
  28. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  29. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  30. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  31. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  32. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  33. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  34. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  35. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  36. package/dist/agent-observer/test-imports.d.ts +0 -7
  37. package/dist/agent-observer/test-imports.js +0 -185
  38. package/dist/commands/update-quality-scores.d.ts +0 -5
  39. package/dist/commands/update-quality-scores.js +0 -20
  40. package/dist/lib/agent-behavior-report.d.ts +0 -8
  41. package/dist/lib/agent-behavior-report.js +0 -185
  42. package/dist/lib/baseline.d.ts +0 -19
  43. package/dist/lib/baseline.js +0 -153
  44. package/dist/lib/calculate-scores.d.ts +0 -23
  45. package/dist/lib/calculate-scores.js +0 -42
  46. package/dist/lib/compare.d.ts +0 -18
  47. package/dist/lib/compare.js +0 -170
  48. package/dist/lib/coverage-audit.d.ts +0 -4
  49. package/dist/lib/coverage-audit.js +0 -42
  50. package/dist/lib/discovery-report.d.ts +0 -13
  51. package/dist/lib/discovery-report.js +0 -57
  52. package/dist/lib/fetch-docs.d.ts +0 -30
  53. package/dist/lib/fetch-docs.js +0 -171
  54. package/dist/lib/generate-configs.d.ts +0 -25
  55. package/dist/lib/generate-configs.js +0 -42
  56. package/dist/lib/grader-api.d.ts +0 -21
  57. package/dist/lib/grader-api.js +0 -34
  58. package/dist/lib/grader-compare.d.ts +0 -19
  59. package/dist/lib/grader-compare.js +0 -91
  60. package/dist/lib/grader-consistency.d.ts +0 -27
  61. package/dist/lib/grader-consistency.js +0 -79
  62. package/dist/lib/grader-sensitivity.d.ts +0 -19
  63. package/dist/lib/grader-sensitivity.js +0 -75
  64. package/dist/lib/grader-validate.d.ts +0 -19
  65. package/dist/lib/grader-validate.js +0 -78
  66. package/dist/lib/measure-retrieval.d.ts +0 -14
  67. package/dist/lib/measure-retrieval.js +0 -71
  68. package/dist/lib/pr-comment.d.ts +0 -16
  69. package/dist/lib/pr-comment.js +0 -28
  70. package/dist/lib/readiness-report.d.ts +0 -13
  71. package/dist/lib/readiness-report.js +0 -108
  72. package/dist/lib/webhook-server.d.ts +0 -11
  73. package/dist/lib/webhook-server.js +0 -24
  74. package/dist/lib/weekly-digest.d.ts +0 -24
  75. package/dist/lib/weekly-digest.js +0 -148
  76. package/dist/orchestration/env-bridge.d.ts +0 -21
  77. package/dist/orchestration/env-bridge.js +0 -66
  78. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  79. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  80. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  81. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  82. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  83. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  86. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  87. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  88. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  89. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  90. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  91. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  92. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  93. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  94. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  95. package/dist/pipeline/compiler/task-bridge.js +0 -92
  96. package/dist/pipeline/expand-tasks.d.ts +0 -232
  97. package/dist/pipeline/expand-tasks.js +0 -467
  98. package/dist/pipeline/generate-configs.d.ts +0 -92
  99. package/dist/pipeline/generate-configs.js +0 -445
  100. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  101. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  102. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  103. package/dist/pipeline/steps/compare-step.js +0 -90
  104. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  105. package/dist/pipeline/steps/eval-step.js +0 -347
  106. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  107. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  108. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  109. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  110. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  111. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  112. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  113. package/dist/pipeline/steps/publish-report-step.js +0 -243
  114. package/dist/pipeline/steps/report-step.d.ts +0 -13
  115. package/dist/pipeline/steps/report-step.js +0 -56
  116. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  117. package/dist/pipeline/steps/update-scores-step.js +0 -42
  118. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  119. package/dist/scripts/agent-behavior-report.js +0 -315
  120. package/dist/scripts/baseline.d.ts +0 -43
  121. package/dist/scripts/baseline.js +0 -267
  122. package/dist/scripts/calculate-scores.d.ts +0 -166
  123. package/dist/scripts/calculate-scores.js +0 -1296
  124. package/dist/scripts/compare.d.ts +0 -22
  125. package/dist/scripts/compare.js +0 -334
  126. package/dist/scripts/coverage-audit.d.ts +0 -44
  127. package/dist/scripts/coverage-audit.js +0 -209
  128. package/dist/scripts/debug-eval.d.ts +0 -19
  129. package/dist/scripts/debug-eval.js +0 -73
  130. package/dist/scripts/discovery-report.d.ts +0 -58
  131. package/dist/scripts/discovery-report.js +0 -250
  132. package/dist/scripts/fetch-docs.d.ts +0 -35
  133. package/dist/scripts/fetch-docs.js +0 -472
  134. package/dist/scripts/generate-configs.d.ts +0 -66
  135. package/dist/scripts/generate-configs.js +0 -459
  136. package/dist/scripts/grader-api.d.ts +0 -27
  137. package/dist/scripts/grader-api.js +0 -206
  138. package/dist/scripts/grader-compare.d.ts +0 -22
  139. package/dist/scripts/grader-compare.js +0 -368
  140. package/dist/scripts/grader-consistency.d.ts +0 -20
  141. package/dist/scripts/grader-consistency.js +0 -313
  142. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  143. package/dist/scripts/grader-sensitivity.js +0 -354
  144. package/dist/scripts/grader-validate.d.ts +0 -19
  145. package/dist/scripts/grader-validate.js +0 -267
  146. package/dist/scripts/measure-retrieval.d.ts +0 -10
  147. package/dist/scripts/measure-retrieval.js +0 -145
  148. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  149. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  150. package/dist/scripts/pipeline.d.ts +0 -76
  151. package/dist/scripts/pipeline.js +0 -1031
  152. package/dist/scripts/pr-comment.d.ts +0 -10
  153. package/dist/scripts/pr-comment.js +0 -510
  154. package/dist/scripts/readiness-report.d.ts +0 -88
  155. package/dist/scripts/readiness-report.js +0 -342
  156. package/dist/scripts/update-quality-scores.d.ts +0 -15
  157. package/dist/scripts/update-quality-scores.js +0 -184
  158. package/dist/scripts/validate-task-sources.d.ts +0 -21
  159. package/dist/scripts/validate-task-sources.js +0 -210
  160. package/dist/scripts/validate.d.ts +0 -13
  161. package/dist/scripts/validate.js +0 -79
  162. package/dist/scripts/webhook-server.d.ts +0 -26
  163. package/dist/scripts/webhook-server.js +0 -147
  164. package/dist/scripts/weekly-digest.d.ts +0 -24
  165. package/dist/scripts/weekly-digest.js +0 -144
  166. package/dist/sinks/format-slack.d.ts +0 -64
  167. package/dist/sinks/format-slack.js +0 -306
  168. package/dist/sinks/slack-sink.d.ts +0 -27
  169. package/dist/sinks/slack-sink.js +0 -78
  170. package/dist/sinks/webhook-sink.d.ts +0 -19
  171. package/dist/sinks/webhook-sink.js +0 -50
  172. package/tasks/.expanded.agentic.yaml +0 -280
  173. package/tasks/.expanded.yaml +0 -565
@@ -1,313 +0,0 @@
1
- /**
2
- * grader-consistency.ts
3
- *
4
- * CLI script for measuring grader consistency (Phase 1 of grader reliability).
5
- *
6
- * Reads existing eval results, re-runs ONLY the grading assertions N additional
7
- * times with the configured grader model, and analyzes score variance.
8
- *
9
- * This does NOT re-run the models under test — it only re-grades the same
10
- * responses. Cost is low: ~$0.005 per grading call × N replications.
11
- *
12
- * Usage:
13
- * pnpm grader-consistency # 5 replications (default)
14
- * pnpm grader-consistency --replications 3 # custom count
15
- * pnpm grader-consistency --results <path> # custom results file
16
- *
17
- * Reads: results/latest/eval-results.json (default)
18
- * Writes: results/latest/grader-consistency.json
19
- */
20
- // oxlint-disable-next-line import/no-unassigned-import -- side-effect: loads .env into process.env
21
- import "dotenv/config";
22
- import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
23
- import { dirname, join, resolve } from "path";
24
- import { fileURLToPath } from "url";
25
- import { analyzeConsistency, } from "../pipeline/grader-consistency.js";
26
- import { gradeOnce } from "./grader-api.js";
27
// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Two directory levels above the directory that holds this script.
const ROOT = resolve(__dirname, "../..");
// ---------------------------------------------------------------------------
// CLI argument parsing
// ---------------------------------------------------------------------------
// Skip the first two argv entries (node binary and script path).
const [, , ...args] = process.argv;
33
/**
 * Report whether a boolean switch was passed on the command line.
 *
 * Matches the long form `--<name>`. Single-character names additionally match
 * the short form `-<name>`, so that `getFlag("h")` recognises the `-h` switch
 * advertised in the help text (previously only the literal `--h` matched).
 *
 * @param {string} name - Flag name without leading dashes.
 * @returns {boolean} True when the flag is present in `args`.
 */
function getFlag(name) {
    if (args.includes(`--${name}`)) {
        return true;
    }
    // Short form is only meaningful for one-letter flags such as `-h`.
    return name.length === 1 && args.includes(`-${name}`);
}
36
/**
 * Look up the value that follows `--<name>` on the command line.
 *
 * @param {string} name - Option name without the leading dashes.
 * @returns {string | undefined} The argument immediately after the option, or
 *   `undefined` when the option is absent or is the last argument.
 */
function getOption(name) {
    const position = args.indexOf(`--${name}`);
    if (position === -1) {
        return undefined;
    }
    const valueIndex = position + 1;
    if (valueIndex >= args.length) {
        return undefined;
    }
    return args[valueIndex];
}
40
// Raw --replications value; "5" is used when the option is not supplied.
const replicationsStr = getOption("replications") ?? "5";
const replications = Number.parseInt(replicationsStr, 10);
// Eval results file to re-grade; defaults to the latest run under ROOT.
const defaultResultsPath = join(ROOT, "results", "latest", "eval-results.json");
const resultsPath = getOption("results") ?? defaultResultsPath;
const showHelp = ["help", "h"].some((flag) => getFlag(flag));
if (showHelp) {
    // The leading and trailing empty entries reproduce the blank first and
    // last lines of the help text.
    const helpLines = [
        "",
        "Usage: pnpm grader-consistency [options]",
        "",
        "Measure grader consistency by re-grading existing eval responses N times.",
        "",
        "Options:",
        "--replications <n> Number of additional grading replications (default: 5)",
        "--results <path> Path to eval-results.json (default: results/latest/eval-results.json)",
        "--help, -h Show this help",
        "",
        "Examples:",
        "pnpm grader-consistency # 5 replications",
        "pnpm grader-consistency --replications 3 # fewer replications (faster)",
        "pnpm grader-consistency --results results/latest/eval-results-agentic.json",
        "",
    ];
    console.log(helpLines.join("\n"));
    process.exit(0);
}
62
- // RawPromptfooFile and RawTestResult imported from calculate-scores.ts
63
- // ---------------------------------------------------------------------------
64
- // Rubric dimension classification (similar to calculate-scores)
65
- // ---------------------------------------------------------------------------
66
/**
 * Map a graded assertion component to the rubric dimension it scores.
 *
 * Structured metadata (`assertion.metadata.dimension`) wins when present; an
 * unrecognised metadata value yields `null` without falling back. Otherwise
 * the rubric text is matched case-insensitively against known phrases.
 *
 * @param {object} component - Component result; only `component.assertion` is read.
 * @returns {"taskCompletion" | "codeCorrectness" | "docCoverage" | null}
 *   The dimension key, or `null` when the component cannot be classified.
 */
function classifyDimension(component) {
    const assertion = component.assertion;
    // Prefer structured metadata
    const declared = assertion?.metadata?.dimension;
    if (declared) {
        switch (declared) {
            case "code-correctness":
                return "codeCorrectness";
            case "doc-coverage":
                return "docCoverage";
            case "task-completion":
                return "taskCompletion";
            default:
                return null;
        }
    }
    // Fallback: heuristic name matching. Only string rubric values can be
    // matched; calling toLowerCase() on any other value type would throw.
    const rawValue = assertion?.value;
    if (typeof rawValue !== "string") {
        return null;
    }
    const value = rawValue.toLowerCase();
    if (value.includes("task completion")) {
        return "taskCompletion";
    }
    if (value.includes("code correctness")) {
        return "codeCorrectness";
    }
    if (value.includes("documentation coverage") || value.includes("hallucinate")) {
        return "docCoverage";
    }
    return null;
}
91
- // ---------------------------------------------------------------------------
92
- // Grading judgment extraction
93
- // ---------------------------------------------------------------------------
94
/**
 * Guess the feature area of a test from its description.
 *
 * Rules are checked in order and the first match wins; matching is
 * case-insensitive. The "groq" rule requires the description to start with
 * "groq", while every other rule is a substring match.
 *
 * @param {string} description - Test case description.
 * @returns {string} The area slug, or "other" when no rule matches.
 */
function detectFeatureArea(description) {
    const text = description.toLowerCase();
    const mentionsAny = (...needles) => needles.some((needle) => text.includes(needle));
    const rules = [
        ["studio-setup", () => mentionsAny("studio")],
        ["visual-editing", () => mentionsAny("visual", "presentation", "live preview")],
        ["functions", () => mentionsAny("function", "webhook")],
        ["groq", () => text.startsWith("groq")],
        ["nextjs-live", () => mentionsAny("next", "app router")],
        ["frameworks", () => mentionsAny("remix", "nuxt", "svelte")],
    ];
    for (const [area, matches] of rules) {
        if (matches()) {
            return area;
        }
    }
    return "other";
}
114
/**
 * Extract all llm-rubric grading judgments from eval results.
 * Only includes gold (with-docs) tests to keep the analysis focused.
 *
 * A result is skipped when it has no `gradingResult` or when `vars.docs` is
 * not a non-blank string. A component is skipped when it is not an
 * `llm-rubric` assertion, when `classifyDimension` returns a falsy value, or
 * when its rubric value is not a non-empty string.
 *
 * @param {object} file - Parsed eval results; iterates `file.results.results`.
 * @returns {Array<object>} One entry per gradable component with `area`,
 *   `description`, `dimension`, `originalScore`, `providerId`, `responseText`
 *   and `rubricText`.
 */
function extractGradingJudgments(file) {
    const judgments = [];
    for (const result of file.results.results) {
        if (!result.gradingResult) {
            continue;
        }
        // Only grade "gold" (with-docs) tests — baseline tests have abbreviated rubrics.
        // Non-string docs are treated as absent instead of crashing on .trim().
        const docs = result.vars?.docs;
        const hasDocs = typeof docs === "string" && docs.trim().length > 0;
        if (!hasDocs) {
            continue;
        }
        const description = result.testCase?.description ?? "unknown";
        const area = detectFeatureArea(description);
        const providerId = result.provider?.id;
        // A grading result without component results contributes no judgments.
        const components = result.gradingResult.componentResults ?? [];
        for (const comp of components) {
            if (comp.assertion?.type !== "llm-rubric") {
                continue;
            }
            const dimension = classifyDimension(comp);
            if (!dimension) {
                continue;
            }
            const rubricText = typeof comp.assertion.value === "string" ? comp.assertion.value : "";
            if (!rubricText) {
                continue;
            }
            judgments.push({
                area,
                description,
                dimension,
                // Non-numeric scores are recorded as 0.
                originalScore: typeof comp.score === "number" ? comp.score : 0,
                providerId,
                responseText: result.response?.output ?? "",
                rubricText,
            });
        }
    }
    return judgments;
}
152
- // ---------------------------------------------------------------------------
153
- // Main
154
- // ---------------------------------------------------------------------------
155
/**
 * Entry point: re-grade every extracted judgment `replications` times,
 * analyze the score spread, print the report and write it to
 * `<ROOT>/results/latest/grader-consistency.json`.
 *
 * Exits the process with code 1 when the results file is missing, the
 * replication count is not a whole number of at least 2, the grader model
 * cannot be read from the results config, or no gradable judgments exist.
 *
 * @returns {Promise<void>}
 */
async function main() {
    console.log("=== Grader Consistency Analysis ===\n");
    // Validate inputs
    if (!existsSync(resultsPath)) {
        console.error(`❌ Results file not found: ${resultsPath}`);
        console.error("Run 'pnpm eval' first to generate results.");
        process.exit(1);
    }
    // parseInt yields NaN for non-numeric input and `NaN < 2` is false, so a
    // bad value would otherwise slip through and run zero replications.
    if (!Number.isInteger(replications)) {
        console.error("❌ --replications must be a whole number.");
        process.exit(1);
    }
    if (replications < 2) {
        console.error("❌ Need at least 2 replications for meaningful analysis.");
        process.exit(1);
    }
    // Load eval results
    console.log(` Results: ${resultsPath}`);
    console.log(` Replications: ${replications}`);
    const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
    // Extract grader model: prefer rubricProvider, fall back to provider.
    const testOptions = file.config?.defaultTest?.options;
    const graderModel = testOptions?.rubricProvider ?? testOptions?.provider;
    if (!graderModel) {
        console.error("❌ Could not determine grader model from eval results config.");
        process.exit(1);
    }
    console.log(` Grader: ${graderModel}`);
    // Extract judgments
    const judgments = extractGradingJudgments(file);
    console.log(` Judgments: ${judgments.length} (gold tests × rubric dimensions)`);
    if (judgments.length === 0) {
        console.error("❌ No gradable judgments found in results.");
        process.exit(1);
    }
    const totalCalls = judgments.length * replications;
    // Flat per-call estimate used only for the printed cost line.
    const estimatedCost = totalCalls * 0.005;
    console.log(` API calls: ${totalCalls} (${judgments.length} × ${replications})`);
    console.log(` Est. cost: ~$${estimatedCost.toFixed(2)}`);
    console.log();
    // Re-grade each judgment N times
    console.log(` Running ${replications} replications per judgment...`);
    const gradings = [];
    let completed = 0;
    let failed = 0;
    for (const judgment of judgments) {
        const scores = [judgment.originalScore]; // Include the original as replication #0
        for (let i = 0; i < replications; i++) {
            const score = await gradeOnce(graderModel, judgment.responseText, judgment.rubricText);
            if (score !== null) {
                scores.push(score);
            }
            else {
                // A null score counts as a failed call and is left out of `scores`.
                failed++;
            }
        }
        completed++;
        // Refresh the single-line progress indicator every 10 judgments and at the end.
        if (completed % 10 === 0 || completed === judgments.length) {
            const pct = Math.round((completed / judgments.length) * 100);
            process.stdout.write(`\r Progress: ${completed}/${judgments.length} (${pct}%)`);
        }
        gradings.push({
            area: judgment.area,
            dimension: judgment.dimension,
            providerId: judgment.providerId,
            scores,
            taskId: judgment.description,
        });
    }
    console.log(); // newline after progress
    if (failed > 0) {
        console.log(` ⚠ ${failed} grading calls failed (excluded from analysis)`);
    }
    console.log();
    // Analyze consistency
    const result = analyzeConsistency(gradings);
    // Print report
    printReport(result, graderModel);
    // Write output
    const outDir = join(ROOT, "results", "latest");
    mkdirSync(outDir, { recursive: true });
    const outPath = join(outDir, "grader-consistency.json");
    writeFileSync(outPath, JSON.stringify(result, null, 2));
    console.log(`\n 📄 Results written to ${outPath}`);
}
236
- // ---------------------------------------------------------------------------
237
- // Report formatting
238
- // ---------------------------------------------------------------------------
239
/**
 * Print the grader consistency report to stdout.
 *
 * Sections, in order: header, overall statistics, per-dimension table, noise
 * threshold recommendation, and up to five entries from `result.judgments`
 * (taken in the order given).
 *
 * @param {object} result - Analysis output; reads `replications`,
 *   `totalJudgments`, `avgStdDev`, `maxStdDev`, `avgRange`, `perDimension`,
 *   `recommendedThreshold` and `judgments`.
 * @param {string} graderModel - Grader identifier echoed in the header.
 * @returns {void}
 */
function printReport(result, graderModel) {
    const blank = () => console.log();
    const rule = (char) => console.log(char.repeat(80));
    // Dash-ruled section title followed by an empty line.
    const section = (title) => {
        rule("-");
        console.log(title);
        rule("-");
        blank();
    };
    rule("=");
    console.log(" GRADER CONSISTENCY REPORT");
    rule("=");
    blank();
    console.log(` Grader model: ${graderModel}`);
    console.log(` Replications: ${result.replications} (incl. original)`);
    console.log(` Judgments: ${result.totalJudgments}`);
    blank();
    // Overall stats
    section("OVERALL");
    console.log(` Avg σ: ${result.avgStdDev}`);
    console.log(` Max σ: ${result.maxStdDev}`);
    console.log(` Avg range: ${result.avgRange} points`);
    blank();
    // Per-dimension table
    section("PER-DIMENSION CONSISTENCY");
    console.log("| Dimension | Avg σ | Max σ | Avg Range | Judgments |");
    console.log("|------------------|-------|-------|-----------|-----------|");
    const dimensionRows = [
        ["taskCompletion", "Task Completion"],
        ["codeCorrectness", "Code Correctness"],
        ["docCoverage", "Doc Coverage"],
    ];
    for (const [key, label] of dimensionRows) {
        const stats = result.perDimension[key];
        const cells = [
            label.padEnd(16),
            String(stats.avgStdDev).padStart(5),
            String(stats.maxStdDev).padStart(5),
            String(stats.avgRange).padStart(9),
            String(stats.judgmentCount).padStart(9),
        ];
        console.log(`| ${cells.join(" | ")} |`);
    }
    blank();
    // Noise threshold recommendation
    section("NOISE THRESHOLD RECOMMENDATION");
    const recommended = result.recommendedThreshold;
    console.log(` Current default: ±2 (DEFAULT_NOISE_THRESHOLD)`);
    console.log(` Recommended: ±${recommended} (based on 2× max dimension σ)`);
    if (recommended > 2) {
        console.log(` ⚠ Current threshold may be too low — comparison deltas within ±${recommended}`);
        console.log(` should be treated as noise, not real changes.`);
    }
    else {
        console.log(` ✅ Current threshold is adequate for this grader's consistency.`);
    }
    blank();
    // Top 5 noisiest judgments
    const topN = Math.min(5, result.judgments.length);
    if (topN === 0) {
        return;
    }
    section(`TOP ${topN} NOISIEST JUDGMENTS`);
    result.judgments.slice(0, topN).forEach((j, index) => {
        const provider = j.providerId ? ` [${j.providerId}]` : "";
        console.log(` ${index + 1}. ${j.taskId}${provider}`);
        console.log(` ${j.dimension}: σ=${j.stdDev}, range=${j.range} (${j.min}–${j.max}), mean=${j.mean}`);
    });
    blank();
}
306
// Only run when invoked directly: the script path passed to node must end
// with this file's name (source or compiled form).
if (["grader-consistency.ts", "grader-consistency.js"].some((scriptName) => process.argv[1]?.endsWith(scriptName))) {
    main().catch((err) => {
        console.error("❌ Fatal error:", err);
        process.exit(1);
    });
}
@@ -1,22 +0,0 @@
1
- /**
2
- * grader-sensitivity.ts
3
- *
4
- * CLI for grader sensitivity testing (Phase 4 of grader reliability).
5
- *
6
- * Loads reference solutions, programmatically degrades them, grades both
7
- * the original and degraded versions with the grader model, then measures
8
- * whether the grader can distinguish quality levels.
9
- *
10
- * Usage:
11
- * pnpm grader-sensitivity # test all reference solutions
12
- * pnpm grader-sensitivity --area groq # test only groq area
13
- * pnpm grader-sensitivity --format json # machine-readable output
14
- *
15
- * Reads: canonical/reference-solutions/**\/*.ts
16
- * Reads: config/models.yaml (grader model)
17
- * Reads: config/rubrics.yaml (dimension rubric templates)
18
- * Writes: results/latest/grader-sensitivity.json
19
- *
20
- * @see docs/exec-plans/completed/grader-reliability.md — Phase 4
21
- */
22
- export {};
@@ -1,354 +0,0 @@
1
- /**
2
- * grader-sensitivity.ts
3
- *
4
- * CLI for grader sensitivity testing (Phase 4 of grader reliability).
5
- *
6
- * Loads reference solutions, programmatically degrades them, grades both
7
- * the original and degraded versions with the grader model, then measures
8
- * whether the grader can distinguish quality levels.
9
- *
10
- * Usage:
11
- * pnpm grader-sensitivity # test all reference solutions
12
- * pnpm grader-sensitivity --area groq # test only groq area
13
- * pnpm grader-sensitivity --format json # machine-readable output
14
- *
15
- * Reads: canonical/reference-solutions/**\/*.ts
16
- * Reads: config/models.yaml (grader model)
17
- * Reads: config/rubrics.yaml (dimension rubric templates)
18
- * Writes: results/latest/grader-sensitivity.json
19
- *
20
- * @see docs/exec-plans/completed/grader-reliability.md — Phase 4
21
- */
22
- import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
23
- import { dirname, join, resolve } from "path";
24
- import { fileURLToPath } from "url";
25
- import { load } from "js-yaml";
26
- import { DEGRADATION_STRATEGIES, } from "../pipeline/degradations.js";
27
- import { analyzeSensitivity, } from "../pipeline/grader-sensitivity.js";
28
- import { gradeOnce, loadGraderModel } from "./grader-api.js";
29
// Directory of this module; ES modules do not define __dirname themselves.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Two directory levels above the directory that holds this script.
const ROOT = resolve(__dirname, "../..");
// ---------------------------------------------------------------------------
// CLI argument parsing
// ---------------------------------------------------------------------------
// Drop the node binary and script path, keeping only user-supplied arguments.
const [, , ...args] = process.argv;
35
/**
 * Report whether a boolean switch was passed on the command line.
 *
 * Matches the long form `--<name>`. Single-character names additionally match
 * the short form `-<name>`, so that `getFlag("h")` recognises the `-h` switch
 * advertised in the help text (previously only the literal `--h` matched).
 *
 * @param {string} name - Flag name without leading dashes.
 * @returns {boolean} True when the flag is present in `args`.
 */
function getFlag(name) {
    const candidates = [`--${name}`];
    if (name.length === 1) {
        // Short form is only meaningful for one-letter flags such as `-h`.
        candidates.push(`-${name}`);
    }
    return candidates.some((candidate) => args.includes(candidate));
}
38
/**
 * Return the argument that follows `--<name>` on the command line.
 *
 * @param {string} name - Option name without the leading dashes.
 * @returns {string | undefined} The following argument, or `undefined` when
 *   the option is missing or nothing comes after it.
 */
function getOption(name) {
    // indexOf returns -1 when the option is absent, which makes valueIndex 0.
    const valueIndex = args.indexOf(`--${name}`) + 1;
    const hasValue = valueIndex > 0 && valueIndex < args.length;
    return hasValue ? args[valueIndex] : undefined;
}
42
// Optional --area value; undefined when the option is not supplied.
const areaFilter = getOption("area");
// Output format; "table" unless --format is supplied.
const format = getOption("format") ?? "table";
// Optional --output path.
const outputPath = getOption("output");
const showHelp = ["help", "h"].some((flag) => getFlag(flag));
if (showHelp) {
    // The leading and trailing empty entries reproduce the blank first and
    // last lines of the help text.
    const helpLines = [
        "",
        "Usage: pnpm grader-sensitivity [options]",
        "",
        "Test grader discrimination power using programmatic code degradation.",
        "",
        "Options:",
        "--area <name> Test only reference solutions in this area (e.g., groq)",
        "--format <fmt> Output format: table (default) or json",
        "--output <path> Write JSON report to file",
        "--help, -h Show this help",
        "",
        "Examples:",
        "pnpm grader-sensitivity # test all reference solutions",
        "pnpm grader-sensitivity --area groq # test only GROQ area",
        "pnpm grader-sensitivity --format json # machine-readable output",
        "",
    ];
    console.log(helpLines.join("\n"));
    process.exit(0);
}
65
- // ---------------------------------------------------------------------------
66
- // Types
67
- // ---------------------------------------------------------------------------
68
- // DimensionName imported from pipeline/types.ts
69
// Rubric dimension keys, in the order the reports list them.
const DIMENSION_NAMES = ["taskCompletion", "codeCorrectness", "docCoverage"];
74
- // ---------------------------------------------------------------------------
75
- // Rubric prompts and reference solution discovery
76
- // ---------------------------------------------------------------------------
77
/**
 * Build a rubric prompt for a specific dimension.
 *
 * The prompt is the template header, an empty line, one "- " bullet per
 * scale entry, an empty line, and the shared footer, joined with newlines.
 * Any dimension other than "taskCompletion" or "codeCorrectness" uses the
 * "doc-coverage" template.
 *
 * @param {object} rubrics - Rubric config; reads `templates` and `footer`.
 * @param {string} dimension - Dimension key such as "taskCompletion".
 * @returns {string} The assembled prompt text.
 * @throws {Error} When the selected template is missing.
 */
function buildRubricPrompt(rubrics, dimension) {
    let templateKey;
    switch (dimension) {
        case "taskCompletion":
            templateKey = "task-completion";
            break;
        case "codeCorrectness":
            templateKey = "code-correctness";
            break;
        default:
            templateKey = "doc-coverage";
    }
    const template = rubrics.templates[templateKey];
    if (!template) {
        throw new Error(`No rubric template for ${dimension}`);
    }
    const scaleBullets = template.scale.map((entry) => `- ${entry}`);
    return [template.header, "", ...scaleBullets, "", rubrics.footer].join("\n");
}
97
/**
 * Collect reference solution files from `<ROOT>/canonical/reference-solutions`.
 *
 * Each immediate subdirectory is treated as an area. Within an area, only
 * files ending in `.ts` or `.tsx` are read (no recursion). When `areaFilter`
 * is set, only the area with that exact name is scanned. Exits the process
 * with code 1 when the reference solutions directory does not exist.
 *
 * @returns {Array<{area: string, content: string, relativePath: string}>}
 *   One entry per solution file, with its UTF-8 content.
 */
function discoverReferenceSolutions() {
    const refDir = join(ROOT, "canonical", "reference-solutions");
    if (!existsSync(refDir)) {
        console.error(`❌ Reference solutions directory not found: ${refDir}`);
        process.exit(1);
    }
    const isSolutionFile = (fileName) => fileName.endsWith(".ts") || fileName.endsWith(".tsx");
    const areaNames = readdirSync(refDir, { withFileTypes: true })
        .filter((entry) => entry.isDirectory())
        .map((entry) => entry.name)
        .filter((areaName) => !areaFilter || areaName === areaFilter);
    return areaNames.flatMap((area) => {
        const areaDir = join(refDir, area);
        return readdirSync(areaDir)
            .filter(isSolutionFile)
            .map((file) => ({
                area,
                content: readFileSync(join(areaDir, file), "utf-8"),
                relativePath: `${area}/${file}`,
            }));
    });
}
123
- // ---------------------------------------------------------------------------
124
- // Report formatting
125
- // ---------------------------------------------------------------------------
126
/**
 * Print the grader sensitivity report to stdout.
 *
 * Sections, in order: overall figures, per-dimension table, on-target versus
 * off-target comparison, per-degradation table (only when `byDegradation` is
 * non-empty), up to five entries from `failedPairs` (taken in the order
 * given), and a verdict based on the overall concordance rate
 * (>= 90 excellent, >= 75 acceptable, otherwise poor).
 *
 * @param {object} result - Analysis output; reads `graderModel`, `totalPairs`,
 *   `concordanceRate`, `avgSeparation`, `failedPairs`, `perDimension`,
 *   `crossDimension` and `byDegradation`.
 * @returns {void}
 */
function formatSensitivityReport(result) {
    const blank = () => console.log();
    // Dash-ruled section title followed by an empty line.
    const section = (title) => {
        const rule = "-".repeat(80);
        console.log(rule);
        console.log(title);
        console.log(rule);
        blank();
    };
    // Short column label for a degradation's target dimension.
    const shortDimension = (dimension) => {
        if (dimension === "taskCompletion") {
            return "Task";
        }
        if (dimension === "codeCorrectness") {
            return "Code";
        }
        return "Docs";
    };
    section("OVERALL SENSITIVITY");
    console.log(` Grader: ${result.graderModel}`);
    console.log(` Total pairs: ${result.totalPairs}`);
    console.log(` Concordance rate: ${result.concordanceRate}%`);
    console.log(` Avg separation: ${result.avgSeparation} points`);
    console.log(` Failed pairs: ${result.failedPairs.length}`);
    blank();
    // Per-dimension table
    section("PER-DIMENSION SENSITIVITY");
    console.log("| Dimension | Concordance | Avg Sep | Tied | Pairs |");
    console.log("|------------------|-------------|---------|-------|-------|");
    const dimensionRows = [
        ["taskCompletion", "Task Completion"],
        ["codeCorrectness", "Code Correctness"],
        ["docCoverage", "Doc Coverage"],
    ];
    for (const [key, label] of dimensionRows) {
        const stats = result.perDimension[key];
        const cells = [
            label.padEnd(16),
            `${stats.concordanceRate}%`.padStart(11),
            String(stats.avgSeparation).padStart(7),
            `${stats.tiedRate}%`.padStart(5),
            String(stats.pairCount).padStart(5),
        ];
        console.log(`| ${cells.join(" | ")} |`);
    }
    blank();
    // Cross-dimension analysis
    section("CROSS-DIMENSION ANALYSIS");
    const { onTarget, offTarget } = result.crossDimension;
    console.log(` On-target: ${onTarget.concordanceRate}% concordance, ${onTarget.avgSeparation} avg sep (${onTarget.pairCount} pairs)`);
    console.log(` Off-target: ${offTarget.concordanceRate}% concordance, ${offTarget.avgSeparation} avg sep (${offTarget.pairCount} pairs)`);
    blank();
    const targetedIsSharper = onTarget.concordanceRate > offTarget.concordanceRate;
    console.log(targetedIsSharper
        ? " ✅ Grader is more sensitive to targeted degradations (expected)"
        : " ⚠ Grader is equally or less sensitive to targeted degradations");
    blank();
    // Per-degradation breakdown
    if (result.byDegradation.length > 0) {
        section("PER-DEGRADATION SENSITIVITY (worst first)");
        console.log("| Degradation | Target | Conc | Sep | N |");
        console.log("|------------------------------------------------|------------|-------|-------|----|");
        for (const entry of result.byDegradation) {
            const cells = [
                // Descriptions are cut to the 46-character column width.
                entry.description.slice(0, 46).padEnd(46),
                shortDimension(entry.targetDimension).padEnd(10),
                `${entry.concordanceRate}%`.padStart(5),
                String(entry.avgSeparation).padStart(5),
                String(entry.pairCount).padStart(2),
            ];
            console.log(`| ${cells.join(" | ")} |`);
        }
        blank();
    }
    // Top failures
    const topN = Math.min(5, result.failedPairs.length);
    if (topN > 0) {
        section(`TOP ${topN} DISCRIMINATION FAILURES (grader preferred degraded)`);
        result.failedPairs.slice(0, topN).forEach((pair, index) => {
            const delta = pair.degradedScore - pair.originalScore;
            console.log(` ${index + 1}. ${pair.sourcePath} — ${pair.dimension}`);
            console.log(` Original=${pair.originalScore}, Degraded=${pair.degradedScore} (+${delta} for degraded)`);
            console.log(` Degradation: ${pair.degradationDescription}`);
        });
        blank();
    }
    // Verdict
    section("VERDICT");
    const rate = result.concordanceRate;
    if (rate >= 90) {
        console.log(` ✅ EXCELLENT: Grader correctly discriminates ${rate}% of pairs`);
    }
    else if (rate >= 75) {
        console.log(` ⚠️ ACCEPTABLE: Grader correctly discriminates ${rate}% of pairs`);
    }
    else {
        console.log(` ❌ POOR: Grader only discriminates ${rate}% of pairs`);
    }
    blank();
}
225
/**
 * Build the (original, degraded) comparison pairs for a set of solutions.
 *
 * Every strategy in `DEGRADATION_STRATEGIES` is applied to every solution's
 * content. A pair is emitted only when the strategy's output is not strictly
 * equal to the original content, i.e. the strategy actually changed the code.
 *
 * @param {Iterable<{content: string, relativePath: string}>} solutions
 *   Solutions to degrade; `content` is passed to each strategy's `apply`.
 * @returns {Array<{degradation: object, degraded: string, original: string, sourcePath: string}>}
 *   One entry per (solution, strategy) combination that produced a change,
 *   in solution-major, strategy-minor order.
 */
function generateDegradedPairs(solutions) {
    const collected = [];
    for (const { content, relativePath } of solutions) {
        for (const strategy of DEGRADATION_STRATEGIES) {
            const mutated = strategy.apply(content);
            // A strategy that left the code untouched gives the grader nothing to compare.
            if (mutated === content) {
                continue;
            }
            collected.push({
                degradation: strategy,
                degraded: mutated,
                original: content,
                sourcePath: relativePath,
            });
        }
    }
    return collected;
}
243
- // ---------------------------------------------------------------------------
244
- // Rubric loading
245
- // ---------------------------------------------------------------------------
246
/**
 * Read `config/rubrics.yaml` under `ROOT` as UTF-8 text and parse it.
 *
 * @returns {*} The value produced by `load` for the file's contents
 *   (presumably the YAML parser imported at the top of the file — confirm).
 */
function loadRubrics() {
    return load(readFileSync(join(ROOT, "config", "rubrics.yaml"), "utf-8"));
}
251
- // ---------------------------------------------------------------------------
252
- // Main entry point
253
- // ---------------------------------------------------------------------------
254
/**
 * Run the grader sensitivity check end to end.
 *
 * Steps, in order:
 *   1. Load the grader model and rubrics and print a banner.
 *   2. Discover reference solutions; exit the process with code 1 if none exist.
 *   3. Generate degraded variants of each solution.
 *   4. For every pair and every entry of `DIMENSION_NAMES`, grade the original
 *      and the degraded version (sequentially) and record pairs where both
 *      gradings returned a non-null score.
 *   5. Analyze the collected pairs and emit the result either as JSON
 *      (to `outputPath` when set, otherwise stdout) or as a formatted report.
 *   6. Best-effort: also save the result as JSON under `results/latest/`.
 *
 * Reads the module-level `areaFilter`, `format` and `outputPath` settings.
 *
 * @returns {Promise<void>} Resolves once reporting has finished; rejects if a
 *   grading call or the requested `outputPath` write throws.
 */
async function main() {
    console.log("=".repeat(80));
    console.log(" GRADER SENSITIVITY TESTING");
    console.log("=".repeat(80));
    console.log();
    // Load config
    const grader = loadGraderModel();
    const rubrics = loadRubrics();
    console.log(` Grader: ${grader.id} (${grader.label})`);
    if (areaFilter) {
        console.log(` Area filter: ${areaFilter}`);
    }
    console.log();
    // Discover reference solutions
    const solutions = discoverReferenceSolutions();
    console.log(` Reference solutions: ${solutions.length}`);
    if (solutions.length === 0) {
        console.error("❌ No reference solutions found.");
        process.exit(1);
    }
    for (const s of solutions) {
        console.log(` ${s.relativePath} (${s.content.split("\n").length} lines)`);
    }
    console.log();
    // Generate degraded pairs
    const degradedPairs = generateDegradedPairs(solutions);
    // Each pair is graded on every dimension, once per version (original + degraded).
    const totalGradings = degradedPairs.length * DIMENSION_NAMES.length * 2;
    console.log(` Degraded pairs: ${degradedPairs.length}`);
    console.log(` Total gradings: ${totalGradings} (${degradedPairs.length} pairs × ${DIMENSION_NAMES.length} dimensions × 2 versions)`);
    console.log();
    // Grade each pair on each dimension
    const sensitivityPairs = [];
    let completed = 0;
    let failed = 0;
    for (const pair of degradedPairs) {
        // The first path segment of the solution's relative path is used as its area.
        const area = pair.sourcePath.split("/")[0];
        for (const dimension of DIMENSION_NAMES) {
            const rubricText = buildRubricPrompt(rubrics, dimension);
            // Grade original
            const originalScore = await gradeOnce(grader.id, pair.original, rubricText);
            completed++;
            // Grade degraded
            const degradedScore = await gradeOnce(grader.id, pair.degraded, rubricText);
            completed++;
            if (originalScore !== null && degradedScore !== null) {
                sensitivityPairs.push({
                    area,
                    degradationDescription: pair.degradation.description,
                    degradedScore,
                    dimension,
                    originalScore,
                    sourcePath: pair.sourcePath,
                    targetDimension: pair.degradation.targetDimension,
                });
            }
            else {
                // A null score from either grading makes the pair unusable.
                failed++;
            }
            // Redraw the progress line in place every 20 gradings and at the very end.
            if (completed % 20 === 0 || completed === totalGradings) {
                process.stdout.write(`\r Progress: ${completed}/${totalGradings} gradings${failed > 0 ? ` (${failed} failed)` : ""}`);
            }
        }
    }
    console.log();
    console.log();
    // Analyze
    const result = analyzeSensitivity(sensitivityPairs, grader.id);
    // Output
    if (format === "json") {
        const json = JSON.stringify(result, null, 2);
        if (outputPath) {
            writeFileSync(outputPath, json);
            console.log(` ✅ Report written to ${outputPath}`);
        }
        else {
            console.log(json);
        }
    }
    else {
        formatSensitivityReport(result);
    }
    // Write to results/latest/
    const resultsDir = join(ROOT, "results", "latest");
    try {
        mkdirSync(resultsDir, { recursive: true });
        const resultFilePath = join(resultsDir, "grader-sensitivity.json");
        writeFileSync(resultFilePath, JSON.stringify(result, null, 2));
        console.log(` 📄 Report saved: ${resultFilePath}`);
    }
    catch (err) {
        // Saving this copy is best-effort: the report has already been emitted
        // above, so a failure here (permissions, read-only filesystem, ...) must
        // not fail the run. Surface it on stderr instead of hiding it.
        const reason = err instanceof Error ? err.message : String(err);
        console.warn(` ⚠ Could not save report to ${resultsDir}: ${reason}`);
    }
}
347
// Start the CLI only when this file is the script Node was launched with,
// i.e. the entry script path ends in the source (.ts) or compiled (.js) name.
// When `process.argv[1]` is absent, nothing runs.
if (["grader-sensitivity.ts", "grader-sensitivity.js"].some((scriptName) => process.argv[1]?.endsWith(scriptName))) {
    main().catch((err) => {
        // Any rejection from main() is fatal: report it and exit non-zero.
        console.error("❌ Fatal error:", err);
        process.exit(1);
    });
}