@sanity/ailf 2.0.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. package/LICENSE +21 -0
  2. package/dist/cli.js +0 -0
  3. package/package.json +24 -24
  4. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  5. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  6. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  7. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  8. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  9. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  10. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  11. package/dist/_vendor/ailf-tasks/index.js +0 -16
  12. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  13. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  14. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  15. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  16. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  17. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  18. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  19. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  20. package/dist/agent-observer/test-imports.d.ts +0 -7
  21. package/dist/agent-observer/test-imports.js +0 -185
  22. package/dist/commands/update-quality-scores.d.ts +0 -5
  23. package/dist/commands/update-quality-scores.js +0 -20
  24. package/dist/lib/agent-behavior-report.d.ts +0 -8
  25. package/dist/lib/agent-behavior-report.js +0 -185
  26. package/dist/lib/baseline.d.ts +0 -19
  27. package/dist/lib/baseline.js +0 -153
  28. package/dist/lib/calculate-scores.d.ts +0 -23
  29. package/dist/lib/calculate-scores.js +0 -42
  30. package/dist/lib/compare.d.ts +0 -18
  31. package/dist/lib/compare.js +0 -170
  32. package/dist/lib/coverage-audit.d.ts +0 -4
  33. package/dist/lib/coverage-audit.js +0 -42
  34. package/dist/lib/discovery-report.d.ts +0 -13
  35. package/dist/lib/discovery-report.js +0 -57
  36. package/dist/lib/fetch-docs.d.ts +0 -30
  37. package/dist/lib/fetch-docs.js +0 -171
  38. package/dist/lib/generate-configs.d.ts +0 -25
  39. package/dist/lib/generate-configs.js +0 -42
  40. package/dist/lib/grader-api.d.ts +0 -21
  41. package/dist/lib/grader-api.js +0 -34
  42. package/dist/lib/grader-compare.d.ts +0 -19
  43. package/dist/lib/grader-compare.js +0 -91
  44. package/dist/lib/grader-consistency.d.ts +0 -27
  45. package/dist/lib/grader-consistency.js +0 -79
  46. package/dist/lib/grader-sensitivity.d.ts +0 -19
  47. package/dist/lib/grader-sensitivity.js +0 -75
  48. package/dist/lib/grader-validate.d.ts +0 -19
  49. package/dist/lib/grader-validate.js +0 -78
  50. package/dist/lib/measure-retrieval.d.ts +0 -14
  51. package/dist/lib/measure-retrieval.js +0 -71
  52. package/dist/lib/pr-comment.d.ts +0 -16
  53. package/dist/lib/pr-comment.js +0 -28
  54. package/dist/lib/readiness-report.d.ts +0 -13
  55. package/dist/lib/readiness-report.js +0 -108
  56. package/dist/lib/webhook-server.d.ts +0 -11
  57. package/dist/lib/webhook-server.js +0 -24
  58. package/dist/lib/weekly-digest.d.ts +0 -24
  59. package/dist/lib/weekly-digest.js +0 -148
  60. package/dist/orchestration/env-bridge.d.ts +0 -21
  61. package/dist/orchestration/env-bridge.js +0 -66
  62. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  63. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  64. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  65. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  66. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  67. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  68. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  69. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  70. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  71. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  72. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  73. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  74. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  75. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  76. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  77. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  78. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  79. package/dist/pipeline/compiler/task-bridge.js +0 -92
  80. package/dist/pipeline/expand-tasks.d.ts +0 -232
  81. package/dist/pipeline/expand-tasks.js +0 -467
  82. package/dist/pipeline/generate-configs.d.ts +0 -92
  83. package/dist/pipeline/generate-configs.js +0 -445
  84. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  85. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  86. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  87. package/dist/pipeline/steps/compare-step.js +0 -90
  88. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  89. package/dist/pipeline/steps/eval-step.js +0 -347
  90. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  91. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  92. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  93. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  94. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  95. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  96. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  97. package/dist/pipeline/steps/publish-report-step.js +0 -243
  98. package/dist/pipeline/steps/report-step.d.ts +0 -13
  99. package/dist/pipeline/steps/report-step.js +0 -56
  100. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  101. package/dist/pipeline/steps/update-scores-step.js +0 -42
  102. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  103. package/dist/scripts/agent-behavior-report.js +0 -315
  104. package/dist/scripts/baseline.d.ts +0 -43
  105. package/dist/scripts/baseline.js +0 -267
  106. package/dist/scripts/calculate-scores.d.ts +0 -166
  107. package/dist/scripts/calculate-scores.js +0 -1296
  108. package/dist/scripts/compare.d.ts +0 -22
  109. package/dist/scripts/compare.js +0 -334
  110. package/dist/scripts/coverage-audit.d.ts +0 -44
  111. package/dist/scripts/coverage-audit.js +0 -209
  112. package/dist/scripts/debug-eval.d.ts +0 -19
  113. package/dist/scripts/debug-eval.js +0 -73
  114. package/dist/scripts/discovery-report.d.ts +0 -58
  115. package/dist/scripts/discovery-report.js +0 -250
  116. package/dist/scripts/fetch-docs.d.ts +0 -35
  117. package/dist/scripts/fetch-docs.js +0 -472
  118. package/dist/scripts/generate-configs.d.ts +0 -66
  119. package/dist/scripts/generate-configs.js +0 -459
  120. package/dist/scripts/grader-api.d.ts +0 -27
  121. package/dist/scripts/grader-api.js +0 -206
  122. package/dist/scripts/grader-compare.d.ts +0 -22
  123. package/dist/scripts/grader-compare.js +0 -368
  124. package/dist/scripts/grader-consistency.d.ts +0 -20
  125. package/dist/scripts/grader-consistency.js +0 -313
  126. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  127. package/dist/scripts/grader-sensitivity.js +0 -354
  128. package/dist/scripts/grader-validate.d.ts +0 -19
  129. package/dist/scripts/grader-validate.js +0 -267
  130. package/dist/scripts/measure-retrieval.d.ts +0 -10
  131. package/dist/scripts/measure-retrieval.js +0 -145
  132. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  133. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  134. package/dist/scripts/pipeline.d.ts +0 -76
  135. package/dist/scripts/pipeline.js +0 -1031
  136. package/dist/scripts/pr-comment.d.ts +0 -10
  137. package/dist/scripts/pr-comment.js +0 -510
  138. package/dist/scripts/readiness-report.d.ts +0 -88
  139. package/dist/scripts/readiness-report.js +0 -342
  140. package/dist/scripts/update-quality-scores.d.ts +0 -15
  141. package/dist/scripts/update-quality-scores.js +0 -184
  142. package/dist/scripts/validate-task-sources.d.ts +0 -21
  143. package/dist/scripts/validate-task-sources.js +0 -210
  144. package/dist/scripts/validate.d.ts +0 -13
  145. package/dist/scripts/validate.js +0 -79
  146. package/dist/scripts/webhook-server.d.ts +0 -26
  147. package/dist/scripts/webhook-server.js +0 -147
  148. package/dist/scripts/weekly-digest.d.ts +0 -24
  149. package/dist/scripts/weekly-digest.js +0 -144
  150. package/dist/sinks/format-slack.d.ts +0 -64
  151. package/dist/sinks/format-slack.js +0 -306
  152. package/dist/sinks/slack-sink.d.ts +0 -27
  153. package/dist/sinks/slack-sink.js +0 -78
  154. package/dist/sinks/webhook-sink.d.ts +0 -19
  155. package/dist/sinks/webhook-sink.js +0 -50
  156. package/tasks/.expanded.agentic.yaml +0 -280
  157. package/tasks/.expanded.yaml +0 -565
@@ -1,22 +0,0 @@
1
- /**
2
- * compare.ts
3
- *
4
- * CLI for structured comparison between two evaluation runs.
5
- *
6
- * Usage:
7
- * pnpm compare # compare current vs latest baseline
8
- * pnpm compare --baseline <path> # compare current vs specific file
9
- * pnpm compare --baseline <path> --experiment <path> # compare two specific files
10
- * pnpm compare --threshold 5 # custom noise threshold
11
- * pnpm compare --output /tmp/comparison.json # write JSON report to file
12
- * pnpm compare --format json # output raw JSON (default: table)
13
- *
14
- * Reads: results/latest/score-summary.json (as experiment, unless --experiment)
15
- * Reads: results/baselines/<latest>.json (as baseline, unless --baseline)
16
- */
17
- import { type ComparisonReport } from "../pipeline/types.js";
18
- /**
19
- * Generate a markdown comparison section suitable for PR comments.
20
- */
21
- export declare function formatComparisonMarkdown(report: ComparisonReport): string;
22
- export declare function formatComparisonTable(report: ComparisonReport): string;
@@ -1,334 +0,0 @@
1
- /**
2
- * compare.ts
3
- *
4
- * CLI for structured comparison between two evaluation runs.
5
- *
6
- * Usage:
7
- * pnpm compare # compare current vs latest baseline
8
- * pnpm compare --baseline <path> # compare current vs specific file
9
- * pnpm compare --baseline <path> --experiment <path> # compare two specific files
10
- * pnpm compare --threshold 5 # custom noise threshold
11
- * pnpm compare --output /tmp/comparison.json # write JSON report to file
12
- * pnpm compare --format json # output raw JSON (default: table)
13
- *
14
- * Reads: results/latest/score-summary.json (as experiment, unless --experiment)
15
- * Reads: results/baselines/<latest>.json (as baseline, unless --baseline)
16
- */
17
- import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
18
- import { dirname, join, resolve } from "path";
19
- import { fileURLToPath } from "url";
20
- import { compare } from "../pipeline/compare.js";
21
- import { DEFAULT_NOISE_THRESHOLD, } from "../pipeline/types.js";
22
- const __dirname = dirname(fileURLToPath(import.meta.url));
23
- const ROOT = resolve(__dirname, "..", "..");
24
- const BASELINES_DIR = join(ROOT, "results", "baselines");
25
- const SCORE_SUMMARY_PATH = join(ROOT, "results", "latest", "score-summary.json");
26
- // ---------------------------------------------------------------------------
27
- // CLI argument parsing
28
- // ---------------------------------------------------------------------------
29
// Command-line tokens that follow `node <script>`.
const args = process.argv.slice(2);
/**
 * Tell whether the switch `--<name>` appears on the command line.
 *
 * @param {string} name - Switch name without the leading dashes.
 * @returns {boolean} True when `--<name>` is one of the arguments.
 */
function getFlag(name) {
    const token = `--${name}`;
    return args.some((arg) => arg === token);
}
/**
 * Read the argument that directly follows `--<name>`.
 *
 * @param {string} name - Option name without the leading dashes.
 * @returns {string | undefined} The following token, or undefined when the
 *   option is absent or is the last argument.
 */
function getOption(name) {
    const position = args.indexOf(`--${name}`);
    if (position === -1) {
        return undefined;
    }
    const valueIndex = position + 1;
    return valueIndex < args.length ? args[valueIndex] : undefined;
}
37
// Resolve the CLI options once at module load.
const baselinePath = getOption("baseline");
const experimentPath = getOption("experiment");
const thresholdStr = getOption("threshold");
// parseFloat yields NaN for non-numeric input (e.g. `--threshold abc`); without
// this guard the NaN would be passed on as the noise threshold. Fall back to
// the default and tell the user instead.
const parsedThreshold = thresholdStr
    ? parseFloat(thresholdStr)
    : DEFAULT_NOISE_THRESHOLD;
if (Number.isNaN(parsedThreshold)) {
    console.warn(`⚠️ Ignoring invalid --threshold "${thresholdStr}"; using default ${DEFAULT_NOISE_THRESHOLD}`);
}
const threshold = Number.isNaN(parsedThreshold)
    ? DEFAULT_NOISE_THRESHOLD
    : parsedThreshold;
const outputPath = getOption("output");
const format = getOption("format") ?? "table";
// The help text advertises `-h`, but getFlag("h") only matches the literal
// `--h`. Accept the documented short form too (and keep `--h` working).
const showHelp = getFlag("help") || getFlag("h") || args.includes("-h");
if (showHelp) {
    console.log(`
Usage: pnpm compare [options]

Compare two evaluation score summaries and produce structured deltas.

Options:
--baseline <path> Baseline score-summary.json (default: latest baseline)
--experiment <path> Experiment score-summary.json (default: results/latest/score-summary.json)
--threshold <n> Noise threshold for unchanged classification (default: ${DEFAULT_NOISE_THRESHOLD})
--output <path> Write JSON report to file
--format <fmt> Output format: table (default) or json
--help, -h Show this help

Examples:
pnpm compare # current scores vs latest baseline
pnpm compare --threshold 5 # wider noise band
pnpm compare --format json # machine-readable output
pnpm compare --baseline results/baselines/20260310_02_43_44.json
pnpm compare --baseline before.json --experiment after.json
`);
    process.exit(0);
}
69
- // ---------------------------------------------------------------------------
70
- // Report formatters (exported)
71
- // ---------------------------------------------------------------------------
72
/**
 * Generate a markdown comparison section suitable for PR comments.
 *
 * The section contains a heading, the overall score line, a per-area table,
 * a one-line improved/regressed/unchanged tally, and a collapsible table of
 * per-dimension deltas (plus the cost delta when the report carries one).
 *
 * @param {object} report - Comparison report. Reads `deltas`, `noiseThreshold`,
 *   `baseline.overall.avgScore`, `experiment.overall.avgScore`, `areas`,
 *   `improved`, `regressed` and `unchanged`.
 * @returns {string} Markdown text, lines joined with "\n".
 */
export function formatComparisonMarkdown(report) {
    const lines = [];
    const overall = report.deltas.overall;
    // Classify the overall delta against the noise band.
    let overallChange = "unchanged";
    if (overall > report.noiseThreshold) {
        overallChange = "improved";
    }
    else if (overall < -report.noiseThreshold) {
        overallChange = "regressed";
    }
    const overallIcon = changeIcon(overallChange);
    lines.push("### 📊 Score Comparison");
    lines.push("");
    lines.push(`**Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)}** (${overallIcon} ${deltaStr(overall)})`);
    lines.push("");
    // Per-area table
    lines.push("| Feature | Baseline | Current | Delta | Task | Code | Docs |");
    lines.push("|---------|----------|---------|-------|------|------|------|");
    for (const a of report.areas) {
        const icon = changeIcon(a.change);
        lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${icon} ${deltaStr(a.delta)} | ${deltaStr(a.dimensions.taskCompletion.delta)} | ${deltaStr(a.dimensions.codeCorrectness.delta)} | ${deltaStr(a.dimensions.docCoverage.delta)} |`);
    }
    lines.push("");
    // Summary
    const parts = [];
    if (report.improved.length > 0) {
        parts.push(`📈 ${report.improved.length} improved`);
    }
    if (report.regressed.length > 0) {
        parts.push(`📉 ${report.regressed.length} regressed`);
    }
    if (report.unchanged.length > 0) {
        parts.push(`➡️ ${report.unchanged.length} unchanged`);
    }
    if (parts.length > 0) {
        lines.push(parts.join(" · "));
        lines.push("");
    }
    // Dimension averages in collapsible
    lines.push("<details>");
    lines.push("<summary>Dimension averages</summary>");
    lines.push("");
    const dim = report.deltas.perDimension;
    lines.push("| Dimension | Delta |");
    lines.push("|-----------|-------|");
    lines.push(`| Task Completion | ${deltaStr(dim.taskCompletion)} |`);
    lines.push(`| Code Correctness | ${deltaStr(dim.codeCorrectness)} |`);
    lines.push(`| Doc Coverage | ${deltaStr(dim.docCoverage)} |`);
    lines.push(`| Doc Lift | ${deltaStr(report.deltas.docLift)} |`);
    const cost = report.deltas.cost;
    if (cost !== undefined) {
        // Only non-zero deltas get a sign; a zero delta used to fall into the
        // "negative" branch and render as "-$0.0000".
        const sign = cost > 0 ? "+" : cost < 0 ? "-" : "";
        lines.push(`| Cost | ${sign}$${Math.abs(cost).toFixed(4)} |`);
    }
    lines.push("");
    lines.push("</details>");
    lines.push("");
    return lines.join("\n");
}
132
/**
 * Render a comparison report as a plain-text console report.
 *
 * Sections, in order: banner, overall score line, per-dimension deltas (plus
 * cost when present), per-area table, improved/regressed/unchanged lists,
 * area mismatches (when any), the noise-threshold line, and a ceiling
 * decomposition table when at least one area carries `ceilingDelta`.
 *
 * @param {object} report - Comparison report. Reads `deltas`, `noiseThreshold`,
 *   `noiseThresholdEmpirical`, `baseline`, `experiment`, `areas`, `improved`,
 *   `regressed`, `unchanged` and `mismatched`.
 * @returns {string} Report text, lines joined with "\n".
 */
export function formatComparisonTable(report) {
    const lines = [];
    lines.push("=".repeat(80));
    lines.push(" COMPARISON REPORT");
    lines.push("=".repeat(80));
    lines.push("");
    // Overall summary
    const overall = report.deltas.overall;
    let overallChange = "unchanged";
    if (overall > report.noiseThreshold) {
        overallChange = "improved";
    }
    else if (overall < -report.noiseThreshold) {
        overallChange = "regressed";
    }
    const overallIcon = changeIcon(overallChange);
    lines.push(` Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)} (${overallIcon} ${deltaStr(overall)})`);
    lines.push("");
    // Per-dimension averages
    const dim = report.deltas.perDimension;
    lines.push(" Dimension averages:");
    lines.push(` Task Completion: ${deltaStr(dim.taskCompletion)}`);
    lines.push(` Code Correctness: ${deltaStr(dim.codeCorrectness)}`);
    lines.push(` Doc Coverage: ${deltaStr(dim.docCoverage)}`);
    lines.push(` Doc Lift: ${deltaStr(report.deltas.docLift)}`);
    const cost = report.deltas.cost;
    if (cost !== undefined) {
        // Put the sign in front of the currency symbol; a negative delta used
        // to render as "$-0.0012".
        const sign = cost > 0 ? "+" : cost < 0 ? "-" : "";
        lines.push(` Cost: ${sign}$${Math.abs(cost).toFixed(4)}`);
    }
    lines.push("");
    // Per-area table
    lines.push("-".repeat(80));
    lines.push("PER-AREA BREAKDOWN");
    lines.push("-".repeat(80));
    lines.push("");
    const h = "| Feature Area | Baseline | Experiment | Delta | Task | Code | Docs |";
    const sep = "|---------------------|----------|------------|-------|------|------|------|";
    lines.push(h);
    lines.push(sep);
    for (const a of report.areas) {
        const icon = changeIcon(a.change);
        lines.push(`| ${icon} ${a.area.padEnd(17)} | ${String(a.baseline).padStart(8)} | ${String(a.experiment).padStart(10)} | ${deltaStr(a.delta).padStart(5)} | ${deltaStr(a.dimensions.taskCompletion.delta).padStart(4)} | ${deltaStr(a.dimensions.codeCorrectness.delta).padStart(4)} | ${deltaStr(a.dimensions.docCoverage.delta).padStart(4)} |`);
    }
    lines.push("");
    // Classification summary
    if (report.improved.length > 0) {
        lines.push(` 📈 Improved: ${report.improved.join(", ")}`);
    }
    if (report.regressed.length > 0) {
        lines.push(` 📉 Regressed: ${report.regressed.join(", ")}`);
    }
    if (report.unchanged.length > 0) {
        lines.push(` ➡️ Unchanged: ${report.unchanged.join(", ")}`);
    }
    lines.push("");
    // Mismatched areas
    const { onlyInBaseline, onlyInExperiment } = report.mismatched;
    if (onlyInBaseline.length > 0 || onlyInExperiment.length > 0) {
        lines.push(" ⚠️ Area mismatches:");
        if (onlyInBaseline.length > 0) {
            lines.push(` Only in baseline: ${onlyInBaseline.join(", ")}`);
        }
        if (onlyInExperiment.length > 0) {
            lines.push(` Only in experiment: ${onlyInExperiment.join(", ")}`);
        }
        lines.push("");
    }
    const isEmpirical = "noiseThresholdEmpirical" in report &&
        report.noiseThresholdEmpirical === true;
    const thresholdSource = isEmpirical
        ? "empirical, from grader consistency data"
        : "default";
    lines.push(` Noise threshold: ±${report.noiseThreshold}${Number.isInteger(report.noiseThreshold) ? "" : ` (${report.noiseThreshold.toFixed(1)})`} (${thresholdSource})`);
    lines.push("");
    // Ceiling decomposition deltas (when areas have ceiling data)
    const hasCeilingData = report.areas.some((a) => a.ceilingDelta !== undefined);
    if (hasCeilingData) {
        // An area without a value gets a dash: passing undefined to deltaStr
        // would print "0", which reads as "measured, no change".
        const optionalDelta = (value) => (value === undefined ? "—" : deltaStr(value));
        lines.push("-".repeat(80));
        lines.push("CEILING DECOMPOSITION DELTAS");
        lines.push("-".repeat(80));
        lines.push("");
        const cH = "| Feature Area | Ceiling Δ | Floor Δ | Doc Lift Δ |";
        const cSep = "|---------------------|-----------|---------|------------|";
        lines.push(cH);
        lines.push(cSep);
        for (const a of report.areas) {
            lines.push(`| ${a.area.padEnd(19)} | ` +
                `${optionalDelta(a.ceilingDelta).padStart(9)} | ` +
                `${optionalDelta(a.floorDelta).padStart(7)} | ` +
                `${optionalDelta(a.docLiftDelta).padStart(10)} |`);
        }
        lines.push("");
    }
    return lines.join("\n");
}
223
- // ---------------------------------------------------------------------------
224
- // Formatting
225
- // ---------------------------------------------------------------------------
226
/**
 * Pick the trend emoji for a change classification.
 *
 * @param {string} change - "improved", "regressed", or any other value.
 * @returns {string} Rising chart for "improved", falling chart for
 *   "regressed", right arrow for everything else.
 */
function changeIcon(change) {
    if (change === "improved") {
        return "📈";
    }
    if (change === "regressed") {
        return "📉";
    }
    return "➡️";
}
236
/**
 * Format a score delta as a signed whole number.
 *
 * The value is rounded first and the sign is taken from the rounded result,
 * so a delta that rounds to zero always prints as "0". (Choosing the sign
 * from the raw value printed 0.4 as "+0" while -0.4 printed as "0".)
 *
 * @param {number} d - Raw delta; anything that does not round to a positive
 *   or negative number (including undefined and NaN) yields "0".
 * @returns {string} "+N" for positive, "-N" for negative, "0" otherwise.
 */
function deltaStr(d) {
    const rounded = Math.round(d);
    if (rounded > 0) {
        return `+${rounded}`;
    }
    if (rounded < 0) {
        return `${rounded}`;
    }
    // Covers 0, -0 and NaN.
    return "0";
}
243
/**
 * Locate the baseline file whose name sorts last in BASELINES_DIR.
 *
 * @returns {string | null} Path to the `.json` file with the greatest name in
 *   default sort order, or null when the directory is missing or holds no
 *   `.json` files.
 */
function findLatestBaseline() {
    if (!existsSync(BASELINES_DIR)) {
        return null;
    }
    const jsonFiles = readdirSync(BASELINES_DIR).filter((name) => name.endsWith(".json"));
    if (jsonFiles.length === 0) {
        return null;
    }
    // Ascending sort, then take the tail: same pick as sort-reverse-first.
    jsonFiles.sort();
    return join(BASELINES_DIR, jsonFiles[jsonFiles.length - 1]);
}
252
/**
 * Read a JSON score summary from disk.
 *
 * Prints an error and terminates the process with exit code 1 when the file
 * does not exist.
 *
 * @param {string} path - Location of the JSON file.
 * @returns {*} The parsed JSON content.
 */
function loadSummary(path) {
    const missing = !existsSync(path);
    if (missing) {
        console.error(`❌ File not found: ${path}`);
        process.exit(1);
    }
    return JSON.parse(readFileSync(path, "utf-8"));
}
260
- // ---------------------------------------------------------------------------
261
- // Main
262
- // ---------------------------------------------------------------------------
263
/**
 * CLI entry point: load the experiment and baseline summaries, compare them,
 * print the result, and persist the report.
 *
 * The experiment defaults to SCORE_SUMMARY_PATH and the baseline to the
 * latest file found by findLatestBaseline(). When no `--threshold` was given
 * and results/latest/grader-consistency.json provides `recommendedThreshold`
 * and `perDimension`, that data is passed to compare() as `graderConsistency`.
 * Exits with code 1 when no baseline can be found.
 */
function main() {
    // Resolve experiment path
    const expPath = experimentPath ?? SCORE_SUMMARY_PATH;
    const experiment = loadSummary(expPath);
    // Resolve baseline path
    let basePath;
    if (baselinePath) {
        basePath = resolve(baselinePath);
    }
    else {
        const latest = findLatestBaseline();
        if (!latest) {
            console.error("❌ No baselines found. Run 'pnpm baseline:save' first, or use --baseline <path>.");
            process.exit(1);
        }
        basePath = latest;
    }
    const baseline = loadSummary(basePath);
    // Try to load grader consistency data for empirical thresholds
    const latestDir = join(ROOT, "results", "latest");
    const consistencyPath = join(latestDir, "grader-consistency.json");
    let graderConsistency;
    if (existsSync(consistencyPath) && !thresholdStr) {
        try {
            const consistencyRaw = JSON.parse(readFileSync(consistencyPath, "utf-8"));
            if (consistencyRaw.recommendedThreshold && consistencyRaw.perDimension) {
                graderConsistency = consistencyRaw;
                console.log(` 📊 Using empirical noise threshold: ±${graderConsistency.recommendedThreshold.toFixed(1)} (from grader consistency data)`);
            }
        }
        catch {
            // Non-fatal — fall back to default threshold
        }
    }
    console.log(` Baseline: ${basePath}`);
    console.log(` Experiment: ${expPath}`);
    if (!graderConsistency) {
        // Do not call a user-supplied --threshold the "default".
        const thresholdOrigin = thresholdStr
            ? "from --threshold"
            : "default — run --grader-replications for empirical threshold";
        console.log(` Threshold: ±${threshold} (${thresholdOrigin})`);
    }
    console.log("");
    const report = compare(baseline, experiment, {
        graderConsistency,
        noiseThreshold: threshold,
    });
    // Serialise once; every output path below writes the same JSON.
    const json = JSON.stringify(report, null, 2);
    if (format === "json") {
        if (outputPath) {
            writeFileSync(outputPath, json);
            console.log(` ✅ Comparison report written to ${outputPath}`);
        }
        else {
            console.log(json);
        }
    }
    else {
        console.log(formatComparisonTable(report));
        if (outputPath) {
            writeFileSync(outputPath, json);
            console.log(` ✅ Comparison report also written to ${outputPath}`);
        }
    }
    // Write comparison report to results/latest for other steps to consume.
    // The report has already been delivered above, so a missing or unwritable
    // results/latest directory (e.g. when comparing two explicit files) is
    // reported as a warning instead of crashing the command.
    const latestComparisonPath = join(latestDir, "comparison-report.json");
    try {
        writeFileSync(latestComparisonPath, json);
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.warn(` ⚠️ Could not write ${latestComparisonPath}: ${reason}`);
    }
}
330
/**
 * Decide whether a script path names this module's own file.
 *
 * Matches only a final path segment of exactly `compare.ts` or `compare.js`.
 * A plain `endsWith("compare.js")` test also matches sibling scripts such as
 * `grader-compare.js`.
 *
 * @param {string | undefined} scriptPath - Usually `process.argv[1]`.
 * @returns {boolean} True when the path's last segment is compare.ts/.js.
 */
function isCompareEntryPoint(scriptPath) {
    // Accept both "/" and "\" as separators so Windows paths work too.
    return /(?:^|[\\/])compare\.[jt]s$/.test(scriptPath ?? "");
}
// Only run when invoked directly
if (isCompareEntryPoint(process.argv[1])) {
    main();
}
@@ -1,44 +0,0 @@
1
- /**
2
- * coverage-audit.ts
3
- *
4
- * CLI script that cross-references the product feature registry
5
- * (config/features.yaml) against actual task files (tasks/*.yaml)
6
- * to produce a documentation coverage audit.
7
- *
8
- * Phase 3c of the Scenario Matrix implementation.
9
- *
10
- * Usage:
11
- * pnpm coverage-audit # console report
12
- * pnpm coverage-audit --format md # markdown output
13
- * pnpm coverage-audit --json # JSON output
14
- *
15
- * @see docs/exec-plans/completed/scenario-matrix-implementation/phase-3-gap-analysis.md
16
- */
17
- import type { CoverageAuditReport, ProductFeature } from "../pipeline/types.js";
18
- /**
19
- * Count unique document slugs referenced across all tasks.
20
- */
21
- export declare function countReferencedDocs(rootDir: string): {
22
- slugs: string[];
23
- total: number;
24
- };
25
- /**
26
- * Count actual tasks per area from task YAML files.
27
- */
28
- export declare function countTasksByArea(rootDir: string): Record<string, number>;
29
- /**
30
- * Format coverage audit for console output.
31
- */
32
- export declare function formatCoverageConsole(report: CoverageAuditReport): string;
33
- /**
34
- * Format coverage audit as markdown.
35
- */
36
- export declare function formatCoverageMarkdown(report: CoverageAuditReport): string;
37
- /**
38
- * Load and validate the feature registry from config/features.yaml.
39
- */
40
- export declare function loadFeatureRegistry(rootDir: string): null | ProductFeature[];
41
- /**
42
- * Run the coverage audit and produce a structured report.
43
- */
44
- export declare function runCoverageAudit(rootDir: string): CoverageAuditReport | null;
@@ -1,209 +0,0 @@
1
- /**
2
- * coverage-audit.ts
3
- *
4
- * CLI script that cross-references the product feature registry
5
- * (config/features.yaml) against actual task files (tasks/*.yaml)
6
- * to produce a documentation coverage audit.
7
- *
8
- * Phase 3c of the Scenario Matrix implementation.
9
- *
10
- * Usage:
11
- * pnpm coverage-audit # console report
12
- * pnpm coverage-audit --format md # markdown output
13
- * pnpm coverage-audit --json # JSON output
14
- *
15
- * @see docs/exec-plans/completed/scenario-matrix-implementation/phase-3-gap-analysis.md
16
- */
17
- import { existsSync, readFileSync } from "fs";
18
- import { dirname, join, resolve } from "path";
19
- import { fileURLToPath } from "url";
20
- import { load } from "js-yaml";
21
- import { FeatureRegistrySchema } from "../pipeline/schemas.js";
22
- import { resolveMappings } from "../pipeline/resolve-mappings.js";
23
- const __dirname = dirname(fileURLToPath(import.meta.url));
24
- const ROOT = resolve(__dirname, "..", "..");
25
- // ---------------------------------------------------------------------------
26
- // Core logic (exported for testing)
27
- // ---------------------------------------------------------------------------
28
/**
 * Collect the distinct document slugs referenced by any task.
 *
 * Walks every feature area returned by resolveMappings(rootDir), every task
 * in it, and every entry of the task's `canonical_docs`.
 *
 * @param {string} rootDir - Directory handed to resolveMappings().
 * @returns {{ slugs: string[], total: number }} Sorted unique slugs and
 *   their count.
 */
export function countReferencedDocs(rootDir) {
    const { feature_areas: featureAreas } = resolveMappings(rootDir);
    const everySlug = Object.values(featureAreas).flatMap((area) => [...area.tasks].flatMap((task) => [...task.canonical_docs].map((doc) => doc.slug)));
    // A Set drops duplicates; default sort gives a stable listing.
    const slugs = Array.from(new Set(everySlug)).sort();
    return { slugs, total: slugs.length };
}
44
/**
 * Tally how many tasks each feature area contains.
 *
 * @param {string} rootDir - Directory handed to resolveMappings().
 * @returns {Record<string, number>} Area name mapped to the length of that
 *   area's `tasks` list.
 */
export function countTasksByArea(rootDir) {
    const { feature_areas: featureAreas } = resolveMappings(rootDir);
    return Object.fromEntries(Object.entries(featureAreas).map(([areaName, areaConfig]) => [areaName, areaConfig.tasks.length]));
}
55
/**
 * Render a coverage audit report as plain text for the terminal.
 *
 * Prints a banner, the coverage ratio, then one line per covered feature
 * (id, task count, priority, sections) and one line per uncovered feature
 * (id, priority, sections). A list that is empty is omitted altogether.
 *
 * @param {object} report - Audit report. Reads `covered`, `uncovered`,
 *   `totalFeatures` and `coveragePercent`.
 * @returns {string} Report text, lines joined with "\n".
 */
export function formatCoverageConsole(report) {
    const rule = "═══════════════════════════════════════════════════════════════";
    const out = [
        rule,
        " DOCUMENTATION COVERAGE AUDIT",
        rule,
        "",
        `Coverage: ${report.covered.length}/${report.totalFeatures} features (${report.coveragePercent}%)`,
        "",
    ];
    // Covered features
    if (report.covered.length > 0) {
        const coveredRows = report.covered.map((feature) => {
            const taskLabel = feature.actualTaskCount === 1 ? "1 task" : `${feature.actualTaskCount} tasks`;
            return ` ✅ ${feature.id.padEnd(20)} ${taskLabel.padEnd(10)} ${feature.priority.padEnd(10)} ${feature.sections.join(", ")}`;
        });
        out.push("COVERED FEATURES:", ...coveredRows, "");
    }
    // Uncovered features
    if (report.uncovered.length > 0) {
        const uncoveredRows = report.uncovered.map((feature) => ` ❌ ${feature.id.padEnd(20)} ${feature.priority.padEnd(10)} ${feature.sections.join(", ")}`);
        out.push("UNCOVERED FEATURES (by priority):", ...uncoveredRows, "");
    }
    return out.join("\n");
}
87
/**
 * Render a coverage audit report as a markdown section.
 *
 * Emits a heading, the bold coverage ratio, and then a table per non-empty
 * list: covered features (name, task count, priority, sections) and
 * uncovered features (name, priority, sections).
 *
 * @param {object} report - Audit report. Reads `covered`, `uncovered`,
 *   `totalFeatures` and `coveragePercent`.
 * @returns {string} Markdown text, lines joined with "\n".
 */
export function formatCoverageMarkdown(report) {
    const out = [
        "### 📊 Documentation Coverage Audit",
        "",
        `**Coverage: ${report.covered.length}/${report.totalFeatures} features (${report.coveragePercent}%)**`,
        "",
    ];
    if (report.covered.length > 0) {
        const coveredRows = report.covered.map((feature) => `| ✅ ${feature.name} | ${feature.actualTaskCount} | ${feature.priority} | ${feature.sections.join(", ")} |`);
        out.push("#### Covered Features", "", "| Feature | Tasks | Priority | Sections |", "|---------|-------|----------|----------|", ...coveredRows, "");
    }
    if (report.uncovered.length > 0) {
        const uncoveredRows = report.uncovered.map((feature) => `| ❌ ${feature.name} | ${feature.priority} | ${feature.sections.join(", ")} |`);
        out.push("#### Uncovered Features", "", "| Feature | Priority | Sections |", "|---------|----------|----------|", ...uncoveredRows, "");
    }
    return out.join("\n");
}
118
- // ---------------------------------------------------------------------------
119
- // Registry loading and audit
120
- // ---------------------------------------------------------------------------
121
/**
 * Load and validate the feature registry from config/features.yaml.
 *
 * Every "registry unusable" outcome returns null: the file is missing, the
 * YAML cannot be parsed, or the parsed value fails FeatureRegistrySchema.
 * The last two also print a diagnostic to stderr.
 *
 * @param {string} rootDir - Directory that contains `config/features.yaml`.
 * @returns {null | object[]} The validated `features` list, or null.
 */
export function loadFeatureRegistry(rootDir) {
    const filePath = join(rootDir, "config", "features.yaml");
    if (!existsSync(filePath)) {
        return null;
    }
    const raw = readFileSync(filePath, "utf-8");
    let parsed;
    try {
        parsed = load(raw);
    }
    catch (err) {
        // A syntax error in the YAML previously escaped as an exception,
        // unlike schema failures below, which return null.
        const reason = err instanceof Error ? err.message : String(err);
        console.error("❌ config/features.yaml could not be parsed:");
        console.error(` ${reason}`);
        return null;
    }
    const result = FeatureRegistrySchema.safeParse(parsed);
    if (!result.success) {
        console.error("❌ config/features.yaml validation failed:");
        for (const issue of result.error.issues) {
            console.error(` ${issue.path.join(".")}: ${issue.message}`);
        }
        return null;
    }
    return result.data.features;
}
141
/**
 * Build the structured coverage audit report.
 *
 * Features with status "covered" and an `area` go into `covered`, annotated
 * with the task count for that area (0 when the area has no entry). Features
 * with status "uncovered" or "planned" go into `uncovered`, ordered critical,
 * high, medium, low. Any other feature still counts toward `totalFeatures`
 * but appears in neither list.
 *
 * @param {string} rootDir - Passed to loadFeatureRegistry() and
 *   countTasksByArea().
 * @returns {object | null} The report, or null when the registry could not
 *   be loaded.
 */
export function runCoverageAudit(rootDir) {
    const registry = loadFeatureRegistry(rootDir);
    if (!registry) {
        return null;
    }
    const tasksPerArea = countTasksByArea(rootDir);
    const totalFeatures = registry.length;
    const covered = registry
        .filter((feature) => feature.status === "covered" && feature.area)
        .map((feature) => ({ ...feature, actualTaskCount: tasksPerArea[feature.area] ?? 0 }));
    // Sort uncovered by priority
    const priorityRank = { critical: 0, high: 1, medium: 2, low: 3 };
    const uncovered = registry
        .filter((feature) => feature.status === "uncovered" || feature.status === "planned")
        .sort((left, right) => priorityRank[left.priority] - priorityRank[right.priority]);
    const rawPercent = totalFeatures > 0 ? (covered.length / totalFeatures) * 100 : 0;
    return {
        // Keep one decimal place.
        coveragePercent: Math.round(rawPercent * 10) / 10,
        covered,
        generatedAt: new Date().toISOString(),
        totalFeatures,
        uncovered,
    };
}
174
- // ---------------------------------------------------------------------------
175
- // CLI entry point
176
- // ---------------------------------------------------------------------------
177
/**
 * CLI entry point for the coverage audit.
 *
 * Output selection: `--json` prints the report as JSON; `--format md` or
 * `--format markdown` prints markdown; otherwise the console report is
 * printed, followed by document utilization stats. Exits with code 1 when
 * runCoverageAudit() returns null.
 */
function main() {
    const args = process.argv.slice(2);
    const formatIndex = args.indexOf("--format");
    const formatArg = formatIndex !== -1 ? args[formatIndex + 1] : undefined;
    const jsonOutput = args.includes("--json");
    const markdownOutput = formatArg === "md" || formatArg === "markdown";
    const report = runCoverageAudit(ROOT);
    if (!report) {
        console.error("❌ Coverage audit failed. Ensure config/features.yaml exists and is valid.");
        process.exit(1);
    }
    if (jsonOutput) {
        console.log(JSON.stringify(report, null, 2));
    }
    else if (markdownOutput) {
        console.log(formatCoverageMarkdown(report));
    }
    else {
        console.log(formatCoverageConsole(report));
    }
    // Also print document utilization stats — console format only. The old
    // check compared against "md" alone, so `--format markdown` got these
    // plain-text lines appended to its markdown output.
    if (!jsonOutput && !markdownOutput) {
        const docStats = countReferencedDocs(ROOT);
        console.log("DOCUMENT UTILIZATION:");
        console.log(` ${docStats.total} unique document slugs referenced across evaluation tasks`);
        console.log("");
    }
}
205
// Run main() only when this file is the script being executed: the path in
// process.argv[1] must end with the source or the compiled file name.
if (["coverage-audit.ts", "coverage-audit.js"].some((suffix) => process.argv[1]?.endsWith(suffix))) {
    main();
}
@@ -1,19 +0,0 @@
1
- /**
2
- * debug-eval.ts
3
- *
4
- * Thin wrapper around `promptfoo eval` that reads DEBUG_EVAL_* environment
5
- * variables and forwards them as native promptfoo filter flags.
6
- *
7
- * Environment variables:
8
- * DEBUG_EVAL=1 — required to enable debug mode
9
- * DEBUG_EVAL_PATTERN=<re> — --filter-pattern (regex on test description)
10
- * DEBUG_EVAL_N=<number> — --filter-first-n (default: 2 when no other filters set)
11
- * DEBUG_EVAL_SAMPLE=<number> — --filter-sample (random N tests)
12
- *
13
- * Usage:
14
- * tsx src/scripts/debug-eval.ts --config promptfooconfig.yaml
15
- * tsx src/scripts/debug-eval.ts --config promptfooconfig.agentic.yaml --no-cache
16
- *
17
- * All extra argv are forwarded to promptfoo eval unchanged.
18
- */
19
- export {};