@sanity/ailf 2.0.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. package/LICENSE +21 -0
  2. package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
  3. package/dist/_vendor/ailf-core/examples/index.js +66 -1
  4. package/dist/agent-harness/assertions-runtime.d.ts +49 -0
  5. package/dist/agent-harness/assertions-runtime.js +138 -0
  6. package/dist/agent-harness/provider.d.ts +58 -0
  7. package/dist/agent-harness/provider.js +104 -0
  8. package/dist/cli.js +0 -0
  9. package/dist/commands/init.js +3 -0
  10. package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
  11. package/dist/orchestration/steps/generate-configs-step.js +35 -2
  12. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +39 -25
  13. package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
  14. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +9 -0
  15. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +28 -85
  16. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
  17. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
  18. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
  19. package/package.json +25 -24
  20. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  21. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  22. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  23. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  24. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  25. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  26. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  27. package/dist/_vendor/ailf-tasks/index.js +0 -16
  28. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  29. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  30. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  31. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  32. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  33. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  34. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  35. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  36. package/dist/agent-observer/test-imports.d.ts +0 -7
  37. package/dist/agent-observer/test-imports.js +0 -185
  38. package/dist/commands/update-quality-scores.d.ts +0 -5
  39. package/dist/commands/update-quality-scores.js +0 -20
  40. package/dist/lib/agent-behavior-report.d.ts +0 -8
  41. package/dist/lib/agent-behavior-report.js +0 -185
  42. package/dist/lib/baseline.d.ts +0 -19
  43. package/dist/lib/baseline.js +0 -153
  44. package/dist/lib/calculate-scores.d.ts +0 -23
  45. package/dist/lib/calculate-scores.js +0 -42
  46. package/dist/lib/compare.d.ts +0 -18
  47. package/dist/lib/compare.js +0 -170
  48. package/dist/lib/coverage-audit.d.ts +0 -4
  49. package/dist/lib/coverage-audit.js +0 -42
  50. package/dist/lib/discovery-report.d.ts +0 -13
  51. package/dist/lib/discovery-report.js +0 -57
  52. package/dist/lib/fetch-docs.d.ts +0 -30
  53. package/dist/lib/fetch-docs.js +0 -171
  54. package/dist/lib/generate-configs.d.ts +0 -25
  55. package/dist/lib/generate-configs.js +0 -42
  56. package/dist/lib/grader-api.d.ts +0 -21
  57. package/dist/lib/grader-api.js +0 -34
  58. package/dist/lib/grader-compare.d.ts +0 -19
  59. package/dist/lib/grader-compare.js +0 -91
  60. package/dist/lib/grader-consistency.d.ts +0 -27
  61. package/dist/lib/grader-consistency.js +0 -79
  62. package/dist/lib/grader-sensitivity.d.ts +0 -19
  63. package/dist/lib/grader-sensitivity.js +0 -75
  64. package/dist/lib/grader-validate.d.ts +0 -19
  65. package/dist/lib/grader-validate.js +0 -78
  66. package/dist/lib/measure-retrieval.d.ts +0 -14
  67. package/dist/lib/measure-retrieval.js +0 -71
  68. package/dist/lib/pr-comment.d.ts +0 -16
  69. package/dist/lib/pr-comment.js +0 -28
  70. package/dist/lib/readiness-report.d.ts +0 -13
  71. package/dist/lib/readiness-report.js +0 -108
  72. package/dist/lib/webhook-server.d.ts +0 -11
  73. package/dist/lib/webhook-server.js +0 -24
  74. package/dist/lib/weekly-digest.d.ts +0 -24
  75. package/dist/lib/weekly-digest.js +0 -148
  76. package/dist/orchestration/env-bridge.d.ts +0 -21
  77. package/dist/orchestration/env-bridge.js +0 -66
  78. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  79. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  80. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  81. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  82. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  83. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  86. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  87. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  88. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  89. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  90. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  91. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  92. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  93. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  94. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  95. package/dist/pipeline/compiler/task-bridge.js +0 -92
  96. package/dist/pipeline/expand-tasks.d.ts +0 -232
  97. package/dist/pipeline/expand-tasks.js +0 -467
  98. package/dist/pipeline/generate-configs.d.ts +0 -92
  99. package/dist/pipeline/generate-configs.js +0 -445
  100. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  101. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  102. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  103. package/dist/pipeline/steps/compare-step.js +0 -90
  104. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  105. package/dist/pipeline/steps/eval-step.js +0 -347
  106. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  107. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  108. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  109. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  110. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  111. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  112. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  113. package/dist/pipeline/steps/publish-report-step.js +0 -243
  114. package/dist/pipeline/steps/report-step.d.ts +0 -13
  115. package/dist/pipeline/steps/report-step.js +0 -56
  116. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  117. package/dist/pipeline/steps/update-scores-step.js +0 -42
  118. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  119. package/dist/scripts/agent-behavior-report.js +0 -315
  120. package/dist/scripts/baseline.d.ts +0 -43
  121. package/dist/scripts/baseline.js +0 -267
  122. package/dist/scripts/calculate-scores.d.ts +0 -166
  123. package/dist/scripts/calculate-scores.js +0 -1296
  124. package/dist/scripts/compare.d.ts +0 -22
  125. package/dist/scripts/compare.js +0 -334
  126. package/dist/scripts/coverage-audit.d.ts +0 -44
  127. package/dist/scripts/coverage-audit.js +0 -209
  128. package/dist/scripts/debug-eval.d.ts +0 -19
  129. package/dist/scripts/debug-eval.js +0 -73
  130. package/dist/scripts/discovery-report.d.ts +0 -58
  131. package/dist/scripts/discovery-report.js +0 -250
  132. package/dist/scripts/fetch-docs.d.ts +0 -35
  133. package/dist/scripts/fetch-docs.js +0 -472
  134. package/dist/scripts/generate-configs.d.ts +0 -66
  135. package/dist/scripts/generate-configs.js +0 -459
  136. package/dist/scripts/grader-api.d.ts +0 -27
  137. package/dist/scripts/grader-api.js +0 -206
  138. package/dist/scripts/grader-compare.d.ts +0 -22
  139. package/dist/scripts/grader-compare.js +0 -368
  140. package/dist/scripts/grader-consistency.d.ts +0 -20
  141. package/dist/scripts/grader-consistency.js +0 -313
  142. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  143. package/dist/scripts/grader-sensitivity.js +0 -354
  144. package/dist/scripts/grader-validate.d.ts +0 -19
  145. package/dist/scripts/grader-validate.js +0 -267
  146. package/dist/scripts/measure-retrieval.d.ts +0 -10
  147. package/dist/scripts/measure-retrieval.js +0 -145
  148. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  149. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  150. package/dist/scripts/pipeline.d.ts +0 -76
  151. package/dist/scripts/pipeline.js +0 -1031
  152. package/dist/scripts/pr-comment.d.ts +0 -10
  153. package/dist/scripts/pr-comment.js +0 -510
  154. package/dist/scripts/readiness-report.d.ts +0 -88
  155. package/dist/scripts/readiness-report.js +0 -342
  156. package/dist/scripts/update-quality-scores.d.ts +0 -15
  157. package/dist/scripts/update-quality-scores.js +0 -184
  158. package/dist/scripts/validate-task-sources.d.ts +0 -21
  159. package/dist/scripts/validate-task-sources.js +0 -210
  160. package/dist/scripts/validate.d.ts +0 -13
  161. package/dist/scripts/validate.js +0 -79
  162. package/dist/scripts/webhook-server.d.ts +0 -26
  163. package/dist/scripts/webhook-server.js +0 -147
  164. package/dist/scripts/weekly-digest.d.ts +0 -24
  165. package/dist/scripts/weekly-digest.js +0 -144
  166. package/dist/sinks/format-slack.d.ts +0 -64
  167. package/dist/sinks/format-slack.js +0 -306
  168. package/dist/sinks/slack-sink.d.ts +0 -27
  169. package/dist/sinks/slack-sink.js +0 -78
  170. package/dist/sinks/webhook-sink.d.ts +0 -19
  171. package/dist/sinks/webhook-sink.js +0 -50
  172. package/tasks/.expanded.agentic.yaml +0 -280
  173. package/tasks/.expanded.yaml +0 -565
@@ -1,22 +0,0 @@
1
- /**
2
- * compare.ts
3
- *
4
- * CLI for structured comparison between two evaluation runs.
5
- *
6
- * Usage:
7
- * pnpm compare # compare current vs latest baseline
8
- * pnpm compare --baseline <path> # compare current vs specific file
9
- * pnpm compare --baseline <path> --experiment <path> # compare two specific files
10
- * pnpm compare --threshold 5 # custom noise threshold
11
- * pnpm compare --output /tmp/comparison.json # write JSON report to file
12
- * pnpm compare --format json # output raw JSON (default: table)
13
- *
14
- * Reads: results/latest/score-summary.json (as experiment, unless --experiment)
15
- * Reads: results/baselines/<latest>.json (as baseline, unless --baseline)
16
- */
17
- import { type ComparisonReport } from "../pipeline/types.js";
18
- /**
19
- * Generate a markdown comparison section suitable for PR comments.
20
- */
21
- export declare function formatComparisonMarkdown(report: ComparisonReport): string;
22
- export declare function formatComparisonTable(report: ComparisonReport): string;
@@ -1,334 +0,0 @@
1
- /**
2
- * compare.ts
3
- *
4
- * CLI for structured comparison between two evaluation runs.
5
- *
6
- * Usage:
7
- * pnpm compare # compare current vs latest baseline
8
- * pnpm compare --baseline <path> # compare current vs specific file
9
- * pnpm compare --baseline <path> --experiment <path> # compare two specific files
10
- * pnpm compare --threshold 5 # custom noise threshold
11
- * pnpm compare --output /tmp/comparison.json # write JSON report to file
12
- * pnpm compare --format json # output raw JSON (default: table)
13
- *
14
- * Reads: results/latest/score-summary.json (as experiment, unless --experiment)
15
- * Reads: results/baselines/<latest>.json (as baseline, unless --baseline)
16
- */
17
- import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
18
- import { dirname, join, resolve } from "path";
19
- import { fileURLToPath } from "url";
20
- import { compare } from "../pipeline/compare.js";
21
- import { DEFAULT_NOISE_THRESHOLD, } from "../pipeline/types.js";
22
- const __dirname = dirname(fileURLToPath(import.meta.url));
23
- const ROOT = resolve(__dirname, "..", "..");
24
- const BASELINES_DIR = join(ROOT, "results", "baselines");
25
- const SCORE_SUMMARY_PATH = join(ROOT, "results", "latest", "score-summary.json");
26
- // ---------------------------------------------------------------------------
27
- // CLI argument parsing
28
- // ---------------------------------------------------------------------------
29
// Command-line tokens after the node binary and the script path.
const args = process.argv.slice(2);
/**
 * Report whether the switch `--<name>` appears anywhere on the command line.
 *
 * @param {string} name - Switch name without the leading dashes.
 * @returns {boolean} True when `--<name>` is one of the tokens.
 */
function getFlag(name) {
    const wanted = `--${name}`;
    return args.some((token) => token === wanted);
}
/**
 * Return the token that immediately follows `--<name>`.
 *
 * @param {string} name - Option name without the leading dashes.
 * @returns {string | undefined} The following token, or `undefined` when the
 *   option is absent or is the last token on the command line.
 */
function getOption(name) {
    const position = args.indexOf(`--${name}`);
    if (position === -1) {
        return undefined;
    }
    const valueIndex = position + 1;
    return valueIndex < args.length ? args[valueIndex] : undefined;
}
37
/**
 * Turn the raw `--threshold` text into a usable noise threshold.
 *
 * A value that does not parse to a finite, non-negative number would make
 * every threshold comparison meaningless (comparisons against NaN are always
 * false), so such input is reported and replaced by the fallback.
 *
 * @param {string | undefined} raw - Text supplied after `--threshold`, if any.
 * @param {number} fallback - Threshold used when `raw` is missing or invalid.
 * @returns {number} The parsed threshold, or `fallback`.
 */
function parseNoiseThreshold(raw, fallback) {
    if (!raw) {
        return fallback;
    }
    const parsed = Number.parseFloat(raw);
    if (!Number.isFinite(parsed) || parsed < 0) {
        console.warn(`⚠️ Ignoring invalid --threshold value "${raw}"; using ${fallback} instead.`);
        return fallback;
    }
    return parsed;
}
const baselinePath = getOption("baseline");
const experimentPath = getOption("experiment");
const thresholdStr = getOption("threshold");
const threshold = parseNoiseThreshold(thresholdStr, DEFAULT_NOISE_THRESHOLD);
const outputPath = getOption("output");
const format = getOption("format") ?? "table";
// getFlag() only recognises `--<name>`, so the documented short form `-h` has
// to be looked up literally. `--h` stays accepted for backward compatibility.
const showHelp = getFlag("help") || getFlag("h") || process.argv.slice(2).includes("-h");
if (showHelp) {
    console.log(`
Usage: pnpm compare [options]

Compare two evaluation score summaries and produce structured deltas.

Options:
--baseline <path> Baseline score-summary.json (default: latest baseline)
--experiment <path> Experiment score-summary.json (default: results/latest/score-summary.json)
--threshold <n> Noise threshold for unchanged classification (default: ${DEFAULT_NOISE_THRESHOLD})
--output <path> Write JSON report to file
--format <fmt> Output format: table (default) or json
--help, -h Show this help

Examples:
pnpm compare # current scores vs latest baseline
pnpm compare --threshold 5 # wider noise band
pnpm compare --format json # machine-readable output
pnpm compare --baseline results/baselines/20260310_02_43_44.json
pnpm compare --baseline before.json --experiment after.json
`);
    process.exit(0);
}
69
- // ---------------------------------------------------------------------------
70
- // File loading helpers
71
- // ---------------------------------------------------------------------------
72
/**
 * Generate a markdown comparison section suitable for PR comments.
 *
 * The section contains an overall headline, one table row per entry of
 * `report.areas`, a one-line improved/regressed/unchanged tally, and a
 * collapsible table of dimension deltas (plus cost when `report.deltas.cost`
 * is defined).
 *
 * @param report - Comparison report to render.
 * @returns {string} Markdown text, lines joined with "\n".
 */
export function formatComparisonMarkdown(report) {
    const lines = [];
    const overall = report.deltas.overall;
    // The headline icon is derived from the overall delta and the noise band.
    let overallChange = "unchanged";
    if (overall > report.noiseThreshold) {
        overallChange = "improved";
    }
    else if (overall < -report.noiseThreshold) {
        overallChange = "regressed";
    }
    const overallIcon = changeIcon(overallChange);
    lines.push("### 📊 Score Comparison");
    lines.push("");
    lines.push(`**Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)}** (${overallIcon} ${deltaStr(overall)})`);
    lines.push("");
    // Per-area table
    lines.push("| Feature | Baseline | Current | Delta | Task | Code | Docs |");
    lines.push("|---------|----------|---------|-------|------|------|------|");
    for (const a of report.areas) {
        const icon = changeIcon(a.change);
        lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${icon} ${deltaStr(a.delta)} | ${deltaStr(a.dimensions.taskCompletion.delta)} | ${deltaStr(a.dimensions.codeCorrectness.delta)} | ${deltaStr(a.dimensions.docCoverage.delta)} |`);
    }
    lines.push("");
    // Summary tally; empty categories are left out.
    const parts = [];
    if (report.improved.length > 0) {
        parts.push(`📈 ${report.improved.length} improved`);
    }
    if (report.regressed.length > 0) {
        parts.push(`📉 ${report.regressed.length} regressed`);
    }
    if (report.unchanged.length > 0) {
        parts.push(`➡️ ${report.unchanged.length} unchanged`);
    }
    if (parts.length > 0) {
        lines.push(parts.join(" · "));
        lines.push("");
    }
    // Dimension averages in collapsible
    lines.push("<details>");
    lines.push("<summary>Dimension averages</summary>");
    lines.push("");
    const dim = report.deltas.perDimension;
    lines.push("| Dimension | Delta |");
    lines.push("|-----------|-------|");
    lines.push(`| Task Completion | ${deltaStr(dim.taskCompletion)} |`);
    lines.push(`| Code Correctness | ${deltaStr(dim.codeCorrectness)} |`);
    lines.push(`| Doc Coverage | ${deltaStr(dim.docCoverage)} |`);
    lines.push(`| Doc Lift | ${deltaStr(report.deltas.docLift)} |`);
    const cost = report.deltas.cost;
    if (cost !== undefined) {
        // A zero delta carries no sign; it used to be rendered as "-$0.0000".
        const sign = cost > 0 ? "+" : cost < 0 ? "-" : "";
        lines.push(`| Cost | ${sign}$${Math.abs(cost).toFixed(4)} |`);
    }
    lines.push("");
    lines.push("</details>");
    lines.push("");
    return lines.join("\n");
}
132
/**
 * Render a comparison report as plain text for the console.
 *
 * Sections, in order: banner, overall headline, dimension deltas (plus cost
 * when `report.deltas.cost` is defined), per-area table, classification
 * lists, area mismatches, the noise threshold with its source, and - only
 * when at least one area has a `ceilingDelta` - the ceiling decomposition
 * table.
 *
 * @param report - Comparison report to render.
 * @returns {string} Text report, lines joined with "\n".
 */
export function formatComparisonTable(report) {
    const lines = [];
    lines.push("=".repeat(80));
    lines.push(" COMPARISON REPORT");
    lines.push("=".repeat(80));
    lines.push("");
    // Overall summary
    const overall = report.deltas.overall;
    let overallChange = "unchanged";
    if (overall > report.noiseThreshold) {
        overallChange = "improved";
    }
    else if (overall < -report.noiseThreshold) {
        overallChange = "regressed";
    }
    const overallIcon = changeIcon(overallChange);
    lines.push(` Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)} (${overallIcon} ${deltaStr(overall)})`);
    lines.push("");
    // Per-dimension averages
    const dim = report.deltas.perDimension;
    lines.push(" Dimension averages:");
    lines.push(` Task Completion: ${deltaStr(dim.taskCompletion)}`);
    lines.push(` Code Correctness: ${deltaStr(dim.codeCorrectness)}`);
    lines.push(` Doc Coverage: ${deltaStr(dim.docCoverage)}`);
    lines.push(` Doc Lift: ${deltaStr(report.deltas.docLift)}`);
    const cost = report.deltas.cost;
    if (cost !== undefined) {
        // Sign goes before the currency symbol ("-$0.0100", not "$-0.0100"),
        // matching the markdown formatter.
        const sign = cost > 0 ? "+" : cost < 0 ? "-" : "";
        lines.push(` Cost: ${sign}$${Math.abs(cost).toFixed(4)}`);
    }
    lines.push("");
    // Per-area table
    lines.push("-".repeat(80));
    lines.push("PER-AREA BREAKDOWN");
    lines.push("-".repeat(80));
    lines.push("");
    const h = "| Feature Area | Baseline | Experiment | Delta | Task | Code | Docs |";
    const sep = "|---------------------|----------|------------|-------|------|------|------|";
    lines.push(h);
    lines.push(sep);
    for (const a of report.areas) {
        const icon = changeIcon(a.change);
        lines.push(`| ${icon} ${a.area.padEnd(17)} | ${String(a.baseline).padStart(8)} | ${String(a.experiment).padStart(10)} | ${deltaStr(a.delta).padStart(5)} | ${deltaStr(a.dimensions.taskCompletion.delta).padStart(4)} | ${deltaStr(a.dimensions.codeCorrectness.delta).padStart(4)} | ${deltaStr(a.dimensions.docCoverage.delta).padStart(4)} |`);
    }
    lines.push("");
    // Classification summary
    if (report.improved.length > 0) {
        lines.push(` 📈 Improved: ${report.improved.join(", ")}`);
    }
    if (report.regressed.length > 0) {
        lines.push(` 📉 Regressed: ${report.regressed.join(", ")}`);
    }
    if (report.unchanged.length > 0) {
        lines.push(` ➡️ Unchanged: ${report.unchanged.join(", ")}`);
    }
    lines.push("");
    // Mismatched areas
    const { onlyInBaseline, onlyInExperiment } = report.mismatched;
    if (onlyInBaseline.length > 0 || onlyInExperiment.length > 0) {
        lines.push(" ⚠️ Area mismatches:");
        if (onlyInBaseline.length > 0) {
            lines.push(` Only in baseline: ${onlyInBaseline.join(", ")}`);
        }
        if (onlyInExperiment.length > 0) {
            lines.push(` Only in experiment: ${onlyInExperiment.join(", ")}`);
        }
        lines.push("");
    }
    const isEmpirical = "noiseThresholdEmpirical" in report &&
        report.noiseThresholdEmpirical === true;
    const thresholdSource = isEmpirical
        ? "empirical, from grader consistency data"
        : "default";
    lines.push(` Noise threshold: ±${report.noiseThreshold}${Number.isInteger(report.noiseThreshold) ? "" : ` (${report.noiseThreshold.toFixed(1)})`} (${thresholdSource})`);
    lines.push("");
    // Ceiling decomposition deltas (when areas have ceiling data)
    const hasCeilingData = report.areas.some((a) => a.ceilingDelta !== undefined);
    if (hasCeilingData) {
        // An area without a value is shown as "n/a"; printing "0" would read
        // as "measured, no change".
        const cell = (value) => value === undefined || value === null ? "n/a" : deltaStr(value);
        lines.push("-".repeat(80));
        lines.push("CEILING DECOMPOSITION DELTAS");
        lines.push("-".repeat(80));
        lines.push("");
        const cH = "| Feature Area | Ceiling Δ | Floor Δ | Doc Lift Δ |";
        const cSep = "|---------------------|-----------|---------|------------|";
        lines.push(cH);
        lines.push(cSep);
        for (const a of report.areas) {
            lines.push(`| ${a.area.padEnd(19)} | ` +
                `${cell(a.ceilingDelta).padStart(9)} | ` +
                `${cell(a.floorDelta).padStart(7)} | ` +
                `${cell(a.docLiftDelta).padStart(10)} |`);
        }
        lines.push("");
    }
    return lines.join("\n");
}
223
- // ---------------------------------------------------------------------------
224
- // Formatting
225
- // ---------------------------------------------------------------------------
226
/**
 * Pick the emoji that represents a change classification.
 *
 * @param {string} change - "improved", "regressed", or anything else.
 * @returns {string} 📈 for "improved", 📉 for "regressed", ➡️ otherwise.
 */
function changeIcon(change) {
    if (change === "improved") {
        return "📈";
    }
    if (change === "regressed") {
        return "📉";
    }
    return "➡️";
}
236
/**
 * Format a numeric delta as a signed, rounded integer string.
 *
 * The sign is decided after rounding so that values which round to zero are
 * shown as "0" in both directions (previously 0.3 produced "+0" while -0.3
 * produced "0").
 *
 * @param {number} d - Delta to format.
 * @returns {string} "+N" for positive results, "-N" for negative results,
 *   and "0" when the rounded value is zero or `d` is not a number.
 */
function deltaStr(d) {
    const rounded = Math.round(d);
    if (rounded > 0) {
        return `+${rounded}`;
    }
    if (rounded < 0) {
        return `${rounded}`;
    }
    // Covers 0, -0 and NaN (e.g. when `d` is undefined).
    return "0";
}
243
/**
 * Locate the baseline file whose name sorts last in BASELINES_DIR.
 *
 * Only entries ending in ".json" are considered, and ordering is the default
 * string sort of the file names.
 *
 * @returns {string | null} Path of the last-sorting baseline, or null when
 *   the directory is missing or holds no ".json" entries.
 */
function findLatestBaseline() {
    if (!existsSync(BASELINES_DIR)) {
        return null;
    }
    const jsonNames = readdirSync(BASELINES_DIR).filter((name) => name.endsWith(".json"));
    if (jsonNames.length === 0) {
        return null;
    }
    jsonNames.sort();
    return join(BASELINES_DIR, jsonNames[jsonNames.length - 1]);
}
252
/**
 * Read a JSON file from disk and return the parsed value.
 *
 * When the file does not exist, an error is printed and the process exits
 * with status 1.
 *
 * @param {string} path - Location of the JSON file.
 * @returns The value produced by JSON.parse for the file contents.
 */
function loadSummary(path) {
    const missing = !existsSync(path);
    if (missing) {
        console.error(`❌ File not found: ${path}`);
        process.exit(1);
    }
    return JSON.parse(readFileSync(path, "utf-8"));
}
260
- // ---------------------------------------------------------------------------
261
- // Main
262
- // ---------------------------------------------------------------------------
263
/**
 * CLI entry point for the comparison script.
 *
 * Loads the experiment and baseline summaries, optionally picks up grader
 * consistency data from results/latest (only when no --threshold was given),
 * runs `compare`, prints the result in the requested format, optionally
 * writes it to --output, and always writes
 * results/latest/comparison-report.json.
 */
function main() {
    // Experiment: an explicit --experiment path wins over the default location.
    const expPath = experimentPath ?? SCORE_SUMMARY_PATH;
    const experiment = loadSummary(expPath);
    // Baseline: an explicit --baseline path (made absolute) or the last-sorting saved one.
    const basePath = baselinePath ? resolve(baselinePath) : findLatestBaseline();
    if (!basePath) {
        console.error("❌ No baselines found. Run 'pnpm baseline:save' first, or use --baseline <path>.");
        process.exit(1);
    }
    const baseline = loadSummary(basePath);
    const latestDir = join(ROOT, "results", "latest");
    // Grader consistency data is only consulted when the user did not pin a threshold.
    const consistencyPath = join(latestDir, "grader-consistency.json");
    let graderConsistency;
    const mayUseEmpirical = existsSync(consistencyPath) && !thresholdStr;
    if (mayUseEmpirical) {
        try {
            const candidate = JSON.parse(readFileSync(consistencyPath, "utf-8"));
            const usable = candidate.recommendedThreshold && candidate.perDimension;
            if (usable) {
                graderConsistency = candidate;
                console.log(` 📊 Using empirical noise threshold: ±${graderConsistency.recommendedThreshold.toFixed(1)} (from grader consistency data)`);
            }
        }
        catch {
            // Non-fatal — fall back to default threshold
        }
    }
    console.log(` Baseline: ${basePath}`);
    console.log(` Experiment: ${expPath}`);
    if (!graderConsistency) {
        console.log(` Threshold: ±${threshold} (default — run --grader-replications for empirical threshold)`);
    }
    console.log("");
    const report = compare(baseline, experiment, {
        graderConsistency,
        noiseThreshold: threshold,
    });
    const toJson = () => JSON.stringify(report, null, 2);
    const wantsJson = format === "json";
    if (wantsJson && outputPath) {
        writeFileSync(outputPath, toJson());
        console.log(` ✅ Comparison report written to ${outputPath}`);
    }
    else if (wantsJson) {
        console.log(toJson());
    }
    else {
        console.log(formatComparisonTable(report));
        if (outputPath) {
            writeFileSync(outputPath, toJson());
            console.log(` ✅ Comparison report also written to ${outputPath}`);
        }
    }
    // Write comparison report to results/latest for other steps to consume
    writeFileSync(join(latestDir, "comparison-report.json"), toJson());
}
330
/**
 * Decide whether a script path names this module's own file.
 *
 * The file name must be exactly "compare.ts" or "compare.js". A plain suffix
 * check would also accept other scripts whose names merely end the same way,
 * such as "grader-compare.js".
 *
 * @param {string | undefined} scriptPath - Typically `process.argv[1]`.
 * @returns {boolean} True when the final path segment is compare.ts/compare.js.
 */
function isCompareEntryPoint(scriptPath) {
    return typeof scriptPath === "string" &&
        /(?:^|[\\/])compare\.(?:ts|js)$/.test(scriptPath);
}
// Only run when invoked directly
if (isCompareEntryPoint(process.argv[1])) {
    main();
}
@@ -1,44 +0,0 @@
1
- /**
2
- * coverage-audit.ts
3
- *
4
- * CLI script that cross-references the product feature registry
5
- * (config/features.yaml) against actual task files (tasks/*.yaml)
6
- * to produce a documentation coverage audit.
7
- *
8
- * Phase 3c of the Scenario Matrix implementation.
9
- *
10
- * Usage:
11
- * pnpm coverage-audit # console report
12
- * pnpm coverage-audit --format md # markdown output
13
- * pnpm coverage-audit --json # JSON output
14
- *
15
- * @see docs/exec-plans/completed/scenario-matrix-implementation/phase-3-gap-analysis.md
16
- */
17
- import type { CoverageAuditReport, ProductFeature } from "../pipeline/types.js";
18
- /**
19
- * Count unique document slugs referenced across all tasks.
20
- */
21
- export declare function countReferencedDocs(rootDir: string): {
22
- slugs: string[];
23
- total: number;
24
- };
25
- /**
26
- * Count actual tasks per area from task YAML files.
27
- */
28
- export declare function countTasksByArea(rootDir: string): Record<string, number>;
29
- /**
30
- * Format coverage audit for console output.
31
- */
32
- export declare function formatCoverageConsole(report: CoverageAuditReport): string;
33
- /**
34
- * Format coverage audit as markdown.
35
- */
36
- export declare function formatCoverageMarkdown(report: CoverageAuditReport): string;
37
- /**
38
- * Load and validate the feature registry from config/features.yaml.
39
- */
40
- export declare function loadFeatureRegistry(rootDir: string): null | ProductFeature[];
41
- /**
42
- * Run the coverage audit and produce a structured report.
43
- */
44
- export declare function runCoverageAudit(rootDir: string): CoverageAuditReport | null;
@@ -1,209 +0,0 @@
1
- /**
2
- * coverage-audit.ts
3
- *
4
- * CLI script that cross-references the product feature registry
5
- * (config/features.yaml) against actual task files (tasks/*.yaml)
6
- * to produce a documentation coverage audit.
7
- *
8
- * Phase 3c of the Scenario Matrix implementation.
9
- *
10
- * Usage:
11
- * pnpm coverage-audit # console report
12
- * pnpm coverage-audit --format md # markdown output
13
- * pnpm coverage-audit --json # JSON output
14
- *
15
- * @see docs/exec-plans/completed/scenario-matrix-implementation/phase-3-gap-analysis.md
16
- */
17
- import { existsSync, readFileSync } from "fs";
18
- import { dirname, join, resolve } from "path";
19
- import { fileURLToPath } from "url";
20
- import { load } from "js-yaml";
21
- import { FeatureRegistrySchema } from "../pipeline/schemas.js";
22
- import { resolveMappings } from "../pipeline/resolve-mappings.js";
23
- const __dirname = dirname(fileURLToPath(import.meta.url));
24
- const ROOT = resolve(__dirname, "..", "..");
25
- // ---------------------------------------------------------------------------
26
- // Core logic (exported for testing)
27
- // ---------------------------------------------------------------------------
28
/**
 * Collect the distinct canonical-doc slugs referenced by every task of every
 * feature area returned by `resolveMappings(rootDir)`.
 *
 * @param {string} rootDir - Directory handed to `resolveMappings`.
 * @returns {{ slugs: string[], total: number }} The sorted distinct slugs and
 *   how many there are.
 */
export function countReferencedDocs(rootDir) {
    const { feature_areas: areas } = resolveMappings(rootDir);
    const unique = new Set();
    for (const { tasks } of Object.values(areas)) {
        for (const { canonical_docs: docs } of tasks) {
            for (const { slug } of docs) {
                unique.add(slug);
            }
        }
    }
    const slugs = Array.from(unique).sort();
    return { slugs, total: slugs.length };
}
44
/**
 * Report how many tasks each feature area has, according to
 * `resolveMappings(rootDir)`.
 *
 * @param {string} rootDir - Directory handed to `resolveMappings`.
 * @returns {Record<string, number>} Task count keyed by area name.
 */
export function countTasksByArea(rootDir) {
    const { feature_areas: areas } = resolveMappings(rootDir);
    const counts = {};
    Object.keys(areas).forEach((area) => {
        counts[area] = areas[area].tasks.length;
    });
    return counts;
}
55
/**
 * Render a coverage audit report as plain text for the console.
 *
 * Output is a banner, a coverage headline, then a "COVERED FEATURES" list
 * and an "UNCOVERED FEATURES" list; each list is omitted when empty.
 *
 * @param report - Coverage audit report to render.
 * @returns {string} Text report, lines joined with "\n".
 */
export function formatCoverageConsole(report) {
    const rule = "═══════════════════════════════════════════════════════════════";
    const out = [
        rule,
        " DOCUMENTATION COVERAGE AUDIT",
        rule,
        "",
        `Coverage: ${report.covered.length}/${report.totalFeatures} features (${report.coveragePercent}%)`,
        "",
    ];
    // Covered features
    if (report.covered.length > 0) {
        out.push("COVERED FEATURES:");
        report.covered.forEach((feature) => {
            const count = feature.actualTaskCount;
            const taskLabel = count === 1 ? "1 task" : `${count} tasks`;
            out.push(` ✅ ${feature.id.padEnd(20)} ${taskLabel.padEnd(10)} ${feature.priority.padEnd(10)} ${feature.sections.join(", ")}`);
        });
        out.push("");
    }
    // Uncovered features
    if (report.uncovered.length > 0) {
        out.push("UNCOVERED FEATURES (by priority):");
        report.uncovered.forEach((feature) => {
            out.push(` ❌ ${feature.id.padEnd(20)} ${feature.priority.padEnd(10)} ${feature.sections.join(", ")}`);
        });
        out.push("");
    }
    return out.join("\n");
}
87
/**
 * Render a coverage audit report as markdown.
 *
 * Output is a heading, a bold coverage headline, then one table for covered
 * features and one for uncovered features; each table is omitted when empty.
 *
 * @param report - Coverage audit report to render.
 * @returns {string} Markdown text, lines joined with "\n".
 */
export function formatCoverageMarkdown(report) {
    const { covered, uncovered } = report;
    const out = [
        "### 📊 Documentation Coverage Audit",
        "",
        `**Coverage: ${covered.length}/${report.totalFeatures} features (${report.coveragePercent}%)**`,
        "",
    ];
    if (covered.length > 0) {
        out.push("#### Covered Features", "", "| Feature | Tasks | Priority | Sections |", "|---------|-------|----------|----------|");
        covered.forEach((feature) => {
            out.push(`| ✅ ${feature.name} | ${feature.actualTaskCount} | ${feature.priority} | ${feature.sections.join(", ")} |`);
        });
        out.push("");
    }
    if (uncovered.length > 0) {
        out.push("#### Uncovered Features", "", "| Feature | Priority | Sections |", "|---------|----------|----------|");
        uncovered.forEach((feature) => {
            out.push(`| ❌ ${feature.name} | ${feature.priority} | ${feature.sections.join(", ")} |`);
        });
        out.push("");
    }
    return out.join("\n");
}
118
- // ---------------------------------------------------------------------------
119
- // Formatting
120
- // ---------------------------------------------------------------------------
121
/**
 * Load and validate the feature registry from config/features.yaml.
 *
 * Every failure mode is reported on stderr and signalled by returning null:
 * a missing file (silently), YAML that cannot be parsed, or content that
 * fails `FeatureRegistrySchema`. Parse errors used to escape as exceptions,
 * unlike the other failure modes.
 *
 * @param {string} rootDir - Directory that contains the `config` folder.
 * @returns The `features` array of the validated registry, or null.
 */
export function loadFeatureRegistry(rootDir) {
    const filePath = join(rootDir, "config", "features.yaml");
    if (!existsSync(filePath)) {
        return null;
    }
    const raw = readFileSync(filePath, "utf-8");
    let parsed;
    try {
        parsed = load(raw);
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.error(`❌ config/features.yaml could not be parsed: ${reason}`);
        return null;
    }
    const result = FeatureRegistrySchema.safeParse(parsed);
    if (!result.success) {
        console.error("❌ config/features.yaml validation failed:");
        for (const issue of result.error.issues) {
            console.error(` ${issue.path.join(".")}: ${issue.message}`);
        }
        return null;
    }
    return result.data.features;
}
141
/**
 * Build the structured coverage audit report for a project root.
 *
 * Features with status "covered" and an `area` are listed as covered and
 * annotated with the task count of that area (0 when the area has none).
 * Features with status "uncovered" or "planned" are listed as uncovered,
 * ordered critical, high, medium, low. Any other feature only contributes to
 * `totalFeatures`.
 *
 * @param {string} rootDir - Project root passed to the registry and task loaders.
 * @returns The report, or null when the feature registry could not be loaded.
 */
export function runCoverageAudit(rootDir) {
    const features = loadFeatureRegistry(rootDir);
    if (!features) {
        return null;
    }
    const taskCounts = countTasksByArea(rootDir);
    const priorityRank = { critical: 0, high: 1, medium: 2, low: 3 };
    const covered = features
        .filter((feature) => feature.status === "covered" && feature.area)
        .map((feature) => ({ ...feature, actualTaskCount: taskCounts[feature.area] ?? 0 }));
    // Sort uncovered by priority
    const uncovered = features
        .filter((feature) => feature.status === "uncovered" || feature.status === "planned")
        .sort((left, right) => priorityRank[left.priority] - priorityRank[right.priority]);
    const totalFeatures = features.length;
    const ratio = totalFeatures > 0 ? (covered.length / totalFeatures) * 100 : 0;
    return {
        // One decimal place.
        coveragePercent: Math.round(ratio * 10) / 10,
        covered,
        generatedAt: new Date().toISOString(),
        totalFeatures,
        uncovered,
    };
}
174
- // ---------------------------------------------------------------------------
175
- // CLI entry point
176
- // ---------------------------------------------------------------------------
177
/**
 * CLI entry point for the coverage audit.
 *
 * Output selection: `--json` prints the report as JSON; `--format md` or
 * `--format markdown` prints markdown; otherwise the console report is
 * printed, followed by the document utilization footer. The footer is
 * plain-text only, so it is skipped for both markdown spellings (it used to
 * be appended when the format was spelled "markdown").
 *
 * Exits with status 1 when the audit cannot be produced.
 */
function main() {
    const args = process.argv.slice(2);
    const formatIndex = args.indexOf("--format");
    const formatArg = formatIndex !== -1 ? args[formatIndex + 1] : undefined;
    const jsonOutput = args.includes("--json");
    const markdownOutput = formatArg === "md" || formatArg === "markdown";
    const report = runCoverageAudit(ROOT);
    if (!report) {
        console.error("❌ Coverage audit failed. Ensure config/features.yaml exists and is valid.");
        process.exit(1);
    }
    if (jsonOutput) {
        console.log(JSON.stringify(report, null, 2));
    }
    else if (markdownOutput) {
        console.log(formatCoverageMarkdown(report));
    }
    else {
        console.log(formatCoverageConsole(report));
    }
    // Also print document utilization stats
    if (!jsonOutput && !markdownOutput) {
        const docStats = countReferencedDocs(ROOT);
        console.log("DOCUMENT UTILIZATION:");
        console.log(` ${docStats.total} unique document slugs referenced across evaluation tasks`);
        console.log("");
    }
}
205
/**
 * Decide whether a script path names this module's own file.
 *
 * The file name must be exactly "coverage-audit.ts" or "coverage-audit.js";
 * a plain suffix check would also accept unrelated scripts whose names
 * merely end the same way.
 *
 * @param {string | undefined} scriptPath - Typically `process.argv[1]`.
 * @returns {boolean} True when the final path segment is the coverage-audit script.
 */
function isCoverageAuditEntryPoint(scriptPath) {
    return typeof scriptPath === "string" &&
        /(?:^|[\\/])coverage-audit\.(?:ts|js)$/.test(scriptPath);
}
// Only run when invoked directly
if (isCoverageAuditEntryPoint(process.argv[1])) {
    main();
}
@@ -1,19 +0,0 @@
1
- /**
2
- * debug-eval.ts
3
- *
4
- * Thin wrapper around `promptfoo eval` that reads DEBUG_EVAL_* environment
5
- * variables and forwards them as native promptfoo filter flags.
6
- *
7
- * Environment variables:
8
- * DEBUG_EVAL=1 — required to enable debug mode
9
- * DEBUG_EVAL_PATTERN=<re> — --filter-pattern (regex on test description)
10
- * DEBUG_EVAL_N=<number> — --filter-first-n (default: 2 when no other filters set)
11
- * DEBUG_EVAL_SAMPLE=<number> — --filter-sample (random N tests)
12
- *
13
- * Usage:
14
- * tsx src/scripts/debug-eval.ts --config promptfooconfig.yaml
15
- * tsx src/scripts/debug-eval.ts --config promptfooconfig.agentic.yaml --no-cache
16
- *
17
- * All extra argv are forwarded to promptfoo eval unchanged.
18
- */
19
- export {};