@sanity/ailf 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/cli.js +0 -0
- package/dist/orchestration/steps/run-eval-step.js +1 -1
- package/dist/pipeline/checks.d.ts +8 -3
- package/dist/pipeline/checks.js +23 -3
- package/package.json +25 -25
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
- package/dist/adapters/task-sources/yaml-task-source.js +0 -139
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
- package/dist/commands/update-quality-scores.d.ts +0 -5
- package/dist/commands/update-quality-scores.js +0 -20
- package/dist/lib/agent-behavior-report.d.ts +0 -8
- package/dist/lib/agent-behavior-report.js +0 -185
- package/dist/lib/baseline.d.ts +0 -19
- package/dist/lib/baseline.js +0 -153
- package/dist/lib/calculate-scores.d.ts +0 -23
- package/dist/lib/calculate-scores.js +0 -42
- package/dist/lib/compare.d.ts +0 -18
- package/dist/lib/compare.js +0 -170
- package/dist/lib/coverage-audit.d.ts +0 -4
- package/dist/lib/coverage-audit.js +0 -42
- package/dist/lib/discovery-report.d.ts +0 -13
- package/dist/lib/discovery-report.js +0 -57
- package/dist/lib/fetch-docs.d.ts +0 -30
- package/dist/lib/fetch-docs.js +0 -171
- package/dist/lib/generate-configs.d.ts +0 -25
- package/dist/lib/generate-configs.js +0 -42
- package/dist/lib/grader-api.d.ts +0 -21
- package/dist/lib/grader-api.js +0 -34
- package/dist/lib/grader-compare.d.ts +0 -19
- package/dist/lib/grader-compare.js +0 -91
- package/dist/lib/grader-consistency.d.ts +0 -27
- package/dist/lib/grader-consistency.js +0 -79
- package/dist/lib/grader-sensitivity.d.ts +0 -19
- package/dist/lib/grader-sensitivity.js +0 -75
- package/dist/lib/grader-validate.d.ts +0 -19
- package/dist/lib/grader-validate.js +0 -78
- package/dist/lib/measure-retrieval.d.ts +0 -14
- package/dist/lib/measure-retrieval.js +0 -71
- package/dist/lib/pr-comment.d.ts +0 -16
- package/dist/lib/pr-comment.js +0 -28
- package/dist/lib/readiness-report.d.ts +0 -13
- package/dist/lib/readiness-report.js +0 -108
- package/dist/lib/webhook-server.d.ts +0 -11
- package/dist/lib/webhook-server.js +0 -24
- package/dist/lib/weekly-digest.d.ts +0 -24
- package/dist/lib/weekly-digest.js +0 -148
- package/dist/orchestration/env-bridge.d.ts +0 -21
- package/dist/orchestration/env-bridge.js +0 -66
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
- package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
- package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
- package/dist/pipeline/compiler/task-bridge.js +0 -92
- package/dist/pipeline/expand-tasks.d.ts +0 -232
- package/dist/pipeline/expand-tasks.js +0 -467
- package/dist/pipeline/generate-configs.d.ts +0 -92
- package/dist/pipeline/generate-configs.js +0 -445
- package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/calculate-scores-step.js +0 -89
- package/dist/pipeline/steps/compare-step.d.ts +0 -18
- package/dist/pipeline/steps/compare-step.js +0 -90
- package/dist/pipeline/steps/eval-step.d.ts +0 -53
- package/dist/pipeline/steps/eval-step.js +0 -347
- package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
- package/dist/pipeline/steps/fetch-docs-step.js +0 -84
- package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
- package/dist/pipeline/steps/generate-configs-step.js +0 -98
- package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
- package/dist/pipeline/steps/grader-consistency-step.js +0 -74
- package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
- package/dist/pipeline/steps/publish-report-step.js +0 -243
- package/dist/pipeline/steps/report-step.d.ts +0 -13
- package/dist/pipeline/steps/report-step.js +0 -56
- package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/update-scores-step.js +0 -42
- package/dist/scripts/agent-behavior-report.d.ts +0 -19
- package/dist/scripts/agent-behavior-report.js +0 -315
- package/dist/scripts/baseline.d.ts +0 -43
- package/dist/scripts/baseline.js +0 -267
- package/dist/scripts/calculate-scores.d.ts +0 -166
- package/dist/scripts/calculate-scores.js +0 -1296
- package/dist/scripts/compare.d.ts +0 -22
- package/dist/scripts/compare.js +0 -334
- package/dist/scripts/coverage-audit.d.ts +0 -44
- package/dist/scripts/coverage-audit.js +0 -209
- package/dist/scripts/debug-eval.d.ts +0 -19
- package/dist/scripts/debug-eval.js +0 -73
- package/dist/scripts/discovery-report.d.ts +0 -58
- package/dist/scripts/discovery-report.js +0 -250
- package/dist/scripts/fetch-docs.d.ts +0 -35
- package/dist/scripts/fetch-docs.js +0 -472
- package/dist/scripts/generate-configs.d.ts +0 -66
- package/dist/scripts/generate-configs.js +0 -459
- package/dist/scripts/grader-api.d.ts +0 -27
- package/dist/scripts/grader-api.js +0 -206
- package/dist/scripts/grader-compare.d.ts +0 -22
- package/dist/scripts/grader-compare.js +0 -368
- package/dist/scripts/grader-consistency.d.ts +0 -20
- package/dist/scripts/grader-consistency.js +0 -313
- package/dist/scripts/grader-sensitivity.d.ts +0 -22
- package/dist/scripts/grader-sensitivity.js +0 -354
- package/dist/scripts/grader-validate.d.ts +0 -19
- package/dist/scripts/grader-validate.js +0 -267
- package/dist/scripts/measure-retrieval.d.ts +0 -10
- package/dist/scripts/measure-retrieval.js +0 -145
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
- package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
- package/dist/scripts/pipeline.d.ts +0 -76
- package/dist/scripts/pipeline.js +0 -1031
- package/dist/scripts/pr-comment.d.ts +0 -10
- package/dist/scripts/pr-comment.js +0 -510
- package/dist/scripts/readiness-report.d.ts +0 -88
- package/dist/scripts/readiness-report.js +0 -342
- package/dist/scripts/update-quality-scores.d.ts +0 -15
- package/dist/scripts/update-quality-scores.js +0 -184
- package/dist/scripts/validate-task-sources.d.ts +0 -21
- package/dist/scripts/validate-task-sources.js +0 -210
- package/dist/scripts/validate.d.ts +0 -13
- package/dist/scripts/validate.js +0 -79
- package/dist/scripts/webhook-server.d.ts +0 -26
- package/dist/scripts/webhook-server.js +0 -147
- package/dist/scripts/weekly-digest.d.ts +0 -24
- package/dist/scripts/weekly-digest.js +0 -144
- package/dist/sinks/format-slack.d.ts +0 -64
- package/dist/sinks/format-slack.js +0 -306
- package/dist/sinks/slack-sink.d.ts +0 -27
- package/dist/sinks/slack-sink.js +0 -78
- package/dist/sinks/webhook-sink.d.ts +0 -19
- package/dist/sinks/webhook-sink.js +0 -50
- package/tasks/.expanded.agentic.yaml +0 -280
- package/tasks/.expanded.yaml +0 -565
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* compare.ts
|
|
3
|
-
*
|
|
4
|
-
* CLI for structured comparison between two evaluation runs.
|
|
5
|
-
*
|
|
6
|
-
* Usage:
|
|
7
|
-
* pnpm compare # compare current vs latest baseline
|
|
8
|
-
* pnpm compare --baseline <path> # compare current vs specific file
|
|
9
|
-
* pnpm compare --baseline <path> --experiment <path> # compare two specific files
|
|
10
|
-
* pnpm compare --threshold 5 # custom noise threshold
|
|
11
|
-
* pnpm compare --output /tmp/comparison.json # write JSON report to file
|
|
12
|
-
* pnpm compare --format json # output raw JSON (default: table)
|
|
13
|
-
*
|
|
14
|
-
* Reads: results/latest/score-summary.json (as experiment, unless --experiment)
|
|
15
|
-
* Reads: results/baselines/<latest>.json (as baseline, unless --baseline)
|
|
16
|
-
*/
|
|
17
|
-
import { type ComparisonReport } from "../pipeline/types.js";
|
|
18
|
-
/**
|
|
19
|
-
* Generate a markdown comparison section suitable for PR comments.
|
|
20
|
-
*/
|
|
21
|
-
export declare function formatComparisonMarkdown(report: ComparisonReport): string;
|
|
22
|
-
/**
 * Format a comparison report as plain text for console output
 * (the CLI's default `table` format).
 */
export declare function formatComparisonTable(report: ComparisonReport): string;
|
package/dist/scripts/compare.js
DELETED
|
@@ -1,334 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* compare.ts
|
|
3
|
-
*
|
|
4
|
-
* CLI for structured comparison between two evaluation runs.
|
|
5
|
-
*
|
|
6
|
-
* Usage:
|
|
7
|
-
* pnpm compare # compare current vs latest baseline
|
|
8
|
-
* pnpm compare --baseline <path> # compare current vs specific file
|
|
9
|
-
* pnpm compare --baseline <path> --experiment <path> # compare two specific files
|
|
10
|
-
* pnpm compare --threshold 5 # custom noise threshold
|
|
11
|
-
* pnpm compare --output /tmp/comparison.json # write JSON report to file
|
|
12
|
-
* pnpm compare --format json # output raw JSON (default: table)
|
|
13
|
-
*
|
|
14
|
-
* Reads: results/latest/score-summary.json (as experiment, unless --experiment)
|
|
15
|
-
* Reads: results/baselines/<latest>.json (as baseline, unless --baseline)
|
|
16
|
-
*/
|
|
17
|
-
import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
|
|
18
|
-
import { dirname, join, resolve } from "path";
|
|
19
|
-
import { fileURLToPath } from "url";
|
|
20
|
-
import { compare } from "../pipeline/compare.js";
|
|
21
|
-
import { DEFAULT_NOISE_THRESHOLD, } from "../pipeline/types.js";
|
|
22
|
-
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
23
|
-
const ROOT = resolve(__dirname, "..", "..");
|
|
24
|
-
const BASELINES_DIR = join(ROOT, "results", "baselines");
|
|
25
|
-
const SCORE_SUMMARY_PATH = join(ROOT, "results", "latest", "score-summary.json");
|
|
26
|
-
// ---------------------------------------------------------------------------
|
|
27
|
-
// CLI argument parsing
|
|
28
|
-
// ---------------------------------------------------------------------------
|
|
29
|
-
// Arguments that follow the script path on the command line.
const args = process.argv.slice(2);

/**
 * Report whether a boolean switch is present on the command line.
 *
 * The long form `--name` always matches. Single-character names also match
 * the conventional short form (`-h` for `getFlag("h")`), which is what the
 * help text advertises; `--h` keeps working for backward compatibility.
 *
 * @param {string} name - Flag name without leading dashes.
 * @param {string[]} [argv] - Argument list to search (defaults to the process arguments).
 * @returns {boolean} True when the flag is present.
 */
function getFlag(name, argv = args) {
    if (argv.includes(`--${name}`)) {
        return true;
    }
    return name.length === 1 && argv.includes(`-${name}`);
}
|
|
33
|
-
/**
 * Read the value that follows `--name` on the command line.
 *
 * Returns `undefined` when the option is absent, when it is the last
 * argument, or when the next argument is itself a long flag (`--something`).
 * The last case prevents `--baseline --format json` from treating
 * `--format` as the baseline path.
 *
 * @param {string} name - Option name without leading dashes.
 * @param {string[]} [argv] - Argument list to search (defaults to the process arguments).
 * @returns {string | undefined} The option value, if one was supplied.
 */
function getOption(name, argv = args) {
    const idx = argv.indexOf(`--${name}`);
    if (idx === -1) {
        return undefined;
    }
    const value = argv[idx + 1];
    if (value === undefined || value.startsWith("--")) {
        return undefined;
    }
    return value;
}
|
|
37
|
-
// Command-line options, resolved once when the module is loaded.
const baselinePath = getOption("baseline");
const experimentPath = getOption("experiment");
const thresholdStr = getOption("threshold");
const threshold = resolveNoiseThreshold(thresholdStr);
const outputPath = getOption("output");
const format = getOption("format") ?? "table";
const showHelp = getFlag("help") || getFlag("h");

/**
 * Turn the raw `--threshold` value into a usable noise threshold.
 *
 * A value that does not parse to a finite, non-negative number (for example
 * `--threshold abc`, which parses to NaN) would make every threshold
 * comparison false, so it is reported on stderr and replaced by the default.
 *
 * @param {string | undefined} raw - Raw option value, if any.
 * @returns {number} The threshold to use.
 */
function resolveNoiseThreshold(raw) {
    if (!raw) {
        return DEFAULT_NOISE_THRESHOLD;
    }
    const parsed = parseFloat(raw);
    if (Number.isFinite(parsed) && parsed >= 0) {
        return parsed;
    }
    console.error(`Ignoring invalid --threshold value "${raw}"; using default ${DEFAULT_NOISE_THRESHOLD}.`);
    return DEFAULT_NOISE_THRESHOLD;
}

// Print usage and stop before any files are touched.
if (showHelp) {
    console.log(`
Usage: pnpm compare [options]

Compare two evaluation score summaries and produce structured deltas.

Options:
--baseline <path> Baseline score-summary.json (default: latest baseline)
--experiment <path> Experiment score-summary.json (default: results/latest/score-summary.json)
--threshold <n> Noise threshold for unchanged classification (default: ${DEFAULT_NOISE_THRESHOLD})
--output <path> Write JSON report to file
--format <fmt> Output format: table (default) or json
--help, -h Show this help

Examples:
pnpm compare # current scores vs latest baseline
pnpm compare --threshold 5 # wider noise band
pnpm compare --format json # machine-readable output
pnpm compare --baseline results/baselines/20260310_02_43_44.json
pnpm compare --baseline before.json --experiment after.json
`);
    process.exit(0);
}
|
|
69
|
-
// ---------------------------------------------------------------------------
|
|
70
|
-
// Report formatting (markdown and plain-text table)
|
|
71
|
-
// ---------------------------------------------------------------------------
|
|
72
|
-
/**
 * Generate a markdown comparison section suitable for PR comments.
 *
 * The section contains, in order: a heading, the overall score change, a
 * per-area table, a one-line improved/regressed/unchanged tally, and a
 * collapsible `<details>` table of dimension deltas (plus cost when the
 * report carries a numeric cost delta).
 *
 * @param {object} report - Comparison report. Reads `baseline`, `experiment`,
 *   `deltas`, `noiseThreshold`, `areas`, `improved`, `regressed`, `unchanged`.
 * @returns {string} Markdown text, lines joined with "\n".
 */
export function formatComparisonMarkdown(report) {
    const lines = [];
    const { deltas, noiseThreshold } = report;
    const overall = deltas.overall;
    // Classify the overall delta against the noise band.
    let overallChange = "unchanged";
    if (overall > noiseThreshold) {
        overallChange = "improved";
    }
    else if (overall < -noiseThreshold) {
        overallChange = "regressed";
    }
    const overallIcon = changeIcon(overallChange);
    lines.push("### 📊 Score Comparison");
    lines.push("");
    lines.push(`**Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)}** (${overallIcon} ${deltaStr(overall)})`);
    lines.push("");
    // Per-area table
    lines.push("| Feature | Baseline | Current | Delta | Task | Code | Docs |");
    lines.push("|---------|----------|---------|-------|------|------|------|");
    for (const a of report.areas) {
        const icon = changeIcon(a.change);
        lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${icon} ${deltaStr(a.delta)} | ${deltaStr(a.dimensions.taskCompletion.delta)} | ${deltaStr(a.dimensions.codeCorrectness.delta)} | ${deltaStr(a.dimensions.docCoverage.delta)} |`);
    }
    lines.push("");
    // Summary
    const parts = [];
    if (report.improved.length > 0) {
        parts.push(`📈 ${report.improved.length} improved`);
    }
    if (report.regressed.length > 0) {
        parts.push(`📉 ${report.regressed.length} regressed`);
    }
    if (report.unchanged.length > 0) {
        parts.push(`➡️ ${report.unchanged.length} unchanged`);
    }
    if (parts.length > 0) {
        lines.push(parts.join(" · "));
        lines.push("");
    }
    // Dimension averages in collapsible
    lines.push("<details>");
    lines.push("<summary>Dimension averages</summary>");
    lines.push("");
    const dim = deltas.perDimension;
    lines.push("| Dimension | Delta |");
    lines.push("|-----------|-------|");
    lines.push(`| Task Completion | ${deltaStr(dim.taskCompletion)} |`);
    lines.push(`| Code Correctness | ${deltaStr(dim.codeCorrectness)} |`);
    lines.push(`| Doc Coverage | ${deltaStr(dim.docCoverage)} |`);
    lines.push(`| Doc Lift | ${deltaStr(deltas.docLift)} |`);
    if (typeof deltas.cost === "number") {
        // Sign goes before the currency symbol; a delta that rounds to zero
        // gets no sign (previously a zero delta was printed as "-$0.0000").
        const magnitude = Math.abs(deltas.cost).toFixed(4);
        let sign = "";
        if (Number(magnitude) !== 0) {
            sign = deltas.cost > 0 ? "+" : "-";
        }
        lines.push(`| Cost | ${sign}$${magnitude} |`);
    }
    lines.push("");
    lines.push("</details>");
    lines.push("");
    return lines.join("\n");
}
|
|
132
|
-
/**
 * Format a comparison report as plain text for console output.
 *
 * Sections, in order: banner, overall score change, dimension deltas (plus
 * cost when the report carries a numeric cost delta), per-area table,
 * improved/regressed/unchanged lists, area mismatches (when any), the noise
 * threshold and its source, and — when at least one area has a
 * `ceilingDelta` — a ceiling decomposition table.
 *
 * @param {object} report - Comparison report. Reads `baseline`, `experiment`,
 *   `deltas`, `noiseThreshold`, `noiseThresholdEmpirical`, `areas`,
 *   `improved`, `regressed`, `unchanged`, `mismatched`.
 * @returns {string} Report text, lines joined with "\n".
 */
export function formatComparisonTable(report) {
    // Areas that lack a ceiling metric are shown as "n/a" rather than as a
    // zero delta, which would be indistinguishable from a measured "no change".
    const optionalDelta = (value) => (value === undefined || value === null ? "n/a" : deltaStr(value));
    const lines = [];
    lines.push("=".repeat(80));
    lines.push(" COMPARISON REPORT");
    lines.push("=".repeat(80));
    lines.push("");
    // Overall summary
    const overall = report.deltas.overall;
    let overallChange = "unchanged";
    if (overall > report.noiseThreshold) {
        overallChange = "improved";
    }
    else if (overall < -report.noiseThreshold) {
        overallChange = "regressed";
    }
    const overallIcon = changeIcon(overallChange);
    lines.push(` Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)} (${overallIcon} ${deltaStr(overall)})`);
    lines.push("");
    // Per-dimension averages
    const dim = report.deltas.perDimension;
    lines.push(" Dimension averages:");
    lines.push(` Task Completion: ${deltaStr(dim.taskCompletion)}`);
    lines.push(` Code Correctness: ${deltaStr(dim.codeCorrectness)}`);
    lines.push(` Doc Coverage: ${deltaStr(dim.docCoverage)}`);
    lines.push(` Doc Lift: ${deltaStr(report.deltas.docLift)}`);
    if (typeof report.deltas.cost === "number") {
        // Sign goes before the currency symbol (previously a negative delta
        // was printed as "$-0.0100"); a delta that rounds to zero gets no sign.
        const magnitude = Math.abs(report.deltas.cost).toFixed(4);
        let sign = "";
        if (Number(magnitude) !== 0) {
            sign = report.deltas.cost > 0 ? "+" : "-";
        }
        lines.push(` Cost: ${sign}$${magnitude}`);
    }
    lines.push("");
    // Per-area table
    lines.push("-".repeat(80));
    lines.push("PER-AREA BREAKDOWN");
    lines.push("-".repeat(80));
    lines.push("");
    const h = "| Feature Area | Baseline | Experiment | Delta | Task | Code | Docs |";
    const sep = "|---------------------|----------|------------|-------|------|------|------|";
    lines.push(h);
    lines.push(sep);
    for (const a of report.areas) {
        const icon = changeIcon(a.change);
        lines.push(`| ${icon} ${a.area.padEnd(17)} | ${String(a.baseline).padStart(8)} | ${String(a.experiment).padStart(10)} | ${deltaStr(a.delta).padStart(5)} | ${deltaStr(a.dimensions.taskCompletion.delta).padStart(4)} | ${deltaStr(a.dimensions.codeCorrectness.delta).padStart(4)} | ${deltaStr(a.dimensions.docCoverage.delta).padStart(4)} |`);
    }
    lines.push("");
    // Classification summary
    if (report.improved.length > 0) {
        lines.push(` 📈 Improved: ${report.improved.join(", ")}`);
    }
    if (report.regressed.length > 0) {
        lines.push(` 📉 Regressed: ${report.regressed.join(", ")}`);
    }
    if (report.unchanged.length > 0) {
        lines.push(` ➡️ Unchanged: ${report.unchanged.join(", ")}`);
    }
    lines.push("");
    // Mismatched areas
    const { onlyInBaseline, onlyInExperiment } = report.mismatched;
    if (onlyInBaseline.length > 0 || onlyInExperiment.length > 0) {
        lines.push(" ⚠️ Area mismatches:");
        if (onlyInBaseline.length > 0) {
            lines.push(` Only in baseline: ${onlyInBaseline.join(", ")}`);
        }
        if (onlyInExperiment.length > 0) {
            lines.push(` Only in experiment: ${onlyInExperiment.join(", ")}`);
        }
        lines.push("");
    }
    const isEmpirical = "noiseThresholdEmpirical" in report &&
        report.noiseThresholdEmpirical === true;
    const thresholdSource = isEmpirical
        ? "empirical, from grader consistency data"
        : "default";
    lines.push(` Noise threshold: ±${report.noiseThreshold}${Number.isInteger(report.noiseThreshold) ? "" : ` (${report.noiseThreshold.toFixed(1)})`} (${thresholdSource})`);
    lines.push("");
    // Ceiling decomposition deltas (when areas have ceiling data)
    const hasCeilingData = report.areas.some((a) => a.ceilingDelta !== undefined);
    if (hasCeilingData) {
        lines.push("-".repeat(80));
        lines.push("CEILING DECOMPOSITION DELTAS");
        lines.push("-".repeat(80));
        lines.push("");
        const cH = "| Feature Area | Ceiling Δ | Floor Δ | Doc Lift Δ |";
        const cSep = "|---------------------|-----------|---------|------------|";
        lines.push(cH);
        lines.push(cSep);
        for (const a of report.areas) {
            lines.push(`| ${a.area.padEnd(19)} | ` +
                `${optionalDelta(a.ceilingDelta).padStart(9)} | ` +
                `${optionalDelta(a.floorDelta).padStart(7)} | ` +
                `${optionalDelta(a.docLiftDelta).padStart(10)} |`);
        }
        lines.push("");
    }
    return lines.join("\n");
}
|
|
223
|
-
// ---------------------------------------------------------------------------
|
|
224
|
-
// Formatting helpers and file loading
|
|
225
|
-
// ---------------------------------------------------------------------------
|
|
226
|
-
/**
 * Pick the emoji that represents a change classification.
 *
 * @param {string} change - "improved", "regressed", or anything else.
 * @returns {string} 📈 for "improved", 📉 for "regressed", ➡️ otherwise.
 */
function changeIcon(change) {
    if (change === "improved") {
        return "📈";
    }
    if (change === "regressed") {
        return "📉";
    }
    return "➡️";
}
|
|
236
|
-
/**
 * Render a numeric delta as a signed whole number.
 *
 * The value is rounded first and the sign is taken from the rounded result,
 * so small deltas on either side of zero both render as "0". (Previously the
 * sign was taken before rounding: 0.4 rendered as "+0" while -0.4 rendered
 * as "0".)
 *
 * @param {number} d - Delta to render; non-numeric input renders as "0".
 * @returns {string} "+N" for positive, "-N" for negative, "0" otherwise.
 */
function deltaStr(d) {
    const rounded = Math.round(d);
    if (rounded > 0) {
        return `+${rounded}`;
    }
    if (rounded < 0) {
        return `${rounded}`;
    }
    // Covers 0, -0 and NaN (e.g. an undefined delta).
    return "0";
}
|
|
243
|
-
/**
 * Locate the most recent baseline file.
 *
 * "Most recent" means the `.json` file name that sorts last in plain string
 * order, so it relies on file names that sort chronologically.
 *
 * @param {string} [dir] - Directory to search (defaults to the baselines directory).
 * @returns {string | null} Full path of the latest baseline, or null when the
 *   directory is missing or contains no `.json` files.
 */
function findLatestBaseline(dir = BASELINES_DIR) {
    if (!existsSync(dir)) {
        return null;
    }
    // A single pass keeps the greatest name; no need to sort the whole list.
    let latest = null;
    for (const file of readdirSync(dir)) {
        if (file.endsWith(".json") && (latest === null || file > latest)) {
            latest = file;
        }
    }
    return latest === null ? null : join(dir, latest);
}
|
|
252
|
-
/**
 * Read and parse a score-summary JSON file.
 *
 * Any failure is reported on stderr and terminates the process with exit
 * code 1: a missing file, an unreadable file, or content that is not valid
 * JSON. (Previously the latter two escaped as an uncaught exception.)
 *
 * @param {string} path - Path of the JSON file to load.
 * @returns {*} The parsed JSON value.
 */
function loadSummary(path) {
    if (!existsSync(path)) {
        console.error(`❌ File not found: ${path}`);
        process.exit(1);
    }
    try {
        return JSON.parse(readFileSync(path, "utf-8"));
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.error(`❌ Could not load ${path}: ${reason}`);
        process.exit(1);
    }
}
|
|
260
|
-
// ---------------------------------------------------------------------------
|
|
261
|
-
// Main
|
|
262
|
-
// ---------------------------------------------------------------------------
|
|
263
|
-
/**
 * CLI entry point: load two score summaries, compare them, print the result
 * and persist the report.
 *
 * - Experiment: `--experiment`, else results/latest/score-summary.json.
 * - Baseline: `--baseline`, else the latest file in results/baselines;
 *   exits with code 1 when none is found.
 * - Grader consistency data (results/latest/grader-consistency.json) is
 *   passed to `compare` only when no explicit `--threshold` was given.
 * - Output: a text table by default, or JSON with `--format json`;
 *   `--output` writes the JSON report to that path.
 * - The report is always written to results/latest/comparison-report.json.
 */
function main() {
    // With `--format json` and no `--output`, the report itself is printed on
    // stdout. Progress lines then go to stderr so stdout stays parseable JSON.
    const reportOnStdout = format === "json" && !outputPath;
    const status = (message) => {
        if (reportOnStdout) {
            console.error(message);
        }
        else {
            console.log(message);
        }
    };
    // Resolve experiment path
    const expPath = experimentPath ?? SCORE_SUMMARY_PATH;
    const experiment = loadSummary(expPath);
    // Resolve baseline path
    let basePath;
    if (baselinePath) {
        basePath = resolve(baselinePath);
    }
    else {
        const latest = findLatestBaseline();
        if (!latest) {
            console.error("❌ No baselines found. Run 'pnpm baseline:save' first, or use --baseline <path>.");
            process.exit(1);
        }
        basePath = latest;
    }
    const baseline = loadSummary(basePath);
    // Try to load grader consistency data for empirical thresholds
    const consistencyPath = join(ROOT, "results", "latest", "grader-consistency.json");
    let graderConsistency;
    if (existsSync(consistencyPath) && !thresholdStr) {
        try {
            const consistencyRaw = JSON.parse(readFileSync(consistencyPath, "utf-8"));
            if (consistencyRaw.recommendedThreshold && consistencyRaw.perDimension) {
                graderConsistency =
                    consistencyRaw;
                status(` 📊 Using empirical noise threshold: ±${graderConsistency.recommendedThreshold.toFixed(1)} (from grader consistency data)`);
            }
        }
        catch {
            // Non-fatal — fall back to default threshold
        }
    }
    status(` Baseline: ${basePath}`);
    status(` Experiment: ${expPath}`);
    if (!graderConsistency) {
        status(` Threshold: ±${threshold} (default — run --grader-replications for empirical threshold)`);
    }
    status("");
    const report = compare(baseline, experiment, {
        graderConsistency,
        noiseThreshold: threshold,
    });
    if (format === "json") {
        const json = JSON.stringify(report, null, 2);
        if (outputPath) {
            writeFileSync(outputPath, json);
            console.log(` ✅ Comparison report written to ${outputPath}`);
        }
        else {
            console.log(json);
        }
    }
    else {
        const table = formatComparisonTable(report);
        console.log(table);
        if (outputPath) {
            const json = JSON.stringify(report, null, 2);
            writeFileSync(outputPath, json);
            console.log(` ✅ Comparison report also written to ${outputPath}`);
        }
    }
    // Write comparison report to results/latest for other steps to consume
    const latestComparisonPath = join(ROOT, "results", "latest", "comparison-report.json");
    writeFileSync(latestComparisonPath, JSON.stringify(report, null, 2));
}
|
|
330
|
-
/**
 * Decide whether a script path refers to this CLI (compare.ts / compare.js).
 *
 * The file name must match exactly. A plain suffix check would also accept
 * other scripts whose names merely end in "compare.js", such as
 * grader-compare.js, and run `main()` when they import this module.
 *
 * @param {string | undefined} scriptPath - Usually `process.argv[1]`.
 * @returns {boolean} True when the path's file name is compare.ts or compare.js.
 */
function isCompareEntryPoint(scriptPath) {
    return typeof scriptPath === "string" && /(?:^|[\\\/])compare\.[jt]s$/.test(scriptPath);
}
// Only run when invoked directly
if (isCompareEntryPoint(process.argv[1])) {
    main();
}
|
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* coverage-audit.ts
|
|
3
|
-
*
|
|
4
|
-
* CLI script that cross-references the product feature registry
|
|
5
|
-
* (config/features.yaml) against actual task files (tasks/*.yaml)
|
|
6
|
-
* to produce a documentation coverage audit.
|
|
7
|
-
*
|
|
8
|
-
* Phase 3c of the Scenario Matrix implementation.
|
|
9
|
-
*
|
|
10
|
-
* Usage:
|
|
11
|
-
* pnpm coverage-audit # console report
|
|
12
|
-
* pnpm coverage-audit --format md # markdown output
|
|
13
|
-
* pnpm coverage-audit --json # JSON output
|
|
14
|
-
*
|
|
15
|
-
* @see docs/exec-plans/completed/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
16
|
-
*/
|
|
17
|
-
import type { CoverageAuditReport, ProductFeature } from "../pipeline/types.js";
|
|
18
|
-
/**
|
|
19
|
-
* Count unique document slugs referenced across all tasks.
|
|
20
|
-
*/
|
|
21
|
-
export declare function countReferencedDocs(rootDir: string): {
|
|
22
|
-
slugs: string[];
|
|
23
|
-
total: number;
|
|
24
|
-
};
|
|
25
|
-
/**
|
|
26
|
-
* Count actual tasks per area from task YAML files.
|
|
27
|
-
*/
|
|
28
|
-
export declare function countTasksByArea(rootDir: string): Record<string, number>;
|
|
29
|
-
/**
|
|
30
|
-
* Format coverage audit for console output.
|
|
31
|
-
*/
|
|
32
|
-
export declare function formatCoverageConsole(report: CoverageAuditReport): string;
|
|
33
|
-
/**
|
|
34
|
-
* Format coverage audit as markdown.
|
|
35
|
-
*/
|
|
36
|
-
export declare function formatCoverageMarkdown(report: CoverageAuditReport): string;
|
|
37
|
-
/**
|
|
38
|
-
* Load and validate the feature registry from config/features.yaml.
|
|
39
|
-
*/
|
|
40
|
-
export declare function loadFeatureRegistry(rootDir: string): null | ProductFeature[];
|
|
41
|
-
/**
|
|
42
|
-
* Run the coverage audit and produce a structured report.
|
|
43
|
-
*/
|
|
44
|
-
export declare function runCoverageAudit(rootDir: string): CoverageAuditReport | null;
|
|
@@ -1,209 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* coverage-audit.ts
|
|
3
|
-
*
|
|
4
|
-
* CLI script that cross-references the product feature registry
|
|
5
|
-
* (config/features.yaml) against actual task files (tasks/*.yaml)
|
|
6
|
-
* to produce a documentation coverage audit.
|
|
7
|
-
*
|
|
8
|
-
* Phase 3c of the Scenario Matrix implementation.
|
|
9
|
-
*
|
|
10
|
-
* Usage:
|
|
11
|
-
* pnpm coverage-audit # console report
|
|
12
|
-
* pnpm coverage-audit --format md # markdown output
|
|
13
|
-
* pnpm coverage-audit --json # JSON output
|
|
14
|
-
*
|
|
15
|
-
* @see docs/exec-plans/completed/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
16
|
-
*/
|
|
17
|
-
import { existsSync, readFileSync } from "fs";
|
|
18
|
-
import { dirname, join, resolve } from "path";
|
|
19
|
-
import { fileURLToPath } from "url";
|
|
20
|
-
import { load } from "js-yaml";
|
|
21
|
-
import { FeatureRegistrySchema } from "../pipeline/schemas.js";
|
|
22
|
-
import { resolveMappings } from "../pipeline/resolve-mappings.js";
|
|
23
|
-
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
24
|
-
const ROOT = resolve(__dirname, "..", "..");
|
|
25
|
-
// ---------------------------------------------------------------------------
|
|
26
|
-
// Core logic (exported for testing)
|
|
27
|
-
// ---------------------------------------------------------------------------
|
|
28
|
-
/**
 * Collect the distinct document slugs referenced by any task.
 *
 * Walks every feature area returned by `resolveMappings(rootDir)`, every task
 * in each area, and every entry in the task's `canonical_docs`.
 *
 * @param {string} rootDir - Root directory handed to `resolveMappings`.
 * @returns {{ slugs: string[], total: number }} Sorted unique slugs and their count.
 */
export function countReferencedDocs(rootDir) {
    const { feature_areas: featureAreas } = resolveMappings(rootDir);
    // Lazily yields every slug, duplicates included; the Set removes repeats.
    function* referencedSlugs() {
        for (const { tasks } of Object.values(featureAreas)) {
            for (const { canonical_docs: docs } of tasks) {
                for (const { slug } of docs) {
                    yield slug;
                }
            }
        }
    }
    const slugs = Array.from(new Set(referencedSlugs())).sort();
    return { slugs, total: slugs.length };
}
|
|
44
|
-
/**
 * Tally how many tasks each feature area contains.
 *
 * @param {string} rootDir - Root directory handed to `resolveMappings`.
 * @returns {Record<string, number>} Task count keyed by feature-area name.
 */
export function countTasksByArea(rootDir) {
    const { feature_areas: featureAreas } = resolveMappings(rootDir);
    return Object.keys(featureAreas).reduce((tally, area) => {
        tally[area] = featureAreas[area].tasks.length;
        return tally;
    }, {});
}
|
|
55
|
-
/**
 * Render a coverage audit report as plain text for the console.
 *
 * Output is a banner, the coverage ratio, then a "covered" list and an
 * "uncovered" list; each list is omitted when it is empty.
 *
 * @param {object} report - Audit report. Reads `covered`, `uncovered`,
 *   `totalFeatures`, `coveragePercent`.
 * @returns {string} Report text, lines joined with "\n".
 */
export function formatCoverageConsole(report) {
    const banner = "═══════════════════════════════════════════════════════════════";
    const { covered, uncovered } = report;
    const out = [
        banner,
        " DOCUMENTATION COVERAGE AUDIT",
        banner,
        "",
        `Coverage: ${covered.length}/${report.totalFeatures} features (${report.coveragePercent}%)`,
        "",
    ];
    // Covered features, one row each with the task count
    if (covered.length > 0) {
        out.push("COVERED FEATURES:");
        covered.forEach((feature) => {
            const count = feature.actualTaskCount;
            const taskLabel = count === 1 ? "1 task" : `${count} tasks`;
            const sectionList = feature.sections.join(", ");
            out.push(` ✅ ${feature.id.padEnd(20)} ${taskLabel.padEnd(10)} ${feature.priority.padEnd(10)} ${sectionList}`);
        });
        out.push("");
    }
    // Uncovered features, one row each
    if (uncovered.length > 0) {
        out.push("UNCOVERED FEATURES (by priority):");
        uncovered.forEach((feature) => {
            const sectionList = feature.sections.join(", ");
            out.push(` ❌ ${feature.id.padEnd(20)} ${feature.priority.padEnd(10)} ${sectionList}`);
        });
        out.push("");
    }
    return out.join("\n");
}
|
|
87
|
-
/**
 * Render a coverage audit report as markdown.
 *
 * Output is a heading, the coverage ratio in bold, then a "Covered Features"
 * table and an "Uncovered Features" table; each table is omitted when its
 * list is empty.
 *
 * @param {object} report - Audit report. Reads `covered`, `uncovered`,
 *   `totalFeatures`, `coveragePercent`.
 * @returns {string} Markdown text, lines joined with "\n".
 */
export function formatCoverageMarkdown(report) {
    const { covered, uncovered } = report;
    const out = [
        "### 📊 Documentation Coverage Audit",
        "",
        `**Coverage: ${covered.length}/${report.totalFeatures} features (${report.coveragePercent}%)**`,
        "",
    ];
    if (covered.length > 0) {
        out.push("#### Covered Features", "", "| Feature | Tasks | Priority | Sections |", "|---------|-------|----------|----------|");
        covered.forEach((feature) => {
            out.push(`| ✅ ${feature.name} | ${feature.actualTaskCount} | ${feature.priority} | ${feature.sections.join(", ")} |`);
        });
        out.push("");
    }
    if (uncovered.length > 0) {
        out.push("#### Uncovered Features", "", "| Feature | Priority | Sections |", "|---------|----------|----------|");
        uncovered.forEach((feature) => {
            out.push(`| ❌ ${feature.name} | ${feature.priority} | ${feature.sections.join(", ")} |`);
        });
        out.push("");
    }
    return out.join("\n");
}
|
|
118
|
-
// ---------------------------------------------------------------------------
|
|
119
|
-
// Feature registry loading and coverage audit
|
|
120
|
-
// ---------------------------------------------------------------------------
|
|
121
|
-
/**
 * Load and validate the feature registry from config/features.yaml.
 *
 * Every failure mode is reported the same way: a message on stderr (except
 * for a missing file, which is silent) and a `null` return. This includes
 * read errors and malformed YAML, so callers only need to handle `null`.
 *
 * @param {string} rootDir - Directory that contains the `config` folder.
 * @returns {Array|null} The `features` entry of the validated registry, or
 *   `null` when the file is missing, unreadable, unparsable, or fails
 *   `FeatureRegistrySchema` validation.
 */
export function loadFeatureRegistry(rootDir) {
    const filePath = join(rootDir, "config", "features.yaml");
    if (!existsSync(filePath)) {
        return null;
    }
    let parsed;
    try {
        // Both steps can throw: the file may vanish or be unreadable after the
        // existence check, and the parser rejects malformed documents.
        parsed = load(readFileSync(filePath, "utf-8"));
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.error(`❌ config/features.yaml could not be read or parsed: ${reason}`);
        return null;
    }
    const result = FeatureRegistrySchema.safeParse(parsed);
    if (!result.success) {
        console.error("❌ config/features.yaml validation failed:");
        for (const issue of result.error.issues) {
            console.error(` ${issue.path.join(".")}: ${issue.message}`);
        }
        return null;
    }
    return result.data.features;
}
|
|
141
|
-
/**
 * Build the structured coverage report for the project at `rootDir`.
 *
 * Features with status "covered" that also name an area are listed under
 * `covered`, annotated with the task count recorded for that area (0 when the
 * area has no entry). Features with status "uncovered" or "planned" are
 * listed under `uncovered`, ordered critical, high, medium, low. Features
 * matching neither rule appear in no list but still count toward
 * `totalFeatures`.
 *
 * @param {string} rootDir - Project root passed to the registry loader and
 *   the task counter.
 * @returns {object|null} The report, or `null` when `loadFeatureRegistry`
 *   returns a falsy value.
 */
export function runCoverageAudit(rootDir) {
    const registry = loadFeatureRegistry(rootDir);
    if (!registry) {
        return null;
    }
    const tasksPerArea = countTasksByArea(rootDir);
    const covered = registry
        .filter((feature) => feature.status === "covered" && feature.area)
        .map((feature) => ({
        ...feature,
        actualTaskCount: tasksPerArea[feature.area] ?? 0,
    }));
    // Lower rank sorts first.
    const rank = { critical: 0, high: 1, low: 3, medium: 2 };
    const uncovered = registry
        .filter((feature) => feature.status === "uncovered" || feature.status === "planned")
        .sort((left, right) => rank[left.priority] - rank[right.priority]);
    const totalFeatures = registry.length;
    const rawPercent = totalFeatures > 0 ? (covered.length / totalFeatures) * 100 : 0;
    return {
        // Rounded to one decimal place.
        coveragePercent: Math.round(rawPercent * 10) / 10,
        covered,
        generatedAt: new Date().toISOString(),
        totalFeatures,
        uncovered,
    };
}
|
|
174
|
-
// ---------------------------------------------------------------------------
|
|
175
|
-
// CLI entry point
|
|
176
|
-
// ---------------------------------------------------------------------------
|
|
177
|
-
/**
 * CLI entry point: run the coverage audit and print it.
 *
 * Recognised arguments:
 *   --json                  print the report as pretty-printed JSON
 *   --format md|markdown    print the report as markdown
 * With neither, the plain-text console report is printed, followed by the
 * document utilization summary. `--json` takes precedence over `--format`.
 *
 * Exits the process with status 1 when the audit cannot produce a report.
 */
function main() {
    const args = process.argv.slice(2);
    const formatIndex = args.indexOf("--format");
    const formatArg = formatIndex !== -1 ? args[formatIndex + 1] : undefined;
    const jsonOutput = args.includes("--json");
    // Single source of truth for "markdown requested", so the rendering branch
    // and the utilization footer below cannot disagree about the "markdown"
    // spelling.
    const markdownOutput = formatArg === "md" || formatArg === "markdown";
    const report = runCoverageAudit(ROOT);
    if (!report) {
        console.error("❌ Coverage audit failed. Ensure config/features.yaml exists and is valid.");
        process.exit(1);
    }
    if (jsonOutput) {
        console.log(JSON.stringify(report, null, 2));
    }
    else if (markdownOutput) {
        console.log(formatCoverageMarkdown(report));
    }
    else {
        console.log(formatCoverageConsole(report));
    }
    // The utilization stats are plain text, so they accompany only the console
    // report and never the JSON or markdown output.
    if (!jsonOutput && !markdownOutput) {
        const docStats = countReferencedDocs(ROOT);
        console.log("DOCUMENT UTILIZATION:");
        console.log(` ${docStats.total} unique document slugs referenced across evaluation tasks`);
        console.log("");
    }
}
|
|
205
|
-
// Run the CLI only when the script path Node was started with ends in this
// module's file name (source or compiled form).
if (["coverage-audit.ts", "coverage-audit.js"].some((suffix) => process.argv[1]?.endsWith(suffix))) {
    main();
}
|
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* debug-eval.ts
|
|
3
|
-
*
|
|
4
|
-
* Thin wrapper around `promptfoo eval` that reads DEBUG_EVAL_* environment
|
|
5
|
-
* variables and forwards them as native promptfoo filter flags.
|
|
6
|
-
*
|
|
7
|
-
* Environment variables:
|
|
8
|
-
* DEBUG_EVAL=1 — required to enable debug mode
|
|
9
|
-
* DEBUG_EVAL_PATTERN=<re> — --filter-pattern (regex on test description)
|
|
10
|
-
* DEBUG_EVAL_N=<number> — --filter-first-n (default: 2 when no other filters set)
|
|
11
|
-
* DEBUG_EVAL_SAMPLE=<number> — --filter-sample (random N tests)
|
|
12
|
-
*
|
|
13
|
-
* Usage:
|
|
14
|
-
* tsx src/scripts/debug-eval.ts --config promptfooconfig.yaml
|
|
15
|
-
* tsx src/scripts/debug-eval.ts --config promptfooconfig.agentic.yaml --no-cache
|
|
16
|
-
*
|
|
17
|
-
* All extra argv are forwarded to promptfoo eval unchanged.
|
|
18
|
-
*/
|
|
19
|
-
export {};
|