@sanity/ailf 2.0.2 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
- package/dist/_vendor/ailf-core/examples/index.js +66 -1
- package/dist/agent-harness/assertions-runtime.d.ts +49 -0
- package/dist/agent-harness/assertions-runtime.js +138 -0
- package/dist/agent-harness/provider.d.ts +58 -0
- package/dist/agent-harness/provider.js +104 -0
- package/dist/cli.js +0 -0
- package/dist/commands/init.js +3 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
- package/dist/orchestration/steps/generate-configs-step.js +35 -2
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +39 -25
- package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +28 -85
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
- package/package.json +25 -24
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
- package/dist/adapters/task-sources/yaml-task-source.js +0 -139
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
- package/dist/commands/update-quality-scores.d.ts +0 -5
- package/dist/commands/update-quality-scores.js +0 -20
- package/dist/lib/agent-behavior-report.d.ts +0 -8
- package/dist/lib/agent-behavior-report.js +0 -185
- package/dist/lib/baseline.d.ts +0 -19
- package/dist/lib/baseline.js +0 -153
- package/dist/lib/calculate-scores.d.ts +0 -23
- package/dist/lib/calculate-scores.js +0 -42
- package/dist/lib/compare.d.ts +0 -18
- package/dist/lib/compare.js +0 -170
- package/dist/lib/coverage-audit.d.ts +0 -4
- package/dist/lib/coverage-audit.js +0 -42
- package/dist/lib/discovery-report.d.ts +0 -13
- package/dist/lib/discovery-report.js +0 -57
- package/dist/lib/fetch-docs.d.ts +0 -30
- package/dist/lib/fetch-docs.js +0 -171
- package/dist/lib/generate-configs.d.ts +0 -25
- package/dist/lib/generate-configs.js +0 -42
- package/dist/lib/grader-api.d.ts +0 -21
- package/dist/lib/grader-api.js +0 -34
- package/dist/lib/grader-compare.d.ts +0 -19
- package/dist/lib/grader-compare.js +0 -91
- package/dist/lib/grader-consistency.d.ts +0 -27
- package/dist/lib/grader-consistency.js +0 -79
- package/dist/lib/grader-sensitivity.d.ts +0 -19
- package/dist/lib/grader-sensitivity.js +0 -75
- package/dist/lib/grader-validate.d.ts +0 -19
- package/dist/lib/grader-validate.js +0 -78
- package/dist/lib/measure-retrieval.d.ts +0 -14
- package/dist/lib/measure-retrieval.js +0 -71
- package/dist/lib/pr-comment.d.ts +0 -16
- package/dist/lib/pr-comment.js +0 -28
- package/dist/lib/readiness-report.d.ts +0 -13
- package/dist/lib/readiness-report.js +0 -108
- package/dist/lib/webhook-server.d.ts +0 -11
- package/dist/lib/webhook-server.js +0 -24
- package/dist/lib/weekly-digest.d.ts +0 -24
- package/dist/lib/weekly-digest.js +0 -148
- package/dist/orchestration/env-bridge.d.ts +0 -21
- package/dist/orchestration/env-bridge.js +0 -66
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
- package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
- package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
- package/dist/pipeline/compiler/task-bridge.js +0 -92
- package/dist/pipeline/expand-tasks.d.ts +0 -232
- package/dist/pipeline/expand-tasks.js +0 -467
- package/dist/pipeline/generate-configs.d.ts +0 -92
- package/dist/pipeline/generate-configs.js +0 -445
- package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/calculate-scores-step.js +0 -89
- package/dist/pipeline/steps/compare-step.d.ts +0 -18
- package/dist/pipeline/steps/compare-step.js +0 -90
- package/dist/pipeline/steps/eval-step.d.ts +0 -53
- package/dist/pipeline/steps/eval-step.js +0 -347
- package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
- package/dist/pipeline/steps/fetch-docs-step.js +0 -84
- package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
- package/dist/pipeline/steps/generate-configs-step.js +0 -98
- package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
- package/dist/pipeline/steps/grader-consistency-step.js +0 -74
- package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
- package/dist/pipeline/steps/publish-report-step.js +0 -243
- package/dist/pipeline/steps/report-step.d.ts +0 -13
- package/dist/pipeline/steps/report-step.js +0 -56
- package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/update-scores-step.js +0 -42
- package/dist/scripts/agent-behavior-report.d.ts +0 -19
- package/dist/scripts/agent-behavior-report.js +0 -315
- package/dist/scripts/baseline.d.ts +0 -43
- package/dist/scripts/baseline.js +0 -267
- package/dist/scripts/calculate-scores.d.ts +0 -166
- package/dist/scripts/calculate-scores.js +0 -1296
- package/dist/scripts/compare.d.ts +0 -22
- package/dist/scripts/compare.js +0 -334
- package/dist/scripts/coverage-audit.d.ts +0 -44
- package/dist/scripts/coverage-audit.js +0 -209
- package/dist/scripts/debug-eval.d.ts +0 -19
- package/dist/scripts/debug-eval.js +0 -73
- package/dist/scripts/discovery-report.d.ts +0 -58
- package/dist/scripts/discovery-report.js +0 -250
- package/dist/scripts/fetch-docs.d.ts +0 -35
- package/dist/scripts/fetch-docs.js +0 -472
- package/dist/scripts/generate-configs.d.ts +0 -66
- package/dist/scripts/generate-configs.js +0 -459
- package/dist/scripts/grader-api.d.ts +0 -27
- package/dist/scripts/grader-api.js +0 -206
- package/dist/scripts/grader-compare.d.ts +0 -22
- package/dist/scripts/grader-compare.js +0 -368
- package/dist/scripts/grader-consistency.d.ts +0 -20
- package/dist/scripts/grader-consistency.js +0 -313
- package/dist/scripts/grader-sensitivity.d.ts +0 -22
- package/dist/scripts/grader-sensitivity.js +0 -354
- package/dist/scripts/grader-validate.d.ts +0 -19
- package/dist/scripts/grader-validate.js +0 -267
- package/dist/scripts/measure-retrieval.d.ts +0 -10
- package/dist/scripts/measure-retrieval.js +0 -145
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
- package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
- package/dist/scripts/pipeline.d.ts +0 -76
- package/dist/scripts/pipeline.js +0 -1031
- package/dist/scripts/pr-comment.d.ts +0 -10
- package/dist/scripts/pr-comment.js +0 -510
- package/dist/scripts/readiness-report.d.ts +0 -88
- package/dist/scripts/readiness-report.js +0 -342
- package/dist/scripts/update-quality-scores.d.ts +0 -15
- package/dist/scripts/update-quality-scores.js +0 -184
- package/dist/scripts/validate-task-sources.d.ts +0 -21
- package/dist/scripts/validate-task-sources.js +0 -210
- package/dist/scripts/validate.d.ts +0 -13
- package/dist/scripts/validate.js +0 -79
- package/dist/scripts/webhook-server.d.ts +0 -26
- package/dist/scripts/webhook-server.js +0 -147
- package/dist/scripts/weekly-digest.d.ts +0 -24
- package/dist/scripts/weekly-digest.js +0 -144
- package/dist/sinks/format-slack.d.ts +0 -64
- package/dist/sinks/format-slack.js +0 -306
- package/dist/sinks/slack-sink.d.ts +0 -27
- package/dist/sinks/slack-sink.js +0 -78
- package/dist/sinks/webhook-sink.d.ts +0 -19
- package/dist/sinks/webhook-sink.js +0 -50
- package/tasks/.expanded.agentic.yaml +0 -280
- package/tasks/.expanded.yaml +0 -565
|
@@ -1,315 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* agent-behavior-report.ts
|
|
3
|
-
*
|
|
4
|
-
* Standalone script that reads Promptfoo evaluation results containing
|
|
5
|
-
* agent behavior observation data and generates a detailed report.
|
|
6
|
-
*
|
|
7
|
-
* This provides deeper analysis than the summary included in the main
|
|
8
|
-
* calculate-scores report, including:
|
|
9
|
-
*
|
|
10
|
-
* - Per-task behavior breakdown (which specific pages each task visited)
|
|
11
|
-
* - Canonical doc coverage (did the agent find the "right" docs?)
|
|
12
|
-
* - Request timeline and latency analysis
|
|
13
|
-
* - Search strategy analysis
|
|
14
|
-
* - Cross-task navigation pattern detection
|
|
15
|
-
*
|
|
16
|
-
* Usage:
|
|
17
|
-
* tsx src/scripts/agent-behavior-report.ts [results-path]
|
|
18
|
-
*/
|
|
19
|
-
// oxlint-disable-next-line import/no-unassigned-import -- side-effect: loads .env into process.env
|
|
20
|
-
import "dotenv/config";
import { readFileSync, writeFileSync, mkdirSync, existsSync } from "fs";
import { join, dirname } from "path";
import { fileURLToPath } from "url";
|
|
23
|
-
// Canonical documentation lookup, keyed by feature-area id.
// Each value lists the doc slugs a well-informed agent is expected to visit
// when working on a task that belongs to that feature area.
const CANONICAL_DOC_MAP = {
    frameworks: ["remix", "nuxt", "svelte", "astro", "gatsby", "client-libraries"],
    functions: ["functions", "webhooks", "groq-powered-webhooks", "event-driven", "automations"],
    "nextjs-live": ["next-js", "live-content-api", "content-source-maps", "app-router", "groq", "client-libraries"],
    "studio-setup": ["studio", "schema-types", "structure-builder", "configuration", "plugins"],
    "visual-editing": ["visual-editing", "presentation", "preview", "overlays", "loaders"],
};
|
|
64
|
-
/**
 * Reads a Promptfoo results file and aggregates the agent-behavior
 * observations it contains, per task and per feature area.
 *
 * Results without `metadata.agentBehaviorSummary` are ignored.
 *
 * @param {string} resultsPath - Path to the evaluation results JSON file.
 * @returns {{features: object[], hasData: boolean, tasks: object[]}}
 *   `hasData` is false (with empty lists) when no result carried a behavior
 *   summary; otherwise `features` is sorted by feature name.
 * @throws {TypeError} When the file holds no results array in either
 *   supported shape.
 * @throws {SyntaxError} When the file is not valid JSON.
 */
function analyzeResults(resultsPath) {
    const json = JSON.parse(readFileSync(resultsPath, "utf-8"));
    // Support both the flat shape ({ results: TestResult[] }) and the full
    // Promptfoo envelope shape ({ results: { results: TestResult[] } }).
    const results = Array.isArray(json.results)
        ? json.results
        : json.results?.results;
    if (!Array.isArray(results)) {
        throw new TypeError(`No results array found in ${resultsPath}`);
    }
    const tasks = [];
    for (const result of results) {
        const behavior = result.metadata?.agentBehaviorSummary;
        if (!behavior) {
            continue;
        }
        // `vars` (or `vars.docs`) may be absent or non-string; that simply
        // means the task ran without injected docs.
        const docs = result.vars?.docs;
        tasks.push({
            behavior,
            description: result.description,
            feature: detectFeatureArea(result.description),
            hasDocs: typeof docs === "string" && docs.trim().length > 0,
        });
    }
    if (tasks.length === 0) {
        return { features: [], hasData: false, tasks: [] };
    }
    // Group by feature
    const byFeature = new Map();
    for (const task of tasks) {
        const bucket = byFeature.get(task.feature);
        if (bucket) {
            bucket.push(task);
        }
        else {
            byFeature.set(task.feature, [task]);
        }
    }
    const features = [...byFeature]
        .map(([feature, featureTasks]) => {
        // A summary that lacks a list field contributes nothing instead of
        // leaking `undefined` into the de-duplicated list.
        const collectUnique = (key) => [
            ...new Set(featureTasks.flatMap((t) => t.behavior[key] ?? [])),
        ];
        const count = featureTasks.length || 1;
        const average = (key) => featureTasks.reduce((sum, t) => sum + t.behavior[key], 0) / count;
        const allDocSlugs = collectUnique("docSlugsVisited");
        const canonicalSlugs = CANONICAL_DOC_MAP[feature] || [];
        // A canonical slug counts as covered when any visited slug contains it.
        const matchedCanonical = canonicalSlugs.filter((slug) => allDocSlugs.some((visited) => visited.includes(slug)));
        const canonicalCoverage = canonicalSlugs.length > 0
            ? matchedCanonical.length / canonicalSlugs.length
            : 0;
        return {
            allDocSlugs,
            allExternalDomains: collectUnique("externalDomains"),
            allSearchQueries: collectUnique("uniqueSearchQueries"),
            avgDocPages: average("docPagesVisited"),
            avgNetworkMs: average("totalNetworkMs"),
            avgSearches: average("searchesPerformed"),
            canonicalCoverage,
            canonicalSlugs,
            feature,
            tasks: featureTasks,
        };
    })
        .sort((a, b) => a.feature.localeCompare(b.feature));
    return { features, hasData: true, tasks };
}
|
|
130
|
-
/**
 * Classifies a task into a feature area by case-insensitive keyword matching
 * on its description. Rules are evaluated in order and the first match wins.
 *
 * @param {string} [description] - Task description from the eval result.
 * @returns {string} One of "studio-setup", "visual-editing", "functions",
 *   "nextjs-live", "frameworks", or "other" when nothing matches or the
 *   description is missing.
 */
function detectFeatureArea(description) {
    // A result without a usable description cannot be classified; bucket it
    // under "other" rather than throwing.
    if (typeof description !== "string") {
        return "other";
    }
    const desc = description.toLowerCase();
    // Order matters: earlier rules take precedence over later ones.
    const rules = [
        ["studio-setup", ["studio"]],
        ["visual-editing", ["visual", "presentation", "live preview"]],
        ["functions", ["function", "webhook"]],
        ["nextjs-live", ["next", "app router"]],
        ["frameworks", ["remix", "nuxt", "svelte"]],
    ];
    for (const [feature, keywords] of rules) {
        if (keywords.some((keyword) => desc.includes(keyword))) {
            return feature;
        }
    }
    return "other";
}
|
|
148
|
-
// ---------------------------------------------------------------------------
|
|
149
|
-
// Entry point: analyze results, print the report, persist it as JSON
|
|
150
|
-
// ---------------------------------------------------------------------------
|
|
151
|
-
/**
 * CLI entry point.
 *
 * Resolves the results file (first CLI argument, or
 * `results/latest/eval-results.json` under the package root), prints the
 * behavior report to stdout and writes a JSON copy to
 * `results/latest/agent-behavior-report.json`.
 *
 * Exits with code 1 when the results file does not exist and with code 0
 * when it contains no agent behavior data.
 */
function main() {
    // fileURLToPath (unlike URL#pathname) decodes percent-escapes such as
    // "%20" and yields a valid native path on Windows.
    const ROOT = join(dirname(fileURLToPath(import.meta.url)), "..", "..");
    const resultsPath = process.argv[2] || join(ROOT, "results", "latest", "eval-results.json");
    if (!existsSync(resultsPath)) {
        console.error(`Results file not found: ${resultsPath}`);
        console.error("Run an evaluation first: pnpm eval:observed");
        process.exit(1);
    }
    console.log(`Reading results from: ${resultsPath}`);
    console.log();
    const analysis = analyzeResults(resultsPath);
    if (!analysis.hasData) {
        console.log("No agent behavior data found in the results.");
        console.log("Make sure you ran the evaluation with the observed config:");
        console.log(" pnpm eval:observed");
        process.exit(0);
    }
    printReport(analysis);
    // Persist detailed report as JSON
    const outDir = join(ROOT, "results", "latest");
    mkdirSync(outDir, { recursive: true });
    const reportData = {
        // Per-feature aggregates; the task list is reduced to a count here
        // because the full tasks are written separately below.
        features: analysis.features.map((f) => ({
            avgDocPages: f.avgDocPages,
            avgNetworkMs: f.avgNetworkMs,
            avgSearches: f.avgSearches,
            canonicalCoverage: f.canonicalCoverage,
            canonicalSlugs: f.canonicalSlugs,
            docSlugsVisited: f.allDocSlugs,
            externalDomains: f.allExternalDomains,
            feature: f.feature,
            searchQueries: f.allSearchQueries,
            taskCount: f.tasks.length,
        })),
        tasks: analysis.tasks.map((t) => ({
            behavior: t.behavior,
            description: t.description,
            feature: t.feature,
            hasDocs: t.hasDocs,
        })),
        timestamp: new Date().toISOString(),
        totalTasks: analysis.tasks.length,
    };
    writeFileSync(join(outDir, "agent-behavior-report.json"), JSON.stringify(reportData, null, 2));
    console.log("Agent behavior report written to results/latest/agent-behavior-report.json");
}
|
|
197
|
-
// ---------------------------------------------------------------------------
|
|
198
|
-
// Console report rendering
|
|
199
|
-
// ---------------------------------------------------------------------------
|
|
200
|
-
/**
 * Prints the human-readable agent behavior report to stdout.
 *
 * Sections, in order: overview table, canonical documentation coverage,
 * search strategy (only when any query was recorded), per-task detail,
 * external domains (only when any were contacted) and overall statistics.
 *
 * @param {{features: object[], tasks: object[]}} analysis - Aggregated
 *   analysis in the shape produced by `analyzeResults`.
 * @returns {void}
 */
function printReport(analysis) {
    const banner = "=".repeat(80);
    const rule = "-".repeat(80);
    console.log(banner);
    console.log(" AGENT BEHAVIOR OBSERVATION REPORT");
    console.log(banner);
    console.log();
    // ---- Overview table ----
    console.log("OVERVIEW BY FEATURE AREA");
    console.log(rule);
    // The first header cell is padded to the same width (19) as the feature
    // column in the data rows so header, separator and rows line up.
    const h = `| ${"Feature Area".padEnd(19)} | Tasks | Avg Docs | Avg Search | Avg Net(ms) | Canon% |`;
    const sep = "|---------------------|-------|----------|------------|-------------|--------|";
    console.log(h);
    console.log(sep);
    for (const f of analysis.features) {
        const cells = [
            f.feature.padEnd(19),
            f.tasks.length.toString().padStart(5),
            f.avgDocPages.toFixed(1).padStart(8),
            f.avgSearches.toFixed(1).padStart(10),
            Math.round(f.avgNetworkMs).toString().padStart(11),
            `${(f.canonicalCoverage * 100).toFixed(0).padStart(5)}%`,
        ];
        console.log(`| ${cells.join(" | ")} |`);
    }
    console.log();
    // ---- Canonical coverage breakdown ----
    console.log("CANONICAL DOCUMENTATION COVERAGE");
    console.log(rule);
    console.log();
    for (const f of analysis.features) {
        console.log(` ${f.feature} (${(f.canonicalCoverage * 100).toFixed(0)}% canonical coverage):`);
        if (f.canonicalSlugs.length === 0) {
            console.log(" (no canonical docs defined)");
        }
        else {
            for (const slug of f.canonicalSlugs) {
                // Covered when any visited slug contains the canonical slug.
                const found = f.allDocSlugs.some((visited) => visited.includes(slug));
                const marker = found ? "[x]" : "[ ]";
                console.log(` ${marker} ${slug}`);
            }
        }
        // Visited docs that match none of the canonical slugs.
        const nonCanonical = f.allDocSlugs.filter((slug) => !f.canonicalSlugs.some((c) => slug.includes(c)));
        if (nonCanonical.length > 0) {
            console.log(" Additional docs visited:");
            for (const slug of nonCanonical) {
                console.log(` + ${slug}`);
            }
        }
        console.log();
    }
    // ---- Search strategy ----
    const anySearches = analysis.features.some((f) => f.allSearchQueries.length > 0);
    if (anySearches) {
        console.log("SEARCH STRATEGY");
        console.log(rule);
        console.log();
        for (const f of analysis.features) {
            if (f.allSearchQueries.length === 0) {
                continue;
            }
            console.log(` ${f.feature}:`);
            for (const q of f.allSearchQueries) {
                console.log(` -> "${q}"`);
            }
        }
        console.log();
    }
    // ---- Per-task detail ----
    console.log("PER-TASK DETAIL");
    console.log(rule);
    console.log();
    for (const f of analysis.features) {
        console.log(` ## ${f.feature}`);
        console.log();
        for (const t of f.tasks) {
            const { behavior } = t;
            const variant = t.hasDocs ? "[gold]" : "[baseline]";
            console.log(` ${variant} ${t.description}`);
            console.log(` Requests: ${behavior.totalRequests} | ` +
                `Doc pages: ${behavior.docPagesVisited} | ` +
                `Searches: ${behavior.searchesPerformed} | ` +
                `External: ${behavior.externalRequestCount}`);
            if (behavior.docSlugsVisited.length > 0) {
                console.log(` Docs: ${behavior.docSlugsVisited.join(", ")}`);
            }
            if (behavior.uniqueSearchQueries.length > 0) {
                console.log(` Queries: ${behavior.uniqueSearchQueries.map((q) => `"${q}"`).join(", ")}`);
            }
            console.log();
        }
    }
    // ---- External domains ----
    const allDomains = [
        ...new Set(analysis.features.flatMap((f) => f.allExternalDomains)),
    ];
    if (allDomains.length > 0) {
        console.log("EXTERNAL DOMAINS");
        console.log(rule);
        console.log();
        for (const d of allDomains) {
            console.log(` - ${d}`);
        }
        console.log();
    }
    // ---- Summary stats ----
    console.log("OVERALL STATISTICS");
    console.log(rule);
    console.log();
    const totalTasks = analysis.tasks.length;
    const tasksUsingDocs = analysis.tasks.filter((t) => t.behavior.usedDocs).length;
    const tasksUsingSearch = analysis.tasks.filter((t) => t.behavior.usedSearch).length;
    const avgCanonical = analysis.features.reduce((s, f) => s + f.canonicalCoverage, 0) /
        (analysis.features.length || 1);
    // Guard the share computation so an empty analysis prints 0% rather than NaN%.
    const shareOfTasks = (count) => totalTasks > 0 ? ((count / totalTasks) * 100).toFixed(0) : "0";
    console.log(` Total tasks observed: ${totalTasks}`);
    console.log(` Tasks that used docs: ${tasksUsingDocs}/${totalTasks} (${shareOfTasks(tasksUsingDocs)}%)`);
    console.log(` Tasks that used search: ${tasksUsingSearch}/${totalTasks} (${shareOfTasks(tasksUsingSearch)}%)`);
    console.log(` Avg canonical coverage: ${(avgCanonical * 100).toFixed(1)}%`);
    console.log();
}
|
|
315
|
-
main();
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* baseline.ts
|
|
3
|
-
*
|
|
4
|
-
* Manages historical baseline snapshots of evaluation scores.
|
|
5
|
-
* Allows saving, comparing, and listing score baselines over time.
|
|
6
|
-
*
|
|
7
|
-
* Usage:
|
|
8
|
-
* pnpm baseline:save # save current scores as baseline
|
|
9
|
-
* pnpm baseline:save --tag "pre-groq" # save with a descriptive tag
|
|
10
|
-
* pnpm baseline:compare # compare current vs latest baseline
|
|
11
|
-
* pnpm baseline:history # list all saved baselines
|
|
12
|
-
*/
|
|
13
|
-
/** Summary of one saved baseline file, as returned by `listBaselines`. */
interface BaselineMetadata {
    /** File name of the baseline inside the baselines directory. */
    filename: string;
    /** The `timestamp` field stored in the baseline file. */
    timestamp: string;
    /** Tag stored in the baseline's `baselineMeta`, if any. */
    tag?: string;
    /** Number of entries in the baseline's `scores` list. */
    areaCount: number;
    /** Overall average score, rounded to the nearest integer. */
    avgScore: number;
    /** `overall.cost.graderTotal` from the baseline, when present. */
    graderCost?: number;
    /** `overall.cost.total` from the baseline, when present. */
    totalCost?: number;
}
/** Score (and optional cost) difference for a single feature area. */
interface ScoreComparison {
    feature: string;
    /** Score in the current summary (0 when the feature is only in the baseline). */
    current: number;
    /** Score in the baseline (0 when the feature is only in the current summary). */
    baseline: number;
    /** `current - baseline`. */
    delta: number;
    /** Cost fields are omitted when no positive cost was recorded. */
    costCurrent?: number;
    costBaseline?: number;
    costDelta?: number;
}
/** Outcome of `compareBaseline`. */
interface CompareResult {
    /** False when a required file is missing; `message` then explains why. */
    success: boolean;
    message: string;
    /** Per-feature comparisons; present only on success. */
    comparisons?: ScoreComparison[];
    /** Difference of the rounded overall averages; present only on success. */
    overallDelta?: number;
}
/** Compares the current score summary with `baselineFile`, or with the newest baseline when omitted. */
export declare function compareBaseline(baselineFile?: string): CompareResult;
/** Lists saved baselines, newest first. */
export declare function listBaselines(): BaselineMetadata[];
/** Saves the current score summary as a new baseline, optionally tagged. */
export declare function saveBaseline(tag?: string): {
    success: boolean;
    message: string;
};
export {};
|
package/dist/scripts/baseline.js
DELETED
|
@@ -1,267 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* baseline.ts
|
|
3
|
-
*
|
|
4
|
-
* Manages historical baseline snapshots of evaluation scores.
|
|
5
|
-
* Allows saving, comparing, and listing score baselines over time.
|
|
6
|
-
*
|
|
7
|
-
* Usage:
|
|
8
|
-
* pnpm baseline:save # save current scores as baseline
|
|
9
|
-
* pnpm baseline:save --tag "pre-groq" # save with a descriptive tag
|
|
10
|
-
* pnpm baseline:compare # compare current vs latest baseline
|
|
11
|
-
* pnpm baseline:history # list all saved baselines
|
|
12
|
-
*/
|
|
13
|
-
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
|
|
14
|
-
import { dirname, join, resolve } from "path";
|
|
15
|
-
import { fileURLToPath } from "url";
|
|
16
|
-
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
17
|
-
const ROOT = resolve(__dirname, "..", "..");
|
|
18
|
-
const BASELINES_DIR = join(ROOT, "results", "baselines");
|
|
19
|
-
const SCORE_SUMMARY_PATH = join(ROOT, "results", "latest", "score-summary.json");
|
|
20
|
-
// ---------------------------------------------------------------------------
|
|
21
|
-
// Compare
|
|
22
|
-
// ---------------------------------------------------------------------------
|
|
23
|
-
/**
 * Compares the current score summary against a saved baseline.
 *
 * @param {string} [baselineFile] - File name inside the baselines directory;
 *   defaults to the newest saved baseline.
 * @returns {{success: boolean, message: string, comparisons?: object[], overallDelta?: number}}
 *   On failure (`success: false`) only `message` is set. On success,
 *   `comparisons` holds one entry per feature found in either file, sorted
 *   by score delta (largest improvement first), and `overallDelta` is the
 *   difference of the rounded overall averages.
 */
export function compareBaseline(baselineFile) {
    if (!existsSync(SCORE_SUMMARY_PATH)) {
        return {
            message: "No current score-summary.json found.",
            success: false,
        };
    }
    // Find baseline to compare against
    const baselines = listBaselines();
    if (baselines.length === 0) {
        return {
            message: "No baselines saved yet. Run 'pnpm baseline:save' first.",
            success: false,
        };
    }
    const targetFile = baselineFile ?? baselines[0].filename;
    const baselinePath = join(BASELINES_DIR, targetFile);
    if (!existsSync(baselinePath)) {
        return {
            message: `Baseline file not found: ${targetFile}`,
            success: false,
        };
    }
    const current = JSON.parse(readFileSync(SCORE_SUMMARY_PATH, "utf-8"));
    const baseline = JSON.parse(readFileSync(baselinePath, "utf-8"));
    // One lookup per feature gives access to both score and cost.
    const baselineByFeature = new Map(baseline.scores.map((s) => [s.feature, s]));
    const comparisons = current.scores.map((s) => {
        const base = baselineByFeature.get(s.feature);
        // A feature that is new since the baseline compares against 0.
        const baseScore = base?.totalScore ?? 0;
        const baseCost = base?.totalCost ?? 0;
        const currentCost = s.totalCost ?? 0;
        return {
            baseline: baseScore,
            costBaseline: baseCost > 0 ? baseCost : undefined,
            costCurrent: currentCost > 0 ? currentCost : undefined,
            costDelta: currentCost > 0 || baseCost > 0 ? currentCost - baseCost : undefined,
            current: s.totalScore,
            delta: s.totalScore - baseScore,
            feature: s.feature,
        };
    });
    // Check for areas in baseline but not in current
    const currentFeatures = new Set(current.scores.map((s) => s.feature));
    for (const [feature, base] of baselineByFeature) {
        if (currentFeatures.has(feature)) {
            continue;
        }
        const baseCost = base.totalCost ?? 0;
        comparisons.push({
            baseline: base.totalScore,
            // Keep the baseline cost so a removed area still shows up in cost
            // comparisons; the keys are omitted when no cost was recorded.
            ...(baseCost > 0 ? { costBaseline: baseCost, costDelta: -baseCost } : {}),
            current: 0,
            delta: -base.totalScore,
            feature,
        });
    }
    comparisons.sort((a, b) => b.delta - a.delta);
    // Computed from rounded averages (the same rounding listBaselines uses).
    const overallDelta = Math.round(current.overall.avgScore) - Math.round(baseline.overall.avgScore);
    return {
        comparisons,
        message: `Compared against ${targetFile}`,
        overallDelta,
        success: true,
    };
}
|
|
84
|
-
/**
 * Lists the baselines saved in the baselines directory.
 *
 * @returns {object[]} One metadata entry per `.json` file, ordered by file
 *   name descending; an empty list when the directory does not exist.
 */
export function listBaselines() {
    if (!existsSync(BASELINES_DIR)) {
        return [];
    }
    const jsonFiles = readdirSync(BASELINES_DIR).filter((name) => name.endsWith(".json"));
    // File names start with a timestamp, so descending order is newest first.
    jsonFiles.sort();
    jsonFiles.reverse();
    const entries = [];
    for (const filename of jsonFiles) {
        const data = JSON.parse(readFileSync(join(BASELINES_DIR, filename), "utf-8"));
        entries.push({
            areaCount: data.scores.length,
            avgScore: Math.round(data.overall.avgScore),
            filename,
            graderCost: data.overall.cost?.graderTotal,
            tag: data.baselineMeta?.tag,
            timestamp: data.timestamp,
            totalCost: data.overall.cost?.total,
        });
    }
    return entries;
}
|
|
106
|
-
/**
 * Saves the current score summary as a new baseline file.
 *
 * @param {string} [tag] - Optional descriptive tag; it is lower-cased and
 *   every character other than letters, digits and "-" becomes "-" before
 *   it is appended to the file name.
 * @returns {{success: boolean, message: string}} `success` is false when no
 *   score summary exists yet.
 */
export function saveBaseline(tag) {
    if (!existsSync(SCORE_SUMMARY_PATH)) {
        return {
            message: "No score-summary.json found. Run 'pnpm calculate-scores' first.",
            success: false,
        };
    }
    const summary = JSON.parse(readFileSync(SCORE_SUMMARY_PATH, "utf-8"));
    mkdirSync(BASELINES_DIR, { recursive: true });
    const now = new Date();
    // File name: YYYYMMDD_HH_mm_ss[_tag].json (UTC),
    // e.g. "2024-01-02T03:04:05" becomes "20240102_03_04_05".
    const isoSeconds = now.toISOString().slice(0, 19);
    const stamp = isoSeconds.replace(/[T:]/g, "_").replace(/-/g, "");
    let suffix = "";
    if (tag) {
        suffix = `_${tag.replace(/[^a-z0-9-]/gi, "-").toLowerCase()}`;
    }
    const filename = `${stamp}${suffix}.json`;
    const baselineMeta = {
        savedAt: now.toISOString(),
        // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string tag should be treated as no tag
        tag: tag || undefined,
    };
    const baseline = { ...summary, baselineMeta };
    writeFileSync(join(BASELINES_DIR, filename), JSON.stringify(baseline, null, 2));
    return {
        message: `Saved baseline to results/baselines/${filename} (avg: ${Math.round(summary.overall.avgScore)}, ${summary.scores.length} areas)`,
        success: true,
    };
}
|
|
141
|
-
// ---------------------------------------------------------------------------
|
|
142
|
-
// CLI
|
|
143
|
-
// ---------------------------------------------------------------------------
|
|
144
|
-
/**
 * Parse the arguments that follow the script path on the command line.
 *
 * The first argument is the command. When it is missing, empty, or is itself
 * a flag (e.g. `--tag v1`), the command was omitted and the default "save"
 * is used, so `baseline.js --tag v1` behaves like `baseline.js save --tag v1`.
 *
 * @param {string[]} argv - Arguments after the script path (`process.argv.slice(2)`).
 * @returns {{ command: string, getArg: (name: string) => (string | undefined) }}
 *   The selected command and a lookup returning the value that follows `--<name>`,
 *   or `undefined` when the flag is absent or is the last argument.
 */
function parseBaselineCliArgs(argv) {
    const first = argv[0];
    const command = !first || first.startsWith("--") ? "save" : first;
    const getArg = (name) => {
        const idx = argv.indexOf(`--${name}`);
        return idx !== -1 && idx + 1 < argv.length ? argv[idx + 1] : undefined;
    };
    return { command, getArg };
}

/**
 * Pick the trend icon for a delta: up for positive, down for negative,
 * flat for anything else (zero or NaN).
 *
 * @param {number} delta
 * @returns {string}
 */
function baselineTrendIcon(delta) {
    if (delta > 0) {
        return "📈";
    }
    return delta < 0 ? "📉" : "➡️";
}

/**
 * Render a score delta: "+N" for gains, "=" for no change, and the plain
 * number (which already carries its minus sign) for losses.
 *
 * @param {number} delta
 * @returns {string}
 */
function formatBaselineScoreDelta(delta) {
    if (delta > 0) {
        return `+${delta}`;
    }
    return delta === 0 ? "=" : String(delta);
}

/**
 * Render a cost delta as a signed dollar amount with four decimals, or "="
 * when there is no change.
 *
 * @param {number} delta
 * @returns {string}
 */
function formatBaselineCostDelta(delta) {
    if (delta > 0) {
        return `+$${delta.toFixed(4)}`;
    }
    return delta < 0 ? `-$${Math.abs(delta).toFixed(4)}` : "=";
}

/**
 * Build one row of a comparison table (feature / current / baseline / delta).
 *
 * @param {string} feature - Feature-area label (padded to 18 columns).
 * @param {string} current - Already-formatted current value (padded to 10).
 * @param {string} baseline - Already-formatted baseline value (padded to 10).
 * @param {string} trend - Final cell, e.g. an icon followed by the delta text.
 * @returns {string}
 */
function formatBaselineComparisonRow(feature, current, baseline, trend) {
    return ` ${feature.padEnd(18)}${current.padEnd(10)}${baseline.padEnd(10)}${trend}`;
}

// Only run the CLI when this module is the script being executed,
// not when it is imported by other code.
if (process.argv[1]?.endsWith("baseline.ts") ||
    process.argv[1]?.endsWith("baseline.js")) {
    const { command, getArg } = parseBaselineCliArgs(process.argv.slice(2));
    // Shared column header + rule for both the score and the cost table.
    const printComparisonHeader = () => {
        console.log(formatBaselineComparisonRow("Feature Area", "Current", "Baseline", "Delta"));
        console.log(` ${"-".repeat(50)}`);
    };
    switch (command) {
        case "compare": {
            const file = getArg("file");
            console.log("=== Baseline Comparison ===\n");
            const result = compareBaseline(file);
            if (!result.success) {
                console.error(` ❌ ${result.message}`);
                process.exit(1);
            }
            console.log(` ${result.message}\n`);
            printComparisonHeader();
            for (const c of result.comparisons) {
                console.log(formatBaselineComparisonRow(c.feature, String(c.current), String(c.baseline), `${baselineTrendIcon(c.delta)} ${formatBaselineScoreDelta(c.delta)}`));
            }
            // Cost comparison (only if cost data exists)
            const hasCostData = result.comparisons.some((c) => c.costCurrent !== undefined || c.costBaseline !== undefined);
            if (hasCostData) {
                console.log();
                console.log(" Cost Comparison:");
                printComparisonHeader();
                for (const c of result.comparisons) {
                    // Skip areas that have no cost figure on either side.
                    if (c.costCurrent === undefined && c.costBaseline === undefined) {
                        continue;
                    }
                    const cur = `$${(c.costCurrent ?? 0).toFixed(4)}`;
                    const base = `$${(c.costBaseline ?? 0).toFixed(4)}`;
                    const delta = c.costDelta ?? 0;
                    console.log(formatBaselineComparisonRow(c.feature, cur, base, `${baselineTrendIcon(delta)} ${formatBaselineCostDelta(delta)}`));
                }
            }
            console.log();
            console.log(` Overall: ${baselineTrendIcon(result.overallDelta)} ${formatBaselineScoreDelta(result.overallDelta)} points`);
            break;
        }
        case "history": {
            console.log("=== Baseline History ===\n");
            const baselines = listBaselines();
            if (baselines.length === 0) {
                console.log(" No baselines saved yet.");
            }
            else {
                // The cost column is shown only when at least one baseline recorded a cost.
                const hasCosts = baselines.some((b) => b.totalCost !== undefined || b.graderCost !== undefined);
                const costHeader = hasCosts ? "Cost".padEnd(10) : "";
                console.log(` ${"Date".padEnd(22)}${"Avg".padEnd(6)}${"Areas".padEnd(7)}${costHeader}Tag`);
                console.log(` ${"-".repeat(hasCosts ? 60 : 50)}`);
                for (const b of baselines) {
                    const date = new Date(b.timestamp).toLocaleString();
                    const combinedCost = (b.totalCost ?? 0) + (b.graderCost ?? 0);
                    const costStr = hasCosts
                        ? (combinedCost > 0 ? `$${combinedCost.toFixed(2)}` : "-").padEnd(10)
                        : "";
                    console.log(` ${date.padEnd(22)}${String(b.avgScore).padEnd(6)}${String(b.areaCount).padEnd(7)}${costStr}${b.tag ?? ""}`);
                }
            }
            break;
        }
        case "save": {
            const tag = getArg("tag");
            console.log("=== Saving baseline snapshot ===\n");
            const result = saveBaseline(tag);
            if (result.success) {
                console.log(` ✅ ${result.message}`);
            }
            else {
                console.error(` ❌ ${result.message}`);
                process.exit(1);
            }
            break;
        }
        default:
            console.error(`Unknown command: "${command}". Use: save, history, compare`);
            process.exit(1);
    }
}
|