@sanity/ailf 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/cli.js +0 -0
- package/package.json +24 -24
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
- package/dist/adapters/task-sources/yaml-task-source.js +0 -139
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
- package/dist/commands/update-quality-scores.d.ts +0 -5
- package/dist/commands/update-quality-scores.js +0 -20
- package/dist/lib/agent-behavior-report.d.ts +0 -8
- package/dist/lib/agent-behavior-report.js +0 -185
- package/dist/lib/baseline.d.ts +0 -19
- package/dist/lib/baseline.js +0 -153
- package/dist/lib/calculate-scores.d.ts +0 -23
- package/dist/lib/calculate-scores.js +0 -42
- package/dist/lib/compare.d.ts +0 -18
- package/dist/lib/compare.js +0 -170
- package/dist/lib/coverage-audit.d.ts +0 -4
- package/dist/lib/coverage-audit.js +0 -42
- package/dist/lib/discovery-report.d.ts +0 -13
- package/dist/lib/discovery-report.js +0 -57
- package/dist/lib/fetch-docs.d.ts +0 -30
- package/dist/lib/fetch-docs.js +0 -171
- package/dist/lib/generate-configs.d.ts +0 -25
- package/dist/lib/generate-configs.js +0 -42
- package/dist/lib/grader-api.d.ts +0 -21
- package/dist/lib/grader-api.js +0 -34
- package/dist/lib/grader-compare.d.ts +0 -19
- package/dist/lib/grader-compare.js +0 -91
- package/dist/lib/grader-consistency.d.ts +0 -27
- package/dist/lib/grader-consistency.js +0 -79
- package/dist/lib/grader-sensitivity.d.ts +0 -19
- package/dist/lib/grader-sensitivity.js +0 -75
- package/dist/lib/grader-validate.d.ts +0 -19
- package/dist/lib/grader-validate.js +0 -78
- package/dist/lib/measure-retrieval.d.ts +0 -14
- package/dist/lib/measure-retrieval.js +0 -71
- package/dist/lib/pr-comment.d.ts +0 -16
- package/dist/lib/pr-comment.js +0 -28
- package/dist/lib/readiness-report.d.ts +0 -13
- package/dist/lib/readiness-report.js +0 -108
- package/dist/lib/webhook-server.d.ts +0 -11
- package/dist/lib/webhook-server.js +0 -24
- package/dist/lib/weekly-digest.d.ts +0 -24
- package/dist/lib/weekly-digest.js +0 -148
- package/dist/orchestration/env-bridge.d.ts +0 -21
- package/dist/orchestration/env-bridge.js +0 -66
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
- package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
- package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
- package/dist/pipeline/compiler/task-bridge.js +0 -92
- package/dist/pipeline/expand-tasks.d.ts +0 -232
- package/dist/pipeline/expand-tasks.js +0 -467
- package/dist/pipeline/generate-configs.d.ts +0 -92
- package/dist/pipeline/generate-configs.js +0 -445
- package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/calculate-scores-step.js +0 -89
- package/dist/pipeline/steps/compare-step.d.ts +0 -18
- package/dist/pipeline/steps/compare-step.js +0 -90
- package/dist/pipeline/steps/eval-step.d.ts +0 -53
- package/dist/pipeline/steps/eval-step.js +0 -347
- package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
- package/dist/pipeline/steps/fetch-docs-step.js +0 -84
- package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
- package/dist/pipeline/steps/generate-configs-step.js +0 -98
- package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
- package/dist/pipeline/steps/grader-consistency-step.js +0 -74
- package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
- package/dist/pipeline/steps/publish-report-step.js +0 -243
- package/dist/pipeline/steps/report-step.d.ts +0 -13
- package/dist/pipeline/steps/report-step.js +0 -56
- package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/update-scores-step.js +0 -42
- package/dist/scripts/agent-behavior-report.d.ts +0 -19
- package/dist/scripts/agent-behavior-report.js +0 -315
- package/dist/scripts/baseline.d.ts +0 -43
- package/dist/scripts/baseline.js +0 -267
- package/dist/scripts/calculate-scores.d.ts +0 -166
- package/dist/scripts/calculate-scores.js +0 -1296
- package/dist/scripts/compare.d.ts +0 -22
- package/dist/scripts/compare.js +0 -334
- package/dist/scripts/coverage-audit.d.ts +0 -44
- package/dist/scripts/coverage-audit.js +0 -209
- package/dist/scripts/debug-eval.d.ts +0 -19
- package/dist/scripts/debug-eval.js +0 -73
- package/dist/scripts/discovery-report.d.ts +0 -58
- package/dist/scripts/discovery-report.js +0 -250
- package/dist/scripts/fetch-docs.d.ts +0 -35
- package/dist/scripts/fetch-docs.js +0 -472
- package/dist/scripts/generate-configs.d.ts +0 -66
- package/dist/scripts/generate-configs.js +0 -459
- package/dist/scripts/grader-api.d.ts +0 -27
- package/dist/scripts/grader-api.js +0 -206
- package/dist/scripts/grader-compare.d.ts +0 -22
- package/dist/scripts/grader-compare.js +0 -368
- package/dist/scripts/grader-consistency.d.ts +0 -20
- package/dist/scripts/grader-consistency.js +0 -313
- package/dist/scripts/grader-sensitivity.d.ts +0 -22
- package/dist/scripts/grader-sensitivity.js +0 -354
- package/dist/scripts/grader-validate.d.ts +0 -19
- package/dist/scripts/grader-validate.js +0 -267
- package/dist/scripts/measure-retrieval.d.ts +0 -10
- package/dist/scripts/measure-retrieval.js +0 -145
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
- package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
- package/dist/scripts/pipeline.d.ts +0 -76
- package/dist/scripts/pipeline.js +0 -1031
- package/dist/scripts/pr-comment.d.ts +0 -10
- package/dist/scripts/pr-comment.js +0 -510
- package/dist/scripts/readiness-report.d.ts +0 -88
- package/dist/scripts/readiness-report.js +0 -342
- package/dist/scripts/update-quality-scores.d.ts +0 -15
- package/dist/scripts/update-quality-scores.js +0 -184
- package/dist/scripts/validate-task-sources.d.ts +0 -21
- package/dist/scripts/validate-task-sources.js +0 -210
- package/dist/scripts/validate.d.ts +0 -13
- package/dist/scripts/validate.js +0 -79
- package/dist/scripts/webhook-server.d.ts +0 -26
- package/dist/scripts/webhook-server.js +0 -147
- package/dist/scripts/weekly-digest.d.ts +0 -24
- package/dist/scripts/weekly-digest.js +0 -144
- package/dist/sinks/format-slack.d.ts +0 -64
- package/dist/sinks/format-slack.js +0 -306
- package/dist/sinks/slack-sink.d.ts +0 -27
- package/dist/sinks/slack-sink.js +0 -78
- package/dist/sinks/webhook-sink.d.ts +0 -19
- package/dist/sinks/webhook-sink.js +0 -50
- package/tasks/.expanded.agentic.yaml +0 -280
- package/tasks/.expanded.yaml +0 -565
|
@@ -1,73 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* debug-eval.ts
|
|
3
|
-
*
|
|
4
|
-
* Thin wrapper around `promptfoo eval` that reads DEBUG_EVAL_* environment
|
|
5
|
-
* variables and forwards them as native promptfoo filter flags.
|
|
6
|
-
*
|
|
7
|
-
* Environment variables:
|
|
8
|
-
* DEBUG_EVAL=1 — required to enable debug mode
|
|
9
|
-
* DEBUG_EVAL_PATTERN=<re> — --filter-pattern (regex on test description)
|
|
10
|
-
* DEBUG_EVAL_N=<number> — --filter-first-n (default: 2 when no other filters set)
|
|
11
|
-
* DEBUG_EVAL_SAMPLE=<number> — --filter-sample (random N tests)
|
|
12
|
-
*
|
|
13
|
-
* Usage:
|
|
14
|
-
* tsx src/scripts/debug-eval.ts --config promptfooconfig.yaml
|
|
15
|
-
* tsx src/scripts/debug-eval.ts --config promptfooconfig.agentic.yaml --no-cache
|
|
16
|
-
*
|
|
17
|
-
* All extra argv are forwarded to promptfoo eval unchanged.
|
|
18
|
-
*/
|
|
19
|
-
import { execFileSync } from "child_process";

// ---------------------------------------------------------------------------
// Parse DEBUG_EVAL_* environment
// ---------------------------------------------------------------------------

// Debug filtering is opt-in: only the exact value "1" turns it on. Without it
// the script still runs promptfoo, just with no filter flags added.
const debugEnabled = process.env.DEBUG_EVAL === "1";
if (!debugEnabled) {
    console.error("⚠ debug-eval.ts called without DEBUG_EVAL=1 — running full eval instead.");
}
const pattern = process.env.DEBUG_EVAL_PATTERN;
const firstN = process.env.DEBUG_EVAL_N;
const sample = process.env.DEBUG_EVAL_SAMPLE;

// ---------------------------------------------------------------------------
// Build filter flags
// ---------------------------------------------------------------------------

const filterFlags = [];
if (debugEnabled) {
    if (pattern) {
        // Passed as its own argv entry (no shell), so the regex needs no quoting
        // and cannot be re-interpreted by a shell.
        filterFlags.push("--filter-pattern", pattern);
    }
    if (sample) {
        filterFlags.push("--filter-sample", sample);
    }
    if (firstN) {
        filterFlags.push("--filter-first-n", firstN);
    }
    // Default: if no filters specified, limit to first 2 tests for speed
    if (filterFlags.length === 0) {
        filterFlags.push("--filter-first-n", "2");
    }
}

// ---------------------------------------------------------------------------
// Forward to promptfoo eval
// ---------------------------------------------------------------------------

// argv[0] = node, argv[1] = this script, argv[2..] = user args
const passthroughArgs = process.argv.slice(2);
const allArgs = ["eval", ...passthroughArgs, ...filterFlags];
// Human-readable rendering for the debug log only; the process itself is
// started from the argument array below, not from this string.
const cmd = `promptfoo ${allArgs.join(" ")}`;
if (debugEnabled) {
    console.log(`\n🐛 Debug mode enabled`);
    console.log(` Filters: ${filterFlags.join(" ") || "(default: first 2)"}`);
    console.log(` Command: ${cmd}\n`);
}
try {
    // Arguments are handed over as an array so values containing spaces or
    // quotes reach promptfoo unchanged instead of being re-split by a shell.
    execFileSync("promptfoo", allArgs, {
        env: process.env,
        // On Windows, npm-installed CLIs are typically `.cmd` shims, which
        // cannot be spawned without a shell.
        shell: process.platform === "win32",
        stdio: "inherit",
    });
}
catch (err) {
    const status = err !== null && typeof err === "object" && "status" in err
        ? err.status
        : undefined;
    if (typeof status === "number") {
        // promptfoo exits non-zero when assertions fail — that's expected
        process.exit(status);
    }
    // No numeric status: the process could not be started or was terminated by
    // a signal. Report it and fail, rather than passing null to process.exit
    // (which would exit with code 0).
    console.error(`❌ Failed to run promptfoo: ${err instanceof Error ? err.message : String(err)}`);
    process.exit(1);
}
|
|
@@ -1,58 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* discovery-report.ts
|
|
3
|
-
*
|
|
4
|
-
* Generates an agent discoverability report from agentic mode retrieval
|
|
5
|
-
* metrics. Reads score-summary.json (which contains `retrievalMetrics`
|
|
6
|
-
* from agentic evaluation) and produces a markdown report showing:
|
|
7
|
-
*
|
|
8
|
-
* - Retrieval summary (recall, precision, F1)
|
|
9
|
-
* - Per-area retrieval breakdown
|
|
10
|
-
* - Invisible documents (never retrieved by any task)
|
|
11
|
-
* - Recommendations for improving discoverability
|
|
12
|
-
*
|
|
13
|
-
* Phase 5c of the Scenario Matrix implementation (Scenarios 4.1 and 4.2).
|
|
14
|
-
*
|
|
15
|
-
* Usage:
|
|
16
|
-
* tsx src/scripts/discovery-report.ts # stdout
|
|
17
|
-
* tsx src/scripts/discovery-report.ts --area groq # filter by area
|
|
18
|
-
* tsx src/scripts/discovery-report.ts --output report.md
|
|
19
|
-
*
|
|
20
|
-
* @see docs/design-docs/retrieval-metrics.md
|
|
21
|
-
*/
|
|
22
|
-
import "dotenv/config";
|
|
23
|
-
import type { AreaRetrievalMetrics, RetrievalMetrics, ScoreSummary } from "../pipeline/types.js";
|
|
24
|
-
/**
 * Structured agent-discoverability report, as returned by
 * `generateDiscoveryReport` and rendered by `formatDiscoveryMarkdown`.
 */
export interface DiscoveryReport {
    /** Per-area retrieval metrics that remain once the area filter is applied. */
    areas: Array<AreaRetrievalMetrics>;
    /** Base URL taken from the score summary's source config, when present. */
    baseUrl: string | undefined;
    /** Documents that no task ever retrieved, with the tasks referencing them. */
    invisibleDocs: Array<InvisibleDoc>;
    /** Overall (cross-area) retrieval metrics. */
    overall: RetrievalMetrics["overall"];
    /** Actionable recommendations, in display order. */
    recommendations: Array<string>;
    /** ISO timestamp of the evaluation the report was built from. */
    timestamp: string;
    /** Number of canonical docs across the included areas. */
    totalCanonicalDocs: number;
    /** Number of canonical docs that were successfully retrieved. */
    totalHits: number;
}
|
|
42
|
-
/** A document that was never retrieved, plus the tasks that expected it. */
export interface InvisibleDoc {
    /** IDs of the tasks that list this document in their canonical_docs. */
    referencedBy: Array<string>;
    /** Slug identifying the document. */
    slug: string;
}
|
|
48
|
-
/**
 * Renders a discovery report as a markdown document.
 *
 * @param report - The structured report to render
 * @returns The markdown text
 */
export declare function formatDiscoveryMarkdown(
    report: DiscoveryReport,
): string;
|
|
52
|
-
/**
 * Builds a structured discovery report out of a score summary.
 *
 * @param summary - Parsed score-summary.json
 * @param areaFilter - Area names to include; every area is included when this
 *   is omitted or empty
 * @returns The assembled report
 */
export declare function generateDiscoveryReport(
    summary: ScoreSummary,
    areaFilter?: string[],
): DiscoveryReport;
|
|
@@ -1,250 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* discovery-report.ts
|
|
3
|
-
*
|
|
4
|
-
* Generates an agent discoverability report from agentic mode retrieval
|
|
5
|
-
* metrics. Reads score-summary.json (which contains `retrievalMetrics`
|
|
6
|
-
* from agentic evaluation) and produces a markdown report showing:
|
|
7
|
-
*
|
|
8
|
-
* - Retrieval summary (recall, precision, F1)
|
|
9
|
-
* - Per-area retrieval breakdown
|
|
10
|
-
* - Invisible documents (never retrieved by any task)
|
|
11
|
-
* - Recommendations for improving discoverability
|
|
12
|
-
*
|
|
13
|
-
* Phase 5c of the Scenario Matrix implementation (Scenarios 4.1 and 4.2).
|
|
14
|
-
*
|
|
15
|
-
* Usage:
|
|
16
|
-
* tsx src/scripts/discovery-report.ts # stdout
|
|
17
|
-
* tsx src/scripts/discovery-report.ts --area groq # filter by area
|
|
18
|
-
* tsx src/scripts/discovery-report.ts --output report.md
|
|
19
|
-
*
|
|
20
|
-
* @see docs/design-docs/retrieval-metrics.md
|
|
21
|
-
*/
|
|
22
|
-
// oxlint-disable-next-line import/no-unassigned-import -- side-effect: loads .env into process.env
|
|
23
|
-
import "dotenv/config";
|
|
24
|
-
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
25
|
-
import { dirname, join, resolve } from "node:path";
|
|
26
|
-
import { fileURLToPath } from "node:url";
|
|
27
|
-
// ES modules have no built-in __dirname, so derive it from this module's URL.
const thisModulePath = fileURLToPath(import.meta.url);
const __dirname = dirname(thisModulePath);
// Two directory levels above this module's directory; used as the base for
// the default score-summary path.
const ROOT = resolve(__dirname, "../..");
|
|
29
|
-
// ---------------------------------------------------------------------------
|
|
30
|
-
// Core logic (exported for testing)
|
|
31
|
-
// ---------------------------------------------------------------------------
|
|
32
|
-
/**
 * Renders a discovery report as markdown.
 *
 * Sections are emitted in a fixed order: header, retrieval summary table,
 * per-area breakdown, invisible documents, recommendations. The last three
 * are skipped when their corresponding list is empty.
 *
 * @param report - Report produced by generateDiscoveryReport
 * @returns The markdown text, lines joined with "\n"
 */
export function formatDiscoveryMarkdown(report) {
    const { areas, baseUrl, invisibleDocs, overall, recommendations } = report;
    const out = ["## 🔍 Agent Discoverability Report", ""];

    // Header: the base URL line is optional, the mode line is always present.
    if (baseUrl) {
        out.push(`**Base URL:** ${baseUrl}`);
    }
    out.push("**Mode:** Agentic", "");

    // Retrieval summary table
    out.push("### Retrieval Summary", "", "| Metric | Value |", "|---|---|");
    out.push(`| Recall (canonical docs found) | ${pct(overall.avgRecall)} (${report.totalHits}/${report.totalCanonicalDocs}) |`);
    out.push(`| Precision (relevant docs fetched) | ${pct(overall.avgPrecision)} |`);
    out.push(`| F1 Score | ${overall.avgF1.toFixed(2)} |`);
    out.push(`| Invisible docs | ${invisibleDocs.length} |`);
    out.push("");

    // Per-area breakdown, one table row per area in name order
    if (areas.length > 0) {
        out.push("### Per-Area Breakdown", "", "| Area | Recall | Precision | F1 | Tasks |", "|---|---|---|---|---|");
        for (const row of sortedAreas(areas)) {
            const cells = [
                row.area,
                pct(row.avgRecall),
                pct(row.avgPrecision),
                row.avgF1.toFixed(2),
                row.taskCount,
            ];
            out.push(`| ${cells.join(" | ")} |`);
        }
        out.push("");
    }

    // Invisible documents, one bullet per document
    if (invisibleDocs.length > 0) {
        out.push("### Invisible Documents (never retrieved by any task)", "");
        for (const { referencedBy, slug } of invisibleDocs) {
            out.push(`- \`${slug}\` — referenced by ${referencedBy.join(", ")}`);
        }
        out.push("");
    }

    // Recommendations as a 1-based numbered list
    if (recommendations.length > 0) {
        out.push("### Recommendations", "");
        for (const [index, text] of recommendations.entries()) {
            out.push(`${index + 1}. ${text}`);
        }
        out.push("");
    }

    return out.join("\n");
}
|
|
87
|
-
/**
 * Builds a structured discovery report out of a score summary.
 *
 * @param summary - Parsed score-summary.json
 * @param areaFilter - Area names to include; every area is included when this
 *   is omitted or empty
 * @returns The report: filtered areas, overall metrics, invisible docs,
 *   recommendations and hit/canonical-doc totals
 * @throws {Error} When the summary has no `retrievalMetrics`, or when a
 *   non-empty filter matches none of the available areas
 */
export function generateDiscoveryReport(summary, areaFilter) {
    const metrics = summary.retrievalMetrics;
    if (!metrics) {
        throw new Error("score-summary.json does not contain retrievalMetrics. " +
            "Run an agentic evaluation first: pnpm pipeline -- --mode agentic");
    }

    // Narrow to the requested areas (a missing or empty filter keeps all of them).
    const filtering = Boolean(areaFilter && areaFilter.length > 0);
    const areas = filtering
        ? metrics.areas.filter(({ area }) => areaFilter.includes(area))
        : metrics.areas;
    if (filtering && areas.length === 0) {
        const requested = areaFilter.join(", ");
        const available = metrics.areas.map(({ area }) => area).join(", ");
        throw new Error(`No retrieval data found for area(s): ${requested}. ` +
            `Available areas: ${available}`);
    }

    // The precomputed overall numbers are reused only when no area was dropped;
    // otherwise they are recomputed from the remaining areas.
    const overall = areas.length !== metrics.areas.length
        ? computeOverall(areas)
        : metrics.overall;

    const invisibleDocs = buildInvisibleDocs(areas);

    // Totals for the summary table count distinct slugs across all tasks.
    const tasks = areas.flatMap((area) => area.tasks);
    const distinctCount = (pick) => new Set(tasks.flatMap(pick)).size;
    const totalCanonicalDocs = distinctCount((task) => task.expected);
    const totalHits = distinctCount((task) => task.hits);

    return {
        areas,
        baseUrl: summary.source?.baseUrl,
        invisibleDocs,
        overall,
        recommendations: generateRecommendations(invisibleDocs, areas, overall),
        timestamp: summary.timestamp,
        totalCanonicalDocs,
        totalHits,
    };
}
|
|
132
|
-
// ---------------------------------------------------------------------------
|
|
133
|
-
// Helpers (alphabetical for perfectionist/sort-modules)
|
|
134
|
-
// ---------------------------------------------------------------------------
|
|
135
|
-
/**
 * Lists the documents that were missed by at least one task and never
 * retrieved by any task in the given areas.
 *
 * @param areas - Areas whose `tasks` each carry `taskId`, `missed` and
 *   `retrieved` slug lists
 * @returns One `{ referencedBy, slug }` entry per invisible document, where
 *   `referencedBy` holds the sorted IDs of the tasks that missed it. Entries
 *   are ordered by descending number of referencing tasks; ties keep the
 *   order in which the slugs were first encountered.
 */
function buildInvisibleDocs(areas) {
    // Collect every retrieved slug once up front, so the "never retrieved by
    // ANY task" check is a set lookup instead of a rescan of all tasks for
    // each missed slug.
    const everRetrieved = new Set();
    for (const area of areas) {
        for (const task of area.tasks) {
            for (const slug of task.retrieved) {
                everRetrieved.add(slug);
            }
        }
    }

    // Map each globally invisible slug to the set of tasks that missed it.
    const slugToTasks = new Map();
    for (const area of areas) {
        for (const task of area.tasks) {
            for (const slug of task.missed) {
                if (everRetrieved.has(slug)) {
                    continue;
                }
                let referencing = slugToTasks.get(slug);
                if (!referencing) {
                    referencing = new Set();
                    slugToTasks.set(slug, referencing);
                }
                referencing.add(task.taskId);
            }
        }
    }

    return [...slugToTasks.entries()]
        .map(([slug, tasks]) => ({
            referencedBy: [...tasks].sort(),
            slug,
        }))
        .sort((a, b) => b.referencedBy.length - a.referencedBy.length);
}
|
|
159
|
-
/**
 * Combines per-area averages into overall averages, weighting each area by
 * its task count.
 *
 * @param areas - Areas carrying `avgRecall`, `avgPrecision`, `avgF1` and
 *   `taskCount`
 * @returns `{ avgF1, avgPrecision, avgRecall }`; all zeros when there are no
 *   areas or the task counts sum to zero
 */
function computeOverall(areas) {
    let totalTasks = 0;
    let weightedRecall = 0;
    let weightedPrecision = 0;
    let weightedF1 = 0;

    // Single pass: accumulate the total weight and each weighted sum.
    for (const { avgF1, avgPrecision, avgRecall, taskCount } of areas) {
        totalTasks += taskCount;
        weightedRecall += avgRecall * taskCount;
        weightedPrecision += avgPrecision * taskCount;
        weightedF1 += avgF1 * taskCount;
    }

    // Covers both "no areas" and "areas without tasks"; avoids dividing by zero.
    if (totalTasks === 0) {
        return { avgF1: 0, avgPrecision: 0, avgRecall: 0 };
    }

    return {
        avgF1: weightedF1 / totalTasks,
        avgPrecision: weightedPrecision / totalTasks,
        avgRecall: weightedRecall / totalTasks,
    };
}
|
|
173
|
-
/**
 * Derives the recommendation list shown at the end of the report.
 *
 * Recommendations appear in this order: llms.txt additions for up to five
 * invisible docs, one cross-linking note when any invisible docs exist, one
 * note per area with recall below 0.5, one note per area with precision below
 * 0.5, and a closing note when the overall F1 is below 0.6.
 *
 * @param invisibleDocs - Invisible documents with their referencing tasks
 * @param areas - Per-area retrieval metrics
 * @param overall - Overall retrieval metrics
 * @returns Recommendation strings in display order
 */
function generateRecommendations(invisibleDocs, areas, overall) {
    // llms.txt additions: the first five invisible docs that have references
    const llmsTxtRecs = invisibleDocs
        .filter(({ referencedBy }) => referencedBy.length > 0)
        .slice(0, 5)
        .map(({ referencedBy, slug }) => {
            const count = referencedBy.length;
            const noun = count === 1 ? "task" : "tasks";
            return `Add \`${slug}\` to llms.txt (referenced by ${count} ${noun})`;
        });

    // A single cross-linking note covering all invisible docs
    const invisibleCount = invisibleDocs.length;
    const crossLinkRecs = invisibleCount > 0
        ? [`Improve cross-linking to ${invisibleCount} invisible document${invisibleCount === 1 ? "" : "s"}`]
        : [];

    // Areas with low recall
    const recallRecs = areas
        .filter(({ avgRecall }) => avgRecall < 0.5)
        .map(({ area, avgRecall }) => `Investigate low recall in \`${area}\` (${pct(avgRecall)}) — agents miss most canonical docs`);

    // Areas with low precision
    const precisionRecs = areas
        .filter(({ avgPrecision }) => avgPrecision < 0.5)
        .map(({ area, avgPrecision }) => `Review search relevance for \`${area}\` (precision ${pct(avgPrecision)}) — agents fetch many irrelevant docs`);

    // Overall note
    const overallRecs = overall.avgF1 < 0.6
        ? ["Overall F1 is below 0.60 — consider a documentation restructure for agent accessibility"]
        : [];

    return [
        ...llmsTxtRecs,
        ...crossLinkRecs,
        ...recallRecs,
        ...precisionRecs,
        ...overallRecs,
    ];
}
|
|
201
|
-
/**
 * CLI entry point: reads the score summary, builds the discovery report and
 * either writes the markdown to the `--output` file or prints it to stdout.
 * Exits with code 1 when the summary file does not exist.
 */
function main() {
    const cli = parseArgs(process.argv);

    if (!existsSync(cli.summaryPath)) {
        console.error(`❌ Score summary not found: ${cli.summaryPath}`);
        console.error("Run an agentic evaluation first: pnpm pipeline -- --mode agentic");
        process.exit(1);
    }

    const summary = JSON.parse(readFileSync(cli.summaryPath, "utf-8"));
    // An empty filter list is passed on as "no filter".
    const filter = cli.areaFilter.length > 0 ? cli.areaFilter : undefined;
    const markdown = formatDiscoveryMarkdown(generateDiscoveryReport(summary, filter));

    if (!cli.output) {
        console.log(markdown);
        return;
    }
    writeFileSync(cli.output, markdown, "utf-8");
    console.log(`✅ Discovery report written to ${cli.output}`);
}
|
|
219
|
-
/**
 * Parses the command-line arguments of the discovery-report script.
 *
 * Recognized arguments:
 *   --output <file>   write the report to a file instead of stdout
 *   --area <a,b,...>  comma-separated area names; may be given more than once
 *   --input <file>    path to the score summary
 *   <file>            a bare (non-dash) argument is also taken as the summary path
 *
 * A flag that is not followed by a value, and any other dash-prefixed
 * argument, is ignored.
 *
 * @param argv - Full process argv (the first two entries are skipped)
 * @returns `{ areaFilter, output, summaryPath }`; `output` is undefined when
 *   not given and `summaryPath` defaults to results/latest/score-summary.json
 *   under ROOT
 */
function parseArgs(argv) {
    const args = argv.slice(2);
    let output;
    const areaFilter = [];
    let summaryPath = join(ROOT, "results", "latest", "score-summary.json");
    for (let i = 0; i < args.length; i++) {
        const arg = args[i];
        const value = args[i + 1];
        if (arg === "--output" && value) {
            output = value;
            i++;
        }
        else if (arg === "--area" && value) {
            // Trim each name and drop empty segments so inputs such as
            // "groq, studio" or "groq," do not produce names that can never
            // match an area.
            const names = value
                .split(",")
                .map((name) => name.trim())
                .filter((name) => name.length > 0);
            areaFilter.push(...names);
            i++;
        }
        else if (arg === "--input" && value) {
            summaryPath = value;
            i++;
        }
        else if (!arg.startsWith("-")) {
            summaryPath = arg;
        }
    }
    return { areaFilter, output, summaryPath };
}
|
|
240
|
-
/**
 * Formats a fraction as a whole-number percentage string.
 *
 * @param value - Fraction to format (e.g. 0.5)
 * @returns The value times 100, rounded to the nearest integer, followed by "%"
 */
function pct(value) {
    const wholePercent = Math.round(value * 100);
    return wholePercent + "%";
}
|
|
243
|
-
/**
 * Returns the areas ordered by name without modifying the input array.
 *
 * @param areas - Areas, each with an `area` name
 * @returns A new array sorted by `area` using localeCompare
 */
function sortedAreas(areas) {
    const copy = Array.from(areas);
    copy.sort((left, right) => left.area.localeCompare(right.area));
    return copy;
}
|
|
246
|
-
// Run main() only when this file is the script being executed (its path is
// argv[1], in either source or compiled form), not when it is imported.
const invokedScript = process.argv[1] ?? "";
if (["discovery-report.ts", "discovery-report.js"].some((name) => invokedScript.endsWith(name))) {
    main();
}
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* fetch-docs.ts
|
|
3
|
-
*
|
|
4
|
-
* Pulls documentation from the Sanity CMS and generates markdown context
|
|
5
|
-
* files for use in Promptfoo evaluations. Always produces canonical contexts;
|
|
6
|
-
* other outputs are opt-in:
|
|
7
|
-
*
|
|
8
|
-
* 1. Canonical contexts — one file per evaluation task, containing
|
|
9
|
-
* only the manually-annotated "gold" documents for that task (always)
|
|
10
|
-
* 2. Feature-area contexts — one file per GROQ feature area query
|
|
11
|
-
* (opt-in via --include-feature-areas)
|
|
12
|
-
* 3. Full corpus — all articles in one file
|
|
13
|
-
* (opt-in via --include-corpus)
|
|
14
|
-
*/
|
|
15
|
-
import "dotenv/config";
|
|
16
|
-
/**
 * Outcome of resolving --sanity-document IDs against the canonical docs.
 *
 * A document fetched by ID replaces a canonical doc when its slug matches one
 * in the canonical set; otherwise it is appended as additional context.
 */
export interface DocumentOverlay {
    /** Formatted content for fetched docs whose slug matches no canonical slug. */
    appendedContent: Array<string>;
    /** Canonical slug → formatted content that stands in for the normal fetch. */
    replacements: Map<string, string>;
}
|
|
29
|
-
/**
 * Outcome of comparing the canonical docs between the published state and a
 * perspective, with each document placed in one of four buckets.
 */
export interface ReleaseImpact {
    // NOTE(review): the entries are presumably document slugs — confirm against the producer.
    added: Array<string>;
    modified: Array<string>;
    removed: Array<string>;
    unchanged: Array<string>;
}
|