@sanity/ailf 2.0.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. package/LICENSE +21 -0
  2. package/dist/cli.js +0 -0
  3. package/package.json +24 -24
  4. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  5. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  6. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  7. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  8. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  9. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  10. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  11. package/dist/_vendor/ailf-tasks/index.js +0 -16
  12. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  13. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  14. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  15. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  16. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  17. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  18. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  19. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  20. package/dist/agent-observer/test-imports.d.ts +0 -7
  21. package/dist/agent-observer/test-imports.js +0 -185
  22. package/dist/commands/update-quality-scores.d.ts +0 -5
  23. package/dist/commands/update-quality-scores.js +0 -20
  24. package/dist/lib/agent-behavior-report.d.ts +0 -8
  25. package/dist/lib/agent-behavior-report.js +0 -185
  26. package/dist/lib/baseline.d.ts +0 -19
  27. package/dist/lib/baseline.js +0 -153
  28. package/dist/lib/calculate-scores.d.ts +0 -23
  29. package/dist/lib/calculate-scores.js +0 -42
  30. package/dist/lib/compare.d.ts +0 -18
  31. package/dist/lib/compare.js +0 -170
  32. package/dist/lib/coverage-audit.d.ts +0 -4
  33. package/dist/lib/coverage-audit.js +0 -42
  34. package/dist/lib/discovery-report.d.ts +0 -13
  35. package/dist/lib/discovery-report.js +0 -57
  36. package/dist/lib/fetch-docs.d.ts +0 -30
  37. package/dist/lib/fetch-docs.js +0 -171
  38. package/dist/lib/generate-configs.d.ts +0 -25
  39. package/dist/lib/generate-configs.js +0 -42
  40. package/dist/lib/grader-api.d.ts +0 -21
  41. package/dist/lib/grader-api.js +0 -34
  42. package/dist/lib/grader-compare.d.ts +0 -19
  43. package/dist/lib/grader-compare.js +0 -91
  44. package/dist/lib/grader-consistency.d.ts +0 -27
  45. package/dist/lib/grader-consistency.js +0 -79
  46. package/dist/lib/grader-sensitivity.d.ts +0 -19
  47. package/dist/lib/grader-sensitivity.js +0 -75
  48. package/dist/lib/grader-validate.d.ts +0 -19
  49. package/dist/lib/grader-validate.js +0 -78
  50. package/dist/lib/measure-retrieval.d.ts +0 -14
  51. package/dist/lib/measure-retrieval.js +0 -71
  52. package/dist/lib/pr-comment.d.ts +0 -16
  53. package/dist/lib/pr-comment.js +0 -28
  54. package/dist/lib/readiness-report.d.ts +0 -13
  55. package/dist/lib/readiness-report.js +0 -108
  56. package/dist/lib/webhook-server.d.ts +0 -11
  57. package/dist/lib/webhook-server.js +0 -24
  58. package/dist/lib/weekly-digest.d.ts +0 -24
  59. package/dist/lib/weekly-digest.js +0 -148
  60. package/dist/orchestration/env-bridge.d.ts +0 -21
  61. package/dist/orchestration/env-bridge.js +0 -66
  62. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  63. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  64. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  65. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  66. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  67. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  68. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  69. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  70. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  71. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  72. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  73. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  74. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  75. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  76. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  77. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  78. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  79. package/dist/pipeline/compiler/task-bridge.js +0 -92
  80. package/dist/pipeline/expand-tasks.d.ts +0 -232
  81. package/dist/pipeline/expand-tasks.js +0 -467
  82. package/dist/pipeline/generate-configs.d.ts +0 -92
  83. package/dist/pipeline/generate-configs.js +0 -445
  84. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  85. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  86. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  87. package/dist/pipeline/steps/compare-step.js +0 -90
  88. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  89. package/dist/pipeline/steps/eval-step.js +0 -347
  90. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  91. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  92. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  93. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  94. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  95. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  96. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  97. package/dist/pipeline/steps/publish-report-step.js +0 -243
  98. package/dist/pipeline/steps/report-step.d.ts +0 -13
  99. package/dist/pipeline/steps/report-step.js +0 -56
  100. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  101. package/dist/pipeline/steps/update-scores-step.js +0 -42
  102. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  103. package/dist/scripts/agent-behavior-report.js +0 -315
  104. package/dist/scripts/baseline.d.ts +0 -43
  105. package/dist/scripts/baseline.js +0 -267
  106. package/dist/scripts/calculate-scores.d.ts +0 -166
  107. package/dist/scripts/calculate-scores.js +0 -1296
  108. package/dist/scripts/compare.d.ts +0 -22
  109. package/dist/scripts/compare.js +0 -334
  110. package/dist/scripts/coverage-audit.d.ts +0 -44
  111. package/dist/scripts/coverage-audit.js +0 -209
  112. package/dist/scripts/debug-eval.d.ts +0 -19
  113. package/dist/scripts/debug-eval.js +0 -73
  114. package/dist/scripts/discovery-report.d.ts +0 -58
  115. package/dist/scripts/discovery-report.js +0 -250
  116. package/dist/scripts/fetch-docs.d.ts +0 -35
  117. package/dist/scripts/fetch-docs.js +0 -472
  118. package/dist/scripts/generate-configs.d.ts +0 -66
  119. package/dist/scripts/generate-configs.js +0 -459
  120. package/dist/scripts/grader-api.d.ts +0 -27
  121. package/dist/scripts/grader-api.js +0 -206
  122. package/dist/scripts/grader-compare.d.ts +0 -22
  123. package/dist/scripts/grader-compare.js +0 -368
  124. package/dist/scripts/grader-consistency.d.ts +0 -20
  125. package/dist/scripts/grader-consistency.js +0 -313
  126. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  127. package/dist/scripts/grader-sensitivity.js +0 -354
  128. package/dist/scripts/grader-validate.d.ts +0 -19
  129. package/dist/scripts/grader-validate.js +0 -267
  130. package/dist/scripts/measure-retrieval.d.ts +0 -10
  131. package/dist/scripts/measure-retrieval.js +0 -145
  132. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  133. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  134. package/dist/scripts/pipeline.d.ts +0 -76
  135. package/dist/scripts/pipeline.js +0 -1031
  136. package/dist/scripts/pr-comment.d.ts +0 -10
  137. package/dist/scripts/pr-comment.js +0 -510
  138. package/dist/scripts/readiness-report.d.ts +0 -88
  139. package/dist/scripts/readiness-report.js +0 -342
  140. package/dist/scripts/update-quality-scores.d.ts +0 -15
  141. package/dist/scripts/update-quality-scores.js +0 -184
  142. package/dist/scripts/validate-task-sources.d.ts +0 -21
  143. package/dist/scripts/validate-task-sources.js +0 -210
  144. package/dist/scripts/validate.d.ts +0 -13
  145. package/dist/scripts/validate.js +0 -79
  146. package/dist/scripts/webhook-server.d.ts +0 -26
  147. package/dist/scripts/webhook-server.js +0 -147
  148. package/dist/scripts/weekly-digest.d.ts +0 -24
  149. package/dist/scripts/weekly-digest.js +0 -144
  150. package/dist/sinks/format-slack.d.ts +0 -64
  151. package/dist/sinks/format-slack.js +0 -306
  152. package/dist/sinks/slack-sink.d.ts +0 -27
  153. package/dist/sinks/slack-sink.js +0 -78
  154. package/dist/sinks/webhook-sink.d.ts +0 -19
  155. package/dist/sinks/webhook-sink.js +0 -50
  156. package/tasks/.expanded.agentic.yaml +0 -280
  157. package/tasks/.expanded.yaml +0 -565
@@ -1,73 +0,0 @@
1
- /**
2
- * debug-eval.ts
3
- *
4
- * Thin wrapper around `promptfoo eval` that reads DEBUG_EVAL_* environment
5
- * variables and forwards them as native promptfoo filter flags.
6
- *
7
- * Environment variables:
8
- * DEBUG_EVAL=1 — required to enable debug mode
9
- * DEBUG_EVAL_PATTERN=<re> — --filter-pattern (regex on test description)
10
- * DEBUG_EVAL_N=<number> — --filter-first-n (default: 2 when no other filters set)
11
- * DEBUG_EVAL_SAMPLE=<number> — --filter-sample (random N tests)
12
- *
13
- * Usage:
14
- * tsx src/scripts/debug-eval.ts --config promptfooconfig.yaml
15
- * tsx src/scripts/debug-eval.ts --config promptfooconfig.agentic.yaml --no-cache
16
- *
17
- * All extra argv are forwarded to promptfoo eval unchanged.
18
- */
19
- import { execSync } from "child_process";
20
- // ---------------------------------------------------------------------------
21
- // Parse DEBUG_EVAL_* environment
22
- // ---------------------------------------------------------------------------
23
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Wrap a value in single quotes for a POSIX shell command line.
 *
 * An embedded single quote is emitted as `'\''` (close quote, escaped quote,
 * reopen quote) so the value cannot terminate the quoting early.
 *
 * @param value - Raw argument text
 * @returns The single-quoted argument
 */
function shellQuote(value) {
    return `'${value.replace(/'/g, `'\\''`)}'`;
}
/**
 * Pass plain digit strings through untouched; quote anything else so an
 * unexpected environment value cannot be interpreted by the shell.
 *
 * @param value - Raw value of a numeric DEBUG_EVAL_* variable
 * @returns The value, quoted only when it is not purely digits
 */
function numericFlagValue(value) {
    return /^\d+$/.test(value) ? value : shellQuote(value);
}
// ---------------------------------------------------------------------------
// Parse DEBUG_EVAL_* environment
// ---------------------------------------------------------------------------
const debugEnabled = process.env.DEBUG_EVAL === "1";
if (!debugEnabled) {
    console.error("⚠ debug-eval.ts called without DEBUG_EVAL=1 — running full eval instead.");
}
const pattern = process.env.DEBUG_EVAL_PATTERN;
const firstN = process.env.DEBUG_EVAL_N;
const sample = process.env.DEBUG_EVAL_SAMPLE;
// ---------------------------------------------------------------------------
// Build filter flags
// ---------------------------------------------------------------------------
const filterFlags = [];
if (debugEnabled) {
    if (pattern) {
        // Always quoted: the pattern is a regex and routinely contains shell metacharacters
        filterFlags.push(`--filter-pattern`, shellQuote(pattern));
    }
    if (sample) {
        filterFlags.push(`--filter-sample`, numericFlagValue(sample));
    }
    if (firstN) {
        filterFlags.push(`--filter-first-n`, numericFlagValue(firstN));
    }
    // Default: if no filters specified, limit to first 2 tests for speed
    if (filterFlags.length === 0) {
        filterFlags.push(`--filter-first-n`, "2");
    }
}
// ---------------------------------------------------------------------------
// Forward to promptfoo eval
// ---------------------------------------------------------------------------
// argv[0] = node, argv[1] = this script, argv[2..] = user args
const passthroughArgs = process.argv.slice(2);
const allArgs = ["eval", ...passthroughArgs, ...filterFlags];
const cmd = `promptfoo ${allArgs.join(" ")}`;
if (debugEnabled) {
    console.log(`\n🐛 Debug mode enabled`);
    console.log(`   Filters: ${filterFlags.join(" ") || "(default: first 2)"}`);
    console.log(`   Command: ${cmd}\n`);
}
try {
    execSync(cmd, {
        env: process.env,
        stdio: "inherit",
    });
}
catch (err) {
    // promptfoo exits non-zero when assertions fail — that's expected
    const status = err !== null && typeof err === "object" && "status" in err
        ? err.status
        : undefined;
    // status is null when the child was terminated by a signal; exiting with
    // null would report success, so anything non-numeric becomes 1.
    process.exit(typeof status === "number" ? status : 1);
}
@@ -1,58 +0,0 @@
1
- /**
2
- * discovery-report.ts
3
- *
4
- * Generates an agent discoverability report from agentic mode retrieval
5
- * metrics. Reads score-summary.json (which contains `retrievalMetrics`
6
- * from agentic evaluation) and produces a markdown report showing:
7
- *
8
- * - Retrieval summary (recall, precision, F1)
9
- * - Per-area retrieval breakdown
10
- * - Invisible documents (never retrieved by any task)
11
- * - Recommendations for improving discoverability
12
- *
13
- * Phase 5c of the Scenario Matrix implementation (Scenarios 4.1 and 4.2).
14
- *
15
- * Usage:
16
- * tsx src/scripts/discovery-report.ts # stdout
17
- * tsx src/scripts/discovery-report.ts --area groq # filter by area
18
- * tsx src/scripts/discovery-report.ts --output report.md
19
- *
20
- * @see docs/design-docs/retrieval-metrics.md
21
- */
22
- import "dotenv/config";
23
- import type { AreaRetrievalMetrics, RetrievalMetrics, ScoreSummary } from "../pipeline/types.js";
24
- export interface DiscoveryReport { // Shape returned by generateDiscoveryReport and consumed by formatDiscoveryMarkdown
25
- /** Areas included in the report, i.e. the summary's areas after the optional area filter is applied */
26
- areas: AreaRetrievalMetrics[];
27
- /** Base URL from the score summary source config; undefined when the summary carries none */
28
- baseUrl: string | undefined;
29
- /** Document slugs that were never retrieved by any task */
30
- invisibleDocs: InvisibleDoc[];
31
- /** Overall retrieval metrics; recomputed as a task-count-weighted average when the filter narrows the area set */
32
- overall: RetrievalMetrics["overall"];
33
- /** Actionable recommendations */
34
- recommendations: string[];
35
- /** ISO timestamp of the source evaluation */
36
- timestamp: string;
37
- /** Total canonical docs across included areas (count of distinct expected slugs) */
38
- totalCanonicalDocs: number;
39
- /** Total hits: count of distinct canonical docs successfully retrieved */
40
- totalHits: number;
41
- }
42
- export interface InvisibleDoc { // One document that no task retrieved, plus the tasks that missed it
43
- /** Tasks that reference this document via canonical_docs (task IDs, sorted) */
44
- referencedBy: string[];
45
- /** The document slug */
46
- slug: string;
47
- }
48
- /**
49
- * Format a discovery report as markdown and return it as a single newline-joined string.
50
- */
51
- export declare function formatDiscoveryMarkdown(report: DiscoveryReport): string;
52
- /**
53
- * Generate a structured discovery report from a score summary.
54
- * Throws when the summary has no retrievalMetrics, or when areaFilter matches no area.
55
- * @param summary - Parsed score-summary.json
56
- * @param areaFilter - Optional area names to include (all if empty)
57
- */
58
- export declare function generateDiscoveryReport(summary: ScoreSummary, areaFilter?: string[]): DiscoveryReport;
@@ -1,250 +0,0 @@
1
- /**
2
- * discovery-report.ts
3
- *
4
- * Generates an agent discoverability report from agentic mode retrieval
5
- * metrics. Reads score-summary.json (which contains `retrievalMetrics`
6
- * from agentic evaluation) and produces a markdown report showing:
7
- *
8
- * - Retrieval summary (recall, precision, F1)
9
- * - Per-area retrieval breakdown
10
- * - Invisible documents (never retrieved by any task)
11
- * - Recommendations for improving discoverability
12
- *
13
- * Phase 5c of the Scenario Matrix implementation (Scenarios 4.1 and 4.2).
14
- *
15
- * Usage:
16
- * tsx src/scripts/discovery-report.ts # stdout
17
- * tsx src/scripts/discovery-report.ts --area groq # filter by area
18
- * tsx src/scripts/discovery-report.ts --output report.md
19
- *
20
- * @see docs/design-docs/retrieval-metrics.md
21
- */
22
- // oxlint-disable-next-line import/no-unassigned-import -- side-effect: loads .env into process.env
23
- import "dotenv/config";
24
- import { existsSync, readFileSync, writeFileSync } from "node:fs";
25
- import { dirname, join, resolve } from "node:path";
26
- import { fileURLToPath } from "node:url";
27
- const __dirname = dirname(fileURLToPath(import.meta.url));
28
- const ROOT = resolve(__dirname, "..", "..");
29
- // ---------------------------------------------------------------------------
30
- // Core logic (exported for testing)
31
- // ---------------------------------------------------------------------------
32
/**
 * Render a discovery report as a markdown document.
 *
 * Sections appear in a fixed order: header, retrieval summary table, then
 * the per-area table, the invisible-document list and the numbered
 * recommendations. Each of the last three is omitted when it has no rows.
 *
 * @param report - Structured report to render
 * @returns The markdown text, lines joined with "\n"
 */
export function formatDiscoveryMarkdown(report) {
    const { areas, baseUrl, invisibleDocs, overall, recommendations } = report;
    const out = ["## 🔍 Agent Discoverability Report", ""];
    if (baseUrl) {
        out.push(`**Base URL:** ${baseUrl}`);
    }
    out.push("**Mode:** Agentic", "");
    // Summary table: one row per headline metric
    out.push("### Retrieval Summary", "", "| Metric | Value |", "|---|---|", `| Recall (canonical docs found) | ${pct(overall.avgRecall)} (${report.totalHits}/${report.totalCanonicalDocs}) |`, `| Precision (relevant docs fetched) | ${pct(overall.avgPrecision)} |`, `| F1 Score | ${overall.avgF1.toFixed(2)} |`, `| Invisible docs | ${invisibleDocs.length} |`, "");
    // Per-area table, rows ordered by sortedAreas
    if (areas.length > 0) {
        const areaRows = sortedAreas(areas).map((entry) => `| ${entry.area} | ${pct(entry.avgRecall)} | ${pct(entry.avgPrecision)} | ${entry.avgF1.toFixed(2)} | ${entry.taskCount} |`);
        out.push("### Per-Area Breakdown", "", "| Area | Recall | Precision | F1 | Tasks |", "|---|---|---|---|---|", ...areaRows, "");
    }
    // Bullet list of documents no task retrieved
    if (invisibleDocs.length > 0) {
        const bullets = invisibleDocs.map((entry) => `- \`${entry.slug}\` — referenced by ${entry.referencedBy.join(", ")}`);
        out.push("### Invisible Documents (never retrieved by any task)", "", ...bullets, "");
    }
    // Numbered list, 1-based
    if (recommendations.length > 0) {
        const numbered = recommendations.map((text, index) => `${index + 1}. ${text}`);
        out.push("### Recommendations", "", ...numbered, "");
    }
    return out.join("\n");
}
87
/**
 * Build the structured discovery report for a parsed score summary.
 *
 * @param summary - Parsed score-summary.json
 * @param areaFilter - Optional area names to include (all if empty)
 * @returns Report with the selected areas, overall metrics, invisible docs,
 *   distinct expected/hit totals and recommendations
 * @throws Error when the summary has no retrievalMetrics, or when a
 *   non-empty filter matches none of the available areas
 */
export function generateDiscoveryReport(summary, areaFilter) {
    const metrics = summary.retrievalMetrics;
    if (!metrics) {
        throw new Error("score-summary.json does not contain retrievalMetrics. " +
            "Run an agentic evaluation first: pnpm pipeline -- --mode agentic");
    }
    const filterRequested = Boolean(areaFilter && areaFilter.length > 0);
    const areas = filterRequested
        ? metrics.areas.filter((entry) => areaFilter.includes(entry.area))
        : metrics.areas;
    if (filterRequested && areas.length === 0) {
        const available = metrics.areas.map((entry) => entry.area).join(", ");
        throw new Error(`No retrieval data found for area(s): ${areaFilter.join(", ")}. ` +
            `Available areas: ${available}`);
    }
    // The stored overall figures only apply when no area was dropped
    const overall = areas.length !== metrics.areas.length
        ? computeOverall(areas)
        : metrics.overall;
    const invisibleDocs = buildInvisibleDocs(areas);
    // Totals count distinct slugs across every task of the selected areas
    const tasks = areas.flatMap((entry) => entry.tasks);
    const distinctCount = (pick) => new Set(tasks.flatMap(pick)).size;
    const totalCanonicalDocs = distinctCount((task) => task.expected);
    const totalHits = distinctCount((task) => task.hits);
    const recommendations = generateRecommendations(invisibleDocs, areas, overall);
    return {
        areas,
        baseUrl: summary.source?.baseUrl,
        invisibleDocs,
        overall,
        recommendations,
        timestamp: summary.timestamp,
        totalCanonicalDocs,
        totalHits,
    };
}
132
- // ---------------------------------------------------------------------------
133
- // Helpers (alphabetical for perfectionist/sort-modules)
134
- // ---------------------------------------------------------------------------
135
/**
 * List the documents that were missed and never retrieved by any task.
 *
 * A slug counts as invisible when it appears in some task's `missed` list
 * and in no task's `retrieved` list across all the given areas.
 *
 * @param areas - Area metrics whose `tasks` carry `taskId`, `missed` and `retrieved`
 * @returns One entry per invisible slug with the sorted IDs of the tasks that
 *   missed it, ordered by descending number of referencing tasks
 */
function buildInvisibleDocs(areas) {
    // Index every retrieved slug once so each check below is a Set lookup
    // instead of a rescan of every task's retrieved list.
    const retrievedAnywhere = new Set();
    for (const area of areas) {
        for (const task of area.tasks) {
            for (const slug of task.retrieved ?? []) {
                retrievedAnywhere.add(slug);
            }
        }
    }
    // Map each invisible slug to the tasks that expected but missed it
    const slugToTasks = new Map();
    for (const area of areas) {
        for (const task of area.tasks) {
            for (const slug of task.missed) {
                if (retrievedAnywhere.has(slug)) {
                    continue;
                }
                let referencing = slugToTasks.get(slug);
                if (referencing === undefined) {
                    referencing = new Set();
                    slugToTasks.set(slug, referencing);
                }
                referencing.add(task.taskId);
            }
        }
    }
    return [...slugToTasks.entries()]
        .map(([slug, tasks]) => ({
        referencedBy: [...tasks].sort(),
        slug,
    }))
        .sort((a, b) => b.referencedBy.length - a.referencedBy.length);
}
159
/**
 * Average recall, precision and F1 across areas, weighting each area by its
 * task count.
 *
 * @param areas - Area metrics with `taskCount`, `avgRecall`, `avgPrecision`, `avgF1`
 * @returns The weighted averages, or all zeros when there are no areas or
 *   the task counts sum to zero
 */
function computeOverall(areas) {
    const zeros = { avgF1: 0, avgPrecision: 0, avgRecall: 0 };
    if (areas.length === 0) {
        return zeros;
    }
    // One pass accumulates the weight total and the three weighted sums
    let totalTasks = 0;
    let recallSum = 0;
    let precisionSum = 0;
    let f1Sum = 0;
    for (const entry of areas) {
        totalTasks += entry.taskCount;
        recallSum += entry.avgRecall * entry.taskCount;
        precisionSum += entry.avgPrecision * entry.taskCount;
        f1Sum += entry.avgF1 * entry.taskCount;
    }
    if (totalTasks === 0) {
        return zeros;
    }
    return {
        avgF1: f1Sum / totalTasks,
        avgPrecision: precisionSum / totalTasks,
        avgRecall: recallSum / totalTasks,
    };
}
173
/**
 * Derive the recommendation list for a report.
 *
 * Output order: up to five "add to llms.txt" items, one cross-linking item
 * when any document is invisible, one item per area with recall below 0.5,
 * one item per area with precision below 0.5, then a restructure item when
 * the overall F1 is below 0.6.
 *
 * @param invisibleDocs - Entries with `slug` and `referencedBy`
 * @param areas - Area metrics with `area`, `avgRecall`, `avgPrecision`
 * @param overall - Overall metrics; only `avgF1` is read
 * @returns Recommendation sentences in the order described above
 */
function generateRecommendations(invisibleDocs, areas, overall) {
    const THRESHOLD = 0.5;
    // llms.txt additions, capped at the first five referenced documents
    const llmsTxtItems = invisibleDocs
        .filter((entry) => entry.referencedBy.length > 0)
        .slice(0, 5)
        .map((entry) => {
        const count = entry.referencedBy.length;
        const taskWord = count === 1 ? "task" : "tasks";
        return `Add \`${entry.slug}\` to llms.txt (referenced by ${count} ${taskWord})`;
    });
    const crossLinkItems = invisibleDocs.length > 0
        ? [`Improve cross-linking to ${invisibleDocs.length} invisible document${invisibleDocs.length === 1 ? "" : "s"}`]
        : [];
    const recallItems = areas
        .filter((entry) => entry.avgRecall < THRESHOLD)
        .map((entry) => `Investigate low recall in \`${entry.area}\` (${pct(entry.avgRecall)}) — agents miss most canonical docs`);
    const precisionItems = areas
        .filter((entry) => entry.avgPrecision < THRESHOLD)
        .map((entry) => `Review search relevance for \`${entry.area}\` (precision ${pct(entry.avgPrecision)}) — agents fetch many irrelevant docs`);
    const overallItems = overall.avgF1 < 0.6
        ? ["Overall F1 is below 0.60 — consider a documentation restructure for agent accessibility"]
        : [];
    return [
        ...llmsTxtItems,
        ...crossLinkItems,
        ...recallItems,
        ...precisionItems,
        ...overallItems,
    ];
}
201
/**
 * CLI entry point: read the score summary, build the discovery report and
 * either write the markdown to the requested file or print it to stdout.
 *
 * Exits with status 1 when the summary file does not exist.
 */
function main() {
    const cli = parseArgs(process.argv);
    if (!existsSync(cli.summaryPath)) {
        console.error(`❌ Score summary not found: ${cli.summaryPath}`);
        console.error("Run an agentic evaluation first: pnpm pipeline -- --mode agentic");
        process.exit(1);
    }
    const summary = JSON.parse(readFileSync(cli.summaryPath, "utf-8"));
    // An empty filter list means "all areas", expressed as undefined
    const requestedAreas = cli.areaFilter.length > 0 ? cli.areaFilter : undefined;
    const markdown = formatDiscoveryMarkdown(generateDiscoveryReport(summary, requestedAreas));
    if (!cli.output) {
        console.log(markdown);
        return;
    }
    writeFileSync(cli.output, markdown, "utf-8");
    console.log(`✅ Discovery report written to ${cli.output}`);
}
219
/**
 * Parse the discovery-report command line.
 *
 * Recognised arguments:
 *   --output <file>   write the markdown to <file> instead of stdout
 *   --area <a,b,...>  comma-separated area names; may be repeated
 *   --input <file>    path to score-summary.json
 *   <file>            a bare argument is also taken as the summary path
 *
 * Area names are trimmed and empty entries are dropped, so "groq, studio,"
 * yields ["groq", "studio"]. A flag with no following value is ignored.
 *
 * @param argv - Full process argv (the first two entries are skipped)
 * @returns `areaFilter`, `output` (undefined when not given) and
 *   `summaryPath` (defaults to results/latest/score-summary.json under ROOT)
 */
function parseArgs(argv) {
    const args = argv.slice(2);
    let output;
    let summaryPath;
    const areaFilter = [];
    for (let i = 0; i < args.length; i++) {
        if (args[i] === "--output" && args[i + 1]) {
            output = args[++i];
        }
        else if (args[i] === "--area" && args[i + 1]) {
            // Tolerate spaces and stray commas: the filter matches area names exactly
            const names = args[++i]
                .split(",")
                .map((name) => name.trim())
                .filter((name) => name.length > 0);
            areaFilter.push(...names);
        }
        else if (args[i] === "--input" && args[i + 1]) {
            summaryPath = args[++i];
        }
        else if (!args[i].startsWith("-")) {
            summaryPath = args[i];
        }
    }
    return {
        areaFilter,
        output,
        // Resolve the default location only when no path was supplied
        summaryPath: summaryPath ?? join(ROOT, "results", "latest", "score-summary.json"),
    };
}
240
/**
 * Format a fraction as a whole-number percentage, e.g. 0.5 becomes "50%".
 *
 * @param value - Fraction to format
 * @returns The rounded percentage followed by "%"
 */
function pct(value) {
    const wholePercent = Math.round(value * 100);
    return String(wholePercent) + "%";
}
243
/**
 * Return a copy of the areas ordered by area name (locale comparison).
 * The input array is left untouched.
 *
 * @param areas - Entries carrying an `area` name
 * @returns A new, sorted array
 */
function sortedAreas(areas) {
    const copy = Array.from(areas);
    copy.sort((left, right) => left.area.localeCompare(right.area));
    return copy;
}
246
// Run main() only when this file is the script node was started with,
// whether as the TypeScript source or its compiled JavaScript.
if (["discovery-report.ts", "discovery-report.js"].some((fileName) => process.argv[1]?.endsWith(fileName))) {
    main();
}
@@ -1,35 +0,0 @@
1
- /**
2
- * fetch-docs.ts
3
- *
4
- * Pulls documentation from the Sanity CMS and generates markdown context
5
- * files for use in Promptfoo evaluations. Always produces canonical contexts;
6
- * other outputs are opt-in:
7
- *
8
- * 1. Canonical contexts — one file per evaluation task, containing
9
- * only the manually-annotated "gold" documents for that task (always)
10
- * 2. Feature-area contexts — one file per GROQ feature area query
11
- * (opt-in via --include-feature-areas)
12
- * 3. Full corpus — all articles in one file
13
- * (opt-in via --include-corpus)
14
- */
15
- import "dotenv/config";
16
- /**
17
- * Result of resolving --sanity-document IDs against canonical docs.
18
- *
19
- * Documents specified by ID either replace a canonical doc (if the fetched
20
- * document's slug matches one in the canonical set) or are appended as
21
- * additional context (if the slug is not in the canonical set).
22
- */
23
- export interface DocumentOverlay {
24
- /** Extra formatted content for docs that don't match any canonical slug (the "appended" case above) */
25
- appendedContent: string[];
26
- /** Map from canonical slug → formatted content (replaces the normal fetch for that slug) */
27
- replacements: Map<string, string>;
28
- }
29
- /** Result of comparing canonical docs between published and perspective; every doc is filed under one of four buckets */
30
- export interface ReleaseImpact {
31
- added: string[]; // NOTE(review): presumably docs present only in the perspective — confirm identifier type (slug?) against fetch-docs.ts
32
- modified: string[]; // NOTE(review): presumably docs present in both whose content differs — confirm
33
- removed: string[]; // NOTE(review): presumably docs present only in published — confirm
34
- unchanged: string[]; // NOTE(review): presumably docs identical in both — confirm
35
- }