@sanity/ailf 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. package/LICENSE +21 -0
  2. package/dist/cli.js +0 -0
  3. package/dist/orchestration/steps/run-eval-step.js +1 -1
  4. package/dist/pipeline/checks.d.ts +8 -3
  5. package/dist/pipeline/checks.js +23 -3
  6. package/package.json +25 -25
  7. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  8. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  9. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  10. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  11. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  12. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  13. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  14. package/dist/_vendor/ailf-tasks/index.js +0 -16
  15. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  16. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  17. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  18. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  19. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  20. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  21. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  22. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  23. package/dist/agent-observer/test-imports.d.ts +0 -7
  24. package/dist/agent-observer/test-imports.js +0 -185
  25. package/dist/commands/update-quality-scores.d.ts +0 -5
  26. package/dist/commands/update-quality-scores.js +0 -20
  27. package/dist/lib/agent-behavior-report.d.ts +0 -8
  28. package/dist/lib/agent-behavior-report.js +0 -185
  29. package/dist/lib/baseline.d.ts +0 -19
  30. package/dist/lib/baseline.js +0 -153
  31. package/dist/lib/calculate-scores.d.ts +0 -23
  32. package/dist/lib/calculate-scores.js +0 -42
  33. package/dist/lib/compare.d.ts +0 -18
  34. package/dist/lib/compare.js +0 -170
  35. package/dist/lib/coverage-audit.d.ts +0 -4
  36. package/dist/lib/coverage-audit.js +0 -42
  37. package/dist/lib/discovery-report.d.ts +0 -13
  38. package/dist/lib/discovery-report.js +0 -57
  39. package/dist/lib/fetch-docs.d.ts +0 -30
  40. package/dist/lib/fetch-docs.js +0 -171
  41. package/dist/lib/generate-configs.d.ts +0 -25
  42. package/dist/lib/generate-configs.js +0 -42
  43. package/dist/lib/grader-api.d.ts +0 -21
  44. package/dist/lib/grader-api.js +0 -34
  45. package/dist/lib/grader-compare.d.ts +0 -19
  46. package/dist/lib/grader-compare.js +0 -91
  47. package/dist/lib/grader-consistency.d.ts +0 -27
  48. package/dist/lib/grader-consistency.js +0 -79
  49. package/dist/lib/grader-sensitivity.d.ts +0 -19
  50. package/dist/lib/grader-sensitivity.js +0 -75
  51. package/dist/lib/grader-validate.d.ts +0 -19
  52. package/dist/lib/grader-validate.js +0 -78
  53. package/dist/lib/measure-retrieval.d.ts +0 -14
  54. package/dist/lib/measure-retrieval.js +0 -71
  55. package/dist/lib/pr-comment.d.ts +0 -16
  56. package/dist/lib/pr-comment.js +0 -28
  57. package/dist/lib/readiness-report.d.ts +0 -13
  58. package/dist/lib/readiness-report.js +0 -108
  59. package/dist/lib/webhook-server.d.ts +0 -11
  60. package/dist/lib/webhook-server.js +0 -24
  61. package/dist/lib/weekly-digest.d.ts +0 -24
  62. package/dist/lib/weekly-digest.js +0 -148
  63. package/dist/orchestration/env-bridge.d.ts +0 -21
  64. package/dist/orchestration/env-bridge.js +0 -66
  65. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  66. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  67. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  68. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  69. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  70. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  71. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  72. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  73. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  74. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  75. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  76. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  77. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  78. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  79. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  80. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  81. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  82. package/dist/pipeline/compiler/task-bridge.js +0 -92
  83. package/dist/pipeline/expand-tasks.d.ts +0 -232
  84. package/dist/pipeline/expand-tasks.js +0 -467
  85. package/dist/pipeline/generate-configs.d.ts +0 -92
  86. package/dist/pipeline/generate-configs.js +0 -445
  87. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  88. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  89. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  90. package/dist/pipeline/steps/compare-step.js +0 -90
  91. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  92. package/dist/pipeline/steps/eval-step.js +0 -347
  93. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  94. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  95. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  96. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  97. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  98. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  99. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  100. package/dist/pipeline/steps/publish-report-step.js +0 -243
  101. package/dist/pipeline/steps/report-step.d.ts +0 -13
  102. package/dist/pipeline/steps/report-step.js +0 -56
  103. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  104. package/dist/pipeline/steps/update-scores-step.js +0 -42
  105. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  106. package/dist/scripts/agent-behavior-report.js +0 -315
  107. package/dist/scripts/baseline.d.ts +0 -43
  108. package/dist/scripts/baseline.js +0 -267
  109. package/dist/scripts/calculate-scores.d.ts +0 -166
  110. package/dist/scripts/calculate-scores.js +0 -1296
  111. package/dist/scripts/compare.d.ts +0 -22
  112. package/dist/scripts/compare.js +0 -334
  113. package/dist/scripts/coverage-audit.d.ts +0 -44
  114. package/dist/scripts/coverage-audit.js +0 -209
  115. package/dist/scripts/debug-eval.d.ts +0 -19
  116. package/dist/scripts/debug-eval.js +0 -73
  117. package/dist/scripts/discovery-report.d.ts +0 -58
  118. package/dist/scripts/discovery-report.js +0 -250
  119. package/dist/scripts/fetch-docs.d.ts +0 -35
  120. package/dist/scripts/fetch-docs.js +0 -472
  121. package/dist/scripts/generate-configs.d.ts +0 -66
  122. package/dist/scripts/generate-configs.js +0 -459
  123. package/dist/scripts/grader-api.d.ts +0 -27
  124. package/dist/scripts/grader-api.js +0 -206
  125. package/dist/scripts/grader-compare.d.ts +0 -22
  126. package/dist/scripts/grader-compare.js +0 -368
  127. package/dist/scripts/grader-consistency.d.ts +0 -20
  128. package/dist/scripts/grader-consistency.js +0 -313
  129. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  130. package/dist/scripts/grader-sensitivity.js +0 -354
  131. package/dist/scripts/grader-validate.d.ts +0 -19
  132. package/dist/scripts/grader-validate.js +0 -267
  133. package/dist/scripts/measure-retrieval.d.ts +0 -10
  134. package/dist/scripts/measure-retrieval.js +0 -145
  135. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  136. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  137. package/dist/scripts/pipeline.d.ts +0 -76
  138. package/dist/scripts/pipeline.js +0 -1031
  139. package/dist/scripts/pr-comment.d.ts +0 -10
  140. package/dist/scripts/pr-comment.js +0 -510
  141. package/dist/scripts/readiness-report.d.ts +0 -88
  142. package/dist/scripts/readiness-report.js +0 -342
  143. package/dist/scripts/update-quality-scores.d.ts +0 -15
  144. package/dist/scripts/update-quality-scores.js +0 -184
  145. package/dist/scripts/validate-task-sources.d.ts +0 -21
  146. package/dist/scripts/validate-task-sources.js +0 -210
  147. package/dist/scripts/validate.d.ts +0 -13
  148. package/dist/scripts/validate.js +0 -79
  149. package/dist/scripts/webhook-server.d.ts +0 -26
  150. package/dist/scripts/webhook-server.js +0 -147
  151. package/dist/scripts/weekly-digest.d.ts +0 -24
  152. package/dist/scripts/weekly-digest.js +0 -144
  153. package/dist/sinks/format-slack.d.ts +0 -64
  154. package/dist/sinks/format-slack.js +0 -306
  155. package/dist/sinks/slack-sink.d.ts +0 -27
  156. package/dist/sinks/slack-sink.js +0 -78
  157. package/dist/sinks/webhook-sink.d.ts +0 -19
  158. package/dist/sinks/webhook-sink.js +0 -50
  159. package/tasks/.expanded.agentic.yaml +0 -280
  160. package/tasks/.expanded.yaml +0 -565
@@ -1,73 +0,0 @@
1
- /**
2
- * debug-eval.ts
3
- *
4
- * Thin wrapper around `promptfoo eval` that reads DEBUG_EVAL_* environment
5
- * variables and forwards them as native promptfoo filter flags.
6
- *
7
- * Environment variables:
8
- * DEBUG_EVAL=1 — required to enable debug mode
9
- * DEBUG_EVAL_PATTERN=<re> — --filter-pattern (regex on test description)
10
- * DEBUG_EVAL_N=<number> — --filter-first-n (default: 2 when no other filters set)
11
- * DEBUG_EVAL_SAMPLE=<number> — --filter-sample (random N tests)
12
- *
13
- * Usage:
14
- * tsx src/scripts/debug-eval.ts --config promptfooconfig.yaml
15
- * tsx src/scripts/debug-eval.ts --config promptfooconfig.agentic.yaml --no-cache
16
- *
17
- * All extra argv are forwarded to promptfoo eval unchanged.
18
- */
19
- import { execSync } from "child_process";
20
- // ---------------------------------------------------------------------------
21
- // Parse DEBUG_EVAL_* environment
22
- // ---------------------------------------------------------------------------
23
- const debugEnabled = process.env.DEBUG_EVAL === "1";
24
- if (!debugEnabled) {
25
- console.error("⚠ debug-eval.ts called without DEBUG_EVAL=1 — running full eval instead.");
26
- }
27
- const pattern = process.env.DEBUG_EVAL_PATTERN;
28
- const firstN = process.env.DEBUG_EVAL_N;
29
- const sample = process.env.DEBUG_EVAL_SAMPLE;
30
- // ---------------------------------------------------------------------------
31
- // Build filter flags
32
- // ---------------------------------------------------------------------------
33
- const filterFlags = [];
34
- if (debugEnabled) {
35
- if (pattern) {
36
- filterFlags.push(`--filter-pattern`, `'${pattern}'`);
37
- }
38
- if (sample) {
39
- filterFlags.push(`--filter-sample`, sample);
40
- }
41
- if (firstN) {
42
- filterFlags.push(`--filter-first-n`, firstN);
43
- }
44
- // Default: if no filters specified, limit to first 2 tests for speed
45
- if (filterFlags.length === 0) {
46
- filterFlags.push(`--filter-first-n`, "2");
47
- }
48
- }
49
- // ---------------------------------------------------------------------------
50
- // Forward to promptfoo eval
51
- // ---------------------------------------------------------------------------
52
- // argv[0] = node, argv[1] = this script, argv[2..] = user args
53
- const passthroughArgs = process.argv.slice(2);
54
- const allArgs = ["eval", ...passthroughArgs, ...filterFlags];
55
- const cmd = `promptfoo ${allArgs.join(" ")}`;
56
- if (debugEnabled) {
57
- console.log(`\n🐛 Debug mode enabled`);
58
- console.log(` Filters: ${filterFlags.join(" ") || "(default: first 2)"}`);
59
- console.log(` Command: ${cmd}\n`);
60
- }
61
- try {
62
- execSync(cmd, {
63
- env: process.env,
64
- stdio: "inherit",
65
- });
66
- }
67
- catch (err) {
68
- // promptfoo exits non-zero when assertions fail — that's expected
69
- const code = err !== null && typeof err === "object" && "status" in err
70
- ? err.status
71
- : 1;
72
- process.exit(code);
73
- }
@@ -1,58 +0,0 @@
1
- /**
2
- * discovery-report.ts
3
- *
4
- * Generates an agent discoverability report from agentic mode retrieval
5
- * metrics. Reads score-summary.json (which contains `retrievalMetrics`
6
- * from agentic evaluation) and produces a markdown report showing:
7
- *
8
- * - Retrieval summary (recall, precision, F1)
9
- * - Per-area retrieval breakdown
10
- * - Invisible documents (never retrieved by any task)
11
- * - Recommendations for improving discoverability
12
- *
13
- * Phase 5c of the Scenario Matrix implementation (Scenarios 4.1 and 4.2).
14
- *
15
- * Usage:
16
- * tsx src/scripts/discovery-report.ts # stdout
17
- * tsx src/scripts/discovery-report.ts --area groq # filter by area
18
- * tsx src/scripts/discovery-report.ts --output report.md
19
- *
20
- * @see docs/design-docs/retrieval-metrics.md
21
- */
22
- import "dotenv/config";
23
- import type { AreaRetrievalMetrics, RetrievalMetrics, ScoreSummary } from "../pipeline/types.js";
24
- export interface DiscoveryReport {
25
- /** All areas included in the report (after filtering) */
26
- areas: AreaRetrievalMetrics[];
27
- /** Base URL from the score summary source config */
28
- baseUrl: string | undefined;
29
- /** Document slugs that were never retrieved by any task */
30
- invisibleDocs: InvisibleDoc[];
31
- /** Overall retrieval metrics */
32
- overall: RetrievalMetrics["overall"];
33
- /** Actionable recommendations */
34
- recommendations: string[];
35
- /** ISO timestamp of the source evaluation */
36
- timestamp: string;
37
- /** Total canonical docs across included areas */
38
- totalCanonicalDocs: number;
39
- /** Total hits (canonical docs successfully retrieved) */
40
- totalHits: number;
41
- }
42
- export interface InvisibleDoc {
43
- /** Tasks that reference this document via canonical_docs */
44
- referencedBy: string[];
45
- /** The document slug */
46
- slug: string;
47
- }
48
- /**
49
- * Format a discovery report as markdown.
50
- */
51
- export declare function formatDiscoveryMarkdown(report: DiscoveryReport): string;
52
- /**
53
- * Generate a structured discovery report from a score summary.
54
- *
55
- * @param summary - Parsed score-summary.json
56
- * @param areaFilter - Optional area names to include (all if empty)
57
- */
58
- export declare function generateDiscoveryReport(summary: ScoreSummary, areaFilter?: string[]): DiscoveryReport;
@@ -1,250 +0,0 @@
1
- /**
2
- * discovery-report.ts
3
- *
4
- * Generates an agent discoverability report from agentic mode retrieval
5
- * metrics. Reads score-summary.json (which contains `retrievalMetrics`
6
- * from agentic evaluation) and produces a markdown report showing:
7
- *
8
- * - Retrieval summary (recall, precision, F1)
9
- * - Per-area retrieval breakdown
10
- * - Invisible documents (never retrieved by any task)
11
- * - Recommendations for improving discoverability
12
- *
13
- * Phase 5c of the Scenario Matrix implementation (Scenarios 4.1 and 4.2).
14
- *
15
- * Usage:
16
- * tsx src/scripts/discovery-report.ts # stdout
17
- * tsx src/scripts/discovery-report.ts --area groq # filter by area
18
- * tsx src/scripts/discovery-report.ts --output report.md
19
- *
20
- * @see docs/design-docs/retrieval-metrics.md
21
- */
22
- // oxlint-disable-next-line import/no-unassigned-import -- side-effect: loads .env into process.env
23
- import "dotenv/config";
24
- import { existsSync, readFileSync, writeFileSync } from "node:fs";
25
- import { dirname, join, resolve } from "node:path";
26
- import { fileURLToPath } from "node:url";
27
- const __dirname = dirname(fileURLToPath(import.meta.url));
28
- const ROOT = resolve(__dirname, "..", "..");
29
- // ---------------------------------------------------------------------------
30
- // Core logic (exported for testing)
31
- // ---------------------------------------------------------------------------
32
- /**
33
- * Format a discovery report as markdown.
34
- */
35
- export function formatDiscoveryMarkdown(report) {
36
- const lines = [];
37
- // Header
38
- lines.push("## 🔍 Agent Discoverability Report");
39
- lines.push("");
40
- if (report.baseUrl) {
41
- lines.push(`**Base URL:** ${report.baseUrl}`);
42
- }
43
- lines.push("**Mode:** Agentic");
44
- lines.push("");
45
- // Retrieval summary table
46
- lines.push("### Retrieval Summary");
47
- lines.push("");
48
- lines.push("| Metric | Value |");
49
- lines.push("|---|---|");
50
- lines.push(`| Recall (canonical docs found) | ${pct(report.overall.avgRecall)} (${report.totalHits}/${report.totalCanonicalDocs}) |`);
51
- lines.push(`| Precision (relevant docs fetched) | ${pct(report.overall.avgPrecision)} |`);
52
- lines.push(`| F1 Score | ${report.overall.avgF1.toFixed(2)} |`);
53
- lines.push(`| Invisible docs | ${report.invisibleDocs.length} |`);
54
- lines.push("");
55
- // Per-area breakdown
56
- if (report.areas.length > 0) {
57
- lines.push("### Per-Area Breakdown");
58
- lines.push("");
59
- lines.push("| Area | Recall | Precision | F1 | Tasks |");
60
- lines.push("|---|---|---|---|---|");
61
- for (const area of sortedAreas(report.areas)) {
62
- lines.push(`| ${area.area} | ${pct(area.avgRecall)} | ${pct(area.avgPrecision)} | ${area.avgF1.toFixed(2)} | ${area.taskCount} |`);
63
- }
64
- lines.push("");
65
- }
66
- // Invisible documents
67
- if (report.invisibleDocs.length > 0) {
68
- lines.push("### Invisible Documents (never retrieved by any task)");
69
- lines.push("");
70
- for (const doc of report.invisibleDocs) {
71
- const refs = doc.referencedBy.join(", ");
72
- lines.push(`- \`${doc.slug}\` — referenced by ${refs}`);
73
- }
74
- lines.push("");
75
- }
76
- // Recommendations
77
- if (report.recommendations.length > 0) {
78
- lines.push("### Recommendations");
79
- lines.push("");
80
- for (let i = 0; i < report.recommendations.length; i++) {
81
- lines.push(`${i + 1}. ${report.recommendations[i]}`);
82
- }
83
- lines.push("");
84
- }
85
- return lines.join("\n");
86
- }
87
- /**
88
- * Generate a structured discovery report from a score summary.
89
- *
90
- * @param summary - Parsed score-summary.json
91
- * @param areaFilter - Optional area names to include (all if empty)
92
- */
93
- export function generateDiscoveryReport(summary, areaFilter) {
94
- const metrics = summary.retrievalMetrics;
95
- if (!metrics) {
96
- throw new Error("score-summary.json does not contain retrievalMetrics. " +
97
- "Run an agentic evaluation first: pnpm pipeline -- --mode agentic");
98
- }
99
- // Apply area filter
100
- const areas = areaFilter && areaFilter.length > 0
101
- ? metrics.areas.filter((a) => areaFilter.includes(a.area))
102
- : metrics.areas;
103
- if (areaFilter && areaFilter.length > 0 && areas.length === 0) {
104
- throw new Error(`No retrieval data found for area(s): ${areaFilter.join(", ")}. ` +
105
- `Available areas: ${metrics.areas.map((a) => a.area).join(", ")}`);
106
- }
107
- // Recompute overall metrics for filtered areas
108
- const overall = areas.length === metrics.areas.length
109
- ? metrics.overall
110
- : computeOverall(areas);
111
- // Build invisible docs list with task references
112
- const invisibleDocs = buildInvisibleDocs(areas);
113
- // Compute totals for the summary table
114
- const allTasks = areas.flatMap((a) => a.tasks);
115
- const allExpected = new Set(allTasks.flatMap((t) => t.expected));
116
- const allHits = new Set(allTasks.flatMap((t) => t.hits));
117
- const totalCanonicalDocs = allExpected.size;
118
- const totalHits = allHits.size;
119
- // Generate recommendations
120
- const recommendations = generateRecommendations(invisibleDocs, areas, overall);
121
- return {
122
- areas,
123
- baseUrl: summary.source?.baseUrl,
124
- invisibleDocs,
125
- overall,
126
- recommendations,
127
- timestamp: summary.timestamp,
128
- totalCanonicalDocs,
129
- totalHits,
130
- };
131
- }
132
- // ---------------------------------------------------------------------------
133
- // Helpers (alphabetical for perfectionist/sort-modules)
134
- // ---------------------------------------------------------------------------
135
- function buildInvisibleDocs(areas) {
136
- // Collect all invisible slugs and map them to the tasks that reference them
137
- const slugToTasks = new Map();
138
- for (const area of areas) {
139
- for (const task of area.tasks) {
140
- for (const slug of task.missed) {
141
- // Check if this slug is globally invisible (never retrieved by ANY task)
142
- const isGloballyInvisible = areas.every((a) => a.tasks.every((t) => !t.retrieved.includes(slug)));
143
- if (isGloballyInvisible) {
144
- if (!slugToTasks.has(slug)) {
145
- slugToTasks.set(slug, new Set());
146
- }
147
- slugToTasks.get(slug).add(task.taskId);
148
- }
149
- }
150
- }
151
- }
152
- return [...slugToTasks.entries()]
153
- .map(([slug, tasks]) => ({
154
- referencedBy: [...tasks].sort(),
155
- slug,
156
- }))
157
- .sort((a, b) => b.referencedBy.length - a.referencedBy.length);
158
- }
159
- function computeOverall(areas) {
160
- if (areas.length === 0) {
161
- return { avgF1: 0, avgPrecision: 0, avgRecall: 0 };
162
- }
163
- // Weight by task count for fair averaging
164
- const totalTasks = areas.reduce((s, a) => s + a.taskCount, 0);
165
- if (totalTasks === 0) {
166
- return { avgF1: 0, avgPrecision: 0, avgRecall: 0 };
167
- }
168
- const avgRecall = areas.reduce((s, a) => s + a.avgRecall * a.taskCount, 0) / totalTasks;
169
- const avgPrecision = areas.reduce((s, a) => s + a.avgPrecision * a.taskCount, 0) / totalTasks;
170
- const avgF1 = areas.reduce((s, a) => s + a.avgF1 * a.taskCount, 0) / totalTasks;
171
- return { avgF1, avgPrecision, avgRecall };
172
- }
173
- function generateRecommendations(invisibleDocs, areas, overall) {
174
- const recs = [];
175
- // Recommend adding invisible docs to llms.txt
176
- const highImpactInvisible = invisibleDocs.filter((d) => d.referencedBy.length > 0);
177
- for (const doc of highImpactInvisible.slice(0, 5)) {
178
- const taskWord = doc.referencedBy.length === 1 ? "task" : "tasks";
179
- recs.push(`Add \`${doc.slug}\` to llms.txt (referenced by ${doc.referencedBy.length} ${taskWord})`);
180
- }
181
- // Recommend cross-linking for invisible docs
182
- if (invisibleDocs.length > 0) {
183
- recs.push(`Improve cross-linking to ${invisibleDocs.length} invisible document${invisibleDocs.length === 1 ? "" : "s"}`);
184
- }
185
- // Flag low-recall areas
186
- const lowRecallAreas = areas.filter((a) => a.avgRecall < 0.5);
187
- for (const area of lowRecallAreas) {
188
- recs.push(`Investigate low recall in \`${area.area}\` (${pct(area.avgRecall)}) — agents miss most canonical docs`);
189
- }
190
- // Flag low-precision areas
191
- const lowPrecisionAreas = areas.filter((a) => a.avgPrecision < 0.5);
192
- for (const area of lowPrecisionAreas) {
193
- recs.push(`Review search relevance for \`${area.area}\` (precision ${pct(area.avgPrecision)}) — agents fetch many irrelevant docs`);
194
- }
195
- // Overall recommendation
196
- if (overall.avgF1 < 0.6) {
197
- recs.push("Overall F1 is below 0.60 — consider a documentation restructure for agent accessibility");
198
- }
199
- return recs;
200
- }
201
- function main() {
202
- const { areaFilter, output, summaryPath } = parseArgs(process.argv);
203
- if (!existsSync(summaryPath)) {
204
- console.error(`❌ Score summary not found: ${summaryPath}`);
205
- console.error("Run an agentic evaluation first: pnpm pipeline -- --mode agentic");
206
- process.exit(1);
207
- }
208
- const summary = JSON.parse(readFileSync(summaryPath, "utf-8"));
209
- const report = generateDiscoveryReport(summary, areaFilter.length > 0 ? areaFilter : undefined);
210
- const markdown = formatDiscoveryMarkdown(report);
211
- if (output) {
212
- writeFileSync(output, markdown, "utf-8");
213
- console.log(`✅ Discovery report written to ${output}`);
214
- }
215
- else {
216
- console.log(markdown);
217
- }
218
- }
219
- function parseArgs(argv) {
220
- const args = argv.slice(2);
221
- let output;
222
- const areaFilter = [];
223
- let summaryPath = join(ROOT, "results", "latest", "score-summary.json");
224
- for (let i = 0; i < args.length; i++) {
225
- if (args[i] === "--output" && args[i + 1]) {
226
- output = args[++i];
227
- }
228
- else if (args[i] === "--area" && args[i + 1]) {
229
- areaFilter.push(...args[++i].split(","));
230
- }
231
- else if (args[i] === "--input" && args[i + 1]) {
232
- summaryPath = args[++i];
233
- }
234
- else if (!args[i].startsWith("-")) {
235
- summaryPath = args[i];
236
- }
237
- }
238
- return { areaFilter, output, summaryPath };
239
- }
240
- function pct(value) {
241
- return `${Math.round(value * 100)}%`;
242
- }
243
- function sortedAreas(areas) {
244
- return [...areas].sort((a, b) => a.area.localeCompare(b.area));
245
- }
246
- // Only run when invoked directly
247
- if (process.argv[1]?.endsWith("discovery-report.ts") ||
248
- process.argv[1]?.endsWith("discovery-report.js")) {
249
- main();
250
- }
@@ -1,35 +0,0 @@
1
- /**
2
- * Fetch-docs.ts
3
- *
4
- * Pulls documentation from the Sanity CMS and generates markdown context
5
- * files for use in Promptfoo evaluations. Always produces canonical contexts;
6
- * other outputs are opt-in:
7
- *
8
- * 1. Canonical contexts — one file per evaluation task, containing
9
- * only the manually-annotated "gold" documents for that task (always)
10
- * 2. Feature-area contexts — one file per GROQ feature area query
11
- * (opt-in via --include-feature-areas)
12
- * 3. Full corpus — all articles in one file
13
- * (opt-in via --include-corpus)
14
- */
15
- import "dotenv/config";
16
- /**
17
- * Result of resolving --sanity-document IDs against canonical docs.
18
- *
19
- * Documents specified by ID either replace a canonical doc (if the fetched
20
- * document's slug matches one in the canonical set) or are appended as
21
- * additional context (if the slug is not in the canonical set).
22
- */
23
- export interface DocumentOverlay {
24
- /** Extra formatted content for docs that don't match any canonical slug */
25
- appendedContent: string[];
26
- /** Map from canonical slug → formatted content (replaces the normal fetch) */
27
- replacements: Map<string, string>;
28
- }
29
- /** Result of comparing canonical docs between published and perspective */
30
- export interface ReleaseImpact {
31
- added: string[];
32
- modified: string[];
33
- removed: string[];
34
- unchanged: string[];
35
- }