@sanity/ailf 2.0.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. package/LICENSE +21 -0
  2. package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
  3. package/dist/_vendor/ailf-core/examples/index.js +66 -1
  4. package/dist/agent-harness/assertions-runtime.d.ts +49 -0
  5. package/dist/agent-harness/assertions-runtime.js +138 -0
  6. package/dist/agent-harness/provider.d.ts +58 -0
  7. package/dist/agent-harness/provider.js +104 -0
  8. package/dist/cli.js +0 -0
  9. package/dist/commands/init.js +3 -0
  10. package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
  11. package/dist/orchestration/steps/generate-configs-step.js +35 -2
  12. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +39 -25
  13. package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
  14. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +9 -0
  15. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +28 -85
  16. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
  17. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
  18. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
  19. package/package.json +25 -24
  20. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  21. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  22. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  23. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  24. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  25. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  26. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  27. package/dist/_vendor/ailf-tasks/index.js +0 -16
  28. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  29. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  30. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  31. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  32. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  33. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  34. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  35. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  36. package/dist/agent-observer/test-imports.d.ts +0 -7
  37. package/dist/agent-observer/test-imports.js +0 -185
  38. package/dist/commands/update-quality-scores.d.ts +0 -5
  39. package/dist/commands/update-quality-scores.js +0 -20
  40. package/dist/lib/agent-behavior-report.d.ts +0 -8
  41. package/dist/lib/agent-behavior-report.js +0 -185
  42. package/dist/lib/baseline.d.ts +0 -19
  43. package/dist/lib/baseline.js +0 -153
  44. package/dist/lib/calculate-scores.d.ts +0 -23
  45. package/dist/lib/calculate-scores.js +0 -42
  46. package/dist/lib/compare.d.ts +0 -18
  47. package/dist/lib/compare.js +0 -170
  48. package/dist/lib/coverage-audit.d.ts +0 -4
  49. package/dist/lib/coverage-audit.js +0 -42
  50. package/dist/lib/discovery-report.d.ts +0 -13
  51. package/dist/lib/discovery-report.js +0 -57
  52. package/dist/lib/fetch-docs.d.ts +0 -30
  53. package/dist/lib/fetch-docs.js +0 -171
  54. package/dist/lib/generate-configs.d.ts +0 -25
  55. package/dist/lib/generate-configs.js +0 -42
  56. package/dist/lib/grader-api.d.ts +0 -21
  57. package/dist/lib/grader-api.js +0 -34
  58. package/dist/lib/grader-compare.d.ts +0 -19
  59. package/dist/lib/grader-compare.js +0 -91
  60. package/dist/lib/grader-consistency.d.ts +0 -27
  61. package/dist/lib/grader-consistency.js +0 -79
  62. package/dist/lib/grader-sensitivity.d.ts +0 -19
  63. package/dist/lib/grader-sensitivity.js +0 -75
  64. package/dist/lib/grader-validate.d.ts +0 -19
  65. package/dist/lib/grader-validate.js +0 -78
  66. package/dist/lib/measure-retrieval.d.ts +0 -14
  67. package/dist/lib/measure-retrieval.js +0 -71
  68. package/dist/lib/pr-comment.d.ts +0 -16
  69. package/dist/lib/pr-comment.js +0 -28
  70. package/dist/lib/readiness-report.d.ts +0 -13
  71. package/dist/lib/readiness-report.js +0 -108
  72. package/dist/lib/webhook-server.d.ts +0 -11
  73. package/dist/lib/webhook-server.js +0 -24
  74. package/dist/lib/weekly-digest.d.ts +0 -24
  75. package/dist/lib/weekly-digest.js +0 -148
  76. package/dist/orchestration/env-bridge.d.ts +0 -21
  77. package/dist/orchestration/env-bridge.js +0 -66
  78. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  79. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  80. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  81. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  82. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  83. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  86. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  87. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  88. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  89. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  90. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  91. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  92. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  93. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  94. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  95. package/dist/pipeline/compiler/task-bridge.js +0 -92
  96. package/dist/pipeline/expand-tasks.d.ts +0 -232
  97. package/dist/pipeline/expand-tasks.js +0 -467
  98. package/dist/pipeline/generate-configs.d.ts +0 -92
  99. package/dist/pipeline/generate-configs.js +0 -445
  100. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  101. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  102. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  103. package/dist/pipeline/steps/compare-step.js +0 -90
  104. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  105. package/dist/pipeline/steps/eval-step.js +0 -347
  106. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  107. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  108. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  109. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  110. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  111. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  112. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  113. package/dist/pipeline/steps/publish-report-step.js +0 -243
  114. package/dist/pipeline/steps/report-step.d.ts +0 -13
  115. package/dist/pipeline/steps/report-step.js +0 -56
  116. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  117. package/dist/pipeline/steps/update-scores-step.js +0 -42
  118. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  119. package/dist/scripts/agent-behavior-report.js +0 -315
  120. package/dist/scripts/baseline.d.ts +0 -43
  121. package/dist/scripts/baseline.js +0 -267
  122. package/dist/scripts/calculate-scores.d.ts +0 -166
  123. package/dist/scripts/calculate-scores.js +0 -1296
  124. package/dist/scripts/compare.d.ts +0 -22
  125. package/dist/scripts/compare.js +0 -334
  126. package/dist/scripts/coverage-audit.d.ts +0 -44
  127. package/dist/scripts/coverage-audit.js +0 -209
  128. package/dist/scripts/debug-eval.d.ts +0 -19
  129. package/dist/scripts/debug-eval.js +0 -73
  130. package/dist/scripts/discovery-report.d.ts +0 -58
  131. package/dist/scripts/discovery-report.js +0 -250
  132. package/dist/scripts/fetch-docs.d.ts +0 -35
  133. package/dist/scripts/fetch-docs.js +0 -472
  134. package/dist/scripts/generate-configs.d.ts +0 -66
  135. package/dist/scripts/generate-configs.js +0 -459
  136. package/dist/scripts/grader-api.d.ts +0 -27
  137. package/dist/scripts/grader-api.js +0 -206
  138. package/dist/scripts/grader-compare.d.ts +0 -22
  139. package/dist/scripts/grader-compare.js +0 -368
  140. package/dist/scripts/grader-consistency.d.ts +0 -20
  141. package/dist/scripts/grader-consistency.js +0 -313
  142. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  143. package/dist/scripts/grader-sensitivity.js +0 -354
  144. package/dist/scripts/grader-validate.d.ts +0 -19
  145. package/dist/scripts/grader-validate.js +0 -267
  146. package/dist/scripts/measure-retrieval.d.ts +0 -10
  147. package/dist/scripts/measure-retrieval.js +0 -145
  148. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  149. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  150. package/dist/scripts/pipeline.d.ts +0 -76
  151. package/dist/scripts/pipeline.js +0 -1031
  152. package/dist/scripts/pr-comment.d.ts +0 -10
  153. package/dist/scripts/pr-comment.js +0 -510
  154. package/dist/scripts/readiness-report.d.ts +0 -88
  155. package/dist/scripts/readiness-report.js +0 -342
  156. package/dist/scripts/update-quality-scores.d.ts +0 -15
  157. package/dist/scripts/update-quality-scores.js +0 -184
  158. package/dist/scripts/validate-task-sources.d.ts +0 -21
  159. package/dist/scripts/validate-task-sources.js +0 -210
  160. package/dist/scripts/validate.d.ts +0 -13
  161. package/dist/scripts/validate.js +0 -79
  162. package/dist/scripts/webhook-server.d.ts +0 -26
  163. package/dist/scripts/webhook-server.js +0 -147
  164. package/dist/scripts/weekly-digest.d.ts +0 -24
  165. package/dist/scripts/weekly-digest.js +0 -144
  166. package/dist/sinks/format-slack.d.ts +0 -64
  167. package/dist/sinks/format-slack.js +0 -306
  168. package/dist/sinks/slack-sink.d.ts +0 -27
  169. package/dist/sinks/slack-sink.js +0 -78
  170. package/dist/sinks/webhook-sink.d.ts +0 -19
  171. package/dist/sinks/webhook-sink.js +0 -50
  172. package/tasks/.expanded.agentic.yaml +0 -280
  173. package/tasks/.expanded.yaml +0 -565
@@ -1,1031 +0,0 @@
1
- /**
2
- * pipeline.ts
3
- *
4
- * CLI orchestrator for the modular evaluation pipeline.
5
- * Runs steps in sequence with validation between each.
6
- *
7
- * This is the single entry point for both local and CI evaluation.
8
- * The CI workflow (eval.yml) calls this script, then layers on
9
- * CI-specific post-steps (PR comment posting, artifact upload).
10
- *
11
- * Usage:
12
- * pnpm pipeline # full baseline pipeline
13
- * pnpm pipeline --dry-run # validate only, no execution
14
- * pnpm pipeline --skip-fetch # reuse cached doc contexts
15
- * pnpm pipeline --skip-eval # recalculate from existing results
16
- * pnpm pipeline --mode agentic # run agentic pipeline
17
- * pnpm pipeline --mode observed # run observed pipeline
18
- * pnpm pipeline --source staging # use staging doc source
19
- * pnpm pipeline --debug # run first 2 tests only (fast)
20
- * pnpm pipeline --debug-n 5 # run first 5 tests
21
- * pnpm pipeline --debug-pattern "Blog" # filter by description
22
- * pnpm pipeline --debug-sample 3 # random sample of 3 tests
23
- * pnpm pipeline --no-cache # bypass caching, force re-run
24
- * pnpm pipeline --concurrency 64 # override max parallel API calls
25
- * pnpm pipeline --area groq,frameworks # only evaluate these areas
26
- * pnpm pipeline --task groq-blog-queries # only evaluate this task
27
- * pnpm pipeline --changed-docs groq-introduction,how-queries-work
28
- * # auto-scope to affected tasks
29
- * pnpm pipeline --url https://... # override docs base URL
30
- * pnpm pipeline --sanity-dataset staging # override Sanity dataset
31
- * pnpm pipeline --sanity-project abc123 # override Sanity project ID
32
- * pnpm pipeline --sanity-perspective agent-c7OKTk
33
- * # evaluate a Sanity release
34
- * pnpm pipeline --sanity-document <uuid>
35
- * # evaluate specific document(s)
36
- * pnpm pipeline --sanity-document <uuid> --sanity-documents <uuid>
37
- * # singular and plural aliases work
38
- * pnpm pipeline --header "X-Vercel-Protection-Bypass: <secret>"
39
- * # custom HTTP header (repeatable)
40
- * pnpm pipeline --allowed-origin my-branch.sanity.build
41
- * # sandbox agent to this origin
42
- * pnpm pipeline --before published # run before/after impact evaluation
43
- * pnpm pipeline --before production # before = production source
44
- * pnpm pipeline --before results/baselines/20260310.json # use existing scores
45
- * pnpm pipeline --before latest-baseline # use most recent baseline
46
- * pnpm pipeline --compare # compare scores against latest baseline
47
- * pnpm pipeline --compare --compare-baseline <path> # compare against specific file
48
- * pnpm pipeline --compare --threshold 5 # noise threshold for unchanged (default: 2)
49
- * pnpm pipeline --output /tmp/report.md # write report to specific path
50
- * pnpm pipeline --promptfoo-url <url> # include Promptfoo URL in report
51
- * pnpm pipeline --gap-analysis # run failure mode + impact analysis
52
- * pnpm pipeline --publish # write report to Sanity + fan out to sinks
53
- * pnpm pipeline --publish --publish-tag "daily-2026-03-11" # tag the report
54
- * pnpm pipeline --publish --report-dataset ailf-reports # report store dataset
55
- * pnpm pipeline --publish --report-project abc123 # report store project
56
- *
57
- * Override precedence (highest wins):
58
- * CLI flag (--url, --sanity-dataset, --sanity-project, --allowed-origin)
59
- * → Environment variable (DOC_BASE_URL, SANITY_DATASET, SANITY_PROJECT_ID, DOC_ALLOWED_ORIGIN)
60
- * → config/sources.yaml default value
61
- *
62
- * --header flags are additive and do not override env vars — they are
63
- * always merged with any headers defined in DOC_HEADERS env var.
64
- *
65
- * Environment variable fallbacks (for CI):
66
- * DEBUG_EVAL=1 → --debug
67
- * DEBUG_EVAL_N=2 → --debug-n 2
68
- * DEBUG_EVAL_PATTERN → --debug-pattern
69
- * DEBUG_EVAL_SAMPLE → --debug-sample
70
- * EVAL_FILTER_AREAS → --area
71
- * EVAL_FILTER_TASKS → --task
72
- * EVAL_CHANGED_DOCS → --changed-docs
73
- * AILF_REPORT_DATASET → --report-dataset (report store, not eval)
74
- * AILF_REPORT_PROJECT_ID → --report-project (report store, not eval)
75
- */
76
- import { config as dotenvConfig } from "dotenv";
77
- import { existsSync, readFileSync, writeFileSync } from "fs";
78
- import { load } from "js-yaml";
79
- import { dirname, join, resolve } from "path";
80
- import { fileURLToPath } from "url";
81
- import { createCacheStats } from "../pipeline/cache.js";
82
- import { checkEnvironment } from "../pipeline/checks.js";
83
- import { runCalculateScores } from "../pipeline/steps/calculate-scores-step.js";
84
- import { runCompare } from "../pipeline/steps/compare-step.js";
85
- import { runGraderConsistency } from "../pipeline/steps/grader-consistency-step.js";
86
- import { extractShareUrl, runEval } from "../pipeline/steps/eval-step.js";
87
- import { runFetchDocs } from "../pipeline/steps/fetch-docs-step.js";
88
- import { runGenerateConfigs } from "../pipeline/steps/generate-configs-step.js";
89
- import { runPublishReport } from "../pipeline/steps/publish-report-step.js";
90
- import { runReport } from "../pipeline/steps/report-step.js";
91
- import { assessImpact, buildReverseMapping, } from "../pipeline/reverse-mapping.js";
92
- import { classifyUrls } from "../pipeline/classify-url.js";
93
- import { validateConfiguration } from "../pipeline/validate.js";
94
- const __dirname = dirname(fileURLToPath(import.meta.url));
95
- const ROOT = resolve(__dirname, "..", "..");
96
- // ---------------------------------------------------------------------------
97
- // Load root .env — override shell env vars for consistency with individual
98
- // scripts that use `dotenv -e ../../.env -o --`. The `-o` (override) flag in
99
- // those scripts means .env always wins, so we replicate that here.
100
- // In CI, env vars are set via workflow `env:` blocks, so this is a no-op
101
- // (the .env file doesn't exist in CI).
102
- // ---------------------------------------------------------------------------
103
- const envPath = resolve(ROOT, "..", "..", ".env");
104
- if (existsSync(envPath)) {
105
- dotenvConfig({ override: true, path: envPath });
106
- }
107
- // ---------------------------------------------------------------------------
108
- // CLI argument parsing
109
- // ---------------------------------------------------------------------------
110
- const args = process.argv.slice(2);
111
- function getAllOptions(...names) {
112
- const results = [];
113
- for (const name of names) {
114
- const flag = `--${name}`;
115
- for (let i = 0; i < args.length; i++) {
116
- if (args[i] === flag && i + 1 < args.length) {
117
- results.push(args[i + 1]);
118
- }
119
- }
120
- }
121
- return results;
122
- }
123
- function getFlag(name) {
124
- return args.includes(`--${name}`);
125
- }
126
- function getOption(...names) {
127
- for (const name of names) {
128
- const idx = args.indexOf(`--${name}`);
129
- if (idx !== -1 && idx + 1 < args.length)
130
- return args[idx + 1];
131
- }
132
- return undefined;
133
- }
134
- const dryRun = getFlag("dry-run");
135
- const skipFetch = getFlag("skip-fetch");
136
- const skipEval = getFlag("skip-eval");
137
- const noCache = getFlag("no-cache");
138
- const mode = (getOption("mode") ?? "baseline");
139
- const source = getOption("source");
140
- const outputPath = getOption("output");
141
- const promptfooUrl = getOption("promptfoo-url");
142
- // Debug: CLI flags take precedence, then env vars.
143
- // Any sub-flag (--debug-n, --debug-pattern, --debug-sample) implies --debug.
144
- const debugN = getOption("debug-n") ?? process.env.DEBUG_EVAL_N ?? undefined;
145
- const debugPattern = getOption("debug-pattern") ?? process.env.DEBUG_EVAL_PATTERN ?? undefined;
146
- const debugSample = getOption("debug-sample") ?? process.env.DEBUG_EVAL_SAMPLE ?? undefined;
147
- const debugEnabled = getFlag("debug") ||
148
- process.env.DEBUG_EVAL === "1" ||
149
- debugN !== undefined ||
150
- debugPattern !== undefined ||
151
- debugSample !== undefined;
152
- const debug = debugEnabled
153
- ? {
154
- enabled: true,
155
- firstN: debugN ? parseInt(debugN, 10) : undefined,
156
- pattern: debugPattern,
157
- sample: debugSample ? parseInt(debugSample, 10) : undefined,
158
- }
159
- : undefined;
160
- const concurrencyStr = getOption("concurrency");
161
- const concurrency = concurrencyStr ? parseInt(concurrencyStr, 10) : undefined;
162
- // Scoping: filter to specific areas or tasks
163
- const areaOption = getOption("area");
164
- const taskOption = getOption("task");
165
- // Document-driven scoping: --changed-docs slug1,slug2 or env var
166
- const changedDocsOption = getOption("changed-docs") ?? process.env.EVAL_CHANGED_DOCS ?? undefined;
167
- // Grader consistency: measure grading variance
168
- const graderReplicationsStr = getOption("grader-replications");
169
- const graderReplications = graderReplicationsStr
170
- ? parseInt(graderReplicationsStr, 10)
171
- : undefined;
172
- // Before/after impact evaluation: --before <source>
173
- // When provided, runs the evaluation twice (before + after) and auto-compares.
174
- // Accepts: "published" (Sanity published state), "production" (prod source),
175
- // a path to an existing score-summary.json, or "latest-baseline".
176
- const beforeOption = getOption("before");
177
- // Comparison: compare current scores against a baseline
178
- const compareEnabled = getFlag("compare") || beforeOption !== undefined;
179
- const compareBaseline = getOption("compare-baseline");
180
- const compareThresholdStr = getOption("threshold");
181
- const compareThreshold = compareThresholdStr
182
- ? parseFloat(compareThresholdStr)
183
- : undefined;
184
- // Gap analysis: run failure mode classification + impact estimation (Phase 3)
185
- const gapAnalysisEnabled = getFlag("gap-analysis");
186
- // Readiness report: launch readiness checklist for a specific area (Phase 5b)
187
- const readinessEnabled = getFlag("readiness");
188
- // Discovery report: agent discoverability from agentic retrieval metrics (Phase 5c)
189
- const discoveryReportEnabled = getFlag("discovery-report");
190
- // Publish: write report to Sanity Content Lake + fan out to sinks
191
- const publishEnabled = getFlag("publish") || process.env.AILF_PUBLISH === "1";
192
- const publishTag = getOption("publish-tag");
193
- // Report store: target dataset/project for report persistence
194
- // Independent of SANITY_DATASET/SANITY_PROJECT_ID (which control doc evaluation)
195
- const reportDataset = getOption("report-dataset") ?? process.env.AILF_REPORT_DATASET ?? undefined;
196
- const reportProjectId = getOption("report-project") ?? process.env.AILF_REPORT_PROJECT_ID ?? undefined;
197
- // Search mode: controls web_search tool in agentic mode
198
- // --search open (default), --search origin-only, --search off
199
- const searchMode = getOption("search") ?? process.env.EVAL_SEARCH_MODE ?? "open";
200
- const VALID_SEARCH_MODES = ["open", "origin-only", "off"];
201
- if (!VALID_SEARCH_MODES.includes(searchMode)) {
202
- console.error(`❌ Invalid --search mode "${searchMode}". Must be one of: ${VALID_SEARCH_MODES.join(", ")}`);
203
- process.exit(1);
204
- }
205
- // Pass through to generate-configs via env var
206
- if (searchMode !== "open") {
207
- process.env.EVAL_SEARCH_MODE = searchMode;
208
- }
209
- // Source overrides: --url/--urls, --sanity-dataset, --sanity-project, --sanity-perspective
210
- // CLI flag → env var → config/sources.yaml default
211
- // These set process.env so the interpolation layer in sources.ts picks them up.
212
- const urlArgs = getAllOptions("url", "urls");
213
- const datasetOverride = getOption("sanity-dataset");
214
- const projectIdOverride = getOption("sanity-project");
215
- const perspectiveOverride = getOption("sanity-perspective");
216
- const studioOriginOverride = getOption("sanity-studio-origin");
217
- const sanityDocumentArgs = getAllOptions("sanity-document", "sanity-documents");
218
- // Classify URLs: detect Sanity releases, documents, and direct URLs
219
- if (urlArgs.length > 0) {
220
- const classification = classifyUrls(urlArgs);
221
- // First URL becomes the base URL (used for agentic mode, llms.txt, etc.)
222
- process.env.DOC_BASE_URL = urlArgs[0];
223
- // Merge classified document IDs with explicit --sanity-document args
224
- if (classification.documentIds.length > 0) {
225
- const existing = sanityDocumentArgs.length > 0 ? sanityDocumentArgs : [];
226
- const merged = [...new Set([...existing, ...classification.documentIds])];
227
- process.env.SANITY_DOCUMENT_IDS = merged.join(",");
228
- }
229
- // Use inferred perspective if no explicit one was provided
230
- if (classification.inferredPerspective && !perspectiveOverride) {
231
- process.env.SANITY_PERSPECTIVE = classification.inferredPerspective;
232
- }
233
- // Use inferred studio origin if no explicit one was provided
234
- if (classification.inferredStudioOrigin && !studioOriginOverride) {
235
- process.env.SANITY_STUDIO_ORIGIN = classification.inferredStudioOrigin;
236
- }
237
- // Direct URLs: pass through for baseline mode doc fetching
238
- if (classification.directUrls.length > 0) {
239
- process.env.DOC_DIRECT_URLS = classification.directUrls.join(",");
240
- }
241
- }
242
- if (datasetOverride)
243
- process.env.SANITY_DATASET = datasetOverride;
244
- if (projectIdOverride)
245
- process.env.SANITY_PROJECT_ID = projectIdOverride;
246
- if (perspectiveOverride)
247
- process.env.SANITY_PERSPECTIVE = perspectiveOverride;
248
- if (studioOriginOverride)
249
- process.env.SANITY_STUDIO_ORIGIN = studioOriginOverride;
250
- if (sanityDocumentArgs.length > 0)
251
- process.env.SANITY_DOCUMENT_IDS = sanityDocumentArgs.join(",");
252
- // Custom HTTP headers: --header "Key: Value" (repeatable, like curl -H)
253
- // Parsed into DOC_HEADERS env var as JSON for sources.ts to pick up.
254
- const headerArgs = getAllOptions("header", "headers");
255
- if (headerArgs.length > 0) {
256
- const headers = {};
257
- for (const h of headerArgs) {
258
- const colonIdx = h.indexOf(":");
259
- if (colonIdx === -1) {
260
- console.error(`❌ Invalid header format: "${h}". Expected "Key: Value".`);
261
- process.exit(1);
262
- }
263
- const key = h.slice(0, colonIdx).trim();
264
- const value = h.slice(colonIdx + 1).trim();
265
- if (!key) {
266
- console.error(`❌ Invalid header: empty key in "${h}"`);
267
- process.exit(1);
268
- }
269
- headers[key] = value;
270
- }
271
- process.env.DOC_HEADERS = JSON.stringify(headers);
272
- }
273
- // Allowed origin sandboxing: --allowed-origin my-branch.sanity.build (repeatable)
274
- // Restricts agentic providers to only fetch pages from these origins.
275
- // Supports glob patterns: --allowed-origin "*.sanity.build"
276
- // Plural alias: --allowed-origins
277
- const allowedOriginArgs = getAllOptions("allowed-origin", "allowed-origins");
278
- // Auto-infer allowed origin from --url when no explicit origins are provided.
279
- // This makes agentic isolation the default when a --url is passed.
280
- if (urlArgs.length > 0 && allowedOriginArgs.length === 0) {
281
- try {
282
- const hostname = new URL(urlArgs[0]).hostname.replace(/^www\./, "");
283
- allowedOriginArgs.push(hostname);
284
- }
285
- catch {
286
- // Invalid URL — will be caught later in validation
287
- }
288
- }
289
- // Set env var for downstream consumption (comma-separated)
290
- // DOC_ALLOWED_ORIGINS (plural) is the canonical env var.
291
- // DOC_ALLOWED_ORIGIN (singular) is kept for backward compatibility.
292
- if (allowedOriginArgs.length > 0) {
293
- process.env.DOC_ALLOWED_ORIGINS = allowedOriginArgs.join(",");
294
- }
295
- // Document-driven scoping: resolve changed docs to areas and tasks
296
- // This runs before area/task env var assignment so it can set defaults
297
- // that are then overridden or intersected by explicit --area/--task flags.
298
- let impactSummary;
299
- if (changedDocsOption) {
300
- const changedSlugs = changedDocsOption
301
- .split(",")
302
- .map((s) => s.trim())
303
- .filter(Boolean);
304
- if (changedSlugs.length > 0) {
305
- const reverseMapping = buildReverseMapping(ROOT);
306
- impactSummary = assessImpact(changedSlugs, reverseMapping);
307
- if (impactSummary.areas.length === 0) {
308
- console.warn(`\n⚠️ No evaluation tasks reference any of the changed documents:`);
309
- for (const slug of changedSlugs) {
310
- console.warn(` - ${slug}`);
311
- }
312
- console.warn(`\n Score impact cannot be measured for these documents.\n`);
313
- }
314
- else {
315
- // Apply document-driven scoping: set areas and tasks from impact
316
- // If --area is also provided, intersect (both filters must match)
317
- if (areaOption) {
318
- const explicitAreas = new Set(areaOption.split(",").map((s) => s.trim()));
319
- const intersected = impactSummary.areas.filter((a) => explicitAreas.has(a));
320
- if (intersected.length > 0) {
321
- process.env.EVAL_FILTER_AREAS = intersected.join(",");
322
- }
323
- else {
324
- console.warn(`\n⚠️ No overlap between --area (${areaOption}) and affected areas (${impactSummary.areas.join(", ")})`);
325
- console.warn(` Running with --area filter only (${areaOption})\n`);
326
- }
327
- }
328
- else {
329
- process.env.EVAL_FILTER_AREAS = impactSummary.areas.join(",");
330
- }
331
- // If --task is also provided, intersect
332
- if (taskOption) {
333
- const explicitTasks = new Set(taskOption.split(",").map((s) => s.trim()));
334
- const intersected = impactSummary.taskIds.filter((t) => explicitTasks.has(t));
335
- if (intersected.length > 0) {
336
- process.env.EVAL_FILTER_TASKS = intersected.join(",");
337
- }
338
- // If no overlap, let the explicit --task filter win
339
- }
340
- else {
341
- process.env.EVAL_FILTER_TASKS = impactSummary.taskIds.join(",");
342
- }
343
- }
344
- }
345
- }
346
- // Area/task scoping: passed via env vars to generate-configs subprocess
347
- // Explicit --area/--task flags override document-driven scoping (if not already set above)
348
- if (areaOption && !process.env.EVAL_FILTER_AREAS)
349
- process.env.EVAL_FILTER_AREAS = areaOption;
350
- if (taskOption && !process.env.EVAL_FILTER_TASKS)
351
- process.env.EVAL_FILTER_TASKS = taskOption;
352
- // Validate mode
353
- const VALID_MODES = ["baseline", "observed", "agentic"];
354
- if (!VALID_MODES.includes(mode)) {
355
- console.error(`❌ Invalid mode "${mode}". Must be one of: ${VALID_MODES.join(", ")}`);
356
- process.exit(1);
357
- }
358
- // Export mode for downstream consumption (calculate-scores source verification)
359
- process.env.EVAL_MODE = mode;
360
- // ---------------------------------------------------------------------------
361
- // Promptfoo share URL extraction
362
- // ---------------------------------------------------------------------------
363
- function buildResult(pipelineStart, steps, validation, cacheStats, resolvedPromptfooUrl) {
364
- const durationMs = Date.now() - pipelineStart;
365
- const success = Object.values(steps).every((s) => s.status === "success" || s.status === "skipped");
366
- return {
367
- durationMs,
368
- promptfooUrl: resolvedPromptfooUrl,
369
- steps,
370
- success,
371
- validation,
372
- ...(cacheStats && {
373
- cache: {
374
- hits: cacheStats.hits,
375
- misses: cacheStats.misses,
376
- skipped: cacheStats.skipped,
377
- total: cacheStats.total,
378
- },
379
- }),
380
- };
381
- }
382
- // ---------------------------------------------------------------------------
383
- // Formatting helpers
384
- // ---------------------------------------------------------------------------
385
- function formatDuration(ms) {
386
- if (ms < 1000)
387
- return `${ms}ms`;
388
- if (ms < 60000)
389
- return `${(ms / 1000).toFixed(1)}s`;
390
- const min = Math.floor(ms / 60000);
391
- const sec = Math.round((ms % 60000) / 1000);
392
- return `${min}m ${sec}s`;
393
- }
394
- function printStepResult(name, result) {
395
- const icon = stepIcon(result);
396
- switch (result.status) {
397
- case "failed":
398
- console.log(` ${icon} ${name} (${formatDuration(result.durationMs)}) — ${result.error}`);
399
- break;
400
- case "skipped":
401
- console.log(` ${icon} ${name} — ${result.reason}`);
402
- break;
403
- case "success":
404
- console.log(` ${icon} ${name} (${formatDuration(result.durationMs)}) — ${result.summary}`);
405
- break;
406
- }
407
- }
408
- // ---------------------------------------------------------------------------
409
- // Pipeline execution
410
- // ---------------------------------------------------------------------------
411
/**
 * Run the evaluation pipeline from configuration validation through the
 * optional reporting steps.
 *
 * Reads its options from variables defined outside this function (`mode`,
 * `source`, `dryRun`, `skipFetch`, `skipEval`, `noCache`, ...). Prints a
 * configuration banner, then runs the steps in order. Each step's result is
 * stored in `steps` under its name.
 *
 * Fatal steps (fetch-docs, generate-configs, eval, grader-consistency,
 * calculate-scores) return early via `buildResult` when they fail.
 * Gap analysis and publish failures only emit a warning. Report, compare,
 * readiness and discovery results are recorded without an early return.
 *
 * @returns {Promise<object>} On validation failure or dry run, an object with
 *   `durationMs`, `steps`, `success` and `validation`; otherwise whatever
 *   `buildResult` returns.
 */
async function runPipeline() {
    const pipelineStart = Date.now();
    const steps = {};
    console.log("=== ai-literacy-framework — Evaluation Pipeline ===\n");
    const cacheStats = createCacheStats();
    console.log(` Mode: ${mode}`);
    console.log(` Source: ${source ?? "default (production)"}`);
    console.log(` Dry run: ${dryRun}`);
    console.log(` Skip fetch: ${skipFetch}`);
    console.log(` Skip eval: ${skipEval}`);
    console.log(` Cache: ${noCache ? "disabled (--no-cache)" : "enabled"}`);
    if (urlArgs.length > 0) {
        console.log(` URLs: ${urlArgs.length} URL(s)`);
        for (const u of urlArgs)
            console.log(` ${u}`);
    }
    if (datasetOverride) {
        console.log(` Dataset: ${datasetOverride}`);
    }
    if (projectIdOverride) {
        console.log(` Project ID: ${projectIdOverride}`);
    }
    if (perspectiveOverride) {
        console.log(` Perspective: ${perspectiveOverride}`);
    }
    if (studioOriginOverride) {
        console.log(` Studio: ${studioOriginOverride}`);
    }
    if (sanityDocumentArgs.length > 0) {
        console.log(` Documents: ${sanityDocumentArgs.length} document(s)`);
        for (const d of sanityDocumentArgs)
            console.log(` ${d}`);
    }
    if (allowedOriginArgs.length > 0) {
        const originDisplay = allowedOriginArgs.join(", ");
        // Origins count as auto-inferred when URLs were given but no explicit
        // --allowed-origin(s) flag appears on the command line.
        const autoInferred = urlArgs.length > 0 &&
            !args.some((a) => a === "--allowed-origin" || a === "--allowed-origins");
        console.log(` Origin: ${originDisplay}${autoInferred ? " (auto-inferred)" : " (sandbox)"}`);
    }
    if (searchMode !== "open") {
        const searchDesc = searchMode === "off" ? "disabled" : "results filtered to allowed origins";
        console.log(` Search: ${searchMode} (${searchDesc})`);
    }
    if (headerArgs.length > 0) {
        console.log(` Headers: ${headerArgs.length} custom header(s)`);
    }
    if (outputPath) {
        console.log(` Output: ${outputPath}`);
    }
    if (debug) {
        const filters = [
            debug.firstN ? `first ${debug.firstN}` : null,
            debug.pattern ? `pattern "${debug.pattern}"` : null,
            debug.sample ? `sample ${debug.sample}` : null,
        ].filter(Boolean);
        console.log(` Debug: ${filters.length > 0 ? filters.join(", ") : "first 2 (default)"}`);
    }
    if (concurrency) {
        console.log(` Concurrency: ${concurrency}`);
    }
    if (changedDocsOption) {
        const slugs = changedDocsOption
            .split(",")
            .map((s) => s.trim())
            .filter(Boolean);
        console.log(` Changed: ${slugs.length} document(s)`);
        for (const s of slugs)
            console.log(` ${s}`);
        if (impactSummary) {
            if (impactSummary.areas.length > 0) {
                console.log(` Affected: ${impactSummary.taskIds.length} task(s) in ${impactSummary.areas.length} area(s)`);
                console.log(` areas: ${impactSummary.areas.join(", ")}`);
                console.log(` tasks: ${impactSummary.taskIds.join(", ")}`);
            }
            if (impactSummary.unmatchedSlugs.length > 0) {
                console.log(` Unmatched: ${impactSummary.unmatchedSlugs.length} document(s) not in any task`);
                for (const s of impactSummary.unmatchedSlugs)
                    console.log(` ${s}`);
            }
        }
    }
    if (areaOption) {
        console.log(` Areas: ${process.env.EVAL_FILTER_AREAS ?? areaOption}`);
    }
    else if (impactSummary?.areas.length) {
        console.log(` Areas: ${impactSummary.areas.join(", ")} (from changed docs)`);
    }
    if (taskOption) {
        console.log(` Tasks: ${process.env.EVAL_FILTER_TASKS ?? taskOption}`);
    }
    else if (impactSummary?.taskIds.length) {
        console.log(` Tasks: ${impactSummary.taskIds.join(", ")} (from changed docs)`);
    }
    if (graderReplications) {
        console.log(` Grader: ${graderReplications} replications`);
    }
    if (beforeOption) {
        console.log(` Before: ${beforeOption}`);
    }
    if (compareEnabled) {
        const vsTarget = compareBaseline
            ? ` (vs ${compareBaseline})`
            : beforeOption
                ? ` (vs --before ${beforeOption})`
                : " (vs latest baseline)";
        console.log(` Compare: enabled${vsTarget}`);
        if (compareThreshold !== undefined) {
            console.log(` Threshold: ±${compareThreshold}`);
        }
    }
    if (readinessEnabled) {
        const readinessArea = areaOption ?? process.env.EVAL_FILTER_AREAS;
        console.log(` Readiness: enabled${readinessArea ? ` (area: ${readinessArea})` : " (all areas)"}`);
    }
    if (discoveryReportEnabled) {
        console.log(" Discovery: enabled");
    }
    if (publishEnabled) {
        console.log(` Publish: enabled${publishTag ? ` (tag: "${publishTag}")` : ""}`);
    }
    console.log();
    // -----------------------------------------------------------------------
    // Step 0: Validate configuration
    // -----------------------------------------------------------------------
    console.log("─── Step 0: Validate Configuration ────────────────────────\n");
    const validation = validateConfiguration(ROOT);
    // Environment issues are merged into the configuration validation result;
    // any environment error makes the whole validation invalid.
    const envIssues = checkEnvironment(ROOT);
    validation.issues.push(...envIssues);
    validation.valid =
        validation.valid && envIssues.every((i) => i.severity !== "error");
    const errors = validation.issues.filter((i) => i.severity === "error");
    const warnings = validation.issues.filter((i) => i.severity === "warning");
    if (warnings.length > 0) {
        for (const w of warnings) {
            console.log(` ⚠️ [${w.source}] ${w.message}`);
        }
        console.log();
    }
    if (errors.length > 0) {
        console.log("❌ Configuration validation failed:\n");
        for (const e of errors) {
            console.log(` ERROR [${e.source}] ${e.message}`);
            if (e.path)
                console.log(` at ${e.path}`);
        }
        return {
            durationMs: Date.now() - pipelineStart,
            steps,
            success: false,
            validation,
        };
    }
    console.log(" ✅ Configuration is valid\n");
    if (dryRun) {
        console.log("─── Dry run complete ────────────────────────────────────────────\n");
        console.log(" Pipeline configuration is valid. No steps were executed.");
        console.log(" Remove --dry-run to execute the full pipeline.\n");
        return {
            durationMs: Date.now() - pipelineStart,
            steps,
            success: true,
            validation,
        };
    }
    // -----------------------------------------------------------------------
    // Step 1: Fetch documentation
    // -----------------------------------------------------------------------
    console.log("─── Step 1: Fetch Documentation ───────────────────────────\n");
    if (skipFetch) {
        steps["fetch-docs"] = { reason: "--skip-fetch", status: "skipped" };
        cacheStats.skipped++;
    }
    else {
        steps["fetch-docs"] = await runFetchDocs(source, noCache);
    }
    trackCacheStats(cacheStats, "fetch-docs", steps["fetch-docs"], noCache);
    printStepResult("fetch-docs", steps["fetch-docs"]);
    console.log();
    if (steps["fetch-docs"].status === "failed") {
        return buildResult(pipelineStart, steps, validation, cacheStats);
    }
    // -----------------------------------------------------------------------
    // Step 2: Generate configs
    // -----------------------------------------------------------------------
    console.log("─── Step 2: Generate Configs ──────────────────────────────\n");
    steps["generate-configs"] = runGenerateConfigs(source, noCache);
    trackCacheStats(cacheStats, "generate-configs", steps["generate-configs"], noCache);
    printStepResult("generate-configs", steps["generate-configs"]);
    console.log();
    if (steps["generate-configs"].status === "failed") {
        return buildResult(pipelineStart, steps, validation, cacheStats);
    }
    // -----------------------------------------------------------------------
    // Step 3: Run evaluation
    // -----------------------------------------------------------------------
    console.log("─── Step 3: Run Evaluation ────────────────────────────────\n");
    if (skipEval) {
        steps["eval"] = { reason: "--skip-eval", status: "skipped" };
        cacheStats.skipped++;
    }
    else {
        const evalResult = await runEval(mode, debug, concurrency, noCache);
        steps["eval"] = evalResult.stepResult;
    }
    trackCacheStats(cacheStats, "eval", steps["eval"], noCache);
    printStepResult("eval", steps["eval"]);
    console.log();
    if (steps["eval"].status === "failed") {
        return buildResult(pipelineStart, steps, validation, cacheStats);
    }
    // -----------------------------------------------------------------------
    // Step 3b: Extract Promptfoo share URL (non-blocking)
    // -----------------------------------------------------------------------
    // Promptfoo auto-shares when PROMPTFOO_API_KEY is set during eval.
    // Extract the URL from the eval results JSON for inclusion in the report.
    let resolvedPromptfooUrl = promptfooUrl;
    if (!resolvedPromptfooUrl && !skipEval) {
        resolvedPromptfooUrl = extractShareUrl(mode);
        if (resolvedPromptfooUrl) {
            console.log(` 📤 Promptfoo results: ${resolvedPromptfooUrl}\n`);
        }
    }
    // -----------------------------------------------------------------------
    // Step 3c: Grader consistency analysis (optional)
    // -----------------------------------------------------------------------
    if (graderReplications) {
        console.log("─── Step 3c: Grader Consistency Analysis ────────────────────\n");
        steps["grader-consistency"] = runGraderConsistency(graderReplications, mode);
        printStepResult("grader-consistency", steps["grader-consistency"]);
        console.log();
        if (steps["grader-consistency"].status === "failed") {
            return buildResult(pipelineStart, steps, validation, cacheStats);
        }
    }
    // -----------------------------------------------------------------------
    // Step 4: Calculate scores
    // -----------------------------------------------------------------------
    console.log("─── Step 4: Calculate Scores ──────────────────────────────\n");
    steps["calculate-scores"] = runCalculateScores(source, mode, noCache);
    trackCacheStats(cacheStats, "calculate-scores", steps["calculate-scores"], noCache);
    printStepResult("calculate-scores", steps["calculate-scores"]);
    console.log();
    if (steps["calculate-scores"].status === "failed") {
        return buildResult(pipelineStart, steps, validation, cacheStats);
    }
    // -----------------------------------------------------------------------
    // Step 4b: Gap analysis (optional, Phase 3)
    // -----------------------------------------------------------------------
    if (gapAnalysisEnabled) {
        console.log("─── Step 4b: Gap Analysis ─────────────────────────────────\n");
        const judgmentsPath = resolve(ROOT, "results", "latest", "grader-judgments.json");
        const scoreSummaryPath = resolve(ROOT, "results", "latest", "score-summary.json");
        if (!existsSync(judgmentsPath)) {
            steps["gap-analysis"] = {
                reason: "No grader-judgments.json — run a full evaluation first",
                status: "skipped",
            };
        }
        else if (!existsSync(scoreSummaryPath)) {
            steps["gap-analysis"] = {
                reason: "No score-summary.json",
                status: "skipped",
            };
        }
        else {
            const gaStart = Date.now();
            try {
                const { buildFailureModeReport, formatFailureModesConsole } = await import("../pipeline/failure-modes.js");
                const { buildGapAnalysisReport, formatGapAnalysisConsole } = await import("../pipeline/gap-analysis.js");
                const judgments = JSON.parse(readFileSync(judgmentsPath, "utf-8"));
                const scoreSummary = JSON.parse(readFileSync(scoreSummaryPath, "utf-8"));
                // Phase 3a: Failure mode classification
                const failureModeReport = buildFailureModeReport(judgments, scoreSummary.scores);
                console.log(formatFailureModesConsole(failureModeReport));
                // Phase 3b: Gap analysis (impact estimation)
                const gapReport = buildGapAnalysisReport(failureModeReport, scoreSummary.scores);
                console.log(formatGapAnalysisConsole(gapReport));
                // Persist reports
                const outDir = resolve(ROOT, "results", "latest");
                writeFileSync(join(outDir, "failure-modes.json"), JSON.stringify(failureModeReport, null, 2));
                writeFileSync(join(outDir, "gap-analysis.json"), JSON.stringify(gapReport, null, 2));
                // Enrich score-summary.json with failure mode data
                const enrichedSummary = {
                    ...scoreSummary,
                    failureModes: failureModeReport,
                };
                writeFileSync(scoreSummaryPath, JSON.stringify(enrichedSummary, null, 2));
                const gapCount = gapReport.gaps.length;
                const classRate = failureModeReport.classificationRate.toFixed(0);
                steps["gap-analysis"] = {
                    durationMs: Date.now() - gaStart,
                    status: "success",
                    summary: `${failureModeReport.totalJudgments} judgments analyzed (${classRate}% classified), ${gapCount} actionable gaps identified`,
                };
            }
            catch (err) {
                steps["gap-analysis"] = {
                    durationMs: Date.now() - gaStart,
                    error: err instanceof Error ? err.message : String(err),
                    status: "failed",
                };
            }
        }
        printStepResult("gap-analysis", steps["gap-analysis"]);
        console.log();
        // Gap analysis failures are non-fatal — it's diagnostic, not blocking
        if (steps["gap-analysis"].status === "failed") {
            console.warn(" ⚠️ Gap analysis failed — continuing without diagnostic data\n");
        }
    }
    // -----------------------------------------------------------------------
    // Step 4c: Publish report to store + sinks (optional)
    // -----------------------------------------------------------------------
    if (publishEnabled) {
        // Banner previously read "Step 4b", colliding with the Gap Analysis banner.
        console.log("─── Step 4c: Publish Report ───────────────────────────────\n");
        steps["publish-report"] = await runPublishReport(pipelineStart, {
            promptfooUrl: resolvedPromptfooUrl,
            reportDataset,
            reportProjectId,
            tag: publishTag,
        });
        printStepResult("publish-report", steps["publish-report"]);
        console.log();
        // Publish failures are non-fatal (P5: local-first)
        if (steps["publish-report"].status === "failed") {
            console.warn(" ⚠️ Publish failed — continuing with local results only\n");
        }
    }
    // -----------------------------------------------------------------------
    // Step 5: Generate report
    // -----------------------------------------------------------------------
    console.log("─── Step 5: Generate Report ───────────────────────────────\n");
    steps["report"] = runReport(outputPath, resolvedPromptfooUrl);
    printStepResult("report", steps["report"]);
    console.log();
    // -----------------------------------------------------------------------
    // Step 5b: Compare against baseline (optional)
    // -----------------------------------------------------------------------
    if (compareEnabled) {
        console.log("─── Step 5b: Compare Against Baseline ─────────────────────\n");
        // Resolve --before option to a baseline file path
        let resolvedBaseline = compareBaseline;
        if (beforeOption && !resolvedBaseline) {
            if (beforeOption === "latest-baseline") {
                // Use the most recent baseline file (compare step handles this by default)
                resolvedBaseline = undefined;
            }
            else if (beforeOption === "published" ||
                beforeOption === "production") {
                // TODO (Phase 2b full): Run before-evaluation with the specified source.
                // For now, fall back to latest baseline and log a message.
                console.log(` ℹ️ --before ${beforeOption}: full before/after orchestration not yet implemented.`);
                console.log(" Falling back to comparison against latest baseline.\n");
                resolvedBaseline = undefined;
            }
            else if (existsSync(beforeOption)) {
                // Treat as a path to a score-summary.json file
                resolvedBaseline = beforeOption;
                console.log(` 📂 Using before-state from file: ${resolvedBaseline}\n`);
            }
            else {
                // Unrecognized value: warn, then compare with no explicit baseline.
                console.warn(` ⚠️ --before "${beforeOption}" is not a recognized option or file path.`);
                console.warn(" Expected: published, production, latest-baseline, or a file path.\n");
            }
        }
        steps["compare"] = runCompare(ROOT, resolvedBaseline, {
            noiseThreshold: compareThreshold,
        });
        printStepResult("compare", steps["compare"]);
        // If --changed-docs was provided and comparison succeeded, run attribution
        if (steps["compare"].status === "success" &&
            changedDocsOption &&
            impactSummary) {
            const comparisonReportPath = resolve(ROOT, "results", "latest", "comparison-report.json");
            if (existsSync(comparisonReportPath)) {
                try {
                    const { attributeChanges, formatAttributionConsole } = await import("../pipeline/attribution.js");
                    const { resolveMappings } = await import("../pipeline/resolve-mappings.js");
                    const comparisonReport = JSON.parse(readFileSync(comparisonReportPath, "utf-8"));
                    const mappings = resolveMappings(ROOT);
                    const changedSlugs = changedDocsOption
                        .split(",")
                        .map((s) => s.trim())
                        .filter(Boolean);
                    const attribution = attributeChanges(comparisonReport, changedSlugs, mappings, comparisonReport.noiseThreshold);
                    // Write attribution to comparison report
                    const enrichedReport = { ...comparisonReport, attribution };
                    writeFileSync(comparisonReportPath, JSON.stringify(enrichedReport, null, 2));
                    // Print attribution to console
                    console.log();
                    console.log(formatAttributionConsole(attribution));
                }
                catch (err) {
                    // Attribution is best-effort: warn and keep the compare result.
                    console.warn(` ⚠️ Attribution failed: ${err instanceof Error ? err.message : String(err)}`);
                }
            }
        }
        console.log();
    }
    // -----------------------------------------------------------------------
    // Step 6b: Readiness report (optional, Phase 5b)
    // -----------------------------------------------------------------------
    if (readinessEnabled) {
        console.log("─── Step 6b: Readiness Report ─────────────────────────────────\n");
        const readinessStart = Date.now();
        try {
            const { formatReadinessMarkdown, generateReadinessReport } = await import("../scripts/readiness-report.js");
            const { ThresholdConfigSchema } = await import("../pipeline/schemas.js");
            const scoreSummaryPath = resolve(ROOT, "results", "latest", "score-summary.json");
            const thresholdsPath = resolve(ROOT, "config", "thresholds.yaml");
            if (!existsSync(scoreSummaryPath)) {
                steps["readiness"] = {
                    durationMs: Date.now() - readinessStart,
                    error: "score-summary.json not found",
                    status: "failed",
                };
            }
            else if (!existsSync(thresholdsPath)) {
                steps["readiness"] = {
                    durationMs: Date.now() - readinessStart,
                    error: "config/thresholds.yaml not found — required for readiness report",
                    status: "failed",
                };
            }
            else {
                const scoreSummary = JSON.parse(readFileSync(scoreSummaryPath, "utf-8"));
                const rawThresholds = load(readFileSync(thresholdsPath, "utf-8"));
                const thresholdConfig = ThresholdConfigSchema.parse(rawThresholds);
                // Load gap analysis if available
                const gapPath = resolve(ROOT, "results", "latest", "gap-analysis.json");
                const gapAnalysis = existsSync(gapPath)
                    ? JSON.parse(readFileSync(gapPath, "utf-8"))
                    : undefined;
                // Determine which areas to generate readiness for
                const readinessAreas = areaOption
                    ? areaOption.split(",").map((s) => s.trim())
                    : process.env.EVAL_FILTER_AREAS
                        ? process.env.EVAL_FILTER_AREAS.split(",").map((s) => s.trim())
                        : scoreSummary.scores.map((s) => s.feature);
                const readinessLines = [];
                // Count passes from the same reports that are printed and
                // written, so the step summary cannot disagree with them.
                let passCount = 0;
                for (const area of readinessAreas) {
                    const areaScore = scoreSummary.scores.find((s) => s.feature === area);
                    if (!areaScore) {
                        console.warn(` ⚠️ Area "${area}" not found in scores — skipping`);
                        continue;
                    }
                    const report = generateReadinessReport({
                        area,
                        gapAnalysis,
                        scoreSummary,
                        thresholdConfig,
                    });
                    if (report.pass) {
                        passCount++;
                    }
                    const md = formatReadinessMarkdown(report);
                    readinessLines.push(md);
                    console.log(md);
                }
                // Write combined readiness output
                if (readinessLines.length > 0) {
                    const readinessOutPath = resolve(ROOT, "results", "latest", "readiness-report.md");
                    writeFileSync(readinessOutPath, readinessLines.join("\n---\n\n"));
                }
                // The denominator includes requested areas that were skipped above.
                steps["readiness"] = {
                    durationMs: Date.now() - readinessStart,
                    status: "success",
                    summary: `${passCount}/${readinessAreas.length} areas ready`,
                };
            }
        }
        catch (err) {
            steps["readiness"] = {
                durationMs: Date.now() - readinessStart,
                error: err instanceof Error ? err.message : String(err),
                status: "failed",
            };
        }
        printStepResult("readiness", steps["readiness"]);
        console.log();
    }
    // -----------------------------------------------------------------------
    // Step 6c: Discovery report (optional, Phase 5c)
    // -----------------------------------------------------------------------
    if (discoveryReportEnabled) {
        console.log("─── Step 6c: Discovery Report ─────────────────────────────────\n");
        const discoveryStart = Date.now();
        try {
            const { formatDiscoveryMarkdown, generateDiscoveryReport } = await import("../scripts/discovery-report.js");
            const scoreSummaryPath = resolve(ROOT, "results", "latest", "score-summary.json");
            if (!existsSync(scoreSummaryPath)) {
                steps["discovery-report"] = {
                    durationMs: Date.now() - discoveryStart,
                    error: "score-summary.json not found",
                    status: "failed",
                };
            }
            else {
                const scoreSummary = JSON.parse(readFileSync(scoreSummaryPath, "utf-8"));
                if (!scoreSummary.retrievalMetrics) {
                    steps["discovery-report"] = {
                        reason: "No retrieval metrics in score summary — run an agentic evaluation first",
                        status: "skipped",
                    };
                }
                else {
                    const areaFilter = areaOption
                        ? areaOption.split(",").map((s) => s.trim())
                        : undefined;
                    const report = generateDiscoveryReport(scoreSummary, areaFilter);
                    const md = formatDiscoveryMarkdown(report);
                    // Write to file
                    const discoveryOutPath = resolve(ROOT, "results", "latest", "discovery-report.md");
                    writeFileSync(discoveryOutPath, md);
                    console.log(md);
                    const invisible = report.invisibleDocs.length;
                    const f1 = report.overall.avgF1.toFixed(2);
                    steps["discovery-report"] = {
                        durationMs: Date.now() - discoveryStart,
                        status: "success",
                        summary: `F1=${f1}, ${invisible} invisible doc${invisible === 1 ? "" : "s"}, ${report.recommendations.length} recommendation${report.recommendations.length === 1 ? "" : "s"}`,
                    };
                }
            }
        }
        catch (err) {
            steps["discovery-report"] = {
                durationMs: Date.now() - discoveryStart,
                error: err instanceof Error ? err.message : String(err),
                status: "failed",
            };
        }
        printStepResult("discovery-report", steps["discovery-report"]);
        console.log();
    }
    return buildResult(pipelineStart, steps, validation, cacheStats, resolvedPromptfooUrl);
}
956
/**
 * Map a step result's status to the emoji shown next to it in console output.
 *
 * @param {{status: string}} result - Step result.
 * @returns {string|undefined} The icon for "success", "skipped" or "failed";
 *   `undefined` for any other status.
 */
function stepIcon(result) {
    const { status } = result;
    if (status === "success") {
        return "✅";
    }
    if (status === "skipped") {
        return "⏭️";
    }
    if (status === "failed") {
        return "❌";
    }
    return undefined;
}
966
/**
 * Record one step's cache outcome in the running cache statistics.
 *
 * - Skipped steps are only labelled "skipped"; the caller has already bumped
 *   `stats.skipped` where the skip was decided.
 * - With caching disabled the step counts as a miss and is labelled "disabled".
 * - Otherwise a successful step whose summary starts with "Skipped (cached)"
 *   is a hit; anything else is a miss.
 *
 * @param {object} stats - Mutable counters (`total`, `hits`, `misses`) plus a
 *   per-step `steps` label map.
 * @param {string} stepName - Key under which the label is stored.
 * @param {object} result - Step result (`status`, and `summary` on success).
 * @param {boolean} noCacheFlag - Whether caching was disabled for this run.
 */
function trackCacheStats(stats, stepName, result, noCacheFlag) {
    const { status } = result;
    if (status === "skipped") {
        stats.steps[stepName] = "skipped";
        return;
    }
    // Every non-skipped step counts toward the total exactly once.
    stats.total++;
    let outcome;
    if (noCacheFlag) {
        outcome = "disabled";
    }
    else {
        const servedFromCache = status === "success" &&
            result.summary.startsWith("Skipped (cached)");
        outcome = servedFromCache ? "hit" : "miss";
    }
    if (outcome === "hit") {
        stats.hits++;
    }
    else {
        stats.misses++;
    }
    stats.steps[stepName] = outcome;
}
993
- // ---------------------------------------------------------------------------
994
- // Main
995
- // ---------------------------------------------------------------------------
996
// Run the pipeline, print a human-readable summary, persist a machine-readable
// result for CI, and exit with 0 on success or 1 on failure.
const result = await runPipeline();
// Print summary
console.log("═══════════════════════════════════════════════════════════════\n");
if (result.success) {
    console.log(` ✅ Pipeline completed successfully (${formatDuration(result.durationMs)})`);
}
else {
    console.log(` ❌ Pipeline failed (${formatDuration(result.durationMs)})`);
}
console.log();
console.log(" Steps:");
for (const [name, step] of Object.entries(result.steps)) {
    printStepResult(name, step);
}
// Cache line is shown only when at least one step was counted.
if (result.cache && result.cache.total > 0) {
    const c = result.cache;
    const parts = [];
    if (c.hits > 0)
        parts.push(`${c.hits} cached`);
    if (c.misses > 0)
        parts.push(`${c.misses} evaluated`);
    if (c.skipped > 0)
        parts.push(`${c.skipped} skipped`);
    console.log(`\n 📦 Cache: ${parts.join(", ")} (${c.total} total steps)`);
}
console.log();
// Write machine-readable result for CI consumption
const resultFile = resolve(ROOT, "results", "latest", "pipeline-result.json");
try {
    writeFileSync(resultFile, JSON.stringify(result, null, 2));
    console.log(` 📄 Pipeline result: ${resultFile}\n`);
}
catch (err) {
    // results/latest/ may not exist yet on first run — not critical, but say
    // so instead of failing silently so CI logs explain a missing file.
    console.warn(` ⚠️ Could not write pipeline result to ${resultFile}: ${err instanceof Error ? err.message : String(err)}\n`);
}
process.exit(result.success ? 0 : 1);