@sanity/ailf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (530) hide show
  1. package/README.md +89 -0
  2. package/bin/ailf.js +64 -0
  3. package/canonical/grader-references/README.md +88 -0
  4. package/canonical/grader-references/groq.yaml +234 -0
  5. package/canonical/grader-references/studio-setup.yaml +275 -0
  6. package/canonical/reference-solutions/.gitkeep +1 -0
  7. package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
  8. package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
  9. package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
  10. package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
  11. package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
  12. package/canonical/reference-solutions/groq/joins-references.ts +300 -0
  13. package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
  14. package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
  15. package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
  16. package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
  17. package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
  18. package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
  19. package/config/bigquery/README.md +74 -0
  20. package/config/bigquery/views/area_scores.sql +87 -0
  21. package/config/bigquery/views/reports.sql +49 -0
  22. package/config/features.yaml +116 -0
  23. package/config/models.yaml +115 -0
  24. package/config/prompts.yaml +75 -0
  25. package/config/rubrics.yaml +62 -0
  26. package/config/schedules.yaml +43 -0
  27. package/config/sinks.yaml +54 -0
  28. package/config/sources.yaml +51 -0
  29. package/config/thresholds.yaml +49 -0
  30. package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
  31. package/dist/_vendor/ailf-core/examples/index.js +285 -0
  32. package/dist/_vendor/ailf-core/index.d.ts +17 -0
  33. package/dist/_vendor/ailf-core/index.js +17 -0
  34. package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
  35. package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
  36. package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
  37. package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
  38. package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
  39. package/dist/_vendor/ailf-core/ports/context.js +14 -0
  40. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
  41. package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
  42. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
  43. package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
  44. package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
  45. package/dist/_vendor/ailf-core/ports/index.js +7 -0
  46. package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
  47. package/dist/_vendor/ailf-core/ports/logger.js +11 -0
  48. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
  49. package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
  50. package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
  51. package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
  52. package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
  53. package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
  54. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
  55. package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
  56. package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
  57. package/dist/_vendor/ailf-core/schemas/index.js +16 -0
  58. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
  59. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
  60. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
  61. package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
  62. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
  63. package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
  64. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
  65. package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
  66. package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
  67. package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
  68. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
  69. package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
  70. package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
  71. package/dist/_vendor/ailf-core/services/index.js +12 -0
  72. package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
  73. package/dist/_vendor/ailf-core/services/scoring.js +222 -0
  74. package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
  75. package/dist/_vendor/ailf-core/types/index.js +21 -0
  76. package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
  77. package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
  78. package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
  79. package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
  80. package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
  81. package/dist/_vendor/ailf-shared/document-ref.js +1 -0
  82. package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
  83. package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
  84. package/dist/_vendor/ailf-shared/index.d.ts +16 -0
  85. package/dist/_vendor/ailf-shared/index.js +16 -0
  86. package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
  87. package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
  88. package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
  89. package/dist/_vendor/ailf-shared/score-grades.js +23 -0
  90. package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
  91. package/dist/adapters/cache/content-lake-cache.js +59 -0
  92. package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
  93. package/dist/adapters/cache/filesystem-cache.js +54 -0
  94. package/dist/adapters/cache/index.d.ts +2 -0
  95. package/dist/adapters/cache/index.js +2 -0
  96. package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
  97. package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
  98. package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
  99. package/dist/adapters/config-sources/file-config-adapter.js +96 -0
  100. package/dist/adapters/config-sources/index.d.ts +2 -0
  101. package/dist/adapters/config-sources/index.js +2 -0
  102. package/dist/adapters/doc-fetchers/index.d.ts +1 -0
  103. package/dist/adapters/doc-fetchers/index.js +1 -0
  104. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
  105. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
  106. package/dist/adapters/eval-runners/index.d.ts +1 -0
  107. package/dist/adapters/eval-runners/index.js +1 -0
  108. package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
  109. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
  110. package/dist/adapters/index.d.ts +12 -0
  111. package/dist/adapters/index.js +12 -0
  112. package/dist/adapters/loggers/console-logger.d.ts +22 -0
  113. package/dist/adapters/loggers/console-logger.js +54 -0
  114. package/dist/adapters/loggers/index.d.ts +9 -0
  115. package/dist/adapters/loggers/index.js +9 -0
  116. package/dist/adapters/loggers/json-logger.d.ts +18 -0
  117. package/dist/adapters/loggers/json-logger.js +33 -0
  118. package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
  119. package/dist/adapters/loggers/quiet-logger.js +30 -0
  120. package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
  121. package/dist/adapters/task-sources/composite-task-source.js +59 -0
  122. package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
  123. package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
  124. package/dist/adapters/task-sources/index.d.ts +7 -0
  125. package/dist/adapters/task-sources/index.js +7 -0
  126. package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
  127. package/dist/adapters/task-sources/repo-schemas.js +234 -0
  128. package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
  129. package/dist/adapters/task-sources/repo-task-source.js +104 -0
  130. package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
  131. package/dist/adapters/task-sources/repo-trigger.js +153 -0
  132. package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
  133. package/dist/adapters/task-sources/repo-validation.js +164 -0
  134. package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
  135. package/dist/adapters/task-sources/yaml-task-source.js +136 -0
  136. package/dist/agent-observer/agentic-provider.d.ts +132 -0
  137. package/dist/agent-observer/agentic-provider.js +983 -0
  138. package/dist/agent-observer/classifier.d.ts +62 -0
  139. package/dist/agent-observer/classifier.js +269 -0
  140. package/dist/agent-observer/index.d.ts +7 -0
  141. package/dist/agent-observer/index.js +4 -0
  142. package/dist/agent-observer/pricing.d.ts +35 -0
  143. package/dist/agent-observer/pricing.js +82 -0
  144. package/dist/agent-observer/provider.d.ts +77 -0
  145. package/dist/agent-observer/provider.js +151 -0
  146. package/dist/agent-observer/proxy.d.ts +91 -0
  147. package/dist/agent-observer/proxy.js +321 -0
  148. package/dist/agent-observer/test-imports.d.ts +7 -0
  149. package/dist/agent-observer/test-imports.js +185 -0
  150. package/dist/agent-observer/types.d.ts +137 -0
  151. package/dist/agent-observer/types.js +16 -0
  152. package/dist/assertions/source-isolation.d.ts +72 -0
  153. package/dist/assertions/source-isolation.js +117 -0
  154. package/dist/cli.d.ts +24 -0
  155. package/dist/cli.js +199 -0
  156. package/dist/commands/agent-report.d.ts +5 -0
  157. package/dist/commands/agent-report.js +69 -0
  158. package/dist/commands/baseline.d.ts +9 -0
  159. package/dist/commands/baseline.js +141 -0
  160. package/dist/commands/cache.d.ts +13 -0
  161. package/dist/commands/cache.js +135 -0
  162. package/dist/commands/calculate-scores.d.ts +8 -0
  163. package/dist/commands/calculate-scores.js +48 -0
  164. package/dist/commands/compare.d.ts +8 -0
  165. package/dist/commands/compare.js +120 -0
  166. package/dist/commands/completion.d.ts +18 -0
  167. package/dist/commands/completion.js +260 -0
  168. package/dist/commands/coverage-audit.d.ts +7 -0
  169. package/dist/commands/coverage-audit.js +40 -0
  170. package/dist/commands/discovery-report.d.ts +10 -0
  171. package/dist/commands/discovery-report.js +44 -0
  172. package/dist/commands/eval.d.ts +9 -0
  173. package/dist/commands/eval.js +35 -0
  174. package/dist/commands/explain-handler.d.ts +34 -0
  175. package/dist/commands/explain-handler.js +719 -0
  176. package/dist/commands/fetch-docs.d.ts +8 -0
  177. package/dist/commands/fetch-docs.js +128 -0
  178. package/dist/commands/generate-configs.d.ts +8 -0
  179. package/dist/commands/generate-configs.js +46 -0
  180. package/dist/commands/grader/index.d.ts +11 -0
  181. package/dist/commands/grader/index.js +118 -0
  182. package/dist/commands/init.d.ts +19 -0
  183. package/dist/commands/init.js +150 -0
  184. package/dist/commands/interactive.d.ts +12 -0
  185. package/dist/commands/interactive.js +238 -0
  186. package/dist/commands/lookup-doc.d.ts +15 -0
  187. package/dist/commands/lookup-doc.js +84 -0
  188. package/dist/commands/measure-retrieval.d.ts +5 -0
  189. package/dist/commands/measure-retrieval.js +65 -0
  190. package/dist/commands/pipeline-action.d.ts +71 -0
  191. package/dist/commands/pipeline-action.js +305 -0
  192. package/dist/commands/pipeline.d.ts +62 -0
  193. package/dist/commands/pipeline.js +53 -0
  194. package/dist/commands/pr-comment.d.ts +8 -0
  195. package/dist/commands/pr-comment.js +47 -0
  196. package/dist/commands/publish.d.ts +26 -0
  197. package/dist/commands/publish.js +253 -0
  198. package/dist/commands/readiness-report.d.ts +10 -0
  199. package/dist/commands/readiness-report.js +104 -0
  200. package/dist/commands/shared/options.d.ts +29 -0
  201. package/dist/commands/shared/options.js +57 -0
  202. package/dist/commands/update-quality-scores.d.ts +5 -0
  203. package/dist/commands/update-quality-scores.js +20 -0
  204. package/dist/commands/validate-tasks.d.ts +16 -0
  205. package/dist/commands/validate-tasks.js +93 -0
  206. package/dist/commands/validate.d.ts +9 -0
  207. package/dist/commands/validate.js +73 -0
  208. package/dist/commands/webhook-server.d.ts +5 -0
  209. package/dist/commands/webhook-server.js +30 -0
  210. package/dist/commands/weekly-digest.d.ts +10 -0
  211. package/dist/commands/weekly-digest.js +104 -0
  212. package/dist/composition-root.d.ts +26 -0
  213. package/dist/composition-root.js +107 -0
  214. package/dist/interpolate.d.ts +26 -0
  215. package/dist/interpolate.js +70 -0
  216. package/dist/job-store.d.ts +104 -0
  217. package/dist/job-store.js +188 -0
  218. package/dist/lib/agent-behavior-report.d.ts +8 -0
  219. package/dist/lib/agent-behavior-report.js +185 -0
  220. package/dist/lib/baseline.d.ts +19 -0
  221. package/dist/lib/baseline.js +153 -0
  222. package/dist/lib/calculate-scores.d.ts +23 -0
  223. package/dist/lib/calculate-scores.js +42 -0
  224. package/dist/lib/compare.d.ts +18 -0
  225. package/dist/lib/compare.js +170 -0
  226. package/dist/lib/coverage-audit.d.ts +4 -0
  227. package/dist/lib/coverage-audit.js +42 -0
  228. package/dist/lib/discovery-report.d.ts +13 -0
  229. package/dist/lib/discovery-report.js +57 -0
  230. package/dist/lib/fetch-docs.d.ts +30 -0
  231. package/dist/lib/fetch-docs.js +171 -0
  232. package/dist/lib/generate-configs.d.ts +25 -0
  233. package/dist/lib/generate-configs.js +42 -0
  234. package/dist/lib/grader-api.d.ts +21 -0
  235. package/dist/lib/grader-api.js +34 -0
  236. package/dist/lib/grader-compare.d.ts +19 -0
  237. package/dist/lib/grader-compare.js +91 -0
  238. package/dist/lib/grader-consistency.d.ts +27 -0
  239. package/dist/lib/grader-consistency.js +79 -0
  240. package/dist/lib/grader-sensitivity.d.ts +19 -0
  241. package/dist/lib/grader-sensitivity.js +75 -0
  242. package/dist/lib/grader-validate.d.ts +19 -0
  243. package/dist/lib/grader-validate.js +78 -0
  244. package/dist/lib/measure-retrieval.d.ts +14 -0
  245. package/dist/lib/measure-retrieval.js +71 -0
  246. package/dist/lib/pr-comment.d.ts +16 -0
  247. package/dist/lib/pr-comment.js +28 -0
  248. package/dist/lib/readiness-report.d.ts +13 -0
  249. package/dist/lib/readiness-report.js +108 -0
  250. package/dist/lib/webhook-server.d.ts +11 -0
  251. package/dist/lib/webhook-server.js +24 -0
  252. package/dist/lib/weekly-digest.d.ts +24 -0
  253. package/dist/lib/weekly-digest.js +148 -0
  254. package/dist/orchestration/build-app-context.d.ts +27 -0
  255. package/dist/orchestration/build-app-context.js +81 -0
  256. package/dist/orchestration/build-step-sequence.d.ts +15 -0
  257. package/dist/orchestration/build-step-sequence.js +84 -0
  258. package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
  259. package/dist/orchestration/config-to-source-overrides.js +28 -0
  260. package/dist/orchestration/env-bridge.d.ts +21 -0
  261. package/dist/orchestration/env-bridge.js +66 -0
  262. package/dist/orchestration/index.d.ts +11 -0
  263. package/dist/orchestration/index.js +11 -0
  264. package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
  265. package/dist/orchestration/pipeline-orchestrator.js +153 -0
  266. package/dist/orchestration/step-runner.d.ts +20 -0
  267. package/dist/orchestration/step-runner.js +88 -0
  268. package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
  269. package/dist/orchestration/steps/calculate-scores-step.js +95 -0
  270. package/dist/orchestration/steps/callback-step.d.ts +24 -0
  271. package/dist/orchestration/steps/callback-step.js +76 -0
  272. package/dist/orchestration/steps/compare-step.d.ts +14 -0
  273. package/dist/orchestration/steps/compare-step.js +92 -0
  274. package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
  275. package/dist/orchestration/steps/discovery-report-step.js +55 -0
  276. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  277. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  278. package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
  279. package/dist/orchestration/steps/fetch-docs-step.js +135 -0
  280. package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
  281. package/dist/orchestration/steps/gap-analysis-step.js +136 -0
  282. package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
  283. package/dist/orchestration/steps/generate-configs-step.js +85 -0
  284. package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
  285. package/dist/orchestration/steps/grader-consistency-step.js +64 -0
  286. package/dist/orchestration/steps/index.d.ts +19 -0
  287. package/dist/orchestration/steps/index.js +19 -0
  288. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
  289. package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
  290. package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
  291. package/dist/orchestration/steps/publish-report-step.js +216 -0
  292. package/dist/orchestration/steps/readiness-step.d.ts +13 -0
  293. package/dist/orchestration/steps/readiness-step.js +91 -0
  294. package/dist/orchestration/steps/report-step.d.ts +12 -0
  295. package/dist/orchestration/steps/report-step.js +49 -0
  296. package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
  297. package/dist/orchestration/steps/run-eval-step.js +195 -0
  298. package/dist/orchestration/steps/validate-step.d.ts +12 -0
  299. package/dist/orchestration/steps/validate-step.js +41 -0
  300. package/dist/pipeline/agent-behavior-report.d.ts +53 -0
  301. package/dist/pipeline/agent-behavior-report.js +132 -0
  302. package/dist/pipeline/attribution.d.ts +47 -0
  303. package/dist/pipeline/attribution.js +226 -0
  304. package/dist/pipeline/baseline.d.ts +37 -0
  305. package/dist/pipeline/baseline.js +141 -0
  306. package/dist/pipeline/cache.d.ts +101 -0
  307. package/dist/pipeline/cache.js +283 -0
  308. package/dist/pipeline/calculate-scores.d.ts +102 -0
  309. package/dist/pipeline/calculate-scores.js +1128 -0
  310. package/dist/pipeline/callback-delivery.d.ts +50 -0
  311. package/dist/pipeline/callback-delivery.js +89 -0
  312. package/dist/pipeline/checks.d.ts +39 -0
  313. package/dist/pipeline/checks.js +280 -0
  314. package/dist/pipeline/classify-url.d.ts +61 -0
  315. package/dist/pipeline/classify-url.js +93 -0
  316. package/dist/pipeline/compare.d.ts +31 -0
  317. package/dist/pipeline/compare.js +208 -0
  318. package/dist/pipeline/coverage-audit.d.ts +39 -0
  319. package/dist/pipeline/coverage-audit.js +165 -0
  320. package/dist/pipeline/degradations.d.ts +85 -0
  321. package/dist/pipeline/degradations.js +242 -0
  322. package/dist/pipeline/discovery-report.d.ts +55 -0
  323. package/dist/pipeline/discovery-report.js +178 -0
  324. package/dist/pipeline/eval-constants.d.ts +68 -0
  325. package/dist/pipeline/eval-constants.js +111 -0
  326. package/dist/pipeline/eval-fingerprint.d.ts +66 -0
  327. package/dist/pipeline/eval-fingerprint.js +175 -0
  328. package/dist/pipeline/expand-tasks.d.ts +220 -0
  329. package/dist/pipeline/expand-tasks.js +421 -0
  330. package/dist/pipeline/failure-modes.d.ts +46 -0
  331. package/dist/pipeline/failure-modes.js +348 -0
  332. package/dist/pipeline/fetch-url-content.d.ts +44 -0
  333. package/dist/pipeline/fetch-url-content.js +93 -0
  334. package/dist/pipeline/gap-analysis.d.ts +48 -0
  335. package/dist/pipeline/gap-analysis.js +231 -0
  336. package/dist/pipeline/generate-configs.d.ts +72 -0
  337. package/dist/pipeline/generate-configs.js +395 -0
  338. package/dist/pipeline/grader-api.d.ts +49 -0
  339. package/dist/pipeline/grader-api.js +200 -0
  340. package/dist/pipeline/grader-compare-runner.d.ts +44 -0
  341. package/dist/pipeline/grader-compare-runner.js +301 -0
  342. package/dist/pipeline/grader-comparison.d.ts +111 -0
  343. package/dist/pipeline/grader-comparison.js +161 -0
  344. package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
  345. package/dist/pipeline/grader-consistency-runner.js +270 -0
  346. package/dist/pipeline/grader-consistency.d.ts +103 -0
  347. package/dist/pipeline/grader-consistency.js +146 -0
  348. package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
  349. package/dist/pipeline/grader-sensitivity-runner.js +282 -0
  350. package/dist/pipeline/grader-sensitivity.d.ts +94 -0
  351. package/dist/pipeline/grader-sensitivity.js +144 -0
  352. package/dist/pipeline/grader-validate-runner.d.ts +38 -0
  353. package/dist/pipeline/grader-validate-runner.js +229 -0
  354. package/dist/pipeline/grader-validation.d.ts +107 -0
  355. package/dist/pipeline/grader-validation.js +169 -0
  356. package/dist/pipeline/map-request-to-config.d.ts +19 -0
  357. package/dist/pipeline/map-request-to-config.js +80 -0
  358. package/dist/pipeline/measure-retrieval.d.ts +59 -0
  359. package/dist/pipeline/measure-retrieval.js +111 -0
  360. package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
  361. package/dist/pipeline/mirror-repo-tasks.js +350 -0
  362. package/dist/pipeline/plan-format.d.ts +33 -0
  363. package/dist/pipeline/plan-format.js +202 -0
  364. package/dist/pipeline/plan.d.ts +169 -0
  365. package/dist/pipeline/plan.js +708 -0
  366. package/dist/pipeline/pr-comment.d.ts +19 -0
  367. package/dist/pipeline/pr-comment.js +502 -0
  368. package/dist/pipeline/probe.d.ts +52 -0
  369. package/dist/pipeline/probe.js +390 -0
  370. package/dist/pipeline/provenance.d.ts +47 -0
  371. package/dist/pipeline/provenance.js +146 -0
  372. package/dist/pipeline/readiness-report.d.ts +87 -0
  373. package/dist/pipeline/readiness-report.js +205 -0
  374. package/dist/pipeline/release-classification.d.ts +54 -0
  375. package/dist/pipeline/release-classification.js +238 -0
  376. package/dist/pipeline/release-report.d.ts +37 -0
  377. package/dist/pipeline/release-report.js +222 -0
  378. package/dist/pipeline/repo-eval-comment.d.ts +37 -0
  379. package/dist/pipeline/repo-eval-comment.js +165 -0
  380. package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
  381. package/dist/pipeline/repo-threshold-evaluator.js +162 -0
  382. package/dist/pipeline/resolve-mappings.d.ts +35 -0
  383. package/dist/pipeline/resolve-mappings.js +72 -0
  384. package/dist/pipeline/retrieval-metrics.d.ts +39 -0
  385. package/dist/pipeline/retrieval-metrics.js +136 -0
  386. package/dist/pipeline/reverse-mapping.d.ts +67 -0
  387. package/dist/pipeline/reverse-mapping.js +88 -0
  388. package/dist/pipeline/schemas.d.ts +9 -0
  389. package/dist/pipeline/schemas.js +9 -0
  390. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  391. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  392. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  393. package/dist/pipeline/steps/compare-step.js +90 -0
  394. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  395. package/dist/pipeline/steps/eval-step.js +347 -0
  396. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  397. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  398. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  399. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  400. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  401. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  402. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  403. package/dist/pipeline/steps/publish-report-step.js +243 -0
  404. package/dist/pipeline/steps/report-step.d.ts +13 -0
  405. package/dist/pipeline/steps/report-step.js +56 -0
  406. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  407. package/dist/pipeline/steps/update-scores-step.js +42 -0
  408. package/dist/pipeline/targeted-loo.d.ts +88 -0
  409. package/dist/pipeline/targeted-loo.js +203 -0
  410. package/dist/pipeline/thresholds.d.ts +27 -0
  411. package/dist/pipeline/thresholds.js +245 -0
  412. package/dist/pipeline/types.d.ts +10 -0
  413. package/dist/pipeline/types.js +10 -0
  414. package/dist/pipeline/validate.d.ts +67 -0
  415. package/dist/pipeline/validate.js +406 -0
  416. package/dist/pipeline/webhook-server.d.ts +37 -0
  417. package/dist/pipeline/webhook-server.js +133 -0
  418. package/dist/report-store.d.ts +84 -0
  419. package/dist/report-store.js +208 -0
  420. package/dist/sanity/client.d.ts +38 -0
  421. package/dist/sanity/client.js +86 -0
  422. package/dist/sanity/portable-text.d.ts +11 -0
  423. package/dist/sanity/portable-text.js +211 -0
  424. package/dist/sanity/queries.d.ts +133 -0
  425. package/dist/sanity/queries.js +300 -0
  426. package/dist/schedules/digest.d.ts +116 -0
  427. package/dist/schedules/digest.js +156 -0
  428. package/dist/schedules/index.d.ts +12 -0
  429. package/dist/schedules/index.js +10 -0
  430. package/dist/schedules/loader.d.ts +31 -0
  431. package/dist/schedules/loader.js +73 -0
  432. package/dist/schedules/schema.d.ts +9 -0
  433. package/dist/schedules/schema.js +9 -0
  434. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  435. package/dist/scripts/agent-behavior-report.js +315 -0
  436. package/dist/scripts/baseline.d.ts +43 -0
  437. package/dist/scripts/baseline.js +267 -0
  438. package/dist/scripts/calculate-scores.d.ts +166 -0
  439. package/dist/scripts/calculate-scores.js +1296 -0
  440. package/dist/scripts/compare.d.ts +22 -0
  441. package/dist/scripts/compare.js +334 -0
  442. package/dist/scripts/coverage-audit.d.ts +44 -0
  443. package/dist/scripts/coverage-audit.js +209 -0
  444. package/dist/scripts/debug-eval.d.ts +19 -0
  445. package/dist/scripts/debug-eval.js +73 -0
  446. package/dist/scripts/discovery-report.d.ts +58 -0
  447. package/dist/scripts/discovery-report.js +250 -0
  448. package/dist/scripts/fetch-docs.d.ts +35 -0
  449. package/dist/scripts/fetch-docs.js +472 -0
  450. package/dist/scripts/generate-configs.d.ts +66 -0
  451. package/dist/scripts/generate-configs.js +459 -0
  452. package/dist/scripts/grader-api.d.ts +27 -0
  453. package/dist/scripts/grader-api.js +206 -0
  454. package/dist/scripts/grader-compare.d.ts +22 -0
  455. package/dist/scripts/grader-compare.js +368 -0
  456. package/dist/scripts/grader-consistency.d.ts +20 -0
  457. package/dist/scripts/grader-consistency.js +313 -0
  458. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  459. package/dist/scripts/grader-sensitivity.js +354 -0
  460. package/dist/scripts/grader-validate.d.ts +19 -0
  461. package/dist/scripts/grader-validate.js +267 -0
  462. package/dist/scripts/measure-retrieval.d.ts +10 -0
  463. package/dist/scripts/measure-retrieval.js +145 -0
  464. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
  465. package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
  466. package/dist/scripts/pipeline.d.ts +76 -0
  467. package/dist/scripts/pipeline.js +1031 -0
  468. package/dist/scripts/pr-comment.d.ts +10 -0
  469. package/dist/scripts/pr-comment.js +510 -0
  470. package/dist/scripts/readiness-report.d.ts +88 -0
  471. package/dist/scripts/readiness-report.js +342 -0
  472. package/dist/scripts/update-quality-scores.d.ts +15 -0
  473. package/dist/scripts/update-quality-scores.js +184 -0
  474. package/dist/scripts/validate-task-sources.d.ts +21 -0
  475. package/dist/scripts/validate-task-sources.js +210 -0
  476. package/dist/scripts/validate.d.ts +13 -0
  477. package/dist/scripts/validate.js +79 -0
  478. package/dist/scripts/webhook-server.d.ts +26 -0
  479. package/dist/scripts/webhook-server.js +147 -0
  480. package/dist/scripts/weekly-digest.d.ts +24 -0
  481. package/dist/scripts/weekly-digest.js +144 -0
  482. package/dist/sinks/bigquery/index.d.ts +131 -0
  483. package/dist/sinks/bigquery/index.js +222 -0
  484. package/dist/sinks/format-slack.d.ts +64 -0
  485. package/dist/sinks/format-slack.js +306 -0
  486. package/dist/sinks/index.d.ts +23 -0
  487. package/dist/sinks/index.js +18 -0
  488. package/dist/sinks/loader.d.ts +18 -0
  489. package/dist/sinks/loader.js +82 -0
  490. package/dist/sinks/retry.d.ts +24 -0
  491. package/dist/sinks/retry.js +52 -0
  492. package/dist/sinks/schema.d.ts +9 -0
  493. package/dist/sinks/schema.js +9 -0
  494. package/dist/sinks/slack/format.d.ts +65 -0
  495. package/dist/sinks/slack/format.js +327 -0
  496. package/dist/sinks/slack/index.d.ts +27 -0
  497. package/dist/sinks/slack/index.js +78 -0
  498. package/dist/sinks/slack-sink.d.ts +27 -0
  499. package/dist/sinks/slack-sink.js +78 -0
  500. package/dist/sinks/types.d.ts +59 -0
  501. package/dist/sinks/types.js +44 -0
  502. package/dist/sinks/webhook/index.d.ts +19 -0
  503. package/dist/sinks/webhook/index.js +50 -0
  504. package/dist/sinks/webhook-sink.d.ts +19 -0
  505. package/dist/sinks/webhook-sink.js +50 -0
  506. package/dist/sources.d.ts +104 -0
  507. package/dist/sources.js +292 -0
  508. package/dist/webhook/budget.d.ts +42 -0
  509. package/dist/webhook/budget.js +60 -0
  510. package/dist/webhook/debounce.d.ts +67 -0
  511. package/dist/webhook/debounce.js +76 -0
  512. package/dist/webhook/dispatch.d.ts +45 -0
  513. package/dist/webhook/dispatch.js +84 -0
  514. package/dist/webhook/eval-request-handler.d.ts +87 -0
  515. package/dist/webhook/eval-request-handler.js +181 -0
  516. package/dist/webhook/handler.d.ts +88 -0
  517. package/dist/webhook/handler.js +203 -0
  518. package/dist/webhook/index.d.ts +17 -0
  519. package/dist/webhook/index.js +12 -0
  520. package/dist/webhook/types.d.ts +109 -0
  521. package/dist/webhook/types.js +10 -0
  522. package/package.json +72 -0
  523. package/tasks/.expanded.agentic.yaml +51 -0
  524. package/tasks/.expanded.yaml +66 -0
  525. package/tasks/frameworks.yaml +98 -0
  526. package/tasks/functions.yaml +51 -0
  527. package/tasks/groq.yaml +216 -0
  528. package/tasks/nextjs-live.yaml +62 -0
  529. package/tasks/studio-setup.yaml +111 -0
  530. package/tasks/visual-editing.yaml +120 -0
@@ -0,0 +1,708 @@
1
+ /**
2
+ * pipeline/plan.ts
3
+ *
4
+ * Execution plan assembly for the `--explain` CLI flag.
5
+ *
6
+ * Computes a detailed preview of what a command will do without executing
7
+ * anything. Calls existing pure functions (task expansion, model loading,
8
+ * cache hashing, pricing) and composes them into an `ExecutionPlan`.
9
+ *
10
+ * @see docs/exec-plans/active/execution-preview.md
11
+ */
12
+ import { existsSync, readFileSync, readdirSync, statSync } from "fs";
13
+ import { resolve } from "path";
14
+ import { load } from "js-yaml";
15
+ import { lookupPricing } from "../agent-observer/pricing.js";
16
+ import { RepoTaskSource } from "../adapters/task-sources/repo-task-source.js";
17
+ import { lookupCache } from "./cache.js";
18
+ import { loadAndExpandTasks } from "./expand-tasks.js";
19
+ import { validateConfiguration } from "./validate.js";
20
/**
 * Provider prefixes used in promptfoo model ids; removing one leaves the raw
 * model name. The first matching entry wins, so a more specific prefix
 * (e.g. "openai:chat:") must be listed before the shorter prefix it starts
 * with (e.g. "openai:").
 */
const PROVIDER_PREFIXES = [
    "anthropic:messages:",
    "openai:chat:",
    "openai:responses:",
    "openai:",
    "anthropic:",
    "google:",
];
/**
 * Reduce a provider-qualified model id to its bare model name.
 *
 * @param {string} id - Model id such as "openai:chat:gpt-4o".
 * @returns {string} The id without its known provider prefix. For an
 *   unlisted provider, everything after the first ":" is returned; an id
 *   without any ":" is returned unchanged.
 */
function extractModelName(id) {
    const knownPrefix = PROVIDER_PREFIXES.find((candidate) => id.startsWith(candidate));
    if (knownPrefix !== undefined) {
        return id.slice(knownPrefix.length);
    }
    // Unknown provider: drop only the leading "<provider>:" segment.
    const separatorAt = id.indexOf(":");
    return separatorAt === -1 ? id : id.slice(separatorAt + 1);
}
41
/**
 * Read and parse `<rootDir>/config/models.yaml`.
 *
 * @param {string} rootDir - Project root directory.
 * @returns {*} Whatever the YAML parser produced for the file, or `null`
 *   when the file is absent or reading/parsing throws.
 */
function loadModelsFile(rootDir) {
    const yamlLocation = resolve(rootDir, "config", "models.yaml");
    let parsed = null;
    if (existsSync(yamlLocation)) {
        try {
            parsed = load(readFileSync(yamlLocation, "utf-8"));
        }
        catch {
            // Unreadable or malformed file is treated the same as a missing one.
            parsed = null;
        }
    }
    return parsed;
}
53
/**
 * Decide whether a model's `modes` list (from models.yaml) covers the
 * requested eval mode.
 *
 * - No list, or an empty list: the model matches every mode.
 * - "agentic": needs "agentic-naive" or "agentic-optimized".
 * - "baseline": needs "baseline".
 * - "full": needs any of "baseline", "agentic-naive", "agentic-optimized".
 * - "observed": needs "observed".
 *
 * @param {string} mode - Requested eval mode.
 * @param {string[] | undefined} modelModes - Modes declared for the model.
 * @returns {boolean | undefined} Match result; `undefined` for a mode that
 *   is none of the four handled above.
 */
function modeMatchesModelModes(mode, modelModes) {
    if (!modelModes || modelModes.length === 0) {
        return true;
    }
    const declaresAny = (...wanted) => wanted.some((candidate) => modelModes.includes(candidate));
    if (mode === "agentic") {
        return declaresAny("agentic-naive", "agentic-optimized");
    }
    if (mode === "baseline") {
        return declaresAny("baseline");
    }
    if (mode === "full") {
        // Full mode uses all models — any sub-mode qualifies.
        return declaresAny("baseline", "agentic-naive", "agentic-optimized");
    }
    if (mode === "observed") {
        return declaresAny("observed");
    }
    // Unhandled modes produce no verdict.
    return undefined;
}
75
+ // ---------------------------------------------------------------------------
76
+ // Cost estimation
77
+ // ---------------------------------------------------------------------------
78
/**
 * Conservative per-call token averages used for cost estimation.
 * Rough figures based on typical Sanity documentation tasks.
 */
const AVG_TOKENS = {
    /** Tokens the grader produces per judgment */
    graderOutput: 500,
    /** Tokens sent to the grader per judgment (rubric + response) */
    graderPrompt: 2000,
    /** Tokens in a model's response to one task */
    responseTokens: 1500,
    /** Tokens in a task prompt including its context */
    taskPromptTokens: 2000,
    /** Factor applied to the lower estimate to obtain the upper estimate */
    upperMultiplier: 2,
};
94
+ // ---------------------------------------------------------------------------
95
+ // Pipeline plan builder
96
+ // ---------------------------------------------------------------------------
97
/**
 * Summarize one expanded test entry for the plan's task list.
 *
 * An entry counts as the "baseline" variant when its description contains
 * "[Baseline]", ends with "(baseline)", or its `prompts` array includes
 * "without-docs"; every other entry is reported as "gold".
 */
function toPlannedTask(entry) {
    const description = typeof entry.description === "string" ? entry.description : "(unknown)";
    const isBaseline = description.includes("[Baseline]") ||
        description.endsWith("(baseline)") ||
        (Array.isArray(entry.prompts) && entry.prompts.includes("without-docs"));
    return {
        description,
        variant: isBaseline ? "baseline" : "gold",
    };
}
/**
 * Build a complete execution plan for the `pipeline` command.
 *
 * This is a read-only operation — it computes the plan by calling existing
 * functions (validation, task expansion, model loading, cache lookup,
 * pricing) without executing any pipeline steps or writing to process.env.
 *
 * Problems encountered while gathering inputs do not abort the preview;
 * they are recorded in the returned plan's `errors` / `warnings` lists.
 *
 * @param {object} opts - Resolved CLI options (mode, area/task filters,
 *   debug settings, skip/enable flags, comparison settings, `repoTasksPath`,
 *   `source`).
 * @param {string} rootDir - Project root directory.
 * @returns {Promise<object>} The execution plan (steps, tasks, models, cost
 *   estimate, cache predictions, file lists, warnings and errors).
 */
export async function buildPipelinePlan(opts, rootDir) {
    // 1. Validate configuration (same as Step 0)
    const warnings = [];
    const errors = [];
    let issues = [];
    try {
        issues = validateConfiguration(rootDir).issues ?? [];
    }
    catch (err) {
        // A crashing validator means the configuration could not be checked;
        // report that instead of presenting an apparently clean plan.
        const detail = err instanceof Error ? err.message : String(err);
        errors.push(`[validate] Configuration validation failed: ${detail}`);
    }
    for (const issue of issues) {
        const line = `[${issue.source}] ${issue.message}`;
        if (issue.severity === "warning") {
            warnings.push(line);
        }
        else if (issue.severity === "error") {
            errors.push(line);
        }
    }
    // 2. Expand tasks with filters
    const splitList = (csv) => (csv ? csv.split(",").map((part) => part.trim()) : undefined);
    const filter = opts.areaOption || opts.taskOption
        ? {
            areas: splitList(opts.areaOption),
            taskIds: splitList(opts.taskOption),
        }
        : undefined;
    let totalTests = 0;
    let tasks = [];
    let repoTaskCount;
    try {
        const { entries } = loadAndExpandTasks(rootDir, filter);
        totalTests = entries.length;
        tasks = entries.map(toPlannedTask);
    }
    catch {
        errors.push("Failed to expand tasks — check task YAML files");
    }
    // Scan repo tasks path for additional task count (preview only)
    if (opts.repoTasksPath) {
        try {
            const repoSource = new RepoTaskSource(opts.repoTasksPath);
            const repoTasks = await repoSource.loadTasks(filter);
            repoTaskCount = repoTasks.length;
            if (repoTaskCount > 0) {
                // Expand repo tasks to estimate test entries
                const { expandTaskDefinitions } = await import("./expand-tasks.js");
                const { entries: repoEntries } = expandTaskDefinitions(repoTasks, rootDir, opts.mode === "agentic" ? "agentic" : "baseline");
                totalTests += repoEntries.length;
                for (const entry of repoEntries) {
                    tasks.push(toPlannedTask(entry));
                }
            }
        }
        catch {
            warnings.push(`Failed to scan repo tasks at ${opts.repoTasksPath} — count may be underestimated`);
        }
    }
    // 3. Apply debug filter simulation
    const debugPlan = simulateDebugFilter(totalTests, opts.debug);
    const effectiveTestCount = debugPlan?.filteredCount ?? totalTests;
    // 4. Load models for the selected mode
    const modelsFile = loadModelsFile(rootDir);
    const models = [];
    let graderModelName = "";
    if (modelsFile) {
        // models.yaml may parse to something without the expected shape;
        // degrade to "no models" rather than crashing the preview.
        let configuredModels = [];
        if (Array.isArray(modelsFile.models)) {
            configuredModels = modelsFile.models;
        }
        else {
            warnings.push("[models] config/models.yaml has no `models` list — models and cost estimate omitted");
        }
        const activeModels = configuredModels.filter((m) => modeMatchesModelModes(opts.mode, m.modes));
        // For agentic mode, each model appears once per declared agentic variant
        // (naive and/or optimized).
        // NOTE(review): a model with no `modes` passes the filter above but
        // contributes no entry in agentic mode — confirm this is intended.
        for (const m of activeModels) {
            const modelName = extractModelName(m.id);
            if (opts.mode === "agentic") {
                if (m.modes?.includes("agentic-naive")) {
                    models.push({
                        id: m.id,
                        label: `${m.label} (Naive)`,
                        modelName,
                    });
                }
                if (m.modes?.includes("agentic-optimized")) {
                    models.push({
                        id: m.id,
                        label: `${m.label} (Optimized)`,
                        modelName,
                    });
                }
            }
            else {
                models.push({ id: m.id, label: m.label, modelName });
            }
        }
        const graderId = modelsFile.grader?.id;
        if (typeof graderId === "string") {
            graderModelName = extractModelName(graderId);
        }
        else {
            warnings.push("[models] config/models.yaml has no `grader.id` — grading cost cannot be estimated");
        }
    }
    // 5. Estimate cost
    const rubricCount = estimateRubricAssertionsPerTask(rootDir);
    const costEstimate = models.length > 0 && effectiveTestCount > 0
        ? estimateCost(effectiveTestCount, models, graderModelName, rubricCount)
        : undefined;
    // 6. Cache prediction
    const cachePrediction = predictCacheStatus(rootDir, {
        noCache: opts.noCache,
        skipEval: opts.skipEval,
        skipFetch: opts.skipFetch,
    });
    // 7. Build step plan
    const steps = buildStepPlan({
        compareEnabled: opts.compareEnabled,
        discoveryReportEnabled: opts.discoveryReportEnabled,
        dryRun: opts.dryRun,
        gapAnalysisEnabled: opts.gapAnalysisEnabled,
        graderReplications: opts.graderReplications,
        noCache: opts.noCache,
        publishEnabled: opts.publishEnabled,
        readinessEnabled: opts.readinessEnabled,
        skipEval: opts.skipEval,
        skipFetch: opts.skipFetch,
    }, cachePrediction);
    // 8. Comparison plan
    const comparison = buildComparisonPlan(rootDir, opts.compareEnabled, opts.compareBaseline, opts.beforeOption, opts.compareThreshold);
    // 9. File lists
    const filesRead = collectFilesRead(rootDir, opts.mode);
    const filesCreated = collectFilesCreated({
        compareEnabled: opts.compareEnabled,
        discoveryReportEnabled: opts.discoveryReportEnabled,
        gapAnalysisEnabled: opts.gapAnalysisEnabled,
        publishEnabled: opts.publishEnabled,
        readinessEnabled: opts.readinessEnabled,
    });
    return {
        cacheStatus: cachePrediction.predictions,
        command: "pipeline",
        comparison,
        costEstimate,
        debug: debugPlan,
        description: `Run the full evaluation pipeline in ${opts.mode} mode`,
        errors,
        filesCreated,
        filesRead,
        mode: opts.mode,
        models: models.length > 0 ? models : undefined,
        repoTaskCount,
        source: opts.source ?? "production",
        steps,
        tasks: tasks.length > 0 ? tasks : undefined,
        totalCacheSavingsMs: cachePrediction.estimatedSavedMs,
        totalTests: effectiveTestCount,
        warnings,
    };
}
270
/**
 * Build a minimal plan for commands that don't have complex step graphs.
 * Used by compare, validate, fetch-docs, baseline, etc.
 *
 * Configuration validation is best-effort: if the validator itself throws,
 * the plan is still produced, with the failure recorded as a warning so the
 * preview does not look clean when nothing was actually checked.
 *
 * @param {object} opts
 * @param {string} opts.rootDir - Project root passed to the validator.
 * @param {string} opts.command - Command name reported in the plan.
 * @param {string} opts.description - Human-readable description.
 * @param {string[]} [opts.filesCreated] - Files the command writes (default `[]`).
 * @param {string[]} [opts.filesRead] - Files the command reads (default `[]`).
 * @param {object[]} [opts.steps] - Step descriptions (default `[]`).
 * @returns {object} Plan with empty `cacheStatus` and zero cache savings.
 */
export function buildSimpleCommandPlan(opts) {
    const formatIssue = (issue) => `[${issue.source}] ${issue.message}`;
    let issues = [];
    const validatorFailures = [];
    try {
        issues = validateConfiguration(opts.rootDir).issues ?? [];
    }
    catch (err) {
        const detail = err instanceof Error ? err.message : String(err);
        validatorFailures.push(`[validate] Configuration validation could not be completed: ${detail}`);
    }
    return {
        cacheStatus: {},
        command: opts.command,
        description: opts.description,
        errors: issues.filter((i) => i.severity === "error").map(formatIssue),
        filesCreated: opts.filesCreated ?? [],
        filesRead: opts.filesRead ?? [],
        steps: opts.steps ?? [],
        totalCacheSavingsMs: 0,
        warnings: [
            ...issues.filter((i) => i.severity === "warning").map(formatIssue),
            ...validatorFailures,
        ],
    };
}
298
+ // ---------------------------------------------------------------------------
299
+ // Comparison plan builder
300
+ // ---------------------------------------------------------------------------
301
/**
 * Describe the baseline a comparison step would use.
 *
 * @param {string} rootDir - Project root directory.
 * @param {boolean} compareEnabled - When falsy, no comparison plan is built.
 * @param {string} [compareBaseline] - Explicit baseline path, resolved
 *   against `rootDir`. Without it, the `.json` file in
 *   `results/baselines` whose name sorts last is used, falling back to
 *   `results/baselines/latest.json`.
 * @param {*} [beforeOption] - Accepted but not used by this function.
 * @param {number} [threshold] - Comparison threshold; defaults to 2.
 * @returns {object | undefined} `{ baselineAge, baselinePath, exists, threshold }`,
 *   or `undefined` when comparison is disabled. `baselineAge` is a coarse
 *   human-readable age derived from the file's mtime, and stays `undefined`
 *   if the file is missing or cannot be stat'ed.
 */
function buildComparisonPlan(rootDir, compareEnabled, compareBaseline, beforeOption, threshold) {
    if (!compareEnabled) {
        return undefined;
    }
    // Pick the stored baseline whose file name sorts last (lexicographically).
    const newestStoredBaseline = () => {
        const storeDir = resolve(rootDir, "results", "baselines");
        const jsonNames = existsSync(storeDir)
            ? readdirSync(storeDir)
                .filter((name) => name.endsWith(".json"))
                .sort()
            : [];
        const chosen = jsonNames.length > 0 ? jsonNames[jsonNames.length - 1] : "latest.json";
        return resolve(storeDir, chosen);
    };
    const describeAge = (mtimeMs) => {
        const wholeHours = Math.floor((Date.now() - mtimeMs) / 3600000);
        if (wholeHours < 1) {
            return "< 1 hour";
        }
        if (wholeHours < 24) {
            return `${wholeHours} hour${wholeHours === 1 ? "" : "s"}`;
        }
        const wholeDays = Math.floor(wholeHours / 24);
        return `${wholeDays} day${wholeDays === 1 ? "" : "s"}`;
    };
    const baselinePath = compareBaseline
        ? resolve(rootDir, compareBaseline)
        : newestStoredBaseline();
    const exists = existsSync(baselinePath);
    let baselineAge;
    if (exists) {
        try {
            baselineAge = describeAge(statSync(baselinePath).mtimeMs);
        }
        catch {
            // stat failed — not critical
        }
    }
    return {
        baselineAge,
        baselinePath,
        exists,
        threshold: threshold ?? 2,
    };
}
354
+ // ---------------------------------------------------------------------------
355
+ // Step plan builder
356
+ // ---------------------------------------------------------------------------
357
/**
 * Produce the ordered list of pipeline steps for the preview.
 *
 * Each step is `{ cacheStatus, name, reason, willRun }`; the fetch step
 * additionally carries `estimatedSavedMs`.
 *
 * @param {object} opts - Flags: `dryRun`, `skipEval`, `graderReplications`,
 *   `gapAnalysisEnabled`, `publishEnabled`, `compareEnabled`,
 *   `readinessEnabled`, `discoveryReportEnabled`.
 * @param {{ predictions: object, estimatedSavedMs: number }} cachePrediction -
 *   Per-step cache predictions keyed by step id, plus the saved-time total.
 * @returns {object[]} Steps in execution order. With `dryRun`, only
 *   validation is marked as running and the five core steps are "skipped".
 */
function buildStepPlan(opts, cachePrediction) {
    const CACHED_REASON = "CACHED (inputs unchanged)";
    const predicted = (stepKey) => cachePrediction.predictions[stepKey] ?? "unknown";
    // Steps that are never served from cache share this shape.
    const uncached = (name, reason) => ({
        cacheStatus: "miss",
        name,
        reason,
        willRun: true,
    });
    // Step 0: Validate
    const plan = [
        uncached("Validate configuration", "Parse and check all YAML configs, schemas, reference solutions"),
    ];
    if (opts.dryRun) {
        // Everything after validation is reported as skipped.
        const skippedNames = [
            "Fetch documentation",
            "Generate configs",
            "Run evaluation",
            "Calculate scores",
            "Generate report",
        ];
        for (const name of skippedNames) {
            plan.push({
                cacheStatus: "skipped",
                name,
                reason: "--dry-run: no execution",
                willRun: false,
            });
        }
        return plan;
    }
    // Step 1: Fetch docs
    const fetchStatus = predicted("fetch-docs");
    const fetchIsHit = fetchStatus === "hit";
    const fetchIsSkipped = fetchStatus === "skipped";
    let fetchReason = "Fetch from Sanity Content Lake";
    if (fetchIsSkipped) {
        fetchReason = "--skip-fetch: reuse cached contexts";
    }
    else if (fetchIsHit) {
        fetchReason = CACHED_REASON;
    }
    plan.push({
        cacheStatus: fetchStatus,
        // Uses the prediction's overall saved-time figure.
        estimatedSavedMs: fetchIsHit ? cachePrediction.estimatedSavedMs : undefined,
        name: "Fetch documentation",
        reason: fetchReason,
        willRun: !(fetchIsSkipped || fetchIsHit),
    });
    // Step 2: Generate configs
    const genStatus = predicted("generate-configs");
    const genIsHit = genStatus === "hit";
    plan.push({
        cacheStatus: genStatus,
        name: "Generate configs",
        reason: genIsHit ? CACHED_REASON : "Expand tasks → Promptfoo config files",
        willRun: !genIsHit,
    });
    // Step 3: Run evaluation
    const evalStatus = predicted("eval");
    const evalIsHit = evalStatus === "hit";
    let evalReason = "Execute Promptfoo evaluation against all models";
    if (opts.skipEval) {
        evalReason = "--skip-eval: reuse existing results";
    }
    else if (evalIsHit) {
        evalReason = CACHED_REASON;
    }
    plan.push({
        cacheStatus: evalStatus,
        name: "Run evaluation",
        reason: evalReason,
        willRun: !opts.skipEval && !evalIsHit,
    });
    // Step 3c: Grader consistency (optional)
    if (opts.graderReplications) {
        plan.push(uncached("Grader consistency analysis", `${opts.graderReplications} replications`));
    }
    // Step 4: Calculate scores
    const scoreStatus = predicted("calculate-scores");
    const scoreIsHit = scoreStatus === "hit";
    plan.push({
        cacheStatus: scoreStatus,
        name: "Calculate scores",
        reason: scoreIsHit ? CACHED_REASON : "Compute AI Literacy Scores from eval results",
        willRun: !scoreIsHit,
    });
    // Step 4b: Gap analysis (optional)
    if (opts.gapAnalysisEnabled) {
        plan.push(uncached("Gap analysis", "Classify failure modes + build remediation plan"));
    }
    // Step 4c: Publish report (optional)
    if (opts.publishEnabled) {
        plan.push(uncached("Publish report", "Write to Sanity Content Lake + fan out to sinks"));
    }
    // Step 5: Generate report
    plan.push(uncached("Generate report", "Build PR comment markdown + score summary"));
    // Step 5b: Compare (optional)
    if (opts.compareEnabled) {
        plan.push(uncached("Compare against baseline", "Structured score comparison with delta analysis"));
    }
    // Step 6: Readiness report (optional)
    if (opts.readinessEnabled) {
        plan.push(uncached("Readiness report", "Generate launch readiness checklist"));
    }
    // Step 6c: Discovery report (optional)
    if (opts.discoveryReportEnabled) {
        plan.push(uncached("Discovery report", "Analyze agent discoverability from retrieval metrics"));
    }
    return plan;
}
493
/**
 * List the result files a pipeline run is expected to write.
 *
 * @param {object} opts - Feature flags: `compareEnabled`,
 *   `gapAnalysisEnabled`, `readinessEnabled`, `discoveryReportEnabled`.
 * @returns {string[]} Paths under `results/latest/`, sorted with the
 *   default string ordering.
 */
function collectFilesCreated(opts) {
    const outputs = [
        "results/latest/eval-results.json",
        "results/latest/score-summary.json",
        "results/latest/grader-judgments.json",
        "results/latest/pipeline-result.json",
    ];
    // Each optional feature contributes its own artefacts.
    const optionalOutputs = [
        [opts.compareEnabled, ["results/latest/comparison-report.json"]],
        [
            opts.gapAnalysisEnabled,
            ["results/latest/failure-modes.json", "results/latest/gap-analysis.json"],
        ],
        [opts.readinessEnabled, ["results/latest/readiness-report.md"]],
        [opts.discoveryReportEnabled, ["results/latest/discovery-report.md"]],
    ];
    for (const [enabled, paths] of optionalOutputs) {
        if (enabled) {
            outputs.push(...paths);
        }
    }
    return outputs.sort();
}
515
+ // ---------------------------------------------------------------------------
516
+ // File list builders
517
+ // ---------------------------------------------------------------------------
518
/**
 * Recursively list the non-hidden files below `dir`.
 *
 * Entries whose name starts with "." are skipped at every level. Symbolic
 * links are included when they point at a regular file, but linked
 * directories are not descended into.
 *
 * @param {string} dir - Absolute directory to scan.
 * @param {string} relativePrefix - Prefix (using "/") for returned paths.
 * @returns {string[]} Paths of the form `<relativePrefix>/<sub/path>`.
 */
function listVisibleFiles(dir, relativePrefix) {
    const found = [];
    for (const entry of readdirSync(dir, { withFileTypes: true })) {
        if (entry.name.startsWith(".")) {
            continue;
        }
        const relativePath = `${relativePrefix}/${entry.name}`;
        const absolutePath = resolve(dir, entry.name);
        if (entry.isDirectory()) {
            found.push(...listVisibleFiles(absolutePath, relativePath));
        }
        else if (entry.isFile()) {
            found.push(relativePath);
        }
        else if (entry.isSymbolicLink()) {
            try {
                if (statSync(absolutePath).isFile()) {
                    found.push(relativePath);
                }
            }
            catch {
                // Dangling link — nothing to read.
            }
        }
    }
    return found;
}
/**
 * List the project files a pipeline run reads, relative to `rootDir`.
 *
 * Always includes the four core config files. Adds non-hidden task YAML
 * files from `tasks/`, Markdown files from `contexts/canonical/`, every
 * non-hidden file under `canonical/reference-solutions/` (including its
 * subdirectories, which is where the solutions are organised), and
 * `config/thresholds.yaml` / `config/features.yaml` when they exist.
 *
 * @param {string} rootDir - Project root directory.
 * @param {string} [_mode] - Eval mode; currently not used.
 * @returns {string[]} De-duplicated, sorted relative paths.
 */
function collectFilesRead(rootDir, _mode) {
    const files = [
        "config/models.yaml",
        "config/rubrics.yaml",
        "config/prompts.yaml",
        "config/sources.yaml",
    ];
    // Task files
    const tasksDir = resolve(rootDir, "tasks");
    if (existsSync(tasksDir)) {
        const taskFiles = readdirSync(tasksDir)
            .filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) && !f.startsWith("."))
            .sort();
        for (const f of taskFiles) {
            files.push(`tasks/${f}`);
        }
    }
    // Context files (canonical)
    const canonicalDir = resolve(rootDir, "contexts", "canonical");
    if (existsSync(canonicalDir)) {
        const contextFiles = readdirSync(canonicalDir)
            .filter((f) => f.endsWith(".md"))
            .sort();
        for (const f of contextFiles) {
            files.push(`contexts/canonical/${f}`);
        }
    }
    // Reference solutions — stored in per-area subdirectories, so a flat
    // directory listing would miss them.
    const refDir = resolve(rootDir, "canonical", "reference-solutions");
    if (existsSync(refDir)) {
        files.push(...listVisibleFiles(refDir, "canonical/reference-solutions"));
    }
    // Optional config files
    if (existsSync(resolve(rootDir, "config", "thresholds.yaml"))) {
        files.push("config/thresholds.yaml");
    }
    if (existsSync(resolve(rootDir, "config", "features.yaml"))) {
        files.push("config/features.yaml");
    }
    return [...new Set(files)].sort();
}
561
+ // ---------------------------------------------------------------------------
562
+ // Cost estimation
563
+ // ---------------------------------------------------------------------------
564
/**
 * Estimate a cost range for running and grading the planned tests.
 *
 * Per model, the lower bound is `testCount` times the priced average prompt
 * and response tokens; the upper bound is the lower bound times
 * `AVG_TOKENS.upperMultiplier`. Models without pricing data are listed with
 * a zero cost. Grading assumes one grader call per test, per model, per
 * rubric assertion, and is zero when the grader has no pricing data.
 *
 * @param {number} testCount - Number of test entries to run.
 * @param {{ label: string, modelName: string }[]} models - Planned models.
 * @param {string} graderModelName - Bare model name of the grader.
 * @param {number} rubricAssertionsPerTask - Estimated grader calls per test.
 * @returns {object} `{ grading, gradingCalls, perModel, totalApiCalls, totalCost }`
 *   where every cost is a `{ min, max }` pair.
 */
function estimateCost(testCount, models, graderModelName, rubricAssertionsPerTask) {
    const { graderOutput, graderPrompt, responseTokens, taskPromptTokens, upperMultiplier } = AVG_TOKENS;
    const perModel = models.map(({ label, modelName }) => {
        const rates = lookupPricing(modelName);
        if (!rates) {
            return { cost: { max: 0, min: 0 }, label, modelName };
        }
        const low = testCount *
            (rates.input * taskPromptTokens + rates.output * responseTokens);
        return { cost: { max: low * upperMultiplier, min: low }, label, modelName };
    });
    // Unpriced models contribute exactly 0 to both sums.
    const modelsMin = perModel.reduce((sum, item) => sum + item.cost.min, 0);
    const modelsMax = perModel.reduce((sum, item) => sum + item.cost.max, 0);
    // Grading cost: each test × each model × rubric assertions per task
    const gradingCalls = testCount * models.length * rubricAssertionsPerTask;
    const graderRates = lookupPricing(graderModelName);
    const gradingMin = graderRates
        ? gradingCalls *
            (graderRates.input * graderPrompt + graderRates.output * graderOutput)
        : 0;
    const gradingMax = graderRates ? gradingMin * upperMultiplier : 0;
    return {
        grading: { max: gradingMax, min: gradingMin },
        gradingCalls,
        perModel,
        totalApiCalls: testCount * models.length,
        totalCost: {
            max: modelsMax + gradingMax,
            min: modelsMin + gradingMin,
        },
    };
}
613
+ // ---------------------------------------------------------------------------
614
+ // Resolved options subset (matches pipeline-action.ts ResolvedOptions shape)
615
+ // Used by the plan builder without importing the full type to avoid circular deps.
616
+ // ---------------------------------------------------------------------------
617
/**
 * Estimate how many rubric assertions are graded per task.
 *
 * Counts the entries under `templates` in `<rootDir>/config/rubrics.yaml`
 * and caps the result at 3 (most tasks use 2-3 of the available
 * templates). Falls back to 2 when the file is missing, cannot be read or
 * parsed, or has no `templates` section.
 *
 * @param {string} rootDir - Project root directory.
 * @returns {number} Estimated assertions per task.
 */
function estimateRubricAssertionsPerTask(rootDir) {
    const FALLBACK_COUNT = 2; // conservative default
    const MAX_COUNT = 3;
    const rubricsLocation = resolve(rootDir, "config", "rubrics.yaml");
    if (existsSync(rubricsLocation)) {
        try {
            const parsed = load(readFileSync(rubricsLocation, "utf-8"));
            const available = parsed?.templates
                ? Object.keys(parsed.templates).length
                : FALLBACK_COUNT;
            return Math.min(available, MAX_COUNT);
        }
        catch {
            // Fall through to the default below.
        }
    }
    return FALLBACK_COUNT;
}
636
+ // ---------------------------------------------------------------------------
637
+ // Cache prediction
638
+ // ---------------------------------------------------------------------------
639
/**
 * Predict, per pipeline step, whether it would be skipped or served from
 * cache.
 *
 * Precedence per step: an applicable skip flag gives "skipped"; otherwise
 * `noCache` gives "miss"; otherwise the cache is consulted ("hit" / "miss"),
 * with "unknown" when the lookup throws.
 *
 * @param {string} rootDir - Project root passed to the cache lookup.
 * @param {{ noCache?: boolean, skipEval?: boolean, skipFetch?: boolean }} opts
 * @returns {{ estimatedSavedMs: number, predictions: object }} Predictions
 *   keyed by step id, and the summed recorded duration of all cache hits.
 */
function predictCacheStatus(rootDir, opts) {
    // Only these two steps have a dedicated skip flag.
    const skipRequested = {
        "fetch-docs": opts.skipFetch,
        eval: opts.skipEval,
    };
    const stepIds = [
        "fetch-docs",
        "generate-configs",
        "eval",
        "calculate-scores",
        "report",
    ];
    const predictions = {};
    let estimatedSavedMs = 0;
    for (const stepId of stepIds) {
        if (skipRequested[stepId]) {
            predictions[stepId] = "skipped";
        }
        else if (opts.noCache) {
            predictions[stepId] = "miss";
        }
        else {
            try {
                const lookup = lookupCache(rootDir, stepId);
                if (lookup.hit) {
                    predictions[stepId] = "hit";
                    estimatedSavedMs += lookup.entry.durationMs;
                }
                else {
                    predictions[stepId] = "miss";
                }
            }
            catch {
                predictions[stepId] = "unknown";
            }
        }
    }
    return { estimatedSavedMs, predictions };
}
679
+ // ---------------------------------------------------------------------------
680
+ // Debug filter simulation
681
+ // ---------------------------------------------------------------------------
682
/**
 * Estimate how many test entries survive the debug filter.
 *
 * - `debug.pattern`: the real match count is unknown here, so roughly 30%
 *   (at least one entry) is assumed.
 * - `debug.sample`: that many entries, capped at the total.
 * - otherwise: the first `debug.firstN` tasks (default 2), counted as two
 *   entries each (gold + baseline), capped at the total.
 *
 * The estimate never exceeds `totalEntries`, so an empty task set stays
 * empty.
 *
 * @param {number} totalEntries - Number of test entries before filtering.
 * @param {{ enabled?: boolean, pattern?: string, sample?: number, firstN?: number }} [debug]
 * @returns {{ filterDescription: string, filteredCount: number, totalCount: number } | undefined}
 *   `undefined` when debug mode is not enabled.
 */
function simulateDebugFilter(totalEntries, debug) {
    if (!debug?.enabled) {
        return undefined;
    }
    let filteredCount;
    let filterDescription;
    if (debug.pattern) {
        // We can't know exact match count without loading task descriptions,
        // but we can indicate the filter is applied
        filterDescription = `pattern "${debug.pattern}"`;
        // Conservative: assume ~30% survive a pattern filter. The floor of 1
        // must not invent an entry when there are none to filter.
        filteredCount = Math.min(totalEntries, Math.max(1, Math.ceil(totalEntries * 0.3)));
    }
    else if (debug.sample) {
        filteredCount = Math.min(debug.sample, totalEntries);
        filterDescription = `random sample of ${filteredCount}`;
    }
    else {
        const firstN = debug.firstN ?? 2;
        filteredCount = Math.min(firstN * 2, totalEntries); // ×2 for gold+baseline
        filterDescription = `first ${Math.min(firstN, Math.ceil(totalEntries / 2))} tasks (${filteredCount} test entries)`;
    }
    return {
        filterDescription,
        filteredCount,
        totalCount: totalEntries,
    };
}