@sanity/ailf 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (442)
  1. package/canonical/grader-references/README.md +2 -2
  2. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  3. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  4. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  5. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  6. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  7. package/config/features.ts +1 -1
  8. package/config/models.ts +28 -23
  9. package/config/sources.ts +1 -1
  10. package/config/thresholds.ts +1 -1
  11. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  13. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  17. package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
  18. package/dist/_vendor/ailf-core/config-helpers.js +29 -0
  19. package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
  20. package/dist/_vendor/ailf-core/examples/index.js +208 -114
  21. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  22. package/dist/_vendor/ailf-core/index.js +1 -0
  23. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  25. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  27. package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
  28. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  29. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  30. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  31. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  32. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  33. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
  34. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
  35. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  36. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  37. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  38. package/dist/_vendor/ailf-core/services/index.js +1 -1
  39. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  40. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
  41. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  42. package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
  43. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
  44. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  45. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  46. package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
  47. package/dist/_vendor/ailf-tasks/cli.js +61 -0
  48. package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
  49. package/dist/_vendor/ailf-tasks/index.js +16 -0
  50. package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
  51. package/dist/_vendor/ailf-tasks/parser.js +73 -0
  52. package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
  53. package/dist/_vendor/ailf-tasks/schemas.js +180 -0
  54. package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
  55. package/dist/_vendor/ailf-tasks/validation.js +162 -0
  56. package/dist/adapters/api-client/remediation.js +2 -2
  57. package/dist/adapters/config-sources/file-config-adapter.js +6 -1
  58. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  59. package/dist/adapters/index.d.ts +0 -1
  60. package/dist/adapters/index.js +0 -1
  61. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  62. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  63. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  64. package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
  65. package/dist/adapters/task-sources/index.d.ts +1 -2
  66. package/dist/adapters/task-sources/index.js +1 -2
  67. package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
  68. package/dist/adapters/task-sources/repo-schemas.js +2 -2
  69. package/dist/adapters/task-sources/repo-task-source.js +1 -1
  70. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  71. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
  73. package/dist/adapters/task-sources/task-file-loader.js +20 -6
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/explain-handler.d.ts +1 -1
  95. package/dist/commands/explain-handler.js +37 -8
  96. package/dist/commands/fetch-docs.js +1 -0
  97. package/dist/commands/generate-configs.d.ts +3 -3
  98. package/dist/commands/generate-configs.js +20 -8
  99. package/dist/commands/init.d.ts +2 -3
  100. package/dist/commands/init.js +56 -170
  101. package/dist/commands/pipeline-action.d.ts +7 -1
  102. package/dist/commands/pipeline-action.js +43 -19
  103. package/dist/commands/pipeline.d.ts +6 -1
  104. package/dist/commands/pipeline.js +7 -2
  105. package/dist/commands/pr-comment.js +1 -0
  106. package/dist/commands/publish.js +1 -0
  107. package/dist/commands/shared/help.js +2 -2
  108. package/dist/commands/update-quality-scores.d.ts +5 -0
  109. package/dist/commands/update-quality-scores.js +20 -0
  110. package/dist/composition-root.d.ts +2 -3
  111. package/dist/composition-root.js +27 -14
  112. package/dist/config/features.ts +23 -0
  113. package/dist/config/models.ts +100 -0
  114. package/dist/config/prompts.ts +16 -0
  115. package/dist/config/rubrics.ts +225 -0
  116. package/dist/config/schedules.ts +47 -0
  117. package/dist/config/sinks.ts +37 -0
  118. package/dist/config/sources.ts +21 -0
  119. package/dist/config/thresholds.ts +61 -0
  120. package/dist/lib/agent-behavior-report.d.ts +8 -0
  121. package/dist/lib/agent-behavior-report.js +185 -0
  122. package/dist/lib/baseline.d.ts +19 -0
  123. package/dist/lib/baseline.js +153 -0
  124. package/dist/lib/calculate-scores.d.ts +23 -0
  125. package/dist/lib/calculate-scores.js +42 -0
  126. package/dist/lib/compare.d.ts +18 -0
  127. package/dist/lib/compare.js +170 -0
  128. package/dist/lib/coverage-audit.d.ts +4 -0
  129. package/dist/lib/coverage-audit.js +42 -0
  130. package/dist/lib/discovery-report.d.ts +13 -0
  131. package/dist/lib/discovery-report.js +57 -0
  132. package/dist/lib/fetch-docs.d.ts +30 -0
  133. package/dist/lib/fetch-docs.js +171 -0
  134. package/dist/lib/generate-configs.d.ts +25 -0
  135. package/dist/lib/generate-configs.js +42 -0
  136. package/dist/lib/grader-api.d.ts +21 -0
  137. package/dist/lib/grader-api.js +34 -0
  138. package/dist/lib/grader-compare.d.ts +19 -0
  139. package/dist/lib/grader-compare.js +91 -0
  140. package/dist/lib/grader-consistency.d.ts +27 -0
  141. package/dist/lib/grader-consistency.js +79 -0
  142. package/dist/lib/grader-sensitivity.d.ts +19 -0
  143. package/dist/lib/grader-sensitivity.js +75 -0
  144. package/dist/lib/grader-validate.d.ts +19 -0
  145. package/dist/lib/grader-validate.js +78 -0
  146. package/dist/lib/measure-retrieval.d.ts +14 -0
  147. package/dist/lib/measure-retrieval.js +71 -0
  148. package/dist/lib/pr-comment.d.ts +16 -0
  149. package/dist/lib/pr-comment.js +28 -0
  150. package/dist/lib/readiness-report.d.ts +13 -0
  151. package/dist/lib/readiness-report.js +108 -0
  152. package/dist/lib/webhook-server.d.ts +11 -0
  153. package/dist/lib/webhook-server.js +24 -0
  154. package/dist/lib/weekly-digest.d.ts +24 -0
  155. package/dist/lib/weekly-digest.js +148 -0
  156. package/dist/orchestration/build-app-context.js +13 -0
  157. package/dist/orchestration/cache-context.d.ts +23 -0
  158. package/dist/orchestration/cache-context.js +43 -0
  159. package/dist/orchestration/env-bridge.d.ts +21 -0
  160. package/dist/orchestration/env-bridge.js +66 -0
  161. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  162. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  163. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  164. package/dist/orchestration/step-runner.js +5 -1
  165. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  166. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  167. package/dist/orchestration/steps/callback-step.js +10 -1
  168. package/dist/orchestration/steps/compare-step.js +6 -3
  169. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  170. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  171. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  172. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  173. package/dist/orchestration/steps/fetch-docs-step.js +30 -16
  174. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  175. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  176. package/dist/orchestration/steps/generate-configs-step.js +50 -15
  177. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  178. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  179. package/dist/orchestration/steps/publish-report-step.js +19 -0
  180. package/dist/orchestration/steps/readiness-step.js +8 -3
  181. package/dist/orchestration/steps/report-step.js +17 -4
  182. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  183. package/dist/orchestration/steps/run-eval-step.js +51 -31
  184. package/dist/pipeline/agent-behavior-report.js +6 -0
  185. package/dist/pipeline/attribution.d.ts +1 -1
  186. package/dist/pipeline/attribution.js +1 -1
  187. package/dist/pipeline/cache.js +29 -15
  188. package/dist/pipeline/calculate-scores.d.ts +2 -0
  189. package/dist/pipeline/calculate-scores.js +70 -33
  190. package/dist/pipeline/chronic-failures.d.ts +55 -0
  191. package/dist/pipeline/chronic-failures.js +110 -0
  192. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
  193. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  194. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  195. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  196. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  197. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  198. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  199. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  200. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  201. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  202. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  203. package/dist/pipeline/compiler/config-loader.js +42 -2
  204. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  205. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  206. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  207. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  208. package/dist/pipeline/compiler/index.d.ts +2 -5
  209. package/dist/pipeline/compiler/index.js +2 -5
  210. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  211. package/dist/pipeline/compiler/literacy-bridge.js +1 -1
  212. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
  213. package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
  214. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
  215. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
  216. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
  217. package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
  218. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
  219. package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
  220. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
  221. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
  222. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  223. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  224. package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
  225. package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
  226. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
  227. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
  228. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  229. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  230. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
  231. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
  232. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  233. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  234. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  235. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
  236. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
  237. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
  238. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
  239. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
  240. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
  241. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
  242. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
  243. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
  244. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
  245. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
  246. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
  247. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
  248. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  249. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  250. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
  251. package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
  252. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  253. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  254. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  255. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  256. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  257. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  258. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  259. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  260. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  261. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  262. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  263. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  264. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  265. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  266. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  267. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  268. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  269. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  270. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  271. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  272. package/dist/pipeline/compiler/task-bridge.js +92 -0
  273. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  274. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  275. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  276. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  277. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  278. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  279. package/dist/pipeline/coverage-audit.d.ts +1 -1
  280. package/dist/pipeline/coverage-audit.js +1 -1
  281. package/dist/pipeline/degradations.d.ts +1 -1
  282. package/dist/pipeline/degradations.js +1 -1
  283. package/dist/pipeline/failure-modes.d.ts +1 -1
  284. package/dist/pipeline/failure-modes.js +13 -1
  285. package/dist/pipeline/gap-analysis.d.ts +1 -1
  286. package/dist/pipeline/gap-analysis.js +3 -1
  287. package/dist/pipeline/generate-configs.d.ts +2 -2
  288. package/dist/pipeline/generate-configs.js +15 -8
  289. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  290. package/dist/pipeline/grader-compare-runner.js +7 -1
  291. package/dist/pipeline/grader-comparison.d.ts +1 -1
  292. package/dist/pipeline/grader-comparison.js +1 -1
  293. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  294. package/dist/pipeline/grader-consistency-runner.js +7 -1
  295. package/dist/pipeline/grader-consistency.d.ts +1 -1
  296. package/dist/pipeline/grader-consistency.js +1 -1
  297. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  298. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  299. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  300. package/dist/pipeline/grader-sensitivity.js +1 -1
  301. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  302. package/dist/pipeline/grader-validate-runner.js +2 -2
  303. package/dist/pipeline/grader-validation.d.ts +1 -1
  304. package/dist/pipeline/grader-validation.js +1 -1
  305. package/dist/pipeline/map-request-to-config.js +15 -2
  306. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  307. package/dist/pipeline/mirror-repo-tasks.js +1 -1
  308. package/dist/pipeline/plan-format.d.ts +1 -1
  309. package/dist/pipeline/plan-format.js +1 -1
  310. package/dist/pipeline/plan.d.ts +1 -1
  311. package/dist/pipeline/plan.js +67 -29
  312. package/dist/pipeline/probe.d.ts +1 -1
  313. package/dist/pipeline/probe.js +1 -1
  314. package/dist/pipeline/readiness-report.d.ts +2 -2
  315. package/dist/pipeline/readiness-report.js +2 -2
  316. package/dist/pipeline/release-classification.d.ts +1 -1
  317. package/dist/pipeline/release-classification.js +1 -1
  318. package/dist/pipeline/release-report.d.ts +1 -1
  319. package/dist/pipeline/release-report.js +1 -1
  320. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  321. package/dist/pipeline/repo-eval-comment.js +1 -1
  322. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  323. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  324. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  325. package/dist/pipeline/resolve-mappings.js +44 -44
  326. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  327. package/dist/pipeline/retrieval-metrics.js +28 -20
  328. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  329. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  330. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  331. package/dist/pipeline/steps/compare-step.js +90 -0
  332. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  333. package/dist/pipeline/steps/eval-step.js +347 -0
  334. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  335. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  336. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  337. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  338. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  339. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  340. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  341. package/dist/pipeline/steps/publish-report-step.js +243 -0
  342. package/dist/pipeline/steps/report-step.d.ts +13 -0
  343. package/dist/pipeline/steps/report-step.js +56 -0
  344. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  345. package/dist/pipeline/steps/update-scores-step.js +42 -0
  346. package/dist/pipeline/targeted-loo.d.ts +1 -1
  347. package/dist/pipeline/targeted-loo.js +1 -1
  348. package/dist/pipeline/thresholds.d.ts +1 -1
  349. package/dist/pipeline/thresholds.js +1 -1
  350. package/dist/pipeline/validate.js +13 -0
  351. package/dist/report-store.d.ts +17 -0
  352. package/dist/report-store.js +24 -0
  353. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  354. package/dist/scripts/agent-behavior-report.js +315 -0
  355. package/dist/scripts/baseline.d.ts +43 -0
  356. package/dist/scripts/baseline.js +267 -0
  357. package/dist/scripts/calculate-scores.d.ts +166 -0
  358. package/dist/scripts/calculate-scores.js +1296 -0
  359. package/dist/scripts/compare.d.ts +22 -0
  360. package/dist/scripts/compare.js +334 -0
  361. package/dist/scripts/coverage-audit.d.ts +44 -0
  362. package/dist/scripts/coverage-audit.js +209 -0
  363. package/dist/scripts/debug-eval.d.ts +19 -0
  364. package/dist/scripts/debug-eval.js +73 -0
  365. package/dist/scripts/discovery-report.d.ts +58 -0
  366. package/dist/scripts/discovery-report.js +250 -0
  367. package/dist/scripts/fetch-docs.d.ts +35 -0
  368. package/dist/scripts/fetch-docs.js +472 -0
  369. package/dist/scripts/generate-configs.d.ts +66 -0
  370. package/dist/scripts/generate-configs.js +459 -0
  371. package/dist/scripts/grader-api.d.ts +27 -0
  372. package/dist/scripts/grader-api.js +206 -0
  373. package/dist/scripts/grader-compare.d.ts +22 -0
  374. package/dist/scripts/grader-compare.js +368 -0
  375. package/dist/scripts/grader-consistency.d.ts +20 -0
  376. package/dist/scripts/grader-consistency.js +313 -0
  377. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  378. package/dist/scripts/grader-sensitivity.js +354 -0
  379. package/dist/scripts/grader-validate.d.ts +19 -0
  380. package/dist/scripts/grader-validate.js +267 -0
  381. package/dist/scripts/measure-retrieval.d.ts +10 -0
  382. package/dist/scripts/measure-retrieval.js +145 -0
  383. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  384. package/dist/scripts/migrate-task-mode.js +1 -1
  385. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  386. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  387. package/dist/scripts/pipeline.d.ts +76 -0
  388. package/dist/scripts/pipeline.js +1031 -0
  389. package/dist/scripts/pr-comment.d.ts +10 -0
  390. package/dist/scripts/pr-comment.js +510 -0
  391. package/dist/scripts/readiness-report.d.ts +88 -0
  392. package/dist/scripts/readiness-report.js +342 -0
  393. package/dist/scripts/update-quality-scores.d.ts +15 -0
  394. package/dist/scripts/update-quality-scores.js +184 -0
  395. package/dist/scripts/validate-task-sources.d.ts +1 -1
  396. package/dist/scripts/validate-task-sources.js +1 -1
  397. package/dist/scripts/validate.d.ts +13 -0
  398. package/dist/scripts/validate.js +79 -0
  399. package/dist/scripts/webhook-server.d.ts +26 -0
  400. package/dist/scripts/webhook-server.js +147 -0
  401. package/dist/scripts/weekly-digest.d.ts +24 -0
  402. package/dist/scripts/weekly-digest.js +144 -0
  403. package/dist/sinks/format-slack.d.ts +64 -0
  404. package/dist/sinks/format-slack.js +306 -0
  405. package/dist/sinks/slack-sink.d.ts +27 -0
  406. package/dist/sinks/slack-sink.js +78 -0
  407. package/dist/sinks/types.d.ts +1 -1
  408. package/dist/sinks/types.js +1 -1
  409. package/dist/sinks/webhook-sink.d.ts +19 -0
  410. package/dist/sinks/webhook-sink.js +50 -0
  411. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  412. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  413. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  414. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  415. package/dist/tasks/literacy/functions.task.ts +70 -0
  416. package/dist/tasks/literacy/groq.task.ts +259 -0
  417. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  418. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  419. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  420. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  421. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  422. package/package.json +24 -24
  423. package/tasks/.expanded.agentic.yaml +280 -0
  424. package/tasks/.expanded.yaml +565 -0
  425. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  426. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  427. package/tasks/literacy/content-lake.task.ts +181 -0
  428. package/tasks/literacy/frameworks.task.ts +1 -0
  429. package/tasks/literacy/functions.task.ts +1 -0
  430. package/tasks/literacy/groq.task.ts +1 -0
  431. package/tasks/literacy/image-handling.task.ts +95 -0
  432. package/tasks/literacy/nextjs-live.task.ts +2 -1
  433. package/tasks/literacy/portable-text.task.ts +169 -0
  434. package/tasks/literacy/studio-setup.task.ts +5 -2
  435. package/tasks/literacy/visual-editing.task.ts +1 -0
  436. package/LICENSE +0 -21
  437. package/tasks/frameworks.yaml +0 -98
  438. package/tasks/functions.yaml +0 -51
  439. package/tasks/groq.yaml +0 -216
  440. package/tasks/nextjs-live.yaml +0 -62
  441. package/tasks/studio-setup.yaml +0 -111
  442. package/tasks/visual-editing.yaml +0 -120
@@ -10,11 +10,13 @@
10
10
  * and stores a `releaseAutoScope` entry in PipelineState. Downstream
11
11
  * steps (GenerateConfigsStep, RunEvalStep) use this to narrow scope.
12
12
  */
13
- import { mkdirSync, writeFileSync } from "fs";
13
+ import { existsSync, mkdirSync, writeFileSync } from "fs";
14
14
  import { join } from "path";
15
15
  import { isIdRef, isPathRef, isSlugRef, } from "../../_vendor/ailf-core/index.js";
16
16
  import { getStepInputPaths } from "../../pipeline/cache.js";
17
+ import { buildCacheContext } from "../cache-context.js";
17
18
  import { checkCanonicalContextsExist } from "../../pipeline/checks.js";
19
+ import { loadPipelineTasks } from "../load-pipeline-tasks.js";
18
20
  import { loadSource } from "../../sources.js";
19
21
  import { configToSourceOverrides } from "../config-to-source-overrides.js";
20
22
  export class FetchDocsStep {
@@ -27,8 +29,15 @@ export class FetchDocsStep {
27
29
  return { status: "skipped", reason: "--skip-fetch" };
28
30
  }
29
31
  const start = Date.now();
30
- // Precondition: at least one task has canonical doc mappings
31
- const allTasks = await ctx.taskSource.loadTasks(buildFilter(ctx));
32
+ // Load tasks from the filesystem the same source GenerateConfigsStep
33
+ // uses. This replaces ctx.taskSource (ContentLakeTaskSource) which may
34
+ // have no ailf.task documents, causing a mismatch where generated
35
+ // configs reference context files that were never fetched.
36
+ const allTasks = await loadPipelineTasks({
37
+ rootDir: ctx.config.rootDir,
38
+ mode: ctx.config.mode,
39
+ repoTasksPath: ctx.config.repoTasksPath,
40
+ });
32
41
  // Bridge: narrow to literacy tasks for canonical doc access
33
42
  const literacyTasks = allTasks.filter((t) => t.mode === "literacy");
34
43
  const tasksWithDocs = literacyTasks.filter((t) => (t.context?.docs?.length ?? 0) > 0);
@@ -71,6 +80,21 @@ export class FetchDocsStep {
71
80
  if (result.metadata) {
72
81
  writeMetadataFiles(ctx.config.rootDir, result.metadata);
73
82
  }
83
+ // Capture metadata files (mode-specific extras)
84
+ if (ctx.collector.extrasEnabled) {
85
+ const contextsDir = join(ctx.config.rootDir, "contexts");
86
+ for (const [type, filename] of [
87
+ ["document-manifest", "document-manifest.json"],
88
+ ["release-impact", "release-impact.json"],
89
+ ["document-overlay", "document-overlay.json"],
90
+ ["url-fetch", "url-fetch.json"],
91
+ ]) {
92
+ const filePath = join(contextsDir, filename);
93
+ if (existsSync(filePath)) {
94
+ ctx.collector.captureFile("fetch-docs", type, filePath);
95
+ }
96
+ }
97
+ }
74
98
  }
75
99
  catch (err) {
76
100
  return {
@@ -117,19 +141,9 @@ export class FetchDocsStep {
117
141
  cacheInputs(ctx) {
118
142
  return getStepInputPaths(ctx.config.rootDir, "fetch-docs");
119
143
  }
120
- }
121
- // ---------------------------------------------------------------------------
122
- // Helpers
123
- // ---------------------------------------------------------------------------
124
- function buildFilter(ctx) {
125
- const { areas, tasks, tags } = ctx.config;
126
- if (!areas && !tasks && !tags)
127
- return undefined;
128
- return {
129
- ...(areas ? { areas } : {}),
130
- ...(tasks ? { taskIds: tasks } : {}),
131
- ...(tags ? { tags } : {}),
132
- };
144
+ cacheContext(ctx) {
145
+ return buildCacheContext(ctx.config);
146
+ }
133
147
  }
134
148
  /**
135
149
  * Write metadata files returned by DocFetcher to the contexts/ directory.
@@ -14,7 +14,7 @@
14
14
  *
15
15
  * This is an optional step — failure doesn't stop the pipeline.
16
16
  */
17
- import { existsSync, readFileSync, writeFileSync } from "fs";
17
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
18
18
  import { join, resolve } from "path";
19
19
  import { isSlugRef } from "../../_vendor/ailf-core/index.js";
20
20
  export class GapAnalysisStep {
@@ -56,7 +56,9 @@ export class GapAnalysisStep {
56
56
  console.log(formatFailureModesConsole(failureModeReport));
57
57
  const gapReport = buildGapAnalysisReport(failureModeReport, scoreSummary.scores);
58
58
  console.log(formatGapAnalysisConsole(gapReport));
59
- const outDir = resolve(root, "results", "latest");
59
+ // Write user-facing artifacts to outputDir (respects --output-dir)
60
+ const outDir = ctx.config.outputDir;
61
+ mkdirSync(outDir, { recursive: true });
60
62
  writeFileSync(join(outDir, "failure-modes.json"), JSON.stringify(failureModeReport, null, 2));
61
63
  writeFileSync(join(outDir, "gap-analysis.json"), JSON.stringify(gapReport, null, 2));
62
64
  const manifestPath = resolve(root, "contexts", "document-manifest.json");
@@ -166,6 +168,15 @@ export class GapAnalysisStep {
166
168
  scores: enrichedScores,
167
169
  };
168
170
  writeFileSync(scoreSummaryPath, JSON.stringify(enrichedSummary, null, 2));
171
+ // Capture gap analysis artifacts
172
+ const failureModesPath = join(outDir, "failure-modes.json");
173
+ if (existsSync(failureModesPath)) {
174
+ ctx.collector.captureFile("gap-analysis", "failure-modes", failureModesPath);
175
+ }
176
+ const gapReportPath = join(outDir, "gap-analysis.json");
177
+ if (existsSync(gapReportPath)) {
178
+ ctx.collector.captureFile("gap-analysis", "gap-report", gapReportPath);
179
+ }
169
180
  const gapCount = gapReport.gaps.length;
170
181
  const classRate = failureModeReport.classificationRate.toFixed(0);
171
182
  return {
@@ -24,4 +24,5 @@ export declare class GenerateConfigsStep implements PipelineStep {
24
24
  private compileAll;
25
25
  private checkLiteracyPostconditions;
26
26
  cacheInputs(ctx: AppContext): string[];
27
+ cacheContext(ctx: AppContext): string[];
27
28
  }
@@ -8,8 +8,12 @@
8
8
  * When the variant is "full", the handler is called twice (baseline + agentic)
9
9
  * and three YAML files are written. Other modes produce one YAML file.
10
10
  */
11
+ import { existsSync } from "node:fs";
12
+ import { resolve } from "node:path";
11
13
  import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
14
+ import { modelMatchesLiteracyVariant } from "../../pipeline/compiler/mode-bases/literacy.js";
12
15
  import { getStepInputPaths } from "../../pipeline/cache.js";
16
+ import { buildCacheContext } from "../cache-context.js";
13
17
  import { checkGeneratedConfigsExist } from "../../pipeline/checks.js";
14
18
  import { validateModelsYaml } from "../../pipeline/validate.js";
15
19
  import { loadSource } from "../../sources.js";
@@ -85,21 +89,14 @@ export class GenerateConfigsStep {
85
89
  // ---------------------------------------------------------------------------
86
90
  async compileLiteracyVariants(ctx, handler, tasks, models, providers, start) {
87
91
  ctx.logger.info(`Compiling ${tasks.length} literacy task(s) via registry handler...`);
88
- // Filter models per variant
92
+ // Filter models per variant using shared literacy variant matcher
89
93
  const baselineModels = models.models
90
- .filter((m) => !m.modes || m.modes.includes(LiteracyVariant.STANDARD))
91
- .map((m) => ({
92
- id: m.id,
93
- label: m.label,
94
- }));
94
+ .filter((m) => modelMatchesLiteracyVariant(m, "baseline"))
95
+ .map((m) => ({ id: m.id, label: m.label }));
95
96
  const agenticModels = models.models
96
- .filter((m) => !m.modes ||
97
- m.modes.includes("agentic-naive") ||
98
- m.modes.includes("agentic-optimized"))
99
- .map((m) => ({
100
- id: m.id,
101
- label: m.label,
102
- }));
97
+ .filter((m) => modelMatchesLiteracyVariant(m, "agentic-naive") ||
98
+ modelMatchesLiteracyVariant(m, "agentic-optimized"))
99
+ .map((m) => ({ id: m.id, label: m.label }));
103
100
  // Load rubric config for template resolution
104
101
  let rubricConfig;
105
102
  try {
@@ -137,6 +134,14 @@ export class GenerateConfigsStep {
137
134
  maxConcurrency: models.maxConcurrency,
138
135
  logger: ctx.logger,
139
136
  });
137
+ // Capture generated config files (use configFileForMode for legacy naming)
138
+ const { configFileForMode } = await import("../../pipeline/eval-constants.js");
139
+ for (const variant of ["baseline", "agentic", "observed"]) {
140
+ const configPath = resolve(ctx.config.rootDir, configFileForMode(variant));
141
+ if (existsSync(configPath)) {
142
+ ctx.collector.captureFile("generate-configs", `promptfoo-config-${variant}`, configPath, { mode: "literacy", variant });
143
+ }
144
+ }
140
145
  return this.checkLiteracyPostconditions(ctx, start);
141
146
  }
142
147
  // ---------------------------------------------------------------------------
@@ -168,6 +173,18 @@ export class GenerateConfigsStep {
168
173
  maxConcurrency: models.maxConcurrency,
169
174
  logger: ctx.logger,
170
175
  });
176
+ // Capture generated config file
177
+ const configPath = resolve(ctx.config.rootDir, `promptfooconfig.${mode}.yaml`);
178
+ if (existsSync(configPath)) {
179
+ ctx.collector.captureFile("generate-configs", "promptfoo-config", configPath, { mode });
180
+ }
181
+ // Capture mode-specific test artifacts (extras)
182
+ if (ctx.collector.extrasEnabled) {
183
+ const testsPath = resolve(ctx.config.rootDir, "results", "latest", `${mode}-tests.json`);
184
+ if (existsSync(testsPath)) {
185
+ ctx.collector.captureFile("generate-configs", `${mode}-tests`, testsPath, { mode });
186
+ }
187
+ }
171
188
  return {
172
189
  durationMs: Date.now() - start,
173
190
  status: "success",
@@ -180,8 +197,11 @@ export class GenerateConfigsStep {
180
197
  async loadTasks(ctx, mode, state) {
181
198
  const { resolve } = await import("path");
182
199
  const { discoverTsTaskFiles, loadTsTaskFile } = await import("../../adapters/task-sources/task-file-loader.js");
183
- // Discover task files from the mode-specific directory and --repo-tasks-path
184
- const tasksDir = resolve(ctx.config.rootDir, "tasks", mode);
200
+ const { resolveVendoredSubdir } = await import("../../pipeline/compiler/config-loader.js");
201
+ // Discover task files from the mode-specific directory and --repo-tasks-path.
202
+ // Use vendored copies in dist/ when @sanity/ailf-core isn't resolvable
203
+ // (i.e., running outside the monorepo via npx).
204
+ const tasksDir = resolveVendoredSubdir(ctx.config.rootDir, `tasks/${mode}`);
185
205
  const dirs = [tasksDir];
186
206
  // Also search --repo-tasks-path (e.g., .ailf/tasks/) for repo-based tasks
187
207
  if (ctx.config.repoTasksPath) {
@@ -191,6 +211,7 @@ export class GenerateConfigsStep {
191
211
  }
192
212
  }
193
213
  const tasks = [];
214
+ const skippedByMode = new Map();
194
215
  for (const dir of dirs) {
195
216
  const files = discoverTsTaskFiles(dir);
196
217
  for (const file of files) {
@@ -201,9 +222,20 @@ export class GenerateConfigsStep {
201
222
  if (!("mode" in task) || task.mode === mode) {
202
223
  tasks.push(task);
203
224
  }
225
+ else {
226
+ const taskMode = task.mode ?? "unknown";
227
+ skippedByMode.set(taskMode, (skippedByMode.get(taskMode) ?? 0) + 1);
228
+ }
204
229
  }
205
230
  }
206
231
  }
232
+ if (skippedByMode.size > 0) {
233
+ const total = [...skippedByMode.values()].reduce((a, b) => a + b, 0);
234
+ const summary = [...skippedByMode.entries()]
235
+ .map(([m, n]) => `${n} ${m}`)
236
+ .join(", ");
237
+ ctx.logger.warn(` ⚠ Skipped ${total} task(s) with non-matching mode (${summary}). Current pipeline mode: ${mode}. Run with --mode <mode> to include them.`);
238
+ }
207
239
  // Apply area/task/tag filters
208
240
  const filtered = this.applyFilters(ctx, tasks);
209
241
  // Release auto-scope
@@ -280,6 +312,9 @@ export class GenerateConfigsStep {
280
312
  cacheInputs(ctx) {
281
313
  return getStepInputPaths(ctx.config.rootDir, "generate-configs");
282
314
  }
315
+ cacheContext(ctx) {
316
+ return buildCacheContext(ctx.config);
317
+ }
283
318
  }
284
319
  // ---------------------------------------------------------------------------
285
320
  // Helpers
@@ -10,7 +10,7 @@
10
10
  * exist before evaluation begins.
11
11
  *
12
12
  * @see packages/eval/src/pipeline/mirror-repo-tasks.ts
13
- * @see docs/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
13
+ * @see docs/archive/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
14
14
  */
15
15
  import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
16
16
  export declare class MirrorRepoTasksStep implements PipelineStep {
@@ -10,7 +10,7 @@
10
10
  * exist before evaluation begins.
11
11
  *
12
12
  * @see packages/eval/src/pipeline/mirror-repo-tasks.ts
13
- * @see docs/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
13
+ * @see docs/archive/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
14
14
  */
15
15
  import { getSanityClient } from "../../sanity/client.js";
16
16
  import { detectGitContext, mirrorRepoTasks, } from "../../pipeline/mirror-repo-tasks.js";
@@ -115,12 +115,31 @@ export class PublishReportStep {
115
115
  };
116
116
  // Share reportId with downstream steps (CallbackStep + orchestrator job update)
117
117
  state.reportId = reportId;
118
+ // Capture report object (Tier 2)
119
+ ctx.collector.capture("publish-report", "report-object", report);
120
+ // Capture auto-comparison if present (Tier 2)
121
+ if (comparison) {
122
+ ctx.collector.capture("publish-report", "auto-comparison", comparison);
123
+ }
118
124
  // Write to store (system of record — best-effort, P5)
119
125
  const sanityResult = ctx.reportStore
120
126
  ? await ctx.reportStore.write(report)
121
127
  : null;
122
128
  // Run sinks (fire-and-forget, P6)
123
129
  const publishResult = await runSinks(report, ctx);
130
+ // Capture sink results (Tier 2)
131
+ if (publishResult.sinkResults.length > 0) {
132
+ ctx.collector.capture("publish-report", "sink-results", {
133
+ sinkCount: publishResult.sinkResults.length,
134
+ results: publishResult.sinkResults.map((r) => ({
135
+ name: r.name,
136
+ status: r.result.status,
137
+ ...(r.result.status === "success" ? { detail: r.result.detail } : {}),
138
+ ...(r.result.status === "failed" ? { error: r.result.error } : {}),
139
+ ...(r.result.status === "skipped" ? { reason: r.result.reason } : {}),
140
+ })),
141
+ });
142
+ }
124
143
  // Build result summary
125
144
  const parts = [];
126
145
  if (sanityResult) {
@@ -4,7 +4,7 @@
4
4
  * Calls pure functions from pipeline/readiness-report.ts directly.
5
5
  * Optional step — failure doesn't stop the pipeline.
6
6
  */
7
- import { existsSync, readFileSync, writeFileSync } from "fs";
7
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
8
8
  import { resolve } from "path";
9
9
  import { tryLoadConfigFile } from "../../pipeline/compiler/config-loader.js";
10
10
  import { formatReadinessMarkdown, generateReadinessReport, } from "../../pipeline/readiness-report.js";
@@ -37,7 +37,8 @@ export class ReadinessStep {
37
37
  }
38
38
  const scoreSummary = JSON.parse(readFileSync(scoreSummaryPath, "utf-8"));
39
39
  const thresholdConfig = ThresholdConfigSchema.parse(thresholdsLoaded.data);
40
- const gapPath = resolve(root, "results", "latest", "gap-analysis.json");
40
+ // Read gap-analysis.json from outputDir (gap-analysis step writes there)
41
+ const gapPath = resolve(ctx.config.outputDir, "gap-analysis.json");
41
42
  const gapAnalysis = existsSync(gapPath)
42
43
  ? JSON.parse(readFileSync(gapPath, "utf-8"))
43
44
  : undefined;
@@ -60,7 +61,11 @@ export class ReadinessStep {
60
61
  console.log(md);
61
62
  }
62
63
  if (readinessLines.length > 0) {
63
- writeFileSync(resolve(root, "results", "latest", "readiness-report.md"), readinessLines.join("\n---\n\n"));
64
+ // Write to outputDir (respects --output-dir)
65
+ mkdirSync(ctx.config.outputDir, { recursive: true });
66
+ const readinessPath = resolve(ctx.config.outputDir, "readiness-report.md");
67
+ writeFileSync(readinessPath, readinessLines.join("\n---\n\n"));
68
+ ctx.collector.captureFile("readiness", "readiness-report", readinessPath);
64
69
  }
65
70
  const passCount = readinessAreas.filter((area) => {
66
71
  const areaScore = scoreSummary.scores.find((s) => s.feature === area);
@@ -4,10 +4,10 @@
4
4
  * Calls generatePrComment() from pipeline/pr-comment.ts with typed options.
5
5
  * No env bridge or process.argv manipulation needed.
6
6
  */
7
- import { resolve } from "path";
7
+ import { existsSync, mkdirSync } from "node:fs";
8
+ import { dirname, resolve } from "path";
8
9
  import { checkScoreSummaryValid } from "../../pipeline/checks.js";
9
10
  import { generatePrComment } from "../../pipeline/pr-comment.js";
10
- const DEFAULT_REPORT_PATH = "results/latest/pr-comment.md";
11
11
  export class ReportStep {
12
12
  name = "report";
13
13
  check() {
@@ -15,7 +15,7 @@ export class ReportStep {
15
15
  }
16
16
  async execute(ctx) {
17
17
  const start = Date.now();
18
- // Precondition: score summary exists
18
+ // Precondition: score summary exists (intermediate files stay in rootDir)
19
19
  const summaryIssues = checkScoreSummaryValid(ctx.config.rootDir);
20
20
  const summaryErrors = summaryIssues.filter((i) => i.severity === "error");
21
21
  if (summaryErrors.length > 0) {
@@ -25,7 +25,12 @@ export class ReportStep {
25
25
  status: "failed",
26
26
  };
27
27
  }
28
- const resolvedOutput = ctx.config.outputPath ?? resolve(ctx.config.rootDir, DEFAULT_REPORT_PATH);
28
+ // User-facing output: --output flag wins, else outputDir
29
+ const resolvedOutput = ctx.config.outputPath ?? resolve(ctx.config.outputDir, "pr-comment.md");
30
+ // Ensure outputDir exists before writing (it may be a custom --output-dir
31
+ // that hasn't been created yet — writePipelineResult runs after the
32
+ // orchestrator returns, so we can't rely on it).
33
+ mkdirSync(dirname(resolvedOutput), { recursive: true });
29
34
  try {
30
35
  generatePrComment({
31
36
  outputPath: resolvedOutput,
@@ -40,6 +45,14 @@ export class ReportStep {
40
45
  status: "failed",
41
46
  };
42
47
  }
48
+ // Capture report artifacts
49
+ if (existsSync(resolvedOutput)) {
50
+ ctx.collector.captureFile("report", "pr-comment", resolvedOutput);
51
+ }
52
+ const pipelineResultPath = resolve(ctx.config.outputDir, "pipeline-result.json");
53
+ if (existsSync(pipelineResultPath)) {
54
+ ctx.collector.captureFile("report", "pipeline-result", pipelineResultPath);
55
+ }
43
56
  return {
44
57
  durationMs: Date.now() - start,
45
58
  status: "success",
@@ -13,4 +13,5 @@ export declare class RunEvalStep implements PipelineStep {
13
13
  check(): ValidationIssue[];
14
14
  execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
15
15
  cacheInputs(ctx: AppContext): string[];
16
+ cacheContext(ctx: AppContext): string[];
16
17
  }
@@ -8,6 +8,7 @@
8
8
  import { existsSync, mkdirSync, writeFileSync } from "fs";
9
9
  import { resolve } from "path";
10
10
  import { getStepInputPaths } from "../../pipeline/cache.js";
11
+ import { buildCacheContext } from "../cache-context.js";
11
12
  import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../../pipeline/checks.js";
12
13
  import { computeEvalFingerprint } from "../../pipeline/eval-fingerprint.js";
13
14
  import { buildFilterFlags, configFileForMode, resultsFileForMode, scanResultsForErrors, } from "../../pipeline/eval-constants.js";
@@ -38,38 +39,41 @@ export class RunEvalStep {
38
39
  };
39
40
  }
40
41
  // Precondition: canonical context files exist for filtered tasks.
41
- // Must apply the same area/task filter as fetch-docs so we only
42
- // check contexts that were actually fetched.
43
- const filter = ctx.config.areas || ctx.config.tasks || ctx.config.tags
44
- ? {
45
- ...(ctx.config.areas ? { areas: ctx.config.areas } : {}),
46
- ...(ctx.config.tasks ? { taskIds: ctx.config.tasks } : {}),
47
- ...(ctx.config.tags ? { tags: ctx.config.tags } : {}),
42
+ // Only applies to literacy mode — other modes don't use canonical doc contexts.
43
+ if (this.mode === "literacy") {
44
+ // Must apply the same area/task filter as fetch-docs so we only
45
+ // check contexts that were actually fetched.
46
+ const filter = ctx.config.areas || ctx.config.tasks || ctx.config.tags
47
+ ? {
48
+ ...(ctx.config.areas ? { areas: ctx.config.areas } : {}),
49
+ ...(ctx.config.tasks ? { taskIds: ctx.config.tasks } : {}),
50
+ ...(ctx.config.tags ? { tags: ctx.config.tags } : {}),
51
+ }
52
+ : undefined;
53
+ let tasks = await ctx.taskSource.loadTasks(filter);
54
+ // Release auto-scope: narrow to affected tasks (mirrors GenerateConfigsStep)
55
+ if (state.releaseAutoScope && !ctx.config.noAutoScope) {
56
+ const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
57
+ tasks = tasks.filter((t) => scopedIds.has(t.id));
58
+ }
59
+ // Only check context files for tasks that have canonical docs.
60
+ // Tasks without canonical docs are skipped by FetchDocsStep (they
61
+ // have no docs to fetch), so no context file is written for them.
62
+ // The generated Promptfoo config still includes their "without-docs"
63
+ // variant (testing model knowledge alone), which doesn't need a
64
+ // context file.
65
+ // Bridge: narrow to literacy tasks with docs
66
+ const tasksWithDocs = tasks.filter((t) => t.mode === "literacy" && (t.context?.docs?.length ?? 0) > 0);
67
+ const taskIds = tasksWithDocs.map((t) => t.id);
68
+ const contextIssues = checkCanonicalContextsExist(rootDir, taskIds);
69
+ const contextErrors = contextIssues.filter((i) => i.severity === "error");
70
+ if (contextErrors.length > 0) {
71
+ return {
72
+ durationMs: Date.now() - start,
73
+ error: `Context files missing. Run fetch-docs first. ${contextErrors.map((e) => e.message).join("; ")}`,
74
+ status: "failed",
75
+ };
48
76
  }
49
- : undefined;
50
- let tasks = await ctx.taskSource.loadTasks(filter);
51
- // Release auto-scope: narrow to affected tasks (mirrors GenerateConfigsStep)
52
- if (state.releaseAutoScope && !ctx.config.noAutoScope) {
53
- const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
54
- tasks = tasks.filter((t) => scopedIds.has(t.id));
55
- }
56
- // Only check context files for tasks that have canonical docs.
57
- // Tasks without canonical docs are skipped by FetchDocsStep (they
58
- // have no docs to fetch), so no context file is written for them.
59
- // The generated Promptfoo config still includes their "without-docs"
60
- // variant (testing model knowledge alone), which doesn't need a
61
- // context file.
62
- // Bridge: narrow to literacy tasks with docs
63
- const tasksWithDocs = tasks.filter((t) => t.mode === "literacy" && (t.context?.docs?.length ?? 0) > 0);
64
- const taskIds = tasksWithDocs.map((t) => t.id);
65
- const contextIssues = checkCanonicalContextsExist(rootDir, taskIds);
66
- const contextErrors = contextIssues.filter((i) => i.severity === "error");
67
- if (contextErrors.length > 0) {
68
- return {
69
- durationMs: Date.now() - start,
70
- error: `Context files missing. Run fetch-docs first. ${contextErrors.map((e) => e.message).join("; ")}`,
71
- status: "failed",
72
- };
73
77
  }
74
78
  // -----------------------------------------------------------------
75
79
  // Compute eval fingerprint (for remote cache + provenance)
@@ -109,6 +113,11 @@ export class RunEvalStep {
109
113
  // required eval modes were satisfied from the remote cache.
110
114
  state.remoteCacheHits ??= new Set();
111
115
  state.remoteCacheHits.add(this.mode);
116
+ // Capture the restored score-summary from remote cache
117
+ const cachedSummaryPath = resolve(rootDir, "results", "latest", "score-summary.json");
118
+ if (existsSync(cachedSummaryPath)) {
119
+ ctx.collector.captureFile("run-eval", "score-summary-cached", cachedSummaryPath, { source: "remote-cache", mode: this.mode });
120
+ }
112
121
  return {
113
122
  durationMs: Date.now() - start,
114
123
  status: "success",
@@ -143,6 +152,7 @@ export class RunEvalStep {
143
152
  configPath: configFile,
144
153
  env: subprocessEnv,
145
154
  filterFlags: filterFlags.trim() || undefined,
155
+ maxDurationMs: ctx.config.evalBudgetMs,
146
156
  });
147
157
  // Check if results were written despite non-zero exit
148
158
  if (result.status === "failed") {
@@ -172,6 +182,13 @@ export class RunEvalStep {
172
182
  console.log();
173
183
  console.log(errorSummary);
174
184
  }
185
+ // Capture eval results
186
+ const resultsPath = resolve(rootDir, resultsFileForMode(this.mode));
187
+ if (existsSync(resultsPath)) {
188
+ ctx.collector.captureFile("run-eval", `eval-results-${this.mode}`, resultsPath, {
189
+ mode: this.mode,
190
+ });
191
+ }
175
192
  const durationMs = Date.now() - start;
176
193
  return {
177
194
  durationMs,
@@ -182,6 +199,9 @@ export class RunEvalStep {
182
199
  cacheInputs(ctx) {
183
200
  return getStepInputPaths(ctx.config.rootDir, `eval-${this.mode}`);
184
201
  }
202
+ cacheContext(ctx) {
203
+ return buildCacheContext(ctx.config);
204
+ }
185
205
  }
186
206
  // ---------------------------------------------------------------------------
187
207
  // Remote cache helpers
@@ -47,6 +47,12 @@ export const CANONICAL_DOC_MAP = {
47
47
  // ---------------------------------------------------------------------------
48
48
  export function detectFeatureArea(description) {
49
49
  const desc = description.toLowerCase();
50
+ if (desc.includes("portable text"))
51
+ return "portable-text";
52
+ if (desc.includes("content lake"))
53
+ return "content-lake";
54
+ if (desc.includes("image handling") || desc.includes("image asset"))
55
+ return "image-handling";
50
56
  if (desc.includes("studio"))
51
57
  return "studio-setup";
52
58
  if (desc.includes("visual") ||
@@ -13,7 +13,7 @@
13
13
  * layered on top for ambiguous cases when higher precision is needed.
14
14
  *
15
15
  * @see docs/design-docs/scenario-matrix/per-document-attribution.md
16
- * @see docs/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
16
+ * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
17
17
  */
18
18
  import type { AttributionReport, ComparisonReport } from "./types.js";
19
19
  import type { ResolvedMappings } from "./resolve-mappings.js";
@@ -13,7 +13,7 @@
13
13
  * layered on top for ambiguous cases when higher precision is needed.
14
14
  *
15
15
  * @see docs/design-docs/scenario-matrix/per-document-attribution.md
16
- * @see docs/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
16
+ * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
17
17
  */
18
18
  // ---------------------------------------------------------------------------
19
19
  // Public API
@@ -125,15 +125,18 @@ export function getStepInputPaths(rootDir, step) {
125
125
  }
126
126
  }
127
127
  // Task files (contain assertions and test definitions).
128
- // Exclude generated .expanded*.yaml files — those are already listed
129
- // explicitly above per mode.
128
+ // Task files live in tasks/{mode}/ subdirectories (e.g., tasks/literacy/)
130
129
  const tasksDir = r("tasks");
131
130
  if (existsSync(tasksDir)) {
132
- const taskFiles = readdirSync(tasksDir)
133
- .filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) &&
134
- !f.startsWith(".expanded"))
135
- .map((f) => join(tasksDir, f));
136
- paths.push(...taskFiles);
131
+ for (const entry of readdirSync(tasksDir)) {
132
+ const subDir = join(tasksDir, entry);
133
+ if (statSync(subDir).isDirectory()) {
134
+ const taskFiles = readdirSync(subDir)
135
+ .filter((f) => /\.(task\.ts|task\.js)$/.test(f))
136
+ .map((f) => join(subDir, f));
137
+ paths.push(...taskFiles);
138
+ }
139
+ }
137
140
  }
138
141
  // Reference solutions (used by grader assertions)
139
142
  const refDir = r("canonical/reference-solutions");
@@ -155,12 +158,18 @@ export function getStepInputPaths(rootDir, step) {
155
158
  if (modelsPath2)
156
159
  paths.push(modelsPath2);
157
160
  // Include all task files (they define feature areas)
161
+ // Task files live in tasks/{mode}/ subdirectories (e.g., tasks/literacy/)
158
162
  const tasksDir = r("tasks");
159
163
  if (existsSync(tasksDir)) {
160
- const taskFiles = readdirSync(tasksDir)
161
- .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
162
- .map((f) => join(tasksDir, f));
163
- paths.push(...taskFiles);
164
+ for (const entry of readdirSync(tasksDir)) {
165
+ const subDir = join(tasksDir, entry);
166
+ if (statSync(subDir).isDirectory()) {
167
+ const taskFiles = readdirSync(subDir)
168
+ .filter((f) => /\.(task\.ts|task\.js)$/.test(f))
169
+ .map((f) => join(subDir, f));
170
+ paths.push(...taskFiles);
171
+ }
172
+ }
164
173
  }
165
174
  return paths;
166
175
  }
@@ -175,10 +184,15 @@ export function getStepInputPaths(rootDir, step) {
175
184
  paths.push(sourcesPath2);
176
185
  const tasksDir = r("tasks");
177
186
  if (existsSync(tasksDir)) {
178
- const taskFiles = readdirSync(tasksDir)
179
- .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
180
- .map((f) => join(tasksDir, f));
181
- paths.push(...taskFiles);
187
+ for (const entry of readdirSync(tasksDir)) {
188
+ const subDir = join(tasksDir, entry);
189
+ if (statSync(subDir).isDirectory()) {
190
+ const taskFiles = readdirSync(subDir)
191
+ .filter((f) => /\.(task\.ts|task\.js)$/.test(f))
192
+ .map((f) => join(subDir, f));
193
+ paths.push(...taskFiles);
194
+ }
195
+ }
182
196
  }
183
197
  return paths;
184
198
  }
@@ -38,6 +38,8 @@ export interface RawTestResult {
38
38
  componentResults: ComponentResult[];
39
39
  pass: boolean;
40
40
  };
41
+ /** Per-test latency in ms (populated by Promptfoo when available) */
42
+ latencyMs?: number;
41
43
  metadata?: Record<string, unknown>;
42
44
  provider?: {
43
45
  id?: string;