@sanity/ailf 1.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (499)
  1. package/README.md +0 -1
  2. package/canonical/grader-references/README.md +2 -2
  3. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  4. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  5. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  6. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  7. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  8. package/config/features.ts +1 -1
  9. package/config/models.ts +29 -12
  10. package/config/sources.ts +1 -1
  11. package/config/thresholds.ts +1 -1
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  13. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  17. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  18. package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
  19. package/dist/_vendor/ailf-core/config-helpers.js +51 -2
  20. package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
  21. package/dist/_vendor/ailf-core/examples/index.js +213 -94
  22. package/dist/_vendor/ailf-core/index.d.ts +3 -2
  23. package/dist/_vendor/ailf-core/index.js +2 -1
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  25. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  27. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  28. package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
  29. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  30. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  31. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  32. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  33. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  34. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  35. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  40. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  41. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  42. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  43. package/dist/_vendor/ailf-core/services/index.js +1 -1
  44. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  47. package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  50. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  51. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  52. package/dist/adapters/api-client/remediation.js +2 -2
  53. package/dist/adapters/config-sources/file-config-adapter.js +7 -1
  54. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  55. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  56. package/dist/adapters/index.d.ts +0 -1
  57. package/dist/adapters/index.js +0 -1
  58. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  59. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  60. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  61. package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
  62. package/dist/adapters/task-sources/index.d.ts +3 -4
  63. package/dist/adapters/task-sources/index.js +3 -4
  64. package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
  65. package/dist/adapters/task-sources/repo-schemas.js +228 -20
  66. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  67. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  68. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  69. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  70. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  71. package/dist/adapters/task-sources/repo-validation.js +126 -5
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
  73. package/dist/adapters/task-sources/task-file-loader.js +21 -7
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/coverage-audit.js +3 -1
  95. package/dist/commands/explain-handler.d.ts +1 -1
  96. package/dist/commands/explain-handler.js +37 -8
  97. package/dist/commands/fetch-docs.js +1 -0
  98. package/dist/commands/generate-configs.d.ts +3 -3
  99. package/dist/commands/generate-configs.js +20 -8
  100. package/dist/commands/init.d.ts +5 -4
  101. package/dist/commands/init.js +190 -25
  102. package/dist/commands/pipeline-action.d.ts +7 -1
  103. package/dist/commands/pipeline-action.js +43 -19
  104. package/dist/commands/pipeline.d.ts +6 -1
  105. package/dist/commands/pipeline.js +7 -2
  106. package/dist/commands/pr-comment.js +1 -0
  107. package/dist/commands/publish.js +1 -0
  108. package/dist/commands/shared/help.js +2 -2
  109. package/dist/commands/update-quality-scores.d.ts +5 -0
  110. package/dist/commands/update-quality-scores.js +20 -0
  111. package/dist/commands/validate-tasks.d.ts +2 -2
  112. package/dist/commands/validate-tasks.js +26 -15
  113. package/dist/composition-root.d.ts +15 -4
  114. package/dist/composition-root.js +100 -55
  115. package/dist/config/features.ts +23 -0
  116. package/dist/config/models.ts +100 -0
  117. package/dist/config/prompts.ts +16 -0
  118. package/dist/config/rubrics.ts +225 -0
  119. package/dist/config/schedules.ts +47 -0
  120. package/dist/config/sinks.ts +37 -0
  121. package/dist/config/sources.ts +21 -0
  122. package/dist/config/thresholds.ts +61 -0
  123. package/dist/index.d.ts +41 -0
  124. package/dist/index.js +48 -0
  125. package/dist/lib/agent-behavior-report.d.ts +8 -0
  126. package/dist/lib/agent-behavior-report.js +185 -0
  127. package/dist/lib/baseline.d.ts +19 -0
  128. package/dist/lib/baseline.js +153 -0
  129. package/dist/lib/calculate-scores.d.ts +23 -0
  130. package/dist/lib/calculate-scores.js +42 -0
  131. package/dist/lib/compare.d.ts +18 -0
  132. package/dist/lib/compare.js +170 -0
  133. package/dist/lib/coverage-audit.d.ts +4 -0
  134. package/dist/lib/coverage-audit.js +42 -0
  135. package/dist/lib/discovery-report.d.ts +13 -0
  136. package/dist/lib/discovery-report.js +57 -0
  137. package/dist/lib/fetch-docs.d.ts +30 -0
  138. package/dist/lib/fetch-docs.js +171 -0
  139. package/dist/lib/generate-configs.d.ts +25 -0
  140. package/dist/lib/generate-configs.js +42 -0
  141. package/dist/lib/grader-api.d.ts +21 -0
  142. package/dist/lib/grader-api.js +34 -0
  143. package/dist/lib/grader-compare.d.ts +19 -0
  144. package/dist/lib/grader-compare.js +91 -0
  145. package/dist/lib/grader-consistency.d.ts +27 -0
  146. package/dist/lib/grader-consistency.js +79 -0
  147. package/dist/lib/grader-sensitivity.d.ts +19 -0
  148. package/dist/lib/grader-sensitivity.js +75 -0
  149. package/dist/lib/grader-validate.d.ts +19 -0
  150. package/dist/lib/grader-validate.js +78 -0
  151. package/dist/lib/measure-retrieval.d.ts +14 -0
  152. package/dist/lib/measure-retrieval.js +71 -0
  153. package/dist/lib/pr-comment.d.ts +16 -0
  154. package/dist/lib/pr-comment.js +28 -0
  155. package/dist/lib/readiness-report.d.ts +13 -0
  156. package/dist/lib/readiness-report.js +108 -0
  157. package/dist/lib/webhook-server.d.ts +11 -0
  158. package/dist/lib/webhook-server.js +24 -0
  159. package/dist/lib/weekly-digest.d.ts +24 -0
  160. package/dist/lib/weekly-digest.js +148 -0
  161. package/dist/orchestration/build-app-context.js +13 -0
  162. package/dist/orchestration/build-step-sequence.js +4 -2
  163. package/dist/orchestration/cache-context.d.ts +23 -0
  164. package/dist/orchestration/cache-context.js +43 -0
  165. package/dist/orchestration/env-bridge.d.ts +21 -0
  166. package/dist/orchestration/env-bridge.js +66 -0
  167. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  168. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  169. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  170. package/dist/orchestration/step-runner.js +5 -1
  171. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  172. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  173. package/dist/orchestration/steps/callback-step.js +10 -1
  174. package/dist/orchestration/steps/compare-step.js +6 -3
  175. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  176. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  177. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  178. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  179. package/dist/orchestration/steps/fetch-docs-step.js +32 -19
  180. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  181. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  182. package/dist/orchestration/steps/generate-configs-step.js +77 -26
  183. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  184. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  185. package/dist/orchestration/steps/publish-report-step.js +19 -0
  186. package/dist/orchestration/steps/readiness-step.js +8 -3
  187. package/dist/orchestration/steps/report-step.js +17 -4
  188. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  189. package/dist/orchestration/steps/run-eval-step.js +51 -31
  190. package/dist/pipeline/agent-behavior-report.js +6 -0
  191. package/dist/pipeline/attribution.d.ts +1 -1
  192. package/dist/pipeline/attribution.js +1 -1
  193. package/dist/pipeline/cache.js +29 -15
  194. package/dist/pipeline/calculate-scores.d.ts +2 -0
  195. package/dist/pipeline/calculate-scores.js +70 -33
  196. package/dist/pipeline/chronic-failures.d.ts +55 -0
  197. package/dist/pipeline/chronic-failures.js +110 -0
  198. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  199. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  200. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  201. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
  202. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  203. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  204. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  205. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  206. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  207. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  208. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  209. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  210. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  211. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  212. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  213. package/dist/pipeline/compiler/config-loader.js +42 -2
  214. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  215. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  216. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  217. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  218. package/dist/pipeline/compiler/index.d.ts +2 -5
  219. package/dist/pipeline/compiler/index.js +2 -5
  220. package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
  221. package/dist/pipeline/compiler/literacy-bridge.js +2 -2
  222. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  223. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  224. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  225. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  226. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  227. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  228. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
  229. package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
  230. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  231. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  232. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  233. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  234. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  235. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  236. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  237. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  238. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  239. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  240. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  241. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  242. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  245. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  246. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  247. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  248. package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
  249. package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
  250. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  251. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  252. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  253. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  254. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  255. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  256. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  257. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  258. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  259. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  260. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  261. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  262. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  263. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  264. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  265. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  266. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  267. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  268. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  269. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  270. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  271. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  272. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  273. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  274. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
  275. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  276. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  277. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  278. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  279. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  280. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  281. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  282. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  283. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  284. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
  285. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  286. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  287. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  288. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  289. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
  290. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
  291. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  292. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
  293. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  294. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
  295. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  296. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  297. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
  298. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
  299. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
  300. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  301. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  302. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  303. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  304. package/dist/pipeline/compiler/preset-loader.js +99 -0
  305. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
  306. package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
  307. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  308. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  309. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  310. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  311. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  312. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  313. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  314. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  315. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  316. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  317. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  318. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  319. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  320. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  321. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  322. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  323. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  324. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  325. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  326. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  327. package/dist/pipeline/compiler/task-bridge.js +92 -0
  328. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  329. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  330. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  331. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  332. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  333. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  334. package/dist/pipeline/coverage-audit.d.ts +1 -1
  335. package/dist/pipeline/coverage-audit.js +1 -1
  336. package/dist/pipeline/degradations.d.ts +1 -1
  337. package/dist/pipeline/degradations.js +1 -1
  338. package/dist/pipeline/expand-tasks.d.ts +2 -2
  339. package/dist/pipeline/expand-tasks.js +2 -2
  340. package/dist/pipeline/failure-modes.d.ts +1 -1
  341. package/dist/pipeline/failure-modes.js +13 -1
  342. package/dist/pipeline/gap-analysis.d.ts +1 -1
  343. package/dist/pipeline/gap-analysis.js +3 -1
  344. package/dist/pipeline/generate-configs.d.ts +2 -2
  345. package/dist/pipeline/generate-configs.js +16 -9
  346. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  347. package/dist/pipeline/grader-compare-runner.js +7 -1
  348. package/dist/pipeline/grader-comparison.d.ts +1 -1
  349. package/dist/pipeline/grader-comparison.js +1 -1
  350. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  351. package/dist/pipeline/grader-consistency-runner.js +7 -1
  352. package/dist/pipeline/grader-consistency.d.ts +1 -1
  353. package/dist/pipeline/grader-consistency.js +1 -1
  354. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  355. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  356. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  357. package/dist/pipeline/grader-sensitivity.js +1 -1
  358. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  359. package/dist/pipeline/grader-validate-runner.js +2 -2
  360. package/dist/pipeline/grader-validation.d.ts +1 -1
  361. package/dist/pipeline/grader-validation.js +1 -1
  362. package/dist/pipeline/map-request-to-config.js +16 -2
  363. package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
  364. package/dist/pipeline/mirror-repo-tasks.js +10 -10
  365. package/dist/pipeline/plan-format.d.ts +1 -1
  366. package/dist/pipeline/plan-format.js +1 -1
  367. package/dist/pipeline/plan.d.ts +1 -1
  368. package/dist/pipeline/plan.js +68 -30
  369. package/dist/pipeline/probe.d.ts +1 -1
  370. package/dist/pipeline/probe.js +1 -1
  371. package/dist/pipeline/readiness-report.d.ts +2 -2
  372. package/dist/pipeline/readiness-report.js +2 -2
  373. package/dist/pipeline/release-classification.d.ts +1 -1
  374. package/dist/pipeline/release-classification.js +1 -1
  375. package/dist/pipeline/release-report.d.ts +1 -1
  376. package/dist/pipeline/release-report.js +1 -1
  377. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  378. package/dist/pipeline/repo-eval-comment.js +1 -1
  379. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  380. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  381. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  382. package/dist/pipeline/resolve-mappings.js +44 -44
  383. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  384. package/dist/pipeline/retrieval-metrics.js +28 -20
  385. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  386. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  387. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  388. package/dist/pipeline/steps/compare-step.js +90 -0
  389. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  390. package/dist/pipeline/steps/eval-step.js +347 -0
  391. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  392. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  393. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  394. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  395. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  396. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  397. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  398. package/dist/pipeline/steps/publish-report-step.js +243 -0
  399. package/dist/pipeline/steps/report-step.d.ts +13 -0
  400. package/dist/pipeline/steps/report-step.js +56 -0
  401. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  402. package/dist/pipeline/steps/update-scores-step.js +42 -0
  403. package/dist/pipeline/targeted-loo.d.ts +1 -1
  404. package/dist/pipeline/targeted-loo.js +1 -1
  405. package/dist/pipeline/thresholds.d.ts +1 -1
  406. package/dist/pipeline/thresholds.js +1 -1
  407. package/dist/pipeline/validate.js +13 -0
  408. package/dist/report-store.d.ts +17 -0
  409. package/dist/report-store.js +24 -0
  410. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  411. package/dist/scripts/agent-behavior-report.js +315 -0
  412. package/dist/scripts/baseline.d.ts +43 -0
  413. package/dist/scripts/baseline.js +267 -0
  414. package/dist/scripts/calculate-scores.d.ts +166 -0
  415. package/dist/scripts/calculate-scores.js +1296 -0
  416. package/dist/scripts/compare.d.ts +22 -0
  417. package/dist/scripts/compare.js +334 -0
  418. package/dist/scripts/coverage-audit.d.ts +44 -0
  419. package/dist/scripts/coverage-audit.js +209 -0
  420. package/dist/scripts/debug-eval.d.ts +19 -0
  421. package/dist/scripts/debug-eval.js +73 -0
  422. package/dist/scripts/discovery-report.d.ts +58 -0
  423. package/dist/scripts/discovery-report.js +250 -0
  424. package/dist/scripts/fetch-docs.d.ts +35 -0
  425. package/dist/scripts/fetch-docs.js +472 -0
  426. package/dist/scripts/generate-configs.d.ts +66 -0
  427. package/dist/scripts/generate-configs.js +459 -0
  428. package/dist/scripts/grader-api.d.ts +27 -0
  429. package/dist/scripts/grader-api.js +206 -0
  430. package/dist/scripts/grader-compare.d.ts +22 -0
  431. package/dist/scripts/grader-compare.js +368 -0
  432. package/dist/scripts/grader-consistency.d.ts +20 -0
  433. package/dist/scripts/grader-consistency.js +313 -0
  434. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  435. package/dist/scripts/grader-sensitivity.js +354 -0
  436. package/dist/scripts/grader-validate.d.ts +19 -0
  437. package/dist/scripts/grader-validate.js +267 -0
  438. package/dist/scripts/measure-retrieval.d.ts +10 -0
  439. package/dist/scripts/measure-retrieval.js +145 -0
  440. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  441. package/dist/scripts/migrate-task-mode.js +1 -1
  442. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  443. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  444. package/dist/scripts/pipeline.d.ts +76 -0
  445. package/dist/scripts/pipeline.js +1031 -0
  446. package/dist/scripts/pr-comment.d.ts +10 -0
  447. package/dist/scripts/pr-comment.js +510 -0
  448. package/dist/scripts/readiness-report.d.ts +88 -0
  449. package/dist/scripts/readiness-report.js +342 -0
  450. package/dist/scripts/update-quality-scores.d.ts +15 -0
  451. package/dist/scripts/update-quality-scores.js +184 -0
  452. package/dist/scripts/validate-task-sources.d.ts +1 -1
  453. package/dist/scripts/validate-task-sources.js +1 -1
  454. package/dist/scripts/validate.d.ts +13 -0
  455. package/dist/scripts/validate.js +79 -0
  456. package/dist/scripts/webhook-server.d.ts +26 -0
  457. package/dist/scripts/webhook-server.js +147 -0
  458. package/dist/scripts/weekly-digest.d.ts +24 -0
  459. package/dist/scripts/weekly-digest.js +144 -0
  460. package/dist/sinks/format-slack.d.ts +64 -0
  461. package/dist/sinks/format-slack.js +306 -0
  462. package/dist/sinks/slack-sink.d.ts +27 -0
  463. package/dist/sinks/slack-sink.js +78 -0
  464. package/dist/sinks/types.d.ts +1 -1
  465. package/dist/sinks/types.js +1 -1
  466. package/dist/sinks/webhook-sink.d.ts +19 -0
  467. package/dist/sinks/webhook-sink.js +50 -0
  468. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  469. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  470. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  471. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  472. package/dist/tasks/literacy/functions.task.ts +70 -0
  473. package/dist/tasks/literacy/groq.task.ts +259 -0
  474. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  475. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  476. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  477. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  478. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  479. package/package.json +32 -24
  480. package/tasks/.expanded.agentic.yaml +280 -0
  481. package/tasks/.expanded.yaml +565 -0
  482. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  483. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  484. package/tasks/literacy/content-lake.task.ts +181 -0
  485. package/tasks/literacy/frameworks.task.ts +1 -0
  486. package/tasks/literacy/functions.task.ts +1 -0
  487. package/tasks/literacy/groq.task.ts +1 -0
  488. package/tasks/literacy/image-handling.task.ts +95 -0
  489. package/tasks/literacy/nextjs-live.task.ts +2 -1
  490. package/tasks/literacy/portable-text.task.ts +169 -0
  491. package/tasks/literacy/studio-setup.task.ts +5 -2
  492. package/tasks/literacy/visual-editing.task.ts +1 -0
  493. package/LICENSE +0 -21
  494. package/tasks/frameworks.yaml +0 -98
  495. package/tasks/functions.yaml +0 -51
  496. package/tasks/groq.yaml +0 -216
  497. package/tasks/nextjs-live.yaml +0 -62
  498. package/tasks/studio-setup.yaml +0 -111
  499. package/tasks/visual-editing.yaml +0 -120
@@ -8,8 +8,12 @@
8
8
  * When the variant is "full", the handler is called twice (baseline + agentic)
9
9
  * and three YAML files are written. Other modes produce one YAML file.
10
10
  */
11
+ import { existsSync } from "node:fs";
12
+ import { resolve } from "node:path";
11
13
  import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
14
+ import { modelMatchesLiteracyVariant } from "../../pipeline/compiler/mode-bases/literacy.js";
12
15
  import { getStepInputPaths } from "../../pipeline/cache.js";
16
+ import { buildCacheContext } from "../cache-context.js";
13
17
  import { checkGeneratedConfigsExist } from "../../pipeline/checks.js";
14
18
  import { validateModelsYaml } from "../../pipeline/validate.js";
15
19
  import { loadSource } from "../../sources.js";
@@ -85,21 +89,14 @@ export class GenerateConfigsStep {
85
89
  // ---------------------------------------------------------------------------
86
90
  async compileLiteracyVariants(ctx, handler, tasks, models, providers, start) {
87
91
  ctx.logger.info(`Compiling ${tasks.length} literacy task(s) via registry handler...`);
88
- // Filter models per variant
92
+ // Filter models per variant using shared literacy variant matcher
89
93
  const baselineModels = models.models
90
- .filter((m) => !m.modes || m.modes.includes(LiteracyVariant.STANDARD))
91
- .map((m) => ({
92
- id: m.id,
93
- label: m.label,
94
- }));
94
+ .filter((m) => modelMatchesLiteracyVariant(m, "baseline"))
95
+ .map((m) => ({ id: m.id, label: m.label }));
95
96
  const agenticModels = models.models
96
- .filter((m) => !m.modes ||
97
- m.modes.includes("agentic-naive") ||
98
- m.modes.includes("agentic-optimized"))
99
- .map((m) => ({
100
- id: m.id,
101
- label: m.label,
102
- }));
97
+ .filter((m) => modelMatchesLiteracyVariant(m, "agentic-naive") ||
98
+ modelMatchesLiteracyVariant(m, "agentic-optimized"))
99
+ .map((m) => ({ id: m.id, label: m.label }));
103
100
  // Load rubric config for template resolution
104
101
  let rubricConfig;
105
102
  try {
@@ -137,6 +134,14 @@ export class GenerateConfigsStep {
137
134
  maxConcurrency: models.maxConcurrency,
138
135
  logger: ctx.logger,
139
136
  });
137
+ // Capture generated config files (use configFileForMode for legacy naming)
138
+ const { configFileForMode } = await import("../../pipeline/eval-constants.js");
139
+ for (const variant of ["baseline", "agentic", "observed"]) {
140
+ const configPath = resolve(ctx.config.rootDir, configFileForMode(variant));
141
+ if (existsSync(configPath)) {
142
+ ctx.collector.captureFile("generate-configs", `promptfoo-config-${variant}`, configPath, { mode: "literacy", variant });
143
+ }
144
+ }
140
145
  return this.checkLiteracyPostconditions(ctx, start);
141
146
  }
142
147
  // ---------------------------------------------------------------------------
@@ -144,13 +149,18 @@ export class GenerateConfigsStep {
144
149
  // ---------------------------------------------------------------------------
145
150
  async compileSingleMode(ctx, handler, tasks, mode, models, start) {
146
151
  ctx.logger.info(`Compiling ${tasks.length} ${mode} task(s) via registry handler...`);
152
+ // Filter models to those that declare this mode in their modes array (models with no modes array are kept for every mode)
153
+ const modeModels = models.models
154
+ .filter((m) => !m.modes || m.modes.includes(mode))
155
+ .map((m) => ({
156
+ id: m.id,
157
+ label: m.label,
158
+ config: m.config,
159
+ }));
147
160
  const merged = this.compileAll(handler, tasks, {
148
161
  rootDir: ctx.config.rootDir,
149
162
  graderProvider: models.grader.id,
150
- models: models.models.map((m) => ({
151
- id: m.id,
152
- label: m.label,
153
- })),
163
+ models: modeModels,
154
164
  });
155
165
  for (const w of merged.warnings) {
156
166
  ctx.logger.warn(` ⚠ ${w}`);
@@ -163,6 +173,18 @@ export class GenerateConfigsStep {
163
173
  maxConcurrency: models.maxConcurrency,
164
174
  logger: ctx.logger,
165
175
  });
176
+ // Capture generated config file
177
+ const configPath = resolve(ctx.config.rootDir, `promptfooconfig.${mode}.yaml`);
178
+ if (existsSync(configPath)) {
179
+ ctx.collector.captureFile("generate-configs", "promptfoo-config", configPath, { mode });
180
+ }
181
+ // Capture mode-specific test artifacts (extras)
182
+ if (ctx.collector.extrasEnabled) {
183
+ const testsPath = resolve(ctx.config.rootDir, "results", "latest", `${mode}-tests.json`);
184
+ if (existsSync(testsPath)) {
185
+ ctx.collector.captureFile("generate-configs", `${mode}-tests`, testsPath, { mode });
186
+ }
187
+ }
166
188
  return {
167
189
  durationMs: Date.now() - start,
168
190
  status: "success",
@@ -175,19 +197,45 @@ export class GenerateConfigsStep {
175
197
  async loadTasks(ctx, mode, state) {
176
198
  const { resolve } = await import("path");
177
199
  const { discoverTsTaskFiles, loadTsTaskFile } = await import("../../adapters/task-sources/task-file-loader.js");
178
- const tasksDir = resolve(ctx.config.rootDir, "tasks", mode);
179
- const files = discoverTsTaskFiles(tasksDir);
200
+ const { resolveVendoredSubdir } = await import("../../pipeline/compiler/config-loader.js");
201
+ // Discover task files from the mode-specific directory and --repo-tasks-path.
202
+ // Use vendored copies in dist/ when @sanity/ailf-core isn't resolvable
203
+ // (i.e., running outside the monorepo via npx).
204
+ const tasksDir = resolveVendoredSubdir(ctx.config.rootDir, `tasks/${mode}`);
205
+ const dirs = [tasksDir];
206
+ // Also search --repo-tasks-path (e.g., .ailf/tasks/) for repo-based tasks
207
+ if (ctx.config.repoTasksPath) {
208
+ const repoDir = resolve(ctx.config.repoTasksPath);
209
+ if (!dirs.includes(repoDir)) {
210
+ dirs.push(repoDir);
211
+ }
212
+ }
180
213
  const tasks = [];
181
- for (const file of files) {
182
- const raw = await loadTsTaskFile(file);
183
- for (const t of raw.tasks) {
184
- const task = t;
185
- // Filter to matching mode (skip tasks from other modes in same dir)
186
- if (!("mode" in task) || task.mode === mode) {
187
- tasks.push(task);
214
+ const skippedByMode = new Map();
215
+ for (const dir of dirs) {
216
+ const files = discoverTsTaskFiles(dir);
217
+ for (const file of files) {
218
+ const raw = await loadTsTaskFile(file);
219
+ for (const t of raw.tasks) {
220
+ const task = t;
221
+ // Filter to matching mode (skip tasks from other modes in same dir)
222
+ if (!("mode" in task) || task.mode === mode) {
223
+ tasks.push(task);
224
+ }
225
+ else {
226
+ const taskMode = task.mode ?? "unknown";
227
+ skippedByMode.set(taskMode, (skippedByMode.get(taskMode) ?? 0) + 1);
228
+ }
188
229
  }
189
230
  }
190
231
  }
232
+ if (skippedByMode.size > 0) {
233
+ const total = [...skippedByMode.values()].reduce((a, b) => a + b, 0);
234
+ const summary = [...skippedByMode.entries()]
235
+ .map(([m, n]) => `${n} ${m}`)
236
+ .join(", ");
237
+ ctx.logger.warn(` ⚠ Skipped ${total} task(s) with non-matching mode (${summary}). Current pipeline mode: ${mode}. Run with --mode <mode> to include them.`);
238
+ }
191
239
  // Apply area/task/tag filters
192
240
  const filtered = this.applyFilters(ctx, tasks);
193
241
  // Release auto-scope
@@ -264,6 +312,9 @@ export class GenerateConfigsStep {
264
312
  cacheInputs(ctx) {
265
313
  return getStepInputPaths(ctx.config.rootDir, "generate-configs");
266
314
  }
315
+ cacheContext(ctx) {
316
+ return buildCacheContext(ctx.config);
317
+ }
267
318
  }
268
319
  // ---------------------------------------------------------------------------
269
320
  // Helpers
@@ -10,7 +10,7 @@
10
10
  * exist before evaluation begins.
11
11
  *
12
12
  * @see packages/eval/src/pipeline/mirror-repo-tasks.ts
13
- * @see docs/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
13
+ * @see docs/archive/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
14
14
  */
15
15
  import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
16
16
  export declare class MirrorRepoTasksStep implements PipelineStep {
@@ -10,7 +10,7 @@
10
10
  * exist before evaluation begins.
11
11
  *
12
12
  * @see packages/eval/src/pipeline/mirror-repo-tasks.ts
13
- * @see docs/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
13
+ * @see docs/archive/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
14
14
  */
15
15
  import { getSanityClient } from "../../sanity/client.js";
16
16
  import { detectGitContext, mirrorRepoTasks, } from "../../pipeline/mirror-repo-tasks.js";
@@ -115,12 +115,31 @@ export class PublishReportStep {
115
115
  };
116
116
  // Share reportId with downstream steps (CallbackStep + orchestrator job update)
117
117
  state.reportId = reportId;
118
+ // Capture report object (Tier 2)
119
+ ctx.collector.capture("publish-report", "report-object", report);
120
+ // Capture auto-comparison if present (Tier 2)
121
+ if (comparison) {
122
+ ctx.collector.capture("publish-report", "auto-comparison", comparison);
123
+ }
118
124
  // Write to store (system of record — best-effort, P5)
119
125
  const sanityResult = ctx.reportStore
120
126
  ? await ctx.reportStore.write(report)
121
127
  : null;
122
128
  // Run sinks (fire-and-forget, P6)
123
129
  const publishResult = await runSinks(report, ctx);
130
+ // Capture sink results (Tier 2)
131
+ if (publishResult.sinkResults.length > 0) {
132
+ ctx.collector.capture("publish-report", "sink-results", {
133
+ sinkCount: publishResult.sinkResults.length,
134
+ results: publishResult.sinkResults.map((r) => ({
135
+ name: r.name,
136
+ status: r.result.status,
137
+ ...(r.result.status === "success" ? { detail: r.result.detail } : {}),
138
+ ...(r.result.status === "failed" ? { error: r.result.error } : {}),
139
+ ...(r.result.status === "skipped" ? { reason: r.result.reason } : {}),
140
+ })),
141
+ });
142
+ }
124
143
  // Build result summary
125
144
  const parts = [];
126
145
  if (sanityResult) {
@@ -4,7 +4,7 @@
4
4
  * Calls pure functions from pipeline/readiness-report.ts directly.
5
5
  * Optional step — failure doesn't stop the pipeline.
6
6
  */
7
- import { existsSync, readFileSync, writeFileSync } from "fs";
7
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
8
8
  import { resolve } from "path";
9
9
  import { tryLoadConfigFile } from "../../pipeline/compiler/config-loader.js";
10
10
  import { formatReadinessMarkdown, generateReadinessReport, } from "../../pipeline/readiness-report.js";
@@ -37,7 +37,8 @@ export class ReadinessStep {
37
37
  }
38
38
  const scoreSummary = JSON.parse(readFileSync(scoreSummaryPath, "utf-8"));
39
39
  const thresholdConfig = ThresholdConfigSchema.parse(thresholdsLoaded.data);
40
- const gapPath = resolve(root, "results", "latest", "gap-analysis.json");
40
+ // Read gap-analysis.json from outputDir (gap-analysis step writes there)
41
+ const gapPath = resolve(ctx.config.outputDir, "gap-analysis.json");
41
42
  const gapAnalysis = existsSync(gapPath)
42
43
  ? JSON.parse(readFileSync(gapPath, "utf-8"))
43
44
  : undefined;
@@ -60,7 +61,11 @@ export class ReadinessStep {
60
61
  console.log(md);
61
62
  }
62
63
  if (readinessLines.length > 0) {
63
- writeFileSync(resolve(root, "results", "latest", "readiness-report.md"), readinessLines.join("\n---\n\n"));
64
+ // Write to outputDir (respects --output-dir)
65
+ mkdirSync(ctx.config.outputDir, { recursive: true });
66
+ const readinessPath = resolve(ctx.config.outputDir, "readiness-report.md");
67
+ writeFileSync(readinessPath, readinessLines.join("\n---\n\n"));
68
+ ctx.collector.captureFile("readiness", "readiness-report", readinessPath);
64
69
  }
65
70
  const passCount = readinessAreas.filter((area) => {
66
71
  const areaScore = scoreSummary.scores.find((s) => s.feature === area);
@@ -4,10 +4,10 @@
4
4
  * Calls generatePrComment() from pipeline/pr-comment.ts with typed options.
5
5
  * No env bridge or process.argv manipulation needed.
6
6
  */
7
- import { resolve } from "path";
7
+ import { existsSync, mkdirSync } from "node:fs";
8
+ import { dirname, resolve } from "path";
8
9
  import { checkScoreSummaryValid } from "../../pipeline/checks.js";
9
10
  import { generatePrComment } from "../../pipeline/pr-comment.js";
10
- const DEFAULT_REPORT_PATH = "results/latest/pr-comment.md";
11
11
  export class ReportStep {
12
12
  name = "report";
13
13
  check() {
@@ -15,7 +15,7 @@ export class ReportStep {
15
15
  }
16
16
  async execute(ctx) {
17
17
  const start = Date.now();
18
- // Precondition: score summary exists
18
+ // Precondition: score summary exists (intermediate files stay in rootDir)
19
19
  const summaryIssues = checkScoreSummaryValid(ctx.config.rootDir);
20
20
  const summaryErrors = summaryIssues.filter((i) => i.severity === "error");
21
21
  if (summaryErrors.length > 0) {
@@ -25,7 +25,12 @@ export class ReportStep {
25
25
  status: "failed",
26
26
  };
27
27
  }
28
- const resolvedOutput = ctx.config.outputPath ?? resolve(ctx.config.rootDir, DEFAULT_REPORT_PATH);
28
+ // User-facing output: --output flag wins, else outputDir
29
+ const resolvedOutput = ctx.config.outputPath ?? resolve(ctx.config.outputDir, "pr-comment.md");
30
+ // Ensure outputDir exists before writing (it may be a custom --output-dir
31
+ // that hasn't been created yet — writePipelineResult runs after the
32
+ // orchestrator returns, so we can't rely on it).
33
+ mkdirSync(dirname(resolvedOutput), { recursive: true });
29
34
  try {
30
35
  generatePrComment({
31
36
  outputPath: resolvedOutput,
@@ -40,6 +45,14 @@ export class ReportStep {
40
45
  status: "failed",
41
46
  };
42
47
  }
48
+ // Capture report artifacts
49
+ if (existsSync(resolvedOutput)) {
50
+ ctx.collector.captureFile("report", "pr-comment", resolvedOutput);
51
+ }
52
+ const pipelineResultPath = resolve(ctx.config.outputDir, "pipeline-result.json");
53
+ if (existsSync(pipelineResultPath)) {
54
+ ctx.collector.captureFile("report", "pipeline-result", pipelineResultPath);
55
+ }
43
56
  return {
44
57
  durationMs: Date.now() - start,
45
58
  status: "success",
@@ -13,4 +13,5 @@ export declare class RunEvalStep implements PipelineStep {
13
13
  check(): ValidationIssue[];
14
14
  execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
15
15
  cacheInputs(ctx: AppContext): string[];
16
+ cacheContext(ctx: AppContext): string[];
16
17
  }
@@ -8,6 +8,7 @@
8
8
  import { existsSync, mkdirSync, writeFileSync } from "fs";
9
9
  import { resolve } from "path";
10
10
  import { getStepInputPaths } from "../../pipeline/cache.js";
11
+ import { buildCacheContext } from "../cache-context.js";
11
12
  import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../../pipeline/checks.js";
12
13
  import { computeEvalFingerprint } from "../../pipeline/eval-fingerprint.js";
13
14
  import { buildFilterFlags, configFileForMode, resultsFileForMode, scanResultsForErrors, } from "../../pipeline/eval-constants.js";
@@ -38,38 +39,41 @@ export class RunEvalStep {
38
39
  };
39
40
  }
40
41
  // Precondition: canonical context files exist for filtered tasks.
41
- // Must apply the same area/task filter as fetch-docs so we only
42
- // check contexts that were actually fetched.
43
- const filter = ctx.config.areas || ctx.config.tasks || ctx.config.tags
44
- ? {
45
- ...(ctx.config.areas ? { areas: ctx.config.areas } : {}),
46
- ...(ctx.config.tasks ? { taskIds: ctx.config.tasks } : {}),
47
- ...(ctx.config.tags ? { tags: ctx.config.tags } : {}),
42
+ // Only applies to literacy mode — other modes don't use canonical doc contexts.
43
+ if (this.mode === "literacy") {
44
+ // Must apply the same area/task filter as fetch-docs so we only
45
+ // check contexts that were actually fetched.
46
+ const filter = ctx.config.areas || ctx.config.tasks || ctx.config.tags
47
+ ? {
48
+ ...(ctx.config.areas ? { areas: ctx.config.areas } : {}),
49
+ ...(ctx.config.tasks ? { taskIds: ctx.config.tasks } : {}),
50
+ ...(ctx.config.tags ? { tags: ctx.config.tags } : {}),
51
+ }
52
+ : undefined;
53
+ let tasks = await ctx.taskSource.loadTasks(filter);
54
+ // Release auto-scope: narrow to affected tasks (mirrors GenerateConfigsStep)
55
+ if (state.releaseAutoScope && !ctx.config.noAutoScope) {
56
+ const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
57
+ tasks = tasks.filter((t) => scopedIds.has(t.id));
58
+ }
59
+ // Only check context files for tasks that have canonical docs.
60
+ // Tasks without canonical docs are skipped by FetchDocsStep (they
61
+ // have no docs to fetch), so no context file is written for them.
62
+ // The generated Promptfoo config still includes their "without-docs"
63
+ // variant (testing model knowledge alone), which doesn't need a
64
+ // context file.
65
+ // Bridge: narrow to literacy tasks with docs
66
+ const tasksWithDocs = tasks.filter((t) => t.mode === "literacy" && (t.context?.docs?.length ?? 0) > 0);
67
+ const taskIds = tasksWithDocs.map((t) => t.id);
68
+ const contextIssues = checkCanonicalContextsExist(rootDir, taskIds);
69
+ const contextErrors = contextIssues.filter((i) => i.severity === "error");
70
+ if (contextErrors.length > 0) {
71
+ return {
72
+ durationMs: Date.now() - start,
73
+ error: `Context files missing. Run fetch-docs first. ${contextErrors.map((e) => e.message).join("; ")}`,
74
+ status: "failed",
75
+ };
48
76
  }
49
- : undefined;
50
- let tasks = await ctx.taskSource.loadTasks(filter);
51
- // Release auto-scope: narrow to affected tasks (mirrors GenerateConfigsStep)
52
- if (state.releaseAutoScope && !ctx.config.noAutoScope) {
53
- const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
54
- tasks = tasks.filter((t) => scopedIds.has(t.id));
55
- }
56
- // Only check context files for tasks that have canonical docs.
57
- // Tasks without canonical docs are skipped by FetchDocsStep (they
58
- // have no docs to fetch), so no context file is written for them.
59
- // The generated Promptfoo config still includes their "without-docs"
60
- // variant (testing model knowledge alone), which doesn't need a
61
- // context file.
62
- // Bridge: narrow to literacy tasks with docs
63
- const tasksWithDocs = tasks.filter((t) => t.mode === "literacy" && (t.context?.docs?.length ?? 0) > 0);
64
- const taskIds = tasksWithDocs.map((t) => t.id);
65
- const contextIssues = checkCanonicalContextsExist(rootDir, taskIds);
66
- const contextErrors = contextIssues.filter((i) => i.severity === "error");
67
- if (contextErrors.length > 0) {
68
- return {
69
- durationMs: Date.now() - start,
70
- error: `Context files missing. Run fetch-docs first. ${contextErrors.map((e) => e.message).join("; ")}`,
71
- status: "failed",
72
- };
73
77
  }
74
78
  // -----------------------------------------------------------------
75
79
  // Compute eval fingerprint (for remote cache + provenance)
@@ -109,6 +113,11 @@ export class RunEvalStep {
109
113
  // required eval modes were satisfied from the remote cache.
110
114
  state.remoteCacheHits ??= new Set();
111
115
  state.remoteCacheHits.add(this.mode);
116
+ // Capture the restored score-summary from remote cache
117
+ const cachedSummaryPath = resolve(rootDir, "results", "latest", "score-summary.json");
118
+ if (existsSync(cachedSummaryPath)) {
119
+ ctx.collector.captureFile("run-eval", "score-summary-cached", cachedSummaryPath, { source: "remote-cache", mode: this.mode });
120
+ }
112
121
  return {
113
122
  durationMs: Date.now() - start,
114
123
  status: "success",
@@ -143,6 +152,7 @@ export class RunEvalStep {
143
152
  configPath: configFile,
144
153
  env: subprocessEnv,
145
154
  filterFlags: filterFlags.trim() || undefined,
155
+ maxDurationMs: ctx.config.evalBudgetMs,
146
156
  });
147
157
  // Check if results were written despite non-zero exit
148
158
  if (result.status === "failed") {
@@ -172,6 +182,13 @@ export class RunEvalStep {
172
182
  console.log();
173
183
  console.log(errorSummary);
174
184
  }
185
+ // Capture eval results
186
+ const resultsPath = resolve(rootDir, resultsFileForMode(this.mode));
187
+ if (existsSync(resultsPath)) {
188
+ ctx.collector.captureFile("run-eval", `eval-results-${this.mode}`, resultsPath, {
189
+ mode: this.mode,
190
+ });
191
+ }
175
192
  const durationMs = Date.now() - start;
176
193
  return {
177
194
  durationMs,
@@ -182,6 +199,9 @@ export class RunEvalStep {
182
199
  cacheInputs(ctx) {
183
200
  return getStepInputPaths(ctx.config.rootDir, `eval-${this.mode}`);
184
201
  }
202
+ cacheContext(ctx) {
203
+ return buildCacheContext(ctx.config);
204
+ }
185
205
  }
186
206
  // ---------------------------------------------------------------------------
187
207
  // Remote cache helpers
@@ -47,6 +47,12 @@ export const CANONICAL_DOC_MAP = {
47
47
  // ---------------------------------------------------------------------------
48
48
  export function detectFeatureArea(description) {
49
49
  const desc = description.toLowerCase();
50
+ if (desc.includes("portable text"))
51
+ return "portable-text";
52
+ if (desc.includes("content lake"))
53
+ return "content-lake";
54
+ if (desc.includes("image handling") || desc.includes("image asset"))
55
+ return "image-handling";
50
56
  if (desc.includes("studio"))
51
57
  return "studio-setup";
52
58
  if (desc.includes("visual") ||
@@ -13,7 +13,7 @@
13
13
  * layered on top for ambiguous cases when higher precision is needed.
14
14
  *
15
15
  * @see docs/design-docs/scenario-matrix/per-document-attribution.md
16
- * @see docs/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
16
+ * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
17
17
  */
18
18
  import type { AttributionReport, ComparisonReport } from "./types.js";
19
19
  import type { ResolvedMappings } from "./resolve-mappings.js";
@@ -13,7 +13,7 @@
13
13
  * layered on top for ambiguous cases when higher precision is needed.
14
14
  *
15
15
  * @see docs/design-docs/scenario-matrix/per-document-attribution.md
16
- * @see docs/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
16
+ * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-2-impact-scenarios.md
17
17
  */
18
18
  // ---------------------------------------------------------------------------
19
19
  // Public API
@@ -125,15 +125,18 @@ export function getStepInputPaths(rootDir, step) {
125
125
  }
126
126
  }
127
127
  // Task files (contain assertions and test definitions).
128
- // Exclude generated .expanded*.yaml files — those are already listed
129
- // explicitly above per mode.
128
+ // Task files live in tasks/{mode}/ subdirectories (e.g., tasks/literacy/)
130
129
  const tasksDir = r("tasks");
131
130
  if (existsSync(tasksDir)) {
132
- const taskFiles = readdirSync(tasksDir)
133
- .filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) &&
134
- !f.startsWith(".expanded"))
135
- .map((f) => join(tasksDir, f));
136
- paths.push(...taskFiles);
131
+ for (const entry of readdirSync(tasksDir)) {
132
+ const subDir = join(tasksDir, entry);
133
+ if (statSync(subDir).isDirectory()) {
134
+ const taskFiles = readdirSync(subDir)
135
+ .filter((f) => /\.(task\.ts|task\.js)$/.test(f))
136
+ .map((f) => join(subDir, f));
137
+ paths.push(...taskFiles);
138
+ }
139
+ }
137
140
  }
138
141
  // Reference solutions (used by grader assertions)
139
142
  const refDir = r("canonical/reference-solutions");
@@ -155,12 +158,18 @@ export function getStepInputPaths(rootDir, step) {
155
158
  if (modelsPath2)
156
159
  paths.push(modelsPath2);
157
160
  // Include all task files (they define feature areas)
161
+ // Task files live in tasks/{mode}/ subdirectories (e.g., tasks/literacy/)
158
162
  const tasksDir = r("tasks");
159
163
  if (existsSync(tasksDir)) {
160
- const taskFiles = readdirSync(tasksDir)
161
- .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
162
- .map((f) => join(tasksDir, f));
163
- paths.push(...taskFiles);
164
+ for (const entry of readdirSync(tasksDir)) {
165
+ const subDir = join(tasksDir, entry);
166
+ if (statSync(subDir).isDirectory()) {
167
+ const taskFiles = readdirSync(subDir)
168
+ .filter((f) => /\.(task\.ts|task\.js)$/.test(f))
169
+ .map((f) => join(subDir, f));
170
+ paths.push(...taskFiles);
171
+ }
172
+ }
164
173
  }
165
174
  return paths;
166
175
  }
@@ -175,10 +184,15 @@ export function getStepInputPaths(rootDir, step) {
175
184
  paths.push(sourcesPath2);
176
185
  const tasksDir = r("tasks");
177
186
  if (existsSync(tasksDir)) {
178
- const taskFiles = readdirSync(tasksDir)
179
- .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
180
- .map((f) => join(tasksDir, f));
181
- paths.push(...taskFiles);
187
+ for (const entry of readdirSync(tasksDir)) {
188
+ const subDir = join(tasksDir, entry);
189
+ if (statSync(subDir).isDirectory()) {
190
+ const taskFiles = readdirSync(subDir)
191
+ .filter((f) => /\.(task\.ts|task\.js)$/.test(f))
192
+ .map((f) => join(subDir, f));
193
+ paths.push(...taskFiles);
194
+ }
195
+ }
182
196
  }
183
197
  return paths;
184
198
  }
@@ -38,6 +38,8 @@ export interface RawTestResult {
38
38
  componentResults: ComponentResult[];
39
39
  pass: boolean;
40
40
  };
41
+ /** Per-test latency in ms (populated by Promptfoo when available) */
42
+ latencyMs?: number;
41
43
  metadata?: Record<string, unknown>;
42
44
  provider?: {
43
45
  id?: string;