@sanity/ailf 1.0.0 → 2.0.1

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (499)
  1. package/README.md +0 -1
  2. package/canonical/grader-references/README.md +2 -2
  3. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  4. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  5. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  6. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  7. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  8. package/config/features.ts +1 -1
  9. package/config/models.ts +29 -12
  10. package/config/sources.ts +1 -1
  11. package/config/thresholds.ts +1 -1
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  13. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  17. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  18. package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
  19. package/dist/_vendor/ailf-core/config-helpers.js +51 -2
  20. package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
  21. package/dist/_vendor/ailf-core/examples/index.js +213 -94
  22. package/dist/_vendor/ailf-core/index.d.ts +3 -2
  23. package/dist/_vendor/ailf-core/index.js +2 -1
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  25. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  27. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  28. package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
  29. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  30. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  31. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  32. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  33. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  34. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  35. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  40. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  41. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  42. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  43. package/dist/_vendor/ailf-core/services/index.js +1 -1
  44. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  47. package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  50. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  51. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  52. package/dist/adapters/api-client/remediation.js +2 -2
  53. package/dist/adapters/config-sources/file-config-adapter.js +7 -1
  54. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  55. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  56. package/dist/adapters/index.d.ts +0 -1
  57. package/dist/adapters/index.js +0 -1
  58. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  59. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  60. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  61. package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
  62. package/dist/adapters/task-sources/index.d.ts +3 -4
  63. package/dist/adapters/task-sources/index.js +3 -4
  64. package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
  65. package/dist/adapters/task-sources/repo-schemas.js +228 -20
  66. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  67. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  68. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  69. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  70. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  71. package/dist/adapters/task-sources/repo-validation.js +126 -5
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
  73. package/dist/adapters/task-sources/task-file-loader.js +21 -7
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/coverage-audit.js +3 -1
  95. package/dist/commands/explain-handler.d.ts +1 -1
  96. package/dist/commands/explain-handler.js +37 -8
  97. package/dist/commands/fetch-docs.js +1 -0
  98. package/dist/commands/generate-configs.d.ts +3 -3
  99. package/dist/commands/generate-configs.js +20 -8
  100. package/dist/commands/init.d.ts +5 -4
  101. package/dist/commands/init.js +190 -25
  102. package/dist/commands/pipeline-action.d.ts +7 -1
  103. package/dist/commands/pipeline-action.js +43 -19
  104. package/dist/commands/pipeline.d.ts +6 -1
  105. package/dist/commands/pipeline.js +7 -2
  106. package/dist/commands/pr-comment.js +1 -0
  107. package/dist/commands/publish.js +1 -0
  108. package/dist/commands/shared/help.js +2 -2
  109. package/dist/commands/update-quality-scores.d.ts +5 -0
  110. package/dist/commands/update-quality-scores.js +20 -0
  111. package/dist/commands/validate-tasks.d.ts +2 -2
  112. package/dist/commands/validate-tasks.js +26 -15
  113. package/dist/composition-root.d.ts +15 -4
  114. package/dist/composition-root.js +100 -55
  115. package/dist/config/features.ts +23 -0
  116. package/dist/config/models.ts +100 -0
  117. package/dist/config/prompts.ts +16 -0
  118. package/dist/config/rubrics.ts +225 -0
  119. package/dist/config/schedules.ts +47 -0
  120. package/dist/config/sinks.ts +37 -0
  121. package/dist/config/sources.ts +21 -0
  122. package/dist/config/thresholds.ts +61 -0
  123. package/dist/index.d.ts +41 -0
  124. package/dist/index.js +48 -0
  125. package/dist/lib/agent-behavior-report.d.ts +8 -0
  126. package/dist/lib/agent-behavior-report.js +185 -0
  127. package/dist/lib/baseline.d.ts +19 -0
  128. package/dist/lib/baseline.js +153 -0
  129. package/dist/lib/calculate-scores.d.ts +23 -0
  130. package/dist/lib/calculate-scores.js +42 -0
  131. package/dist/lib/compare.d.ts +18 -0
  132. package/dist/lib/compare.js +170 -0
  133. package/dist/lib/coverage-audit.d.ts +4 -0
  134. package/dist/lib/coverage-audit.js +42 -0
  135. package/dist/lib/discovery-report.d.ts +13 -0
  136. package/dist/lib/discovery-report.js +57 -0
  137. package/dist/lib/fetch-docs.d.ts +30 -0
  138. package/dist/lib/fetch-docs.js +171 -0
  139. package/dist/lib/generate-configs.d.ts +25 -0
  140. package/dist/lib/generate-configs.js +42 -0
  141. package/dist/lib/grader-api.d.ts +21 -0
  142. package/dist/lib/grader-api.js +34 -0
  143. package/dist/lib/grader-compare.d.ts +19 -0
  144. package/dist/lib/grader-compare.js +91 -0
  145. package/dist/lib/grader-consistency.d.ts +27 -0
  146. package/dist/lib/grader-consistency.js +79 -0
  147. package/dist/lib/grader-sensitivity.d.ts +19 -0
  148. package/dist/lib/grader-sensitivity.js +75 -0
  149. package/dist/lib/grader-validate.d.ts +19 -0
  150. package/dist/lib/grader-validate.js +78 -0
  151. package/dist/lib/measure-retrieval.d.ts +14 -0
  152. package/dist/lib/measure-retrieval.js +71 -0
  153. package/dist/lib/pr-comment.d.ts +16 -0
  154. package/dist/lib/pr-comment.js +28 -0
  155. package/dist/lib/readiness-report.d.ts +13 -0
  156. package/dist/lib/readiness-report.js +108 -0
  157. package/dist/lib/webhook-server.d.ts +11 -0
  158. package/dist/lib/webhook-server.js +24 -0
  159. package/dist/lib/weekly-digest.d.ts +24 -0
  160. package/dist/lib/weekly-digest.js +148 -0
  161. package/dist/orchestration/build-app-context.js +13 -0
  162. package/dist/orchestration/build-step-sequence.js +4 -2
  163. package/dist/orchestration/cache-context.d.ts +23 -0
  164. package/dist/orchestration/cache-context.js +43 -0
  165. package/dist/orchestration/env-bridge.d.ts +21 -0
  166. package/dist/orchestration/env-bridge.js +66 -0
  167. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  168. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  169. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  170. package/dist/orchestration/step-runner.js +5 -1
  171. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  172. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  173. package/dist/orchestration/steps/callback-step.js +10 -1
  174. package/dist/orchestration/steps/compare-step.js +6 -3
  175. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  176. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  177. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  178. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  179. package/dist/orchestration/steps/fetch-docs-step.js +32 -19
  180. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  181. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  182. package/dist/orchestration/steps/generate-configs-step.js +77 -26
  183. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  184. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  185. package/dist/orchestration/steps/publish-report-step.js +19 -0
  186. package/dist/orchestration/steps/readiness-step.js +8 -3
  187. package/dist/orchestration/steps/report-step.js +17 -4
  188. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  189. package/dist/orchestration/steps/run-eval-step.js +51 -31
  190. package/dist/pipeline/agent-behavior-report.js +6 -0
  191. package/dist/pipeline/attribution.d.ts +1 -1
  192. package/dist/pipeline/attribution.js +1 -1
  193. package/dist/pipeline/cache.js +29 -15
  194. package/dist/pipeline/calculate-scores.d.ts +2 -0
  195. package/dist/pipeline/calculate-scores.js +70 -33
  196. package/dist/pipeline/chronic-failures.d.ts +55 -0
  197. package/dist/pipeline/chronic-failures.js +110 -0
  198. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  199. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  200. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  201. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
  202. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  203. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  204. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  205. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  206. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  207. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  208. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  209. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  210. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  211. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  212. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  213. package/dist/pipeline/compiler/config-loader.js +42 -2
  214. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  215. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  216. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  217. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  218. package/dist/pipeline/compiler/index.d.ts +2 -5
  219. package/dist/pipeline/compiler/index.js +2 -5
  220. package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
  221. package/dist/pipeline/compiler/literacy-bridge.js +2 -2
  222. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  223. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  224. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  225. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  226. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  227. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  228. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
  229. package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
  230. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  231. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  232. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  233. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  234. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  235. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  236. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  237. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  238. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  239. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  240. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  241. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  242. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  245. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  246. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  247. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  248. package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
  249. package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
  250. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  251. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  252. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  253. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  254. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  255. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  256. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  257. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  258. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  259. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  260. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  261. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  262. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  263. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  264. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  265. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  266. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  267. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  268. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  269. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  270. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  271. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  272. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  273. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  274. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
  275. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  276. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  277. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  278. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  279. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  280. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  281. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  282. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  283. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  284. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
  285. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  286. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  287. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  288. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  289. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
  290. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
  291. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  292. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
  293. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  294. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
  295. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  296. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  297. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
  298. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
  299. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
  300. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  301. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  302. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  303. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  304. package/dist/pipeline/compiler/preset-loader.js +99 -0
  305. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
  306. package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
  307. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  308. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  309. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  310. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  311. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  312. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  313. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  314. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  315. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  316. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  317. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  318. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  319. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  320. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  321. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  322. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  323. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  324. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  325. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  326. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  327. package/dist/pipeline/compiler/task-bridge.js +92 -0
  328. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  329. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  330. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  331. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  332. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  333. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  334. package/dist/pipeline/coverage-audit.d.ts +1 -1
  335. package/dist/pipeline/coverage-audit.js +1 -1
  336. package/dist/pipeline/degradations.d.ts +1 -1
  337. package/dist/pipeline/degradations.js +1 -1
  338. package/dist/pipeline/expand-tasks.d.ts +2 -2
  339. package/dist/pipeline/expand-tasks.js +2 -2
  340. package/dist/pipeline/failure-modes.d.ts +1 -1
  341. package/dist/pipeline/failure-modes.js +13 -1
  342. package/dist/pipeline/gap-analysis.d.ts +1 -1
  343. package/dist/pipeline/gap-analysis.js +3 -1
  344. package/dist/pipeline/generate-configs.d.ts +2 -2
  345. package/dist/pipeline/generate-configs.js +16 -9
  346. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  347. package/dist/pipeline/grader-compare-runner.js +7 -1
  348. package/dist/pipeline/grader-comparison.d.ts +1 -1
  349. package/dist/pipeline/grader-comparison.js +1 -1
  350. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  351. package/dist/pipeline/grader-consistency-runner.js +7 -1
  352. package/dist/pipeline/grader-consistency.d.ts +1 -1
  353. package/dist/pipeline/grader-consistency.js +1 -1
  354. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  355. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  356. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  357. package/dist/pipeline/grader-sensitivity.js +1 -1
  358. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  359. package/dist/pipeline/grader-validate-runner.js +2 -2
  360. package/dist/pipeline/grader-validation.d.ts +1 -1
  361. package/dist/pipeline/grader-validation.js +1 -1
  362. package/dist/pipeline/map-request-to-config.js +16 -2
  363. package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
  364. package/dist/pipeline/mirror-repo-tasks.js +10 -10
  365. package/dist/pipeline/plan-format.d.ts +1 -1
  366. package/dist/pipeline/plan-format.js +1 -1
  367. package/dist/pipeline/plan.d.ts +1 -1
  368. package/dist/pipeline/plan.js +68 -30
  369. package/dist/pipeline/probe.d.ts +1 -1
  370. package/dist/pipeline/probe.js +1 -1
  371. package/dist/pipeline/readiness-report.d.ts +2 -2
  372. package/dist/pipeline/readiness-report.js +2 -2
  373. package/dist/pipeline/release-classification.d.ts +1 -1
  374. package/dist/pipeline/release-classification.js +1 -1
  375. package/dist/pipeline/release-report.d.ts +1 -1
  376. package/dist/pipeline/release-report.js +1 -1
  377. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  378. package/dist/pipeline/repo-eval-comment.js +1 -1
  379. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  380. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  381. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  382. package/dist/pipeline/resolve-mappings.js +44 -44
  383. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  384. package/dist/pipeline/retrieval-metrics.js +28 -20
  385. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  386. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  387. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  388. package/dist/pipeline/steps/compare-step.js +90 -0
  389. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  390. package/dist/pipeline/steps/eval-step.js +347 -0
  391. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  392. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  393. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  394. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  395. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  396. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  397. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  398. package/dist/pipeline/steps/publish-report-step.js +243 -0
  399. package/dist/pipeline/steps/report-step.d.ts +13 -0
  400. package/dist/pipeline/steps/report-step.js +56 -0
  401. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  402. package/dist/pipeline/steps/update-scores-step.js +42 -0
  403. package/dist/pipeline/targeted-loo.d.ts +1 -1
  404. package/dist/pipeline/targeted-loo.js +1 -1
  405. package/dist/pipeline/thresholds.d.ts +1 -1
  406. package/dist/pipeline/thresholds.js +1 -1
  407. package/dist/pipeline/validate.js +13 -0
  408. package/dist/report-store.d.ts +17 -0
  409. package/dist/report-store.js +24 -0
  410. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  411. package/dist/scripts/agent-behavior-report.js +315 -0
  412. package/dist/scripts/baseline.d.ts +43 -0
  413. package/dist/scripts/baseline.js +267 -0
  414. package/dist/scripts/calculate-scores.d.ts +166 -0
  415. package/dist/scripts/calculate-scores.js +1296 -0
  416. package/dist/scripts/compare.d.ts +22 -0
  417. package/dist/scripts/compare.js +334 -0
  418. package/dist/scripts/coverage-audit.d.ts +44 -0
  419. package/dist/scripts/coverage-audit.js +209 -0
  420. package/dist/scripts/debug-eval.d.ts +19 -0
  421. package/dist/scripts/debug-eval.js +73 -0
  422. package/dist/scripts/discovery-report.d.ts +58 -0
  423. package/dist/scripts/discovery-report.js +250 -0
  424. package/dist/scripts/fetch-docs.d.ts +35 -0
  425. package/dist/scripts/fetch-docs.js +472 -0
  426. package/dist/scripts/generate-configs.d.ts +66 -0
  427. package/dist/scripts/generate-configs.js +459 -0
  428. package/dist/scripts/grader-api.d.ts +27 -0
  429. package/dist/scripts/grader-api.js +206 -0
  430. package/dist/scripts/grader-compare.d.ts +22 -0
  431. package/dist/scripts/grader-compare.js +368 -0
  432. package/dist/scripts/grader-consistency.d.ts +20 -0
  433. package/dist/scripts/grader-consistency.js +313 -0
  434. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  435. package/dist/scripts/grader-sensitivity.js +354 -0
  436. package/dist/scripts/grader-validate.d.ts +19 -0
  437. package/dist/scripts/grader-validate.js +267 -0
  438. package/dist/scripts/measure-retrieval.d.ts +10 -0
  439. package/dist/scripts/measure-retrieval.js +145 -0
  440. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  441. package/dist/scripts/migrate-task-mode.js +1 -1
  442. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  443. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  444. package/dist/scripts/pipeline.d.ts +76 -0
  445. package/dist/scripts/pipeline.js +1031 -0
  446. package/dist/scripts/pr-comment.d.ts +10 -0
  447. package/dist/scripts/pr-comment.js +510 -0
  448. package/dist/scripts/readiness-report.d.ts +88 -0
  449. package/dist/scripts/readiness-report.js +342 -0
  450. package/dist/scripts/update-quality-scores.d.ts +15 -0
  451. package/dist/scripts/update-quality-scores.js +184 -0
  452. package/dist/scripts/validate-task-sources.d.ts +1 -1
  453. package/dist/scripts/validate-task-sources.js +1 -1
  454. package/dist/scripts/validate.d.ts +13 -0
  455. package/dist/scripts/validate.js +79 -0
  456. package/dist/scripts/webhook-server.d.ts +26 -0
  457. package/dist/scripts/webhook-server.js +147 -0
  458. package/dist/scripts/weekly-digest.d.ts +24 -0
  459. package/dist/scripts/weekly-digest.js +144 -0
  460. package/dist/sinks/format-slack.d.ts +64 -0
  461. package/dist/sinks/format-slack.js +306 -0
  462. package/dist/sinks/slack-sink.d.ts +27 -0
  463. package/dist/sinks/slack-sink.js +78 -0
  464. package/dist/sinks/types.d.ts +1 -1
  465. package/dist/sinks/types.js +1 -1
  466. package/dist/sinks/webhook-sink.d.ts +19 -0
  467. package/dist/sinks/webhook-sink.js +50 -0
  468. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  469. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  470. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  471. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  472. package/dist/tasks/literacy/functions.task.ts +70 -0
  473. package/dist/tasks/literacy/groq.task.ts +259 -0
  474. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  475. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  476. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  477. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  478. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  479. package/package.json +32 -24
  480. package/tasks/.expanded.agentic.yaml +280 -0
  481. package/tasks/.expanded.yaml +565 -0
  482. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  483. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  484. package/tasks/literacy/content-lake.task.ts +181 -0
  485. package/tasks/literacy/frameworks.task.ts +1 -0
  486. package/tasks/literacy/functions.task.ts +1 -0
  487. package/tasks/literacy/groq.task.ts +1 -0
  488. package/tasks/literacy/image-handling.task.ts +95 -0
  489. package/tasks/literacy/nextjs-live.task.ts +2 -1
  490. package/tasks/literacy/portable-text.task.ts +169 -0
  491. package/tasks/literacy/studio-setup.task.ts +5 -2
  492. package/tasks/literacy/visual-editing.task.ts +1 -0
  493. package/LICENSE +0 -21
  494. package/tasks/frameworks.yaml +0 -98
  495. package/tasks/functions.yaml +0 -51
  496. package/tasks/groq.yaml +0 -216
  497. package/tasks/nextjs-live.yaml +0 -62
  498. package/tasks/studio-setup.yaml +0 -111
  499. package/tasks/visual-editing.yaml +0 -120
@@ -0,0 +1,94 @@
1
+ /**
2
+ * Port: ArtifactCollector — captures pipeline artifacts during execution.
3
+ *
4
+ * Injected into AppContext. When capture is disabled (default), the
5
+ * composition root provides NoOpArtifactCollector. When --capture is
6
+ * set, provides FilesystemArtifactCollector.
7
+ *
8
+ * Design principles:
9
+ * - P1: Zero-cost when off (no-op stub)
10
+ * - P2: Capture, don't intercept (steps call capture() explicitly)
11
+ * - P5: Non-blocking (failures swallowed, never block the pipeline)
12
+ */
13
+ /**
14
+ * The contract for artifact capture during pipeline execution.
15
+ *
16
+ * Steps call capture() for in-memory data and captureFile() for
17
+ * artifacts already on disk. The orchestrator calls flush() once
18
+ * at pipeline end to write everything to the configured destination.
19
+ */
20
+ export interface ArtifactCollector {
21
+ /**
22
+ * Record an in-memory artifact produced during pipeline execution.
23
+ *
24
+ * Callers need not check `enabled` before calling — the NoOp
25
+ * implementation is zero-cost, so unconditional calls are safe.
26
+ *
27
+ * @param step - Pipeline step name (e.g., "run-eval")
28
+ * @param type - Artifact type identifier (e.g., "eval-results")
29
+ * @param data - Content to serialize (JSON or text)
30
+ * @param meta - Optional metadata (variant, model, etc.)
31
+ */
32
+ capture(step: string, type: string, data: unknown, meta?: Record<string, unknown>): void;
33
+ /**
34
+ * Record a file reference for an artifact already on disk.
35
+ * The file is copied into the capture directory on flush().
36
+ *
37
+ * @param step - Pipeline step name
38
+ * @param type - Artifact type identifier
39
+ * @param filePath - Absolute path to the existing file
40
+ * @param meta - Optional metadata
41
+ */
42
+ captureFile(step: string, type: string, filePath: string, meta?: Record<string, unknown>): void;
43
+ /**
44
+ * Flush all captured artifacts to the configured destination.
45
+ * Called once at pipeline end by the orchestrator.
46
+ */
47
+ flush(): Promise<CaptureFlushResult>;
48
+ /** Whether capture is active */
49
+ readonly enabled: boolean;
50
+ /** Whether mode-specific extras are being captured */
51
+ readonly extrasEnabled: boolean;
52
+ }
53
+ /** Result of flushing captured artifacts to the destination. */
54
+ export interface CaptureFlushResult {
55
+ /** Total number of artifacts captured */
56
+ artifactCount: number;
57
+ /** Output path (directory or .tar.gz) */
58
+ destination: string;
59
+ /** Total bytes written (uncompressed) */
60
+ totalBytes: number;
61
+ /** Whether output was compressed */
62
+ compressed: boolean;
63
+ }
64
+ /** A single entry in the capture manifest. */
65
+ export interface ArtifactManifestEntry {
66
+ /** Pipeline step that produced this artifact */
67
+ step: string;
68
+ /** Artifact type identifier */
69
+ type: string;
70
+ /** Relative path within the capture directory */
71
+ path: string;
72
+ /** ISO 8601 timestamp of when capture() was called */
73
+ capturedAt: string;
74
+ /** Byte size of the artifact */
75
+ bytes: number;
76
+ /** Content format */
77
+ format: "json" | "markdown" | "text";
78
+ /** Optional metadata */
79
+ meta?: Record<string, unknown>;
80
+ }
81
+ /** The manifest.json written to each capture directory. */
82
+ export interface ArtifactManifest {
83
+ version: 1;
84
+ captureId: string;
85
+ startedAt: string;
86
+ completedAt: string;
87
+ pipeline: {
88
+ mode: string;
89
+ variant?: string;
90
+ source?: string;
91
+ areas?: string[];
92
+ };
93
+ artifacts: ArtifactManifestEntry[];
94
+ }
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Port: ArtifactCollector — captures pipeline artifacts during execution.
3
+ *
4
+ * Injected into AppContext. When capture is disabled (default), the
5
+ * composition root provides NoOpArtifactCollector. When --capture is
6
+ * set, provides FilesystemArtifactCollector.
7
+ *
8
+ * Design principles:
9
+ * - P1: Zero-cost when off (no-op stub)
10
+ * - P2: Capture, don't intercept (steps call capture() explicitly)
11
+ * - P5: Non-blocking (failures swallowed, never block the pipeline)
12
+ */
13
+ export {};
@@ -0,0 +1,138 @@
1
+ /**
2
+ * Types for cross-run capture comparison.
3
+ *
4
+ * The CaptureComparator reads two capture directories (baseline + experiment)
5
+ * and produces a CaptureDiffReport. Types are defined in core so external
6
+ * tooling can consume diff reports without depending on the eval package.
7
+ *
8
+ * Implementation lives in packages/eval/src/artifact-capture/comparator.ts.
9
+ */
10
+ /** How deeply to compare artifacts. */
11
+ export type ComparisonMode = "strict" | "structural" | "inventory";
12
+ /** Configurable thresholds for comparison. */
13
+ export interface ComparisonOptions {
14
+ /** Comparison depth: inventory (existence), structural (shape), strict (content) */
15
+ mode: ComparisonMode;
16
+ /** Score regression thresholds */
17
+ scoreThresholds?: {
18
+ /** Maximum allowed aggregate score delta (percentage points, default 5) */
19
+ aggregate: number;
20
+ /** Maximum allowed per-task score drop (points, default 10) */
21
+ perTask: number;
22
+ };
23
+ /** Timing regression thresholds */
24
+ timingThresholds?: {
25
+ /** Multiplier — flag steps exceeding this ratio (default 2.0) */
26
+ multiplier: number;
27
+ /** Per-step overrides (step name → custom multiplier) */
28
+ perStep?: Record<string, number>;
29
+ };
30
+ /** JSON structural diff depth (default 3) */
31
+ jsonDiffDepth?: number;
32
+ /** Additional ephemeral fields to ignore (merged with defaults) */
33
+ ephemeralFields?: string[];
34
+ }
35
+ /** Inventory diff — which artifacts exist in each capture. */
36
+ export interface InventoryDiff {
37
+ /** Artifact types in experiment but not in baseline */
38
+ added: string[];
39
+ /** Artifact types in baseline but not in experiment */
40
+ removed: string[];
41
+ /** Artifact types present in both */
42
+ common: string[];
43
+ }
44
+ /** A single structural change in a JSON artifact. */
45
+ export interface JsonDiffEntry {
46
+ /** JSON pointer path (e.g., "config.mode") */
47
+ path: string;
48
+ /** Value in baseline (undefined if key is added) */
49
+ baseline?: unknown;
50
+ /** Value in experiment (undefined if key is removed) */
51
+ experiment?: unknown;
52
+ }
53
+ /** Content diff for a single artifact. */
54
+ export interface ArtifactContentDiff {
55
+ /** Artifact type identifier (step/type) */
56
+ artifactKey: string;
57
+ /** Content format */
58
+ format: "json" | "markdown" | "text";
59
+ /** Structural changes (JSON) or line diff summary (text/markdown) */
60
+ changes: JsonDiffEntry[] | {
61
+ addedLines: number;
62
+ removedLines: number;
63
+ };
64
+ }
65
+ /** Score comparison between two captures. */
66
+ export interface ScoreComparison {
67
+ /** Baseline aggregate score */
68
+ baselineMean: number;
69
+ /** Experiment aggregate score */
70
+ currentMean: number;
71
+ /** Absolute delta (current - baseline) */
72
+ delta: number;
73
+ /** Per-task score deltas */
74
+ perTask: {
75
+ task: string;
76
+ baseline: number;
77
+ current: number;
78
+ delta: number;
79
+ }[];
80
+ /** Tasks that breached configured thresholds */
81
+ breaches: string[];
82
+ }
83
+ /** Timing comparison between two captures. */
84
+ export interface TimingComparison {
85
+ /** Total pipeline duration delta in ms */
86
+ totalDeltaMs: number;
87
+ /** Per-step timing */
88
+ perStep: {
89
+ step: string;
90
+ baselineMs: number;
91
+ currentMs: number;
92
+ ratio: number;
93
+ }[];
94
+ /** Steps that breached the timing multiplier threshold */
95
+ breaches: string[];
96
+ }
97
+ /** Metadata comparison between two captures. */
98
+ export interface MetadataComparison {
99
+ /** Whether pipeline modes match */
100
+ modeMatch: boolean;
101
+ /** Whether pipeline variants match */
102
+ variantMatch: boolean;
103
+ /** Config key differences */
104
+ configDiffs: JsonDiffEntry[];
105
+ }
106
+ /** Security scan results. */
107
+ export interface SecurityScan {
108
+ /** Whether any potential secret leaks were found */
109
+ leaksFound: boolean;
110
+ /** Details of each violation */
111
+ violations: {
112
+ /** Relative artifact file path */
113
+ file: string;
114
+ /** Description of the finding */
115
+ detail: string;
116
+ }[];
117
+ }
118
+ /** The full diff report produced by CaptureComparator. */
119
+ export interface CaptureDiffReport {
120
+ /** Are the two captures semantically equivalent? */
121
+ equivalent: boolean;
122
+ /** Human-readable summary (1-3 sentences) */
123
+ summary: string;
124
+ /** Comparison mode used */
125
+ mode: ComparisonMode;
126
+ /** Artifact inventory diff */
127
+ inventory: InventoryDiff;
128
+ /** Content diffs for common artifacts (structural/strict modes only) */
129
+ content?: ArtifactContentDiff[];
130
+ /** Score comparison (if score-summary exists in both captures) */
131
+ scores?: ScoreComparison;
132
+ /** Timing comparison (if pipeline-context exists in both captures) */
133
+ timing?: TimingComparison;
134
+ /** Metadata comparison */
135
+ metadata?: MetadataComparison;
136
+ /** Security scan results */
137
+ security: SecurityScan;
138
+ }
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Types for cross-run capture comparison.
3
+ *
4
+ * The CaptureComparator reads two capture directories (baseline + experiment)
5
+ * and produces a CaptureDiffReport. Types are defined in core so external
6
+ * tooling can consume diff reports without depending on the eval package.
7
+ *
8
+ * Implementation lives in packages/eval/src/artifact-capture/comparator.ts.
9
+ */
10
+ export {};
@@ -12,6 +12,7 @@
12
12
  * as downstream consumers are converted to use them.
13
13
  */
14
14
  import type { DebugOptions, EvalMode, PluginRegistry } from "../types/index.js";
15
+ import type { ArtifactCollector } from "./artifact-collector.js";
15
16
  import type { CacheStore } from "./cache-store.js";
16
17
  import type { DocFetcher } from "./doc-fetcher.js";
17
18
  import type { EvalRunner } from "./eval-runner.js";
@@ -78,6 +79,8 @@ export interface ResolvedConfig {
78
79
  noRemoteCache: boolean;
79
80
  /** Grader replications for consistency measurement */
80
81
  graderReplications?: number;
82
+ /** Base directory for user-facing pipeline output artifacts. */
83
+ outputDir: string;
81
84
  /** Output path override */
82
85
  outputPath?: string;
83
86
  /** Doc source URL overrides */
@@ -90,6 +93,12 @@ export interface ResolvedConfig {
90
93
  searchMode: "off" | "open" | "origin-only";
91
94
  /** Eval concurrency */
92
95
  concurrency?: number;
96
+ /**
97
+ * Maximum wall-clock time per eval step in ms.
98
+ * When exceeded, the subprocess is killed and partial results are used.
99
+ * Sourced from models config `evalBudgetMs`.
100
+ */
101
+ evalBudgetMs?: number;
93
102
  /** Promptfoo URL from eval output */
94
103
  promptfooUrl?: string;
95
104
  /** Sanity dataset override */
@@ -109,7 +118,7 @@ export interface ResolvedConfig {
109
118
  /** Before option for comparison */
110
119
  beforeOption?: string;
111
120
  /** Task source adapter selection */
112
- taskSourceType?: "content-lake" | "repo" | "yaml";
121
+ taskSourceType?: "content-lake" | "repo";
113
122
  /** Path to repo-based tasks directory (e.g., .ailf/tasks/) */
114
123
  repoTasksPath?: string;
115
124
  /** Report store project ID from .ailf/config.yaml reportStore block */
@@ -140,6 +149,16 @@ export interface ResolvedConfig {
140
149
  apiUrl: string;
141
150
  /** AILF API key (from AILF_API_KEY env var) */
142
151
  apiKey?: string;
152
+ /** External preset file paths or npm package names to load */
153
+ presets?: string[];
154
+ /** Whether artifact capture is enabled for this run (default: false) */
155
+ captureEnabled?: boolean;
156
+ /** Base directory for capture output (default: results/captures/) */
157
+ captureDir?: string;
158
+ /** Whether to compress capture output to tar.gz (default: true) */
159
+ captureCompress?: boolean;
160
+ /** Whether to include mode-specific extra artifacts (default: true) */
161
+ captureExtras?: boolean;
143
162
  }
144
163
  /**
145
164
  * Application context — the complete dependency carrier.
@@ -156,6 +175,8 @@ export interface ResolvedConfig {
156
175
  export interface AppContext {
157
176
  /** Evaluation caching (filesystem + optional Content Lake fallback) */
158
177
  readonly cache?: CacheStore;
178
+ /** Artifact capture collector (no-op when --capture is not set) */
179
+ readonly collector: ArtifactCollector;
159
180
  /** Resolved pipeline configuration */
160
181
  readonly config: ResolvedConfig;
161
182
  /** Documentation context fetcher */
@@ -15,6 +15,12 @@ export interface EvalRunConfig {
15
15
  concurrency?: number;
16
16
  /** Environment variables to pass to the eval process */
17
17
  env?: Record<string, string>;
18
+ /**
19
+ * Maximum wall-clock time for this eval subprocess in ms.
20
+ * When exceeded, the process is killed and partial results are used.
21
+ * Default: no limit (backward compatible).
22
+ */
23
+ maxDurationMs?: number;
18
24
  }
19
25
  export interface EvalRunner {
20
26
  /** Run an evaluation and return the step result */
@@ -4,6 +4,8 @@
4
4
  * Ports define the contracts between the domain kernel and the outside world.
5
5
  * Adapters (in packages/eval) implement these interfaces.
6
6
  */
7
+ export type { ArtifactCollector, ArtifactManifest, ArtifactManifestEntry, CaptureFlushResult, } from "./artifact-collector.js";
8
+ export type { ArtifactContentDiff, CaptureDiffReport, ComparisonMode, ComparisonOptions, InventoryDiff, JsonDiffEntry, MetadataComparison, ScoreComparison, SecurityScan, TimingComparison, } from "./capture-comparator.js";
7
9
  export type { CacheEntryMetadata, CacheKey, CacheLookupResult, CacheRecordInput, CacheStore, } from "./cache-store.js";
8
10
  export type { ConfigSource } from "./config-source.js";
9
11
  export type { AppContext, ReportSinkPort, ReportStorePort, ResolvedConfig, } from "./context.js";
@@ -38,6 +38,17 @@ export interface PipelineStep {
38
38
  * When defined, the StepRunner computes a hash and checks the cache.
39
39
  */
40
40
  cacheInputs?(ctx: AppContext): string[];
41
+ /**
42
+ * Cache context strings — non-file state that participates in cache key
43
+ * computation (e.g., mode, variant, area/task/tag filters).
44
+ *
45
+ * Without these, two runs with different CLI flags but identical config
46
+ * files would share a cache entry, causing cross-mode or cross-area
47
+ * contamination.
48
+ *
49
+ * When undefined, only file content determines the cache key.
50
+ */
51
+ cacheContext?(ctx: AppContext): string[];
41
52
  /**
42
53
  * Whether this step is optional — a failure in an optional step
43
54
  * does not stop the pipeline.
@@ -2,9 +2,9 @@
2
2
  * Port: Where task definitions come from.
3
3
  *
4
4
  * Adapters:
5
- * - YamlTaskSource (current) — reads tasks/*.yaml files
6
- * - ContentLakeTaskSource (tasks-as-content Phase 2) — GROQ query
7
- * - RepoTaskSource (tasks-as-content Phase 4) — reads .ailf/tasks/
5
+ * - ContentLakeTaskSource — GROQ query against Sanity Content Lake
6
+ * - RepoTaskSource — reads .ailf/tasks/*.task.ts files
7
+ * - TsTaskFileLoader — reads tasks/{mode}/*.task.ts (eval package)
8
8
  *
9
9
  * The key invariant: the pipeline orchestrator and all downstream steps
10
10
  * work with GeneralizedTaskDefinition[] regardless of where they came from.
@@ -2,9 +2,9 @@
2
2
  * Port: Where task definitions come from.
3
3
  *
4
4
  * Adapters:
5
- * - YamlTaskSource (current) — reads tasks/*.yaml files
6
- * - ContentLakeTaskSource (tasks-as-content Phase 2) — GROQ query
7
- * - RepoTaskSource (tasks-as-content Phase 4) — reads .ailf/tasks/
5
+ * - ContentLakeTaskSource — GROQ query against Sanity Content Lake
6
+ * - RepoTaskSource — reads .ailf/tasks/*.task.ts files
7
+ * - TsTaskFileLoader — reads tasks/{mode}/*.task.ts (eval package)
8
8
  *
9
9
  * The key invariant: the pipeline orchestrator and all downstream steps
10
10
  * work with GeneralizedTaskDefinition[] regardless of where they came from.
@@ -57,5 +57,6 @@ export declare const EvalConfigSchema: z.ZodObject<{
57
57
  source: z.ZodOptional<z.ZodString>;
58
58
  tasks: z.ZodOptional<z.ZodArray<z.ZodString>>;
59
59
  urls: z.ZodOptional<z.ZodArray<z.ZodString>>;
60
+ presets: z.ZodOptional<z.ZodArray<z.ZodString>>;
60
61
  }, z.core.$strict>;
61
62
  export type EvalConfig = z.infer<typeof EvalConfigSchema>;
@@ -81,5 +81,15 @@ export const EvalConfigSchema = z
81
81
  tasks: z.array(z.string()).optional(),
82
82
  /** Doc source URL overrides */
83
83
  urls: z.array(z.string().url()).optional(),
84
+ /**
85
+ * External presets to load — file paths or npm package names.
86
+ *
87
+ * Each entry is resolved as:
88
+ * - Relative path (./foo or ../foo): loaded from disk via jiti
89
+ * - Package name: resolved via Node require
90
+ *
91
+ * Presets are registered in order after built-in presets.
92
+ */
93
+ presets: z.array(z.string()).optional(),
84
94
  })
85
95
  .strict();
@@ -77,10 +77,16 @@ export declare const PipelineRequestSchema: z.ZodObject<{
77
77
  taskMode: z.ZodOptional<z.ZodEnum<{
78
78
  inline: "inline";
79
79
  "content-lake": "content-lake";
80
- yaml: "yaml";
81
80
  }>>;
82
81
  tasks: z.ZodOptional<z.ZodArray<z.ZodString>>;
83
82
  urls: z.ZodOptional<z.ZodArray<z.ZodString>>;
83
+ variant: z.ZodOptional<z.ZodEnum<{
84
+ baseline: "baseline";
85
+ agentic: "agentic";
86
+ observed: "observed";
87
+ full: "full";
88
+ }>>;
89
+ presets: z.ZodOptional<z.ZodArray<z.ZodString>>;
84
90
  }, z.core.$strip>;
85
91
  /** Inferred TypeScript type for a pipeline request payload. */
86
92
  export type PipelineRequest = z.infer<typeof PipelineRequestSchema>;
@@ -13,7 +13,7 @@
13
13
  * @see packages/eval/src/pipeline/map-request-to-config.ts — maps to ResolvedConfig
14
14
  */
15
15
  import { z } from "zod";
16
- import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
16
+ import { LITERACY_VARIANTS, RAW_EVAL_MODES } from "../../ailf-shared/index.js";
17
17
  // ---------------------------------------------------------------------------
18
18
  // Debug options — boolean shorthand or structured object
19
19
  // ---------------------------------------------------------------------------
@@ -86,7 +86,21 @@ export const PipelineRequestSchema = z.object({
86
86
  searchMode: z.enum(["off", "open", "origin-only"]).optional(),
87
87
  source: z.string().optional(),
88
88
  sourceReportId: z.string().optional(),
89
- taskMode: z.enum(["content-lake", "yaml", "inline"]).optional(),
89
+ taskMode: z.enum(["content-lake", "inline"]).optional(),
90
90
  tasks: z.array(z.string()).optional(),
91
91
  urls: z.array(z.string().url()).optional(),
92
+ /**
93
+ * Literacy variant — only meaningful when mode is "literacy".
94
+ *
95
+ * When provided with a canonical mode (`mode: "literacy"`), this field
96
+ * specifies the variant directly. When mode is a legacy alias (e.g.,
97
+ * `mode: "baseline"`), the variant is derived from the mode name and
98
+ * this field is ignored.
99
+ *
100
+ * Prefer explicit `mode: "literacy", variant: "baseline"` over the
101
+ * legacy `mode: "baseline"` form.
102
+ */
103
+ variant: z.enum(LITERACY_VARIANTS).optional(),
104
+ /** External preset file paths or npm package names to load */
105
+ presets: z.array(z.string()).optional(),
92
106
  });
@@ -79,7 +79,6 @@ export declare const FeatureSchema: z.ZodObject<{
79
79
  planned: "planned";
80
80
  "out-of-scope": "out-of-scope";
81
81
  }>;
82
- taskCount: z.ZodOptional<z.ZodNumber>;
83
82
  }, z.core.$strip>;
84
83
  /** Inferred TypeScript type for a product feature. */
85
84
  export type Feature = z.infer<typeof FeatureSchema>;
@@ -104,7 +103,6 @@ export declare const FeatureRegistrySchema: z.ZodObject<{
104
103
  planned: "planned";
105
104
  "out-of-scope": "out-of-scope";
106
105
  }>;
107
- taskCount: z.ZodOptional<z.ZodNumber>;
108
106
  }, z.core.$strip>>;
109
107
  }, z.core.$strip>;
110
108
  /** Inferred TypeScript type for the feature registry. */
@@ -108,7 +108,6 @@ export const FeatureSchema = z.object({
108
108
  priority: z.enum(["critical", "high", "medium", "low"]),
109
109
  sections: z.array(z.string().min(1)).min(1),
110
110
  status: z.enum(["covered", "uncovered", "planned", "out-of-scope"]),
111
- taskCount: z.number().int().min(0).optional(),
112
111
  });
113
112
  /**
114
113
  * Schema for the full config/features.yaml config file.
@@ -6,7 +6,9 @@
6
6
  * Extracted from packages/eval/src/lib/generate-configs.ts during
7
7
  * the Ports & Adapters migration (Phase 4e).
8
8
  */
9
+ import type { EvalMode } from "../../ailf-shared/index.d.ts";
9
10
  import type { ModelEntry } from "../types/index.js";
11
+ import type { ModeBase } from "../types/plugin-registry.js";
10
12
  /**
11
13
  * Extract the raw API model name from a Promptfoo provider ID.
12
14
  *
@@ -38,4 +40,17 @@ export declare function mergeConfig(defaults: Record<string, unknown>, modelConf
38
40
  *
39
41
  * Models without a `modes` field match all modes.
40
42
  */
41
- export declare function modelMatchesMode(model: ModelEntry, mode: string): boolean;
43
+ export declare function modelMatchesMode(model: ModelEntry, mode: EvalMode): boolean;
44
+ /**
45
+ * Resolve which variants a model participates in for a given mode.
46
+ *
47
+ * Resolution rules:
48
+ * - If the mode has no variants defined → returns `undefined` (no variant filtering)
49
+ * - If the model specifies variants for this mode → returns that whitelist
50
+ * - If the model omits variants for this mode → returns ALL mode variants (default)
51
+ *
52
+ * @param model - The model entry from models config
53
+ * @param modeBase - The mode base (contains variant definitions)
54
+ * @returns Array of variant IDs, or `undefined` if the mode has no variants
55
+ */
56
+ export declare function resolveModelVariants(model: ModelEntry, modeBase: ModeBase): string[] | undefined;
@@ -84,3 +84,24 @@ export function modelMatchesMode(model, mode) {
84
84
  }
85
85
  return model.modes.includes(mode);
86
86
  }
87
+ /**
88
+ * Resolve which variants a model participates in for a given mode.
89
+ *
90
+ * Resolution rules:
91
+ * - If the mode has no variants defined → returns `undefined` (no variant filtering)
92
+ * - If the model specifies variants for this mode → returns that whitelist
93
+ * - If the model omits variants for this mode → returns ALL mode variants (default)
94
+ *
95
+ * @param model - The model entry from models config
96
+ * @param modeBase - The mode base (contains variant definitions)
97
+ * @returns Array of variant IDs, or `undefined` if the mode has no variants
98
+ */
99
+ export function resolveModelVariants(model, modeBase) {
100
+ const modeVariants = modeBase.mode.variants;
101
+ if (!modeVariants || modeVariants.length === 0)
102
+ return undefined;
103
+ const allVariantIds = modeVariants.map((v) => v.id);
104
+ const modeId = modeBase.mode.id;
105
+ const explicit = model.variants?.[modeId];
106
+ return explicit ?? allVariantIds;
107
+ }
@@ -10,4 +10,4 @@
10
10
  export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
11
11
  export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
12
12
  export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, type AggregationStrategy, type AreaScore, type AssertionScore, type DimensionScore, type EnsembleGradingConfig, type GraderTransitionConfig, type TaskScore, type TaskScoreOptions, } from "./scoring-engine.js";
13
- export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "./config-helpers.js";
13
+ export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
@@ -10,4 +10,4 @@
10
10
  export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
11
11
  export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
12
12
  export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, } from "./scoring-engine.js";
13
- export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "./config-helpers.js";
13
+ export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
@@ -65,6 +65,15 @@ export function classifyRubric(component) {
65
65
  */
66
66
  export function detectFeatureArea(description) {
67
67
  const desc = description.toLowerCase();
68
+ if (desc.includes("portable text")) {
69
+ return "portable-text";
70
+ }
71
+ if (desc.includes("content lake")) {
72
+ return "content-lake";
73
+ }
74
+ if (desc.includes("image handling") || desc.includes("image asset")) {
75
+ return "image-handling";
76
+ }
68
77
  if (desc.includes("studio")) {
69
78
  return "studio-setup";
70
79
  }
@@ -8,7 +8,7 @@
8
8
  *
9
9
  * @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
10
10
  * @see docs/design-docs/architecture-overhaul/test-definition.md (authoring surfaces)
11
- * @see docs/exec-plans/architecture-overhaul/phase-0-foundation-types.md (task 0h)
11
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-0-foundation-types.md (task 0h)
12
12
  */
13
13
  /** Difficulty level for a task */
14
14
  export type TaskDifficulty = "basic" | "intermediate" | "advanced";
@@ -178,6 +178,17 @@ export interface MCPServerTaskDefinition extends TaskCommonFields {
178
178
  url?: string;
179
179
  /** Environment variables for the server process */
180
180
  env?: Record<string, string>;
181
+ /**
182
+ * HTTP headers for remote transports (sse / streamable-http).
183
+ * Merged on top of any auth-derived headers, so explicit values
184
+ * here take precedence over `auth`-generated headers.
185
+ *
186
+ * Values support `{{env.VAR}}` template syntax for secrets.
187
+ *
188
+ * @example
189
+ * headers: { Authorization: "Bearer {{env.SANITY_API_TOKEN}}" }
190
+ */
191
+ headers?: Record<string, string>;
181
192
  /** Startup timeout in milliseconds */
182
193
  startupTimeoutMs?: number;
183
194
  /**
@@ -226,6 +237,19 @@ export interface MCPServerTaskDefinition extends TaskCommonFields {
226
237
  content: string;
227
238
  }[];
228
239
  };
240
+ /**
241
+ * Override model list (provider IDs). When set, only these models are
242
+ * used instead of models filtered from the registry by `mcp-server` mode.
243
+ *
244
+ * @example ["anthropic:messages:claude-opus-4-6"]
245
+ */
246
+ models?: string[];
247
+ /**
248
+ * Maximum number of tool call rounds before forcing a final text response.
249
+ * Each round may include multiple parallel tool calls.
250
+ * @default 5
251
+ */
252
+ maxToolRounds?: number;
229
253
  }
230
254
  /**
231
255
  * Agent harness mode — evaluates autonomous agents in a sandbox.
@@ -8,6 +8,6 @@
8
8
  *
9
9
  * @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
10
10
  * @see docs/design-docs/architecture-overhaul/test-definition.md (authoring surfaces)
11
- * @see docs/exec-plans/architecture-overhaul/phase-0-foundation-types.md (task 0h)
11
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-0-foundation-types.md (task 0h)
12
12
  */
13
13
  export {};