@sanity/ailf 1.0.0 → 2.0.1

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (499)
  1. package/README.md +0 -1
  2. package/canonical/grader-references/README.md +2 -2
  3. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  4. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  5. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  6. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  7. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  8. package/config/features.ts +1 -1
  9. package/config/models.ts +29 -12
  10. package/config/sources.ts +1 -1
  11. package/config/thresholds.ts +1 -1
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  13. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  17. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  18. package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
  19. package/dist/_vendor/ailf-core/config-helpers.js +51 -2
  20. package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
  21. package/dist/_vendor/ailf-core/examples/index.js +213 -94
  22. package/dist/_vendor/ailf-core/index.d.ts +3 -2
  23. package/dist/_vendor/ailf-core/index.js +2 -1
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  25. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  27. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  28. package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
  29. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  30. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  31. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  32. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  33. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  34. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  35. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  40. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  41. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  42. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  43. package/dist/_vendor/ailf-core/services/index.js +1 -1
  44. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  47. package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  50. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  51. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  52. package/dist/adapters/api-client/remediation.js +2 -2
  53. package/dist/adapters/config-sources/file-config-adapter.js +7 -1
  54. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  55. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  56. package/dist/adapters/index.d.ts +0 -1
  57. package/dist/adapters/index.js +0 -1
  58. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  59. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  60. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  61. package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
  62. package/dist/adapters/task-sources/index.d.ts +3 -4
  63. package/dist/adapters/task-sources/index.js +3 -4
  64. package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
  65. package/dist/adapters/task-sources/repo-schemas.js +228 -20
  66. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  67. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  68. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  69. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  70. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  71. package/dist/adapters/task-sources/repo-validation.js +126 -5
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
  73. package/dist/adapters/task-sources/task-file-loader.js +21 -7
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/coverage-audit.js +3 -1
  95. package/dist/commands/explain-handler.d.ts +1 -1
  96. package/dist/commands/explain-handler.js +37 -8
  97. package/dist/commands/fetch-docs.js +1 -0
  98. package/dist/commands/generate-configs.d.ts +3 -3
  99. package/dist/commands/generate-configs.js +20 -8
  100. package/dist/commands/init.d.ts +5 -4
  101. package/dist/commands/init.js +190 -25
  102. package/dist/commands/pipeline-action.d.ts +7 -1
  103. package/dist/commands/pipeline-action.js +43 -19
  104. package/dist/commands/pipeline.d.ts +6 -1
  105. package/dist/commands/pipeline.js +7 -2
  106. package/dist/commands/pr-comment.js +1 -0
  107. package/dist/commands/publish.js +1 -0
  108. package/dist/commands/shared/help.js +2 -2
  109. package/dist/commands/update-quality-scores.d.ts +5 -0
  110. package/dist/commands/update-quality-scores.js +20 -0
  111. package/dist/commands/validate-tasks.d.ts +2 -2
  112. package/dist/commands/validate-tasks.js +26 -15
  113. package/dist/composition-root.d.ts +15 -4
  114. package/dist/composition-root.js +100 -55
  115. package/dist/config/features.ts +23 -0
  116. package/dist/config/models.ts +100 -0
  117. package/dist/config/prompts.ts +16 -0
  118. package/dist/config/rubrics.ts +225 -0
  119. package/dist/config/schedules.ts +47 -0
  120. package/dist/config/sinks.ts +37 -0
  121. package/dist/config/sources.ts +21 -0
  122. package/dist/config/thresholds.ts +61 -0
  123. package/dist/index.d.ts +41 -0
  124. package/dist/index.js +48 -0
  125. package/dist/lib/agent-behavior-report.d.ts +8 -0
  126. package/dist/lib/agent-behavior-report.js +185 -0
  127. package/dist/lib/baseline.d.ts +19 -0
  128. package/dist/lib/baseline.js +153 -0
  129. package/dist/lib/calculate-scores.d.ts +23 -0
  130. package/dist/lib/calculate-scores.js +42 -0
  131. package/dist/lib/compare.d.ts +18 -0
  132. package/dist/lib/compare.js +170 -0
  133. package/dist/lib/coverage-audit.d.ts +4 -0
  134. package/dist/lib/coverage-audit.js +42 -0
  135. package/dist/lib/discovery-report.d.ts +13 -0
  136. package/dist/lib/discovery-report.js +57 -0
  137. package/dist/lib/fetch-docs.d.ts +30 -0
  138. package/dist/lib/fetch-docs.js +171 -0
  139. package/dist/lib/generate-configs.d.ts +25 -0
  140. package/dist/lib/generate-configs.js +42 -0
  141. package/dist/lib/grader-api.d.ts +21 -0
  142. package/dist/lib/grader-api.js +34 -0
  143. package/dist/lib/grader-compare.d.ts +19 -0
  144. package/dist/lib/grader-compare.js +91 -0
  145. package/dist/lib/grader-consistency.d.ts +27 -0
  146. package/dist/lib/grader-consistency.js +79 -0
  147. package/dist/lib/grader-sensitivity.d.ts +19 -0
  148. package/dist/lib/grader-sensitivity.js +75 -0
  149. package/dist/lib/grader-validate.d.ts +19 -0
  150. package/dist/lib/grader-validate.js +78 -0
  151. package/dist/lib/measure-retrieval.d.ts +14 -0
  152. package/dist/lib/measure-retrieval.js +71 -0
  153. package/dist/lib/pr-comment.d.ts +16 -0
  154. package/dist/lib/pr-comment.js +28 -0
  155. package/dist/lib/readiness-report.d.ts +13 -0
  156. package/dist/lib/readiness-report.js +108 -0
  157. package/dist/lib/webhook-server.d.ts +11 -0
  158. package/dist/lib/webhook-server.js +24 -0
  159. package/dist/lib/weekly-digest.d.ts +24 -0
  160. package/dist/lib/weekly-digest.js +148 -0
  161. package/dist/orchestration/build-app-context.js +13 -0
  162. package/dist/orchestration/build-step-sequence.js +4 -2
  163. package/dist/orchestration/cache-context.d.ts +23 -0
  164. package/dist/orchestration/cache-context.js +43 -0
  165. package/dist/orchestration/env-bridge.d.ts +21 -0
  166. package/dist/orchestration/env-bridge.js +66 -0
  167. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  168. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  169. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  170. package/dist/orchestration/step-runner.js +5 -1
  171. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  172. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  173. package/dist/orchestration/steps/callback-step.js +10 -1
  174. package/dist/orchestration/steps/compare-step.js +6 -3
  175. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  176. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  177. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  178. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  179. package/dist/orchestration/steps/fetch-docs-step.js +32 -19
  180. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  181. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  182. package/dist/orchestration/steps/generate-configs-step.js +77 -26
  183. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  184. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  185. package/dist/orchestration/steps/publish-report-step.js +19 -0
  186. package/dist/orchestration/steps/readiness-step.js +8 -3
  187. package/dist/orchestration/steps/report-step.js +17 -4
  188. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  189. package/dist/orchestration/steps/run-eval-step.js +51 -31
  190. package/dist/pipeline/agent-behavior-report.js +6 -0
  191. package/dist/pipeline/attribution.d.ts +1 -1
  192. package/dist/pipeline/attribution.js +1 -1
  193. package/dist/pipeline/cache.js +29 -15
  194. package/dist/pipeline/calculate-scores.d.ts +2 -0
  195. package/dist/pipeline/calculate-scores.js +70 -33
  196. package/dist/pipeline/chronic-failures.d.ts +55 -0
  197. package/dist/pipeline/chronic-failures.js +110 -0
  198. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  199. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  200. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  201. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
  202. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  203. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  204. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  205. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  206. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  207. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  208. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  209. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  210. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  211. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  212. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  213. package/dist/pipeline/compiler/config-loader.js +42 -2
  214. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  215. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  216. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  217. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  218. package/dist/pipeline/compiler/index.d.ts +2 -5
  219. package/dist/pipeline/compiler/index.js +2 -5
  220. package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
  221. package/dist/pipeline/compiler/literacy-bridge.js +2 -2
  222. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  223. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  224. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  225. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  226. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  227. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  228. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
  229. package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
  230. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  231. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  232. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  233. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  234. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  235. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  236. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  237. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  238. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  239. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  240. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  241. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  242. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  245. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  246. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  247. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  248. package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
  249. package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
  250. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  251. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  252. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  253. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  254. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  255. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  256. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  257. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  258. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  259. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  260. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  261. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  262. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  263. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  264. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  265. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  266. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  267. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  268. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  269. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  270. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  271. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  272. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  273. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  274. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
  275. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  276. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  277. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  278. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  279. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  280. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  281. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  282. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  283. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  284. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
  285. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  286. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  287. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  288. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  289. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
  290. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
  291. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  292. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
  293. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  294. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
  295. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  296. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  297. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
  298. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
  299. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
  300. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  301. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  302. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  303. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  304. package/dist/pipeline/compiler/preset-loader.js +99 -0
  305. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
  306. package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
  307. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  308. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  309. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  310. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  311. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  312. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  313. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  314. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  315. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  316. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  317. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  318. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  319. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  320. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  321. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  322. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  323. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  324. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  325. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  326. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  327. package/dist/pipeline/compiler/task-bridge.js +92 -0
  328. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  329. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  330. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  331. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  332. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  333. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  334. package/dist/pipeline/coverage-audit.d.ts +1 -1
  335. package/dist/pipeline/coverage-audit.js +1 -1
  336. package/dist/pipeline/degradations.d.ts +1 -1
  337. package/dist/pipeline/degradations.js +1 -1
  338. package/dist/pipeline/expand-tasks.d.ts +2 -2
  339. package/dist/pipeline/expand-tasks.js +2 -2
  340. package/dist/pipeline/failure-modes.d.ts +1 -1
  341. package/dist/pipeline/failure-modes.js +13 -1
  342. package/dist/pipeline/gap-analysis.d.ts +1 -1
  343. package/dist/pipeline/gap-analysis.js +3 -1
  344. package/dist/pipeline/generate-configs.d.ts +2 -2
  345. package/dist/pipeline/generate-configs.js +16 -9
  346. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  347. package/dist/pipeline/grader-compare-runner.js +7 -1
  348. package/dist/pipeline/grader-comparison.d.ts +1 -1
  349. package/dist/pipeline/grader-comparison.js +1 -1
  350. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  351. package/dist/pipeline/grader-consistency-runner.js +7 -1
  352. package/dist/pipeline/grader-consistency.d.ts +1 -1
  353. package/dist/pipeline/grader-consistency.js +1 -1
  354. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  355. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  356. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  357. package/dist/pipeline/grader-sensitivity.js +1 -1
  358. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  359. package/dist/pipeline/grader-validate-runner.js +2 -2
  360. package/dist/pipeline/grader-validation.d.ts +1 -1
  361. package/dist/pipeline/grader-validation.js +1 -1
  362. package/dist/pipeline/map-request-to-config.js +16 -2
  363. package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
  364. package/dist/pipeline/mirror-repo-tasks.js +10 -10
  365. package/dist/pipeline/plan-format.d.ts +1 -1
  366. package/dist/pipeline/plan-format.js +1 -1
  367. package/dist/pipeline/plan.d.ts +1 -1
  368. package/dist/pipeline/plan.js +68 -30
  369. package/dist/pipeline/probe.d.ts +1 -1
  370. package/dist/pipeline/probe.js +1 -1
  371. package/dist/pipeline/readiness-report.d.ts +2 -2
  372. package/dist/pipeline/readiness-report.js +2 -2
  373. package/dist/pipeline/release-classification.d.ts +1 -1
  374. package/dist/pipeline/release-classification.js +1 -1
  375. package/dist/pipeline/release-report.d.ts +1 -1
  376. package/dist/pipeline/release-report.js +1 -1
  377. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  378. package/dist/pipeline/repo-eval-comment.js +1 -1
  379. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  380. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  381. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  382. package/dist/pipeline/resolve-mappings.js +44 -44
  383. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  384. package/dist/pipeline/retrieval-metrics.js +28 -20
  385. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  386. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  387. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  388. package/dist/pipeline/steps/compare-step.js +90 -0
  389. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  390. package/dist/pipeline/steps/eval-step.js +347 -0
  391. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  392. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  393. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  394. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  395. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  396. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  397. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  398. package/dist/pipeline/steps/publish-report-step.js +243 -0
  399. package/dist/pipeline/steps/report-step.d.ts +13 -0
  400. package/dist/pipeline/steps/report-step.js +56 -0
  401. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  402. package/dist/pipeline/steps/update-scores-step.js +42 -0
  403. package/dist/pipeline/targeted-loo.d.ts +1 -1
  404. package/dist/pipeline/targeted-loo.js +1 -1
  405. package/dist/pipeline/thresholds.d.ts +1 -1
  406. package/dist/pipeline/thresholds.js +1 -1
  407. package/dist/pipeline/validate.js +13 -0
  408. package/dist/report-store.d.ts +17 -0
  409. package/dist/report-store.js +24 -0
  410. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  411. package/dist/scripts/agent-behavior-report.js +315 -0
  412. package/dist/scripts/baseline.d.ts +43 -0
  413. package/dist/scripts/baseline.js +267 -0
  414. package/dist/scripts/calculate-scores.d.ts +166 -0
  415. package/dist/scripts/calculate-scores.js +1296 -0
  416. package/dist/scripts/compare.d.ts +22 -0
  417. package/dist/scripts/compare.js +334 -0
  418. package/dist/scripts/coverage-audit.d.ts +44 -0
  419. package/dist/scripts/coverage-audit.js +209 -0
  420. package/dist/scripts/debug-eval.d.ts +19 -0
  421. package/dist/scripts/debug-eval.js +73 -0
  422. package/dist/scripts/discovery-report.d.ts +58 -0
  423. package/dist/scripts/discovery-report.js +250 -0
  424. package/dist/scripts/fetch-docs.d.ts +35 -0
  425. package/dist/scripts/fetch-docs.js +472 -0
  426. package/dist/scripts/generate-configs.d.ts +66 -0
  427. package/dist/scripts/generate-configs.js +459 -0
  428. package/dist/scripts/grader-api.d.ts +27 -0
  429. package/dist/scripts/grader-api.js +206 -0
  430. package/dist/scripts/grader-compare.d.ts +22 -0
  431. package/dist/scripts/grader-compare.js +368 -0
  432. package/dist/scripts/grader-consistency.d.ts +20 -0
  433. package/dist/scripts/grader-consistency.js +313 -0
  434. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  435. package/dist/scripts/grader-sensitivity.js +354 -0
  436. package/dist/scripts/grader-validate.d.ts +19 -0
  437. package/dist/scripts/grader-validate.js +267 -0
  438. package/dist/scripts/measure-retrieval.d.ts +10 -0
  439. package/dist/scripts/measure-retrieval.js +145 -0
  440. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  441. package/dist/scripts/migrate-task-mode.js +1 -1
  442. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  443. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  444. package/dist/scripts/pipeline.d.ts +76 -0
  445. package/dist/scripts/pipeline.js +1031 -0
  446. package/dist/scripts/pr-comment.d.ts +10 -0
  447. package/dist/scripts/pr-comment.js +510 -0
  448. package/dist/scripts/readiness-report.d.ts +88 -0
  449. package/dist/scripts/readiness-report.js +342 -0
  450. package/dist/scripts/update-quality-scores.d.ts +15 -0
  451. package/dist/scripts/update-quality-scores.js +184 -0
  452. package/dist/scripts/validate-task-sources.d.ts +1 -1
  453. package/dist/scripts/validate-task-sources.js +1 -1
  454. package/dist/scripts/validate.d.ts +13 -0
  455. package/dist/scripts/validate.js +79 -0
  456. package/dist/scripts/webhook-server.d.ts +26 -0
  457. package/dist/scripts/webhook-server.js +147 -0
  458. package/dist/scripts/weekly-digest.d.ts +24 -0
  459. package/dist/scripts/weekly-digest.js +144 -0
  460. package/dist/sinks/format-slack.d.ts +64 -0
  461. package/dist/sinks/format-slack.js +306 -0
  462. package/dist/sinks/slack-sink.d.ts +27 -0
  463. package/dist/sinks/slack-sink.js +78 -0
  464. package/dist/sinks/types.d.ts +1 -1
  465. package/dist/sinks/types.js +1 -1
  466. package/dist/sinks/webhook-sink.d.ts +19 -0
  467. package/dist/sinks/webhook-sink.js +50 -0
  468. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  469. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  470. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  471. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  472. package/dist/tasks/literacy/functions.task.ts +70 -0
  473. package/dist/tasks/literacy/groq.task.ts +259 -0
  474. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  475. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  476. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  477. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  478. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  479. package/package.json +32 -24
  480. package/tasks/.expanded.agentic.yaml +280 -0
  481. package/tasks/.expanded.yaml +565 -0
  482. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  483. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  484. package/tasks/literacy/content-lake.task.ts +181 -0
  485. package/tasks/literacy/frameworks.task.ts +1 -0
  486. package/tasks/literacy/functions.task.ts +1 -0
  487. package/tasks/literacy/groq.task.ts +1 -0
  488. package/tasks/literacy/image-handling.task.ts +95 -0
  489. package/tasks/literacy/nextjs-live.task.ts +2 -1
  490. package/tasks/literacy/portable-text.task.ts +169 -0
  491. package/tasks/literacy/studio-setup.task.ts +5 -2
  492. package/tasks/literacy/visual-editing.task.ts +1 -0
  493. package/LICENSE +0 -21
  494. package/tasks/frameworks.yaml +0 -98
  495. package/tasks/functions.yaml +0 -51
  496. package/tasks/groq.yaml +0 -216
  497. package/tasks/nextjs-live.yaml +0 -62
  498. package/tasks/studio-setup.yaml +0 -111
  499. package/tasks/visual-editing.yaml +0 -120
@@ -1,7 +1,7 @@
1
1
  /**
2
- * validate-tasks command — standalone validation of repo-based task YAML files.
2
+ * validate-tasks command — standalone validation of task files.
3
3
  *
4
- * Validates .ailf/tasks/*.yaml files against the RepoTaskSchema without
4
+ * Validates .ailf/tasks/*.yaml files against the CanonicalTaskSchema without
5
5
  * running the full pipeline. Useful for pre-commit hooks and CI checks
6
6
  * in external repos.
7
7
  *
@@ -16,11 +16,11 @@ import { existsSync, readdirSync, readFileSync } from "fs";
16
16
  import { resolve, relative } from "path";
17
17
  import { Command } from "commander";
18
18
  import { load } from "js-yaml";
19
- import { parseRepoTaskFile } from "../adapters/task-sources/repo-schemas.js";
20
- import { validateRepoTasks, formatValidationResult, } from "../adapters/task-sources/repo-validation.js";
19
+ import { detectLegacyFieldNames, parseCanonicalTaskFile, } from "../adapters/task-sources/repo-schemas.js";
20
+ import { validateCanonicalTasks, formatValidationResult, } from "../adapters/task-sources/repo-validation.js";
21
21
  export function createValidateTasksCommand() {
22
22
  return new Command("validate-tasks")
23
- .description("Validate repo-based task YAML files (.ailf/tasks/) against the schema")
23
+ .description("Validate task YAML files (.ailf/tasks/) against the canonical schema")
24
24
  .argument("[path]", "Path to tasks directory (default: .ailf/tasks/)", ".ailf/tasks")
25
25
  .option("--strict", "Treat warnings as errors", false)
26
26
  .action(async (tasksPath, opts) => {
@@ -29,12 +29,12 @@ export function createValidateTasksCommand() {
29
29
  const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
30
30
  const resolvedPath = resolve(callerCwd, tasksPath);
31
31
  if (!existsSync(resolvedPath)) {
32
- console.error(`❌ Directory not found: ${resolvedPath}`);
32
+ console.error(`Directory not found: ${resolvedPath}`);
33
33
  process.exit(1);
34
34
  }
35
35
  const yamlFiles = readdirSync(resolvedPath).filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) && !f.startsWith("."));
36
36
  if (yamlFiles.length === 0) {
37
- console.error(`❌ No YAML files found in ${resolvedPath}`);
37
+ console.error(`No YAML files found in ${resolvedPath}`);
38
38
  process.exit(1);
39
39
  }
40
40
  console.log(`\nValidating ${yamlFiles.length} task file(s) in ${relative(process.cwd(), resolvedPath)}/\n`);
@@ -50,25 +50,36 @@ export function createValidateTasksCommand() {
50
50
  }
51
51
  catch (err) {
52
52
  const msg = err instanceof Error ? err.message : String(err);
53
- console.error(` ${file}: YAML parse error`);
53
+ console.error(` ${file}: YAML parse error`);
54
54
  console.error(` ${msg}\n`);
55
55
  hasErrors = true;
56
56
  continue;
57
57
  }
58
58
  if (!Array.isArray(parsed)) {
59
- console.error(` ${file}: Expected a YAML array of task definitions`);
59
+ console.error(` ${file}: Expected a YAML array of task definitions`);
60
+ hasErrors = true;
61
+ continue;
62
+ }
63
+ // Detect legacy field names before Zod validation
64
+ const legacyWarnings = detectLegacyFieldNames(parsed, file);
65
+ if (legacyWarnings.length > 0) {
66
+ console.error(` ${file}: Uses legacy field names`);
67
+ for (const w of legacyWarnings) {
68
+ console.error(` ${w}`);
69
+ }
70
+ console.error();
60
71
  hasErrors = true;
61
72
  continue;
62
73
  }
63
74
  try {
64
- const tasks = parseRepoTaskFile(parsed, file);
65
- console.log(` ${file}: ${tasks.length} task${tasks.length === 1 ? "" : "s"} valid`);
75
+ const tasks = parseCanonicalTaskFile(parsed, file);
76
+ console.log(` ${file}: ${tasks.length} task${tasks.length === 1 ? "" : "s"} valid`);
66
77
  totalTasks += tasks.length;
67
78
  allTasks.push(...tasks);
68
79
  }
69
80
  catch (err) {
70
81
  const msg = err instanceof Error ? err.message : String(err);
71
- console.error(` ${file}: Schema validation failed`);
82
+ console.error(` ${file}: Schema validation failed`);
72
83
  console.error(`${msg
73
84
  .split("\n")
74
85
  .map((l) => ` ${l}`)
@@ -79,7 +90,7 @@ export function createValidateTasksCommand() {
79
90
  // Run semantic validation on all parsed tasks
80
91
  if (allTasks.length > 0) {
81
92
  console.log(); // blank line
82
- const semanticResult = validateRepoTasks(allTasks);
93
+ const semanticResult = validateCanonicalTasks(allTasks);
83
94
  const formatted = formatValidationResult(semanticResult);
84
95
  console.log(formatted);
85
96
  if (!semanticResult.valid) {
@@ -87,10 +98,10 @@ export function createValidateTasksCommand() {
87
98
  }
88
99
  if (opts.strict && semanticResult.warnings.length > 0) {
89
100
  hasErrors = true;
90
- console.log("\n ⚠️ --strict mode: warnings treated as errors");
101
+ console.log("\n --strict mode: warnings treated as errors");
91
102
  }
92
103
  }
93
- console.log(`\n${hasErrors ? "" : ""} ${totalTasks} task${totalTasks === 1 ? "" : "s"} across ${yamlFiles.length} file${yamlFiles.length === 1 ? "" : "s"}\n`);
104
+ console.log(`\n${hasErrors ? "FAIL" : "OK"} ${totalTasks} task${totalTasks === 1 ? "" : "s"} across ${yamlFiles.length} file${yamlFiles.length === 1 ? "" : "s"}\n`);
94
105
  process.exit(hasErrors ? 1 : 0);
95
106
  });
96
107
  }
@@ -13,14 +13,25 @@
13
13
  * - After: one factory, one place to change adapter wiring
14
14
  *
15
15
  * @see packages/core/src/ports/context.ts — AppContext interface
16
- * @see docs/exec-plans/ports-and-adapters/phase-7-composition-root.md
16
+ * @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
17
17
  */
18
- import { type AppContext, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
18
+ import { type AppContext, type AssertionRegistration, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
19
19
  /**
20
20
  * Create a fully wired AppContext from resolved configuration.
21
21
  *
22
22
  * Every adapter is constructed here and nowhere else (outside of tests).
23
- * Swapping an adapter (e.g., YamlTaskSource → ContentLakeTaskSource)
24
- * is a one-line change in this function.
23
+ * Swapping an adapter is a one-line change in this function.
25
24
  */
26
25
  export declare function createAppContext(config: ResolvedConfig): AppContext;
26
+ /**
27
+ * Generic Promptfoo assertion types available to all evaluation modes.
28
+ *
29
+ * These are evaluation primitives (text matching, JSON validation, LLM grading)
30
+ * that aren't specific to any mode or domain. They're registered before any
31
+ * preset so every mode has access to them.
32
+ *
33
+ * `compatibleModes` is omitted — when undefined, the assertion is compatible
34
+ * with all modes. Mode-specific assertions can be registered by presets with
35
+ * explicit mode whitelists.
36
+ */
37
+ export declare const FRAMEWORK_ASSERTIONS: AssertionRegistration[];
@@ -13,14 +13,18 @@
13
13
  * - After: one factory, one place to change adapter wiring
14
14
  *
15
15
  * @see packages/core/src/ports/context.ts — AppContext interface
16
- * @see docs/exec-plans/ports-and-adapters/phase-7-composition-root.md
16
+ * @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
17
17
  */
18
- import { InMemoryPluginRegistry, } from "./_vendor/ailf-core/index.js";
18
+ import { join } from "node:path";
19
+ import { InMemoryPluginRegistry, NoOpArtifactCollector, } from "./_vendor/ailf-core/index.js";
20
+ import { FilesystemArtifactCollector } from "./artifact-capture/filesystem-collector.js";
19
21
  import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
22
+ import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
20
23
  import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
21
24
  import { PromptfooEvalAdapter } from "./adapters/eval-runners/promptfoo-eval-adapter.js";
22
25
  import { ConsoleLogger, JsonLogger, QuietLogger, } from "./adapters/loggers/index.js";
23
- import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource, YamlTaskSource, } from "./adapters/task-sources/index.js";
26
+ import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource, } from "./adapters/task-sources/index.js";
27
+ import { createAgentHarnessBase, createKnowledgeProbeBase, createLiteracyModeBase, createMcpServerModeBase, } from "./pipeline/compiler/mode-bases/index.js";
24
28
  import { createSanityLiteracyPreset } from "./pipeline/compiler/presets/index.js";
25
29
  import { getSanityClient } from "./sanity/client.js";
26
30
  import { ReportStore } from "./report-store.js";
@@ -29,8 +33,7 @@ import { loadSinks } from "./sinks/index.js";
29
33
  * Create a fully wired AppContext from resolved configuration.
30
34
  *
31
35
  * Every adapter is constructed here and nowhere else (outside of tests).
32
- * Swapping an adapter (e.g., YamlTaskSource → ContentLakeTaskSource)
33
- * is a one-line change in this function.
36
+ * Swapping an adapter is a one-line change in this function.
34
37
  */
35
38
  export function createAppContext(config) {
36
39
  // Logger — selected by env var preferences
@@ -39,10 +42,12 @@ export function createAppContext(config) {
39
42
  const cache = config.noCache ? undefined : createCache(config);
40
43
  // Task source — selected by config.taskSourceType
41
44
  const taskSource = createTaskSource(config);
42
- // Plugin registry — mode handlers, assertions, rubric templates, doc fetcher.
43
- // The Sanity preset is registered here with config.rootDir so its doc fetcher
44
- // factory resolves paths relative to the eval package root (not cwd).
45
- const registry = createRegistry(config.rootDir);
45
+ // Plugin registry — mode bases, assertions, presets, doc fetcher.
46
+ // External presets from config are loaded and registered after built-ins.
47
+ const externalPresets = config.presets && config.presets.length > 0
48
+ ? loadExternalPresets(config.presets, config.rootDir)
49
+ : undefined;
50
+ const registry = createRegistry(config.rootDir, externalPresets);
46
51
  // Doc fetcher — provided by the registered preset's factory
47
52
  const docFetcherFactory = registry.getDocFetcherFactory();
48
53
  const docFetcher = docFetcherFactory ? docFetcherFactory() : undefined;
@@ -52,8 +57,23 @@ export function createAppContext(config) {
52
57
  const reportStore = createReportStore(config);
53
58
  // Sinks — loaded from config/sinks
54
59
  const sinks = loadSinks();
60
+ // Artifact collector — no-op by default, filesystem when --capture is set
61
+ const collector = config.captureEnabled
62
+ ? new FilesystemArtifactCollector({
63
+ captureDir: config.captureDir ?? join(config.rootDir, "results", "captures"),
64
+ mode: config.mode,
65
+ compress: config.captureCompress ?? true,
66
+ extras: config.captureExtras ?? true,
67
+ pipeline: {
68
+ variant: config.variant,
69
+ source: config.source,
70
+ areas: config.areas,
71
+ },
72
+ })
73
+ : new NoOpArtifactCollector();
55
74
  return {
56
75
  cache,
76
+ collector,
57
77
  config,
58
78
  docFetcher,
59
79
  evalRunner,
@@ -101,15 +121,12 @@ function createTaskSource(config) {
101
121
  }
102
122
  return new RepoTaskSource(config.repoTasksPath);
103
123
  }
104
- // Primary source — selected by config.taskSourceType
105
- // Content Lake tasks may require the report token for access.
106
- const primary = config.taskSourceType === "yaml"
107
- ? new YamlTaskSource(config.rootDir)
108
- : new ContentLakeTaskSource(getSanityClient({
109
- token: process.env.AILF_REPORT_SANITY_API_TOKEN ??
110
- process.env.SANITY_API_TOKEN ??
111
- undefined,
112
- }));
124
+ // Primary source — Content Lake (the only non-repo source remaining)
125
+ const primary = new ContentLakeTaskSource(getSanityClient({
126
+ token: process.env.AILF_REPORT_SANITY_API_TOKEN ??
127
+ process.env.SANITY_API_TOKEN ??
128
+ undefined,
129
+ }));
113
130
  // If repo tasks path is set, combine primary + repo sources.
114
131
  // This is the "augment" mode — repo tasks extend the primary source.
115
132
  if (config.repoTasksPath) {
@@ -121,58 +138,86 @@ function createTaskSource(config) {
121
138
  return primary;
122
139
  }
123
140
  // ---------------------------------------------------------------------------
124
- // Built-in mode registrations for non-literacy modes
141
+ // Layer 0: Framework built-in assertions
125
142
  // ---------------------------------------------------------------------------
126
- const BUILT_IN_MODES = [
143
+ /**
144
+ * Generic Promptfoo assertion types available to all evaluation modes.
145
+ *
146
+ * These are evaluation primitives (text matching, JSON validation, LLM grading)
147
+ * that aren't specific to any mode or domain. They're registered before any
148
+ * preset so every mode has access to them.
149
+ *
150
+ * `compatibleModes` is omitted — when undefined, the assertion is compatible
151
+ * with all modes. Mode-specific assertions can be registered by presets with
152
+ * explicit mode whitelists.
153
+ */
154
+ export const FRAMEWORK_ASSERTIONS = [
155
+ {
156
+ type: "contains",
157
+ label: "Contains text",
158
+ handlerModule: "promptfoo:builtin",
159
+ },
160
+ {
161
+ type: "contains-all",
162
+ label: "Contains all texts",
163
+ handlerModule: "promptfoo:builtin",
164
+ },
165
+ {
166
+ type: "contains-any",
167
+ label: "Contains any text",
168
+ handlerModule: "promptfoo:builtin",
169
+ },
170
+ { type: "equals", label: "Exact match", handlerModule: "promptfoo:builtin" },
171
+ { type: "regex", label: "Regex match", handlerModule: "promptfoo:builtin" },
172
+ { type: "is-json", label: "Valid JSON", handlerModule: "promptfoo:builtin" },
127
173
  {
128
- id: "knowledge-probe",
129
- label: "Knowledge Probe",
130
- validProviderPatterns: ["^openai:", "^anthropic:", "^file://"],
131
- rubricTemplateIds: [],
132
- handlerModule: "./mode-handlers/knowledge-probe-handler.js",
174
+ type: "javascript",
175
+ label: "JavaScript assertion",
176
+ handlerModule: "promptfoo:builtin",
133
177
  },
134
178
  {
135
- id: "mcp-server",
136
- label: "MCP Server Testing",
137
- validProviderPatterns: ["^mcp:", "^file://"],
138
- rubricTemplateIds: [
139
- "mcp-input-validation",
140
- "mcp-output-correctness",
141
- "mcp-error-handling",
142
- ],
143
- handlerModule: "./mode-handlers/mcp-server-handler.js",
179
+ type: "llm-rubric",
180
+ label: "LLM-graded rubric",
181
+ handlerModule: "promptfoo:builtin",
144
182
  },
145
183
  {
146
- id: "agent-harness",
147
- label: "Agent Harness",
148
- validProviderPatterns: ["^openai:", "^anthropic:", "^file://"],
149
- rubricTemplateIds: [],
150
- handlerModule: "./mode-handlers/agent-harness-handler.js",
184
+ type: "similar",
185
+ label: "Semantic similarity",
186
+ handlerModule: "promptfoo:builtin",
151
187
  },
152
188
  ];
153
189
  /**
154
190
  * Build and populate the plugin registry.
155
191
  *
156
- * Preset registration flow:
157
- * 1. A preset is a PresetDefinition — a bundle of modes, assertions, rubric
158
- * templates, prompt templates, scoring profiles, a doc fetcher factory,
159
- * source definitions, and feature definitions.
160
- * 2. registerPreset() iterates the preset's fields and delegates each one to
161
- * the appropriate register method (registerMode, registerRubricTemplate, …).
162
- * 3. After registration the rest of createAppContext() can pull capabilities
163
- * from the registry (e.g. getDocFetcherFactory()) without knowing which
164
- * preset provided them.
192
+ * Registration follows the five-layer model:
193
+ *
194
+ * Layer 0: Framework built-in assertions (generic Promptfoo builtins)
195
+ * Layer 0.5: Mode bases (shared evaluation methodology per mode)
196
+ * Layer 1: Domain presets (domain-specific config targeting a mode base)
165
197
  *
166
- * To add a new preset: create a PresetDefinition, then call
167
- * registry.registerPreset() here before the built-in mode registrations.
198
+ * Mode bases define HOW you evaluate (rubrics, scoring, prompts).
199
+ * Domain presets define WHAT you evaluate (sources, features, doc fetcher)
200
+ * and target a mode base by ID. When a preset is registered, it inherits
201
+ * its mode base's defaults and can optionally override them.
168
202
  */
169
- function createRegistry(rootDir) {
203
+ function createRegistry(rootDir, externalPresets) {
170
204
  const registry = new InMemoryPluginRegistry();
171
- // Register the sanity-literacy preset — the Sanity-specific evaluation bundle.
205
+ // Layer 0: Framework built-in assertions (available to all modes)
206
+ for (const assertion of FRAMEWORK_ASSERTIONS) {
207
+ registry.registerAssertion(assertion);
208
+ }
209
+ // Layer 0.5: Mode bases (evaluation methodology)
210
+ registry.registerModeBase(createLiteracyModeBase());
211
+ registry.registerModeBase(createMcpServerModeBase());
212
+ registry.registerModeBase(createKnowledgeProbeBase());
213
+ registry.registerModeBase(createAgentHarnessBase());
214
+ // Layer 1: Built-in domain presets
172
215
  registry.registerPreset(createSanityLiteracyPreset({ rootDir }));
173
- // Register other built-in modes (not part of any preset yet)
174
- for (const mode of BUILT_IN_MODES) {
175
- registry.registerMode(mode);
216
+ // Layer 1+: External domain presets (from config.presets)
217
+ if (externalPresets) {
218
+ for (const preset of externalPresets) {
219
+ registry.registerPreset(preset);
220
+ }
176
221
  }
177
222
  return registry;
178
223
  }
@@ -0,0 +1,23 @@
1
+ /**
2
+ * features.ts — Product feature registry for documentation coverage auditing.
3
+ *
4
+ * Default features are provided by the sanity-literacy preset registered
5
+ * in the composition root. This file exists as an override point — any
6
+ * features defined here take precedence over preset-provided features
7
+ * during coverage auditing.
8
+ *
9
+ * To track custom features, define them here:
10
+ *
11
+ * export default defineFeatures({
12
+ * features: [
13
+ * { id: "my-feature", name: "My Feature", sections: ["api"], ... },
14
+ * ],
15
+ * })
16
+ *
17
+ * @see packages/eval/src/pipeline/compiler/presets/sanity-literacy.ts
18
+ * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
19
+ */
20
+
21
+ import { defineFeatures } from "../_vendor/ailf-core/index.js"
22
+
23
+ export default defineFeatures({ features: [] })
@@ -0,0 +1,100 @@
1
+ /**
2
+ * models.ts — Central model registry for AILF evaluations.
3
+ *
4
+ * Define all models to test here. Each model declares which evaluation
5
+ * modes it participates in (e.g., "literacy", "mcp-server") and
6
+ * optionally which variants within those modes.
7
+ *
8
+ * When a model enrolls in a mode without specifying variants, all
9
+ * variants defined by the mode base are included by default.
10
+ *
11
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-1-ts-config-loading.md
12
+ */
13
+
14
+ import { defineModels } from "../_vendor/ailf-core/index.js"
15
+
16
+ export default defineModels({
17
+ models: [
18
+ // ── Anthropic ──────────────────────────────────────────────
19
+ {
20
+ id: "anthropic:messages:claude-opus-4-6",
21
+ label: "Claude Opus 4.6",
22
+ config: { temperature: 0.2, max_tokens: 4096 },
23
+ modes: ["literacy", "mcp-server", "knowledge-probe"],
24
+ // All literacy variants included by default (baseline, observed,
25
+ // agentic-naive, agentic-optimized)
26
+ },
27
+
28
+ // ── Google ─────────────────────────────────────────────────
29
+ // {
30
+ // id: "google:gemini-2.5-pro",
31
+ // label: "Gemini 2.5 Pro",
32
+ // config: { temperature: 0.2, max_tokens: 4096 },
33
+ // modes: ["literacy"],
34
+ // },
35
+
36
+ // ── OpenAI ─────────────────────────────────────────────────
37
+ {
38
+ id: "openai:chat:gpt-5.2",
39
+ label: "GPT 5.2",
40
+ config: { temperature: 0.2, max_tokens: 4096 },
41
+ modes: ["literacy", "knowledge-probe"],
42
+ // All literacy variants included by default
43
+ },
44
+ {
45
+ id: "openai:responses:gpt-5.4",
46
+ label: "GPT 5.4",
47
+ config: {
48
+ reasoning_effort: "medium",
49
+ max_output_tokens: 4096,
50
+ maxRetries: 1,
51
+ },
52
+ timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
53
+ modes: ["literacy", "mcp-server", "knowledge-probe"],
54
+ // All literacy variants included by default
55
+ },
56
+
57
+ // ── Disabled models (uncomment to enable) ──────────────────
58
+ // { id: "anthropic:claude-sonnet-4-20250514", label: "Claude Sonnet 4",
59
+ // config: { temperature: 0.2, max_tokens: 4096 },
60
+ // modes: ["literacy"],
61
+ // variants: { literacy: ["baseline"] } },
62
+ // { id: "anthropic:claude-3.5-sonnet-20241022", label: "Claude 3.5 Sonnet",
63
+ // config: { temperature: 0.2, max_tokens: 4096 },
64
+ // modes: ["literacy"],
65
+ // variants: { literacy: ["baseline", "agentic-naive", "agentic-optimized"] } },
66
+ // { id: "google:gemini-2.0-flash", label: "Gemini 2.0 Flash",
67
+ // config: { temperature: 0.2, max_tokens: 4096 },
68
+ // modes: ["literacy"],
69
+ // variants: { literacy: ["baseline"] } },
70
+ // { id: "openrouter:deepseek/deepseek-r1", label: "DeepSeek R1",
71
+ // config: { temperature: 0.2, max_tokens: 4096 },
72
+ // modes: ["literacy"],
73
+ // variants: { literacy: ["baseline"] } },
74
+ ],
75
+
76
+ // ── Grading Model ──────────────────────────────────────────
77
+ // Which model scores the responses. Separate from the models being tested.
78
+ grader: {
79
+ id: "anthropic:messages:claude-opus-4-5-20251101",
80
+ label: "Claude Opus 4.5 (grader)",
81
+ },
82
+
83
+ // ── Evaluation Options ─────────────────────────────────────
84
+ evalBudgetMs: 1_200_000, // 20 min per eval mode — outer kill switch
85
+ maxConcurrency: 32, // max parallel API calls — benchmarked in DOC-1896
86
+
87
+ // ── Default Config ─────────────────────────────────────────
88
+ // Applied to all models unless overridden per-model.
89
+ defaults: {
90
+ temperature: 0.2,
91
+ max_tokens: 4096,
92
+ maxToolRounds: 5, // for agentic modes
93
+ observerOptions: {
94
+ maxPreviewBytes: 2048,
95
+ captureResponsePreview: true,
96
+ includePatterns: ["sanity.io", "sanity.dev", "cdn.sanity.io"],
97
+ sensitiveHeaders: ["authorization", "cookie", "x-api-key"],
98
+ },
99
+ },
100
+ })
@@ -0,0 +1,16 @@
1
+ /**
2
+ * prompts.ts — User-override prompt templates.
3
+ *
4
+ * Canonical literacy prompt templates now live in the literacy mode handler:
5
+ * src/pipeline/compiler/mode-handlers/literacy-handler.ts
6
+ *
7
+ * Each mode handler owns its own prompts via getPrompts(). This file exists
8
+ * for user-level overrides only. Add entries here to replace handler-owned
9
+ * defaults for specific prompt IDs.
10
+ *
11
+ * @see packages/eval/src/pipeline/compiler/mode-handlers/literacy-handler.ts
12
+ */
13
+
14
+ import { definePrompts } from "../_vendor/ailf-core/index.js"
15
+
16
+ export default definePrompts([])