@sanity/ailf 1.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (499) hide show
  1. package/README.md +0 -1
  2. package/canonical/grader-references/README.md +2 -2
  3. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  4. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  5. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  6. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  7. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  8. package/config/features.ts +1 -1
  9. package/config/models.ts +29 -12
  10. package/config/sources.ts +1 -1
  11. package/config/thresholds.ts +1 -1
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  13. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  17. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  18. package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
  19. package/dist/_vendor/ailf-core/config-helpers.js +51 -2
  20. package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
  21. package/dist/_vendor/ailf-core/examples/index.js +213 -94
  22. package/dist/_vendor/ailf-core/index.d.ts +3 -2
  23. package/dist/_vendor/ailf-core/index.js +2 -1
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  25. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  27. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  28. package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
  29. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  30. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  31. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  32. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  33. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  34. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  35. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  40. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  41. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  42. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  43. package/dist/_vendor/ailf-core/services/index.js +1 -1
  44. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  47. package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  50. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  51. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  52. package/dist/adapters/api-client/remediation.js +2 -2
  53. package/dist/adapters/config-sources/file-config-adapter.js +7 -1
  54. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  55. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  56. package/dist/adapters/index.d.ts +0 -1
  57. package/dist/adapters/index.js +0 -1
  58. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  59. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  60. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  61. package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
  62. package/dist/adapters/task-sources/index.d.ts +3 -4
  63. package/dist/adapters/task-sources/index.js +3 -4
  64. package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
  65. package/dist/adapters/task-sources/repo-schemas.js +228 -20
  66. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  67. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  68. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  69. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  70. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  71. package/dist/adapters/task-sources/repo-validation.js +126 -5
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
  73. package/dist/adapters/task-sources/task-file-loader.js +21 -7
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/coverage-audit.js +3 -1
  95. package/dist/commands/explain-handler.d.ts +1 -1
  96. package/dist/commands/explain-handler.js +37 -8
  97. package/dist/commands/fetch-docs.js +1 -0
  98. package/dist/commands/generate-configs.d.ts +3 -3
  99. package/dist/commands/generate-configs.js +20 -8
  100. package/dist/commands/init.d.ts +5 -4
  101. package/dist/commands/init.js +190 -25
  102. package/dist/commands/pipeline-action.d.ts +7 -1
  103. package/dist/commands/pipeline-action.js +43 -19
  104. package/dist/commands/pipeline.d.ts +6 -1
  105. package/dist/commands/pipeline.js +7 -2
  106. package/dist/commands/pr-comment.js +1 -0
  107. package/dist/commands/publish.js +1 -0
  108. package/dist/commands/shared/help.js +2 -2
  109. package/dist/commands/update-quality-scores.d.ts +5 -0
  110. package/dist/commands/update-quality-scores.js +20 -0
  111. package/dist/commands/validate-tasks.d.ts +2 -2
  112. package/dist/commands/validate-tasks.js +26 -15
  113. package/dist/composition-root.d.ts +15 -4
  114. package/dist/composition-root.js +100 -55
  115. package/dist/config/features.ts +23 -0
  116. package/dist/config/models.ts +100 -0
  117. package/dist/config/prompts.ts +16 -0
  118. package/dist/config/rubrics.ts +225 -0
  119. package/dist/config/schedules.ts +47 -0
  120. package/dist/config/sinks.ts +37 -0
  121. package/dist/config/sources.ts +21 -0
  122. package/dist/config/thresholds.ts +61 -0
  123. package/dist/index.d.ts +41 -0
  124. package/dist/index.js +48 -0
  125. package/dist/lib/agent-behavior-report.d.ts +8 -0
  126. package/dist/lib/agent-behavior-report.js +185 -0
  127. package/dist/lib/baseline.d.ts +19 -0
  128. package/dist/lib/baseline.js +153 -0
  129. package/dist/lib/calculate-scores.d.ts +23 -0
  130. package/dist/lib/calculate-scores.js +42 -0
  131. package/dist/lib/compare.d.ts +18 -0
  132. package/dist/lib/compare.js +170 -0
  133. package/dist/lib/coverage-audit.d.ts +4 -0
  134. package/dist/lib/coverage-audit.js +42 -0
  135. package/dist/lib/discovery-report.d.ts +13 -0
  136. package/dist/lib/discovery-report.js +57 -0
  137. package/dist/lib/fetch-docs.d.ts +30 -0
  138. package/dist/lib/fetch-docs.js +171 -0
  139. package/dist/lib/generate-configs.d.ts +25 -0
  140. package/dist/lib/generate-configs.js +42 -0
  141. package/dist/lib/grader-api.d.ts +21 -0
  142. package/dist/lib/grader-api.js +34 -0
  143. package/dist/lib/grader-compare.d.ts +19 -0
  144. package/dist/lib/grader-compare.js +91 -0
  145. package/dist/lib/grader-consistency.d.ts +27 -0
  146. package/dist/lib/grader-consistency.js +79 -0
  147. package/dist/lib/grader-sensitivity.d.ts +19 -0
  148. package/dist/lib/grader-sensitivity.js +75 -0
  149. package/dist/lib/grader-validate.d.ts +19 -0
  150. package/dist/lib/grader-validate.js +78 -0
  151. package/dist/lib/measure-retrieval.d.ts +14 -0
  152. package/dist/lib/measure-retrieval.js +71 -0
  153. package/dist/lib/pr-comment.d.ts +16 -0
  154. package/dist/lib/pr-comment.js +28 -0
  155. package/dist/lib/readiness-report.d.ts +13 -0
  156. package/dist/lib/readiness-report.js +108 -0
  157. package/dist/lib/webhook-server.d.ts +11 -0
  158. package/dist/lib/webhook-server.js +24 -0
  159. package/dist/lib/weekly-digest.d.ts +24 -0
  160. package/dist/lib/weekly-digest.js +148 -0
  161. package/dist/orchestration/build-app-context.js +13 -0
  162. package/dist/orchestration/build-step-sequence.js +4 -2
  163. package/dist/orchestration/cache-context.d.ts +23 -0
  164. package/dist/orchestration/cache-context.js +43 -0
  165. package/dist/orchestration/env-bridge.d.ts +21 -0
  166. package/dist/orchestration/env-bridge.js +66 -0
  167. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  168. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  169. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  170. package/dist/orchestration/step-runner.js +5 -1
  171. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  172. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  173. package/dist/orchestration/steps/callback-step.js +10 -1
  174. package/dist/orchestration/steps/compare-step.js +6 -3
  175. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  176. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  177. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  178. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  179. package/dist/orchestration/steps/fetch-docs-step.js +32 -19
  180. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  181. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  182. package/dist/orchestration/steps/generate-configs-step.js +77 -26
  183. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  184. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  185. package/dist/orchestration/steps/publish-report-step.js +19 -0
  186. package/dist/orchestration/steps/readiness-step.js +8 -3
  187. package/dist/orchestration/steps/report-step.js +17 -4
  188. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  189. package/dist/orchestration/steps/run-eval-step.js +51 -31
  190. package/dist/pipeline/agent-behavior-report.js +6 -0
  191. package/dist/pipeline/attribution.d.ts +1 -1
  192. package/dist/pipeline/attribution.js +1 -1
  193. package/dist/pipeline/cache.js +29 -15
  194. package/dist/pipeline/calculate-scores.d.ts +2 -0
  195. package/dist/pipeline/calculate-scores.js +70 -33
  196. package/dist/pipeline/chronic-failures.d.ts +55 -0
  197. package/dist/pipeline/chronic-failures.js +110 -0
  198. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  199. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  200. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  201. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
  202. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  203. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  204. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  205. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  206. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  207. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  208. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  209. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  210. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  211. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  212. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  213. package/dist/pipeline/compiler/config-loader.js +42 -2
  214. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  215. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  216. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  217. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  218. package/dist/pipeline/compiler/index.d.ts +2 -5
  219. package/dist/pipeline/compiler/index.js +2 -5
  220. package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
  221. package/dist/pipeline/compiler/literacy-bridge.js +2 -2
  222. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  223. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  224. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  225. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  226. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  227. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  228. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
  229. package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
  230. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  231. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  232. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  233. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  234. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  235. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  236. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  237. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  238. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  239. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  240. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  241. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  242. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  245. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  246. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  247. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  248. package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
  249. package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
  250. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  251. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  252. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  253. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  254. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  255. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  256. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  257. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  258. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  259. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  260. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  261. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  262. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  263. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  264. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  265. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  266. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  267. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  268. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  269. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  270. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  271. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  272. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  273. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  274. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
  275. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  276. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  277. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  278. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  279. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  280. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  281. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  282. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  283. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  284. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
  285. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  286. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  287. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  288. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  289. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
  290. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
  291. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  292. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
  293. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  294. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
  295. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  296. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  297. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
  298. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
  299. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
  300. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  301. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  302. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  303. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  304. package/dist/pipeline/compiler/preset-loader.js +99 -0
  305. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
  306. package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
  307. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  308. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  309. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  310. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  311. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  312. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  313. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  314. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  315. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  316. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  317. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  318. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  319. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  320. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  321. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  322. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  323. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  324. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  325. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  326. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  327. package/dist/pipeline/compiler/task-bridge.js +92 -0
  328. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  329. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  330. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  331. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  332. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  333. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  334. package/dist/pipeline/coverage-audit.d.ts +1 -1
  335. package/dist/pipeline/coverage-audit.js +1 -1
  336. package/dist/pipeline/degradations.d.ts +1 -1
  337. package/dist/pipeline/degradations.js +1 -1
  338. package/dist/pipeline/expand-tasks.d.ts +2 -2
  339. package/dist/pipeline/expand-tasks.js +2 -2
  340. package/dist/pipeline/failure-modes.d.ts +1 -1
  341. package/dist/pipeline/failure-modes.js +13 -1
  342. package/dist/pipeline/gap-analysis.d.ts +1 -1
  343. package/dist/pipeline/gap-analysis.js +3 -1
  344. package/dist/pipeline/generate-configs.d.ts +2 -2
  345. package/dist/pipeline/generate-configs.js +16 -9
  346. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  347. package/dist/pipeline/grader-compare-runner.js +7 -1
  348. package/dist/pipeline/grader-comparison.d.ts +1 -1
  349. package/dist/pipeline/grader-comparison.js +1 -1
  350. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  351. package/dist/pipeline/grader-consistency-runner.js +7 -1
  352. package/dist/pipeline/grader-consistency.d.ts +1 -1
  353. package/dist/pipeline/grader-consistency.js +1 -1
  354. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  355. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  356. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  357. package/dist/pipeline/grader-sensitivity.js +1 -1
  358. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  359. package/dist/pipeline/grader-validate-runner.js +2 -2
  360. package/dist/pipeline/grader-validation.d.ts +1 -1
  361. package/dist/pipeline/grader-validation.js +1 -1
  362. package/dist/pipeline/map-request-to-config.js +16 -2
  363. package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
  364. package/dist/pipeline/mirror-repo-tasks.js +10 -10
  365. package/dist/pipeline/plan-format.d.ts +1 -1
  366. package/dist/pipeline/plan-format.js +1 -1
  367. package/dist/pipeline/plan.d.ts +1 -1
  368. package/dist/pipeline/plan.js +68 -30
  369. package/dist/pipeline/probe.d.ts +1 -1
  370. package/dist/pipeline/probe.js +1 -1
  371. package/dist/pipeline/readiness-report.d.ts +2 -2
  372. package/dist/pipeline/readiness-report.js +2 -2
  373. package/dist/pipeline/release-classification.d.ts +1 -1
  374. package/dist/pipeline/release-classification.js +1 -1
  375. package/dist/pipeline/release-report.d.ts +1 -1
  376. package/dist/pipeline/release-report.js +1 -1
  377. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  378. package/dist/pipeline/repo-eval-comment.js +1 -1
  379. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  380. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  381. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  382. package/dist/pipeline/resolve-mappings.js +44 -44
  383. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  384. package/dist/pipeline/retrieval-metrics.js +28 -20
  385. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  386. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  387. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  388. package/dist/pipeline/steps/compare-step.js +90 -0
  389. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  390. package/dist/pipeline/steps/eval-step.js +347 -0
  391. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  392. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  393. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  394. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  395. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  396. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  397. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  398. package/dist/pipeline/steps/publish-report-step.js +243 -0
  399. package/dist/pipeline/steps/report-step.d.ts +13 -0
  400. package/dist/pipeline/steps/report-step.js +56 -0
  401. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  402. package/dist/pipeline/steps/update-scores-step.js +42 -0
  403. package/dist/pipeline/targeted-loo.d.ts +1 -1
  404. package/dist/pipeline/targeted-loo.js +1 -1
  405. package/dist/pipeline/thresholds.d.ts +1 -1
  406. package/dist/pipeline/thresholds.js +1 -1
  407. package/dist/pipeline/validate.js +13 -0
  408. package/dist/report-store.d.ts +17 -0
  409. package/dist/report-store.js +24 -0
  410. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  411. package/dist/scripts/agent-behavior-report.js +315 -0
  412. package/dist/scripts/baseline.d.ts +43 -0
  413. package/dist/scripts/baseline.js +267 -0
  414. package/dist/scripts/calculate-scores.d.ts +166 -0
  415. package/dist/scripts/calculate-scores.js +1296 -0
  416. package/dist/scripts/compare.d.ts +22 -0
  417. package/dist/scripts/compare.js +334 -0
  418. package/dist/scripts/coverage-audit.d.ts +44 -0
  419. package/dist/scripts/coverage-audit.js +209 -0
  420. package/dist/scripts/debug-eval.d.ts +19 -0
  421. package/dist/scripts/debug-eval.js +73 -0
  422. package/dist/scripts/discovery-report.d.ts +58 -0
  423. package/dist/scripts/discovery-report.js +250 -0
  424. package/dist/scripts/fetch-docs.d.ts +35 -0
  425. package/dist/scripts/fetch-docs.js +472 -0
  426. package/dist/scripts/generate-configs.d.ts +66 -0
  427. package/dist/scripts/generate-configs.js +459 -0
  428. package/dist/scripts/grader-api.d.ts +27 -0
  429. package/dist/scripts/grader-api.js +206 -0
  430. package/dist/scripts/grader-compare.d.ts +22 -0
  431. package/dist/scripts/grader-compare.js +368 -0
  432. package/dist/scripts/grader-consistency.d.ts +20 -0
  433. package/dist/scripts/grader-consistency.js +313 -0
  434. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  435. package/dist/scripts/grader-sensitivity.js +354 -0
  436. package/dist/scripts/grader-validate.d.ts +19 -0
  437. package/dist/scripts/grader-validate.js +267 -0
  438. package/dist/scripts/measure-retrieval.d.ts +10 -0
  439. package/dist/scripts/measure-retrieval.js +145 -0
  440. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  441. package/dist/scripts/migrate-task-mode.js +1 -1
  442. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  443. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  444. package/dist/scripts/pipeline.d.ts +76 -0
  445. package/dist/scripts/pipeline.js +1031 -0
  446. package/dist/scripts/pr-comment.d.ts +10 -0
  447. package/dist/scripts/pr-comment.js +510 -0
  448. package/dist/scripts/readiness-report.d.ts +88 -0
  449. package/dist/scripts/readiness-report.js +342 -0
  450. package/dist/scripts/update-quality-scores.d.ts +15 -0
  451. package/dist/scripts/update-quality-scores.js +184 -0
  452. package/dist/scripts/validate-task-sources.d.ts +1 -1
  453. package/dist/scripts/validate-task-sources.js +1 -1
  454. package/dist/scripts/validate.d.ts +13 -0
  455. package/dist/scripts/validate.js +79 -0
  456. package/dist/scripts/webhook-server.d.ts +26 -0
  457. package/dist/scripts/webhook-server.js +147 -0
  458. package/dist/scripts/weekly-digest.d.ts +24 -0
  459. package/dist/scripts/weekly-digest.js +144 -0
  460. package/dist/sinks/format-slack.d.ts +64 -0
  461. package/dist/sinks/format-slack.js +306 -0
  462. package/dist/sinks/slack-sink.d.ts +27 -0
  463. package/dist/sinks/slack-sink.js +78 -0
  464. package/dist/sinks/types.d.ts +1 -1
  465. package/dist/sinks/types.js +1 -1
  466. package/dist/sinks/webhook-sink.d.ts +19 -0
  467. package/dist/sinks/webhook-sink.js +50 -0
  468. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  469. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  470. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  471. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  472. package/dist/tasks/literacy/functions.task.ts +70 -0
  473. package/dist/tasks/literacy/groq.task.ts +259 -0
  474. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  475. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  476. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  477. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  478. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  479. package/package.json +32 -24
  480. package/tasks/.expanded.agentic.yaml +280 -0
  481. package/tasks/.expanded.yaml +565 -0
  482. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  483. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  484. package/tasks/literacy/content-lake.task.ts +181 -0
  485. package/tasks/literacy/frameworks.task.ts +1 -0
  486. package/tasks/literacy/functions.task.ts +1 -0
  487. package/tasks/literacy/groq.task.ts +1 -0
  488. package/tasks/literacy/image-handling.task.ts +95 -0
  489. package/tasks/literacy/nextjs-live.task.ts +2 -1
  490. package/tasks/literacy/portable-text.task.ts +169 -0
  491. package/tasks/literacy/studio-setup.task.ts +5 -2
  492. package/tasks/literacy/visual-editing.task.ts +1 -0
  493. package/LICENSE +0 -21
  494. package/tasks/frameworks.yaml +0 -98
  495. package/tasks/functions.yaml +0 -51
  496. package/tasks/groq.yaml +0 -216
  497. package/tasks/nextjs-live.yaml +0 -62
  498. package/tasks/studio-setup.yaml +0 -111
  499. package/tasks/visual-editing.yaml +0 -120
@@ -9,12 +9,12 @@
9
9
  * Ports & Adapters migration (Phase 0c). The original file is now a
10
10
  * re-export barrel that preserves backward compatibility.
11
11
  */
12
- import type { DocumentRef as _DocumentRef, EvalMode as _EvalMode } from "../../ailf-shared/index.d.ts";
12
+ import type { DocumentRef as _DocumentRef, EvalMode } from "../../ailf-shared/index.d.ts";
13
13
  export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "./scoring-input.js";
14
14
  export type { DocumentRef } from "../../ailf-shared/index.d.ts";
15
15
  export type { StoredBaseline, StoredReport, StoredRun, StoredTaskResult, StoredTrace, SchemaVersioned, } from "./storage-schema.js";
16
16
  export { CURRENT_SCHEMA_VERSION, isSchemaVersioned, migrateDocument, } from "./storage-schema.js";
17
- export type { AssertionRegistration, FixtureResolverRegistration, ModeRegistration, PluginManifest, PluginRegistry, PresetDefinition, ReportSinkRegistration, RubricTemplateRegistration, } from "./plugin-registry.js";
17
+ export type { AssertionRegistration, FixtureResolverRegistration, ModeBase, ModeRegistration, PluginManifest, PluginRegistry, PresetDefinition, ReportSinkRegistration, RubricTemplateRegistration, } from "./plugin-registry.js";
18
18
  export { InMemoryPluginRegistry } from "./plugin-registry.js";
19
19
  export type { AgentHarnessConfig, AgentHarnessModeConfig, CustomModeConfig, EvalModeConfig, EvalModeType, KnowledgeBaseRef, KnowledgeProbeModeConfig, LiteracyModeConfig, MCPServerConfig, MCPServerModeConfig, ProbeStrategy, SandboxConfig, ToolDef, } from "./eval-mode-config.js";
20
20
  export { evalModeType } from "./eval-mode-config.js";
@@ -25,7 +25,6 @@ export type { ArtifactId, Brand, Err, FixtureId, IdValidationError, NewReportId,
25
25
  export { err, fixtureId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
26
26
  export type { AgentHarnessTaskDefinition, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
27
27
  type DocumentRef = _DocumentRef;
28
- type EvalMode = _EvalMode;
29
28
  /** Aggregated retrieval metrics for a feature area */
30
29
  export interface AreaRetrievalMetrics {
31
30
  area: string;
@@ -119,7 +118,7 @@ export interface FailureModeReport {
119
118
  totalJudgments: number;
120
119
  }
121
120
  /** Failure mode classification for a low-scoring judgment */
122
- export type FailureModeType = "incorrect-docs" | "missing-docs" | "model-limitation" | "outdated-docs" | "poor-structure" | "unclassified";
121
+ export type FailureModeType = "api-error" | "incorrect-docs" | "missing-docs" | "model-limitation" | "outdated-docs" | "poor-structure" | "unclassified";
123
122
  /** Per-feature-area score breakdown */
124
123
  export interface FeatureScore {
125
124
  /**
@@ -352,11 +351,40 @@ export interface ModelEntry {
352
351
  env?: string;
353
352
  id: string;
354
353
  label: string;
355
- modes?: string[];
354
+ /**
355
+ * Which evaluation modes this model participates in.
356
+ *
357
+ * Values must be canonical eval mode names (e.g., "literacy", "mcp-server").
358
+ * When omitted, the model participates in all modes.
359
+ */
360
+ modes?: EvalMode[];
361
+ /**
362
+ * Per-provider timeout in ms. Emitted into Promptfoo provider config.
363
+ * Default: 300_000 (5 min, matching Promptfoo's built-in default).
364
+ */
365
+ timeoutMs?: number;
366
+ /**
367
+ * Per-mode variant whitelist. Keys are eval mode IDs, values are arrays
368
+ * of variant IDs to include for that mode.
369
+ *
370
+ * When a model enrolls in a mode (via `modes`) but does not specify
371
+ * variants for it here, ALL variants defined by the mode base are included.
372
+ *
373
+ * Only meaningful for modes that define variants (e.g., literacy has
374
+ * "baseline", "observed", "agentic-naive", "agentic-optimized").
375
+ * Ignored for modes without variants.
376
+ */
377
+ variants?: Partial<Record<EvalMode, string[]>>;
356
378
  }
357
379
  /** Parsed config/models.yaml structure */
358
380
  export interface ModelsConfig {
359
381
  defaults: Record<string, unknown>;
382
+ /**
383
+ * Maximum wall-clock time per eval step (all tests for one mode) in ms.
384
+ * When exceeded, the subprocess is killed and partial results are used.
385
+ * Default: no limit (backward compatible).
386
+ */
387
+ evalBudgetMs?: number;
360
388
  grader: {
361
389
  id: string;
362
390
  label?: string;
@@ -507,6 +535,21 @@ export interface TestSummary {
507
535
  task: string;
508
536
  error: string;
509
537
  }[];
538
+ /** Per-test timing statistics (when latencyMs is available from Promptfoo) */
539
+ timing?: {
540
+ /** Median test duration in ms */
541
+ medianMs: number;
542
+ /** 95th percentile test duration in ms */
543
+ p95Ms: number;
544
+ /** Maximum test duration in ms */
545
+ maxMs: number;
546
+ /** Tests that exceeded the slow threshold (2x median, min 60s) */
547
+ slowTests: {
548
+ task: string;
549
+ model: string;
550
+ durationMs: number;
551
+ }[];
552
+ };
510
553
  }
511
554
  /** Token usage and estimated cost for a pipeline run. */
512
555
  export interface PipelineUsage {
@@ -575,8 +618,6 @@ export interface ProductFeature {
575
618
  sections: string[];
576
619
  /** Coverage status */
577
620
  status: "covered" | "out-of-scope" | "planned" | "uncovered";
578
- /** Number of evaluation tasks (if covered) */
579
- taskCount?: number;
580
621
  }
581
622
  /** Full classification of a content release for evaluation */
582
623
  export interface ReleaseClassification {
@@ -1,12 +1,12 @@
1
1
  /**
2
2
  * Plugin registry — typed extension points for AILF evaluation capabilities.
3
3
  *
4
- * Twelve extension points: evaluation modes, providers, assertions,
5
- * rubric templates, fixture resolvers, report sinks, dashboard renderers,
6
- * prompt templates, scoring profiles, doc fetcher factory, source defs,
7
- * and feature defs.
8
- *
9
- * Presets bundle multiple extensions into a single installable unit.
4
+ * Three-tier architecture:
5
+ * - **Mode bases** define evaluation methodology (rubrics, scoring, prompts)
6
+ * - **Domain presets** target a mode base and add domain config (sources,
7
+ * features, doc fetcher)
8
+ * - **Framework assertions** are generic evaluation primitives available to
9
+ * all modes
10
10
  *
11
11
  * @see docs/design-docs/architecture-overhaul/extensibility-plugins.md
12
12
  */
@@ -14,6 +14,25 @@ import type { PromptTemplate } from "../ports/mode-handler.js";
14
14
  import type { DocFetcher } from "../ports/doc-fetcher.js";
15
15
  import type { SourceEntry } from "../config-helpers.js";
16
16
  import type { FeatureRegistry } from "../schemas/pipeline.js";
17
+ /**
18
+ * A named variant within an evaluation mode.
19
+ *
20
+ * Modes can declare variants to represent different evaluation strategies
21
+ * that share the same methodology. For example, the literacy mode has
22
+ * "baseline", "observed", and "agentic-*" variants.
23
+ *
24
+ * Models opt into specific variants via `ModelEntry.variants`. When a model
25
+ * enrolls in a mode without specifying variants, all defined variants are
26
+ * included by default.
27
+ */
28
+ export interface ModeVariantDefinition {
29
+ /** Variant identifier (e.g., "baseline", "agentic-naive") */
30
+ id: string;
31
+ /** Human-readable label (e.g., "Standard (baseline)") */
32
+ label: string;
33
+ /** Optional description for docs/CLI help */
34
+ description?: string;
35
+ }
17
36
  /** A registered evaluation mode handler */
18
37
  export interface ModeRegistration {
19
38
  /** Unique mode identifier (e.g., "api-contract") */
@@ -26,6 +45,14 @@ export interface ModeRegistration {
26
45
  rubricTemplateIds: string[];
27
46
  /** Compile function module path (loaded at runtime) */
28
47
  handlerModule: string;
48
+ /**
49
+ * Variants this mode supports. Omit or empty for modes without variants.
50
+ *
51
+ * When defined, models can selectively opt into specific variants via
52
+ * `ModelEntry.variants`. Models that enroll in the mode without specifying
53
+ * variants participate in all defined variants.
54
+ */
55
+ variants?: ModeVariantDefinition[];
29
56
  }
30
57
  /** A registered assertion type */
31
58
  export interface AssertionRegistration {
@@ -33,8 +60,12 @@ export interface AssertionRegistration {
33
60
  type: string;
34
61
  /** Human-readable label */
35
62
  label: string;
36
- /** Which modes this assertion is compatible with */
37
- compatibleModes: string[];
63
+ /**
64
+ * Which modes this assertion is compatible with.
65
+ * When omitted, the assertion is compatible with all modes.
66
+ * When specified, acts as a whitelist of mode IDs.
67
+ */
68
+ compatibleModes?: string[];
38
69
  /** Assertion handler module path */
39
70
  handlerModule: string;
40
71
  }
@@ -65,6 +96,30 @@ export interface ReportSinkRegistration {
65
96
  /** Sink module path */
66
97
  handlerModule: string;
67
98
  }
99
+ /**
100
+ * ModeBase — shared evaluation methodology for a mode.
101
+ *
102
+ * Defines HOW you evaluate (rubrics, scoring, prompts) independently of
103
+ * WHAT you're evaluating (sources, features, docs). Multiple domain presets
104
+ * can target the same mode base and inherit its defaults.
105
+ *
106
+ * Example: the "literacy" mode base defines rubric templates for
107
+ * task-completion, code-correctness, and doc-coverage. Both a Sanity docs
108
+ * preset and an external docs preset can target "literacy" and inherit
109
+ * these rubrics without redefining them.
110
+ */
111
+ export interface ModeBase {
112
+ /** The mode registration (handler, provider patterns, rubric template IDs) */
113
+ mode: ModeRegistration;
114
+ /** Default rubric templates for this mode */
115
+ rubricTemplates?: RubricTemplateRegistration[];
116
+ /** Default scoring profiles for this mode (profile name → dimension weights) */
117
+ scoringProfiles?: Record<string, Record<string, number>>;
118
+ /** Default prompt templates for this mode (template name → template) */
119
+ promptTemplates?: Record<string, PromptTemplate>;
120
+ /** Mode-specific assertion types (beyond framework builtins) */
121
+ assertions?: AssertionRegistration[];
122
+ }
68
123
  /** Plugin manifest describing a single plugin */
69
124
  export interface PluginManifest {
70
125
  /** Plugin name (npm package style) */
@@ -80,32 +135,49 @@ export interface PluginManifest {
80
135
  /** Dependencies on other plugins */
81
136
  requires?: string[];
82
137
  }
83
- /** A preset bundles multiple extensions into an installable unit */
138
+ /**
139
+ * A domain preset targets a mode base and adds domain-specific configuration.
140
+ *
141
+ * The preset inherits evaluation methodology (rubrics, scoring, prompts) from
142
+ * its mode base. It can optionally override any inherited values.
143
+ */
84
144
  export interface PresetDefinition {
85
- /** Preset name */
145
+ /** Preset name (unique identifier) */
86
146
  name: string;
87
147
  /** Plugin manifest */
88
148
  manifest: PluginManifest;
89
- /** Evaluation modes to register */
90
- modes?: ModeRegistration[];
91
- /** Assertion types to register */
92
- assertions?: AssertionRegistration[];
93
- /** Rubric templates to register */
94
- rubricTemplates?: RubricTemplateRegistration[];
95
- /** Fixture resolvers to register */
149
+ /**
150
+ * Lifecycle status — mirrors task status semantics.
151
+ * active: registered and used in evaluations (default)
152
+ * draft: registered but skipped unless explicitly targeted
153
+ * paused: registered but skipped (can be resumed)
154
+ * archived: not registered
155
+ */
156
+ status?: "active" | "archived" | "draft" | "paused";
157
+ /**
158
+ * Which mode this preset targets (by mode ID).
159
+ * Links to a registered ModeBase. The preset inherits rubrics,
160
+ * scoring profiles, and prompt templates from the base.
161
+ */
162
+ mode: string;
163
+ /** Fixture resolvers */
96
164
  fixtureResolvers?: FixtureResolverRegistration[];
97
- /** Report sinks to register */
165
+ /** Report sinks */
98
166
  reportSinks?: ReportSinkRegistration[];
99
- /** Prompt templates keyed by template name (e.g. "with-docs", "agentic") */
100
- promptTemplates?: Record<string, PromptTemplate>;
101
- /** Scoring profiles mapping profile name to dimension-weight pairs */
102
- scoringProfiles?: Record<string, Record<string, number>>;
103
167
  /** Factory function that creates a DocFetcher instance */
104
168
  docFetcher?: () => DocFetcher;
105
169
  /** Documentation source definitions (production, branch, local, etc.) */
106
170
  sourceDefs?: SourceEntry[];
107
171
  /** Product feature registry for coverage tracking */
108
172
  featureDefs?: FeatureRegistry;
173
+ /** Override rubric templates (merged by ID with mode base) */
174
+ rubricTemplates?: RubricTemplateRegistration[];
175
+ /** Override scoring profiles (merged by name with mode base) */
176
+ scoringProfiles?: Record<string, Record<string, number>>;
177
+ /** Override prompt templates (merged by name with mode base) */
178
+ promptTemplates?: Record<string, PromptTemplate>;
179
+ /** Additional mode-specific assertions */
180
+ assertions?: AssertionRegistration[];
109
181
  }
110
182
  /**
111
183
  * PluginRegistry — central registry for all AILF extensions.
@@ -154,10 +226,16 @@ export interface PluginRegistry {
154
226
  registerSourceDefs(sources: SourceEntry[]): void;
155
227
  /** Get all registered source definitions */
156
228
  getSourceDefs(): SourceEntry[];
157
- /** Register a feature registry (last-write-wins) */
229
+ /** Register a feature registry (merged by feature ID with existing) */
158
230
  registerFeatureDefs(features: FeatureRegistry): void;
159
231
  /** Get the registered feature registry, if any */
160
232
  getFeatureDefs(): FeatureRegistry | undefined;
233
+ /** Register a mode base (evaluation methodology) */
234
+ registerModeBase(base: ModeBase): void;
235
+ /** Get a mode base by mode ID */
236
+ getModeBase(modeId: string): ModeBase | undefined;
237
+ /** Get all registered mode bases */
238
+ getModeBases(): ModeBase[];
161
239
  /** Get all registered presets */
162
240
  getPresets(): PresetDefinition[];
163
241
  }
@@ -170,6 +248,7 @@ export declare class InMemoryPluginRegistry implements PluginRegistry {
170
248
  private readonly rubricTemplates_;
171
249
  private readonly fixtureResolvers_;
172
250
  private readonly reportSinks_;
251
+ private readonly modeBases_;
173
252
  private readonly presets_;
174
253
  private promptTemplates_;
175
254
  private scoringProfiles_;
@@ -199,4 +278,7 @@ export declare class InMemoryPluginRegistry implements PluginRegistry {
199
278
  getSourceDefs(): SourceEntry[];
200
279
  registerFeatureDefs(features: FeatureRegistry): void;
201
280
  getFeatureDefs(): FeatureRegistry | undefined;
281
+ registerModeBase(base: ModeBase): void;
282
+ getModeBase(modeId: string): ModeBase | undefined;
283
+ getModeBases(): ModeBase[];
202
284
  }
@@ -1,12 +1,12 @@
1
1
  /**
2
2
  * Plugin registry — typed extension points for AILF evaluation capabilities.
3
3
  *
4
- * Twelve extension points: evaluation modes, providers, assertions,
5
- * rubric templates, fixture resolvers, report sinks, dashboard renderers,
6
- * prompt templates, scoring profiles, doc fetcher factory, source defs,
7
- * and feature defs.
8
- *
9
- * Presets bundle multiple extensions into a single installable unit.
4
+ * Three-tier architecture:
5
+ * - **Mode bases** define evaluation methodology (rubrics, scoring, prompts)
6
+ * - **Domain presets** target a mode base and add domain config (sources,
7
+ * features, doc fetcher)
8
+ * - **Framework assertions** are generic evaluation primitives available to
9
+ * all modes
10
10
  *
11
11
  * @see docs/design-docs/architecture-overhaul/extensibility-plugins.md
12
12
  */
@@ -19,6 +19,7 @@ export class InMemoryPluginRegistry {
19
19
  rubricTemplates_ = new Map();
20
20
  fixtureResolvers_ = new Map();
21
21
  reportSinks_ = new Map();
22
+ modeBases_ = new Map();
22
23
  presets_ = new Map();
23
24
  promptTemplates_ = {};
24
25
  scoringProfiles_ = {};
@@ -26,19 +27,56 @@ export class InMemoryPluginRegistry {
26
27
  sourceDefs_ = [];
27
28
  featureDefs_;
28
29
  registerPreset(preset) {
30
+ // Skip archived presets entirely
31
+ if (preset.status === "archived")
32
+ return;
33
+ // Store draft/paused presets in the map (for later activation via
34
+ // --preset flag) but skip all side-effect registrations. This prevents
35
+ // a draft preset from silently overwriting the doc fetcher, merging
36
+ // scoring profiles, etc.
29
37
  this.presets_.set(preset.name, preset);
30
- if (preset.modes) {
31
- for (const mode of preset.modes)
32
- this.registerMode(mode);
38
+ if (preset.status === "draft" || preset.status === "paused")
39
+ return;
40
+ // Resolve mode base defaults
41
+ const base = this.modeBases_.get(preset.mode);
42
+ if (!base) {
43
+ throw new Error(`Preset "${preset.name}" targets mode "${preset.mode}" ` +
44
+ `but no mode base is registered for it. ` +
45
+ `Available mode bases: ${[...this.modeBases_.keys()].join(", ") || "(none)"}`);
46
+ }
47
+ // Mode is already registered by registerModeBase() — no need to re-register.
48
+ // Merge rubric templates: base defaults + preset overrides (by ID)
49
+ const baseRubrics = new Map((base.rubricTemplates ?? []).map((r) => [r.id, r]));
50
+ for (const r of preset.rubricTemplates ?? []) {
51
+ baseRubrics.set(r.id, r);
52
+ }
53
+ for (const r of baseRubrics.values()) {
54
+ this.registerRubricTemplate(r);
55
+ }
56
+ // Merge scoring profiles: base defaults + preset overrides (by name)
57
+ const profiles = {
58
+ ...base.scoringProfiles,
59
+ ...preset.scoringProfiles,
60
+ };
61
+ if (Object.keys(profiles).length > 0) {
62
+ this.registerScoringProfiles(profiles);
33
63
  }
64
+ // Merge prompt templates: base defaults + preset overrides (by name)
65
+ const prompts = {
66
+ ...base.promptTemplates,
67
+ ...preset.promptTemplates,
68
+ };
69
+ if (Object.keys(prompts).length > 0) {
70
+ this.registerPromptTemplates(prompts);
71
+ }
72
+ // Merge assertions: base + preset (preset overrides by type)
73
+ for (const a of base.assertions ?? [])
74
+ this.registerAssertion(a);
34
75
  if (preset.assertions) {
35
76
  for (const a of preset.assertions)
36
77
  this.registerAssertion(a);
37
78
  }
38
- if (preset.rubricTemplates) {
39
- for (const t of preset.rubricTemplates)
40
- this.registerRubricTemplate(t);
41
- }
79
+ // Register domain-specific fields
42
80
  if (preset.fixtureResolvers) {
43
81
  for (const r of preset.fixtureResolvers)
44
82
  this.registerFixtureResolver(r);
@@ -47,12 +85,6 @@ export class InMemoryPluginRegistry {
47
85
  for (const s of preset.reportSinks)
48
86
  this.registerReportSink(s);
49
87
  }
50
- if (preset.promptTemplates) {
51
- this.registerPromptTemplates(preset.promptTemplates);
52
- }
53
- if (preset.scoringProfiles) {
54
- this.registerScoringProfiles(preset.scoringProfiles);
55
- }
56
88
  if (preset.docFetcher) {
57
89
  this.registerDocFetcherFactory(preset.docFetcher);
58
90
  }
@@ -124,9 +156,30 @@ export class InMemoryPluginRegistry {
124
156
  return this.sourceDefs_;
125
157
  }
126
158
  registerFeatureDefs(features) {
127
- this.featureDefs_ = features;
159
+ if (!this.featureDefs_) {
160
+ this.featureDefs_ = features;
161
+ return;
162
+ }
163
+ // Merge by feature ID: new features override existing on ID collision,
164
+ // existing features not in new set are preserved.
165
+ const merged = new Map(this.featureDefs_.features.map((f) => [f.id, f]));
166
+ for (const f of features.features) {
167
+ merged.set(f.id, f);
168
+ }
169
+ this.featureDefs_ = { features: [...merged.values()] };
128
170
  }
129
171
  getFeatureDefs() {
130
172
  return this.featureDefs_;
131
173
  }
174
+ registerModeBase(base) {
175
+ this.modeBases_.set(base.mode.id, base);
176
+ // Also register the mode itself so getMode() works
177
+ this.registerMode(base.mode);
178
+ }
179
+ getModeBase(modeId) {
180
+ return this.modeBases_.get(modeId);
181
+ }
182
+ getModeBases() {
183
+ return [...this.modeBases_.values()];
184
+ }
132
185
  }
@@ -37,6 +37,21 @@ export type RawEvalMode = EvalMode | "agentic" | "baseline" | "full" | "observed
37
37
  export declare const CANONICAL_EVAL_MODES: readonly ["literacy", "mcp-server", "agent-harness", "knowledge-probe", "custom"];
38
38
  /** Legacy CLI aliases that map to `mode: "literacy"` + variant. */
39
39
  export declare const LEGACY_EVAL_MODE_ALIASES: readonly ["baseline", "agentic", "observed", "full"];
40
+ /**
41
+ * Literacy mode variant names — each is a distinct evaluation strategy.
42
+ *
43
+ * These are the valid values for the `variant` field in PipelineRequest
44
+ * when `mode` is `"literacy"`. They match LEGACY_EVAL_MODE_ALIASES because
45
+ * variants were originally exposed as top-level mode names.
46
+ *
47
+ * - `baseline` — with-docs / without-docs comparison (gold + floor)
48
+ * - `agentic` — model uses tools to find docs (gold only)
49
+ * - `observed` — HTTP-instrumented behavior observation
50
+ * - `full` — combined baseline + agentic
51
+ */
52
+ export declare const LITERACY_VARIANTS: readonly ["baseline", "agentic", "observed", "full"];
53
+ /** Union of all literacy variant string values. */
54
+ export type LiteracyVariant = (typeof LITERACY_VARIANTS)[number];
40
55
  /**
41
56
  * All accepted mode names for Zod enum construction.
42
57
  * Canonical modes first, then legacy aliases.
@@ -22,6 +22,24 @@ export const LEGACY_EVAL_MODE_ALIASES = [
22
22
  "observed",
23
23
  "full",
24
24
  ];
25
+ /**
26
+ * Literacy mode variant names — each is a distinct evaluation strategy.
27
+ *
28
+ * These are the valid values for the `variant` field in PipelineRequest
29
+ * when `mode` is `"literacy"`. They match LEGACY_EVAL_MODE_ALIASES because
30
+ * variants were originally exposed as top-level mode names.
31
+ *
32
+ * - `baseline` — with-docs / without-docs comparison (gold + floor)
33
+ * - `agentic` — model uses tools to find docs (gold only)
34
+ * - `observed` — HTTP-instrumented behavior observation
35
+ * - `full` — combined baseline + agentic
36
+ */
37
+ export const LITERACY_VARIANTS = [
38
+ "baseline",
39
+ "agentic",
40
+ "observed",
41
+ "full",
42
+ ];
25
43
  /**
26
44
  * All accepted mode names for Zod enum construction.
27
45
  * Canonical modes first, then legacy aliases.
@@ -17,7 +17,7 @@ const HINTS = [
17
17
  /no article found for slug/i.test(e.message),
18
18
  hint: "One or more `canonicalDocs` slugs in your task definitions don't match " +
19
19
  "any article in the documentation. Check the `slug` values in " +
20
- "`.ailf/tasks/*.yaml` and ensure they correspond to real articles.\n" +
20
+ "`.ailf/tasks/` and ensure they correspond to real articles.\n" +
21
21
  " Run `ailf validate` to check your task definitions locally.",
22
22
  },
23
23
  {
@@ -51,7 +51,7 @@ const HINTS = [
51
51
  hint: "The documentation fetch step completed but one or more tasks had " +
52
52
  "empty context. This usually means a `canonicalDocs` slug doesn't " +
53
53
  "match any article.\n" +
54
- " Check the slug values in `.ailf/tasks/*.yaml`.",
54
+ " Check the slug values in `.ailf/tasks/`.",
55
55
  },
56
56
  {
57
57
  match: (e) => e.step === "dispatch" && /dispatch failed/i.test(e.message),
@@ -22,7 +22,7 @@
22
22
  * @see docs/design-docs/architecture-overhaul/typescript-configuration.md
23
23
  */
24
24
  import { readFileSync } from "fs";
25
- import { extname } from "path";
25
+ import { extname, resolve } from "path";
26
26
  import { EvalConfigSchema, PipelineRequestSchema, } from "../../_vendor/ailf-core/index.js";
27
27
  import { mapRequestToConfig } from "../../pipeline/map-request-to-config.js";
28
28
  import { normalizeMode } from "../../pipeline/normalize-mode.js";
@@ -93,6 +93,7 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
93
93
  const normalized = normalizeMode(config.mode ?? "literacy");
94
94
  return {
95
95
  rootDir,
96
+ outputDir: resolve(rootDir, "results", "latest"),
96
97
  mode: normalized.mode,
97
98
  variant: normalized.variant,
98
99
  noAutoScope: config.noAutoScope ?? false,
@@ -119,7 +120,12 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
119
120
  allowedOrigins: config.allowedOrigins,
120
121
  searchMode: config.searchMode ?? "open",
121
122
  concurrency: config.concurrency,
123
+ captureEnabled: false,
124
+ captureDir: undefined,
125
+ captureCompress: true,
126
+ captureExtras: true,
122
127
  remote: false,
123
128
  apiUrl: "https://ailf-api.sanity.build",
129
+ presets: config.presets,
124
130
  };
125
131
  }
@@ -13,21 +13,29 @@
13
13
  * @see docs/design-docs/architecture-overhaul/typescript-configuration.md
14
14
  */
15
15
  import { existsSync } from "fs";
16
+ import { pathToFileURL } from "node:url";
16
17
  import { createJiti } from "jiti";
17
18
  // ---------------------------------------------------------------------------
18
- // Singleton jiti instance — reused across all loads for caching
19
+ // jiti instance factory resolves imports relative to the loaded file
19
20
  // ---------------------------------------------------------------------------
20
- let _jiti = null;
21
- function getJiti() {
22
- if (!_jiti) {
23
- _jiti = createJiti(import.meta.url, {
24
- // Interop: handle both `export default` and `module.exports`
25
- interopDefault: true,
26
- // Don't require file extensions in imports
27
- requireCache: true,
28
- });
29
- }
30
- return _jiti;
21
+ /**
22
+ * Create a jiti instance that resolves bare-specifier imports relative to
23
+ * the given file path, not relative to this loader module.
24
+ *
25
+ * This is critical for pnpm workspaces: a task file at `.ailf/tasks/foo.task.ts`
26
+ * importing `@sanity/ailf` must resolve through the dependency graph
27
+ * visible from the task file's directory, not from deep inside packages/eval/.
28
+ *
29
+ * We pass a `file://` URL (not a bare path) so jiti uses ESM resolution,
30
+ * which matches the `"import"` condition in package.json exports maps.
31
+ */
32
+ function createJitiForFile(filePath) {
33
+ return createJiti(pathToFileURL(filePath).href, {
34
+ // Interop: handle both `export default` and `module.exports`
35
+ interopDefault: true,
36
+ // Don't require file extensions in imports
37
+ requireCache: true,
38
+ });
31
39
  }
32
40
  /**
33
41
  * Load a TypeScript or JavaScript config file and return its default export.
@@ -43,7 +51,7 @@ export async function loadTsConfig(filePath) {
43
51
  return { ok: false, error: `File not found: ${filePath}`, path: filePath };
44
52
  }
45
53
  try {
46
- const jiti = getJiti();
54
+ const jiti = createJitiForFile(filePath);
47
55
  const mod = await jiti.import(filePath);
48
56
  const value = extractDefault(mod);
49
57
  if (value === undefined || value === null) {
@@ -33,6 +33,7 @@ export class PromptfooEvalAdapter {
33
33
  cwd: this.rootDir,
34
34
  env: { ...process.env, ...config.env },
35
35
  stdio: "inherit",
36
+ ...(config.maxDurationMs ? { timeout: config.maxDurationMs } : {}),
36
37
  });
37
38
  return {
38
39
  durationMs: Date.now() - start,
@@ -40,10 +41,15 @@ export class PromptfooEvalAdapter {
40
41
  summary: `Evaluation complete (${config.configPath})`,
41
42
  };
42
43
  }
43
- catch {
44
+ catch (err) {
45
+ const isTimeout = err instanceof Error &&
46
+ "killed" in err &&
47
+ err.killed === true;
44
48
  return {
45
49
  durationMs: Date.now() - start,
46
- error: `Promptfoo evaluation failed: ${config.configPath}`,
50
+ error: isTimeout
51
+ ? `Eval subprocess killed after ${config.maxDurationMs}ms time budget`
52
+ : `Promptfoo evaluation failed: ${config.configPath}`,
47
53
  status: "failed",
48
54
  };
49
55
  }
@@ -9,4 +9,3 @@ export { SanityDocFetcher } from "./doc-fetchers/index.js";
9
9
  export { PromptfooEvalAdapter } from "./eval-runners/index.js";
10
10
  export { ConsoleLogger, type ConsoleLoggerOptions, JsonLogger, QuietLogger, } from "./loggers/index.js";
11
11
  export { CliConfigAdapter, FileConfigAdapter } from "./config-sources/index.js";
12
- export { YamlTaskSource } from "./task-sources/index.js";
@@ -9,4 +9,3 @@ export { SanityDocFetcher } from "./doc-fetchers/index.js";
9
9
  export { PromptfooEvalAdapter } from "./eval-runners/index.js";
10
10
  export { ConsoleLogger, JsonLogger, QuietLogger, } from "./loggers/index.js";
11
11
  export { CliConfigAdapter, FileConfigAdapter } from "./config-sources/index.js";
12
- export { YamlTaskSource } from "./task-sources/index.js";
@@ -10,7 +10,7 @@
10
10
  * tasks in a single GeneralizedTaskDefinition[].
11
11
  *
12
12
  * @see packages/core/src/ports/task-source.ts — TaskSource port
13
- * @see docs/exec-plans/tasks-as-content/phase-4-repo-based-tasks.md
13
+ * @see docs/archive/exec-plans/tasks-as-content/phase-4-repo-based-tasks.md
14
14
  */
15
15
  import type { FilterOptions, GeneralizedTaskDefinition, TaskSource } from "../../_vendor/ailf-core/index.d.ts";
16
16
  export declare class CompositeTaskSource implements TaskSource {
@@ -10,7 +10,7 @@
10
10
  * tasks in a single GeneralizedTaskDefinition[].
11
11
  *
12
12
  * @see packages/core/src/ports/task-source.ts — TaskSource port
13
- * @see docs/exec-plans/tasks-as-content/phase-4-repo-based-tasks.md
13
+ * @see docs/archive/exec-plans/tasks-as-content/phase-4-repo-based-tasks.md
14
14
  */
15
15
  export class CompositeTaskSource {
16
16
  sources;