@sanity/ailf 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (444)
  1. package/canonical/grader-references/README.md +2 -2
  2. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  3. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  4. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  5. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  6. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  7. package/config/features.ts +1 -1
  8. package/config/models.ts +28 -23
  9. package/config/sources.ts +1 -1
  10. package/config/thresholds.ts +1 -1
  11. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  13. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  17. package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
  18. package/dist/_vendor/ailf-core/config-helpers.js +29 -0
  19. package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
  20. package/dist/_vendor/ailf-core/examples/index.js +208 -114
  21. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  22. package/dist/_vendor/ailf-core/index.js +1 -0
  23. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  25. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  27. package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
  28. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  29. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  30. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  31. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  32. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  33. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
  34. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
  35. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  36. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  37. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  38. package/dist/_vendor/ailf-core/services/index.js +1 -1
  39. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  40. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
  41. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  42. package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
  43. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
  44. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  45. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  46. package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
  47. package/dist/_vendor/ailf-tasks/cli.js +61 -0
  48. package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
  49. package/dist/_vendor/ailf-tasks/index.js +16 -0
  50. package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
  51. package/dist/_vendor/ailf-tasks/parser.js +73 -0
  52. package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
  53. package/dist/_vendor/ailf-tasks/schemas.js +180 -0
  54. package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
  55. package/dist/_vendor/ailf-tasks/validation.js +162 -0
  56. package/dist/adapters/api-client/remediation.js +2 -2
  57. package/dist/adapters/config-sources/file-config-adapter.js +6 -1
  58. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  59. package/dist/adapters/index.d.ts +0 -1
  60. package/dist/adapters/index.js +0 -1
  61. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  62. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  63. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  64. package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
  65. package/dist/adapters/task-sources/index.d.ts +1 -2
  66. package/dist/adapters/task-sources/index.js +1 -2
  67. package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
  68. package/dist/adapters/task-sources/repo-schemas.js +2 -2
  69. package/dist/adapters/task-sources/repo-task-source.js +1 -1
  70. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  71. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
  73. package/dist/adapters/task-sources/task-file-loader.js +20 -6
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/explain-handler.d.ts +1 -1
  95. package/dist/commands/explain-handler.js +37 -8
  96. package/dist/commands/fetch-docs.js +1 -0
  97. package/dist/commands/generate-configs.d.ts +3 -3
  98. package/dist/commands/generate-configs.js +20 -8
  99. package/dist/commands/init.d.ts +2 -3
  100. package/dist/commands/init.js +56 -170
  101. package/dist/commands/pipeline-action.d.ts +7 -1
  102. package/dist/commands/pipeline-action.js +43 -19
  103. package/dist/commands/pipeline.d.ts +6 -1
  104. package/dist/commands/pipeline.js +7 -2
  105. package/dist/commands/pr-comment.js +1 -0
  106. package/dist/commands/publish.js +1 -0
  107. package/dist/commands/shared/help.js +2 -2
  108. package/dist/commands/update-quality-scores.d.ts +5 -0
  109. package/dist/commands/update-quality-scores.js +20 -0
  110. package/dist/composition-root.d.ts +2 -3
  111. package/dist/composition-root.js +27 -14
  112. package/dist/config/features.ts +23 -0
  113. package/dist/config/models.ts +100 -0
  114. package/dist/config/prompts.ts +16 -0
  115. package/dist/config/rubrics.ts +225 -0
  116. package/dist/config/schedules.ts +47 -0
  117. package/dist/config/sinks.ts +37 -0
  118. package/dist/config/sources.ts +21 -0
  119. package/dist/config/thresholds.ts +61 -0
  120. package/dist/lib/agent-behavior-report.d.ts +8 -0
  121. package/dist/lib/agent-behavior-report.js +185 -0
  122. package/dist/lib/baseline.d.ts +19 -0
  123. package/dist/lib/baseline.js +153 -0
  124. package/dist/lib/calculate-scores.d.ts +23 -0
  125. package/dist/lib/calculate-scores.js +42 -0
  126. package/dist/lib/compare.d.ts +18 -0
  127. package/dist/lib/compare.js +170 -0
  128. package/dist/lib/coverage-audit.d.ts +4 -0
  129. package/dist/lib/coverage-audit.js +42 -0
  130. package/dist/lib/discovery-report.d.ts +13 -0
  131. package/dist/lib/discovery-report.js +57 -0
  132. package/dist/lib/fetch-docs.d.ts +30 -0
  133. package/dist/lib/fetch-docs.js +171 -0
  134. package/dist/lib/generate-configs.d.ts +25 -0
  135. package/dist/lib/generate-configs.js +42 -0
  136. package/dist/lib/grader-api.d.ts +21 -0
  137. package/dist/lib/grader-api.js +34 -0
  138. package/dist/lib/grader-compare.d.ts +19 -0
  139. package/dist/lib/grader-compare.js +91 -0
  140. package/dist/lib/grader-consistency.d.ts +27 -0
  141. package/dist/lib/grader-consistency.js +79 -0
  142. package/dist/lib/grader-sensitivity.d.ts +19 -0
  143. package/dist/lib/grader-sensitivity.js +75 -0
  144. package/dist/lib/grader-validate.d.ts +19 -0
  145. package/dist/lib/grader-validate.js +78 -0
  146. package/dist/lib/measure-retrieval.d.ts +14 -0
  147. package/dist/lib/measure-retrieval.js +71 -0
  148. package/dist/lib/pr-comment.d.ts +16 -0
  149. package/dist/lib/pr-comment.js +28 -0
  150. package/dist/lib/readiness-report.d.ts +13 -0
  151. package/dist/lib/readiness-report.js +108 -0
  152. package/dist/lib/webhook-server.d.ts +11 -0
  153. package/dist/lib/webhook-server.js +24 -0
  154. package/dist/lib/weekly-digest.d.ts +24 -0
  155. package/dist/lib/weekly-digest.js +148 -0
  156. package/dist/orchestration/build-app-context.js +13 -0
  157. package/dist/orchestration/cache-context.d.ts +23 -0
  158. package/dist/orchestration/cache-context.js +43 -0
  159. package/dist/orchestration/env-bridge.d.ts +21 -0
  160. package/dist/orchestration/env-bridge.js +66 -0
  161. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  162. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  163. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  164. package/dist/orchestration/step-runner.js +5 -1
  165. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  166. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  167. package/dist/orchestration/steps/callback-step.js +10 -1
  168. package/dist/orchestration/steps/compare-step.js +6 -3
  169. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  170. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  171. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  172. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  173. package/dist/orchestration/steps/fetch-docs-step.js +30 -16
  174. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  175. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  176. package/dist/orchestration/steps/generate-configs-step.js +50 -15
  177. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  178. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  179. package/dist/orchestration/steps/publish-report-step.js +19 -0
  180. package/dist/orchestration/steps/readiness-step.js +8 -3
  181. package/dist/orchestration/steps/report-step.js +17 -4
  182. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  183. package/dist/orchestration/steps/run-eval-step.js +52 -32
  184. package/dist/pipeline/agent-behavior-report.js +6 -0
  185. package/dist/pipeline/attribution.d.ts +1 -1
  186. package/dist/pipeline/attribution.js +1 -1
  187. package/dist/pipeline/cache.js +29 -15
  188. package/dist/pipeline/calculate-scores.d.ts +2 -0
  189. package/dist/pipeline/calculate-scores.js +70 -33
  190. package/dist/pipeline/checks.d.ts +8 -3
  191. package/dist/pipeline/checks.js +23 -3
  192. package/dist/pipeline/chronic-failures.d.ts +55 -0
  193. package/dist/pipeline/chronic-failures.js +110 -0
  194. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
  195. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  196. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  197. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  198. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  199. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  200. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  201. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  202. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  203. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  204. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  205. package/dist/pipeline/compiler/config-loader.js +42 -2
  206. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  207. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  208. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  209. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  210. package/dist/pipeline/compiler/index.d.ts +2 -5
  211. package/dist/pipeline/compiler/index.js +2 -5
  212. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  213. package/dist/pipeline/compiler/literacy-bridge.js +1 -1
  214. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
  215. package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
  216. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
  217. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
  218. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
  219. package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
  220. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
  221. package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
  222. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
  223. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
  224. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  225. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  226. package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
  227. package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
  228. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
  229. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
  230. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  231. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  232. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
  233. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
  234. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  235. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  236. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  237. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
  238. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
  239. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
  240. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
  241. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
  242. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
  244. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
  245. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
  246. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
  247. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
  248. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
  249. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
  250. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  251. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  252. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
  253. package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
  254. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  255. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  256. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  257. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  258. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  259. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  260. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  261. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  262. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  263. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  264. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  265. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  266. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  267. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  268. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  269. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  270. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  271. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  272. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  273. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  274. package/dist/pipeline/compiler/task-bridge.js +92 -0
  275. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  276. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  277. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  278. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  279. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  280. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  281. package/dist/pipeline/coverage-audit.d.ts +1 -1
  282. package/dist/pipeline/coverage-audit.js +1 -1
  283. package/dist/pipeline/degradations.d.ts +1 -1
  284. package/dist/pipeline/degradations.js +1 -1
  285. package/dist/pipeline/failure-modes.d.ts +1 -1
  286. package/dist/pipeline/failure-modes.js +13 -1
  287. package/dist/pipeline/gap-analysis.d.ts +1 -1
  288. package/dist/pipeline/gap-analysis.js +3 -1
  289. package/dist/pipeline/generate-configs.d.ts +2 -2
  290. package/dist/pipeline/generate-configs.js +15 -8
  291. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  292. package/dist/pipeline/grader-compare-runner.js +7 -1
  293. package/dist/pipeline/grader-comparison.d.ts +1 -1
  294. package/dist/pipeline/grader-comparison.js +1 -1
  295. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  296. package/dist/pipeline/grader-consistency-runner.js +7 -1
  297. package/dist/pipeline/grader-consistency.d.ts +1 -1
  298. package/dist/pipeline/grader-consistency.js +1 -1
  299. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  300. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  301. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  302. package/dist/pipeline/grader-sensitivity.js +1 -1
  303. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  304. package/dist/pipeline/grader-validate-runner.js +2 -2
  305. package/dist/pipeline/grader-validation.d.ts +1 -1
  306. package/dist/pipeline/grader-validation.js +1 -1
  307. package/dist/pipeline/map-request-to-config.js +15 -2
  308. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  309. package/dist/pipeline/mirror-repo-tasks.js +1 -1
  310. package/dist/pipeline/plan-format.d.ts +1 -1
  311. package/dist/pipeline/plan-format.js +1 -1
  312. package/dist/pipeline/plan.d.ts +1 -1
  313. package/dist/pipeline/plan.js +67 -29
  314. package/dist/pipeline/probe.d.ts +1 -1
  315. package/dist/pipeline/probe.js +1 -1
  316. package/dist/pipeline/readiness-report.d.ts +2 -2
  317. package/dist/pipeline/readiness-report.js +2 -2
  318. package/dist/pipeline/release-classification.d.ts +1 -1
  319. package/dist/pipeline/release-classification.js +1 -1
  320. package/dist/pipeline/release-report.d.ts +1 -1
  321. package/dist/pipeline/release-report.js +1 -1
  322. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  323. package/dist/pipeline/repo-eval-comment.js +1 -1
  324. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  325. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  326. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  327. package/dist/pipeline/resolve-mappings.js +44 -44
  328. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  329. package/dist/pipeline/retrieval-metrics.js +28 -20
  330. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  331. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  332. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  333. package/dist/pipeline/steps/compare-step.js +90 -0
  334. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  335. package/dist/pipeline/steps/eval-step.js +347 -0
  336. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  337. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  338. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  339. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  340. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  341. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  342. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  343. package/dist/pipeline/steps/publish-report-step.js +243 -0
  344. package/dist/pipeline/steps/report-step.d.ts +13 -0
  345. package/dist/pipeline/steps/report-step.js +56 -0
  346. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  347. package/dist/pipeline/steps/update-scores-step.js +42 -0
  348. package/dist/pipeline/targeted-loo.d.ts +1 -1
  349. package/dist/pipeline/targeted-loo.js +1 -1
  350. package/dist/pipeline/thresholds.d.ts +1 -1
  351. package/dist/pipeline/thresholds.js +1 -1
  352. package/dist/pipeline/validate.js +13 -0
  353. package/dist/report-store.d.ts +17 -0
  354. package/dist/report-store.js +24 -0
  355. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  356. package/dist/scripts/agent-behavior-report.js +315 -0
  357. package/dist/scripts/baseline.d.ts +43 -0
  358. package/dist/scripts/baseline.js +267 -0
  359. package/dist/scripts/calculate-scores.d.ts +166 -0
  360. package/dist/scripts/calculate-scores.js +1296 -0
  361. package/dist/scripts/compare.d.ts +22 -0
  362. package/dist/scripts/compare.js +334 -0
  363. package/dist/scripts/coverage-audit.d.ts +44 -0
  364. package/dist/scripts/coverage-audit.js +209 -0
  365. package/dist/scripts/debug-eval.d.ts +19 -0
  366. package/dist/scripts/debug-eval.js +73 -0
  367. package/dist/scripts/discovery-report.d.ts +58 -0
  368. package/dist/scripts/discovery-report.js +250 -0
  369. package/dist/scripts/fetch-docs.d.ts +35 -0
  370. package/dist/scripts/fetch-docs.js +472 -0
  371. package/dist/scripts/generate-configs.d.ts +66 -0
  372. package/dist/scripts/generate-configs.js +459 -0
  373. package/dist/scripts/grader-api.d.ts +27 -0
  374. package/dist/scripts/grader-api.js +206 -0
  375. package/dist/scripts/grader-compare.d.ts +22 -0
  376. package/dist/scripts/grader-compare.js +368 -0
  377. package/dist/scripts/grader-consistency.d.ts +20 -0
  378. package/dist/scripts/grader-consistency.js +313 -0
  379. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  380. package/dist/scripts/grader-sensitivity.js +354 -0
  381. package/dist/scripts/grader-validate.d.ts +19 -0
  382. package/dist/scripts/grader-validate.js +267 -0
  383. package/dist/scripts/measure-retrieval.d.ts +10 -0
  384. package/dist/scripts/measure-retrieval.js +145 -0
  385. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  386. package/dist/scripts/migrate-task-mode.js +1 -1
  387. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  388. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  389. package/dist/scripts/pipeline.d.ts +76 -0
  390. package/dist/scripts/pipeline.js +1031 -0
  391. package/dist/scripts/pr-comment.d.ts +10 -0
  392. package/dist/scripts/pr-comment.js +510 -0
  393. package/dist/scripts/readiness-report.d.ts +88 -0
  394. package/dist/scripts/readiness-report.js +342 -0
  395. package/dist/scripts/update-quality-scores.d.ts +15 -0
  396. package/dist/scripts/update-quality-scores.js +184 -0
  397. package/dist/scripts/validate-task-sources.d.ts +1 -1
  398. package/dist/scripts/validate-task-sources.js +1 -1
  399. package/dist/scripts/validate.d.ts +13 -0
  400. package/dist/scripts/validate.js +79 -0
  401. package/dist/scripts/webhook-server.d.ts +26 -0
  402. package/dist/scripts/webhook-server.js +147 -0
  403. package/dist/scripts/weekly-digest.d.ts +24 -0
  404. package/dist/scripts/weekly-digest.js +144 -0
  405. package/dist/sinks/format-slack.d.ts +64 -0
  406. package/dist/sinks/format-slack.js +306 -0
  407. package/dist/sinks/slack-sink.d.ts +27 -0
  408. package/dist/sinks/slack-sink.js +78 -0
  409. package/dist/sinks/types.d.ts +1 -1
  410. package/dist/sinks/types.js +1 -1
  411. package/dist/sinks/webhook-sink.d.ts +19 -0
  412. package/dist/sinks/webhook-sink.js +50 -0
  413. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  414. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  415. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  416. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  417. package/dist/tasks/literacy/functions.task.ts +70 -0
  418. package/dist/tasks/literacy/groq.task.ts +259 -0
  419. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  420. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  421. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  422. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  423. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  424. package/package.json +25 -25
  425. package/tasks/.expanded.agentic.yaml +280 -0
  426. package/tasks/.expanded.yaml +565 -0
  427. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  428. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  429. package/tasks/literacy/content-lake.task.ts +181 -0
  430. package/tasks/literacy/frameworks.task.ts +1 -0
  431. package/tasks/literacy/functions.task.ts +1 -0
  432. package/tasks/literacy/groq.task.ts +1 -0
  433. package/tasks/literacy/image-handling.task.ts +95 -0
  434. package/tasks/literacy/nextjs-live.task.ts +2 -1
  435. package/tasks/literacy/portable-text.task.ts +169 -0
  436. package/tasks/literacy/studio-setup.task.ts +5 -2
  437. package/tasks/literacy/visual-editing.task.ts +1 -0
  438. package/LICENSE +0 -21
  439. package/tasks/frameworks.yaml +0 -98
  440. package/tasks/functions.yaml +0 -51
  441. package/tasks/groq.yaml +0 -216
  442. package/tasks/nextjs-live.yaml +0 -62
  443. package/tasks/studio-setup.yaml +0 -111
  444. package/tasks/visual-editing.yaml +0 -120
@@ -20,3 +20,4 @@ export * from "./examples/index.js";
20
20
  // ---------------------------------------------------------------------------
21
21
  export { defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
22
22
  export { env } from "./env-helper.js";
23
+ export { NoOpArtifactCollector } from "./artifact-capture/noop-collector.js";
@@ -0,0 +1,94 @@
1
+ /**
2
+ * Port: ArtifactCollector — captures pipeline artifacts during execution.
3
+ *
4
+ * Injected into AppContext. When capture is disabled (default), the
5
+ * composition root provides NoOpArtifactCollector. When --capture is
6
+ * set, provides FilesystemArtifactCollector.
7
+ *
8
+ * Design principles:
9
+ * - P1: Zero-cost when off (no-op stub)
10
+ * - P2: Capture, don't intercept (steps call capture() explicitly)
11
+ * - P5: Non-blocking (failures swallowed, never block the pipeline)
12
+ */
13
+ /**
14
+ * The contract for artifact capture during pipeline execution.
15
+ *
16
+ * Steps call capture() for in-memory data and captureFile() for
17
+ * artifacts already on disk. The orchestrator calls flush() once
18
+ * at pipeline end to write everything to the configured destination.
19
+ */
20
+ export interface ArtifactCollector {
21
+ /**
22
+ * Record an in-memory artifact produced during pipeline execution.
23
+ *
24
+ * Callers need not check `enabled` before calling — the NoOp
25
+ * implementation is zero-cost, so unconditional calls are safe.
26
+ *
27
+ * @param step - Pipeline step name (e.g., "run-eval")
28
+ * @param type - Artifact type identifier (e.g., "eval-results")
29
+ * @param data - Content to serialize (JSON or text)
30
+ * @param meta - Optional metadata (variant, model, etc.)
31
+ */
32
+ capture(step: string, type: string, data: unknown, meta?: Record<string, unknown>): void;
33
+ /**
34
+ * Record a file reference for an artifact already on disk.
35
+ * The file is copied into the capture directory on flush().
36
+ *
37
+ * @param step - Pipeline step name
38
+ * @param type - Artifact type identifier
39
+ * @param filePath - Absolute path to the existing file
40
+ * @param meta - Optional metadata
41
+ */
42
+ captureFile(step: string, type: string, filePath: string, meta?: Record<string, unknown>): void;
43
+ /**
44
+ * Flush all captured artifacts to the configured destination.
45
+ * Called once at pipeline end by the orchestrator.
46
+ */
47
+ flush(): Promise<CaptureFlushResult>;
48
+ /** Whether capture is active */
49
+ readonly enabled: boolean;
50
+ /** Whether mode-specific extras are being captured */
51
+ readonly extrasEnabled: boolean;
52
+ }
53
+ /** Result of flushing captured artifacts to the destination. */
54
+ export interface CaptureFlushResult {
55
+ /** Total number of artifacts captured */
56
+ artifactCount: number;
57
+ /** Output path (directory or .tar.gz) */
58
+ destination: string;
59
+ /** Total bytes written (uncompressed) */
60
+ totalBytes: number;
61
+ /** Whether output was compressed */
62
+ compressed: boolean;
63
+ }
64
+ /** A single entry in the capture manifest. */
65
+ export interface ArtifactManifestEntry {
66
+ /** Pipeline step that produced this artifact */
67
+ step: string;
68
+ /** Artifact type identifier */
69
+ type: string;
70
+ /** Relative path within the capture directory */
71
+ path: string;
72
+ /** ISO 8601 timestamp of when capture() was called */
73
+ capturedAt: string;
74
+ /** Byte size of the artifact */
75
+ bytes: number;
76
+ /** Content format */
77
+ format: "json" | "markdown" | "text";
78
+ /** Optional metadata */
79
+ meta?: Record<string, unknown>;
80
+ }
81
+ /** The manifest.json written to each capture directory. */
82
+ export interface ArtifactManifest {
83
+ version: 1;
84
+ captureId: string;
85
+ startedAt: string;
86
+ completedAt: string;
87
+ pipeline: {
88
+ mode: string;
89
+ variant?: string;
90
+ source?: string;
91
+ areas?: string[];
92
+ };
93
+ artifacts: ArtifactManifestEntry[];
94
+ }
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Port: ArtifactCollector — captures pipeline artifacts during execution.
3
+ *
4
+ * Injected into AppContext. When capture is disabled (default), the
5
+ * composition root provides NoOpArtifactCollector. When --capture is
6
+ * set, provides FilesystemArtifactCollector.
7
+ *
8
+ * Design principles:
9
+ * - P1: Zero-cost when off (no-op stub)
10
+ * - P2: Capture, don't intercept (steps call capture() explicitly)
11
+ * - P5: Non-blocking (failures swallowed, never block the pipeline)
12
+ */
13
+ export {};
@@ -0,0 +1,138 @@
1
+ /**
2
+ * Types for cross-run capture comparison.
3
+ *
4
+ * The CaptureComparator reads two capture directories (baseline + experiment)
5
+ * and produces a CaptureDiffReport. Types are defined in core so external
6
+ * tooling can consume diff reports without depending on the eval package.
7
+ *
8
+ * Implementation lives in packages/eval/src/artifact-capture/comparator.ts.
9
+ */
10
+ /** How deeply to compare artifacts. */
11
+ export type ComparisonMode = "strict" | "structural" | "inventory";
12
+ /** Configurable thresholds for comparison. */
13
+ export interface ComparisonOptions {
14
+ /** Comparison depth: inventory (existence), structural (shape), strict (content) */
15
+ mode: ComparisonMode;
16
+ /** Score regression thresholds */
17
+ scoreThresholds?: {
18
+ /** Maximum allowed aggregate score delta (percentage points, default 5) */
19
+ aggregate: number;
20
+ /** Maximum allowed per-task score drop (points, default 10) */
21
+ perTask: number;
22
+ };
23
+ /** Timing regression thresholds */
24
+ timingThresholds?: {
25
+ /** Multiplier — flag steps exceeding this ratio (default 2.0) */
26
+ multiplier: number;
27
+ /** Per-step overrides (step name → custom multiplier) */
28
+ perStep?: Record<string, number>;
29
+ };
30
+ /** JSON structural diff depth (default 3) */
31
+ jsonDiffDepth?: number;
32
+ /** Additional ephemeral fields to ignore (merged with defaults) */
33
+ ephemeralFields?: string[];
34
+ }
35
+ /** Inventory diff — which artifacts exist in each capture. */
36
+ export interface InventoryDiff {
37
+ /** Artifact types in experiment but not in baseline */
38
+ added: string[];
39
+ /** Artifact types in baseline but not in experiment */
40
+ removed: string[];
41
+ /** Artifact types present in both */
42
+ common: string[];
43
+ }
44
+ /** A single structural change in a JSON artifact. */
45
+ export interface JsonDiffEntry {
46
+ /** JSON pointer path (e.g., "config.mode") */
47
+ path: string;
48
+ /** Value in baseline (undefined if key is added) */
49
+ baseline?: unknown;
50
+ /** Value in experiment (undefined if key is removed) */
51
+ experiment?: unknown;
52
+ }
53
+ /** Content diff for a single artifact. */
54
+ export interface ArtifactContentDiff {
55
+ /** Artifact type identifier (step/type) */
56
+ artifactKey: string;
57
+ /** Content format */
58
+ format: "json" | "markdown" | "text";
59
+ /** Structural changes (JSON) or line diff summary (text/markdown) */
60
+ changes: JsonDiffEntry[] | {
61
+ addedLines: number;
62
+ removedLines: number;
63
+ };
64
+ }
65
+ /** Score comparison between two captures. */
66
+ export interface ScoreComparison {
67
+ /** Baseline aggregate score */
68
+ baselineMean: number;
69
+ /** Experiment aggregate score */
70
+ currentMean: number;
71
+ /** Absolute delta (current - baseline) */
72
+ delta: number;
73
+ /** Per-task score deltas */
74
+ perTask: {
75
+ task: string;
76
+ baseline: number;
77
+ current: number;
78
+ delta: number;
79
+ }[];
80
+ /** Tasks that breached configured thresholds */
81
+ breaches: string[];
82
+ }
83
+ /** Timing comparison between two captures. */
84
+ export interface TimingComparison {
85
+ /** Total pipeline duration delta in ms */
86
+ totalDeltaMs: number;
87
+ /** Per-step timing */
88
+ perStep: {
89
+ step: string;
90
+ baselineMs: number;
91
+ currentMs: number;
92
+ ratio: number;
93
+ }[];
94
+ /** Steps that breached the timing multiplier threshold */
95
+ breaches: string[];
96
+ }
97
+ /** Metadata comparison between two captures. */
98
+ export interface MetadataComparison {
99
+ /** Whether pipeline modes match */
100
+ modeMatch: boolean;
101
+ /** Whether pipeline variants match */
102
+ variantMatch: boolean;
103
+ /** Config key differences */
104
+ configDiffs: JsonDiffEntry[];
105
+ }
106
+ /** Security scan results. */
107
+ export interface SecurityScan {
108
+ /** Whether any potential secret leaks were found */
109
+ leaksFound: boolean;
110
+ /** Details of each violation */
111
+ violations: {
112
+ /** Relative artifact file path */
113
+ file: string;
114
+ /** Description of the finding */
115
+ detail: string;
116
+ }[];
117
+ }
118
+ /** The full diff report produced by CaptureComparator. */
119
+ export interface CaptureDiffReport {
120
+ /** Are the two captures semantically equivalent? */
121
+ equivalent: boolean;
122
+ /** Human-readable summary (1-3 sentences) */
123
+ summary: string;
124
+ /** Comparison mode used */
125
+ mode: ComparisonMode;
126
+ /** Artifact inventory diff */
127
+ inventory: InventoryDiff;
128
+ /** Content diffs for common artifacts (structural/strict modes only) */
129
+ content?: ArtifactContentDiff[];
130
+ /** Score comparison (if score-summary exists in both captures) */
131
+ scores?: ScoreComparison;
132
+ /** Timing comparison (if pipeline-context exists in both captures) */
133
+ timing?: TimingComparison;
134
+ /** Metadata comparison */
135
+ metadata?: MetadataComparison;
136
+ /** Security scan results */
137
+ security: SecurityScan;
138
+ }
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Types for cross-run capture comparison.
3
+ *
4
+ * The CaptureComparator reads two capture directories (baseline + experiment)
5
+ * and produces a CaptureDiffReport. Types are defined in core so external
6
+ * tooling can consume diff reports without depending on the eval package.
7
+ *
8
+ * Implementation lives in packages/eval/src/artifact-capture/comparator.ts.
9
+ */
10
+ export {};
@@ -12,6 +12,7 @@
12
12
  * as downstream consumers are converted to use them.
13
13
  */
14
14
  import type { DebugOptions, EvalMode, PluginRegistry } from "../types/index.js";
15
+ import type { ArtifactCollector } from "./artifact-collector.js";
15
16
  import type { CacheStore } from "./cache-store.js";
16
17
  import type { DocFetcher } from "./doc-fetcher.js";
17
18
  import type { EvalRunner } from "./eval-runner.js";
@@ -78,6 +79,8 @@ export interface ResolvedConfig {
78
79
  noRemoteCache: boolean;
79
80
  /** Grader replications for consistency measurement */
80
81
  graderReplications?: number;
82
+ /** Base directory for user-facing pipeline output artifacts. */
83
+ outputDir: string;
81
84
  /** Output path override */
82
85
  outputPath?: string;
83
86
  /** Doc source URL overrides */
@@ -90,6 +93,12 @@ export interface ResolvedConfig {
90
93
  searchMode: "off" | "open" | "origin-only";
91
94
  /** Eval concurrency */
92
95
  concurrency?: number;
96
+ /**
97
+ * Maximum wall-clock time per eval step in ms.
98
+ * When exceeded, the subprocess is killed and partial results are used.
99
+ * Sourced from models config `evalBudgetMs`.
100
+ */
101
+ evalBudgetMs?: number;
93
102
  /** Promptfoo URL from eval output */
94
103
  promptfooUrl?: string;
95
104
  /** Sanity dataset override */
@@ -109,7 +118,7 @@ export interface ResolvedConfig {
109
118
  /** Before option for comparison */
110
119
  beforeOption?: string;
111
120
  /** Task source adapter selection */
112
- taskSourceType?: "content-lake" | "repo" | "yaml";
121
+ taskSourceType?: "content-lake" | "repo";
113
122
  /** Path to repo-based tasks directory (e.g., .ailf/tasks/) */
114
123
  repoTasksPath?: string;
115
124
  /** Report store project ID from .ailf/config.yaml reportStore block */
@@ -142,6 +151,14 @@ export interface ResolvedConfig {
142
151
  apiKey?: string;
143
152
  /** External preset file paths or npm package names to load */
144
153
  presets?: string[];
154
+ /** Whether artifact capture is enabled for this run (default: false) */
155
+ captureEnabled?: boolean;
156
+ /** Base directory for capture output (default: results/captures/) */
157
+ captureDir?: string;
158
+ /** Whether to compress capture output to tar.gz (default: true) */
159
+ captureCompress?: boolean;
160
+ /** Whether to include mode-specific extra artifacts (default: true) */
161
+ captureExtras?: boolean;
145
162
  }
146
163
  /**
147
164
  * Application context — the complete dependency carrier.
@@ -158,6 +175,8 @@ export interface ResolvedConfig {
158
175
  export interface AppContext {
159
176
  /** Evaluation caching (filesystem + optional Content Lake fallback) */
160
177
  readonly cache?: CacheStore;
178
+ /** Artifact capture collector (no-op when --capture is not set) */
179
+ readonly collector: ArtifactCollector;
161
180
  /** Resolved pipeline configuration */
162
181
  readonly config: ResolvedConfig;
163
182
  /** Documentation context fetcher */
@@ -15,6 +15,12 @@ export interface EvalRunConfig {
15
15
  concurrency?: number;
16
16
  /** Environment variables to pass to the eval process */
17
17
  env?: Record<string, string>;
18
+ /**
19
+ * Maximum wall-clock time for this eval subprocess in ms.
20
+ * When exceeded, the process is killed and partial results are used.
21
+ * Default: no limit (backward compatible).
22
+ */
23
+ maxDurationMs?: number;
18
24
  }
19
25
  export interface EvalRunner {
20
26
  /** Run an evaluation and return the step result */
@@ -4,6 +4,8 @@
4
4
  * Ports define the contracts between the domain kernel and the outside world.
5
5
  * Adapters (in packages/eval) implement these interfaces.
6
6
  */
7
+ export type { ArtifactCollector, ArtifactManifest, ArtifactManifestEntry, CaptureFlushResult, } from "./artifact-collector.js";
8
+ export type { ArtifactContentDiff, CaptureDiffReport, ComparisonMode, ComparisonOptions, InventoryDiff, JsonDiffEntry, MetadataComparison, ScoreComparison, SecurityScan, TimingComparison, } from "./capture-comparator.js";
7
9
  export type { CacheEntryMetadata, CacheKey, CacheLookupResult, CacheRecordInput, CacheStore, } from "./cache-store.js";
8
10
  export type { ConfigSource } from "./config-source.js";
9
11
  export type { AppContext, ReportSinkPort, ReportStorePort, ResolvedConfig, } from "./context.js";
@@ -38,6 +38,17 @@ export interface PipelineStep {
38
38
  * When defined, the StepRunner computes a hash and checks the cache.
39
39
  */
40
40
  cacheInputs?(ctx: AppContext): string[];
41
+ /**
42
+ * Cache context strings — non-file state that participates in cache key
43
+ * computation (e.g., mode, variant, area/task/tag filters).
44
+ *
45
+ * Without these, two runs with different CLI flags but identical config
46
+ * files would share a cache entry, causing cross-mode or cross-area
47
+ * contamination.
48
+ *
49
+ * When undefined, only file content determines the cache key.
50
+ */
51
+ cacheContext?(ctx: AppContext): string[];
41
52
  /**
42
53
  * Whether this step is optional — a failure in an optional step
43
54
  * does not stop the pipeline.
@@ -2,9 +2,9 @@
2
2
  * Port: Where task definitions come from.
3
3
  *
4
4
  * Adapters:
5
- * - YamlTaskSource (current) — reads tasks/*.yaml files
6
- * - ContentLakeTaskSource (tasks-as-content Phase 2) — GROQ query
7
- * - RepoTaskSource (tasks-as-content Phase 4) — reads .ailf/tasks/
5
+ * - ContentLakeTaskSource — GROQ query against Sanity Content Lake
6
+ * - RepoTaskSource — reads .ailf/tasks/*.task.ts files
7
+ * - TsTaskFileLoader — reads tasks/{mode}/*.task.ts (eval package)
8
8
  *
9
9
  * The key invariant: the pipeline orchestrator and all downstream steps
10
10
  * work with GeneralizedTaskDefinition[] regardless of where they came from.
@@ -2,9 +2,9 @@
2
2
  * Port: Where task definitions come from.
3
3
  *
4
4
  * Adapters:
5
- * - YamlTaskSource (current) — reads tasks/*.yaml files
6
- * - ContentLakeTaskSource (tasks-as-content Phase 2) — GROQ query
7
- * - RepoTaskSource (tasks-as-content Phase 4) — reads .ailf/tasks/
5
+ * - ContentLakeTaskSource — GROQ query against Sanity Content Lake
6
+ * - RepoTaskSource — reads .ailf/tasks/*.task.ts files
7
+ * - TsTaskFileLoader — reads tasks/{mode}/*.task.ts (eval package)
8
8
  *
9
9
  * The key invariant: the pipeline orchestrator and all downstream steps
10
10
  * work with GeneralizedTaskDefinition[] regardless of where they came from.
@@ -77,10 +77,15 @@ export declare const PipelineRequestSchema: z.ZodObject<{
77
77
  taskMode: z.ZodOptional<z.ZodEnum<{
78
78
  inline: "inline";
79
79
  "content-lake": "content-lake";
80
- yaml: "yaml";
81
80
  }>>;
82
81
  tasks: z.ZodOptional<z.ZodArray<z.ZodString>>;
83
82
  urls: z.ZodOptional<z.ZodArray<z.ZodString>>;
83
+ variant: z.ZodOptional<z.ZodEnum<{
84
+ baseline: "baseline";
85
+ agentic: "agentic";
86
+ observed: "observed";
87
+ full: "full";
88
+ }>>;
84
89
  presets: z.ZodOptional<z.ZodArray<z.ZodString>>;
85
90
  }, z.core.$strip>;
86
91
  /** Inferred TypeScript type for a pipeline request payload. */
@@ -13,7 +13,7 @@
13
13
  * @see packages/eval/src/pipeline/map-request-to-config.ts — maps to ResolvedConfig
14
14
  */
15
15
  import { z } from "zod";
16
- import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
16
+ import { LITERACY_VARIANTS, RAW_EVAL_MODES } from "../../ailf-shared/index.js";
17
17
  // ---------------------------------------------------------------------------
18
18
  // Debug options — boolean shorthand or structured object
19
19
  // ---------------------------------------------------------------------------
@@ -86,9 +86,21 @@ export const PipelineRequestSchema = z.object({
86
86
  searchMode: z.enum(["off", "open", "origin-only"]).optional(),
87
87
  source: z.string().optional(),
88
88
  sourceReportId: z.string().optional(),
89
- taskMode: z.enum(["content-lake", "yaml", "inline"]).optional(),
89
+ taskMode: z.enum(["content-lake", "inline"]).optional(),
90
90
  tasks: z.array(z.string()).optional(),
91
91
  urls: z.array(z.string().url()).optional(),
92
+ /**
93
+ * Literacy variant — only meaningful when mode is "literacy".
94
+ *
95
+ * When provided with a canonical mode (`mode: "literacy"`), this field
96
+ * specifies the variant directly. When mode is a legacy alias (e.g.,
97
+ * `mode: "baseline"`), the variant is derived from the mode name and
98
+ * this field is ignored.
99
+ *
100
+ * Prefer explicit `mode: "literacy", variant: "baseline"` over the
101
+ * legacy `mode: "baseline"` form.
102
+ */
103
+ variant: z.enum(LITERACY_VARIANTS).optional(),
92
104
  /** External preset file paths or npm package names to load */
93
105
  presets: z.array(z.string()).optional(),
94
106
  });
@@ -6,7 +6,9 @@
6
6
  * Extracted from packages/eval/src/lib/generate-configs.ts during
7
7
  * the Ports & Adapters migration (Phase 4e).
8
8
  */
9
+ import type { EvalMode } from "../../ailf-shared/index.d.ts";
9
10
  import type { ModelEntry } from "../types/index.js";
11
+ import type { ModeBase } from "../types/plugin-registry.js";
10
12
  /**
11
13
  * Extract the raw API model name from a Promptfoo provider ID.
12
14
  *
@@ -38,4 +40,17 @@ export declare function mergeConfig(defaults: Record<string, unknown>, modelConf
38
40
  *
39
41
  * Models without a `modes` field match all modes.
40
42
  */
41
- export declare function modelMatchesMode(model: ModelEntry, mode: string): boolean;
43
+ export declare function modelMatchesMode(model: ModelEntry, mode: EvalMode): boolean;
44
+ /**
45
+ * Resolve which variants a model participates in for a given mode.
46
+ *
47
+ * Resolution rules:
48
+ * - If the mode has no variants defined → returns `undefined` (no variant filtering)
49
+ * - If the model specifies variants for this mode → returns that whitelist
50
+ * - If the model omits variants for this mode → returns ALL mode variants (default)
51
+ *
52
+ * @param model - The model entry from models config
53
+ * @param modeBase - The mode base (contains variant definitions)
54
+ * @returns Array of variant IDs, or `undefined` if the mode has no variants
55
+ */
56
+ export declare function resolveModelVariants(model: ModelEntry, modeBase: ModeBase): string[] | undefined;
@@ -84,3 +84,24 @@ export function modelMatchesMode(model, mode) {
84
84
  }
85
85
  return model.modes.includes(mode);
86
86
  }
87
+ /**
88
+ * Resolve which variants a model participates in for a given mode.
89
+ *
90
+ * Resolution rules:
91
+ * - If the mode has no variants defined → returns `undefined` (no variant filtering)
92
+ * - If the model specifies variants for this mode → returns that whitelist
93
+ * - If the model omits variants for this mode → returns ALL mode variants (default)
94
+ *
95
+ * @param model - The model entry from models config
96
+ * @param modeBase - The mode base (contains variant definitions)
97
+ * @returns Array of variant IDs, or `undefined` if the mode has no variants
98
+ */
99
+ export function resolveModelVariants(model, modeBase) {
100
+ const modeVariants = modeBase.mode.variants;
101
+ if (!modeVariants || modeVariants.length === 0)
102
+ return undefined;
103
+ const allVariantIds = modeVariants.map((v) => v.id);
104
+ const modeId = modeBase.mode.id;
105
+ const explicit = model.variants?.[modeId];
106
+ return explicit ?? allVariantIds;
107
+ }
@@ -10,4 +10,4 @@
10
10
  export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
11
11
  export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
12
12
  export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, type AggregationStrategy, type AreaScore, type AssertionScore, type DimensionScore, type EnsembleGradingConfig, type GraderTransitionConfig, type TaskScore, type TaskScoreOptions, } from "./scoring-engine.js";
13
- export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "./config-helpers.js";
13
+ export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
@@ -10,4 +10,4 @@
10
10
  export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
11
11
  export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
12
12
  export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, } from "./scoring-engine.js";
13
- export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "./config-helpers.js";
13
+ export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
@@ -65,6 +65,15 @@ export function classifyRubric(component) {
65
65
  */
66
66
  export function detectFeatureArea(description) {
67
67
  const desc = description.toLowerCase();
68
+ if (desc.includes("portable text")) {
69
+ return "portable-text";
70
+ }
71
+ if (desc.includes("content lake")) {
72
+ return "content-lake";
73
+ }
74
+ if (desc.includes("image handling") || desc.includes("image asset")) {
75
+ return "image-handling";
76
+ }
68
77
  if (desc.includes("studio")) {
69
78
  return "studio-setup";
70
79
  }
@@ -8,7 +8,7 @@
8
8
  *
9
9
  * @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
10
10
  * @see docs/design-docs/architecture-overhaul/test-definition.md (authoring surfaces)
11
- * @see docs/exec-plans/architecture-overhaul/phase-0-foundation-types.md (task 0h)
11
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-0-foundation-types.md (task 0h)
12
12
  */
13
13
  /** Difficulty level for a task */
14
14
  export type TaskDifficulty = "basic" | "intermediate" | "advanced";
@@ -178,6 +178,17 @@ export interface MCPServerTaskDefinition extends TaskCommonFields {
178
178
  url?: string;
179
179
  /** Environment variables for the server process */
180
180
  env?: Record<string, string>;
181
+ /**
182
+ * HTTP headers for remote transports (sse / streamable-http).
183
+ * Merged on top of any auth-derived headers, so explicit values
184
+ * here take precedence over `auth`-generated headers.
185
+ *
186
+ * Values support `{{env.VAR}}` template syntax for secrets.
187
+ *
188
+ * @example
189
+ * headers: { Authorization: "Bearer {{env.SANITY_API_TOKEN}}" }
190
+ */
191
+ headers?: Record<string, string>;
181
192
  /** Startup timeout in milliseconds */
182
193
  startupTimeoutMs?: number;
183
194
  /**
@@ -8,6 +8,6 @@
8
8
  *
9
9
  * @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
10
10
  * @see docs/design-docs/architecture-overhaul/test-definition.md (authoring surfaces)
11
- * @see docs/exec-plans/architecture-overhaul/phase-0-foundation-types.md (task 0h)
11
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-0-foundation-types.md (task 0h)
12
12
  */
13
13
  export {};
@@ -9,7 +9,7 @@
9
9
  * Ports & Adapters migration (Phase 0c). The original file is now a
10
10
  * re-export barrel that preserves backward compatibility.
11
11
  */
12
- import type { DocumentRef as _DocumentRef, EvalMode as _EvalMode } from "../../ailf-shared/index.d.ts";
12
+ import type { DocumentRef as _DocumentRef, EvalMode } from "../../ailf-shared/index.d.ts";
13
13
  export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "./scoring-input.js";
14
14
  export type { DocumentRef } from "../../ailf-shared/index.d.ts";
15
15
  export type { StoredBaseline, StoredReport, StoredRun, StoredTaskResult, StoredTrace, SchemaVersioned, } from "./storage-schema.js";
@@ -25,7 +25,6 @@ export type { ArtifactId, Brand, Err, FixtureId, IdValidationError, NewReportId,
25
25
  export { err, fixtureId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
26
26
  export type { AgentHarnessTaskDefinition, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
27
27
  type DocumentRef = _DocumentRef;
28
- type EvalMode = _EvalMode;
29
28
  /** Aggregated retrieval metrics for a feature area */
30
29
  export interface AreaRetrievalMetrics {
31
30
  area: string;
@@ -119,7 +118,7 @@ export interface FailureModeReport {
119
118
  totalJudgments: number;
120
119
  }
121
120
  /** Failure mode classification for a low-scoring judgment */
122
- export type FailureModeType = "incorrect-docs" | "missing-docs" | "model-limitation" | "outdated-docs" | "poor-structure" | "unclassified";
121
+ export type FailureModeType = "api-error" | "incorrect-docs" | "missing-docs" | "model-limitation" | "outdated-docs" | "poor-structure" | "unclassified";
123
122
  /** Per-feature-area score breakdown */
124
123
  export interface FeatureScore {
125
124
  /**
@@ -352,11 +351,40 @@ export interface ModelEntry {
352
351
  env?: string;
353
352
  id: string;
354
353
  label: string;
355
- modes?: string[];
354
+ /**
355
+ * Which evaluation modes this model participates in.
356
+ *
357
+ * Values must be canonical eval mode names (e.g., "literacy", "mcp-server").
358
+ * When omitted, the model participates in all modes.
359
+ */
360
+ modes?: EvalMode[];
361
+ /**
362
+ * Per-provider timeout in ms. Emitted into Promptfoo provider config.
363
+ * Default: 300_000 (5 min, matching Promptfoo's built-in default).
364
+ */
365
+ timeoutMs?: number;
366
+ /**
367
+ * Per-mode variant whitelist. Keys are eval mode IDs, values are arrays
368
+ * of variant IDs to include for that mode.
369
+ *
370
+ * When a model enrolls in a mode (via `modes`) but does not specify
371
+ * variants for it here, ALL variants defined by the mode base are included.
372
+ *
373
+ * Only meaningful for modes that define variants (e.g., literacy has
374
+ * "baseline", "observed", "agentic-naive", "agentic-optimized").
375
+ * Ignored for modes without variants.
376
+ */
377
+ variants?: Partial<Record<EvalMode, string[]>>;
356
378
  }
357
379
  /** Parsed config/models.yaml structure */
358
380
  export interface ModelsConfig {
359
381
  defaults: Record<string, unknown>;
382
+ /**
383
+ * Maximum wall-clock time per eval step (all tests for one mode) in ms.
384
+ * When exceeded, the subprocess is killed and partial results are used.
385
+ * Default: no limit (backward compatible).
386
+ */
387
+ evalBudgetMs?: number;
360
388
  grader: {
361
389
  id: string;
362
390
  label?: string;
@@ -507,6 +535,21 @@ export interface TestSummary {
507
535
  task: string;
508
536
  error: string;
509
537
  }[];
538
+ /** Per-test timing statistics (when latencyMs is available from Promptfoo) */
539
+ timing?: {
540
+ /** Median test duration in ms */
541
+ medianMs: number;
542
+ /** 95th percentile test duration in ms */
543
+ p95Ms: number;
544
+ /** Maximum test duration in ms */
545
+ maxMs: number;
546
+ /** Tests that exceeded the slow threshold (2x median, min 60s) */
547
+ slowTests: {
548
+ task: string;
549
+ model: string;
550
+ durationMs: number;
551
+ }[];
552
+ };
510
553
  }
511
554
  /** Token usage and estimated cost for a pipeline run. */
512
555
  export interface PipelineUsage {