@sanity/ailf 1.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (499) hide show
  1. package/README.md +0 -1
  2. package/canonical/grader-references/README.md +2 -2
  3. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  4. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  5. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  6. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  7. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  8. package/config/features.ts +1 -1
  9. package/config/models.ts +29 -12
  10. package/config/sources.ts +1 -1
  11. package/config/thresholds.ts +1 -1
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  13. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  17. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  18. package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
  19. package/dist/_vendor/ailf-core/config-helpers.js +51 -2
  20. package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
  21. package/dist/_vendor/ailf-core/examples/index.js +213 -94
  22. package/dist/_vendor/ailf-core/index.d.ts +3 -2
  23. package/dist/_vendor/ailf-core/index.js +2 -1
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  25. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  27. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  28. package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
  29. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  30. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  31. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  32. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  33. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  34. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  35. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  40. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  41. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  42. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  43. package/dist/_vendor/ailf-core/services/index.js +1 -1
  44. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  47. package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  50. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  51. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  52. package/dist/adapters/api-client/remediation.js +2 -2
  53. package/dist/adapters/config-sources/file-config-adapter.js +7 -1
  54. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  55. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  56. package/dist/adapters/index.d.ts +0 -1
  57. package/dist/adapters/index.js +0 -1
  58. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  59. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  60. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  61. package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
  62. package/dist/adapters/task-sources/index.d.ts +3 -4
  63. package/dist/adapters/task-sources/index.js +3 -4
  64. package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
  65. package/dist/adapters/task-sources/repo-schemas.js +228 -20
  66. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  67. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  68. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  69. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  70. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  71. package/dist/adapters/task-sources/repo-validation.js +126 -5
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
  73. package/dist/adapters/task-sources/task-file-loader.js +21 -7
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/coverage-audit.js +3 -1
  95. package/dist/commands/explain-handler.d.ts +1 -1
  96. package/dist/commands/explain-handler.js +37 -8
  97. package/dist/commands/fetch-docs.js +1 -0
  98. package/dist/commands/generate-configs.d.ts +3 -3
  99. package/dist/commands/generate-configs.js +20 -8
  100. package/dist/commands/init.d.ts +5 -4
  101. package/dist/commands/init.js +190 -25
  102. package/dist/commands/pipeline-action.d.ts +7 -1
  103. package/dist/commands/pipeline-action.js +43 -19
  104. package/dist/commands/pipeline.d.ts +6 -1
  105. package/dist/commands/pipeline.js +7 -2
  106. package/dist/commands/pr-comment.js +1 -0
  107. package/dist/commands/publish.js +1 -0
  108. package/dist/commands/shared/help.js +2 -2
  109. package/dist/commands/update-quality-scores.d.ts +5 -0
  110. package/dist/commands/update-quality-scores.js +20 -0
  111. package/dist/commands/validate-tasks.d.ts +2 -2
  112. package/dist/commands/validate-tasks.js +26 -15
  113. package/dist/composition-root.d.ts +15 -4
  114. package/dist/composition-root.js +100 -55
  115. package/dist/config/features.ts +23 -0
  116. package/dist/config/models.ts +100 -0
  117. package/dist/config/prompts.ts +16 -0
  118. package/dist/config/rubrics.ts +225 -0
  119. package/dist/config/schedules.ts +47 -0
  120. package/dist/config/sinks.ts +37 -0
  121. package/dist/config/sources.ts +21 -0
  122. package/dist/config/thresholds.ts +61 -0
  123. package/dist/index.d.ts +41 -0
  124. package/dist/index.js +48 -0
  125. package/dist/lib/agent-behavior-report.d.ts +8 -0
  126. package/dist/lib/agent-behavior-report.js +185 -0
  127. package/dist/lib/baseline.d.ts +19 -0
  128. package/dist/lib/baseline.js +153 -0
  129. package/dist/lib/calculate-scores.d.ts +23 -0
  130. package/dist/lib/calculate-scores.js +42 -0
  131. package/dist/lib/compare.d.ts +18 -0
  132. package/dist/lib/compare.js +170 -0
  133. package/dist/lib/coverage-audit.d.ts +4 -0
  134. package/dist/lib/coverage-audit.js +42 -0
  135. package/dist/lib/discovery-report.d.ts +13 -0
  136. package/dist/lib/discovery-report.js +57 -0
  137. package/dist/lib/fetch-docs.d.ts +30 -0
  138. package/dist/lib/fetch-docs.js +171 -0
  139. package/dist/lib/generate-configs.d.ts +25 -0
  140. package/dist/lib/generate-configs.js +42 -0
  141. package/dist/lib/grader-api.d.ts +21 -0
  142. package/dist/lib/grader-api.js +34 -0
  143. package/dist/lib/grader-compare.d.ts +19 -0
  144. package/dist/lib/grader-compare.js +91 -0
  145. package/dist/lib/grader-consistency.d.ts +27 -0
  146. package/dist/lib/grader-consistency.js +79 -0
  147. package/dist/lib/grader-sensitivity.d.ts +19 -0
  148. package/dist/lib/grader-sensitivity.js +75 -0
  149. package/dist/lib/grader-validate.d.ts +19 -0
  150. package/dist/lib/grader-validate.js +78 -0
  151. package/dist/lib/measure-retrieval.d.ts +14 -0
  152. package/dist/lib/measure-retrieval.js +71 -0
  153. package/dist/lib/pr-comment.d.ts +16 -0
  154. package/dist/lib/pr-comment.js +28 -0
  155. package/dist/lib/readiness-report.d.ts +13 -0
  156. package/dist/lib/readiness-report.js +108 -0
  157. package/dist/lib/webhook-server.d.ts +11 -0
  158. package/dist/lib/webhook-server.js +24 -0
  159. package/dist/lib/weekly-digest.d.ts +24 -0
  160. package/dist/lib/weekly-digest.js +148 -0
  161. package/dist/orchestration/build-app-context.js +13 -0
  162. package/dist/orchestration/build-step-sequence.js +4 -2
  163. package/dist/orchestration/cache-context.d.ts +23 -0
  164. package/dist/orchestration/cache-context.js +43 -0
  165. package/dist/orchestration/env-bridge.d.ts +21 -0
  166. package/dist/orchestration/env-bridge.js +66 -0
  167. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  168. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  169. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  170. package/dist/orchestration/step-runner.js +5 -1
  171. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  172. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  173. package/dist/orchestration/steps/callback-step.js +10 -1
  174. package/dist/orchestration/steps/compare-step.js +6 -3
  175. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  176. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  177. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  178. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  179. package/dist/orchestration/steps/fetch-docs-step.js +32 -19
  180. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  181. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  182. package/dist/orchestration/steps/generate-configs-step.js +77 -26
  183. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  184. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  185. package/dist/orchestration/steps/publish-report-step.js +19 -0
  186. package/dist/orchestration/steps/readiness-step.js +8 -3
  187. package/dist/orchestration/steps/report-step.js +17 -4
  188. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  189. package/dist/orchestration/steps/run-eval-step.js +51 -31
  190. package/dist/pipeline/agent-behavior-report.js +6 -0
  191. package/dist/pipeline/attribution.d.ts +1 -1
  192. package/dist/pipeline/attribution.js +1 -1
  193. package/dist/pipeline/cache.js +29 -15
  194. package/dist/pipeline/calculate-scores.d.ts +2 -0
  195. package/dist/pipeline/calculate-scores.js +70 -33
  196. package/dist/pipeline/chronic-failures.d.ts +55 -0
  197. package/dist/pipeline/chronic-failures.js +110 -0
  198. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  199. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  200. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  201. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
  202. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  203. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  204. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  205. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  206. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  207. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  208. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  209. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  210. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  211. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  212. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  213. package/dist/pipeline/compiler/config-loader.js +42 -2
  214. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  215. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  216. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  217. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  218. package/dist/pipeline/compiler/index.d.ts +2 -5
  219. package/dist/pipeline/compiler/index.js +2 -5
  220. package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
  221. package/dist/pipeline/compiler/literacy-bridge.js +2 -2
  222. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  223. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  224. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  225. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  226. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  227. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  228. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
  229. package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
  230. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  231. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  232. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  233. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  234. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  235. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  236. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  237. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  238. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  239. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  240. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  241. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  242. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  245. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  246. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  247. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  248. package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
  249. package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
  250. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  251. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  252. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  253. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  254. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  255. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  256. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  257. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  258. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  259. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  260. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  261. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  262. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  263. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  264. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  265. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  266. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  267. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  268. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  269. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  270. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  271. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  272. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  273. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  274. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
  275. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  276. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  277. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  278. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  279. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  280. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  281. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  282. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  283. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  284. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
  285. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  286. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  287. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  288. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  289. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
  290. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
  291. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  292. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
  293. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  294. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
  295. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  296. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  297. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
  298. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
  299. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
  300. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  301. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  302. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  303. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  304. package/dist/pipeline/compiler/preset-loader.js +99 -0
  305. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
  306. package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
  307. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  308. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  309. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  310. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  311. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  312. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  313. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  314. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  315. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  316. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  317. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  318. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  319. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  320. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  321. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  322. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  323. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  324. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  325. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  326. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  327. package/dist/pipeline/compiler/task-bridge.js +92 -0
  328. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  329. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  330. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  331. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  332. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  333. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  334. package/dist/pipeline/coverage-audit.d.ts +1 -1
  335. package/dist/pipeline/coverage-audit.js +1 -1
  336. package/dist/pipeline/degradations.d.ts +1 -1
  337. package/dist/pipeline/degradations.js +1 -1
  338. package/dist/pipeline/expand-tasks.d.ts +2 -2
  339. package/dist/pipeline/expand-tasks.js +2 -2
  340. package/dist/pipeline/failure-modes.d.ts +1 -1
  341. package/dist/pipeline/failure-modes.js +13 -1
  342. package/dist/pipeline/gap-analysis.d.ts +1 -1
  343. package/dist/pipeline/gap-analysis.js +3 -1
  344. package/dist/pipeline/generate-configs.d.ts +2 -2
  345. package/dist/pipeline/generate-configs.js +16 -9
  346. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  347. package/dist/pipeline/grader-compare-runner.js +7 -1
  348. package/dist/pipeline/grader-comparison.d.ts +1 -1
  349. package/dist/pipeline/grader-comparison.js +1 -1
  350. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  351. package/dist/pipeline/grader-consistency-runner.js +7 -1
  352. package/dist/pipeline/grader-consistency.d.ts +1 -1
  353. package/dist/pipeline/grader-consistency.js +1 -1
  354. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  355. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  356. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  357. package/dist/pipeline/grader-sensitivity.js +1 -1
  358. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  359. package/dist/pipeline/grader-validate-runner.js +2 -2
  360. package/dist/pipeline/grader-validation.d.ts +1 -1
  361. package/dist/pipeline/grader-validation.js +1 -1
  362. package/dist/pipeline/map-request-to-config.js +16 -2
  363. package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
  364. package/dist/pipeline/mirror-repo-tasks.js +10 -10
  365. package/dist/pipeline/plan-format.d.ts +1 -1
  366. package/dist/pipeline/plan-format.js +1 -1
  367. package/dist/pipeline/plan.d.ts +1 -1
  368. package/dist/pipeline/plan.js +68 -30
  369. package/dist/pipeline/probe.d.ts +1 -1
  370. package/dist/pipeline/probe.js +1 -1
  371. package/dist/pipeline/readiness-report.d.ts +2 -2
  372. package/dist/pipeline/readiness-report.js +2 -2
  373. package/dist/pipeline/release-classification.d.ts +1 -1
  374. package/dist/pipeline/release-classification.js +1 -1
  375. package/dist/pipeline/release-report.d.ts +1 -1
  376. package/dist/pipeline/release-report.js +1 -1
  377. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  378. package/dist/pipeline/repo-eval-comment.js +1 -1
  379. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  380. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  381. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  382. package/dist/pipeline/resolve-mappings.js +44 -44
  383. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  384. package/dist/pipeline/retrieval-metrics.js +28 -20
  385. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  386. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  387. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  388. package/dist/pipeline/steps/compare-step.js +90 -0
  389. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  390. package/dist/pipeline/steps/eval-step.js +347 -0
  391. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  392. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  393. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  394. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  395. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  396. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  397. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  398. package/dist/pipeline/steps/publish-report-step.js +243 -0
  399. package/dist/pipeline/steps/report-step.d.ts +13 -0
  400. package/dist/pipeline/steps/report-step.js +56 -0
  401. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  402. package/dist/pipeline/steps/update-scores-step.js +42 -0
  403. package/dist/pipeline/targeted-loo.d.ts +1 -1
  404. package/dist/pipeline/targeted-loo.js +1 -1
  405. package/dist/pipeline/thresholds.d.ts +1 -1
  406. package/dist/pipeline/thresholds.js +1 -1
  407. package/dist/pipeline/validate.js +13 -0
  408. package/dist/report-store.d.ts +17 -0
  409. package/dist/report-store.js +24 -0
  410. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  411. package/dist/scripts/agent-behavior-report.js +315 -0
  412. package/dist/scripts/baseline.d.ts +43 -0
  413. package/dist/scripts/baseline.js +267 -0
  414. package/dist/scripts/calculate-scores.d.ts +166 -0
  415. package/dist/scripts/calculate-scores.js +1296 -0
  416. package/dist/scripts/compare.d.ts +22 -0
  417. package/dist/scripts/compare.js +334 -0
  418. package/dist/scripts/coverage-audit.d.ts +44 -0
  419. package/dist/scripts/coverage-audit.js +209 -0
  420. package/dist/scripts/debug-eval.d.ts +19 -0
  421. package/dist/scripts/debug-eval.js +73 -0
  422. package/dist/scripts/discovery-report.d.ts +58 -0
  423. package/dist/scripts/discovery-report.js +250 -0
  424. package/dist/scripts/fetch-docs.d.ts +35 -0
  425. package/dist/scripts/fetch-docs.js +472 -0
  426. package/dist/scripts/generate-configs.d.ts +66 -0
  427. package/dist/scripts/generate-configs.js +459 -0
  428. package/dist/scripts/grader-api.d.ts +27 -0
  429. package/dist/scripts/grader-api.js +206 -0
  430. package/dist/scripts/grader-compare.d.ts +22 -0
  431. package/dist/scripts/grader-compare.js +368 -0
  432. package/dist/scripts/grader-consistency.d.ts +20 -0
  433. package/dist/scripts/grader-consistency.js +313 -0
  434. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  435. package/dist/scripts/grader-sensitivity.js +354 -0
  436. package/dist/scripts/grader-validate.d.ts +19 -0
  437. package/dist/scripts/grader-validate.js +267 -0
  438. package/dist/scripts/measure-retrieval.d.ts +10 -0
  439. package/dist/scripts/measure-retrieval.js +145 -0
  440. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  441. package/dist/scripts/migrate-task-mode.js +1 -1
  442. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  443. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  444. package/dist/scripts/pipeline.d.ts +76 -0
  445. package/dist/scripts/pipeline.js +1031 -0
  446. package/dist/scripts/pr-comment.d.ts +10 -0
  447. package/dist/scripts/pr-comment.js +510 -0
  448. package/dist/scripts/readiness-report.d.ts +88 -0
  449. package/dist/scripts/readiness-report.js +342 -0
  450. package/dist/scripts/update-quality-scores.d.ts +15 -0
  451. package/dist/scripts/update-quality-scores.js +184 -0
  452. package/dist/scripts/validate-task-sources.d.ts +1 -1
  453. package/dist/scripts/validate-task-sources.js +1 -1
  454. package/dist/scripts/validate.d.ts +13 -0
  455. package/dist/scripts/validate.js +79 -0
  456. package/dist/scripts/webhook-server.d.ts +26 -0
  457. package/dist/scripts/webhook-server.js +147 -0
  458. package/dist/scripts/weekly-digest.d.ts +24 -0
  459. package/dist/scripts/weekly-digest.js +144 -0
  460. package/dist/sinks/format-slack.d.ts +64 -0
  461. package/dist/sinks/format-slack.js +306 -0
  462. package/dist/sinks/slack-sink.d.ts +27 -0
  463. package/dist/sinks/slack-sink.js +78 -0
  464. package/dist/sinks/types.d.ts +1 -1
  465. package/dist/sinks/types.js +1 -1
  466. package/dist/sinks/webhook-sink.d.ts +19 -0
  467. package/dist/sinks/webhook-sink.js +50 -0
  468. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  469. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  470. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  471. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  472. package/dist/tasks/literacy/functions.task.ts +70 -0
  473. package/dist/tasks/literacy/groq.task.ts +259 -0
  474. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  475. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  476. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  477. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  478. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  479. package/package.json +32 -24
  480. package/tasks/.expanded.agentic.yaml +280 -0
  481. package/tasks/.expanded.yaml +565 -0
  482. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  483. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  484. package/tasks/literacy/content-lake.task.ts +181 -0
  485. package/tasks/literacy/frameworks.task.ts +1 -0
  486. package/tasks/literacy/functions.task.ts +1 -0
  487. package/tasks/literacy/groq.task.ts +1 -0
  488. package/tasks/literacy/image-handling.task.ts +95 -0
  489. package/tasks/literacy/nextjs-live.task.ts +2 -1
  490. package/tasks/literacy/portable-text.task.ts +169 -0
  491. package/tasks/literacy/studio-setup.task.ts +5 -2
  492. package/tasks/literacy/visual-editing.task.ts +1 -0
  493. package/LICENSE +0 -21
  494. package/tasks/frameworks.yaml +0 -98
  495. package/tasks/functions.yaml +0 -51
  496. package/tasks/groq.yaml +0 -216
  497. package/tasks/nextjs-live.yaml +0 -62
  498. package/tasks/studio-setup.yaml +0 -111
  499. package/tasks/visual-editing.yaml +0 -120
@@ -25,7 +25,7 @@
25
25
  * All functions accept rootDir as a parameter — no module-level constants.
26
26
  * No process.argv parsing. No env var fallbacks.
27
27
  *
28
- * @see docs/exec-plans/eliminate-lib-layer.md
28
+ * @see docs/archive/exec-plans/eliminate-lib-layer.md
29
29
  */
30
30
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
31
31
  import { join } from "path";
@@ -454,40 +454,55 @@ function readAndNormalizeResults(resultsPath, log) {
454
454
  resultCount: wrapper.results.length,
455
455
  stats: wrapper.stats,
456
456
  });
457
- const all = wrapper.results.map((r) => ({
458
- cost: r.cost ?? 0,
459
- description: r.testCase?.description ?? "unknown",
460
- error: r.error,
461
- gradingResult: r.gradingResult,
462
- metadata: r.metadata,
463
- provider: r.provider?.label ?? r.provider?.id,
464
- providerId: r.provider?.id,
465
- providerLabel: r.provider?.label,
466
- response: r.response,
467
- vars: r.vars ?? r.testCase?.vars ?? {},
468
- }));
469
- // Filter out results where gradingResult is null (errored/timed-out tests).
470
- // Promptfoo sets gradingResult to null when a test errors before grading.
471
- const valid = all.filter((r) => r.gradingResult !== null);
472
- const skipped = all.length - valid.length;
473
- _log.debug("Filtered null gradingResults", {
474
- totalResults: all.length,
475
- validResults: valid.length,
476
- skippedCount: skipped,
477
- });
478
- if (skipped > 0) {
479
- _log.warn(`⚠ Skipping ${skipped} of ${all.length} result(s) with null gradingResult (errored tests):`);
480
- for (const r of all) {
481
- if (r.gradingResult === null) {
482
- const providerLabel = r.provider ? `[${r.provider}] ` : "";
483
- const errorMsg = r.error
484
- ? r.error.slice(0, 150)
485
- : "unknown error (no error field in result)";
486
- _log.warn(`✗ ${providerLabel}"${r.description}" — ${errorMsg}`);
487
- }
457
+ // Normalize results. Errored tests (null gradingResult) get a synthetic
458
+ // zero-score result so they flow through scoring and failure mode
459
+ // classification as "api-error" instead of being silently dropped.
460
+ const results = [];
461
+ let synthesizedCount = 0;
462
+ for (const r of wrapper.results) {
463
+ const base = {
464
+ cost: r.cost ?? 0,
465
+ description: r.testCase?.description ?? "unknown",
466
+ metadata: r.metadata,
467
+ providerId: r.provider?.id,
468
+ providerLabel: r.provider?.label,
469
+ response: r.response ?? { output: "" },
470
+ vars: r.vars ?? r.testCase?.vars ?? {},
471
+ };
472
+ if (r.gradingResult === null || r.gradingResult === undefined) {
473
+ // Synthesize a zero-score result so errored tests are visible in
474
+ // scoring, gap analysis, and failure mode classification.
475
+ const errorMsg = r.error ?? "unknown error (null gradingResult)";
476
+ synthesizedCount++;
477
+ const providerLabel = r.provider?.label ?? r.provider?.id ?? "";
478
+ _log.warn(`⚠ [api-error] ${providerLabel ? `[${providerLabel}] ` : ""}"${base.description}" — ${errorMsg.slice(0, 150)}`);
479
+ results.push({
480
+ ...base,
481
+ gradingResult: {
482
+ pass: false,
483
+ componentResults: [
484
+ {
485
+ assertion: { type: "llm-rubric" },
486
+ pass: false,
487
+ reason: `[api-error] ${errorMsg}`,
488
+ score: 0,
489
+ },
490
+ ],
491
+ },
492
+ });
488
493
  }
494
+ else {
495
+ results.push({ ...base, gradingResult: r.gradingResult });
496
+ }
497
+ }
498
+ _log.debug("Normalized results", {
499
+ totalResults: wrapper.results.length,
500
+ synthesizedApiErrors: synthesizedCount,
501
+ });
502
+ if (synthesizedCount > 0) {
503
+ _log.warn(`⚠ Synthesized ${synthesizedCount} zero-score result(s) for errored tests (api-error)`);
489
504
  }
490
- return valid;
505
+ return results;
491
506
  }
492
507
  /**
493
508
  * Core scoring logic: takes a pre-filtered array of TestResult and produces
@@ -805,12 +820,34 @@ function computeTestSummary(resultsPath) {
805
820
  failed++;
806
821
  }
807
822
  }
823
+ // Extract per-test timing from latencyMs (when available from Promptfoo)
824
+ const durations = rawResults
825
+ .filter((r) => typeof r.latencyMs === "number")
826
+ .map((r) => ({
827
+ task: r.testCase?.description ?? "unknown",
828
+ model: r.provider?.label ?? r.provider?.id ?? "unknown",
829
+ durationMs: r.latencyMs,
830
+ }));
831
+ let timing;
832
+ if (durations.length > 0) {
833
+ const sorted = durations.map((d) => d.durationMs).sort((a, b) => a - b);
834
+ const medianMs = sorted[Math.floor(sorted.length / 2)];
835
+ const p95Ms = sorted[Math.floor(sorted.length * 0.95)];
836
+ const maxMs = sorted[sorted.length - 1];
837
+ // Flag tests slower than max(2x median, 60s) as "slow"
838
+ const slowThreshold = Math.max(medianMs * 2, 60_000);
839
+ const slowTests = durations
840
+ .filter((d) => d.durationMs > slowThreshold)
841
+ .sort((a, b) => b.durationMs - a.durationMs);
842
+ timing = { medianMs, p95Ms, maxMs, slowTests };
843
+ }
808
844
  return {
809
845
  total: rawResults.length,
810
846
  passed,
811
847
  failed,
812
848
  errored,
813
849
  ...(errors.length > 0 ? { errors } : {}),
850
+ ...(timing ? { timing } : {}),
814
851
  };
815
852
  }
816
853
  function printPerModelReport(perModel, log) {
@@ -0,0 +1,55 @@
1
+ /**
2
+ * pipeline/chronic-failures.ts
3
+ *
4
+ * Aggregates error data across recent reports to identify tasks that
5
+ * consistently fail (error rate >= threshold). This catches structurally
6
+ * broken tasks — wrong model config, tasks too complex for the provider,
7
+ * persistent API incompatibility — that would otherwise be invisible.
8
+ *
9
+ * @see docs/exec-plans/eval-pipeline-timeout-resilience.md — Phase 5
10
+ */
11
+ import type { ReportStore } from "../report-store.js";
12
+ export interface ChronicFailureEntry {
13
+ /** Task ID */
14
+ task: string;
15
+ /** Error rate as a fraction (0–1) */
16
+ errorRate: number;
17
+ /** Number of runs with errors / total runs analyzed */
18
+ errorCount: number;
19
+ totalRuns: number;
20
+ /** Which models are affected and how often */
21
+ modelBreakdown: {
22
+ model: string;
23
+ errorCount: number;
24
+ }[];
25
+ /** Most common error message */
26
+ commonError: string;
27
+ }
28
+ export interface ChronicFailureReport {
29
+ /** Number of reports analyzed */
30
+ lookback: number;
31
+ /** Threshold used for classification */
32
+ threshold: number;
33
+ /** Tasks whose error rate meets or exceeds the threshold */
34
+ failures: ChronicFailureEntry[];
35
+ /** Total reports found (may be less than lookback if not enough history) */
36
+ reportsFound: number;
37
+ }
38
+ export interface ChronicFailureOptions {
39
+ /** Number of recent reports to analyze (default: 10) */
40
+ lookback?: number;
41
+ /** Error rate threshold (0–1) for "chronic" classification (default: 0.5) */
42
+ threshold?: number;
43
+ }
44
+ /**
45
+ * Query recent reports and identify tasks with chronic failures.
46
+ *
47
+ * @param reportStore - The report store to query
48
+ * @param options - Lookback window and threshold
49
+ * @returns Chronic failure report; when no reports are found, `failures` is empty and `reportsFound` is 0
50
+ */
51
+ export declare function detectChronicFailures(reportStore: ReportStore, options?: ChronicFailureOptions): Promise<ChronicFailureReport>;
52
+ /**
53
+ * Format a chronic failure report for console output.
54
+ */
55
+ export declare function formatChronicFailuresConsole(report: ChronicFailureReport): string;
@@ -0,0 +1,110 @@
1
+ /**
2
+ * pipeline/chronic-failures.ts
3
+ *
4
+ * Aggregates error data across recent reports to identify tasks that
5
+ * consistently fail (error rate >= threshold). This catches structurally
6
+ * broken tasks — wrong model config, tasks too complex for the provider,
7
+ * persistent API incompatibility — that would otherwise be invisible.
8
+ *
9
+ * @see docs/exec-plans/eval-pipeline-timeout-resilience.md — Phase 5
10
+ */
11
+ // ---------------------------------------------------------------------------
12
+ // Public API
13
+ // ---------------------------------------------------------------------------
14
+ /**
15
+ * Query recent reports and identify tasks with chronic failures.
16
+ *
17
+ * @param reportStore - The report store to query
18
+ * @param options - Lookback window and threshold
19
+ * @returns Chronic failure report; when no reports are found, `failures` is empty and `reportsFound` is 0
20
+ */
21
+ export async function detectChronicFailures(reportStore, options = {}) {
22
+ const lookback = options.lookback ?? 10;
23
+ const threshold = options.threshold ?? 0.5;
24
+ const reports = await reportStore.queryRecentErrors(lookback);
25
+ if (reports.length === 0) {
26
+ return { lookback, threshold, failures: [], reportsFound: 0 };
27
+ }
28
+ // Aggregate errors by task
29
+ const taskErrors = new Map();
30
+ for (const report of reports) {
31
+ for (const error of report.errors) {
32
+ let entry = taskErrors.get(error.task);
33
+ if (!entry) {
34
+ entry = {
35
+ runsWith: new Set(),
36
+ modelErrors: new Map(),
37
+ errors: [],
38
+ };
39
+ taskErrors.set(error.task, entry);
40
+ }
41
+ entry.runsWith.add(report.reportId);
42
+ entry.modelErrors.set(error.model, (entry.modelErrors.get(error.model) ?? 0) + 1);
43
+ entry.errors.push(error.error);
44
+ }
45
+ }
46
+ // Identify chronic failures (error rate >= threshold)
47
+ const failures = [];
48
+ const totalRuns = reports.length;
49
+ for (const [task, data] of taskErrors) {
50
+ const errorRate = data.runsWith.size / totalRuns;
51
+ if (errorRate >= threshold) {
52
+ // Find the most common error message
53
+ const errorCounts = new Map();
54
+ for (const err of data.errors) {
55
+ const truncated = err.slice(0, 200);
56
+ errorCounts.set(truncated, (errorCounts.get(truncated) ?? 0) + 1);
57
+ }
58
+ const commonError = [...errorCounts.entries()].sort((a, b) => b[1] - a[1])[0]?.[0] ??
59
+ "unknown";
60
+ const modelBreakdown = [...data.modelErrors.entries()]
61
+ .map(([model, errorCount]) => ({ model, errorCount }))
62
+ .sort((a, b) => b.errorCount - a.errorCount);
63
+ failures.push({
64
+ task,
65
+ errorRate,
66
+ errorCount: data.runsWith.size,
67
+ totalRuns,
68
+ modelBreakdown,
69
+ commonError,
70
+ });
71
+ }
72
+ }
73
+ // Sort by error rate descending
74
+ failures.sort((a, b) => b.errorRate - a.errorRate);
75
+ return { lookback, threshold, failures, reportsFound: reports.length };
76
+ }
77
+ // ---------------------------------------------------------------------------
78
+ // Formatting
79
+ // ---------------------------------------------------------------------------
80
+ /**
81
+ * Format a chronic failure report for console output.
82
+ */
83
+ export function formatChronicFailuresConsole(report) {
84
+ const lines = [];
85
+ lines.push(`Chronic Failure Report (last ${report.reportsFound} runs)`);
86
+ lines.push("━".repeat(50));
87
+ lines.push("");
88
+ if (report.failures.length === 0) {
89
+ lines.push(" ✅ No chronic failures detected (all tasks below " +
90
+ `${(report.threshold * 100).toFixed(0)}% error threshold)`);
91
+ lines.push("");
92
+ return lines.join("\n");
93
+ }
94
+ lines.push(` ⚠ ${report.failures.length} task(s) with chronic failures ` +
95
+ `(>${(report.threshold * 100).toFixed(0)}% error rate):`);
96
+ lines.push("");
97
+ for (const f of report.failures) {
98
+ lines.push(` ${f.task}`);
99
+ lines.push(` Error rate: ${f.errorCount}/${f.totalRuns} runs ` +
100
+ `(${(f.errorRate * 100).toFixed(0)}%)`);
101
+ const models = f.modelBreakdown
102
+ .map((m) => `${m.model} (${m.errorCount})`)
103
+ .join(", ");
104
+ lines.push(` Models affected: ${models}`);
105
+ lines.push(` Common error: "${f.commonError}"`);
106
+ lines.push(" Suggested action: Increase timeoutMs for affected models or simplify task");
107
+ lines.push("");
108
+ }
109
+ return lines.join("\n");
110
+ }
@@ -10,7 +10,7 @@
10
10
  import assert from "node:assert/strict";
11
11
  import { describe, it } from "node:test";
12
12
  import { LiteracyVariant } from "../../normalize-mode.js";
13
- import { compileAgentHarnessTask, AGENT_HARNESS_PROMPT_TEMPLATES, handler as agentHandler, validateAgentHarnessTask, } from "../mode-handlers/agent-harness-handler.js";
13
+ import { compileAgentHarnessTask, AGENT_HARNESS_PROMPT_TEMPLATES, handler as agentHandler, validateAgentHarnessTask, } from "../mode-handlers/agent-harness/index.js";
14
14
  import { allAgentHarnessExampleTasks, scaffoldProjectTask, modifyCodeTask, multiFileRefactorTask, } from "../mode-handlers/__fixtures__/agent-harness-example-tasks.js";
15
15
  // ---------------------------------------------------------------------------
16
16
  // Helpers
@@ -10,7 +10,7 @@
10
10
  import assert from "node:assert/strict";
11
11
  import { describe, it } from "node:test";
12
12
  import { LiteracyVariant } from "../../normalize-mode.js";
13
- import { compileKnowledgeProbeTask, handler as probeHandler, KNOWLEDGE_PROBE_PROMPT_TEMPLATES, validateKnowledgeProbeTask, } from "../mode-handlers/knowledge-probe-handler.js";
13
+ import { compileKnowledgeProbeTask, handler as probeHandler, KNOWLEDGE_PROBE_PROMPT_TEMPLATES, validateKnowledgeProbeTask, } from "../mode-handlers/knowledge-probe/index.js";
14
14
  import { allKnowledgeProbeExampleTasks, groqProjectionTask, defineTypeApiTask, ecosystemComparisonTask, } from "../mode-handlers/__fixtures__/knowledge-probe-example-tasks.js";
15
15
  // ---------------------------------------------------------------------------
16
16
  // Helpers
@@ -10,7 +10,7 @@
10
10
  import assert from "node:assert/strict";
11
11
  import { describe, it } from "node:test";
12
12
  import { LiteracyVariant } from "../../normalize-mode.js";
13
- import { compileLiteracyTask, validateLiteracyTask, } from "../mode-handlers/literacy-handler.js";
13
+ import { compileLiteracyTask, validateLiteracyTask, } from "../mode-handlers/literacy/index.js";
14
14
  import { compileLiteracyTasks, compareCompilerOutputs, } from "../literacy-bridge.js";
15
15
  // ---------------------------------------------------------------------------
16
16
  // Helpers
@@ -10,8 +10,7 @@
10
10
  import assert from "node:assert/strict";
11
11
  import { describe, it } from "node:test";
12
12
  import { LiteracyVariant } from "../../normalize-mode.js";
13
- import { compileMCPTask, handler as mcpHandler, MCP_PROMPT_TEMPLATES, validateMCPTask, } from "../mode-handlers/mcp-server-handler.js";
14
- import { buildMCPAssertions } from "../mode-handlers/mcp-assertions.js";
13
+ import { buildMCPAssertions, compileMCPTask, handler as mcpHandler, MCP_PROMPT_TEMPLATES, validateMCPTask, } from "../mode-handlers/mcp-server/index.js";
15
14
  import { allMCPExampleTasks, createAndPublishTask, inspectSchemaTask, queryDocumentsTask, semanticSearchTask, stdioServerTask, } from "../mode-handlers/__fixtures__/mcp-example-tasks.js";
16
15
  // ---------------------------------------------------------------------------
17
16
  // Helpers
@@ -26,6 +25,29 @@ function makeMinimalMCPTask(overrides) {
26
25
  ...overrides,
27
26
  };
28
27
  }
28
+ /** Test models for compilation — simulates models from the registry */
29
+ const TEST_MODELS = [
30
+ {
31
+ id: "anthropic:messages:claude-opus-4-6",
32
+ label: "Claude Opus 4.6",
33
+ config: { temperature: 0.2 },
34
+ },
35
+ {
36
+ id: "openai:responses:gpt-5.4",
37
+ label: "GPT 5.4",
38
+ config: { reasoning_effort: "medium" },
39
+ },
40
+ ];
41
+ /** The custom MCP provider file:// path */
42
+ const MCP_PROVIDER_PATH = "file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js";
43
+ /** Helper to get provider config */
44
+ function cfg(provider) {
45
+ return provider.config;
46
+ }
47
+ /** Helper to get mcpServer sub-config from provider */
48
+ function serverCfg(provider) {
49
+ return cfg(provider)?.mcpServer;
50
+ }
29
51
  // ---------------------------------------------------------------------------
30
52
  // handler.getPrompts() — prompt template ownership
31
53
  // ---------------------------------------------------------------------------
@@ -38,11 +60,9 @@ describe("MCPServerHandler.getPrompts", () => {
38
60
  it("returns templates keyed by MCP-specific IDs (not literacy names)", () => {
39
61
  const prompts = mcpHandler.getPrompts();
40
62
  const keys = Object.keys(prompts);
41
- // Must not use literacy template names
42
63
  assert.ok(!keys.includes("with-docs"), "should not use literacy key 'with-docs'");
43
64
  assert.ok(!keys.includes("without-docs"), "should not use literacy key 'without-docs'");
44
65
  assert.ok(!keys.includes(LiteracyVariant.AGENTIC), "should not use literacy key 'agentic'");
45
- // Must have MCP-appropriate key(s)
46
66
  assert.ok(keys.includes("mcp-server"), "should include 'mcp-server' template");
47
67
  });
48
68
  it("mcp-server template instructs model to use MCP tools", () => {
@@ -50,7 +70,6 @@ describe("MCPServerHandler.getPrompts", () => {
50
70
  const template = prompts["mcp-server"];
51
71
  assert.ok(template, "mcp-server template should exist");
52
72
  assert.ok(template.template.includes("{{task}}"), "should include {{task}} placeholder");
53
- // Should reference MCP tools / tool usage
54
73
  assert.ok(/tool/i.test(template.template), "template should mention tools (MCP-appropriate content)");
55
74
  });
56
75
  it("template has correct PromptTemplate shape", () => {
@@ -121,70 +140,124 @@ describe("validateMCPTask", () => {
121
140
  });
122
141
  });
123
142
  // ---------------------------------------------------------------------------
124
- // compileMCPTask
143
+ // compileMCPTask — provider assembly
125
144
  // ---------------------------------------------------------------------------
126
145
  describe("compileMCPTask", () => {
127
146
  it("produces provider, tests, and prompts", () => {
128
- const result = compileMCPTask(makeMinimalMCPTask());
147
+ const result = compileMCPTask(makeMinimalMCPTask(), { models: TEST_MODELS });
129
148
  assert.ok(result.providers.length > 0, "Should produce providers");
130
149
  assert.ok(result.tests.length > 0, "Should produce test cases");
131
150
  assert.ok(result.prompts.length > 0, "Should produce prompts");
132
151
  });
133
- it("builds Promptfoo-native MCP provider for stdio", () => {
152
+ it("emits file:// providers using the custom MCP tool provider", () => {
134
153
  const result = compileMCPTask(makeMinimalMCPTask({
135
154
  serverConfig: {
136
155
  transport: "stdio",
137
156
  command: "node dist/server.js --flag",
138
157
  },
139
- }));
140
- assert.equal(result.providers.length, 1);
141
- assert.equal(result.providers[0].id, "mcp");
142
- const config = result.providers[0].config;
143
- assert.equal(config.enabled, true);
144
- const server = config.server;
145
- assert.equal(server.command, "node");
146
- assert.deepEqual(server.args, ["dist/server.js", "--flag"]);
147
- });
148
- it("builds Promptfoo-native MCP provider for URL-based transport", () => {
158
+ }), { models: TEST_MODELS });
159
+ assert.equal(result.providers.length, 2, "One provider per model");
160
+ // All providers use the custom MCP tool provider path
161
+ assert.equal(result.providers[0].id, MCP_PROVIDER_PATH);
162
+ assert.equal(result.providers[1].id, MCP_PROVIDER_PATH);
163
+ // Model ID is passed in config
164
+ assert.equal(cfg(result.providers[0]).model, "anthropic:messages:claude-opus-4-6");
165
+ assert.equal(cfg(result.providers[1]).model, "openai:responses:gpt-5.4");
166
+ // MCP server config is in config.mcpServer
167
+ const server = serverCfg(result.providers[0]);
168
+ assert.equal(server.command, "node dist/server.js --flag");
169
+ });
170
+ it("preserves model config in provider config", () => {
171
+ const result = compileMCPTask(makeMinimalMCPTask({
172
+ serverConfig: { transport: "sse", url: "http://localhost:3000/sse" },
173
+ }), { models: TEST_MODELS });
174
+ const c = cfg(result.providers[0]);
175
+ assert.equal(c.temperature, 0.2, "Model config preserved");
176
+ assert.ok(c.mcpServer, "MCP server config present");
177
+ assert.equal(c.maxToolRounds, 5, "Default maxToolRounds");
178
+ });
179
+ it("builds MCP server config for URL-based transport", () => {
180
+ const result = compileMCPTask(makeMinimalMCPTask({
181
+ serverConfig: { transport: "sse", url: "http://localhost:3000/sse" },
182
+ }), { models: TEST_MODELS });
183
+ const server = serverCfg(result.providers[0]);
184
+ assert.equal(server.url, "http://localhost:3000/sse");
185
+ });
186
+ it("maps auth config to mcpServer config", () => {
149
187
  const result = compileMCPTask(makeMinimalMCPTask({
150
188
  serverConfig: {
151
- transport: "sse",
152
- url: "http://localhost:3000/sse",
189
+ transport: "streamable-http",
190
+ url: "https://mcp.example.com",
191
+ auth: { type: "bearer", token: "{{env.MY_TOKEN}}" },
153
192
  },
154
- }));
155
- assert.equal(result.providers[0].id, "mcp");
156
- const config = result.providers[0].config;
157
- const server = config.server;
158
- assert.equal(server.url, "http://localhost:3000/sse");
193
+ }), { models: TEST_MODELS });
194
+ const server = serverCfg(result.providers[0]);
195
+ assert.deepEqual(server.auth, { type: "bearer", token: "{{env.MY_TOKEN}}" });
159
196
  });
160
- it("maps auth config to Promptfoo provider", () => {
197
+ it("maps headers to mcpServer config", () => {
161
198
  const result = compileMCPTask(makeMinimalMCPTask({
162
199
  serverConfig: {
163
200
  transport: "streamable-http",
164
201
  url: "https://mcp.example.com",
165
- auth: {
166
- type: "bearer",
167
- token: "{{env.MY_TOKEN}}",
202
+ headers: {
203
+ Authorization: "Bearer {{env.MY_TOKEN}}",
204
+ "X-Custom": "value",
168
205
  },
169
206
  },
170
- }));
171
- const config = result.providers[0].config;
172
- const server = config.server;
207
+ }), { models: TEST_MODELS });
208
+ const server = serverCfg(result.providers[0]);
209
+ assert.deepEqual(server.headers, {
210
+ Authorization: "Bearer {{env.MY_TOKEN}}",
211
+ "X-Custom": "value",
212
+ });
213
+ });
214
+ it("passes both headers and auth when both present", () => {
215
+ const result = compileMCPTask(makeMinimalMCPTask({
216
+ serverConfig: {
217
+ transport: "streamable-http",
218
+ url: "https://mcp.example.com",
219
+ headers: { "X-Custom": "value" },
220
+ auth: { type: "bearer", token: "{{env.MY_TOKEN}}" },
221
+ },
222
+ }), { models: TEST_MODELS });
223
+ const server = serverCfg(result.providers[0]);
224
+ assert.deepEqual(server.headers, { "X-Custom": "value" });
173
225
  assert.deepEqual(server.auth, {
174
226
  type: "bearer",
175
227
  token: "{{env.MY_TOKEN}}",
176
228
  });
177
229
  });
178
- it("maps capabilities to Promptfoo tools filter", () => {
230
+ it("maps capabilities to mcpTools config", () => {
179
231
  const result = compileMCPTask(makeMinimalMCPTask({
180
232
  capabilities: ["query_documents", "get_schema"],
181
233
  serverConfig: {
182
234
  transport: "streamable-http",
183
235
  url: "https://mcp.example.com",
184
236
  },
185
- }));
186
- const config = result.providers[0].config;
187
- assert.deepEqual(config.tools, ["query_documents", "get_schema"]);
237
+ }), { models: TEST_MODELS });
238
+ assert.deepEqual(cfg(result.providers[0]).mcpTools, [
239
+ "query_documents",
240
+ "get_schema",
241
+ ]);
242
+ });
243
+ it("uses task-level models override when specified", () => {
244
+ const result = compileMCPTask(makeMinimalMCPTask({
245
+ models: ["anthropic:messages:claude-sonnet-4-20250514"],
246
+ serverConfig: { transport: "sse", url: "http://localhost:3000" },
247
+ }), { models: TEST_MODELS });
248
+ assert.equal(result.providers.length, 1);
249
+ assert.equal(cfg(result.providers[0]).model, "anthropic:messages:claude-sonnet-4-20250514");
250
+ });
251
+ it("respects task-level maxToolRounds", () => {
252
+ const result = compileMCPTask(makeMinimalMCPTask({ maxToolRounds: 10 }), {
253
+ models: TEST_MODELS,
254
+ });
255
+ assert.equal(cfg(result.providers[0]).maxToolRounds, 10);
256
+ });
257
+ it("falls back to default model when no models provided", () => {
258
+ const result = compileMCPTask(makeMinimalMCPTask());
259
+ assert.ok(result.providers.length > 0, "Should have a fallback provider");
260
+ assert.ok(result.warnings.some((w) => w.includes("no models")));
188
261
  });
189
262
  it("uses task description as prompt text", () => {
190
263
  const result = compileMCPTask(makeMinimalMCPTask({
@@ -217,7 +290,6 @@ describe("compileMCPTask", () => {
217
290
  ],
218
291
  },
219
292
  }));
220
- // Primary + multi-turn test cases
221
293
  assert.equal(result.tests.length, 2);
222
294
  assert.ok(result.tests[1].description.includes("[multi-turn]"));
223
295
  });
@@ -293,63 +365,61 @@ describe("buildMCPAssertions", () => {
293
365
  // Example task compilation (end-to-end)
294
366
  // ---------------------------------------------------------------------------
295
367
  describe("example MCP tasks — end-to-end compilation", () => {
368
+ const opts = { models: TEST_MODELS };
296
369
  it("compiles all example tasks without errors", () => {
297
370
  for (const task of allMCPExampleTasks) {
298
- const result = compileMCPTask(task);
371
+ const result = compileMCPTask(task, opts);
299
372
  assert.ok(result.providers.length > 0, `${task.id}: should produce providers`);
300
373
  assert.ok(result.tests.length > 0, `${task.id}: should produce test cases`);
301
374
  assert.ok(result.prompts.length > 0, `${task.id}: should produce prompts`);
302
375
  }
303
376
  });
304
377
  it("query task has tool-called + contains + llm-rubric assertions", () => {
305
- const result = compileMCPTask(queryDocumentsTask);
378
+ const result = compileMCPTask(queryDocumentsTask, opts);
306
379
  const asserts = result.tests[0].assert;
307
- // tool-called (→ javascript), contains × 2, llm-rubric
308
380
  assert.equal(asserts.length, 4);
309
- assert.equal(asserts[0].type, "javascript"); // tool-called → javascript
381
+ assert.equal(asserts[0].type, "javascript");
310
382
  assert.equal(asserts[1].type, "contains");
311
383
  assert.equal(asserts[2].type, "contains");
312
384
  assert.equal(asserts[3].type, "llm-rubric");
313
385
  });
314
386
  it("schema task uses get_schema tool", () => {
315
- const result = compileMCPTask(inspectSchemaTask);
387
+ const result = compileMCPTask(inspectSchemaTask, opts);
316
388
  const asserts = result.tests[0].assert;
317
389
  assert.ok(asserts.some((a) => a.type === "javascript" && a.value.includes("get_schema")), "Should have tool-called assertion for get_schema");
318
390
  });
319
391
  it("create-publish task produces multi-turn test case", () => {
320
- const result = compileMCPTask(createAndPublishTask);
321
- // Primary + multi-turn
392
+ const result = compileMCPTask(createAndPublishTask, opts);
322
393
  assert.equal(result.tests.length, 2);
323
394
  assert.ok(result.tests[1].description?.includes("[multi-turn]"));
324
395
  });
325
- it("stdio task has Promptfoo-native MCP provider with command", () => {
326
- const result = compileMCPTask(stdioServerTask);
327
- assert.equal(result.providers[0].id, "mcp");
328
- const config = result.providers[0].config;
329
- assert.equal(config.enabled, true);
330
- const server = config.server;
331
- assert.equal(server.command, "node");
332
- assert.deepEqual(server.args, ["dist/sanity-mcp-server.js"]);
396
+ it("stdio task uses custom provider with command config", () => {
397
+ const result = compileMCPTask(stdioServerTask, opts);
398
+ assert.equal(result.providers[0].id, MCP_PROVIDER_PATH);
399
+ assert.equal(cfg(result.providers[0]).model, "anthropic:messages:claude-opus-4-6");
400
+ const server = serverCfg(result.providers[0]);
401
+ assert.equal(server.command, "node dist/sanity-mcp-server.js");
333
402
  });
334
403
  it("semantic search task has two tool-called + one llm-rubric assertion", () => {
335
- const result = compileMCPTask(semanticSearchTask);
404
+ const result = compileMCPTask(semanticSearchTask, opts);
336
405
  const asserts = result.tests[0].assert;
337
- // tool-called × 2 (→ javascript) + llm-rubric
338
406
  assert.equal(asserts.length, 3);
339
- assert.equal(asserts[0].type, "javascript"); // tool-called → javascript
340
- assert.ok(asserts[0].value.includes("list_embeddings_indices"), "Should have tool-called assertion for list_embeddings_indices");
341
- assert.equal(asserts[1].type, "javascript"); // tool-called → javascript
342
- assert.ok(asserts[1].value.includes("semantic_search"), "Should have tool-called assertion for semantic_search");
407
+ assert.equal(asserts[0].type, "javascript");
408
+ assert.ok(asserts[0].value.includes("list_embeddings_indices"));
409
+ assert.equal(asserts[1].type, "javascript");
410
+ assert.ok(asserts[1].value.includes("semantic_search"));
343
411
  assert.equal(asserts[2].type, "llm-rubric");
344
412
  });
345
413
  it("remote task has bearer auth and tools filter", () => {
346
- const result = compileMCPTask(queryDocumentsTask);
347
- const config = result.providers[0].config;
348
- const server = config.server;
414
+ const result = compileMCPTask(queryDocumentsTask, opts);
415
+ const server = serverCfg(result.providers[0]);
349
416
  assert.deepEqual(server.auth, {
350
417
  type: "bearer",
351
418
  token: "{{env.SANITY_MCP_AUTH_TOKEN}}",
352
419
  });
353
- assert.deepEqual(config.tools, ["query_documents", "get_schema"]);
420
+ assert.deepEqual(cfg(result.providers[0]).mcpTools, [
421
+ "query_documents",
422
+ "get_schema",
423
+ ]);
354
424
  });
355
425
  });