@sanity/ailf 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (444)
  1. package/canonical/grader-references/README.md +2 -2
  2. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  3. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  4. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  5. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  6. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  7. package/config/features.ts +1 -1
  8. package/config/models.ts +28 -23
  9. package/config/sources.ts +1 -1
  10. package/config/thresholds.ts +1 -1
  11. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  13. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  17. package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
  18. package/dist/_vendor/ailf-core/config-helpers.js +29 -0
  19. package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
  20. package/dist/_vendor/ailf-core/examples/index.js +208 -114
  21. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  22. package/dist/_vendor/ailf-core/index.js +1 -0
  23. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  25. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  27. package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
  28. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  29. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  30. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  31. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  32. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  33. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
  34. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
  35. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  36. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  37. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  38. package/dist/_vendor/ailf-core/services/index.js +1 -1
  39. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  40. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
  41. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  42. package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
  43. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
  44. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  45. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  46. package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
  47. package/dist/_vendor/ailf-tasks/cli.js +61 -0
  48. package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
  49. package/dist/_vendor/ailf-tasks/index.js +16 -0
  50. package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
  51. package/dist/_vendor/ailf-tasks/parser.js +73 -0
  52. package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
  53. package/dist/_vendor/ailf-tasks/schemas.js +180 -0
  54. package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
  55. package/dist/_vendor/ailf-tasks/validation.js +162 -0
  56. package/dist/adapters/api-client/remediation.js +2 -2
  57. package/dist/adapters/config-sources/file-config-adapter.js +6 -1
  58. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  59. package/dist/adapters/index.d.ts +0 -1
  60. package/dist/adapters/index.js +0 -1
  61. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  62. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  63. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  64. package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
  65. package/dist/adapters/task-sources/index.d.ts +1 -2
  66. package/dist/adapters/task-sources/index.js +1 -2
  67. package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
  68. package/dist/adapters/task-sources/repo-schemas.js +2 -2
  69. package/dist/adapters/task-sources/repo-task-source.js +1 -1
  70. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  71. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
  73. package/dist/adapters/task-sources/task-file-loader.js +20 -6
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/explain-handler.d.ts +1 -1
  95. package/dist/commands/explain-handler.js +37 -8
  96. package/dist/commands/fetch-docs.js +1 -0
  97. package/dist/commands/generate-configs.d.ts +3 -3
  98. package/dist/commands/generate-configs.js +20 -8
  99. package/dist/commands/init.d.ts +2 -3
  100. package/dist/commands/init.js +56 -170
  101. package/dist/commands/pipeline-action.d.ts +7 -1
  102. package/dist/commands/pipeline-action.js +43 -19
  103. package/dist/commands/pipeline.d.ts +6 -1
  104. package/dist/commands/pipeline.js +7 -2
  105. package/dist/commands/pr-comment.js +1 -0
  106. package/dist/commands/publish.js +1 -0
  107. package/dist/commands/shared/help.js +2 -2
  108. package/dist/commands/update-quality-scores.d.ts +5 -0
  109. package/dist/commands/update-quality-scores.js +20 -0
  110. package/dist/composition-root.d.ts +2 -3
  111. package/dist/composition-root.js +27 -14
  112. package/dist/config/features.ts +23 -0
  113. package/dist/config/models.ts +100 -0
  114. package/dist/config/prompts.ts +16 -0
  115. package/dist/config/rubrics.ts +225 -0
  116. package/dist/config/schedules.ts +47 -0
  117. package/dist/config/sinks.ts +37 -0
  118. package/dist/config/sources.ts +21 -0
  119. package/dist/config/thresholds.ts +61 -0
  120. package/dist/lib/agent-behavior-report.d.ts +8 -0
  121. package/dist/lib/agent-behavior-report.js +185 -0
  122. package/dist/lib/baseline.d.ts +19 -0
  123. package/dist/lib/baseline.js +153 -0
  124. package/dist/lib/calculate-scores.d.ts +23 -0
  125. package/dist/lib/calculate-scores.js +42 -0
  126. package/dist/lib/compare.d.ts +18 -0
  127. package/dist/lib/compare.js +170 -0
  128. package/dist/lib/coverage-audit.d.ts +4 -0
  129. package/dist/lib/coverage-audit.js +42 -0
  130. package/dist/lib/discovery-report.d.ts +13 -0
  131. package/dist/lib/discovery-report.js +57 -0
  132. package/dist/lib/fetch-docs.d.ts +30 -0
  133. package/dist/lib/fetch-docs.js +171 -0
  134. package/dist/lib/generate-configs.d.ts +25 -0
  135. package/dist/lib/generate-configs.js +42 -0
  136. package/dist/lib/grader-api.d.ts +21 -0
  137. package/dist/lib/grader-api.js +34 -0
  138. package/dist/lib/grader-compare.d.ts +19 -0
  139. package/dist/lib/grader-compare.js +91 -0
  140. package/dist/lib/grader-consistency.d.ts +27 -0
  141. package/dist/lib/grader-consistency.js +79 -0
  142. package/dist/lib/grader-sensitivity.d.ts +19 -0
  143. package/dist/lib/grader-sensitivity.js +75 -0
  144. package/dist/lib/grader-validate.d.ts +19 -0
  145. package/dist/lib/grader-validate.js +78 -0
  146. package/dist/lib/measure-retrieval.d.ts +14 -0
  147. package/dist/lib/measure-retrieval.js +71 -0
  148. package/dist/lib/pr-comment.d.ts +16 -0
  149. package/dist/lib/pr-comment.js +28 -0
  150. package/dist/lib/readiness-report.d.ts +13 -0
  151. package/dist/lib/readiness-report.js +108 -0
  152. package/dist/lib/webhook-server.d.ts +11 -0
  153. package/dist/lib/webhook-server.js +24 -0
  154. package/dist/lib/weekly-digest.d.ts +24 -0
  155. package/dist/lib/weekly-digest.js +148 -0
  156. package/dist/orchestration/build-app-context.js +13 -0
  157. package/dist/orchestration/cache-context.d.ts +23 -0
  158. package/dist/orchestration/cache-context.js +43 -0
  159. package/dist/orchestration/env-bridge.d.ts +21 -0
  160. package/dist/orchestration/env-bridge.js +66 -0
  161. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  162. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  163. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  164. package/dist/orchestration/step-runner.js +5 -1
  165. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  166. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  167. package/dist/orchestration/steps/callback-step.js +10 -1
  168. package/dist/orchestration/steps/compare-step.js +6 -3
  169. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  170. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  171. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  172. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  173. package/dist/orchestration/steps/fetch-docs-step.js +30 -16
  174. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  175. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  176. package/dist/orchestration/steps/generate-configs-step.js +50 -15
  177. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  178. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  179. package/dist/orchestration/steps/publish-report-step.js +19 -0
  180. package/dist/orchestration/steps/readiness-step.js +8 -3
  181. package/dist/orchestration/steps/report-step.js +17 -4
  182. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  183. package/dist/orchestration/steps/run-eval-step.js +52 -32
  184. package/dist/pipeline/agent-behavior-report.js +6 -0
  185. package/dist/pipeline/attribution.d.ts +1 -1
  186. package/dist/pipeline/attribution.js +1 -1
  187. package/dist/pipeline/cache.js +29 -15
  188. package/dist/pipeline/calculate-scores.d.ts +2 -0
  189. package/dist/pipeline/calculate-scores.js +70 -33
  190. package/dist/pipeline/checks.d.ts +8 -3
  191. package/dist/pipeline/checks.js +23 -3
  192. package/dist/pipeline/chronic-failures.d.ts +55 -0
  193. package/dist/pipeline/chronic-failures.js +110 -0
  194. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
  195. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  196. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  197. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  198. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  199. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  200. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  201. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  202. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  203. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  204. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  205. package/dist/pipeline/compiler/config-loader.js +42 -2
  206. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  207. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  208. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  209. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  210. package/dist/pipeline/compiler/index.d.ts +2 -5
  211. package/dist/pipeline/compiler/index.js +2 -5
  212. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  213. package/dist/pipeline/compiler/literacy-bridge.js +1 -1
  214. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
  215. package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
  216. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
  217. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
  218. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
  219. package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
  220. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
  221. package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
  222. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
  223. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
  224. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  225. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  226. package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
  227. package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
  228. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
  229. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
  230. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  231. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  232. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
  233. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
  234. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  235. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  236. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  237. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
  238. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
  239. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
  240. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
  241. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
  242. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
  244. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
  245. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
  246. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
  247. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
  248. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
  249. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
  250. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  251. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  252. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
  253. package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
  254. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  255. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  256. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  257. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  258. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  259. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  260. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  261. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  262. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  263. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  264. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  265. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  266. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  267. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  268. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  269. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  270. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  271. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  272. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  273. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  274. package/dist/pipeline/compiler/task-bridge.js +92 -0
  275. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  276. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  277. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  278. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  279. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  280. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  281. package/dist/pipeline/coverage-audit.d.ts +1 -1
  282. package/dist/pipeline/coverage-audit.js +1 -1
  283. package/dist/pipeline/degradations.d.ts +1 -1
  284. package/dist/pipeline/degradations.js +1 -1
  285. package/dist/pipeline/failure-modes.d.ts +1 -1
  286. package/dist/pipeline/failure-modes.js +13 -1
  287. package/dist/pipeline/gap-analysis.d.ts +1 -1
  288. package/dist/pipeline/gap-analysis.js +3 -1
  289. package/dist/pipeline/generate-configs.d.ts +2 -2
  290. package/dist/pipeline/generate-configs.js +15 -8
  291. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  292. package/dist/pipeline/grader-compare-runner.js +7 -1
  293. package/dist/pipeline/grader-comparison.d.ts +1 -1
  294. package/dist/pipeline/grader-comparison.js +1 -1
  295. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  296. package/dist/pipeline/grader-consistency-runner.js +7 -1
  297. package/dist/pipeline/grader-consistency.d.ts +1 -1
  298. package/dist/pipeline/grader-consistency.js +1 -1
  299. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  300. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  301. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  302. package/dist/pipeline/grader-sensitivity.js +1 -1
  303. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  304. package/dist/pipeline/grader-validate-runner.js +2 -2
  305. package/dist/pipeline/grader-validation.d.ts +1 -1
  306. package/dist/pipeline/grader-validation.js +1 -1
  307. package/dist/pipeline/map-request-to-config.js +15 -2
  308. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  309. package/dist/pipeline/mirror-repo-tasks.js +1 -1
  310. package/dist/pipeline/plan-format.d.ts +1 -1
  311. package/dist/pipeline/plan-format.js +1 -1
  312. package/dist/pipeline/plan.d.ts +1 -1
  313. package/dist/pipeline/plan.js +67 -29
  314. package/dist/pipeline/probe.d.ts +1 -1
  315. package/dist/pipeline/probe.js +1 -1
  316. package/dist/pipeline/readiness-report.d.ts +2 -2
  317. package/dist/pipeline/readiness-report.js +2 -2
  318. package/dist/pipeline/release-classification.d.ts +1 -1
  319. package/dist/pipeline/release-classification.js +1 -1
  320. package/dist/pipeline/release-report.d.ts +1 -1
  321. package/dist/pipeline/release-report.js +1 -1
  322. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  323. package/dist/pipeline/repo-eval-comment.js +1 -1
  324. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  325. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  326. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  327. package/dist/pipeline/resolve-mappings.js +44 -44
  328. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  329. package/dist/pipeline/retrieval-metrics.js +28 -20
  330. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  331. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  332. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  333. package/dist/pipeline/steps/compare-step.js +90 -0
  334. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  335. package/dist/pipeline/steps/eval-step.js +347 -0
  336. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  337. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  338. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  339. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  340. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  341. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  342. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  343. package/dist/pipeline/steps/publish-report-step.js +243 -0
  344. package/dist/pipeline/steps/report-step.d.ts +13 -0
  345. package/dist/pipeline/steps/report-step.js +56 -0
  346. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  347. package/dist/pipeline/steps/update-scores-step.js +42 -0
  348. package/dist/pipeline/targeted-loo.d.ts +1 -1
  349. package/dist/pipeline/targeted-loo.js +1 -1
  350. package/dist/pipeline/thresholds.d.ts +1 -1
  351. package/dist/pipeline/thresholds.js +1 -1
  352. package/dist/pipeline/validate.js +13 -0
  353. package/dist/report-store.d.ts +17 -0
  354. package/dist/report-store.js +24 -0
  355. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  356. package/dist/scripts/agent-behavior-report.js +315 -0
  357. package/dist/scripts/baseline.d.ts +43 -0
  358. package/dist/scripts/baseline.js +267 -0
  359. package/dist/scripts/calculate-scores.d.ts +166 -0
  360. package/dist/scripts/calculate-scores.js +1296 -0
  361. package/dist/scripts/compare.d.ts +22 -0
  362. package/dist/scripts/compare.js +334 -0
  363. package/dist/scripts/coverage-audit.d.ts +44 -0
  364. package/dist/scripts/coverage-audit.js +209 -0
  365. package/dist/scripts/debug-eval.d.ts +19 -0
  366. package/dist/scripts/debug-eval.js +73 -0
  367. package/dist/scripts/discovery-report.d.ts +58 -0
  368. package/dist/scripts/discovery-report.js +250 -0
  369. package/dist/scripts/fetch-docs.d.ts +35 -0
  370. package/dist/scripts/fetch-docs.js +472 -0
  371. package/dist/scripts/generate-configs.d.ts +66 -0
  372. package/dist/scripts/generate-configs.js +459 -0
  373. package/dist/scripts/grader-api.d.ts +27 -0
  374. package/dist/scripts/grader-api.js +206 -0
  375. package/dist/scripts/grader-compare.d.ts +22 -0
  376. package/dist/scripts/grader-compare.js +368 -0
  377. package/dist/scripts/grader-consistency.d.ts +20 -0
  378. package/dist/scripts/grader-consistency.js +313 -0
  379. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  380. package/dist/scripts/grader-sensitivity.js +354 -0
  381. package/dist/scripts/grader-validate.d.ts +19 -0
  382. package/dist/scripts/grader-validate.js +267 -0
  383. package/dist/scripts/measure-retrieval.d.ts +10 -0
  384. package/dist/scripts/measure-retrieval.js +145 -0
  385. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  386. package/dist/scripts/migrate-task-mode.js +1 -1
  387. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  388. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  389. package/dist/scripts/pipeline.d.ts +76 -0
  390. package/dist/scripts/pipeline.js +1031 -0
  391. package/dist/scripts/pr-comment.d.ts +10 -0
  392. package/dist/scripts/pr-comment.js +510 -0
  393. package/dist/scripts/readiness-report.d.ts +88 -0
  394. package/dist/scripts/readiness-report.js +342 -0
  395. package/dist/scripts/update-quality-scores.d.ts +15 -0
  396. package/dist/scripts/update-quality-scores.js +184 -0
  397. package/dist/scripts/validate-task-sources.d.ts +1 -1
  398. package/dist/scripts/validate-task-sources.js +1 -1
  399. package/dist/scripts/validate.d.ts +13 -0
  400. package/dist/scripts/validate.js +79 -0
  401. package/dist/scripts/webhook-server.d.ts +26 -0
  402. package/dist/scripts/webhook-server.js +147 -0
  403. package/dist/scripts/weekly-digest.d.ts +24 -0
  404. package/dist/scripts/weekly-digest.js +144 -0
  405. package/dist/sinks/format-slack.d.ts +64 -0
  406. package/dist/sinks/format-slack.js +306 -0
  407. package/dist/sinks/slack-sink.d.ts +27 -0
  408. package/dist/sinks/slack-sink.js +78 -0
  409. package/dist/sinks/types.d.ts +1 -1
  410. package/dist/sinks/types.js +1 -1
  411. package/dist/sinks/webhook-sink.d.ts +19 -0
  412. package/dist/sinks/webhook-sink.js +50 -0
  413. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  414. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  415. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  416. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  417. package/dist/tasks/literacy/functions.task.ts +70 -0
  418. package/dist/tasks/literacy/groq.task.ts +259 -0
  419. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  420. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  421. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  422. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  423. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  424. package/package.json +25 -25
  425. package/tasks/.expanded.agentic.yaml +280 -0
  426. package/tasks/.expanded.yaml +565 -0
  427. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  428. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  429. package/tasks/literacy/content-lake.task.ts +181 -0
  430. package/tasks/literacy/frameworks.task.ts +1 -0
  431. package/tasks/literacy/functions.task.ts +1 -0
  432. package/tasks/literacy/groq.task.ts +1 -0
  433. package/tasks/literacy/image-handling.task.ts +95 -0
  434. package/tasks/literacy/nextjs-live.task.ts +2 -1
  435. package/tasks/literacy/portable-text.task.ts +169 -0
  436. package/tasks/literacy/studio-setup.task.ts +5 -2
  437. package/tasks/literacy/visual-editing.task.ts +1 -0
  438. package/LICENSE +0 -21
  439. package/tasks/frameworks.yaml +0 -98
  440. package/tasks/functions.yaml +0 -51
  441. package/tasks/groq.yaml +0 -216
  442. package/tasks/nextjs-live.yaml +0 -62
  443. package/tasks/studio-setup.yaml +0 -111
  444. package/tasks/visual-editing.yaml +0 -120
@@ -0,0 +1,368 @@
1
+ /**
2
+ * mcp-tool-provider.ts — Custom Promptfoo provider for MCP tool-use evaluation.
3
+ *
4
+ * Implements a multi-turn tool execution loop: the LLM receives a prompt,
5
+ * discovers MCP tools, calls them, gets results, and continues until it
6
+ * produces a final text answer or exhausts maxToolRounds.
7
+ *
8
+ * Promptfoo's built-in Anthropic/OpenAI providers with config.mcp only do
9
+ * single-turn tool calls. This provider fills that gap by managing the
10
+ * full conversation loop, similar to the agentic-provider.ts pattern.
11
+ *
12
+ * Promptfoo config usage:
13
+ *
14
+ * providers:
15
+ * - id: file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js
16
+ * label: "Claude Opus 4.6 + MCP"
17
+ * config:
18
+ * model: anthropic:messages:claude-opus-4-6
19
+ * maxToolRounds: 5
20
+ * temperature: 0.2
21
+ * max_tokens: 4096
22
+ * mcpServer:
23
+ * url: https://mcp.sanity.io
24
+ * auth: { type: bearer, token: "{{env.SANITY_API_TOKEN}}" }
25
+ * name: mcp-live-query-documents
26
+ * mcpTools: [query_documents, get_schema]
27
+ */
28
+ import { config as loadDotenv } from "dotenv";
29
// Module-level side effect: loads a .env file at import time. The file location
// is the relative reference "../../../../.env" resolved against this module's
// own URL (import.meta.url).
// `override: true` is passed to dotenv; per dotenv's documented option this lets
// values from the file replace variables already present in process.env.
// NOTE(review): `.pathname` of a file: URL stays percent-encoded and, on
// Windows, keeps a leading "/" before the drive letter. Node documents
// `fileURLToPath` (node:url) for this conversion — confirm whether paths with
// spaces or Windows hosts need to be supported here.
+ loadDotenv({
30
+ override: true,
31
+ path: new URL("../../../../.env", import.meta.url).pathname,
32
+ });
33
+ // ---------------------------------------------------------------------------
34
+ // Provider
35
+ // ---------------------------------------------------------------------------
36
+ export default class MCPToolProvider {
37
+ config;
38
+ providerId;
39
/**
 * Stores the provider settings and identifier taken from `options`.
 *
 * @param {object} [options] - Construction options; defaults to `{}`.
 *   `options.config` is kept on `this.config` (an empty object is used when it
 *   is falsy). `options.id` is kept on `this.providerId` (the string
 *   "mcp-tool-provider" is used when it is falsy).
 */
+ constructor(options = {}) {
40
+ this.config = options.config || {};
41
+ this.providerId = options.id || "mcp-tool-provider";
42
+ }
43
/**
 * Returns the identifier recorded by the constructor.
 *
 * @returns The current value of `this.providerId`.
 */
+ id() {
44
+ return this.providerId;
45
+ }
46
/**
 * Handles one prompt: checks that an MCP server is configured, selects the
 * model ID, and hands off to the Anthropic tool loop.
 *
 * Problems are reported through the returned object rather than thrown by this
 * method itself.
 *
 * @param prompt - Passed through unchanged to `runAnthropicLoop`.
 * @param _context - Not read by this method.
 * @returns A promise resolving to whatever `runAnthropicLoop` returns, or to
 *   `{ error, output: undefined }` when `config.mcpServer` is missing or the
 *   model ID does not start with "anthropic:".
 */
+ async callApi(prompt, _context) {
47
+ const mcpServerConfig = this.config.mcpServer;
48
// Without an MCP server configuration there is nothing to connect to.
+ if (!mcpServerConfig) {
49
+ return { error: "mcpServer config is required", output: undefined };
50
+ }
51
+ // Resolve model provider
52
// A falsy config.model falls back to the default Anthropic model ID below.
+ const modelId = this.config.model ||
53
+ "anthropic:messages:claude-sonnet-4-20250514";
54
// Dispatch is decided purely by the "anthropic:" prefix of the model ID.
+ if (modelId.startsWith("anthropic:")) {
55
+ return this.runAnthropicLoop(prompt, mcpServerConfig, modelId);
56
+ }
57
+ // For now, only Anthropic is supported. OpenAI support can be added later.
58
+ return {
59
+ error: `MCP tool provider only supports Anthropic models for now. Got: ${modelId}`,
60
+ output: undefined,
61
+ };
62
+ }
63
+ // -------------------------------------------------------------------------
64
+ // Anthropic multi-turn MCP tool loop
65
+ // -------------------------------------------------------------------------
66
+ async runAnthropicLoop(prompt, mcpServerConfig, modelId) {
67
+ // Parse model name from provider ID (e.g., "anthropic:messages:claude-opus-4-6" → "claude-opus-4-6")
68
+ const modelParts = modelId.split(":");
69
+ const model = modelParts.length > 2
70
+ ? modelParts.slice(2).join(":")
71
+ : modelParts[modelParts.length - 1];
72
+ const temperature = this.config.temperature ?? 0.2;
73
+ const maxTokens = this.config.max_tokens || 4096;
74
+ const maxToolRounds = this.config.maxToolRounds || 5;
75
+ const apiKey = this.config.apiKey || process.env.ANTHROPIC_API_KEY;
76
+ if (!apiKey) {
77
+ return {
78
+ error: "ANTHROPIC_API_KEY not set. Configure it in env or provider config.",
79
+ output: undefined,
80
+ };
81
+ }
82
+ // Connect to MCP server and discover tools
83
+ let mcpClient;
84
+ try {
85
+ mcpClient = await this.connectMCP(mcpServerConfig);
86
+ }
87
+ catch (err) {
88
+ return {
89
+ error: `Failed to connect to MCP server: ${err instanceof Error ? err.message : String(err)}`,
90
+ output: undefined,
91
+ };
92
+ }
93
+ try {
94
+ // Get available tools and convert to Anthropic format
95
+ const mcpTools = mcpClient.getAllTools();
96
+ const toolFilter = this.config.mcpTools;
97
+ const filteredTools = toolFilter
98
+ ? mcpTools.filter((t) => toolFilter.includes(t.name))
99
+ : mcpTools;
100
+ const tools = filteredTools.map((t) => ({
101
+ name: t.name,
102
+ description: t.description || `MCP tool: ${t.name}`,
103
+ input_schema: t.inputSchema || { type: "object", properties: {} },
104
+ }));
105
+ if (tools.length === 0) {
106
+ return {
107
+ error: "No MCP tools available after filtering. Check mcpTools config and server capabilities.",
108
+ output: undefined,
109
+ };
110
+ }
111
+ /** Append a machine-readable tool call summary to output for assertion detection */
112
+ function appendToolSummary(text, log) {
113
+ if (log.length === 0)
114
+ return text;
115
+ const names = JSON.stringify(log.map((tc) => tc.name));
116
+ return `${text}\n\n<!-- MCP_TOOLS_CALLED: ${names} -->`;
117
+ }
118
+ const systemPrompt = "You are an AI assistant with access to tools provided by an MCP server. " +
119
+ "Use the available tools to complete the task. Call tools with correct parameters, " +
120
+ "interpret responses, and provide a complete answer.";
121
+ const messages = [{ content: prompt, role: "user" }];
122
+ let inputTokens = 0;
123
+ let outputTokens = 0;
124
+ const startTime = Date.now();
125
+ const toolCallLog = [];
126
+ for (let round = 0; round <= maxToolRounds; round++) {
127
+ const isLastRound = round === maxToolRounds;
128
+ // On last round, omit tools to force a final text response
129
+ if (isLastRound) {
130
+ const lastMsg = messages[messages.length - 1];
131
+ const synthesisText = "You've used the tools available. Based on the information gathered, " +
132
+ "provide your complete, final answer now.";
133
+ if (lastMsg?.role === "user" && Array.isArray(lastMsg.content)) {
134
+ ;
135
+ lastMsg.content.push({
136
+ type: "text",
137
+ text: synthesisText,
138
+ });
139
+ }
140
+ else {
141
+ messages.push({ content: synthesisText, role: "user" });
142
+ }
143
+ }
144
+ const body = {
145
+ max_tokens: maxTokens,
146
+ messages,
147
+ model,
148
+ system: systemPrompt,
149
+ temperature,
150
+ };
151
+ if (!isLastRound) {
152
+ body.tools = tools;
153
+ }
154
+ const response = await fetch("https://api.anthropic.com/v1/messages", {
155
+ body: JSON.stringify(body),
156
+ headers: {
157
+ "anthropic-version": "2023-06-01",
158
+ "Content-Type": "application/json",
159
+ "x-api-key": apiKey,
160
+ },
161
+ method: "POST",
162
+ });
163
+ const data = (await response.json());
164
+ if (data.error) {
165
+ return {
166
+ error: data.error.message ??
167
+ `Anthropic API error: ${JSON.stringify(data.error)}`,
168
+ output: undefined,
169
+ };
170
+ }
171
+ inputTokens += data.usage?.input_tokens ?? 0;
172
+ outputTokens += data.usage?.output_tokens ?? 0;
173
+ if (!data.content?.length) {
174
+ return {
175
+ cost: 0,
176
+ metadata: { toolRounds: round, toolCallLog },
177
+ output: "",
178
+ tokenUsage: {
179
+ completion: outputTokens,
180
+ prompt: inputTokens,
181
+ total: inputTokens + outputTokens,
182
+ },
183
+ };
184
+ }
185
+ // Add assistant response to history
186
+ messages.push({ content: data.content, role: "assistant" });
187
+ // Check if model wants to use tools
188
+ const toolUseBlocks = data.content.filter((b) => b.type === "tool_use");
189
+ if (data.stop_reason !== "tool_use" || toolUseBlocks.length === 0) {
190
+ // Model is done — extract text
191
+ const textBlocks = data.content.filter((b) => b.type === "text");
192
+ const rawOutput = textBlocks.map((b) => b.text || "").join("\n") || "";
193
+ return {
194
+ cost: 0,
195
+ metadata: {
196
+ toolRounds: round,
197
+ toolCallLog,
198
+ latencyMs: Date.now() - startTime,
199
+ },
200
+ output: appendToolSummary(rawOutput, toolCallLog),
201
+ tokenUsage: {
202
+ completion: outputTokens,
203
+ prompt: inputTokens,
204
+ total: inputTokens + outputTokens,
205
+ },
206
+ };
207
+ }
208
+ // Execute each tool call via MCP
209
+ const toolResults = [];
210
+ for (const toolUse of toolUseBlocks) {
211
+ const toolName = toolUse.name;
212
+ const toolInput = (toolUse.input || {});
213
+ try {
214
+ const result = await mcpClient.callTool(toolName, toolInput);
215
+ const content = result.error
216
+ ? JSON.stringify({ error: result.error })
217
+ : result.content;
218
+ toolCallLog.push({
219
+ name: toolName,
220
+ input: toolInput,
221
+ output: content,
222
+ });
223
+ toolResults.push({
224
+ content,
225
+ tool_use_id: toolUse.id,
226
+ type: "tool_result",
227
+ });
228
+ }
229
+ catch (err) {
230
+ const errMsg = err instanceof Error ? err.message : String(err);
231
+ toolCallLog.push({
232
+ name: toolName,
233
+ input: toolInput,
234
+ output: `Error: ${errMsg}`,
235
+ });
236
+ toolResults.push({
237
+ content: JSON.stringify({ error: errMsg }),
238
+ tool_use_id: toolUse.id,
239
+ type: "tool_result",
240
+ });
241
+ }
242
+ }
243
+ // Add tool results to conversation
244
+ messages.push({ content: toolResults, role: "user" });
245
+ }
246
+ // Exhausted rounds
247
+ const lastAssistant = [...messages]
248
+ .reverse()
249
+ .find((m) => m.role === "assistant");
250
+ let lastText = "";
251
+ if (lastAssistant && Array.isArray(lastAssistant.content)) {
252
+ lastText = lastAssistant.content
253
+ .filter((b) => b.type === "text")
254
+ .map((b) => b.text || "")
255
+ .join("\n");
256
+ }
257
+ return {
258
+ cost: 0,
259
+ metadata: {
260
+ toolRounds: maxToolRounds,
261
+ exhaustedRounds: true,
262
+ toolCallLog,
263
+ latencyMs: Date.now() - startTime,
264
+ },
265
+ output: appendToolSummary(lastText || "[Exhausted tool rounds without final answer]", toolCallLog),
266
+ tokenUsage: {
267
+ completion: outputTokens,
268
+ prompt: inputTokens,
269
+ total: inputTokens + outputTokens,
270
+ },
271
+ };
272
+ }
273
+ finally {
274
+ await mcpClient.cleanup().catch(() => { });
275
+ }
276
+ }
277
+ // -------------------------------------------------------------------------
278
+ // MCP client management
279
+ // -------------------------------------------------------------------------
280
+ async connectMCP(serverConfig) {
281
+ // Dynamically import Promptfoo's MCPClient — reuse its MCP SDK integration
282
+ // rather than adding a direct dependency on @modelcontextprotocol/sdk
283
+ const { Client } = await import("@modelcontextprotocol/sdk/client/index.js");
284
+ const client = new Client({
285
+ name: "ailf-mcp-eval",
286
+ version: "1.0.0",
287
+ });
288
+ // Resolve auth — render {{env.VAR}} templates
289
+ const resolvedConfig = this.resolveEnvTemplates(serverConfig);
290
+ // Determine transport type and connect
291
+ let closeTransport;
292
+ if (resolvedConfig.command) {
293
+ // stdio transport
294
+ const { StdioClientTransport } = await import("@modelcontextprotocol/sdk/client/stdio.js");
295
+ const parts = String(resolvedConfig.command).split(/\s+/);
296
+ const transport = new StdioClientTransport({
297
+ command: parts[0],
298
+ args: parts.slice(1),
299
+ env: process.env,
300
+ });
301
+ await client.connect(transport);
302
+ closeTransport = () => transport.close();
303
+ }
304
+ else if (resolvedConfig.url) {
305
+ // streamable-http transport
306
+ const { StreamableHTTPClientTransport } = await import("@modelcontextprotocol/sdk/client/streamableHttp.js");
307
+ const headers = {};
308
+ const auth = resolvedConfig.auth;
309
+ if (auth?.type === "bearer" && auth.token) {
310
+ headers["Authorization"] = `Bearer ${auth.token}`;
311
+ }
312
+ const transport = new StreamableHTTPClientTransport(new URL(String(resolvedConfig.url)), { requestInit: { headers } });
313
+ await client.connect(transport);
314
+ closeTransport = () => transport.close();
315
+ }
316
+ else {
317
+ throw new Error("MCP server config must have either 'command' (stdio) or 'url' (http)");
318
+ }
319
+ // Discover tools
320
+ const { tools: toolsList } = await client.listTools();
321
+ const allTools = toolsList.map((t) => ({
322
+ name: t.name,
323
+ description: t.description,
324
+ inputSchema: t.inputSchema,
325
+ }));
326
+ return {
327
+ getAllTools: () => allTools,
328
+ callTool: async (name, args) => {
329
+ const result = await client.callTool({ name, arguments: args });
330
+ let content = "";
331
+ if (result?.content) {
332
+ if (Array.isArray(result.content)) {
333
+ content = result.content
334
+ .map((c) => c.text || JSON.stringify(c))
335
+ .join("\n");
336
+ }
337
+ else {
338
+ content = String(result.content);
339
+ }
340
+ }
341
+ return { content, error: result.isError ? content : undefined };
342
+ },
343
+ cleanup: async () => {
344
+ await closeTransport().catch(() => { });
345
+ },
346
+ };
347
+ }
348
+ /**
349
+ * Resolve {{env.VAR}} templates in config values.
350
+ */
351
+ resolveEnvTemplates(config) {
352
+ const resolved = {};
353
+ for (const [key, value] of Object.entries(config)) {
354
+ if (typeof value === "string") {
355
+ resolved[key] = value.replace(/\{\{env\.(\w+)\}\}/g, (_, varName) => {
356
+ return process.env[varName] || "";
357
+ });
358
+ }
359
+ else if (value && typeof value === "object" && !Array.isArray(value)) {
360
+ resolved[key] = this.resolveEnvTemplates(value);
361
+ }
362
+ else {
363
+ resolved[key] = value;
364
+ }
365
+ }
366
+ return resolved;
367
+ }
368
+ }
@@ -11,7 +11,7 @@
11
11
  * Evaluation methodology (rubrics, scoring, prompts) is inherited from the
12
12
  * `literacy` mode base — see mode-bases/literacy.ts.
13
13
  *
14
- * @see docs/exec-plans/architecture-overhaul/phase-8-scoring-storage-presets.md
14
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-8-scoring-storage-presets.md
15
15
  */
16
16
  import { type PresetDefinition } from "../../../_vendor/ailf-core/index.d.ts";
17
17
  export interface SanityLiteracyPresetOptions {
@@ -11,7 +11,7 @@
11
11
  * Evaluation methodology (rubrics, scoring, prompts) is inherited from the
12
12
  * `literacy` mode base — see mode-bases/literacy.ts.
13
13
  *
14
- * @see docs/exec-plans/architecture-overhaul/phase-8-scoring-storage-presets.md
14
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-8-scoring-storage-presets.md
15
15
  */
16
16
  import { env } from "../../../_vendor/ailf-core/index.js";
17
17
  import { SanityDocFetcher } from "../../../adapters/doc-fetchers/index.js";
@@ -9,10 +9,7 @@
9
9
  * TaskGraph → resolve fixtures → resolve variables → map assertions
10
10
  * → assemble prompts → assemble providers → emit YAML
11
11
  *
12
- * This module exists alongside `generate-configs.ts` — it does NOT replace
13
- * the existing codegen path. Phase 7 will swap callers over to the compiler.
14
- *
15
- * @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
12
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
16
13
  */
17
14
  import type { ModeHandler, ModelsConfig, TaskGraph } from "../../_vendor/ailf-core/index.d.ts";
18
15
  import type { EvalMode } from "../../_vendor/ailf-shared/index.d.ts";
@@ -9,10 +9,7 @@
9
9
  * TaskGraph → resolve fixtures → resolve variables → map assertions
10
10
  * → assemble prompts → assemble providers → emit YAML
11
11
  *
12
- * This module exists alongside `generate-configs.ts` — it does NOT replace
13
- * the existing codegen path. Phase 7 will swap callers over to the compiler.
14
- *
15
- * @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
12
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
16
13
  */
17
14
  import { mapAssertions } from "./assertion-mapper.js";
18
15
  import { resolveTaskFixtures } from "./fixture-resolver.js";
@@ -151,20 +148,14 @@ function buildProviders(models, mode) {
151
148
  /**
152
149
  * Check if a model entry matches the current evaluation mode.
153
150
  *
154
- * Literacy mode defaults to baseline model matching. Variant-specific
151
+ * Uses the typed EvalMode values on model.modes. Variant-specific
155
152
  * provider filtering is handled by the provider-assembler and
156
153
  * generate-configs-step, not here.
157
154
  */
158
155
  function modelMatchesMode(model, mode) {
159
156
  if (!model.modes || model.modes.length === 0)
160
157
  return true;
161
- switch (mode) {
162
- case "literacy":
163
- return model.modes.includes(LiteracyVariant.STANDARD);
164
- default:
165
- // Non-literacy modes accept all models by default
166
- return true;
167
- }
158
+ return model.modes.includes(mode);
168
159
  }
169
160
  // ---------------------------------------------------------------------------
170
161
  // Prompt resolution
@@ -7,9 +7,9 @@
7
7
  * Separated into its own module so GenerateConfigsStep can import it
8
8
  * without pulling in the full legacy generate-configs machinery.
9
9
  */
10
- import { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "../../_vendor/ailf-core/index.js";
11
- import { LiteracyVariant } from "../normalize-mode.js";
10
+ import { extractModelName, extractProvider, mergeConfig, } from "../../_vendor/ailf-core/index.js";
12
11
  import { loadConfigFile } from "./config-loader.js";
12
+ import { modelMatchesLiteracyVariant } from "./mode-bases/literacy.js";
13
13
  // ---------------------------------------------------------------------------
14
14
  // Public API
15
15
  // ---------------------------------------------------------------------------
@@ -36,9 +36,12 @@ export function loadModelsAndProviders(rootDir, source, searchMode, allowedOrigi
36
36
  // ---------------------------------------------------------------------------
37
37
  function buildBaselineProviders(models) {
38
38
  return models.models
39
- .filter((m) => modelMatchesMode(m, LiteracyVariant.STANDARD))
39
+ .filter((m) => modelMatchesLiteracyVariant(m, "baseline"))
40
40
  .map((model) => ({
41
- config: mergeConfig(models.defaults, model.config),
41
+ config: {
42
+ ...mergeConfig(models.defaults, model.config),
43
+ ...(model.timeoutMs ? { timeoutMs: model.timeoutMs } : {}),
44
+ },
42
45
  id: model.id,
43
46
  label: model.label,
44
47
  }));
@@ -48,12 +51,13 @@ function buildBaselineProviders(models) {
48
51
  // ---------------------------------------------------------------------------
49
52
  function buildObservedProviders(models) {
50
53
  return models.models
51
- .filter((m) => modelMatchesMode(m, LiteracyVariant.OBSERVED))
54
+ .filter((m) => modelMatchesLiteracyVariant(m, "observed"))
52
55
  .map((model) => {
53
56
  const modelName = extractModelName(model.id);
54
57
  return {
55
58
  config: {
56
59
  ...mergeConfig(models.defaults, model.config),
60
+ ...(model.timeoutMs ? { timeoutMs: model.timeoutMs } : {}),
57
61
  modelName,
58
62
  observe: true,
59
63
  recordOptions: models.defaults.observerOptions ?? {},
@@ -67,8 +71,8 @@ function buildObservedProviders(models) {
67
71
  // Agentic providers
68
72
  // ---------------------------------------------------------------------------
69
73
  function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
70
- const naiveModels = models.models.filter((m) => modelMatchesMode(m, "agentic-naive"));
71
- const optimizedModels = models.models.filter((m) => modelMatchesMode(m, "agentic-optimized"));
74
+ const naiveModels = models.models.filter((m) => modelMatchesLiteracyVariant(m, "agentic-naive"));
75
+ const optimizedModels = models.models.filter((m) => modelMatchesLiteracyVariant(m, "agentic-optimized"));
72
76
  const resolvedSearchMode = searchMode ?? "open";
73
77
  const sourceConfig = source
74
78
  ? {
@@ -100,6 +104,7 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
100
104
  model: modelName,
101
105
  provider,
102
106
  }),
107
+ ...(model.timeoutMs ? { timeoutMs: model.timeoutMs } : {}),
103
108
  ...sourceConfig,
104
109
  observe: true,
105
110
  observerOptions: models.defaults.observerOptions ?? {},
@@ -119,6 +124,7 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
119
124
  model: modelName,
120
125
  provider,
121
126
  }),
127
+ ...(model.timeoutMs ? { timeoutMs: model.timeoutMs } : {}),
122
128
  ...sourceConfig,
123
129
  observe: true,
124
130
  observerOptions: models.defaults.observerOptions ?? {},
@@ -8,7 +8,7 @@
8
8
  * no shell) to prevent shell injection from task-supplied values like
9
9
  * image names or task IDs.
10
10
  *
11
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
11
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
12
12
  */
13
13
  import type { SandboxArtifacts, SandboxInfo, SandboxProvisionOptions, SandboxStrategy } from "./sandbox-strategy.js";
14
14
  export declare class DockerSandboxStrategy implements SandboxStrategy {
@@ -8,7 +8,7 @@
8
8
  * no shell) to prevent shell injection from task-supplied values like
9
9
  * image names or task IDs.
10
10
  *
11
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
11
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
12
12
  */
13
13
  import { randomUUID } from "crypto";
14
14
  import { execFileSync } from "child_process";
@@ -10,7 +10,7 @@
10
10
  * - sanity:// — Content Lake document by ID or query
11
11
  *
12
12
  * @see docs/design-docs/architecture-overhaul/fixtures-artifacts.md
13
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
13
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
14
14
  */
15
15
  import type { SandboxInfo } from "./sandbox-strategy.js";
16
16
  /** A fixture reference from a task definition */
@@ -10,7 +10,7 @@
10
10
  * - sanity:// — Content Lake document by ID or query
11
11
  *
12
12
  * @see docs/design-docs/architecture-overhaul/fixtures-artifacts.md
13
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
13
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
14
14
  */
15
15
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
16
16
  import { createHash } from "crypto";
@@ -7,7 +7,7 @@
7
7
  * All git CLI calls use `execFileSync` (array form, no shell) to prevent
8
8
  * injection from task-supplied values like git refs or repo paths.
9
9
  *
10
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
10
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
11
11
  */
12
12
  import type { SandboxArtifacts, SandboxInfo, SandboxProvisionOptions, SandboxStrategy } from "./sandbox-strategy.js";
13
13
  export declare class GitWorktreeSandboxStrategy implements SandboxStrategy {
@@ -7,7 +7,7 @@
7
7
  * All git CLI calls use `execFileSync` (array form, no shell) to prevent
8
8
  * injection from task-supplied values like git refs or repo paths.
9
9
  *
10
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
10
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
11
11
  */
12
12
  import { randomUUID } from "crypto";
13
13
  import { execFileSync } from "child_process";
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * Sandbox infrastructure — isolated execution environments for agent harness mode.
3
3
  *
4
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
4
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
5
5
  */
6
6
  export type { SandboxArtifacts, SandboxInfo, SandboxProvisionOptions, SandboxStrategy, SandboxType, } from "./sandbox-strategy.js";
7
7
  export { DockerSandboxStrategy } from "./docker-sandbox.js";
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * Sandbox infrastructure — isolated execution environments for agent harness mode.
3
3
  *
4
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
4
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
5
5
  */
6
6
  // Implementations
7
7
  export { DockerSandboxStrategy } from "./docker-sandbox.js";
@@ -8,7 +8,7 @@
8
8
  *
9
9
  * CI environments (detected via CI env var) always prefer Docker.
10
10
  *
11
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
11
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
12
12
  */
13
13
  import type { SandboxStrategy, SandboxType } from "./sandbox-strategy.js";
14
14
  /** Result of sandbox selection */
@@ -8,7 +8,7 @@
8
8
  *
9
9
  * CI environments (detected via CI env var) always prefer Docker.
10
10
  *
11
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
11
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
12
12
  */
13
13
  import { DockerSandboxStrategy } from "./docker-sandbox.js";
14
14
  import { GitWorktreeSandboxStrategy } from "./git-worktree-sandbox.js";
@@ -10,7 +10,7 @@
10
10
  * Selection: task config specifies preferred strategy; runtime falls back
11
11
  * Docker → TempDir if Docker is unavailable. CI environments prefer Docker.
12
12
  *
13
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
13
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
14
14
  */
15
15
  /** Metadata describing a provisioned sandbox */
16
16
  export interface SandboxInfo {
@@ -10,6 +10,6 @@
10
10
  * Selection: task config specifies preferred strategy; runtime falls back
11
11
  * Docker → TempDir if Docker is unavailable. CI environments prefer Docker.
12
12
  *
13
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
13
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
14
14
  */
15
15
  export {};
@@ -7,7 +7,7 @@
7
7
  *
8
8
  * This is the universal fallback when Docker is unavailable.
9
9
  *
10
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
10
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
11
11
  */
12
12
  import type { SandboxArtifacts, SandboxInfo, SandboxProvisionOptions, SandboxStrategy } from "./sandbox-strategy.js";
13
13
  export declare class TempDirSandboxStrategy implements SandboxStrategy {
@@ -7,7 +7,7 @@
7
7
  *
8
8
  * This is the universal fallback when Docker is unavailable.
9
9
  *
10
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
10
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
11
11
  */
12
12
  import { randomUUID } from "crypto";
13
13
  import { existsSync, mkdirSync, readdirSync, rmSync } from "fs";
@@ -18,7 +18,7 @@
18
18
  *
19
19
  * @see packages/core/src/services/scoring-engine.ts — the 4-tier engine
20
20
  * @see packages/eval/src/pipeline/calculate-scores.ts — the consumer
21
- * @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
21
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
22
22
  */
23
23
  import { type DimensionScore } from "../../_vendor/ailf-core/index.d.ts";
24
24
  import type { TestResult } from "../../_vendor/ailf-core/index.d.ts";