@sanity/ailf 1.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (499)
  1. package/README.md +0 -1
  2. package/canonical/grader-references/README.md +2 -2
  3. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  4. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  5. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  6. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  7. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  8. package/config/features.ts +1 -1
  9. package/config/models.ts +29 -12
  10. package/config/sources.ts +1 -1
  11. package/config/thresholds.ts +1 -1
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  13. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  17. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  18. package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
  19. package/dist/_vendor/ailf-core/config-helpers.js +51 -2
  20. package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
  21. package/dist/_vendor/ailf-core/examples/index.js +213 -94
  22. package/dist/_vendor/ailf-core/index.d.ts +3 -2
  23. package/dist/_vendor/ailf-core/index.js +2 -1
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  25. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  27. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  28. package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
  29. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  30. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  31. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  32. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  33. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  34. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  35. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  40. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  41. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  42. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  43. package/dist/_vendor/ailf-core/services/index.js +1 -1
  44. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  47. package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  50. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  51. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  52. package/dist/adapters/api-client/remediation.js +2 -2
  53. package/dist/adapters/config-sources/file-config-adapter.js +7 -1
  54. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  55. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  56. package/dist/adapters/index.d.ts +0 -1
  57. package/dist/adapters/index.js +0 -1
  58. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  59. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  60. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  61. package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
  62. package/dist/adapters/task-sources/index.d.ts +3 -4
  63. package/dist/adapters/task-sources/index.js +3 -4
  64. package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
  65. package/dist/adapters/task-sources/repo-schemas.js +228 -20
  66. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  67. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  68. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  69. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  70. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  71. package/dist/adapters/task-sources/repo-validation.js +126 -5
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
  73. package/dist/adapters/task-sources/task-file-loader.js +21 -7
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/coverage-audit.js +3 -1
  95. package/dist/commands/explain-handler.d.ts +1 -1
  96. package/dist/commands/explain-handler.js +37 -8
  97. package/dist/commands/fetch-docs.js +1 -0
  98. package/dist/commands/generate-configs.d.ts +3 -3
  99. package/dist/commands/generate-configs.js +20 -8
  100. package/dist/commands/init.d.ts +5 -4
  101. package/dist/commands/init.js +190 -25
  102. package/dist/commands/pipeline-action.d.ts +7 -1
  103. package/dist/commands/pipeline-action.js +43 -19
  104. package/dist/commands/pipeline.d.ts +6 -1
  105. package/dist/commands/pipeline.js +7 -2
  106. package/dist/commands/pr-comment.js +1 -0
  107. package/dist/commands/publish.js +1 -0
  108. package/dist/commands/shared/help.js +2 -2
  109. package/dist/commands/update-quality-scores.d.ts +5 -0
  110. package/dist/commands/update-quality-scores.js +20 -0
  111. package/dist/commands/validate-tasks.d.ts +2 -2
  112. package/dist/commands/validate-tasks.js +26 -15
  113. package/dist/composition-root.d.ts +15 -4
  114. package/dist/composition-root.js +100 -55
  115. package/dist/config/features.ts +23 -0
  116. package/dist/config/models.ts +100 -0
  117. package/dist/config/prompts.ts +16 -0
  118. package/dist/config/rubrics.ts +225 -0
  119. package/dist/config/schedules.ts +47 -0
  120. package/dist/config/sinks.ts +37 -0
  121. package/dist/config/sources.ts +21 -0
  122. package/dist/config/thresholds.ts +61 -0
  123. package/dist/index.d.ts +41 -0
  124. package/dist/index.js +48 -0
  125. package/dist/lib/agent-behavior-report.d.ts +8 -0
  126. package/dist/lib/agent-behavior-report.js +185 -0
  127. package/dist/lib/baseline.d.ts +19 -0
  128. package/dist/lib/baseline.js +153 -0
  129. package/dist/lib/calculate-scores.d.ts +23 -0
  130. package/dist/lib/calculate-scores.js +42 -0
  131. package/dist/lib/compare.d.ts +18 -0
  132. package/dist/lib/compare.js +170 -0
  133. package/dist/lib/coverage-audit.d.ts +4 -0
  134. package/dist/lib/coverage-audit.js +42 -0
  135. package/dist/lib/discovery-report.d.ts +13 -0
  136. package/dist/lib/discovery-report.js +57 -0
  137. package/dist/lib/fetch-docs.d.ts +30 -0
  138. package/dist/lib/fetch-docs.js +171 -0
  139. package/dist/lib/generate-configs.d.ts +25 -0
  140. package/dist/lib/generate-configs.js +42 -0
  141. package/dist/lib/grader-api.d.ts +21 -0
  142. package/dist/lib/grader-api.js +34 -0
  143. package/dist/lib/grader-compare.d.ts +19 -0
  144. package/dist/lib/grader-compare.js +91 -0
  145. package/dist/lib/grader-consistency.d.ts +27 -0
  146. package/dist/lib/grader-consistency.js +79 -0
  147. package/dist/lib/grader-sensitivity.d.ts +19 -0
  148. package/dist/lib/grader-sensitivity.js +75 -0
  149. package/dist/lib/grader-validate.d.ts +19 -0
  150. package/dist/lib/grader-validate.js +78 -0
  151. package/dist/lib/measure-retrieval.d.ts +14 -0
  152. package/dist/lib/measure-retrieval.js +71 -0
  153. package/dist/lib/pr-comment.d.ts +16 -0
  154. package/dist/lib/pr-comment.js +28 -0
  155. package/dist/lib/readiness-report.d.ts +13 -0
  156. package/dist/lib/readiness-report.js +108 -0
  157. package/dist/lib/webhook-server.d.ts +11 -0
  158. package/dist/lib/webhook-server.js +24 -0
  159. package/dist/lib/weekly-digest.d.ts +24 -0
  160. package/dist/lib/weekly-digest.js +148 -0
  161. package/dist/orchestration/build-app-context.js +13 -0
  162. package/dist/orchestration/build-step-sequence.js +4 -2
  163. package/dist/orchestration/cache-context.d.ts +23 -0
  164. package/dist/orchestration/cache-context.js +43 -0
  165. package/dist/orchestration/env-bridge.d.ts +21 -0
  166. package/dist/orchestration/env-bridge.js +66 -0
  167. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  168. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  169. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  170. package/dist/orchestration/step-runner.js +5 -1
  171. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  172. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  173. package/dist/orchestration/steps/callback-step.js +10 -1
  174. package/dist/orchestration/steps/compare-step.js +6 -3
  175. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  176. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  177. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  178. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  179. package/dist/orchestration/steps/fetch-docs-step.js +32 -19
  180. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  181. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  182. package/dist/orchestration/steps/generate-configs-step.js +77 -26
  183. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  184. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  185. package/dist/orchestration/steps/publish-report-step.js +19 -0
  186. package/dist/orchestration/steps/readiness-step.js +8 -3
  187. package/dist/orchestration/steps/report-step.js +17 -4
  188. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  189. package/dist/orchestration/steps/run-eval-step.js +51 -31
  190. package/dist/pipeline/agent-behavior-report.js +6 -0
  191. package/dist/pipeline/attribution.d.ts +1 -1
  192. package/dist/pipeline/attribution.js +1 -1
  193. package/dist/pipeline/cache.js +29 -15
  194. package/dist/pipeline/calculate-scores.d.ts +2 -0
  195. package/dist/pipeline/calculate-scores.js +70 -33
  196. package/dist/pipeline/chronic-failures.d.ts +55 -0
  197. package/dist/pipeline/chronic-failures.js +110 -0
  198. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  199. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  200. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  201. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
  202. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  203. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  204. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  205. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  206. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  207. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  208. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  209. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  210. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  211. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  212. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  213. package/dist/pipeline/compiler/config-loader.js +42 -2
  214. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  215. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  216. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  217. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  218. package/dist/pipeline/compiler/index.d.ts +2 -5
  219. package/dist/pipeline/compiler/index.js +2 -5
  220. package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
  221. package/dist/pipeline/compiler/literacy-bridge.js +2 -2
  222. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  223. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  224. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  225. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  226. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  227. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  228. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
  229. package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
  230. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  231. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  232. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  233. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  234. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  235. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  236. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  237. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  238. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  239. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  240. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  241. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  242. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  245. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  246. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  247. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  248. package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
  249. package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
  250. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  251. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  252. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  253. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  254. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  255. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  256. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  257. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  258. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  259. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  260. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  261. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  262. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  263. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  264. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  265. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  266. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  267. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  268. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  269. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  270. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  271. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  272. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  273. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  274. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
  275. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  276. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  277. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  278. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  279. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  280. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  281. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  282. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  283. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  284. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
  285. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  286. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  287. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  288. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  289. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
  290. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
  291. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  292. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
  293. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  294. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
  295. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  296. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  297. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
  298. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
  299. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
  300. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  301. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  302. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  303. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  304. package/dist/pipeline/compiler/preset-loader.js +99 -0
  305. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
  306. package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
  307. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  308. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  309. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  310. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  311. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  312. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  313. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  314. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  315. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  316. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  317. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  318. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  319. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  320. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  321. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  322. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  323. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  324. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  325. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  326. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  327. package/dist/pipeline/compiler/task-bridge.js +92 -0
  328. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  329. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  330. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  331. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  332. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  333. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  334. package/dist/pipeline/coverage-audit.d.ts +1 -1
  335. package/dist/pipeline/coverage-audit.js +1 -1
  336. package/dist/pipeline/degradations.d.ts +1 -1
  337. package/dist/pipeline/degradations.js +1 -1
  338. package/dist/pipeline/expand-tasks.d.ts +2 -2
  339. package/dist/pipeline/expand-tasks.js +2 -2
  340. package/dist/pipeline/failure-modes.d.ts +1 -1
  341. package/dist/pipeline/failure-modes.js +13 -1
  342. package/dist/pipeline/gap-analysis.d.ts +1 -1
  343. package/dist/pipeline/gap-analysis.js +3 -1
  344. package/dist/pipeline/generate-configs.d.ts +2 -2
  345. package/dist/pipeline/generate-configs.js +16 -9
  346. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  347. package/dist/pipeline/grader-compare-runner.js +7 -1
  348. package/dist/pipeline/grader-comparison.d.ts +1 -1
  349. package/dist/pipeline/grader-comparison.js +1 -1
  350. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  351. package/dist/pipeline/grader-consistency-runner.js +7 -1
  352. package/dist/pipeline/grader-consistency.d.ts +1 -1
  353. package/dist/pipeline/grader-consistency.js +1 -1
  354. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  355. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  356. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  357. package/dist/pipeline/grader-sensitivity.js +1 -1
  358. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  359. package/dist/pipeline/grader-validate-runner.js +2 -2
  360. package/dist/pipeline/grader-validation.d.ts +1 -1
  361. package/dist/pipeline/grader-validation.js +1 -1
  362. package/dist/pipeline/map-request-to-config.js +16 -2
  363. package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
  364. package/dist/pipeline/mirror-repo-tasks.js +10 -10
  365. package/dist/pipeline/plan-format.d.ts +1 -1
  366. package/dist/pipeline/plan-format.js +1 -1
  367. package/dist/pipeline/plan.d.ts +1 -1
  368. package/dist/pipeline/plan.js +68 -30
  369. package/dist/pipeline/probe.d.ts +1 -1
  370. package/dist/pipeline/probe.js +1 -1
  371. package/dist/pipeline/readiness-report.d.ts +2 -2
  372. package/dist/pipeline/readiness-report.js +2 -2
  373. package/dist/pipeline/release-classification.d.ts +1 -1
  374. package/dist/pipeline/release-classification.js +1 -1
  375. package/dist/pipeline/release-report.d.ts +1 -1
  376. package/dist/pipeline/release-report.js +1 -1
  377. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  378. package/dist/pipeline/repo-eval-comment.js +1 -1
  379. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  380. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  381. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  382. package/dist/pipeline/resolve-mappings.js +44 -44
  383. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  384. package/dist/pipeline/retrieval-metrics.js +28 -20
  385. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  386. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  387. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  388. package/dist/pipeline/steps/compare-step.js +90 -0
  389. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  390. package/dist/pipeline/steps/eval-step.js +347 -0
  391. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  392. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  393. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  394. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  395. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  396. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  397. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  398. package/dist/pipeline/steps/publish-report-step.js +243 -0
  399. package/dist/pipeline/steps/report-step.d.ts +13 -0
  400. package/dist/pipeline/steps/report-step.js +56 -0
  401. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  402. package/dist/pipeline/steps/update-scores-step.js +42 -0
  403. package/dist/pipeline/targeted-loo.d.ts +1 -1
  404. package/dist/pipeline/targeted-loo.js +1 -1
  405. package/dist/pipeline/thresholds.d.ts +1 -1
  406. package/dist/pipeline/thresholds.js +1 -1
  407. package/dist/pipeline/validate.js +13 -0
  408. package/dist/report-store.d.ts +17 -0
  409. package/dist/report-store.js +24 -0
  410. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  411. package/dist/scripts/agent-behavior-report.js +315 -0
  412. package/dist/scripts/baseline.d.ts +43 -0
  413. package/dist/scripts/baseline.js +267 -0
  414. package/dist/scripts/calculate-scores.d.ts +166 -0
  415. package/dist/scripts/calculate-scores.js +1296 -0
  416. package/dist/scripts/compare.d.ts +22 -0
  417. package/dist/scripts/compare.js +334 -0
  418. package/dist/scripts/coverage-audit.d.ts +44 -0
  419. package/dist/scripts/coverage-audit.js +209 -0
  420. package/dist/scripts/debug-eval.d.ts +19 -0
  421. package/dist/scripts/debug-eval.js +73 -0
  422. package/dist/scripts/discovery-report.d.ts +58 -0
  423. package/dist/scripts/discovery-report.js +250 -0
  424. package/dist/scripts/fetch-docs.d.ts +35 -0
  425. package/dist/scripts/fetch-docs.js +472 -0
  426. package/dist/scripts/generate-configs.d.ts +66 -0
  427. package/dist/scripts/generate-configs.js +459 -0
  428. package/dist/scripts/grader-api.d.ts +27 -0
  429. package/dist/scripts/grader-api.js +206 -0
  430. package/dist/scripts/grader-compare.d.ts +22 -0
  431. package/dist/scripts/grader-compare.js +368 -0
  432. package/dist/scripts/grader-consistency.d.ts +20 -0
  433. package/dist/scripts/grader-consistency.js +313 -0
  434. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  435. package/dist/scripts/grader-sensitivity.js +354 -0
  436. package/dist/scripts/grader-validate.d.ts +19 -0
  437. package/dist/scripts/grader-validate.js +267 -0
  438. package/dist/scripts/measure-retrieval.d.ts +10 -0
  439. package/dist/scripts/measure-retrieval.js +145 -0
  440. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  441. package/dist/scripts/migrate-task-mode.js +1 -1
  442. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  443. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  444. package/dist/scripts/pipeline.d.ts +76 -0
  445. package/dist/scripts/pipeline.js +1031 -0
  446. package/dist/scripts/pr-comment.d.ts +10 -0
  447. package/dist/scripts/pr-comment.js +510 -0
  448. package/dist/scripts/readiness-report.d.ts +88 -0
  449. package/dist/scripts/readiness-report.js +342 -0
  450. package/dist/scripts/update-quality-scores.d.ts +15 -0
  451. package/dist/scripts/update-quality-scores.js +184 -0
  452. package/dist/scripts/validate-task-sources.d.ts +1 -1
  453. package/dist/scripts/validate-task-sources.js +1 -1
  454. package/dist/scripts/validate.d.ts +13 -0
  455. package/dist/scripts/validate.js +79 -0
  456. package/dist/scripts/webhook-server.d.ts +26 -0
  457. package/dist/scripts/webhook-server.js +147 -0
  458. package/dist/scripts/weekly-digest.d.ts +24 -0
  459. package/dist/scripts/weekly-digest.js +144 -0
  460. package/dist/sinks/format-slack.d.ts +64 -0
  461. package/dist/sinks/format-slack.js +306 -0
  462. package/dist/sinks/slack-sink.d.ts +27 -0
  463. package/dist/sinks/slack-sink.js +78 -0
  464. package/dist/sinks/types.d.ts +1 -1
  465. package/dist/sinks/types.js +1 -1
  466. package/dist/sinks/webhook-sink.d.ts +19 -0
  467. package/dist/sinks/webhook-sink.js +50 -0
  468. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  469. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  470. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  471. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  472. package/dist/tasks/literacy/functions.task.ts +70 -0
  473. package/dist/tasks/literacy/groq.task.ts +259 -0
  474. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  475. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  476. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  477. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  478. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  479. package/package.json +32 -24
  480. package/tasks/.expanded.agentic.yaml +280 -0
  481. package/tasks/.expanded.yaml +565 -0
  482. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  483. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  484. package/tasks/literacy/content-lake.task.ts +181 -0
  485. package/tasks/literacy/frameworks.task.ts +1 -0
  486. package/tasks/literacy/functions.task.ts +1 -0
  487. package/tasks/literacy/groq.task.ts +1 -0
  488. package/tasks/literacy/image-handling.task.ts +95 -0
  489. package/tasks/literacy/nextjs-live.task.ts +2 -1
  490. package/tasks/literacy/portable-text.task.ts +169 -0
  491. package/tasks/literacy/studio-setup.task.ts +5 -2
  492. package/tasks/literacy/visual-editing.task.ts +1 -0
  493. package/LICENSE +0 -21
  494. package/tasks/frameworks.yaml +0 -98
  495. package/tasks/functions.yaml +0 -51
  496. package/tasks/groq.yaml +0 -216
  497. package/tasks/nextjs-live.yaml +0 -62
  498. package/tasks/studio-setup.yaml +0 -111
  499. package/tasks/visual-editing.yaml +0 -120
@@ -10,7 +10,7 @@
10
10
  * means unchanged tasks are skipped. Changed tasks are upserted via
11
11
  * createOrReplace.
12
12
  *
13
- * @see docs/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
13
+ * @see docs/archive/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
14
14
  */
15
15
  import type { SanityClient } from "@sanity/client";
16
16
  import { type LiteracyTaskDefinition, type Logger } from "../_vendor/ailf-core/index.d.ts";
@@ -58,7 +58,7 @@ export interface MirrorResult {
58
58
  skipped: number;
59
59
  /** Feature areas auto-created */
60
60
  areasCreated: string[];
61
- /** Canonical doc slugs that failed to resolve */
61
+ /** Context doc slugs that failed to resolve */
62
62
  unresolvedSlugs: string[];
63
63
  /** Errors (non-fatal — mirror continues) */
64
64
  errors: string[];
@@ -70,7 +70,7 @@ export interface MirrorResult {
70
70
  * 1. Compute deterministic document ID
71
71
  * 2. Compute content hash of the task definition
72
72
  * 3. Check if mirror document exists with same hash → skip if unchanged
73
- * 4. Resolve canonical doc slugs → Sanity references
73
+ * 4. Resolve context doc slugs → Sanity references
74
74
  * 5. Auto-create feature areas if needed
75
75
  * 6. Upsert the ailf.task document with origin block
76
76
  */
@@ -114,8 +114,8 @@ export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts:
114
114
  _type: string;
115
115
  ownership: string;
116
116
  status: import("@sanity/ailf-core").TaskStatus;
117
- assert: Record<string, unknown>[];
118
- canonicalDocs: ({
117
+ assertions: Record<string, unknown>[];
118
+ contextDocs: ({
119
119
  _key: string;
120
120
  reason: string;
121
121
  } | {
@@ -138,9 +138,9 @@ export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts:
138
138
  _key: string;
139
139
  reason: string;
140
140
  })[];
141
- description: string;
141
+ title: string;
142
142
  docCoverage: boolean;
143
- featureArea: {
143
+ area: {
144
144
  _ref: string;
145
145
  _type: string;
146
146
  };
@@ -161,5 +161,5 @@ export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts:
161
161
  author: GitAuthor;
162
162
  lastEditor: GitAuthor;
163
163
  };
164
- taskPrompt: string;
164
+ promptText: string;
165
165
  };
@@ -10,7 +10,7 @@
10
10
  * means unchanged tasks are skipped. Changed tasks are upserted via
11
11
  * createOrReplace.
12
12
  *
13
- * @see docs/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
13
+ * @see docs/archive/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
14
14
  */
15
15
  import { createHash } from "crypto";
16
16
  import { readFileSync } from "fs";
@@ -26,7 +26,7 @@ import { ConsoleLogger } from "../adapters/loggers/index.js";
26
26
  * 1. Compute deterministic document ID
27
27
  * 2. Compute content hash of the task definition
28
28
  * 3. Check if mirror document exists with same hash → skip if unchanged
29
- * 4. Resolve canonical doc slugs → Sanity references
29
+ * 4. Resolve context doc slugs → Sanity references
30
30
  * 5. Auto-create feature areas if needed
31
31
  * 6. Upsert the ailf.task document with origin block
32
32
  */
@@ -43,7 +43,7 @@ export async function mirrorRepoTasks(options) {
43
43
  };
44
44
  if (tasks.length === 0)
45
45
  return result;
46
- // Batch-resolve all canonical doc slugs (slug refs only — other ref types
46
+ // Batch-resolve all context doc slugs (slug refs only — other ref types
47
47
  // are stored without a resolved article reference for now)
48
48
  const allSlugs = [
49
49
  ...new Set(tasks.flatMap((t) => (t.context?.docs ?? []).filter(isSlugRef).map((d) => d.slug))),
@@ -353,10 +353,10 @@ async function fetchExistingDocState(client, docIds) {
353
353
  /** @internal Exported for testing — not part of the public API. */
354
354
  export function buildMirrorDocument(task, opts) {
355
355
  const { contentHash, docId, existingAuthor, git, slugToDocId } = opts;
356
- // Build canonical docs with resolved references and correct refType.
356
+ // Build context docs with resolved references and correct refType.
357
357
  // Each ref type gets the appropriate resolution fields set on the
358
358
  // mirror document so Studio can display them correctly.
359
- const canonicalDocs = (task.context?.docs ?? []).map((ref, i) => {
359
+ const contextDocs = (task.context?.docs ?? []).map((ref, i) => {
360
360
  const base = { _key: `cd${i}`, reason: ref.reason ?? "" };
361
361
  if (isSlugRef(ref)) {
362
362
  const resolvedId = slugToDocId.get(ref.slug);
@@ -428,11 +428,11 @@ export function buildMirrorDocument(task, opts) {
428
428
  _type: "ailf.task",
429
429
  ownership: "repo",
430
430
  status: task.status ?? "active",
431
- assert: assertArray,
432
- canonicalDocs,
433
- description: task.title,
431
+ assertions: assertArray,
432
+ contextDocs,
433
+ title: task.title,
434
434
  docCoverage: task.docCoverage ?? false,
435
- featureArea: {
435
+ area: {
436
436
  _ref: `ailf.featureArea.${area}`,
437
437
  _type: "reference",
438
438
  },
@@ -452,7 +452,7 @@ export function buildMirrorDocument(task, opts) {
452
452
  author: existingAuthor ?? git.author,
453
453
  lastEditor: git.author,
454
454
  },
455
- taskPrompt: task.prompt?.text ?? "",
455
+ promptText: task.prompt?.text ?? "",
456
456
  ...(task.baseline
457
457
  ? {
458
458
  baseline: {
@@ -7,7 +7,7 @@
7
7
  * emoji markers, alignment, and color-coding (via unicode markers).
8
8
  * The JSON formatter produces machine-readable output for CI/CD.
9
9
  *
10
- * @see docs/exec-plans/execution-preview.md
10
+ * @see docs/archive/exec-plans/execution-preview.md
11
11
  */
12
12
  import type { ExecutionPlan } from "./plan.js";
13
13
  /**
@@ -7,7 +7,7 @@
7
7
  * emoji markers, alignment, and color-coding (via unicode markers).
8
8
  * The JSON formatter produces machine-readable output for CI/CD.
9
9
  *
10
- * @see docs/exec-plans/execution-preview.md
10
+ * @see docs/archive/exec-plans/execution-preview.md
11
11
  */
12
12
  import { formatCost } from "../agent-observer/pricing.js";
13
13
  // ---------------------------------------------------------------------------
@@ -7,7 +7,7 @@
7
7
  * anything. Calls existing pure functions (task expansion, model loading,
8
8
  * cache hashing, pricing) and composes them into an `ExecutionPlan`.
9
9
  *
10
- * @see docs/exec-plans/execution-preview.md
10
+ * @see docs/archive/exec-plans/execution-preview.md
11
11
  */
12
12
  import type { DebugOptions, EvalMode } from "./types.js";
13
13
  import { LiteracyVariant } from "./normalize-mode.js";
@@ -7,16 +7,17 @@
7
7
  * anything. Calls existing pure functions (task expansion, model loading,
8
8
  * cache hashing, pricing) and composes them into an `ExecutionPlan`.
9
9
  *
10
- * @see docs/exec-plans/execution-preview.md
10
+ * @see docs/archive/exec-plans/execution-preview.md
11
11
  */
12
12
  import { existsSync, readdirSync, statSync } from "fs";
13
13
  import { resolve } from "path";
14
+ import { createLiteracyModeBase, modelMatchesLiteracyVariant, } from "./compiler/mode-bases/literacy.js";
14
15
  import { lookupPricing } from "../agent-observer/pricing.js";
15
16
  import { RepoTaskSource } from "../adapters/task-sources/repo-task-source.js";
16
17
  import { loadAllTsTaskFiles } from "../adapters/task-sources/task-file-loader.js";
17
18
  import { lookupCache } from "./cache.js";
18
19
  import { compileLiteracyTasks } from "./compiler/literacy-bridge.js";
19
- import { tryLoadConfigFile } from "./compiler/config-loader.js";
20
+ import { resolveVendoredSubdir, tryLoadConfigFile, } from "./compiler/config-loader.js";
20
21
  import { LiteracyVariant } from "./normalize-mode.js";
21
22
  import { validateConfiguration } from "./validate.js";
22
23
  /**
@@ -44,33 +45,35 @@ function loadModelsFile(rootDir) {
44
45
  const result = tryLoadConfigFile("models", rootDir);
45
46
  return result?.data ?? null;
46
47
  }
48
+ const _literacyBase = createLiteracyModeBase();
47
49
  /**
48
- * Map eval mode + variant to the model "modes" array values from models config.
50
+ * Check whether a model participates in a given eval mode + optional variant.
49
51
  *
50
- * Literacy mode uses the variant to determine which model sub-modes match.
51
- * Non-literacy modes accept all models by default (filtering is done
52
- * elsewhere for those modes).
52
+ * For literacy mode, checks both mode enrollment and variant participation
53
+ * via the shared `modelMatchesLiteracyVariant` helper. For non-literacy
54
+ * modes, checks mode enrollment only.
53
55
  */
54
- function modeMatchesModelModes(mode, modelModes, variant) {
55
- if (!modelModes || modelModes.length === 0)
56
- return true;
57
- if (mode === "literacy") {
56
+ function modeMatchesModel(mode, model, variant) {
57
+ // Check basic mode enrollment
58
+ if (model.modes &&
59
+ model.modes.length > 0 &&
60
+ !model.modes.includes(mode)) {
61
+ return false;
62
+ }
63
+ // For literacy mode with a variant, check variant participation
64
+ if (mode === "literacy" && variant) {
58
65
  switch (variant) {
59
66
  case LiteracyVariant.AGENTIC:
60
- return (modelModes.includes("agentic-naive") ||
61
- modelModes.includes("agentic-optimized"));
62
- case LiteracyVariant.OBSERVED:
63
- return modelModes.includes(LiteracyVariant.OBSERVED);
67
+ return (modelMatchesLiteracyVariant(model, "agentic-naive") ||
68
+ modelMatchesLiteracyVariant(model, "agentic-optimized"));
64
69
  case LiteracyVariant.FULL:
65
- return (modelModes.includes(LiteracyVariant.STANDARD) ||
66
- modelModes.includes("agentic-naive") ||
67
- modelModes.includes("agentic-optimized"));
68
- case LiteracyVariant.STANDARD:
70
+ return (modelMatchesLiteracyVariant(model, "baseline") ||
71
+ modelMatchesLiteracyVariant(model, "agentic-naive") ||
72
+ modelMatchesLiteracyVariant(model, "agentic-optimized"));
69
73
  default:
70
- return modelModes.includes(LiteracyVariant.STANDARD);
74
+ return modelMatchesLiteracyVariant(model, variant);
71
75
  }
72
76
  }
73
- // Non-literacy modes accept all models by default
74
77
  return true;
75
78
  }
76
79
  // ---------------------------------------------------------------------------
@@ -139,18 +142,25 @@ export async function buildPipelinePlan(opts, rootDir) {
139
142
  const modelsForCompile = loadModelsFile(rootDir);
140
143
  const graderProvider = modelsForCompile?.grader?.id ?? "openai:chat:gpt-4o";
141
144
  const modelEntries = (modelsForCompile?.models ?? []).map((m) => ({ id: m.id, label: m.label }));
142
- // Load *.task.ts files from tasks/<mode>/
143
- const modeTasksDir = resolve(rootDir, "tasks", opts.mode);
145
+ // Load *.task.ts files from tasks/<mode>/ (or dist/tasks/<mode>/ when vendored)
146
+ const modeTasksDir = resolveVendoredSubdir(rootDir, `tasks/${opts.mode}`);
144
147
  if (existsSync(modeTasksDir)) {
145
148
  const rawTasks = await loadAllTsTaskFiles(modeTasksDir);
146
149
  if (rawTasks.length > 0) {
147
150
  // Dynamic import of the handler module
148
- const handlerModulePath = `./compiler/mode-handlers/${opts.mode}-handler.js`;
151
+ const handlerModulePath = `./compiler/mode-handlers/${opts.mode}/index.js`;
149
152
  const mod = await import(handlerModulePath);
150
153
  const handler = mod.handler;
154
+ const skippedByMode = new Map();
151
155
  for (const rawFile of rawTasks) {
152
156
  for (const taskDef of rawFile.tasks) {
153
157
  const task = taskDef;
158
+ // Filter to matching mode (skip tasks from other modes in same dir)
159
+ if ("mode" in task && task.mode !== opts.mode) {
160
+ const taskMode = task.mode ?? "unknown";
161
+ skippedByMode.set(taskMode, (skippedByMode.get(taskMode) ?? 0) + 1);
162
+ continue;
163
+ }
154
164
  // Apply area/task/tag filter
155
165
  if (filter) {
156
166
  if (filter.areas?.length &&
@@ -192,6 +202,13 @@ export async function buildPipelinePlan(opts, rootDir) {
192
202
  }
193
203
  }
194
204
  }
205
+ if (skippedByMode.size > 0) {
206
+ const summary = [...skippedByMode.entries()]
207
+ .map(([m, n]) => `${n} ${m}`)
208
+ .join(", ");
209
+ const total = [...skippedByMode.values()].reduce((a, b) => a + b, 0);
210
+ warnings.push(`Skipped ${total} task(s) with non-matching mode (${summary}). Current pipeline mode: ${opts.mode}. Run with --mode <mode> to include them.`);
211
+ }
195
212
  }
196
213
  }
197
214
  }
@@ -203,13 +220,29 @@ export async function buildPipelinePlan(opts, rootDir) {
203
220
  if (opts.repoTasksPath) {
204
221
  try {
205
222
  const repoSource = new RepoTaskSource(opts.repoTasksPath);
206
- // Type-narrow to literacy tasks — compileLiteracyTasks accepts LiteracyTaskDefinition[]
207
- const repoTasks = (await repoSource.loadTasks(filter)).filter((t) => t.mode === "literacy");
223
+ const allRepoTasks = await repoSource.loadTasks(filter);
224
+ // Filter to current mode tasks
225
+ const repoTasks = allRepoTasks.filter((t) => t.mode === opts.mode);
226
+ const skippedRepoTasks = allRepoTasks.length - repoTasks.length;
227
+ if (skippedRepoTasks > 0) {
228
+ const skippedModes = new Map();
229
+ for (const t of allRepoTasks) {
230
+ if (t.mode !== opts.mode) {
231
+ skippedModes.set(t.mode, (skippedModes.get(t.mode) ?? 0) + 1);
232
+ }
233
+ }
234
+ const summary = [...skippedModes.entries()]
235
+ .map(([m, n]) => `${n} ${m}`)
236
+ .join(", ");
237
+ warnings.push(`Skipped ${skippedRepoTasks} repo task(s) with non-matching mode (${summary}). Current pipeline mode: ${opts.mode}. Run with --mode <mode> to include them.`);
238
+ }
208
239
  repoTaskCount = repoTasks.length;
209
- if (repoTaskCount > 0) {
240
+ if (repoTaskCount > 0 && opts.mode === "literacy") {
241
+ // Literacy-specific compilation for repo tasks (detailed test expansion)
242
+ const literacyRepoTasks = repoTasks.filter((t) => t.mode === "literacy");
210
243
  const modelsForCompile = loadModelsFile(rootDir);
211
244
  const graderProvider = modelsForCompile?.grader?.id ?? "openai:chat:gpt-4o";
212
- const compileResult = compileLiteracyTasks(repoTasks, {
245
+ const compileResult = compileLiteracyTasks(literacyRepoTasks, {
213
246
  rootDir,
214
247
  evalMode: opts.variant === LiteracyVariant.AGENTIC
215
248
  ? LiteracyVariant.AGENTIC
@@ -231,6 +264,11 @@ export async function buildPipelinePlan(opts, rootDir) {
231
264
  }
232
265
  }
233
266
  }
267
+ else if (repoTaskCount > 0) {
268
+ // Non-literacy modes: approximate 1 test per task (compilation not
269
+ // supported for non-literacy repo tasks in the explain preview yet)
270
+ totalTests += repoTaskCount;
271
+ }
234
272
  }
235
273
  catch {
236
274
  warnings.push(`Failed to scan repo tasks at ${opts.repoTasksPath} — count may be underestimated`);
@@ -244,19 +282,19 @@ export async function buildPipelinePlan(opts, rootDir) {
244
282
  const models = [];
245
283
  let graderModelName = "";
246
284
  if (modelsFile) {
247
- const activeModels = modelsFile.models.filter((m) => modeMatchesModelModes(opts.mode, m.modes, opts.variant));
285
+ const activeModels = modelsFile.models.filter((m) => modeMatchesModel(opts.mode, m, opts.variant));
248
286
  // For agentic mode, each model appears twice (naive + optimized)
249
287
  for (const m of activeModels) {
250
288
  const modelName = extractModelName(m.id);
251
289
  if (opts.variant === LiteracyVariant.AGENTIC) {
252
- if (m.modes?.includes("agentic-naive")) {
290
+ if (modelMatchesLiteracyVariant(m, "agentic-naive")) {
253
291
  models.push({
254
292
  id: m.id,
255
293
  label: `${m.label} (Naive)`,
256
294
  modelName,
257
295
  });
258
296
  }
259
- if (m.modes?.includes("agentic-optimized")) {
297
+ if (modelMatchesLiteracyVariant(m, "agentic-optimized")) {
260
298
  models.push({
261
299
  id: m.id,
262
300
  label: `${m.label} (Optimized)`,
@@ -14,7 +14,7 @@
14
14
  * not "are these docs good enough?" The output is always labeled as
15
15
  * directional and never displayed on the same scale as scored evaluations.
16
16
  *
17
- * @see docs/exec-plans/scenario-matrix-implementation/phase-4-content-release-integration.md
17
+ * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-4-content-release-integration.md
18
18
  */
19
19
  import type { ProbeResult } from "./types.js";
20
20
  /** Generic probe prompt template */
@@ -14,7 +14,7 @@
14
14
  * not "are these docs good enough?" The output is always labeled as
15
15
  * directional and never displayed on the same scale as scored evaluations.
16
16
  *
17
- * @see docs/exec-plans/scenario-matrix-implementation/phase-4-content-release-integration.md
17
+ * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-4-content-release-integration.md
18
18
  */
19
19
  // ---------------------------------------------------------------------------
20
20
  // Constants
@@ -13,8 +13,8 @@
13
13
  * - generateReadinessReport() — builds the structured report
14
14
  * - formatReadinessMarkdown() — renders the report as markdown
15
15
  *
16
- * @see docs/exec-plans/scenario-matrix-implementation/phase-5-readiness-thresholds.md
17
- * @see docs/exec-plans/eliminate-lib-layer.md
16
+ * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-5-readiness-thresholds.md
17
+ * @see docs/archive/exec-plans/eliminate-lib-layer.md
18
18
  */
19
19
  import type { ThresholdConfig } from "./schemas.js";
20
20
  import type { GapAnalysisReport, GapEstimate, ScoreSummary, ThresholdEvaluation, ThresholdViolation } from "./types.js";
@@ -13,8 +13,8 @@
13
13
  * - generateReadinessReport() — builds the structured report
14
14
  * - formatReadinessMarkdown() — renders the report as markdown
15
15
  *
16
- * @see docs/exec-plans/scenario-matrix-implementation/phase-5-readiness-thresholds.md
17
- * @see docs/exec-plans/eliminate-lib-layer.md
16
+ * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-5-readiness-thresholds.md
17
+ * @see docs/archive/exec-plans/eliminate-lib-layer.md
18
18
  */
19
19
  import { evaluateThresholds } from "./thresholds.js";
20
20
  // ---------------------------------------------------------------------------
@@ -15,7 +15,7 @@
15
15
  * - **not-applicable**: Updated, removed, or unchanged documents (these
16
16
  * follow the standard before/after comparison flow from Phase 2).
17
17
  *
18
- * @see docs/exec-plans/scenario-matrix-implementation/phase-4-content-release-integration.md
18
+ * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-4-content-release-integration.md
19
19
  */
20
20
  import type { ClassifiedReleaseDocument, ProductFeature, ReleaseClassification } from "./types.js";
21
21
  import type { ReverseMapping } from "./reverse-mapping.js";
@@ -15,7 +15,7 @@
15
15
  * - **not-applicable**: Updated, removed, or unchanged documents (these
16
16
  * follow the standard before/after comparison flow from Phase 2).
17
17
  *
18
- * @see docs/exec-plans/scenario-matrix-implementation/phase-4-content-release-integration.md
18
+ * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-4-content-release-integration.md
19
19
  */
20
20
  // ---------------------------------------------------------------------------
21
21
  // Public API
@@ -10,7 +10,7 @@
10
10
  * attribution (2c), and probe results (4b) into the document × area × task
11
11
  * impact matrix specified by Scenario 2.4.
12
12
  *
13
- * @see docs/exec-plans/scenario-matrix-implementation/phase-4-content-release-integration.md
13
+ * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-4-content-release-integration.md
14
14
  */
15
15
  import type { AttributionReport, ComparisonReport, ProbeResult, ReleaseClassification, ReleaseImpactReport } from "./types.js";
16
16
  /**
@@ -10,7 +10,7 @@
10
10
  * attribution (2c), and probe results (4b) into the document × area × task
11
11
  * impact matrix specified by Scenario 2.4.
12
12
  *
13
- * @see docs/exec-plans/scenario-matrix-implementation/phase-4-content-release-integration.md
13
+ * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-4-content-release-integration.md
14
14
  */
15
15
  // ---------------------------------------------------------------------------
16
16
  // Public API
@@ -12,7 +12,7 @@
12
12
  * - Clear "what does this mean?" context
13
13
  * - skip-ailf bypass instructions
14
14
  *
15
- * @see docs/exec-plans/tasks-as-content/phase-6-pr-quality-gates.md
15
+ * @see docs/archive/exec-plans/tasks-as-content/phase-6-pr-quality-gates.md
16
16
  * @see packages/eval/src/pipeline/repo-threshold-evaluator.ts
17
17
  */
18
18
  import type { ComparisonReport, ScoreSummary } from "./types.js";
@@ -12,7 +12,7 @@
12
12
  * - Clear "what does this mean?" context
13
13
  * - skip-ailf bypass instructions
14
14
  *
15
- * @see docs/exec-plans/tasks-as-content/phase-6-pr-quality-gates.md
15
+ * @see docs/archive/exec-plans/tasks-as-content/phase-6-pr-quality-gates.md
16
16
  * @see packages/eval/src/pipeline/repo-threshold-evaluator.ts
17
17
  */
18
18
  // ---------------------------------------------------------------------------
@@ -10,7 +10,7 @@
10
10
  * thresholds are per-area, defined by the AILF team, and drive
11
11
  * readiness reports.
12
12
  *
13
- * @see docs/exec-plans/tasks-as-content/phase-6-pr-quality-gates.md
13
+ * @see docs/archive/exec-plans/tasks-as-content/phase-6-pr-quality-gates.md
14
14
  * @see packages/eval/src/adapters/task-sources/repo-schemas.ts
15
15
  */
16
16
  import type { ScoreSummary } from "./types.js";
@@ -10,7 +10,7 @@
10
10
  * thresholds are per-area, defined by the AILF team, and drive
11
11
  * readiness reports.
12
12
  *
13
- * @see docs/exec-plans/tasks-as-content/phase-6-pr-quality-gates.md
13
+ * @see docs/archive/exec-plans/tasks-as-content/phase-6-pr-quality-gates.md
14
14
  * @see packages/eval/src/adapters/task-sources/repo-schemas.ts
15
15
  */
16
16
  // ---------------------------------------------------------------------------
@@ -1,9 +1,9 @@
1
1
  /**
2
2
  * pipeline/resolve-mappings.ts
3
3
  *
4
- * Extracts canonical mappings from inline task YAML definitions.
5
- * Each task file contains canonical_docs and reference_solution fields
6
- * directly — there is no separate mappings.yaml file.
4
+ * Extracts canonical mappings from task definitions (*.task.ts files).
5
+ * Each task contains context.docs and referenceSolution fields
6
+ * directly — there is no separate mappings file.
7
7
  *
8
8
  * The output shape matches what downstream consumers expect so
9
9
  * fetch-docs, validate, and calculate-scores work without changes.
@@ -24,12 +24,12 @@ export interface ResolvedMappings {
24
24
  }>;
25
25
  }
26
26
  /**
27
- * Extract inline canonical mappings from task YAML files.
28
- * Only tasks with both an id and canonical_docs fields are included.
27
+ * Extract canonical mappings from *.task.ts files in tasks/literacy/.
28
+ * Only tasks with context.docs and referenceSolution are included.
29
29
  */
30
30
  export declare function extractInlineMappings(rootDir: string): ResolvedMappings;
31
31
  /**
32
- * Resolve canonical mappings from inline task YAML definitions.
32
+ * Resolve canonical mappings from task definitions.
33
33
  * This is the single source of truth — there is no external mappings file.
34
34
  */
35
35
  export declare function resolveMappings(rootDir: string): ResolvedMappings;
@@ -1,72 +1,72 @@
1
1
  /**
2
2
  * pipeline/resolve-mappings.ts
3
3
  *
4
- * Extracts canonical mappings from inline task YAML definitions.
5
- * Each task file contains canonical_docs and reference_solution fields
6
- * directly — there is no separate mappings.yaml file.
4
+ * Extracts canonical mappings from task definitions (*.task.ts files).
5
+ * Each task contains context.docs and referenceSolution fields
6
+ * directly — there is no separate mappings file.
7
7
  *
8
8
  * The output shape matches what downstream consumers expect so
9
9
  * fetch-docs, validate, and calculate-scores work without changes.
10
10
  */
11
- import { existsSync, readFileSync, readdirSync } from "fs";
12
- import { resolve } from "path";
13
- import { load } from "js-yaml";
11
+ import { existsSync } from "fs";
12
+ import { discoverTsTaskFiles, loadTsTaskFileSync, } from "../adapters/task-sources/task-file-loader.js";
13
+ import { resolveVendoredSubdir } from "./compiler/config-loader.js";
14
14
  // ---------------------------------------------------------------------------
15
15
  // Resolution
16
16
  // ---------------------------------------------------------------------------
17
17
  /**
18
- * Extract inline canonical mappings from task YAML files.
19
- * Only tasks with both an id and canonical_docs fields are included.
18
+ * Extract canonical mappings from *.task.ts files in tasks/literacy/.
19
+ * Only tasks with context.docs and referenceSolution are included.
20
20
  */
21
21
  export function extractInlineMappings(rootDir) {
22
- const tasksDir = resolve(rootDir, "tasks");
22
+ const tasksDir = resolveVendoredSubdir(rootDir, "tasks/literacy");
23
23
  const result = { feature_areas: {} };
24
24
  if (!existsSync(tasksDir))
25
25
  return result;
26
- const yamlFiles = readdirSync(tasksDir)
27
- .filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) && !f.startsWith("."))
28
- .sort();
29
- for (const file of yamlFiles) {
30
- const featureArea = file.replace(/\.(yaml|yml)$/, "");
31
- const filePath = resolve(tasksDir, file);
32
- const raw = readFileSync(filePath, "utf-8");
33
- const parsed = load(raw);
34
- if (!Array.isArray(parsed))
35
- continue;
36
- const tasks = [];
37
- for (const entry of parsed) {
38
- if (!isInlineTaskWithMappings(entry))
26
+ const files = discoverTsTaskFiles(tasksDir);
27
+ for (const file of files) {
28
+ const loaded = loadTsTaskFileSync(file);
29
+ for (const task of loaded.tasks) {
30
+ const t = task;
31
+ const area = typeof t.area === "string" ? t.area : undefined;
32
+ const id = typeof t.id === "string" ? t.id : undefined;
33
+ const title = typeof t.title === "string" ? t.title : "";
34
+ const referenceSolution = typeof t.referenceSolution === "string" ? t.referenceSolution : "";
35
+ if (!area || !id)
39
36
  continue;
40
- tasks.push({
41
- canonical_docs: entry.canonical_docs,
42
- description: entry.description,
43
- id: entry.id,
44
- reference_solution: entry.reference_solution,
37
+ // Extract docs from context.docs (GeneralizedDocRef[])
38
+ const context = t.context;
39
+ const docs = [];
40
+ if (context?.docs && Array.isArray(context.docs)) {
41
+ for (const doc of context.docs) {
42
+ const d = doc;
43
+ if (typeof d.slug === "string") {
44
+ docs.push({
45
+ slug: d.slug,
46
+ reason: typeof d.reason === "string" ? d.reason : "",
47
+ });
48
+ }
49
+ }
50
+ }
51
+ if (docs.length === 0 || !referenceSolution)
52
+ continue;
53
+ if (!result.feature_areas[area]) {
54
+ result.feature_areas[area] = { tasks: [] };
55
+ }
56
+ result.feature_areas[area].tasks.push({
57
+ canonical_docs: docs,
58
+ description: title,
59
+ id,
60
+ reference_solution: referenceSolution,
45
61
  });
46
62
  }
47
- if (tasks.length > 0) {
48
- result.feature_areas[featureArea] = { tasks };
49
- }
50
63
  }
51
64
  return result;
52
65
  }
53
66
  /**
54
- * Resolve canonical mappings from inline task YAML definitions.
67
+ * Resolve canonical mappings from task definitions.
55
68
  * This is the single source of truth — there is no external mappings file.
56
69
  */
57
70
  export function resolveMappings(rootDir) {
58
71
  return extractInlineMappings(rootDir);
59
72
  }
60
- // ---------------------------------------------------------------------------
61
- // Helpers
62
- // ---------------------------------------------------------------------------
63
- function isInlineTaskWithMappings(entry) {
64
- if (typeof entry !== "object" || entry === null)
65
- return false;
66
- const e = entry;
67
- return (typeof e.id === "string" &&
68
- typeof e.description === "string" &&
69
- Array.isArray(e.canonical_docs) &&
70
- e.canonical_docs.length > 0 &&
71
- typeof e.reference_solution === "string");
72
- }
@@ -2,9 +2,9 @@
2
2
  * pipeline/retrieval-metrics.ts
3
3
  *
4
4
  * Computes retrieval precision and recall by comparing agent-retrieved
5
- * doc slugs against canonical_docs defined in task YAML files.
5
+ * doc slugs against canonical_docs defined in task definitions.
6
6
  *
7
- * This is a pure computation module — no file I/O beyond reading task YAMLs.
7
+ * This is a pure computation module — no file I/O beyond reading task files.
8
8
  */
9
9
  import type { RetrievalMetrics, TaskRetrievalMetrics } from "./types.js";
10
10
  export interface AgenticBehaviorData {
@@ -30,7 +30,7 @@ export declare function computeRetrievalMetrics(rootDir: string, behaviors: Agen
30
30
  */
31
31
  export declare function computeTaskMetrics(taskId: string, area: string, retrieved: string[], canonical: Set<string>): TaskRetrievalMetrics;
32
32
  /**
33
- * Load canonical_docs from all task YAML files.
33
+ * Load canonical docs from *.task.ts files in tasks/literacy/.
34
34
  * Returns a map of taskId → { slugs: Set<string>, area: string }.
35
35
  */
36
36
  export declare function loadCanonicalDocs(rootDir: string): Map<string, {