@sanity/ailf 1.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (499) hide show
  1. package/README.md +0 -1
  2. package/canonical/grader-references/README.md +2 -2
  3. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  4. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  5. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  6. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  7. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  8. package/config/features.ts +1 -1
  9. package/config/models.ts +29 -12
  10. package/config/sources.ts +1 -1
  11. package/config/thresholds.ts +1 -1
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  13. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  17. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  18. package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
  19. package/dist/_vendor/ailf-core/config-helpers.js +51 -2
  20. package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
  21. package/dist/_vendor/ailf-core/examples/index.js +213 -94
  22. package/dist/_vendor/ailf-core/index.d.ts +3 -2
  23. package/dist/_vendor/ailf-core/index.js +2 -1
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  25. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  27. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  28. package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
  29. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  30. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  31. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  32. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  33. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  34. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  35. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  40. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  41. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  42. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  43. package/dist/_vendor/ailf-core/services/index.js +1 -1
  44. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  47. package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  50. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  51. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  52. package/dist/adapters/api-client/remediation.js +2 -2
  53. package/dist/adapters/config-sources/file-config-adapter.js +7 -1
  54. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  55. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  56. package/dist/adapters/index.d.ts +0 -1
  57. package/dist/adapters/index.js +0 -1
  58. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  59. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  60. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  61. package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
  62. package/dist/adapters/task-sources/index.d.ts +3 -4
  63. package/dist/adapters/task-sources/index.js +3 -4
  64. package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
  65. package/dist/adapters/task-sources/repo-schemas.js +228 -20
  66. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  67. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  68. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  69. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  70. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  71. package/dist/adapters/task-sources/repo-validation.js +126 -5
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
  73. package/dist/adapters/task-sources/task-file-loader.js +21 -7
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/coverage-audit.js +3 -1
  95. package/dist/commands/explain-handler.d.ts +1 -1
  96. package/dist/commands/explain-handler.js +37 -8
  97. package/dist/commands/fetch-docs.js +1 -0
  98. package/dist/commands/generate-configs.d.ts +3 -3
  99. package/dist/commands/generate-configs.js +20 -8
  100. package/dist/commands/init.d.ts +5 -4
  101. package/dist/commands/init.js +190 -25
  102. package/dist/commands/pipeline-action.d.ts +7 -1
  103. package/dist/commands/pipeline-action.js +43 -19
  104. package/dist/commands/pipeline.d.ts +6 -1
  105. package/dist/commands/pipeline.js +7 -2
  106. package/dist/commands/pr-comment.js +1 -0
  107. package/dist/commands/publish.js +1 -0
  108. package/dist/commands/shared/help.js +2 -2
  109. package/dist/commands/update-quality-scores.d.ts +5 -0
  110. package/dist/commands/update-quality-scores.js +20 -0
  111. package/dist/commands/validate-tasks.d.ts +2 -2
  112. package/dist/commands/validate-tasks.js +26 -15
  113. package/dist/composition-root.d.ts +15 -4
  114. package/dist/composition-root.js +100 -55
  115. package/dist/config/features.ts +23 -0
  116. package/dist/config/models.ts +100 -0
  117. package/dist/config/prompts.ts +16 -0
  118. package/dist/config/rubrics.ts +225 -0
  119. package/dist/config/schedules.ts +47 -0
  120. package/dist/config/sinks.ts +37 -0
  121. package/dist/config/sources.ts +21 -0
  122. package/dist/config/thresholds.ts +61 -0
  123. package/dist/index.d.ts +41 -0
  124. package/dist/index.js +48 -0
  125. package/dist/lib/agent-behavior-report.d.ts +8 -0
  126. package/dist/lib/agent-behavior-report.js +185 -0
  127. package/dist/lib/baseline.d.ts +19 -0
  128. package/dist/lib/baseline.js +153 -0
  129. package/dist/lib/calculate-scores.d.ts +23 -0
  130. package/dist/lib/calculate-scores.js +42 -0
  131. package/dist/lib/compare.d.ts +18 -0
  132. package/dist/lib/compare.js +170 -0
  133. package/dist/lib/coverage-audit.d.ts +4 -0
  134. package/dist/lib/coverage-audit.js +42 -0
  135. package/dist/lib/discovery-report.d.ts +13 -0
  136. package/dist/lib/discovery-report.js +57 -0
  137. package/dist/lib/fetch-docs.d.ts +30 -0
  138. package/dist/lib/fetch-docs.js +171 -0
  139. package/dist/lib/generate-configs.d.ts +25 -0
  140. package/dist/lib/generate-configs.js +42 -0
  141. package/dist/lib/grader-api.d.ts +21 -0
  142. package/dist/lib/grader-api.js +34 -0
  143. package/dist/lib/grader-compare.d.ts +19 -0
  144. package/dist/lib/grader-compare.js +91 -0
  145. package/dist/lib/grader-consistency.d.ts +27 -0
  146. package/dist/lib/grader-consistency.js +79 -0
  147. package/dist/lib/grader-sensitivity.d.ts +19 -0
  148. package/dist/lib/grader-sensitivity.js +75 -0
  149. package/dist/lib/grader-validate.d.ts +19 -0
  150. package/dist/lib/grader-validate.js +78 -0
  151. package/dist/lib/measure-retrieval.d.ts +14 -0
  152. package/dist/lib/measure-retrieval.js +71 -0
  153. package/dist/lib/pr-comment.d.ts +16 -0
  154. package/dist/lib/pr-comment.js +28 -0
  155. package/dist/lib/readiness-report.d.ts +13 -0
  156. package/dist/lib/readiness-report.js +108 -0
  157. package/dist/lib/webhook-server.d.ts +11 -0
  158. package/dist/lib/webhook-server.js +24 -0
  159. package/dist/lib/weekly-digest.d.ts +24 -0
  160. package/dist/lib/weekly-digest.js +148 -0
  161. package/dist/orchestration/build-app-context.js +13 -0
  162. package/dist/orchestration/build-step-sequence.js +4 -2
  163. package/dist/orchestration/cache-context.d.ts +23 -0
  164. package/dist/orchestration/cache-context.js +43 -0
  165. package/dist/orchestration/env-bridge.d.ts +21 -0
  166. package/dist/orchestration/env-bridge.js +66 -0
  167. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  168. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  169. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  170. package/dist/orchestration/step-runner.js +5 -1
  171. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  172. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  173. package/dist/orchestration/steps/callback-step.js +10 -1
  174. package/dist/orchestration/steps/compare-step.js +6 -3
  175. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  176. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  177. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  178. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  179. package/dist/orchestration/steps/fetch-docs-step.js +32 -19
  180. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  181. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  182. package/dist/orchestration/steps/generate-configs-step.js +77 -26
  183. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  184. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  185. package/dist/orchestration/steps/publish-report-step.js +19 -0
  186. package/dist/orchestration/steps/readiness-step.js +8 -3
  187. package/dist/orchestration/steps/report-step.js +17 -4
  188. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  189. package/dist/orchestration/steps/run-eval-step.js +51 -31
  190. package/dist/pipeline/agent-behavior-report.js +6 -0
  191. package/dist/pipeline/attribution.d.ts +1 -1
  192. package/dist/pipeline/attribution.js +1 -1
  193. package/dist/pipeline/cache.js +29 -15
  194. package/dist/pipeline/calculate-scores.d.ts +2 -0
  195. package/dist/pipeline/calculate-scores.js +70 -33
  196. package/dist/pipeline/chronic-failures.d.ts +55 -0
  197. package/dist/pipeline/chronic-failures.js +110 -0
  198. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  199. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  200. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  201. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
  202. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  203. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  204. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  205. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  206. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  207. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  208. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  209. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  210. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  211. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  212. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  213. package/dist/pipeline/compiler/config-loader.js +42 -2
  214. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  215. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  216. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  217. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  218. package/dist/pipeline/compiler/index.d.ts +2 -5
  219. package/dist/pipeline/compiler/index.js +2 -5
  220. package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
  221. package/dist/pipeline/compiler/literacy-bridge.js +2 -2
  222. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  223. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  224. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  225. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  226. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  227. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  228. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
  229. package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
  230. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  231. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  232. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  233. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  234. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  235. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  236. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  237. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  238. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  239. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  240. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  241. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  242. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  245. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  246. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  247. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  248. package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
  249. package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
  250. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  251. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  252. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  253. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  254. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  255. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  256. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  257. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  258. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  259. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  260. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  261. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  262. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  263. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  264. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  265. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  266. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  267. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  268. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  269. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  270. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  271. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  272. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  273. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  274. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
  275. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  276. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  277. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  278. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  279. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  280. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  281. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  282. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  283. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  284. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
  285. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  286. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  287. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  288. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  289. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
  290. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
  291. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  292. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
  293. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  294. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
  295. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  296. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  297. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
  298. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
  299. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
  300. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  301. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  302. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  303. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  304. package/dist/pipeline/compiler/preset-loader.js +99 -0
  305. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
  306. package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
  307. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  308. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  309. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  310. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  311. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  312. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  313. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  314. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  315. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  316. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  317. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  318. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  319. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  320. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  321. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  322. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  323. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  324. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  325. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  326. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  327. package/dist/pipeline/compiler/task-bridge.js +92 -0
  328. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  329. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  330. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  331. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  332. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  333. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  334. package/dist/pipeline/coverage-audit.d.ts +1 -1
  335. package/dist/pipeline/coverage-audit.js +1 -1
  336. package/dist/pipeline/degradations.d.ts +1 -1
  337. package/dist/pipeline/degradations.js +1 -1
  338. package/dist/pipeline/expand-tasks.d.ts +2 -2
  339. package/dist/pipeline/expand-tasks.js +2 -2
  340. package/dist/pipeline/failure-modes.d.ts +1 -1
  341. package/dist/pipeline/failure-modes.js +13 -1
  342. package/dist/pipeline/gap-analysis.d.ts +1 -1
  343. package/dist/pipeline/gap-analysis.js +3 -1
  344. package/dist/pipeline/generate-configs.d.ts +2 -2
  345. package/dist/pipeline/generate-configs.js +16 -9
  346. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  347. package/dist/pipeline/grader-compare-runner.js +7 -1
  348. package/dist/pipeline/grader-comparison.d.ts +1 -1
  349. package/dist/pipeline/grader-comparison.js +1 -1
  350. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  351. package/dist/pipeline/grader-consistency-runner.js +7 -1
  352. package/dist/pipeline/grader-consistency.d.ts +1 -1
  353. package/dist/pipeline/grader-consistency.js +1 -1
  354. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  355. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  356. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  357. package/dist/pipeline/grader-sensitivity.js +1 -1
  358. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  359. package/dist/pipeline/grader-validate-runner.js +2 -2
  360. package/dist/pipeline/grader-validation.d.ts +1 -1
  361. package/dist/pipeline/grader-validation.js +1 -1
  362. package/dist/pipeline/map-request-to-config.js +16 -2
  363. package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
  364. package/dist/pipeline/mirror-repo-tasks.js +10 -10
  365. package/dist/pipeline/plan-format.d.ts +1 -1
  366. package/dist/pipeline/plan-format.js +1 -1
  367. package/dist/pipeline/plan.d.ts +1 -1
  368. package/dist/pipeline/plan.js +68 -30
  369. package/dist/pipeline/probe.d.ts +1 -1
  370. package/dist/pipeline/probe.js +1 -1
  371. package/dist/pipeline/readiness-report.d.ts +2 -2
  372. package/dist/pipeline/readiness-report.js +2 -2
  373. package/dist/pipeline/release-classification.d.ts +1 -1
  374. package/dist/pipeline/release-classification.js +1 -1
  375. package/dist/pipeline/release-report.d.ts +1 -1
  376. package/dist/pipeline/release-report.js +1 -1
  377. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  378. package/dist/pipeline/repo-eval-comment.js +1 -1
  379. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  380. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  381. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  382. package/dist/pipeline/resolve-mappings.js +44 -44
  383. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  384. package/dist/pipeline/retrieval-metrics.js +28 -20
  385. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  386. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  387. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  388. package/dist/pipeline/steps/compare-step.js +90 -0
  389. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  390. package/dist/pipeline/steps/eval-step.js +347 -0
  391. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  392. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  393. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  394. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  395. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  396. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  397. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  398. package/dist/pipeline/steps/publish-report-step.js +243 -0
  399. package/dist/pipeline/steps/report-step.d.ts +13 -0
  400. package/dist/pipeline/steps/report-step.js +56 -0
  401. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  402. package/dist/pipeline/steps/update-scores-step.js +42 -0
  403. package/dist/pipeline/targeted-loo.d.ts +1 -1
  404. package/dist/pipeline/targeted-loo.js +1 -1
  405. package/dist/pipeline/thresholds.d.ts +1 -1
  406. package/dist/pipeline/thresholds.js +1 -1
  407. package/dist/pipeline/validate.js +13 -0
  408. package/dist/report-store.d.ts +17 -0
  409. package/dist/report-store.js +24 -0
  410. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  411. package/dist/scripts/agent-behavior-report.js +315 -0
  412. package/dist/scripts/baseline.d.ts +43 -0
  413. package/dist/scripts/baseline.js +267 -0
  414. package/dist/scripts/calculate-scores.d.ts +166 -0
  415. package/dist/scripts/calculate-scores.js +1296 -0
  416. package/dist/scripts/compare.d.ts +22 -0
  417. package/dist/scripts/compare.js +334 -0
  418. package/dist/scripts/coverage-audit.d.ts +44 -0
  419. package/dist/scripts/coverage-audit.js +209 -0
  420. package/dist/scripts/debug-eval.d.ts +19 -0
  421. package/dist/scripts/debug-eval.js +73 -0
  422. package/dist/scripts/discovery-report.d.ts +58 -0
  423. package/dist/scripts/discovery-report.js +250 -0
  424. package/dist/scripts/fetch-docs.d.ts +35 -0
  425. package/dist/scripts/fetch-docs.js +472 -0
  426. package/dist/scripts/generate-configs.d.ts +66 -0
  427. package/dist/scripts/generate-configs.js +459 -0
  428. package/dist/scripts/grader-api.d.ts +27 -0
  429. package/dist/scripts/grader-api.js +206 -0
  430. package/dist/scripts/grader-compare.d.ts +22 -0
  431. package/dist/scripts/grader-compare.js +368 -0
  432. package/dist/scripts/grader-consistency.d.ts +20 -0
  433. package/dist/scripts/grader-consistency.js +313 -0
  434. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  435. package/dist/scripts/grader-sensitivity.js +354 -0
  436. package/dist/scripts/grader-validate.d.ts +19 -0
  437. package/dist/scripts/grader-validate.js +267 -0
  438. package/dist/scripts/measure-retrieval.d.ts +10 -0
  439. package/dist/scripts/measure-retrieval.js +145 -0
  440. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  441. package/dist/scripts/migrate-task-mode.js +1 -1
  442. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  443. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  444. package/dist/scripts/pipeline.d.ts +76 -0
  445. package/dist/scripts/pipeline.js +1031 -0
  446. package/dist/scripts/pr-comment.d.ts +10 -0
  447. package/dist/scripts/pr-comment.js +510 -0
  448. package/dist/scripts/readiness-report.d.ts +88 -0
  449. package/dist/scripts/readiness-report.js +342 -0
  450. package/dist/scripts/update-quality-scores.d.ts +15 -0
  451. package/dist/scripts/update-quality-scores.js +184 -0
  452. package/dist/scripts/validate-task-sources.d.ts +1 -1
  453. package/dist/scripts/validate-task-sources.js +1 -1
  454. package/dist/scripts/validate.d.ts +13 -0
  455. package/dist/scripts/validate.js +79 -0
  456. package/dist/scripts/webhook-server.d.ts +26 -0
  457. package/dist/scripts/webhook-server.js +147 -0
  458. package/dist/scripts/weekly-digest.d.ts +24 -0
  459. package/dist/scripts/weekly-digest.js +144 -0
  460. package/dist/sinks/format-slack.d.ts +64 -0
  461. package/dist/sinks/format-slack.js +306 -0
  462. package/dist/sinks/slack-sink.d.ts +27 -0
  463. package/dist/sinks/slack-sink.js +78 -0
  464. package/dist/sinks/types.d.ts +1 -1
  465. package/dist/sinks/types.js +1 -1
  466. package/dist/sinks/webhook-sink.d.ts +19 -0
  467. package/dist/sinks/webhook-sink.js +50 -0
  468. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  469. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  470. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  471. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  472. package/dist/tasks/literacy/functions.task.ts +70 -0
  473. package/dist/tasks/literacy/groq.task.ts +259 -0
  474. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  475. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  476. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  477. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  478. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  479. package/package.json +32 -24
  480. package/tasks/.expanded.agentic.yaml +280 -0
  481. package/tasks/.expanded.yaml +565 -0
  482. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  483. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  484. package/tasks/literacy/content-lake.task.ts +181 -0
  485. package/tasks/literacy/frameworks.task.ts +1 -0
  486. package/tasks/literacy/functions.task.ts +1 -0
  487. package/tasks/literacy/groq.task.ts +1 -0
  488. package/tasks/literacy/image-handling.task.ts +95 -0
  489. package/tasks/literacy/nextjs-live.task.ts +2 -1
  490. package/tasks/literacy/portable-text.task.ts +169 -0
  491. package/tasks/literacy/studio-setup.task.ts +5 -2
  492. package/tasks/literacy/visual-editing.task.ts +1 -0
  493. package/LICENSE +0 -21
  494. package/tasks/frameworks.yaml +0 -98
  495. package/tasks/functions.yaml +0 -51
  496. package/tasks/groq.yaml +0 -216
  497. package/tasks/nextjs-live.yaml +0 -62
  498. package/tasks/studio-setup.yaml +0 -111
  499. package/tasks/visual-editing.yaml +0 -120
@@ -0,0 +1,204 @@
1
+ /**
2
+ * Reference Solution: Custom Block Types in Portable Text
3
+ *
4
+ * Demonstrates:
5
+ * - Defining custom block types with defineArrayMember
6
+ * - Adding "code" and "callout" blocks to a PT field
7
+ * - Rendering custom blocks with @portabletext/react
8
+ * - TypeScript types for custom block shapes
9
+ */
10
+
11
+ // === Part 1: Schema Definition (schemas/post.ts) ===
12
+
13
+ import {
14
+ defineType,
15
+ defineField,
16
+ defineArrayMember,
17
+ } from "sanity"
18
+
19
/**
 * Sanity schema for the "post" document type.
 *
 * The `body` field is a Portable Text array with three member types:
 *   - the standard `block` type, limited to the styles/decorators listed here
 *   - a custom `code` object (language, code, optional filename)
 *   - a custom `callout` object (tone + text)
 *
 * Both custom objects define a list preview via `preview.select` /
 * `preview.prepare`.
 */
export const postType = defineType({
  name: "post",
  title: "Post",
  type: "document",
  fields: [
    defineField({
      name: "title",
      title: "Title",
      type: "string",
    }),
    defineField({
      name: "body",
      title: "Body",
      type: "array",
      of: [
        // Standard block (paragraphs, headings, lists)
        defineArrayMember({
          type: "block",
          styles: [
            { title: "Normal", value: "normal" },
            { title: "H2", value: "h2" },
            { title: "H3", value: "h3" },
            { title: "Quote", value: "blockquote" },
          ],
          marks: {
            decorators: [
              { title: "Bold", value: "strong" },
              { title: "Italic", value: "em" },
              { title: "Code", value: "code" },
            ],
          },
        }),

        // Custom: Code block with language selection
        defineArrayMember({
          name: "code",
          title: "Code Block",
          type: "object",
          fields: [
            defineField({
              name: "language",
              title: "Language",
              type: "string",
              options: {
                list: [
                  { title: "JavaScript", value: "javascript" },
                  { title: "TypeScript", value: "typescript" },
                  { title: "HTML", value: "html" },
                  { title: "CSS", value: "css" },
                  { title: "Shell", value: "bash" },
                  { title: "JSON", value: "json" },
                ],
              },
            }),
            defineField({
              name: "code",
              title: "Code",
              type: "text",
              rows: 10,
            }),
            defineField({
              name: "filename",
              title: "Filename",
              type: "string",
            }),
          ],
          preview: {
            select: { language: "language", code: "code" },
            prepare({ language, code }) {
              // Only append the ellipsis when the snippet was actually
              // truncated; short snippets are shown verbatim.
              const maxLength = 50
              const snippet =
                code && code.length > maxLength
                  ? `${code.slice(0, maxLength)}…`
                  : code || ""
              return {
                title: `Code: ${language || "plain"}`,
                subtitle: snippet,
              }
            },
          },
        }),

        // Custom: Callout block with tone
        defineArrayMember({
          name: "callout",
          title: "Callout",
          type: "object",
          fields: [
            defineField({
              name: "tone",
              title: "Tone",
              type: "string",
              options: {
                list: [
                  { title: "Info", value: "info" },
                  { title: "Warning", value: "warning" },
                  { title: "Error", value: "error" },
                  { title: "Tip", value: "tip" },
                ],
                layout: "radio",
              },
              initialValue: "info",
            }),
            defineField({
              name: "text",
              title: "Text",
              type: "text",
              rows: 3,
            }),
          ],
          preview: {
            select: { tone: "tone", text: "text" },
            prepare({ tone, text }) {
              const icons = { info: "ℹ️", warning: "⚠️", error: "🚨", tip: "💡" }
              // Resolve the fallback tone *before* the icon lookup so an unset
              // tone gets the "info" icon, matching the field's initialValue.
              const resolvedTone = tone || "info"
              const icon = icons[resolvedTone as keyof typeof icons]
              const label = `${resolvedTone} callout`
              return {
                // Unknown tones have no icon; avoid a leading space in that case.
                title: icon ? `${icon} ${label}` : label,
                subtitle: text,
              }
            },
          },
        }),
      ],
    }),
  ],
})
139
+
140
+ // === Part 2: Frontend Rendering (components/PortableTextBody.tsx) ===
141
+
142
+ // import { PortableText, type PortableTextComponents } from "@portabletext/react"
143
+
144
/** Shape of a `code` object as stored in the Portable Text `body` array. */
interface CodeBlockValue {
  _type: "code"
  /** Source text shown inside the rendered <pre><code> element. */
  code: string
  /** One of the language values offered by the schema's `language` field. */
  language?: string
  /** Optional file name, rendered as the figure caption. */
  filename?: string
}

/** Shape of a `callout` object as stored in the Portable Text `body` array. */
interface CalloutBlockValue {
  _type: "callout"
  /** Plain-text body of the callout. */
  text: string
  /** Visual tone; selects the colours used when rendering. */
  tone: "info" | "warning" | "error" | "tip"
}
156
+
157
/**
 * Renders a `code` block as a <figure>: an optional <figcaption> with the
 * file name, followed by the code in <pre><code>. The language is exposed
 * through the `data-language` attribute on the <pre> element.
 */
function CodeBlock({ value }: { value: CodeBlockValue }) {
  const { filename, language, code } = value
  const caption = filename && <figcaption>{filename}</figcaption>

  return (
    <figure>
      {caption}
      <pre data-language={language}>
        <code>{code}</code>
      </pre>
    </figure>
  )
}
169
+
170
// Background and border colours for each callout tone.
const toneStyles: Record<string, { background: string; border: string }> = {
  info: { background: "#e8f4fd", border: "#2196f3" },
  warning: { background: "#fff8e1", border: "#ff9800" },
  error: { background: "#fde8e8", border: "#f44336" },
  tip: { background: "#e8f5e9", border: "#4caf50" },
}

/**
 * Renders a `callout` block as an <aside role="note"> coloured by its tone.
 * Tones without an entry in `toneStyles` use the "info" colours.
 */
function CalloutBlock({ value }: { value: CalloutBlockValue }) {
  const { border, background } = toneStyles[value.tone] || toneStyles.info
  const asideStyle = {
    padding: "1rem",
    borderLeft: `4px solid ${border}`,
    background,
    margin: "1.5rem 0",
  }

  return (
    <aside role="note" style={asideStyle}>
      <p>{value.text}</p>
    </aside>
  )
}
194
+
195
// Component map for <PortableText>: each key under `types` is a block
// `_type`, each value is the React component that renders that block.
export const components = {
  types: { code: CodeBlock, callout: CalloutBlock },
}
202
+
203
+ // Usage:
204
+ // <PortableText value={post.body} components={components} />
@@ -0,0 +1,163 @@
1
+ /**
2
+ * Reference Solution: Portable Text Rendering in React
3
+ *
4
+ * Demonstrates:
5
+ * - Setting up @portabletext/react with custom components
6
+ * - Handling image blocks with @sanity/image-url
7
+ * - Custom marks for links (internal + external)
8
+ * - Code block rendering with language metadata
9
+ * - TypeScript types for component props
10
+ */
11
+
12
+ import { PortableText, type PortableTextComponents } from "@portabletext/react"
13
+ import imageUrlBuilder from "@sanity/image-url"
14
+ import { createClient } from "@sanity/client"
15
+
16
+ // === Sanity Client & Image Builder ===
17
+
18
// Sanity client used only to configure the image URL builder below.
const client = createClient({
  useCdn: true,
  apiVersion: "2024-01-01",
  dataset: "production",
  projectId: "your-project-id",
})

// Image URL builder bound to the client's project and dataset.
const builder = imageUrlBuilder(client)

/** Starts an image URL builder chain for the given image value. */
function urlFor(source: SanityImageSource) {
  const imageBuilder = builder.image(source)
  return imageBuilder
}
30
+
31
+ // === Types ===
32
+
33
/** Image value: an asset reference plus optional hotspot and crop. */
interface SanityImageSource {
  _type: "image"
  asset: {
    _ref: string
    _type: "reference"
  }
  hotspot?: { x: number; y: number; height: number; width: number }
  crop?: { top: number; bottom: number; left: number; right: number }
}

/** Image block inside Portable Text: an image value plus alt text and caption. */
interface ImageBlockValue extends SanityImageSource {
  alt?: string
  caption?: string
}

/** Code block inside Portable Text. */
interface CodeBlockValue {
  _type: "code"
  code: string
  language?: string
}

/** External link annotation; `blank` requests opening in a new tab. */
interface LinkMarkValue {
  _type: "link"
  href: string
  blank?: boolean
}

/** Internal link annotation; `slug.current` is used to build the local route. */
interface InternalLinkMarkValue {
  _type: "internalLink"
  reference: { _ref: string; _type: "reference" }
  slug?: { current: string }
}

/** Props accepted by the exported PortableTextBody component. */
interface PortableTextBodyProps {
  // NOTE(review): `unknown[]` may be wider than what <PortableText> accepts
  // for `value` — confirm this type-checks under the project's TS settings.
  value: unknown[]
  className?: string
}
74
+
75
+ // === Custom Components ===
76
+
77
/**
 * Image block: builds an 800px-wide, auto-format URL through `urlFor` and
 * renders it in a <figure>, with an optional caption. Renders nothing when
 * the block has no asset.
 */
function ImageBlock({ value }: { value: ImageBlockValue }) {
  if (!value?.asset) return null

  const src = urlFor(value).width(800).auto("format").url()
  const { alt, caption } = value

  return (
    <figure>
      <img
        src={src}
        alt={alt || ""}
        loading="lazy"
        style={{ maxWidth: "100%", height: "auto" }}
      />
      {caption && <figcaption>{caption}</figcaption>}
    </figure>
  )
}
98
+
99
/**
 * Code block: preformatted code, with the language exposed through the
 * `data-language` attribute on the <pre> element.
 */
function CodeBlock({ value }: { value: CodeBlockValue }) {
  const { language, code } = value

  return (
    <pre data-language={language}>
      <code>{code}</code>
    </pre>
  )
}
107
+
108
/**
 * External link mark. When `blank` is set the link opens in a new tab and
 * carries rel="noopener noreferrer"; otherwise neither attribute is set.
 */
function LinkMark({
  value,
  children,
}: {
  value?: LinkMarkValue
  children: React.ReactNode
}) {
  const newTabAttrs = value?.blank
    ? { target: "_blank", rel: "noopener noreferrer" }
    : {}

  return (
    <a href={value?.href} {...newTabAttrs}>
      {children}
    </a>
  )
}
125
+
126
/**
 * Internal link mark. Links to `/<slug>` when the annotation carries a slug,
 * and to "#" when it does not.
 */
function InternalLinkMark({
  value,
  children,
}: {
  value?: InternalLinkMarkValue
  children: React.ReactNode
}) {
  const slug = value?.slug?.current

  return <a href={slug ? `/${slug}` : "#"}>{children}</a>
}
139
+
140
+ // === Component Map ===
141
+
142
// Maps block `_type`s and mark types to the components defined above.
const components: PortableTextComponents = {
  types: { image: ImageBlock, code: CodeBlock },
  marks: { link: LinkMark, internalLink: InternalLinkMark },
}
152
+
153
+ // === Main Component ===
154
+
155
/**
 * Renders Portable Text content inside a <div>, using the component map
 * defined in this module. Returns null when `value` is falsy.
 */
export function PortableTextBody({ value, className }: PortableTextBodyProps) {
  return value ? (
    <div className={className}>
      <PortableText value={value} components={components} />
    </div>
  ) : null
}
@@ -15,7 +15,7 @@
15
15
  * })
16
16
  *
17
17
  * @see packages/eval/src/pipeline/compiler/presets/sanity-literacy.ts
18
- * @see docs/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
18
+ * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
19
19
  */
20
20
 
21
21
  import { defineFeatures } from "@sanity/ailf-core"
package/config/models.ts CHANGED
@@ -1,10 +1,14 @@
1
1
  /**
2
2
  * models.ts — Central model registry for AILF evaluations.
3
3
  *
4
- * Define all models to test here. Each eval mode (baseline, observed, agentic)
5
- * reads this config and generates the appropriate provider entries.
4
+ * Define all models to test here. Each model declares which evaluation
5
+ * modes it participates in (e.g., "literacy", "mcp-server") and
6
+ * optionally which variants within those modes.
6
7
  *
7
- * @see docs/exec-plans/architecture-overhaul/phase-1-ts-config-loading.md
8
+ * When a model enrolls in a mode without specifying variants, all
9
+ * variants defined by the mode base are included by default.
10
+ *
11
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-1-ts-config-loading.md
8
12
  */
9
13
 
10
14
  import { defineModels } from "@sanity/ailf-core"
@@ -16,7 +20,9 @@ export default defineModels({
16
20
  id: "anthropic:messages:claude-opus-4-6",
17
21
  label: "Claude Opus 4.6",
18
22
  config: { temperature: 0.2, max_tokens: 4096 },
19
- modes: ["baseline", "observed", "agentic-naive", "agentic-optimized"],
23
+ modes: ["literacy", "mcp-server", "knowledge-probe"],
24
+ // All literacy variants included by default (baseline, observed,
25
+ // agentic-naive, agentic-optimized)
20
26
  },
21
27
 
22
28
  // ── Google ─────────────────────────────────────────────────
@@ -24,7 +30,7 @@ export default defineModels({
24
30
  // id: "google:gemini-2.5-pro",
25
31
  // label: "Gemini 2.5 Pro",
26
32
  // config: { temperature: 0.2, max_tokens: 4096 },
27
- // modes: ["baseline", "observed", "agentic-naive", "agentic-optimized"],
33
+ // modes: ["literacy"],
28
34
  // },
29
35
 
30
36
  // ── OpenAI ─────────────────────────────────────────────────
@@ -32,29 +38,39 @@ export default defineModels({
32
38
  id: "openai:chat:gpt-5.2",
33
39
  label: "GPT 5.2",
34
40
  config: { temperature: 0.2, max_tokens: 4096 },
35
- modes: ["baseline", "observed", "agentic-naive", "agentic-optimized"],
41
+ modes: ["literacy", "knowledge-probe"],
42
+ // All literacy variants included by default
36
43
  },
37
44
  {
38
- id: "openai:chat:gpt-5.4",
45
+ id: "openai:responses:gpt-5.4",
39
46
  label: "GPT 5.4",
40
47
  config: {
41
48
  reasoning_effort: "medium",
42
49
  max_output_tokens: 4096,
43
50
  maxRetries: 1,
44
51
  },
45
- modes: ["baseline", "observed", "agentic-naive", "agentic-optimized"],
52
+ timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
53
+ modes: ["literacy", "mcp-server", "knowledge-probe"],
54
+ // All literacy variants included by default
46
55
  },
47
56
 
48
57
  // ── Disabled models (uncomment to enable) ──────────────────
49
58
  // { id: "anthropic:claude-sonnet-4-20250514", label: "Claude Sonnet 4",
50
- // config: { temperature: 0.2, max_tokens: 4096 }, modes: ["baseline"] },
59
+ // config: { temperature: 0.2, max_tokens: 4096 },
60
+ // modes: ["literacy"],
61
+ // variants: { literacy: ["baseline"] } },
51
62
  // { id: "anthropic:claude-3.5-sonnet-20241022", label: "Claude 3.5 Sonnet",
52
63
  // config: { temperature: 0.2, max_tokens: 4096 },
53
- // modes: ["baseline", "agentic-naive", "agentic-optimized"] },
64
+ // modes: ["literacy"],
65
+ // variants: { literacy: ["baseline", "agentic-naive", "agentic-optimized"] } },
54
66
  // { id: "google:gemini-2.0-flash", label: "Gemini 2.0 Flash",
55
- // config: { temperature: 0.2, max_tokens: 4096 }, modes: ["baseline"] },
67
+ // config: { temperature: 0.2, max_tokens: 4096 },
68
+ // modes: ["literacy"],
69
+ // variants: { literacy: ["baseline"] } },
56
70
  // { id: "openrouter:deepseek/deepseek-r1", label: "DeepSeek R1",
57
- // config: { temperature: 0.2, max_tokens: 4096 }, modes: ["baseline"] },
71
+ // config: { temperature: 0.2, max_tokens: 4096 },
72
+ // modes: ["literacy"],
73
+ // variants: { literacy: ["baseline"] } },
58
74
  ],
59
75
 
60
76
  // ── Grading Model ──────────────────────────────────────────
@@ -65,6 +81,7 @@ export default defineModels({
65
81
  },
66
82
 
67
83
  // ── Evaluation Options ─────────────────────────────────────
84
+ evalBudgetMs: 1_200_000, // 20 min per eval mode — outer kill switch
68
85
  maxConcurrency: 32, // max parallel API calls — benchmarked in DOC-1896
69
86
 
70
87
  // ── Default Config ─────────────────────────────────────────
package/config/sources.ts CHANGED
@@ -13,7 +13,7 @@
13
13
  * ])
14
14
  *
15
15
  * @see packages/eval/src/pipeline/compiler/presets/sanity-literacy.ts
16
- * @see docs/exec-plans/dynamic-doc-sources.md
16
+ * @see docs/archive/exec-plans/dynamic-doc-sources.md
17
17
  */
18
18
 
19
19
  import { defineSources } from "@sanity/ailf-core"
@@ -6,7 +6,7 @@
6
6
  * - `npx @sanity/ailf pipeline --publish` (severity-aware sink routing)
7
7
  * - `npx @sanity/ailf pipeline --compare` (regression alerting)
8
8
  *
9
- * @see docs/exec-plans/scenario-matrix-implementation/phase-5-readiness-thresholds.md
9
+ * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-5-readiness-thresholds.md
10
10
  */
11
11
 
12
12
  import { defineThresholds } from "@sanity/ailf-core"
@@ -0,0 +1,10 @@
1
+ /**
2
+ * comparison-formatters.test.ts
3
+ *
4
+ * Verifies that formatComparisonMarkdown() and formatComparisonTable()
5
+ * dynamically derive column headers from the dimension keys present
6
+ * in the report data, rather than hardcoding literacy-specific names.
7
+ *
8
+ * Run: npx tsx --test src/__tests__/comparison-formatters.test.ts
9
+ */
10
+ export {};
@@ -0,0 +1,185 @@
1
+ /**
2
+ * comparison-formatters.test.ts
3
+ *
4
+ * Verifies that formatComparisonMarkdown() and formatComparisonTable()
5
+ * dynamically derive column headers from the dimension keys present
6
+ * in the report data, rather than hardcoding literacy-specific names.
7
+ *
8
+ * Run: npx tsx --test src/__tests__/comparison-formatters.test.ts
9
+ */
10
+ import assert from "node:assert/strict";
11
+ import { describe, it } from "node:test";
12
+ import { formatComparisonMarkdown, formatComparisonTable, } from "../services/comparison-formatters.js";
13
+ // ---------------------------------------------------------------------------
14
+ // Helpers
15
+ // ---------------------------------------------------------------------------
16
/**
 * Builds a minimal ScoreSummary stub containing only the fields the
 * formatters read. Everything except `overall.avgScore` is a fixed value.
 *
 * @param {number} avgScore - value placed in `overall.avgScore`
 * @returns {object} a fresh stub object on every call
 */
function stubSummary(avgScore) {
    const overall = {
        avgCeilingScore: 80,
        avgScore,
        avgDocLift: 10,
        avgDocQualityGap: 20,
        avgFloorScore: 30,
        negativeDocLiftCount: 0,
    };
    const summary = {
        belowCritical: [],
        lowestArea: "area-a",
        lowestScore: 40,
        overall,
        scores: [],
        timestamp: "2026-04-05T00:00:00.000Z",
    };
    return summary;
}
34
/**
 * Builds a comparison report fixture with a single area ("area-a") that
 * improved from 60 to 65.
 *
 * @param {object} overrides
 * @param {object} overrides.areaDimensions - used as the area's `dimensions`
 * @param {object} overrides.perDimension - used as `deltas.perDimension`
 * @returns {object} the report fixture
 */
function makeReport(overrides) {
    const { areaDimensions, perDimension } = overrides;
    const areaA = {
        area: "area-a",
        baseline: 60,
        experiment: 65,
        delta: 5,
        change: "improved",
        dimensions: areaDimensions,
        ceilingDelta: 0,
        docLiftDelta: 2,
        floorDelta: 0,
    };
    const deltas = {
        overall: 5,
        perArea: { "area-a": 5 },
        perDimension,
        docLift: 2,
    };
    return {
        areas: [areaA],
        baseline: stubSummary(60),
        experiment: stubSummary(65),
        deltas,
        generatedAt: "2026-04-05T00:00:00.000Z",
        improved: ["area-a"],
        regressed: [],
        unchanged: [],
        notEvaluated: [],
        mismatched: { onlyInBaseline: [], onlyInExperiment: [] },
        noiseThreshold: 2,
        noiseThresholdEmpirical: false,
    };
}
67
+ // ---------------------------------------------------------------------------
68
+ // Tests — literacy dimensions (backward compatibility)
69
+ // ---------------------------------------------------------------------------
70
describe("formatComparisonMarkdown", () => {
    /**
     * Asserts, in order, that each `[needle, message]` pair's needle occurs
     * in `output`; the message is used as the assertion failure text.
     */
    const expectIncludes = (output, expectations) => {
        for (const [needle, message] of expectations) {
            assert.ok(output.includes(needle), message);
        }
    };
    it("renders literacy dimension columns dynamically", () => {
        const areaDimensions = {
            "task-completion": { baseline: 60, experiment: 65, delta: 5 },
            "code-correctness": { baseline: 50, experiment: 55, delta: 5 },
            "doc-coverage": { baseline: 40, experiment: 42, delta: 2 },
        };
        const perDimension = {
            "task-completion": 5,
            "code-correctness": 5,
            "doc-coverage": 2,
        };
        const md = formatComparisonMarkdown(makeReport({ areaDimensions, perDimension }));
        expectIncludes(md, [
            // Column headers should be title-cased from kebab-case
            ["Task Completion", "should have Task Completion column header"],
            ["Code Correctness", "should have Code Correctness column header"],
            ["Doc Coverage", "should have Doc Coverage column header"],
            // Per-dimension averages section should also show dynamic labels
            ["| Task Completion |", "dimension averages should include Task Completion"],
            ["| Code Correctness |", "dimension averages should include Code Correctness"],
            ["| Doc Coverage |", "dimension averages should include Doc Coverage"],
        ]);
    });
    it("renders MCP dimension columns dynamically", () => {
        const areaDimensions = {
            "input-validation": { baseline: 50, experiment: 60, delta: 10 },
            "output-correctness": { baseline: 70, experiment: 75, delta: 5 },
            "error-handling": { baseline: 40, experiment: 45, delta: 5 },
            security: { baseline: 80, experiment: 82, delta: 2 },
        };
        const perDimension = {
            "input-validation": 10,
            "output-correctness": 5,
            "error-handling": 5,
            security: 2,
        };
        const md = formatComparisonMarkdown(makeReport({ areaDimensions, perDimension }));
        expectIncludes(md, [
            // 4 MCP columns instead of 3 literacy columns
            ["Input Validation", "should have Input Validation column"],
            ["Output Correctness", "should have Output Correctness column"],
            ["Error Handling", "should have Error Handling column"],
            ["Security", "should have Security column"],
            // Per-dimension averages
            ["| Input Validation |", "dimension averages should include Input Validation"],
            ["| Security |", "dimension averages should include Security"],
        ]);
    });
});
120
// Tests for formatComparisonTable(): dimension labels must be derived from
// the dimension keys present in the report, for both the dimension-averages
// section and the per-area table.
describe("formatComparisonTable", () => {
    it("renders literacy dimension columns dynamically", () => {
        const report = makeReport({
            areaDimensions: {
                "task-completion": { baseline: 60, experiment: 65, delta: 5 },
                "code-correctness": { baseline: 50, experiment: 55, delta: 5 },
                "doc-coverage": { baseline: 40, experiment: 42, delta: 2 },
            },
            perDimension: {
                "task-completion": 5,
                "code-correctness": 5,
                "doc-coverage": 2,
            },
        });
        const table = formatComparisonTable(report);
        // Dimension averages section
        assert.ok(table.includes("Task Completion:"), "should show Task Completion in dimension averages");
        assert.ok(table.includes("Code Correctness:"), "should show Code Correctness in dimension averages");
        assert.ok(table.includes("Doc Coverage:"), "should show Doc Coverage in dimension averages");
        // Per-area table header
        assert.ok(table.includes("Task Completion"), "per-area table should have Task Completion header");
        assert.ok(table.includes("Code Correctness"), "per-area table should have Code Correctness header");
        assert.ok(table.includes("Doc Coverage"), "per-area table should have Doc Coverage header");
    });
    it("renders MCP dimension columns dynamically", () => {
        const report = makeReport({
            areaDimensions: {
                "input-validation": { baseline: 50, experiment: 60, delta: 10 },
                "output-correctness": { baseline: 70, experiment: 75, delta: 5 },
                "error-handling": { baseline: 40, experiment: 45, delta: 5 },
                security: { baseline: 80, experiment: 82, delta: 2 },
            },
            perDimension: {
                "input-validation": 10,
                "output-correctness": 5,
                "error-handling": 5,
                security: 2,
            },
        });
        const table = formatComparisonTable(report);
        // 4 MCP columns in the per-area table
        assert.ok(table.includes("Input Validation"), "should have Input Validation");
        assert.ok(table.includes("Output Correctness"), "should have Output Correctness");
        assert.ok(table.includes("Error Handling"), "should have Error Handling");
        assert.ok(table.includes("Security"), "should have Security");
        // Should NOT have literacy dimension headers
        assert.ok(!table.includes("Task Completion"), "should not contain Task Completion");
        assert.ok(!table.includes("Doc Coverage"), "should not contain Doc Coverage");
    });
    it("includes delta values for each dimension in the per-area rows", () => {
        // The per-area dimension deltas (12 and 7) are deliberately different
        // from every other delta in the fixture: overall and area delta are 5,
        // doc lift is 2, and the per-dimension averages below are 3 and 4.
        // With the previous values (+10 / +5) the assertions could be satisfied
        // by the overall delta or the averages section alone.
        // NOTE(review): assumes the formatter prints positive deltas with a
        // leading "+", as the original assertions did — confirm against
        // comparison-formatters.
        const report = makeReport({
            areaDimensions: {
                "input-validation": { baseline: 50, experiment: 62, delta: 12 },
                "output-correctness": { baseline: 70, experiment: 77, delta: 7 },
            },
            perDimension: {
                "input-validation": 3,
                "output-correctness": 4,
            },
        });
        const table = formatComparisonTable(report);
        // The per-area row should include the delta values (+12 and +7)
        assert.ok(table.includes("+12"), "should show +12 delta for area-a");
        assert.ok(table.includes("+7"), "should show +7 delta for area-a");
    });
});