@sanity/ailf 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (442)
  1. package/canonical/grader-references/README.md +2 -2
  2. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  3. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  4. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  5. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  6. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  7. package/config/features.ts +1 -1
  8. package/config/models.ts +28 -23
  9. package/config/sources.ts +1 -1
  10. package/config/thresholds.ts +1 -1
  11. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  13. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  17. package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
  18. package/dist/_vendor/ailf-core/config-helpers.js +29 -0
  19. package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
  20. package/dist/_vendor/ailf-core/examples/index.js +208 -114
  21. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  22. package/dist/_vendor/ailf-core/index.js +1 -0
  23. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  25. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  27. package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
  28. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  29. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  30. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  31. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  32. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  33. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
  34. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
  35. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  36. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  37. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  38. package/dist/_vendor/ailf-core/services/index.js +1 -1
  39. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  40. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
  41. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  42. package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
  43. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
  44. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  45. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  46. package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
  47. package/dist/_vendor/ailf-tasks/cli.js +61 -0
  48. package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
  49. package/dist/_vendor/ailf-tasks/index.js +16 -0
  50. package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
  51. package/dist/_vendor/ailf-tasks/parser.js +73 -0
  52. package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
  53. package/dist/_vendor/ailf-tasks/schemas.js +180 -0
  54. package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
  55. package/dist/_vendor/ailf-tasks/validation.js +162 -0
  56. package/dist/adapters/api-client/remediation.js +2 -2
  57. package/dist/adapters/config-sources/file-config-adapter.js +6 -1
  58. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  59. package/dist/adapters/index.d.ts +0 -1
  60. package/dist/adapters/index.js +0 -1
  61. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  62. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  63. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  64. package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
  65. package/dist/adapters/task-sources/index.d.ts +1 -2
  66. package/dist/adapters/task-sources/index.js +1 -2
  67. package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
  68. package/dist/adapters/task-sources/repo-schemas.js +2 -2
  69. package/dist/adapters/task-sources/repo-task-source.js +1 -1
  70. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  71. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
  73. package/dist/adapters/task-sources/task-file-loader.js +20 -6
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/explain-handler.d.ts +1 -1
  95. package/dist/commands/explain-handler.js +37 -8
  96. package/dist/commands/fetch-docs.js +1 -0
  97. package/dist/commands/generate-configs.d.ts +3 -3
  98. package/dist/commands/generate-configs.js +20 -8
  99. package/dist/commands/init.d.ts +2 -3
  100. package/dist/commands/init.js +56 -170
  101. package/dist/commands/pipeline-action.d.ts +7 -1
  102. package/dist/commands/pipeline-action.js +43 -19
  103. package/dist/commands/pipeline.d.ts +6 -1
  104. package/dist/commands/pipeline.js +7 -2
  105. package/dist/commands/pr-comment.js +1 -0
  106. package/dist/commands/publish.js +1 -0
  107. package/dist/commands/shared/help.js +2 -2
  108. package/dist/commands/update-quality-scores.d.ts +5 -0
  109. package/dist/commands/update-quality-scores.js +20 -0
  110. package/dist/composition-root.d.ts +2 -3
  111. package/dist/composition-root.js +27 -14
  112. package/dist/config/features.ts +23 -0
  113. package/dist/config/models.ts +100 -0
  114. package/dist/config/prompts.ts +16 -0
  115. package/dist/config/rubrics.ts +225 -0
  116. package/dist/config/schedules.ts +47 -0
  117. package/dist/config/sinks.ts +37 -0
  118. package/dist/config/sources.ts +21 -0
  119. package/dist/config/thresholds.ts +61 -0
  120. package/dist/lib/agent-behavior-report.d.ts +8 -0
  121. package/dist/lib/agent-behavior-report.js +185 -0
  122. package/dist/lib/baseline.d.ts +19 -0
  123. package/dist/lib/baseline.js +153 -0
  124. package/dist/lib/calculate-scores.d.ts +23 -0
  125. package/dist/lib/calculate-scores.js +42 -0
  126. package/dist/lib/compare.d.ts +18 -0
  127. package/dist/lib/compare.js +170 -0
  128. package/dist/lib/coverage-audit.d.ts +4 -0
  129. package/dist/lib/coverage-audit.js +42 -0
  130. package/dist/lib/discovery-report.d.ts +13 -0
  131. package/dist/lib/discovery-report.js +57 -0
  132. package/dist/lib/fetch-docs.d.ts +30 -0
  133. package/dist/lib/fetch-docs.js +171 -0
  134. package/dist/lib/generate-configs.d.ts +25 -0
  135. package/dist/lib/generate-configs.js +42 -0
  136. package/dist/lib/grader-api.d.ts +21 -0
  137. package/dist/lib/grader-api.js +34 -0
  138. package/dist/lib/grader-compare.d.ts +19 -0
  139. package/dist/lib/grader-compare.js +91 -0
  140. package/dist/lib/grader-consistency.d.ts +27 -0
  141. package/dist/lib/grader-consistency.js +79 -0
  142. package/dist/lib/grader-sensitivity.d.ts +19 -0
  143. package/dist/lib/grader-sensitivity.js +75 -0
  144. package/dist/lib/grader-validate.d.ts +19 -0
  145. package/dist/lib/grader-validate.js +78 -0
  146. package/dist/lib/measure-retrieval.d.ts +14 -0
  147. package/dist/lib/measure-retrieval.js +71 -0
  148. package/dist/lib/pr-comment.d.ts +16 -0
  149. package/dist/lib/pr-comment.js +28 -0
  150. package/dist/lib/readiness-report.d.ts +13 -0
  151. package/dist/lib/readiness-report.js +108 -0
  152. package/dist/lib/webhook-server.d.ts +11 -0
  153. package/dist/lib/webhook-server.js +24 -0
  154. package/dist/lib/weekly-digest.d.ts +24 -0
  155. package/dist/lib/weekly-digest.js +148 -0
  156. package/dist/orchestration/build-app-context.js +13 -0
  157. package/dist/orchestration/cache-context.d.ts +23 -0
  158. package/dist/orchestration/cache-context.js +43 -0
  159. package/dist/orchestration/env-bridge.d.ts +21 -0
  160. package/dist/orchestration/env-bridge.js +66 -0
  161. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  162. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  163. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  164. package/dist/orchestration/step-runner.js +5 -1
  165. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  166. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  167. package/dist/orchestration/steps/callback-step.js +10 -1
  168. package/dist/orchestration/steps/compare-step.js +6 -3
  169. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  170. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  171. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  172. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  173. package/dist/orchestration/steps/fetch-docs-step.js +30 -16
  174. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  175. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  176. package/dist/orchestration/steps/generate-configs-step.js +50 -15
  177. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  178. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  179. package/dist/orchestration/steps/publish-report-step.js +19 -0
  180. package/dist/orchestration/steps/readiness-step.js +8 -3
  181. package/dist/orchestration/steps/report-step.js +17 -4
  182. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  183. package/dist/orchestration/steps/run-eval-step.js +51 -31
  184. package/dist/pipeline/agent-behavior-report.js +6 -0
  185. package/dist/pipeline/attribution.d.ts +1 -1
  186. package/dist/pipeline/attribution.js +1 -1
  187. package/dist/pipeline/cache.js +29 -15
  188. package/dist/pipeline/calculate-scores.d.ts +2 -0
  189. package/dist/pipeline/calculate-scores.js +70 -33
  190. package/dist/pipeline/chronic-failures.d.ts +55 -0
  191. package/dist/pipeline/chronic-failures.js +110 -0
  192. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
  193. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  194. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  195. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  196. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  197. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  198. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  199. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  200. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  201. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  202. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  203. package/dist/pipeline/compiler/config-loader.js +42 -2
  204. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  205. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  206. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  207. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  208. package/dist/pipeline/compiler/index.d.ts +2 -5
  209. package/dist/pipeline/compiler/index.js +2 -5
  210. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  211. package/dist/pipeline/compiler/literacy-bridge.js +1 -1
  212. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
  213. package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
  214. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
  215. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
  216. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
  217. package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
  218. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
  219. package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
  220. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
  221. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
  222. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  223. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  224. package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
  225. package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
  226. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
  227. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
  228. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  229. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  230. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
  231. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
  232. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  233. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  234. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  235. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
  236. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
  237. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
  238. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
  239. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
  240. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
  241. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
  242. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
  243. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
  244. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
  245. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
  246. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
  247. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
  248. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  249. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  250. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
  251. package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
  252. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  253. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  254. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  255. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  256. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  257. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  258. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  259. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  260. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  261. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  262. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  263. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  264. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  265. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  266. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  267. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  268. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  269. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  270. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  271. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  272. package/dist/pipeline/compiler/task-bridge.js +92 -0
  273. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  274. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  275. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  276. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  277. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  278. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  279. package/dist/pipeline/coverage-audit.d.ts +1 -1
  280. package/dist/pipeline/coverage-audit.js +1 -1
  281. package/dist/pipeline/degradations.d.ts +1 -1
  282. package/dist/pipeline/degradations.js +1 -1
  283. package/dist/pipeline/failure-modes.d.ts +1 -1
  284. package/dist/pipeline/failure-modes.js +13 -1
  285. package/dist/pipeline/gap-analysis.d.ts +1 -1
  286. package/dist/pipeline/gap-analysis.js +3 -1
  287. package/dist/pipeline/generate-configs.d.ts +2 -2
  288. package/dist/pipeline/generate-configs.js +15 -8
  289. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  290. package/dist/pipeline/grader-compare-runner.js +7 -1
  291. package/dist/pipeline/grader-comparison.d.ts +1 -1
  292. package/dist/pipeline/grader-comparison.js +1 -1
  293. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  294. package/dist/pipeline/grader-consistency-runner.js +7 -1
  295. package/dist/pipeline/grader-consistency.d.ts +1 -1
  296. package/dist/pipeline/grader-consistency.js +1 -1
  297. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  298. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  299. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  300. package/dist/pipeline/grader-sensitivity.js +1 -1
  301. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  302. package/dist/pipeline/grader-validate-runner.js +2 -2
  303. package/dist/pipeline/grader-validation.d.ts +1 -1
  304. package/dist/pipeline/grader-validation.js +1 -1
  305. package/dist/pipeline/map-request-to-config.js +15 -2
  306. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  307. package/dist/pipeline/mirror-repo-tasks.js +1 -1
  308. package/dist/pipeline/plan-format.d.ts +1 -1
  309. package/dist/pipeline/plan-format.js +1 -1
  310. package/dist/pipeline/plan.d.ts +1 -1
  311. package/dist/pipeline/plan.js +67 -29
  312. package/dist/pipeline/probe.d.ts +1 -1
  313. package/dist/pipeline/probe.js +1 -1
  314. package/dist/pipeline/readiness-report.d.ts +2 -2
  315. package/dist/pipeline/readiness-report.js +2 -2
  316. package/dist/pipeline/release-classification.d.ts +1 -1
  317. package/dist/pipeline/release-classification.js +1 -1
  318. package/dist/pipeline/release-report.d.ts +1 -1
  319. package/dist/pipeline/release-report.js +1 -1
  320. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  321. package/dist/pipeline/repo-eval-comment.js +1 -1
  322. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  323. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  324. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  325. package/dist/pipeline/resolve-mappings.js +44 -44
  326. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  327. package/dist/pipeline/retrieval-metrics.js +28 -20
  328. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  329. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  330. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  331. package/dist/pipeline/steps/compare-step.js +90 -0
  332. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  333. package/dist/pipeline/steps/eval-step.js +347 -0
  334. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  335. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  336. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  337. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  338. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  339. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  340. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  341. package/dist/pipeline/steps/publish-report-step.js +243 -0
  342. package/dist/pipeline/steps/report-step.d.ts +13 -0
  343. package/dist/pipeline/steps/report-step.js +56 -0
  344. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  345. package/dist/pipeline/steps/update-scores-step.js +42 -0
  346. package/dist/pipeline/targeted-loo.d.ts +1 -1
  347. package/dist/pipeline/targeted-loo.js +1 -1
  348. package/dist/pipeline/thresholds.d.ts +1 -1
  349. package/dist/pipeline/thresholds.js +1 -1
  350. package/dist/pipeline/validate.js +13 -0
  351. package/dist/report-store.d.ts +17 -0
  352. package/dist/report-store.js +24 -0
  353. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  354. package/dist/scripts/agent-behavior-report.js +315 -0
  355. package/dist/scripts/baseline.d.ts +43 -0
  356. package/dist/scripts/baseline.js +267 -0
  357. package/dist/scripts/calculate-scores.d.ts +166 -0
  358. package/dist/scripts/calculate-scores.js +1296 -0
  359. package/dist/scripts/compare.d.ts +22 -0
  360. package/dist/scripts/compare.js +334 -0
  361. package/dist/scripts/coverage-audit.d.ts +44 -0
  362. package/dist/scripts/coverage-audit.js +209 -0
  363. package/dist/scripts/debug-eval.d.ts +19 -0
  364. package/dist/scripts/debug-eval.js +73 -0
  365. package/dist/scripts/discovery-report.d.ts +58 -0
  366. package/dist/scripts/discovery-report.js +250 -0
  367. package/dist/scripts/fetch-docs.d.ts +35 -0
  368. package/dist/scripts/fetch-docs.js +472 -0
  369. package/dist/scripts/generate-configs.d.ts +66 -0
  370. package/dist/scripts/generate-configs.js +459 -0
  371. package/dist/scripts/grader-api.d.ts +27 -0
  372. package/dist/scripts/grader-api.js +206 -0
  373. package/dist/scripts/grader-compare.d.ts +22 -0
  374. package/dist/scripts/grader-compare.js +368 -0
  375. package/dist/scripts/grader-consistency.d.ts +20 -0
  376. package/dist/scripts/grader-consistency.js +313 -0
  377. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  378. package/dist/scripts/grader-sensitivity.js +354 -0
  379. package/dist/scripts/grader-validate.d.ts +19 -0
  380. package/dist/scripts/grader-validate.js +267 -0
  381. package/dist/scripts/measure-retrieval.d.ts +10 -0
  382. package/dist/scripts/measure-retrieval.js +145 -0
  383. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  384. package/dist/scripts/migrate-task-mode.js +1 -1
  385. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  386. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  387. package/dist/scripts/pipeline.d.ts +76 -0
  388. package/dist/scripts/pipeline.js +1031 -0
  389. package/dist/scripts/pr-comment.d.ts +10 -0
  390. package/dist/scripts/pr-comment.js +510 -0
  391. package/dist/scripts/readiness-report.d.ts +88 -0
  392. package/dist/scripts/readiness-report.js +342 -0
  393. package/dist/scripts/update-quality-scores.d.ts +15 -0
  394. package/dist/scripts/update-quality-scores.js +184 -0
  395. package/dist/scripts/validate-task-sources.d.ts +1 -1
  396. package/dist/scripts/validate-task-sources.js +1 -1
  397. package/dist/scripts/validate.d.ts +13 -0
  398. package/dist/scripts/validate.js +79 -0
  399. package/dist/scripts/webhook-server.d.ts +26 -0
  400. package/dist/scripts/webhook-server.js +147 -0
  401. package/dist/scripts/weekly-digest.d.ts +24 -0
  402. package/dist/scripts/weekly-digest.js +144 -0
  403. package/dist/sinks/format-slack.d.ts +64 -0
  404. package/dist/sinks/format-slack.js +306 -0
  405. package/dist/sinks/slack-sink.d.ts +27 -0
  406. package/dist/sinks/slack-sink.js +78 -0
  407. package/dist/sinks/types.d.ts +1 -1
  408. package/dist/sinks/types.js +1 -1
  409. package/dist/sinks/webhook-sink.d.ts +19 -0
  410. package/dist/sinks/webhook-sink.js +50 -0
  411. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  412. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  413. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  414. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  415. package/dist/tasks/literacy/functions.task.ts +70 -0
  416. package/dist/tasks/literacy/groq.task.ts +259 -0
  417. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  418. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  419. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  420. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  421. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  422. package/package.json +24 -24
  423. package/tasks/.expanded.agentic.yaml +280 -0
  424. package/tasks/.expanded.yaml +565 -0
  425. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  426. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  427. package/tasks/literacy/content-lake.task.ts +181 -0
  428. package/tasks/literacy/frameworks.task.ts +1 -0
  429. package/tasks/literacy/functions.task.ts +1 -0
  430. package/tasks/literacy/groq.task.ts +1 -0
  431. package/tasks/literacy/image-handling.task.ts +95 -0
  432. package/tasks/literacy/nextjs-live.task.ts +2 -1
  433. package/tasks/literacy/portable-text.task.ts +169 -0
  434. package/tasks/literacy/studio-setup.task.ts +5 -2
  435. package/tasks/literacy/visual-editing.task.ts +1 -0
  436. package/LICENSE +0 -21
  437. package/tasks/frameworks.yaml +0 -98
  438. package/tasks/functions.yaml +0 -51
  439. package/tasks/groq.yaml +0 -216
  440. package/tasks/nextjs-live.yaml +0 -62
  441. package/tasks/studio-setup.yaml +0 -111
  442. package/tasks/visual-editing.yaml +0 -120
@@ -0,0 +1,19 @@
1
+ /**
2
+ * grader-validate.ts
3
+ *
4
+ * CLI script for validating grader accuracy against human reference grades
5
+ * (Phase 2 of grader reliability).
6
+ *
7
+ * Loads human-graded reference samples from canonical/grader-references/,
8
+ * runs the grader model on each sample, and compares against human scores.
9
+ *
10
+ * Usage:
11
+ * pnpm grader-validate # validate with default grader
12
+ * pnpm grader-validate --grader openai:gpt-5.5 # validate a candidate grader
13
+ * pnpm grader-validate --threshold 15 # custom MAE threshold
14
+ *
15
+ * Reads: canonical/grader-references/*.yaml
16
+ * Reads: config/models.yaml (for default grader model)
17
+ * Writes: results/latest/grader-validation.json
18
+ */
19
+ import "dotenv/config";
@@ -0,0 +1,267 @@
1
+ /**
2
+ * grader-validate.ts
3
+ *
4
+ * CLI script for validating grader accuracy against human reference grades
5
+ * (Phase 2 of grader reliability).
6
+ *
7
+ * Loads human-graded reference samples from canonical/grader-references/,
8
+ * runs the grader model on each sample, and compares against human scores.
9
+ *
10
+ * Usage:
11
+ * pnpm grader-validate # validate with default grader
12
+ * pnpm grader-validate --grader openai:gpt-5.5 # validate a candidate grader
13
+ * pnpm grader-validate --threshold 15 # custom MAE threshold
14
+ *
15
+ * Reads: canonical/grader-references/*.yaml
16
+ * Reads: config/models.yaml (for default grader model)
17
+ * Writes: results/latest/grader-validation.json
18
+ */
19
+ // oxlint-disable-next-line import/no-unassigned-import -- side-effect: loads .env into process.env
20
+ import "dotenv/config";
21
+ import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
22
+ import { dirname, join, resolve } from "path";
23
+ import { fileURLToPath } from "url";
24
+ import { load } from "js-yaml";
25
+ import { classifyCorrelation, validateGrader, } from "../pipeline/grader-validation.js";
26
+ import { gradeOnce, loadGraderModel } from "./grader-api.js";
27
+ const __dirname = dirname(fileURLToPath(import.meta.url));
28
+ const ROOT = resolve(__dirname, "..", "..");
29
+ // ---------------------------------------------------------------------------
30
+ // CLI argument parsing
31
+ // ---------------------------------------------------------------------------
32
/** Raw CLI arguments, without the node binary and script path. */
const args = process.argv.slice(2);
/** MAE threshold applied when --threshold is not supplied. */
const DEFAULT_MAE_THRESHOLD = 10;
/**
 * Report whether the long flag `--<name>` is present on the command line.
 *
 * @param {string} name - Flag name without the leading dashes.
 * @returns {boolean} True when `--<name>` appears in the arguments.
 */
function getFlag(name) {
    return args.includes(`--${name}`);
}
/**
 * Return the argument that follows `--<name>`.
 *
 * @param {string} name - Option name without the leading dashes.
 * @returns {string | undefined} The next argument, or undefined when the
 *   option is absent or is the last argument.
 */
function getOption(name) {
    const idx = args.indexOf(`--${name}`);
    return idx !== -1 && idx + 1 < args.length ? args[idx + 1] : undefined;
}
/**
 * Report whether help was requested. Accepts `--help`, the short form `-h`
 * advertised in the help text, and `--h` (kept for backward compatibility).
 *
 * @returns {boolean}
 */
function wantsHelp() {
    return getFlag("help") || getFlag("h") || args.includes("-h");
}
/**
 * Convert the raw `--threshold` value into a number.
 *
 * @param {string | undefined} raw - Value supplied on the command line.
 * @returns {number} The default when no value was given, the parsed value
 *   when it is a finite non-negative number, and NaN otherwise.
 */
function parseThreshold(raw) {
    if (!raw) {
        return DEFAULT_MAE_THRESHOLD;
    }
    const value = Number.parseFloat(raw);
    return Number.isFinite(value) && value >= 0 ? value : Number.NaN;
}
const graderOverride = getOption("grader");
const thresholdStr = getOption("threshold");
const maeThreshold = parseThreshold(thresholdStr);
const showHelp = wantsHelp();
if (showHelp) {
    console.log(`
Usage: pnpm grader-validate [options]

Validate grader accuracy against human reference grades.

Options:
--grader <model> Grader model to validate (default: from config/models.yaml)
--threshold <n> MAE threshold for pass/fail (default: 10)
--help, -h Show this help

Examples:
pnpm grader-validate # validate current grader
pnpm grader-validate --grader openai:gpt-5.5 # test a candidate
pnpm grader-validate --threshold 15 # lenient threshold
`);
    process.exit(0);
}
// Checked after the help branch so `--help` still works next to a bad value.
// A NaN threshold would otherwise make every comparison against it false.
if (Number.isNaN(maeThreshold)) {
    console.error(`❌ Invalid --threshold value '${thresholdStr}': expected a non-negative number`);
    process.exit(1);
}
62
+ // ---------------------------------------------------------------------------
63
+ // Load reference grades
64
+ // ---------------------------------------------------------------------------
65
+ // ---------------------------------------------------------------------------
66
+ // Dimension mapping
67
+ // ---------------------------------------------------------------------------
68
/**
 * Load every human reference grade from canonical/grader-references/.
 *
 * Each `.yaml` / `.yml` file may hold either one reference object or an
 * array of them; files are processed in sorted name order. Documents that
 * parse to anything else (for example an empty file) are ignored.
 *
 * Exits the process with code 1 when the directory is missing, holds no
 * YAML files, a file cannot be read or parsed, or an entry lacks the
 * `rubrics` array that the caller iterates over.
 *
 * @returns {object[]} All reference entries from all files, in file order.
 */
function loadReferenceGrades() {
    const refsDir = join(ROOT, "canonical", "grader-references");
    if (!existsSync(refsDir)) {
        console.error(`❌ Reference grades directory not found: ${refsDir}`);
        console.error("Create canonical/grader-references/ with YAML reference files.");
        console.error("See docs/exec-plans/completed/grader-reliability.md — Phase 2.");
        process.exit(1);
    }
    const files = readdirSync(refsDir)
        .filter((f) => f.endsWith(".yaml") || f.endsWith(".yml"))
        .sort();
    if (files.length === 0) {
        console.error(`❌ No YAML files found in ${refsDir}`);
        process.exit(1);
    }
    const allGrades = [];
    for (const file of files) {
        const filePath = join(refsDir, file);
        let parsed;
        try {
            parsed = load(readFileSync(filePath, "utf-8"));
        }
        catch (err) {
            // Name the offending file; a bare YAML error does not say which one.
            const reason = err instanceof Error ? err.message : String(err);
            console.error(`❌ Failed to read or parse ${filePath}: ${reason}`);
            process.exit(1);
        }
        let entries;
        if (Array.isArray(parsed)) {
            entries = parsed;
        }
        else if (typeof parsed === "object" && parsed !== null) {
            entries = [parsed];
        }
        else {
            continue;
        }
        for (const entry of entries) {
            // Fail here with context instead of a TypeError on `.rubrics` later.
            if (typeof entry !== "object" || entry === null || !Array.isArray(entry.rubrics)) {
                console.error(`❌ Invalid reference grade in ${filePath}: each entry needs a 'rubrics' array.`);
                process.exit(1);
            }
            allGrades.push(entry);
        }
    }
    return allGrades;
}
97
+ // ---------------------------------------------------------------------------
98
+ // Main: grade every reference sample and validate the grader
99
+ // ---------------------------------------------------------------------------
100
/**
 * Run the grader over every human-graded reference sample, compare the
 * scores with `validateGrader`, print the report, and write the result to
 * results/latest/grader-validation.json.
 *
 * Rubrics whose dimension is not recognised are skipped with a warning and
 * are not counted in the judgment total, progress counter, or cost estimate.
 * Grading calls that return null are counted as failed and left out of the
 * analysis. Exits with code 1 when nothing could be graded or when the
 * result does not pass the MAE threshold.
 *
 * @returns {Promise<void>}
 */
async function main() {
    console.log("=== Grader Validation ===\n");
    // Resolve grader model
    const graderModel = graderOverride ?? loadGraderModel().id;
    console.log(` Grader: ${graderModel}`);
    console.log(` Threshold: MAE < ${maeThreshold}`);
    // Load reference grades
    const rawGrades = loadReferenceGrades();
    console.log(` Samples: ${rawGrades.length} reference-graded responses`);
    // Resolve the gradable (response × rubric) pairs up front so the total
    // used for progress and cost matches the number of grader calls made.
    const judgments = [];
    for (const ref of rawGrades) {
        for (const rubric of ref.rubrics) {
            const dimension = mapDimension(rubric.dimension);
            if (!dimension) {
                console.error(` ⚠ Unknown dimension '${rubric.dimension}' — skipping`);
                continue;
            }
            judgments.push({ dimension, ref, rubric });
        }
    }
    const totalJudgments = judgments.length;
    console.log(` Judgments: ${totalJudgments} (response × rubric pairs)`);
    const estimatedCost = totalJudgments * 0.005;
    console.log(` Est. cost: ~$${estimatedCost.toFixed(2)}`);
    console.log();
    // Grade each reference sample, one call at a time
    console.log(" Running grader on reference samples...");
    const grades = [];
    let completed = 0;
    let failed = 0;
    for (const { dimension, ref, rubric } of judgments) {
        const graderScore = await gradeOnce(graderModel, ref.response, rubric.rubricText);
        completed++;
        if (completed % 5 === 0 || completed === totalJudgments) {
            process.stdout.write(`\r Progress: ${completed}/${totalJudgments}`);
        }
        if (graderScore === null) {
            failed++;
            continue;
        }
        grades.push({
            area: ref.area,
            dimension,
            graderScore,
            humanScore: rubric.humanScore,
            taskId: ref.taskId,
            ...(rubric.notes && { notes: rubric.notes }),
        });
    }
    console.log(); // newline after progress
    if (failed > 0) {
        console.log(` ⚠ ${failed} grading calls failed (excluded from analysis)`);
    }
    console.log();
    if (grades.length === 0) {
        console.error("❌ No grades to analyze.");
        process.exit(1);
    }
    // Validate
    const result = validateGrader(grades, graderModel, { maeThreshold });
    // Print report
    printReport(result);
    // Write output
    const outDir = join(ROOT, "results", "latest");
    mkdirSync(outDir, { recursive: true });
    const outPath = join(outDir, "grader-validation.json");
    writeFileSync(outPath, JSON.stringify(result, null, 2));
    console.log(`\n 📄 Results written to ${outPath}`);
    // Exit with error code if threshold not met
    if (!result.passesThreshold) {
        console.error(`\n ❌ VALIDATION FAILED: MAE ${result.overallMae} exceeds threshold ${maeThreshold}`);
        process.exit(1);
    }
}
174
+ // ---------------------------------------------------------------------------
175
+ // Dimension mapping
176
+ // ---------------------------------------------------------------------------
177
/** Kebab-case rubric dimension names paired with their camelCase report keys. */
const DIMENSION_KEYS = new Map([
    ["code-correctness", "codeCorrectness"],
    ["doc-coverage", "docCoverage"],
    ["task-completion", "taskCompletion"],
]);
/**
 * Translate a rubric dimension name into the key used in validation results.
 *
 * @param {string} dim - Dimension name as written in the reference file.
 * @returns {string | null} The camelCase key, or null for any other value.
 */
function mapDimension(dim) {
    const key = DIMENSION_KEYS.get(dim);
    return key === undefined ? null : key;
}
189
+ // ---------------------------------------------------------------------------
190
+ // Report formatting
191
+ // ---------------------------------------------------------------------------
192
/**
 * Print the grader validation report to stdout: overall metrics, a
 * per-dimension table, the pass/fail verdict, and up to five of the largest
 * human/grader disagreements.
 *
 * @param {object} result - Validation result. Reads graderModel,
 *   totalObservations, overallMae, overallCorrelation, overallBias,
 *   perDimension.{taskCompletion,codeCorrectness,docCoverage}
 *   (each with mae, correlation, bias, count), passesThreshold,
 *   maeThreshold, and largestDisagreements.
 * @returns {void}
 */
function printReport(result) {
    const WIDTH = 80;
    // Print a titled section framed by rules made of `ch`.
    const banner = (ch, title) => {
        console.log(ch.repeat(WIDTH));
        console.log(title);
        console.log(ch.repeat(WIDTH));
        console.log();
    };
    banner("=", " GRADER VALIDATION REPORT");
    console.log(` Grader: ${result.graderModel}`);
    console.log(` Observations: ${result.totalObservations}`);
    console.log();
    // Overall metrics
    banner("-", "OVERALL METRICS");
    const bias = result.overallBias;
    const biasSign = bias > 0 ? "+" : "";
    let biasLabel = "no bias";
    if (bias > 0) {
        biasLabel = "grader scores higher";
    }
    else if (bias < 0) {
        biasLabel = "grader scores lower";
    }
    console.log(` MAE: ${result.overallMae} points`);
    console.log(` Correlation: r=${result.overallCorrelation} (${classifyCorrelation(result.overallCorrelation)})`);
    console.log(` Bias: ${biasSign}${bias} (${biasLabel})`);
    console.log();
    // Per-dimension table
    banner("-", "PER-DIMENSION VALIDITY");
    // Pad header cells to the same widths as the data rows below so the
    // columns line up with the separator.
    const h = `| ${"Dimension".padEnd(16)} | ${"MAE".padEnd(5)} | ${"Correlation".padEnd(11)} | ${"Quality".padEnd(9)} | ${"Bias".padEnd(6)} | ${"Count".padEnd(5)} |`;
    const sep = "|------------------|-------|-------------|-----------|--------|-------|";
    console.log(h);
    console.log(sep);
    const dims = [
        { data: result.perDimension.taskCompletion, name: "Task Completion" },
        { data: result.perDimension.codeCorrectness, name: "Code Correctness" },
        { data: result.perDimension.docCoverage, name: "Doc Coverage" },
    ];
    for (const { data, name } of dims) {
        const quality = classifyCorrelation(data.correlation);
        const biasStr = data.bias > 0 ? `+${data.bias}` : `${data.bias}`;
        console.log(`| ${name.padEnd(16)} | ${String(data.mae).padStart(5)} | r=${String(data.correlation).padStart(9)} | ${quality.padEnd(9)} | ${biasStr.padStart(6)} | ${String(data.count).padStart(5)} |`);
    }
    console.log();
    // Pass/fail verdict
    banner("-", "VERDICT");
    if (result.passesThreshold) {
        console.log(` ✅ PASSED: MAE ${result.overallMae} < threshold ${result.maeThreshold}`);
    }
    else {
        console.log(` ❌ FAILED: MAE ${result.overallMae} >= threshold ${result.maeThreshold}`);
    }
    console.log();
    // Largest disagreements
    const topN = Math.min(5, result.largestDisagreements.length);
    if (topN > 0) {
        banner("-", `TOP ${topN} LARGEST DISAGREEMENTS`);
        for (let i = 0; i < topN; i++) {
            const d = result.largestDisagreements[i];
            const sign = d.signedError > 0 ? "+" : "";
            console.log(` ${i + 1}. ${d.taskId} — ${d.dimension}`);
            console.log(` Human=${d.humanScore}, Grader=${d.graderScore} (${sign}${d.signedError})`);
            if (d.notes) {
                console.log(` Note: ${d.notes}`);
            }
        }
        console.log();
    }
}
260
+ // Only run when invoked directly
261
// Start the CLI only when this file is the script node was asked to run.
{
    const entryScript = process.argv[1];
    const isTsEntry = entryScript?.endsWith("grader-validate.ts");
    const isJsEntry = entryScript?.endsWith("grader-validate.js");
    if (isTsEntry || isJsEntry) {
        main().catch((err) => {
            console.error("❌ Fatal error:", err);
            process.exit(1);
        });
    }
}
@@ -0,0 +1,10 @@
1
+ /**
2
+ * measure-retrieval.ts
3
+ *
4
+ * Evaluates retrieval quality by comparing what Sanity's text search
5
+ * returns against the manually-annotated canonical documents for each
6
+ * evaluation task. Produces Recall@K and NDCG@K metrics.
7
+ *
8
+ * This answers: "Can a retriever find the docs an LLM actually needs?"
9
+ */
10
+ import "dotenv/config";
@@ -0,0 +1,145 @@
1
+ /**
2
+ * measure-retrieval.ts
3
+ *
4
+ * Evaluates retrieval quality by comparing what Sanity's text search
5
+ * returns against the manually-annotated canonical documents for each
6
+ * evaluation task. Produces Recall@K and NDCG@K metrics.
7
+ *
8
+ * This answers: "Can a retriever find the docs an LLM actually needs?"
9
+ */
10
+ // oxlint-disable-next-line import/no-unassigned-import -- side-effect: loads .env into process.env
11
+ import "dotenv/config";
12
+ import { writeFileSync, mkdirSync } from "fs";
13
+ import { join, dirname } from "path";
14
+ import { resolveMappings, } from "../pipeline/resolve-mappings.js";
15
+ import { getSanityClient } from "../sanity/client.js";
16
+ // ---------------------------------------------------------------------------
17
+ // Metrics: NDCG@K
18
+ // ---------------------------------------------------------------------------
19
/**
 * Compute binary-relevance NDCG@k for a ranked list of retrieved slugs.
 *
 * Each canonical document is credited once, at the first rank where it
 * appears, so a repeated slug in `retrieved` cannot push the score above 1.
 * Repeated slugs still occupy their rank. The ideal DCG is based on the
 * number of distinct canonical slugs.
 *
 * @param {string[]} canonical - Slugs of the relevant documents.
 * @param {string[]} retrieved - Retrieved slugs, best match first.
 * @param {number} k - Rank cutoff.
 * @returns {number} NDCG in [0, 1]; 0 when there are no canonical documents
 *   or the cutoff admits no ranks.
 */
function calculateNDCG(canonical, retrieved, k) {
    const canonicalSet = new Set(canonical);
    const credited = new Set();
    // Discounted Cumulative Gain
    let dcg = 0;
    const cutoff = Math.min(k, retrieved.length);
    for (let i = 0; i < cutoff; i++) {
        const doc = retrieved[i];
        if (canonicalSet.has(doc) && !credited.has(doc)) {
            credited.add(doc);
            dcg += 1 / Math.log2(i + 2); // +2 because log2(1) = 0
        }
    }
    // Ideal DCG: every distinct canonical document ranked first
    let idcg = 0;
    const idealHits = Math.min(k, canonicalSet.size);
    for (let i = 0; i < idealHits; i++) {
        idcg += 1 / Math.log2(i + 2);
    }
    return idcg === 0 ? 0 : dcg / idcg;
}
35
+ // ---------------------------------------------------------------------------
36
+ // Metrics
37
+ // ---------------------------------------------------------------------------
38
/**
 * Compute Recall@k: the share of distinct canonical documents that appear
 * in the first `k` retrieved slugs.
 *
 * Repeated canonical slugs are counted once, so a duplicated annotation
 * does not weight one document more heavily than the others.
 *
 * @param {string[]} canonical - Slugs of the relevant documents.
 * @param {string[]} retrieved - Retrieved slugs, best match first.
 * @param {number} k - Rank cutoff.
 * @returns {number} Recall in [0, 1]; 0 when there are no canonical documents.
 */
function calculateRecall(canonical, retrieved, k) {
    const canonicalSet = new Set(canonical);
    if (canonicalSet.size === 0) {
        return 0;
    }
    const retrievedSet = new Set(retrieved.slice(0, k));
    let hits = 0;
    for (const doc of canonicalSet) {
        if (retrievedSet.has(doc)) {
            hits++;
        }
    }
    return hits / canonicalSet.size;
}
43
/**
 * Measure retrieval quality for every task in the resolved mappings.
 *
 * For each task, the task description is used as the search query; the
 * returned slugs are scored against the task's canonical document slugs
 * with Recall@5, Recall@10 and NDCG@10. Per-area and overall averages are
 * printed and the full summary is written to
 * results/latest/retrieval-results.json.
 *
 * Averages over an empty set are reported as 0 instead of NaN.
 *
 * @returns {Promise<void>}
 */
async function main() {
    console.log("=== Sanity AI Literacy — Retrieval Quality Measurement ===\n");
    // NOTE(review): URL#pathname stays percent-encoded, so a checkout path with
    // spaces or non-ASCII characters may resolve wrongly — confirm whether this
    // should use fileURLToPath.
    const ROOT = join(dirname(new URL(import.meta.url).pathname), "..", "..");
    const mappings = resolveMappings(ROOT);
    // Average one numeric field over a list of per-task rows; 0 when empty.
    const mean = (rows, key) => rows.length === 0
        ? 0
        : rows.reduce((sum, row) => sum + row[key], 0) / rows.length;
    const results = [];
    for (const [area, areaData] of Object.entries(mappings.feature_areas)) {
        console.log(`Feature area: ${area}`);
        for (const task of areaData.tasks) {
            const canonicalSlugs = task.canonical_docs.map((d) => d.slug);
            // Use the task description as a search query
            const retrieved = await retrieveDocsForQuery(task.description, 10);
            const result = {
                canonical_docs: canonicalSlugs,
                feature_area: area,
                ndcg_at_10: calculateNDCG(canonicalSlugs, retrieved, 10),
                recall_at_5: calculateRecall(canonicalSlugs, retrieved, 5),
                recall_at_10: calculateRecall(canonicalSlugs, retrieved, 10),
                retrieved_docs: retrieved,
                task_id: task.id,
            };
            results.push(result);
            console.log(` ${task.id}:`);
            console.log(` Recall@5: ${(result.recall_at_5 * 100).toFixed(1)}%`);
            console.log(` Recall@10: ${(result.recall_at_10 * 100).toFixed(1)}%`);
            console.log(` NDCG@10: ${(result.ndcg_at_10 * 100).toFixed(1)}%`);
        }
        console.log();
    }
    if (results.length === 0) {
        console.warn("No tasks were evaluated; averages below are reported as 0.");
    }
    // -----------------------------------------------------------------------
    // Aggregate by feature area
    // -----------------------------------------------------------------------
    const byArea = {};
    for (const area of Object.keys(mappings.feature_areas)) {
        const areaResults = results.filter((r) => r.feature_area === area);
        if (areaResults.length === 0) {
            continue;
        }
        byArea[area] = {
            avg_ndcg_at_10: mean(areaResults, "ndcg_at_10"),
            avg_recall_at_5: mean(areaResults, "recall_at_5"),
            avg_recall_at_10: mean(areaResults, "recall_at_10"),
            task_count: areaResults.length,
        };
    }
    // -----------------------------------------------------------------------
    // Overall
    // -----------------------------------------------------------------------
    const overall = {
        avg_ndcg_at_10: mean(results, "ndcg_at_10"),
        avg_recall_at_5: mean(results, "recall_at_5"),
        avg_recall_at_10: mean(results, "recall_at_10"),
    };
    // -----------------------------------------------------------------------
    // Print summary
    // -----------------------------------------------------------------------
    console.log("=".repeat(70));
    console.log("RETRIEVAL QUALITY SUMMARY");
    console.log("=".repeat(70));
    console.log();
    // Pad the first header cell to the width used by the data rows.
    console.log(`| ${"Feature Area".padEnd(19)} | Recall@5 | Recall@10 | NDCG@10 | Tasks |`);
    console.log("|---------------------|----------|-----------|---------|-------|");
    for (const [area, stats] of Object.entries(byArea)) {
        console.log(`| ${area.padEnd(19)} | ${(stats.avg_recall_at_5 * 100).toFixed(1).padStart(7)}% | ` +
            `${(stats.avg_recall_at_10 * 100).toFixed(1).padStart(8)}% | ` +
            `${(stats.avg_ndcg_at_10 * 100).toFixed(1).padStart(6)}% | ` +
            `${stats.task_count.toString().padStart(5)} |`);
    }
    console.log();
    console.log(`Overall: Recall@5=${(overall.avg_recall_at_5 * 100).toFixed(1)}% ` +
        `Recall@10=${(overall.avg_recall_at_10 * 100).toFixed(1)}% ` +
        `NDCG@10=${(overall.avg_ndcg_at_10 * 100).toFixed(1)}%`);
    // -----------------------------------------------------------------------
    // Persist results
    // -----------------------------------------------------------------------
    const summary = { by_area: byArea, overall, results };
    const outDir = join(ROOT, "results", "latest");
    mkdirSync(outDir, { recursive: true });
    writeFileSync(join(outDir, "retrieval-results.json"), JSON.stringify(summary, null, 2));
    console.log("\nResults written to results/latest/retrieval-results.json");
}
123
+ // ---------------------------------------------------------------------------
124
+ // Retrieval via Sanity text search
125
+ // ---------------------------------------------------------------------------
126
/**
 * Run a scored text search over non-draft `article` documents and return
 * the slugs of the matches, highest score first.
 *
 * Title matches are boosted by 3 and body-text matches by 1.
 *
 * @param {string} query - Search text, passed as the `$query` parameter.
 * @param {number} [k=10] - Upper bound of the result slice (`$k`).
 * @returns {Promise<Array>} The `slug` value of each returned row, in order.
 */
async function retrieveDocsForQuery(query, k = 10) {
    const groq = `
*[_type == "article" && !(_id in path("drafts.**"))]
| score(
boost(title match $query, 3),
boost(pt::text(content) match $query, 1)
)
| order(_score desc)
[0...$k] {
"slug": slug.current,
_score
}
`;
    const params = { k, query };
    const rows = await getSanityClient().fetch(groq, params);
    return rows.map(({ slug }) => slug);
}
142
// Run the measurement; report any unhandled failure and exit non-zero.
main().catch(function reportFatal(err) {
    console.error("Fatal error:", err);
    process.exit(1);
});
@@ -19,6 +19,6 @@
19
19
  * - SANITY_API_TOKEN with write access to the project
20
20
  * - SANITY_PROJECT_ID and SANITY_DATASET configured (or defaults used)
21
21
  *
22
- * @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
22
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
23
23
  */
24
24
  export {};
@@ -19,7 +19,7 @@
19
19
  * - SANITY_API_TOKEN with write access to the project
20
20
  * - SANITY_PROJECT_ID and SANITY_DATASET configured (or defaults used)
21
21
  *
22
- * @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
22
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
23
23
  */
24
24
  import { config as dotenvConfig } from "dotenv";
25
25
  import { existsSync } from "fs";
@@ -19,6 +19,6 @@
19
19
  * - SANITY_API_TOKEN (or AILF_REPORT_SANITY_API_TOKEN) with write access
20
20
  * - SANITY_PROJECT_ID and SANITY_DATASET configured
21
21
  *
22
- * @see docs/exec-plans/tasks-as-content/phase-3-migration.md
22
+ * @see docs/archive/exec-plans/tasks-as-content/phase-3-migration.md
23
23
  */
24
24
  export {};
@@ -19,7 +19,7 @@
19
19
  * - SANITY_API_TOKEN (or AILF_REPORT_SANITY_API_TOKEN) with write access
20
20
  * - SANITY_PROJECT_ID and SANITY_DATASET configured
21
21
  *
22
- * @see docs/exec-plans/tasks-as-content/phase-3-migration.md
22
+ * @see docs/archive/exec-plans/tasks-as-content/phase-3-migration.md
23
23
  */
24
24
  import { config as dotenvConfig } from "dotenv";
25
25
  import { existsSync, readFileSync } from "fs";
@@ -0,0 +1,76 @@
1
+ /**
2
+ * pipeline.ts
3
+ *
4
+ * CLI orchestrator for the modular evaluation pipeline.
5
+ * Runs steps in sequence with validation between each.
6
+ *
7
+ * This is the single entry point for both local and CI evaluation.
8
+ * The CI workflow (eval.yml) calls this script, then layers on
9
+ * CI-specific post-steps (PR comment posting, artifact upload).
10
+ *
11
+ * Usage:
12
+ * pnpm pipeline # full baseline pipeline
13
+ * pnpm pipeline --dry-run # validate only, no execution
14
+ * pnpm pipeline --skip-fetch # reuse cached doc contexts
15
+ * pnpm pipeline --skip-eval # recalculate from existing results
16
+ * pnpm pipeline --mode agentic # run agentic pipeline
17
+ * pnpm pipeline --mode observed # run observed pipeline
18
+ * pnpm pipeline --source staging # use staging doc source
19
+ * pnpm pipeline --debug # run first 2 tests only (fast)
20
+ * pnpm pipeline --debug-n 5 # run first 5 tests
21
+ * pnpm pipeline --debug-pattern "Blog" # filter by description
22
+ * pnpm pipeline --debug-sample 3 # random sample of 3 tests
23
+ * pnpm pipeline --no-cache # bypass caching, force re-run
24
+ * pnpm pipeline --concurrency 64 # override max parallel API calls
25
+ * pnpm pipeline --area groq,frameworks # only evaluate these areas
26
+ * pnpm pipeline --task groq-blog-queries # only evaluate this task
27
+ * pnpm pipeline --changed-docs groq-introduction,how-queries-work
28
+ * # auto-scope to affected tasks
29
+ * pnpm pipeline --url https://... # override docs base URL
30
+ * pnpm pipeline --sanity-dataset staging # override Sanity dataset
31
+ * pnpm pipeline --sanity-project abc123 # override Sanity project ID
32
+ * pnpm pipeline --sanity-perspective agent-c7OKTk
33
+ * # evaluate a Sanity release
34
+ * pnpm pipeline --sanity-document <uuid>
35
+ * # evaluate specific document(s)
36
+ * pnpm pipeline --sanity-document <uuid> --sanity-documents <uuid>
37
+ * # singular and plural aliases work
38
+ * pnpm pipeline --header "X-Vercel-Protection-Bypass: <secret>"
39
+ * # custom HTTP header (repeatable)
40
+ * pnpm pipeline --allowed-origin my-branch.sanity.build
41
+ * # sandbox agent to this origin
42
+ * pnpm pipeline --before published # run before/after impact evaluation
43
+ * pnpm pipeline --before production # before = production source
44
+ * pnpm pipeline --before results/baselines/20260310.json # use existing scores
45
+ * pnpm pipeline --before latest-baseline # use most recent baseline
46
+ * pnpm pipeline --compare # compare scores against latest baseline
47
+ * pnpm pipeline --compare --compare-baseline <path> # compare against specific file
48
+ * pnpm pipeline --compare --threshold 5 # noise threshold for unchanged (default: 2)
49
+ * pnpm pipeline --output /tmp/report.md # write report to specific path
50
+ * pnpm pipeline --promptfoo-url <url> # include Promptfoo URL in report
51
+ * pnpm pipeline --gap-analysis # run failure mode + impact analysis
52
+ * pnpm pipeline --publish # write report to Sanity + fan out to sinks
53
+ * pnpm pipeline --publish --publish-tag "daily-2026-03-11" # tag the report
54
+ * pnpm pipeline --publish --report-dataset ailf-reports # report store dataset
55
+ * pnpm pipeline --publish --report-project abc123 # report store project
56
+ *
57
+ * Override precedence (highest wins):
58
+ * CLI flag (--url, --sanity-dataset, --sanity-project, --allowed-origin)
59
+ * → Environment variable (DOC_BASE_URL, SANITY_DATASET, SANITY_PROJECT_ID, DOC_ALLOWED_ORIGIN)
60
+ * → config/sources.yaml default value
61
+ *
62
+ * --header flags are additive and do not override env vars — they are
63
+ * always merged with any headers defined in DOC_HEADERS env var.
64
+ *
65
+ * Environment variable fallbacks (for CI):
66
+ * DEBUG_EVAL=1 → --debug
67
+ * DEBUG_EVAL_N=2 → --debug-n 2
68
+ * DEBUG_EVAL_PATTERN → --debug-pattern
69
+ * DEBUG_EVAL_SAMPLE → --debug-sample
70
+ * EVAL_FILTER_AREAS → --area
71
+ * EVAL_FILTER_TASKS → --task
72
+ * EVAL_CHANGED_DOCS → --changed-docs
73
+ * AILF_REPORT_DATASET → --report-dataset (report store, not eval)
74
+ * AILF_REPORT_PROJECT_ID → --report-project (report store, not eval)
75
+ */
76
+ export {};