@sanity/ailf 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (442)
  1. package/canonical/grader-references/README.md +2 -2
  2. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  3. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  4. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  5. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  6. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  7. package/config/features.ts +1 -1
  8. package/config/models.ts +28 -23
  9. package/config/sources.ts +1 -1
  10. package/config/thresholds.ts +1 -1
  11. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  13. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  17. package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
  18. package/dist/_vendor/ailf-core/config-helpers.js +29 -0
  19. package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
  20. package/dist/_vendor/ailf-core/examples/index.js +208 -114
  21. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  22. package/dist/_vendor/ailf-core/index.js +1 -0
  23. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  25. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  27. package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
  28. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  29. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  30. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  31. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  32. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  33. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
  34. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
  35. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  36. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  37. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  38. package/dist/_vendor/ailf-core/services/index.js +1 -1
  39. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  40. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
  41. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  42. package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
  43. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
  44. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  45. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  46. package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
  47. package/dist/_vendor/ailf-tasks/cli.js +61 -0
  48. package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
  49. package/dist/_vendor/ailf-tasks/index.js +16 -0
  50. package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
  51. package/dist/_vendor/ailf-tasks/parser.js +73 -0
  52. package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
  53. package/dist/_vendor/ailf-tasks/schemas.js +180 -0
  54. package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
  55. package/dist/_vendor/ailf-tasks/validation.js +162 -0
  56. package/dist/adapters/api-client/remediation.js +2 -2
  57. package/dist/adapters/config-sources/file-config-adapter.js +6 -1
  58. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  59. package/dist/adapters/index.d.ts +0 -1
  60. package/dist/adapters/index.js +0 -1
  61. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  62. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  63. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  64. package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
  65. package/dist/adapters/task-sources/index.d.ts +1 -2
  66. package/dist/adapters/task-sources/index.js +1 -2
  67. package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
  68. package/dist/adapters/task-sources/repo-schemas.js +2 -2
  69. package/dist/adapters/task-sources/repo-task-source.js +1 -1
  70. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  71. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
  73. package/dist/adapters/task-sources/task-file-loader.js +20 -6
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/explain-handler.d.ts +1 -1
  95. package/dist/commands/explain-handler.js +37 -8
  96. package/dist/commands/fetch-docs.js +1 -0
  97. package/dist/commands/generate-configs.d.ts +3 -3
  98. package/dist/commands/generate-configs.js +20 -8
  99. package/dist/commands/init.d.ts +2 -3
  100. package/dist/commands/init.js +56 -170
  101. package/dist/commands/pipeline-action.d.ts +7 -1
  102. package/dist/commands/pipeline-action.js +43 -19
  103. package/dist/commands/pipeline.d.ts +6 -1
  104. package/dist/commands/pipeline.js +7 -2
  105. package/dist/commands/pr-comment.js +1 -0
  106. package/dist/commands/publish.js +1 -0
  107. package/dist/commands/shared/help.js +2 -2
  108. package/dist/commands/update-quality-scores.d.ts +5 -0
  109. package/dist/commands/update-quality-scores.js +20 -0
  110. package/dist/composition-root.d.ts +2 -3
  111. package/dist/composition-root.js +27 -14
  112. package/dist/config/features.ts +23 -0
  113. package/dist/config/models.ts +100 -0
  114. package/dist/config/prompts.ts +16 -0
  115. package/dist/config/rubrics.ts +225 -0
  116. package/dist/config/schedules.ts +47 -0
  117. package/dist/config/sinks.ts +37 -0
  118. package/dist/config/sources.ts +21 -0
  119. package/dist/config/thresholds.ts +61 -0
  120. package/dist/lib/agent-behavior-report.d.ts +8 -0
  121. package/dist/lib/agent-behavior-report.js +185 -0
  122. package/dist/lib/baseline.d.ts +19 -0
  123. package/dist/lib/baseline.js +153 -0
  124. package/dist/lib/calculate-scores.d.ts +23 -0
  125. package/dist/lib/calculate-scores.js +42 -0
  126. package/dist/lib/compare.d.ts +18 -0
  127. package/dist/lib/compare.js +170 -0
  128. package/dist/lib/coverage-audit.d.ts +4 -0
  129. package/dist/lib/coverage-audit.js +42 -0
  130. package/dist/lib/discovery-report.d.ts +13 -0
  131. package/dist/lib/discovery-report.js +57 -0
  132. package/dist/lib/fetch-docs.d.ts +30 -0
  133. package/dist/lib/fetch-docs.js +171 -0
  134. package/dist/lib/generate-configs.d.ts +25 -0
  135. package/dist/lib/generate-configs.js +42 -0
  136. package/dist/lib/grader-api.d.ts +21 -0
  137. package/dist/lib/grader-api.js +34 -0
  138. package/dist/lib/grader-compare.d.ts +19 -0
  139. package/dist/lib/grader-compare.js +91 -0
  140. package/dist/lib/grader-consistency.d.ts +27 -0
  141. package/dist/lib/grader-consistency.js +79 -0
  142. package/dist/lib/grader-sensitivity.d.ts +19 -0
  143. package/dist/lib/grader-sensitivity.js +75 -0
  144. package/dist/lib/grader-validate.d.ts +19 -0
  145. package/dist/lib/grader-validate.js +78 -0
  146. package/dist/lib/measure-retrieval.d.ts +14 -0
  147. package/dist/lib/measure-retrieval.js +71 -0
  148. package/dist/lib/pr-comment.d.ts +16 -0
  149. package/dist/lib/pr-comment.js +28 -0
  150. package/dist/lib/readiness-report.d.ts +13 -0
  151. package/dist/lib/readiness-report.js +108 -0
  152. package/dist/lib/webhook-server.d.ts +11 -0
  153. package/dist/lib/webhook-server.js +24 -0
  154. package/dist/lib/weekly-digest.d.ts +24 -0
  155. package/dist/lib/weekly-digest.js +148 -0
  156. package/dist/orchestration/build-app-context.js +13 -0
  157. package/dist/orchestration/cache-context.d.ts +23 -0
  158. package/dist/orchestration/cache-context.js +43 -0
  159. package/dist/orchestration/env-bridge.d.ts +21 -0
  160. package/dist/orchestration/env-bridge.js +66 -0
  161. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  162. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  163. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  164. package/dist/orchestration/step-runner.js +5 -1
  165. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  166. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  167. package/dist/orchestration/steps/callback-step.js +10 -1
  168. package/dist/orchestration/steps/compare-step.js +6 -3
  169. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  170. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  171. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  172. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  173. package/dist/orchestration/steps/fetch-docs-step.js +30 -16
  174. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  175. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  176. package/dist/orchestration/steps/generate-configs-step.js +50 -15
  177. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  178. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  179. package/dist/orchestration/steps/publish-report-step.js +19 -0
  180. package/dist/orchestration/steps/readiness-step.js +8 -3
  181. package/dist/orchestration/steps/report-step.js +17 -4
  182. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  183. package/dist/orchestration/steps/run-eval-step.js +51 -31
  184. package/dist/pipeline/agent-behavior-report.js +6 -0
  185. package/dist/pipeline/attribution.d.ts +1 -1
  186. package/dist/pipeline/attribution.js +1 -1
  187. package/dist/pipeline/cache.js +29 -15
  188. package/dist/pipeline/calculate-scores.d.ts +2 -0
  189. package/dist/pipeline/calculate-scores.js +70 -33
  190. package/dist/pipeline/chronic-failures.d.ts +55 -0
  191. package/dist/pipeline/chronic-failures.js +110 -0
  192. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
  193. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  194. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  195. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  196. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  197. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  198. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  199. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  200. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  201. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  202. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  203. package/dist/pipeline/compiler/config-loader.js +42 -2
  204. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  205. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  206. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  207. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  208. package/dist/pipeline/compiler/index.d.ts +2 -5
  209. package/dist/pipeline/compiler/index.js +2 -5
  210. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  211. package/dist/pipeline/compiler/literacy-bridge.js +1 -1
  212. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
  213. package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
  214. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
  215. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
  216. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
  217. package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
  218. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
  219. package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
  220. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
  221. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
  222. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  223. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  224. package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
  225. package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
  226. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
  227. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
  228. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  229. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  230. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
  231. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
  232. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  233. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  234. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  235. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
  236. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
  237. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
  238. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
  239. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
  240. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
  241. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
  242. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
  243. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
  244. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
  245. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
  246. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
  247. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
  248. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  249. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  250. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
  251. package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
  252. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  253. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  254. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  255. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  256. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  257. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  258. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  259. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  260. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  261. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  262. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  263. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  264. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  265. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  266. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  267. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  268. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  269. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  270. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  271. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  272. package/dist/pipeline/compiler/task-bridge.js +92 -0
  273. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  274. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  275. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  276. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  277. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  278. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  279. package/dist/pipeline/coverage-audit.d.ts +1 -1
  280. package/dist/pipeline/coverage-audit.js +1 -1
  281. package/dist/pipeline/degradations.d.ts +1 -1
  282. package/dist/pipeline/degradations.js +1 -1
  283. package/dist/pipeline/failure-modes.d.ts +1 -1
  284. package/dist/pipeline/failure-modes.js +13 -1
  285. package/dist/pipeline/gap-analysis.d.ts +1 -1
  286. package/dist/pipeline/gap-analysis.js +3 -1
  287. package/dist/pipeline/generate-configs.d.ts +2 -2
  288. package/dist/pipeline/generate-configs.js +15 -8
  289. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  290. package/dist/pipeline/grader-compare-runner.js +7 -1
  291. package/dist/pipeline/grader-comparison.d.ts +1 -1
  292. package/dist/pipeline/grader-comparison.js +1 -1
  293. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  294. package/dist/pipeline/grader-consistency-runner.js +7 -1
  295. package/dist/pipeline/grader-consistency.d.ts +1 -1
  296. package/dist/pipeline/grader-consistency.js +1 -1
  297. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  298. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  299. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  300. package/dist/pipeline/grader-sensitivity.js +1 -1
  301. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  302. package/dist/pipeline/grader-validate-runner.js +2 -2
  303. package/dist/pipeline/grader-validation.d.ts +1 -1
  304. package/dist/pipeline/grader-validation.js +1 -1
  305. package/dist/pipeline/map-request-to-config.js +15 -2
  306. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  307. package/dist/pipeline/mirror-repo-tasks.js +1 -1
  308. package/dist/pipeline/plan-format.d.ts +1 -1
  309. package/dist/pipeline/plan-format.js +1 -1
  310. package/dist/pipeline/plan.d.ts +1 -1
  311. package/dist/pipeline/plan.js +67 -29
  312. package/dist/pipeline/probe.d.ts +1 -1
  313. package/dist/pipeline/probe.js +1 -1
  314. package/dist/pipeline/readiness-report.d.ts +2 -2
  315. package/dist/pipeline/readiness-report.js +2 -2
  316. package/dist/pipeline/release-classification.d.ts +1 -1
  317. package/dist/pipeline/release-classification.js +1 -1
  318. package/dist/pipeline/release-report.d.ts +1 -1
  319. package/dist/pipeline/release-report.js +1 -1
  320. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  321. package/dist/pipeline/repo-eval-comment.js +1 -1
  322. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  323. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  324. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  325. package/dist/pipeline/resolve-mappings.js +44 -44
  326. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  327. package/dist/pipeline/retrieval-metrics.js +28 -20
  328. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  329. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  330. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  331. package/dist/pipeline/steps/compare-step.js +90 -0
  332. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  333. package/dist/pipeline/steps/eval-step.js +347 -0
  334. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  335. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  336. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  337. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  338. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  339. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  340. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  341. package/dist/pipeline/steps/publish-report-step.js +243 -0
  342. package/dist/pipeline/steps/report-step.d.ts +13 -0
  343. package/dist/pipeline/steps/report-step.js +56 -0
  344. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  345. package/dist/pipeline/steps/update-scores-step.js +42 -0
  346. package/dist/pipeline/targeted-loo.d.ts +1 -1
  347. package/dist/pipeline/targeted-loo.js +1 -1
  348. package/dist/pipeline/thresholds.d.ts +1 -1
  349. package/dist/pipeline/thresholds.js +1 -1
  350. package/dist/pipeline/validate.js +13 -0
  351. package/dist/report-store.d.ts +17 -0
  352. package/dist/report-store.js +24 -0
  353. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  354. package/dist/scripts/agent-behavior-report.js +315 -0
  355. package/dist/scripts/baseline.d.ts +43 -0
  356. package/dist/scripts/baseline.js +267 -0
  357. package/dist/scripts/calculate-scores.d.ts +166 -0
  358. package/dist/scripts/calculate-scores.js +1296 -0
  359. package/dist/scripts/compare.d.ts +22 -0
  360. package/dist/scripts/compare.js +334 -0
  361. package/dist/scripts/coverage-audit.d.ts +44 -0
  362. package/dist/scripts/coverage-audit.js +209 -0
  363. package/dist/scripts/debug-eval.d.ts +19 -0
  364. package/dist/scripts/debug-eval.js +73 -0
  365. package/dist/scripts/discovery-report.d.ts +58 -0
  366. package/dist/scripts/discovery-report.js +250 -0
  367. package/dist/scripts/fetch-docs.d.ts +35 -0
  368. package/dist/scripts/fetch-docs.js +472 -0
  369. package/dist/scripts/generate-configs.d.ts +66 -0
  370. package/dist/scripts/generate-configs.js +459 -0
  371. package/dist/scripts/grader-api.d.ts +27 -0
  372. package/dist/scripts/grader-api.js +206 -0
  373. package/dist/scripts/grader-compare.d.ts +22 -0
  374. package/dist/scripts/grader-compare.js +368 -0
  375. package/dist/scripts/grader-consistency.d.ts +20 -0
  376. package/dist/scripts/grader-consistency.js +313 -0
  377. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  378. package/dist/scripts/grader-sensitivity.js +354 -0
  379. package/dist/scripts/grader-validate.d.ts +19 -0
  380. package/dist/scripts/grader-validate.js +267 -0
  381. package/dist/scripts/measure-retrieval.d.ts +10 -0
  382. package/dist/scripts/measure-retrieval.js +145 -0
  383. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  384. package/dist/scripts/migrate-task-mode.js +1 -1
  385. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  386. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  387. package/dist/scripts/pipeline.d.ts +76 -0
  388. package/dist/scripts/pipeline.js +1031 -0
  389. package/dist/scripts/pr-comment.d.ts +10 -0
  390. package/dist/scripts/pr-comment.js +510 -0
  391. package/dist/scripts/readiness-report.d.ts +88 -0
  392. package/dist/scripts/readiness-report.js +342 -0
  393. package/dist/scripts/update-quality-scores.d.ts +15 -0
  394. package/dist/scripts/update-quality-scores.js +184 -0
  395. package/dist/scripts/validate-task-sources.d.ts +1 -1
  396. package/dist/scripts/validate-task-sources.js +1 -1
  397. package/dist/scripts/validate.d.ts +13 -0
  398. package/dist/scripts/validate.js +79 -0
  399. package/dist/scripts/webhook-server.d.ts +26 -0
  400. package/dist/scripts/webhook-server.js +147 -0
  401. package/dist/scripts/weekly-digest.d.ts +24 -0
  402. package/dist/scripts/weekly-digest.js +144 -0
  403. package/dist/sinks/format-slack.d.ts +64 -0
  404. package/dist/sinks/format-slack.js +306 -0
  405. package/dist/sinks/slack-sink.d.ts +27 -0
  406. package/dist/sinks/slack-sink.js +78 -0
  407. package/dist/sinks/types.d.ts +1 -1
  408. package/dist/sinks/types.js +1 -1
  409. package/dist/sinks/webhook-sink.d.ts +19 -0
  410. package/dist/sinks/webhook-sink.js +50 -0
  411. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  412. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  413. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  414. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  415. package/dist/tasks/literacy/functions.task.ts +70 -0
  416. package/dist/tasks/literacy/groq.task.ts +259 -0
  417. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  418. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  419. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  420. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  421. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  422. package/package.json +24 -24
  423. package/tasks/.expanded.agentic.yaml +280 -0
  424. package/tasks/.expanded.yaml +565 -0
  425. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  426. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  427. package/tasks/literacy/content-lake.task.ts +181 -0
  428. package/tasks/literacy/frameworks.task.ts +1 -0
  429. package/tasks/literacy/functions.task.ts +1 -0
  430. package/tasks/literacy/groq.task.ts +1 -0
  431. package/tasks/literacy/image-handling.task.ts +95 -0
  432. package/tasks/literacy/nextjs-live.task.ts +2 -1
  433. package/tasks/literacy/portable-text.task.ts +169 -0
  434. package/tasks/literacy/studio-setup.task.ts +5 -2
  435. package/tasks/literacy/visual-editing.task.ts +1 -0
  436. package/LICENSE +0 -21
  437. package/tasks/frameworks.yaml +0 -98
  438. package/tasks/functions.yaml +0 -51
  439. package/tasks/groq.yaml +0 -216
  440. package/tasks/nextjs-live.yaml +0 -62
  441. package/tasks/studio-setup.yaml +0 -111
  442. package/tasks/visual-editing.yaml +0 -120
@@ -8,7 +8,9 @@
8
8
  * Once all commands construct ResolvedConfig directly (or use --config),
9
9
  * this bridge can be deleted.
10
10
  */
11
+ import { join } from "node:path";
11
12
  import { createAppContext } from "../composition-root.js";
13
+ import { tryLoadConfigFile } from "../pipeline/compiler/config-loader.js";
12
14
  /**
13
15
  * Map legacy ResolvedOptions to ResolvedConfig.
14
16
  *
@@ -50,6 +52,7 @@ export function mapToResolvedConfig(opts, rootDir) {
50
52
  noCache: opts.noCache,
51
53
  noRemoteCache: opts.noRemoteCache,
52
54
  graderReplications: opts.graderReplications,
55
+ outputDir: opts.outputDir,
53
56
  outputPath: opts.outputPath,
54
57
  urls: opts.urlArgs.length > 0 ? opts.urlArgs : undefined,
55
58
  headers: opts.headerArgs.length > 0
@@ -75,6 +78,10 @@ export function mapToResolvedConfig(opts, rootDir) {
75
78
  remote: opts.remote ?? false,
76
79
  apiUrl: opts.apiUrl ?? "https://ailf-api.sanity.build",
77
80
  apiKey: opts.apiKey,
81
+ captureEnabled: opts.captureEnabled ?? false,
82
+ captureDir: opts.captureDir ?? join(rootDir, "results", "captures"),
83
+ captureCompress: opts.captureCompress ?? true,
84
+ captureExtras: opts.captureExtras ?? true,
78
85
  };
79
86
  }
80
87
  /**
@@ -85,5 +92,11 @@ export function mapToResolvedConfig(opts, rootDir) {
85
92
  */
86
93
  export function buildAppContext(opts, rootDir) {
87
94
  const config = mapToResolvedConfig(opts, rootDir);
95
+ // Inject config-file-only values that don't come from CLI options.
96
+ // evalBudgetMs lives on ModelsConfig, not CLI flags.
97
+ const models = tryLoadConfigFile("models", rootDir);
98
+ if (models?.data?.evalBudgetMs) {
99
+ config.evalBudgetMs = models.data.evalBudgetMs;
100
+ }
88
101
  return createAppContext(config);
89
102
  }
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Build cache context strings from the resolved pipeline configuration.
3
+ *
4
+ * These non-file strings participate in cache key computation so that
5
+ * different CLI filters (mode, variant, area, task, tag) produce
6
+ * distinct cache entries. Without them, running `--mode knowledge-probe`
7
+ * after `--mode literacy` would return cached literacy results.
8
+ *
9
+ * @see packages/core/src/ports/pipeline-step.ts — cacheContext() method
10
+ * @see packages/eval/src/pipeline/cache.ts — hashFiles() context parameter
11
+ */
12
+ import type { ResolvedConfig } from "../_vendor/ailf-core/index.d.ts";
13
+ /**
14
+ * Derive deterministic context strings from the resolved pipeline config.
15
+ *
16
+ * Included in every cacheable step's key so that:
17
+ * - `--mode literacy` and `--mode knowledge-probe` never share cache
18
+ * - `--variant agentic` and `--variant baseline` never share cache
19
+ * - `--area studio` and `--area groq` never share cache
20
+ * - `--task T001` and `--task T002` never share cache
21
+ * - `--tag critical` and `--tag smoke` never share cache
22
+ */
23
+ export declare function buildCacheContext(config: ResolvedConfig): string[];
@@ -0,0 +1,43 @@
1
+ /**
2
+ * Build cache context strings from the resolved pipeline configuration.
3
+ *
4
+ * These non-file strings participate in cache key computation so that
5
+ * different CLI filters (mode, variant, area, task, tag) produce
6
+ * distinct cache entries. Without them, running `--mode knowledge-probe`
7
+ * after `--mode literacy` would return cached literacy results.
8
+ *
9
+ * @see packages/core/src/ports/pipeline-step.ts — cacheContext() method
10
+ * @see packages/eval/src/pipeline/cache.ts — hashFiles() context parameter
11
+ */
12
+ /**
13
+ * Derive deterministic context strings from the resolved pipeline config.
14
+ *
15
+ * Included in every cacheable step's key so that:
16
+ * - `--mode literacy` and `--mode knowledge-probe` never share cache
17
+ * - `--variant agentic` and `--variant baseline` never share cache
18
+ * - `--area studio` and `--area groq` never share cache
19
+ * - `--task T001` and `--task T002` never share cache
20
+ * - `--tag critical` and `--tag smoke` never share cache
21
+ */
22
+ export function buildCacheContext(config) {
23
+ const context = [];
24
+ // Mode is always present — it's required in ResolvedConfig
25
+ context.push(`mode:${config.mode}`);
26
+ // Variant (literacy sub-mode: baseline, agentic, observed, full)
27
+ if (config.variant) {
28
+ context.push(`variant:${config.variant}`);
29
+ }
30
+ // Area filter — sorted for deterministic hashing
31
+ if (config.areas && config.areas.length > 0) {
32
+ context.push(`areas:${[...config.areas].sort().join(",")}`);
33
+ }
34
+ // Task filter — sorted for deterministic hashing
35
+ if (config.tasks && config.tasks.length > 0) {
36
+ context.push(`tasks:${[...config.tasks].sort().join(",")}`);
37
+ }
38
+ // Tag filter — sorted for deterministic hashing
39
+ if (config.tags && config.tags.length > 0) {
40
+ context.push(`tags:${[...config.tags].sort().join(",")}`);
41
+ }
42
+ return context;
43
+ }
@@ -0,0 +1,21 @@
1
+ /**
2
+ * Environment variable bridge — writes ResolvedConfig values to process.env
3
+ * so that lib/*.ts modules (which still read process.env) work correctly.
4
+ *
5
+ * This replaces the former global applyEnvironment() with an explicit
6
+ * per-step bridge. Each orchestration step calls this before invoking
7
+ * its lib/*.ts main() function.
8
+ *
9
+ * Phase 9 will eliminate this file entirely by giving lib/*.ts main()
10
+ * functions typed option parameters.
11
+ *
12
+ * @see docs/exec-plans/active/ports-and-adapters/phase-8-delete-legacy-step-layer.md
13
+ */
14
+ import type { ResolvedConfig } from "../_vendor/ailf-core/index.d.ts";
15
+ /**
16
+ * Bridge ResolvedConfig values to process.env.
17
+ *
18
+ * Idempotent — safe to call multiple times. Only sets env vars for
19
+ * config values that are defined (never deletes or resets).
20
+ */
21
+ export declare function bridgeConfigToEnv(config: ResolvedConfig): void;
@@ -0,0 +1,66 @@
1
+ /**
2
+ * Environment variable bridge — writes ResolvedConfig values to process.env
3
+ * so that lib/*.ts modules (which still read process.env) work correctly.
4
+ *
5
+ * This replaces the former global applyEnvironment() with an explicit
6
+ * per-step bridge. Each orchestration step calls this before invoking
7
+ * its lib/*.ts main() function.
8
+ *
9
+ * Phase 9 will eliminate this file entirely by giving lib/*.ts main()
10
+ * functions typed option parameters.
11
+ *
12
+ * @see docs/exec-plans/active/ports-and-adapters/phase-8-delete-legacy-step-layer.md
13
+ */
14
/**
 * Bridge ResolvedConfig values to process.env.
 *
 * Idempotent — safe to call multiple times. Only sets env vars for
 * config values that are defined (never deletes or resets).
 *
 * Notes on what counts as "defined":
 * - `searchMode` is exported only when it is set and differs from "open".
 *   Assigning `undefined` to a `process.env` property would store the
 *   literal string "undefined", so an absent value is skipped.
 * - List-valued options (`sanityDocumentArgs`, `allowedOrigins`, `areas`,
 *   `tasks`) are exported as comma-joined strings and only when non-empty,
 *   so an empty array never produces an empty-string filter.
 *
 * @param config - Resolved pipeline configuration to mirror into the environment.
 */
export function bridgeConfigToEnv(config) {
    // Mode
    process.env.EVAL_MODE = config.mode;
    // Search mode — skip when absent so the env var never becomes "undefined"
    if (config.searchMode && config.searchMode !== "open") {
        process.env.EVAL_SEARCH_MODE = config.searchMode;
    }
    // Source
    if (config.source) {
        process.env.DOC_SOURCE = config.source;
    }
    // URL-derived overrides (only the first URL is bridged)
    if (config.urls?.[0]) {
        process.env.DOC_BASE_URL = config.urls[0];
    }
    // Sanity overrides
    if (config.datasetOverride) {
        process.env.SANITY_DATASET = config.datasetOverride;
    }
    if (config.projectIdOverride) {
        process.env.SANITY_PROJECT_ID = config.projectIdOverride;
    }
    if (config.perspectiveOverride) {
        process.env.SANITY_PERSPECTIVE = config.perspectiveOverride;
    }
    if (config.studioOriginOverride) {
        process.env.SANITY_STUDIO_ORIGIN = config.studioOriginOverride;
    }
    if (config.sanityDocumentArgs?.length) {
        process.env.SANITY_DOCUMENT_IDS = config.sanityDocumentArgs.join(",");
    }
    // Custom headers (serialized as JSON)
    if (config.headers) {
        process.env.DOC_HEADERS = JSON.stringify(config.headers);
    }
    // Allowed origins
    if (config.allowedOrigins?.length) {
        process.env.DOC_ALLOWED_ORIGINS = config.allowedOrigins.join(",");
    }
    // Scoping filters — guarded on length like the other list options
    if (config.areas?.length) {
        process.env.EVAL_FILTER_AREAS = config.areas.join(",");
    }
    if (config.tasks?.length) {
        process.env.EVAL_FILTER_TASKS = config.tasks.join(",");
    }
}
@@ -0,0 +1,34 @@
1
+ /**
2
+ * Shared task loading for pipeline orchestration steps.
3
+ *
4
+ * Both FetchDocsStep and GenerateConfigsStep need to see the same set of
5
+ * tasks. This function loads from filesystem .task.ts files — the
6
+ * authoritative source for the current pipeline architecture.
7
+ *
8
+ * Background: The composition root wires ctx.taskSource to
9
+ * ContentLakeTaskSource by default, but GenerateConfigsStep bypasses it
10
+ * and loads directly from the filesystem. FetchDocsStep must use the
11
+ * same source to avoid a mismatch where configs reference context files
12
+ * that were never fetched.
13
+ *
14
+ * @see packages/eval/src/orchestration/steps/generate-configs-step.ts
15
+ * @see packages/eval/src/orchestration/steps/fetch-docs-step.ts
16
+ */
17
+ import type { GeneralizedTaskDefinition } from "../_vendor/ailf-core/index.d.ts";
18
+ export interface LoadPipelineTasksOptions {
19
+ /** Absolute path to the eval package root (packages/eval) */
20
+ rootDir: string;
21
+ /** Evaluation mode — determines the tasks/{mode}/ subdirectory */
22
+ mode: string;
23
+ /** Optional extra directory for repo-based tasks (--repo-tasks-path) */
24
+ repoTasksPath?: string;
25
+ }
26
+ /**
27
+ * Load task definitions from the filesystem, matching the pipeline's
28
+ * authoritative task source.
29
+ *
30
+ * Discovers and loads `*.task.ts` files from `tasks/{mode}/` and
31
+ * optionally `--repo-tasks-path`. Tasks whose `mode` field doesn't
32
+ * match the requested mode are excluded.
33
+ */
34
+ export declare function loadPipelineTasks(opts: LoadPipelineTasksOptions): Promise<GeneralizedTaskDefinition[]>;
@@ -0,0 +1,52 @@
1
+ /**
2
+ * Shared task loading for pipeline orchestration steps.
3
+ *
4
+ * Both FetchDocsStep and GenerateConfigsStep need to see the same set of
5
+ * tasks. This function loads from filesystem .task.ts files — the
6
+ * authoritative source for the current pipeline architecture.
7
+ *
8
+ * Background: The composition root wires ctx.taskSource to
9
+ * ContentLakeTaskSource by default, but GenerateConfigsStep bypasses it
10
+ * and loads directly from the filesystem. FetchDocsStep must use the
11
+ * same source to avoid a mismatch where configs reference context files
12
+ * that were never fetched.
13
+ *
14
+ * @see packages/eval/src/orchestration/steps/generate-configs-step.ts
15
+ * @see packages/eval/src/orchestration/steps/fetch-docs-step.ts
16
+ */
17
+ import { resolve } from "path";
18
+ import { discoverTsTaskFiles, loadTsTaskFile, } from "../adapters/task-sources/task-file-loader.js";
19
+ import { resolveVendoredSubdir } from "../pipeline/compiler/config-loader.js";
20
/**
 * Collect task definitions from `*.task.ts` files on disk.
 *
 * Two directories are searched, in this order:
 *   1. the `tasks/{mode}` subdirectory resolved via `resolveVendoredSubdir`
 *   2. the resolved `repoTasksPath`, when given and not already in the list
 *
 * Files are loaded one after another, so the returned array follows
 * directory order, then discovery order, then in-file order. A task is kept
 * when it has no `mode` property at all, or when its `mode` equals the
 * requested mode.
 *
 * @param opts - Root directory, evaluation mode and optional extra task directory.
 * @returns The matching task definitions.
 */
export async function loadPipelineTasks(opts) {
    const searchDirs = [resolveVendoredSubdir(opts.rootDir, `tasks/${opts.mode}`)];
    if (opts.repoTasksPath) {
        const extraDir = resolve(opts.repoTasksPath);
        const alreadyListed = searchDirs.includes(extraDir);
        if (!alreadyListed) {
            searchDirs.push(extraDir);
        }
    }
    const collected = [];
    for (const searchDir of searchDirs) {
        for (const taskFile of discoverTsTaskFiles(searchDir)) {
            const loaded = await loadTsTaskFile(taskFile);
            for (const candidate of loaded.tasks) {
                // Skip tasks that explicitly declare a different mode.
                if ("mode" in candidate && candidate.mode !== opts.mode) {
                    continue;
                }
                collected.push(candidate);
            }
        }
    }
    return collected;
}
@@ -20,10 +20,20 @@ import { runStep } from "./step-runner.js";
20
20
  * underlying Sanity client. Best-effort — failures are logged and
21
21
  * never block the pipeline.
22
22
  */
23
- async function reportJobProgress(ctx, stepName, completedSteps, totalSteps, status, errorInfo) {
23
+ async function reportJobProgress(ctx, stepName, completedSteps, totalSteps, status, errorInfo, jobUpdates) {
24
24
  const jobId = ctx.config.jobId;
25
25
  if (!jobId)
26
26
  return;
27
+ // Accumulate update for artifact capture
28
+ jobUpdates?.push({
29
+ jobId,
30
+ stepName,
31
+ completedSteps,
32
+ totalSteps,
33
+ status,
34
+ errorInfo,
35
+ timestamp: new Date().toISOString(),
36
+ });
27
37
  // Use the report store's write capability to patch the job document.
28
38
  // The report store exposes a Sanity client — we access it through
29
39
  // a best-effort PATCH via the same client infrastructure.
@@ -59,6 +69,51 @@ async function reportJobProgress(ctx, stepName, completedSteps, totalSteps, stat
59
69
  }
60
70
  }
61
71
  // ---------------------------------------------------------------------------
72
+ // Artifact capture
73
+ // ---------------------------------------------------------------------------
74
/**
 * Config key names (matched case-insensitively, anywhere in the key) whose
 * values are dropped from captured snapshots. `headers` is included because
 * custom request headers can carry credentials.
 */
const SENSITIVE_CONFIG_KEY_PATTERN = /token|secret|key|password|passwd|credential|authorization|cookie|headers/i;
/**
 * Return a copy of `value` with sensitive keys removed from every plain
 * object nested inside it. Arrays are mapped element by element; primitives
 * and non-plain objects (class instances, Dates, …) are returned untouched.
 * `ancestors` tracks the objects on the current path so that a cyclic
 * structure is cut with a "[Circular]" marker instead of recursing forever.
 */
function stripSensitiveConfigValues(value, ancestors) {
    if (value === null || typeof value !== "object") {
        return value;
    }
    if (ancestors.has(value)) {
        return "[Circular]";
    }
    const isArray = Array.isArray(value);
    if (!isArray) {
        const proto = Object.getPrototypeOf(value);
        if (proto !== Object.prototype && proto !== null) {
            return value;
        }
    }
    ancestors.add(value);
    const cleaned = isArray
        ? value.map((item) => stripSensitiveConfigValues(item, ancestors))
        : Object.fromEntries(Object.entries(value)
            .filter(([k]) => !SENSITIVE_CONFIG_KEY_PATTERN.test(k))
            .map(([k, v]) => [k, stripSensitiveConfigValues(v, ancestors)]));
    ancestors.delete(value);
    return cleaned;
}
/**
 * Capture a snapshot of the pipeline config, final state, and step results.
 *
 * Does nothing when the collector is disabled. Otherwise records a single
 * "pipeline-context" artifact under the "pipeline" source containing:
 * - `config`: the config with secret-looking keys removed (API keys,
 *   tokens, passwords, headers, …) at the top level and inside nested
 *   plain objects;
 * - `state`: selected pipeline state fields (`remoteCacheHits` is spread
 *   into an array when present);
 * - `steps`: name, status and duration of every step result (duration is
 *   omitted for skipped steps).
 *
 * @param ctx - Application context providing `collector` and `config`.
 * @param state - Final pipeline state.
 * @param results - Step results keyed by step name.
 */
function capturePipelineContext(ctx, state, results) {
    if (!ctx.collector.enabled)
        return;
    const ancestors = new WeakSet();
    const sanitized = Object.fromEntries(Object.entries(ctx.config)
        .filter(([k]) => !SENSITIVE_CONFIG_KEY_PATTERN.test(k))
        .map(([k, v]) => [k, stripSensitiveConfigValues(v, ancestors)]));
    ctx.collector.capture("pipeline", "pipeline-context", {
        config: sanitized,
        state: {
            reportId: state.reportId,
            evalFingerprint: state.evalFingerprint,
            belowCritical: state.belowCritical,
            remoteCacheHits: state.remoteCacheHits
                ? [...state.remoteCacheHits]
                : undefined,
            releaseAutoScope: state.releaseAutoScope,
            testSummary: state.testSummary,
        },
        steps: Object.entries(results).map(([name, result]) => ({
            name,
            status: result.status,
            durationMs: result.status !== "skipped" ? result.durationMs : undefined,
        })),
    });
}
101
/**
 * Ask the artifact collector to persist what it has gathered.
 *
 * Returns immediately when the collector is disabled. On success an info
 * line with the artifact count and destination is logged. Any error raised
 * while flushing or logging is downgraded to a warning, so this function
 * does not reject because of a failed flush.
 *
 * @param ctx - Application context providing `collector` and `logger`.
 */
async function flushArtifacts(ctx) {
    const { collector } = ctx;
    if (collector.enabled) {
        try {
            const outcome = await collector.flush();
            const summary = `Captured ${outcome.artifactCount} artifacts → ${outcome.destination}`;
            ctx.logger.info(summary);
        }
        catch (failure) {
            const reason = failure instanceof Error ? failure.message : failure;
            ctx.logger.warn(`Artifact capture flush failed: ${reason}`);
        }
    }
}
116
+ // ---------------------------------------------------------------------------
62
117
  // Orchestrator
63
118
  // ---------------------------------------------------------------------------
64
119
  /**
@@ -76,6 +131,7 @@ export async function orchestratePipeline(ctx, steps) {
76
131
  const validation = { issues: [], valid: true };
77
132
  const pipelineStart = Date.now();
78
133
  const hasJob = !!ctx.config.jobId;
134
+ const jobUpdates = [];
79
135
  ctx.logger.section("ai-literacy-framework — Evaluation Pipeline");
80
136
  ctx.logger.debug(`Pipeline starting with ${steps.length} steps`, {
81
137
  steps: steps.map((s) => s.name),
@@ -86,7 +142,7 @@ export async function orchestratePipeline(ctx, steps) {
86
142
  });
87
143
  // Report initial running status
88
144
  if (hasJob) {
89
- await reportJobProgress(ctx, steps[0]?.name ?? "init", 0, steps.length, "running");
145
+ await reportJobProgress(ctx, steps[0]?.name ?? "init", 0, steps.length, "running", undefined, jobUpdates);
90
146
  }
91
147
  for (let i = 0; i < steps.length; i++) {
92
148
  const step = steps[i];
@@ -94,7 +150,7 @@ export async function orchestratePipeline(ctx, steps) {
94
150
  ctx.logger.section(step.name);
95
151
  // Report current step progress
96
152
  if (hasJob) {
97
- await reportJobProgress(ctx, step.name, i, steps.length, "running");
153
+ await reportJobProgress(ctx, step.name, i, steps.length, "running", undefined, jobUpdates);
98
154
  }
99
155
  const result = await runStep(step, ctx, state);
100
156
  results[step.name] = result;
@@ -111,8 +167,15 @@ export async function orchestratePipeline(ctx, steps) {
111
167
  await reportJobProgress(ctx, step.name, i + 1, steps.length, "failed", {
112
168
  message: failedError,
113
169
  step: step.name,
114
- });
170
+ }, jobUpdates);
115
171
  }
172
+ // Capture pipeline context and job updates before flushing
173
+ capturePipelineContext(ctx, state, results);
174
+ if (jobUpdates.length > 0) {
175
+ ctx.collector.capture("job-store", "job-updates", jobUpdates);
176
+ }
177
+ // Flush captured artifacts even on failure (partial capture is useful)
178
+ await flushArtifacts(ctx);
116
179
  return {
117
180
  belowCritical: state.belowCritical,
118
181
  durationMs: Date.now() - pipelineStart,
@@ -129,7 +192,7 @@ export async function orchestratePipeline(ctx, steps) {
129
192
  }
130
193
  // Report step completion
131
194
  if (hasJob) {
132
- await reportJobProgress(ctx, step.name, i + 1, steps.length, "running");
195
+ await reportJobProgress(ctx, step.name, i + 1, steps.length, "running", undefined, jobUpdates);
133
196
  }
134
197
  }
135
198
  const durationMs = Date.now() - pipelineStart;
@@ -166,6 +229,13 @@ export async function orchestratePipeline(ctx, steps) {
166
229
  ctx.logger.warn("Failed to report job completion — continuing");
167
230
  }
168
231
  }
232
+ // Capture pipeline context and job updates before flushing
233
+ capturePipelineContext(ctx, state, results);
234
+ if (jobUpdates.length > 0) {
235
+ ctx.collector.capture("job-store", "job-updates", jobUpdates);
236
+ }
237
+ // Flush captured artifacts (non-blocking — failures never affect pipeline result)
238
+ await flushArtifacts(ctx);
169
239
  return {
170
240
  belowCritical: state.belowCritical,
171
241
  durationMs,
@@ -36,8 +36,12 @@ export async function runStep(step, ctx, state = {}) {
36
36
  if (canCache) {
37
37
  try {
38
38
  const inputs = step.cacheInputs(ctx);
39
+ const context = step.cacheContext?.(ctx);
39
40
  ctx.logger.debug(`[${step.name}] Cache inputs: ${inputs.length} files`);
40
- const key = await ctx.cache.computeKey(inputs);
41
+ if (context?.length) {
42
+ ctx.logger.debug(`[${step.name}] Cache context: ${context.join(", ")}`);
43
+ }
44
+ const key = await ctx.cache.computeKey(inputs, context);
41
45
  cacheKey = key;
42
46
  ctx.logger.debug(`[${step.name}] Cache key: ${key}`);
43
47
  const cached = await ctx.cache.lookup(step.name, key);
@@ -10,4 +10,5 @@ export declare class CalculateScoresStep implements PipelineStep {
10
10
  check(): ValidationIssue[];
11
11
  execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
12
12
  cacheInputs(ctx: AppContext): string[];
13
+ cacheContext(ctx: AppContext): string[];
13
14
  }
@@ -4,9 +4,11 @@
4
4
  * Calls calculateAndWriteScores() from pipeline/calculate-scores.ts with
5
5
  * typed options derived from AppContext. No env bridge needed.
6
6
  */
7
+ import { existsSync } from "node:fs";
7
8
  import { join } from "path";
8
9
  import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
9
10
  import { getStepInputPaths } from "../../pipeline/cache.js";
11
+ import { buildCacheContext } from "../cache-context.js";
10
12
  import { calculateAndWriteScores } from "../../pipeline/calculate-scores.js";
11
13
  import { checkResultsExist, checkScoreSummaryValid, } from "../../pipeline/checks.js";
12
14
  import { resultsFileForMode } from "../../pipeline/eval-constants.js";
@@ -118,6 +120,14 @@ export class CalculateScoresStep {
118
120
  if (belowCritical.length > 0) {
119
121
  state.belowCritical = belowCritical;
120
122
  }
123
+ // Capture score artifacts
124
+ const resultsDir = join(ctx.config.rootDir, "results", "latest");
125
+ for (const file of ["score-summary.json", "grader-judgments.json"]) {
126
+ const filePath = join(resultsDir, file);
127
+ if (existsSync(filePath)) {
128
+ ctx.collector.captureFile("calculate-scores", file.replace(".json", ""), filePath);
129
+ }
130
+ }
121
131
  const criticalSuffix = belowCritical.length > 0
122
132
  ? ` (${belowCritical.length} area(s) below critical threshold: ${belowCritical.join(", ")})`
123
133
  : "";
@@ -130,4 +140,7 @@ export class CalculateScoresStep {
130
140
  cacheInputs(ctx) {
131
141
  return getStepInputPaths(ctx.config.rootDir, "calculate-scores");
132
142
  }
143
+ cacheContext(ctx) {
144
+ return buildCacheContext(ctx.config);
145
+ }
133
146
  }
@@ -52,11 +52,20 @@ export class CallbackStep {
52
52
  }
53
53
  // Deliver callback — read reportId from pipeline state (set by PublishReportStep)
54
54
  ctx.logger.info(`Delivering results to ${this.callback.url}`);
55
- const result = await deliverCallback(this.callback, {
55
+ const callbackPayload = {
56
56
  deliveredAt: new Date().toISOString(),
57
57
  jobId: this.jobId,
58
58
  reportId: state.reportId,
59
59
  summary,
60
+ };
61
+ // Capture callback payload (Tier 2 — no secrets: headers are NOT captured)
62
+ ctx.collector.capture("callback", "callback-payload", callbackPayload);
63
+ const result = await deliverCallback(this.callback, callbackPayload);
64
+ // Capture callback response status (not the body — that's the user's system)
65
+ ctx.collector.capture("callback", "callback-response", {
66
+ ok: result.ok,
67
+ attempts: result.attempts,
68
+ error: result.error,
60
69
  });
61
70
  if (result.ok) {
62
71
  return {
@@ -5,7 +5,7 @@
5
5
  * inlined directly from the former pipeline/steps/compare-step.ts.
6
6
  * This is an optional step — failure doesn't stop the pipeline.
7
7
  */
8
- import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
8
+ import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
9
9
  import { join, resolve } from "path";
10
10
  import { compare } from "../../pipeline/compare.js";
11
11
  export class CompareStep {
@@ -65,9 +65,12 @@ export class CompareStep {
65
65
  ? { noiseThreshold: ctx.config.compareThreshold }
66
66
  : undefined;
67
67
  const report = compare(baseline, experiment, options);
68
- // Write report
69
- const reportPath = resolve(rootDir, "results", "latest", "comparison-report.json");
68
+ // Write report to outputDir (respects --output-dir)
69
+ mkdirSync(ctx.config.outputDir, { recursive: true });
70
+ const reportPath = resolve(ctx.config.outputDir, "comparison-report.json");
70
71
  writeFileSync(reportPath, JSON.stringify(report, null, 2));
72
+ // Capture comparison report
73
+ ctx.collector.captureFile("compare", "comparison-report", reportPath);
71
74
  // Build summary
72
75
  const improved = report.improved.length;
73
76
  const regressed = report.regressed.length;
@@ -4,7 +4,7 @@
4
4
  * Calls pure functions from pipeline/discovery-report.ts directly.
5
5
  * Optional step — failure doesn't stop the pipeline.
6
6
  */
7
- import { existsSync, readFileSync, writeFileSync } from "fs";
7
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
8
8
  import { resolve } from "path";
9
9
  import { formatDiscoveryMarkdown, generateDiscoveryReport, } from "../../pipeline/discovery-report.js";
10
10
  export class DiscoveryReportStep {
@@ -34,7 +34,11 @@ export class DiscoveryReportStep {
34
34
  }
35
35
  const report = generateDiscoveryReport(scoreSummary, ctx.config.areas);
36
36
  const md = formatDiscoveryMarkdown(report);
37
- writeFileSync(resolve(root, "results", "latest", "discovery-report.md"), md);
37
+ // Write to outputDir (respects --output-dir)
38
+ mkdirSync(ctx.config.outputDir, { recursive: true });
39
+ const discoveryPath = resolve(ctx.config.outputDir, "discovery-report.md");
40
+ writeFileSync(discoveryPath, md);
41
+ ctx.collector.captureFile("discovery-report", "discovery-report", discoveryPath);
38
42
  console.log(md);
39
43
  const invisible = report.invisibleDocs.length;
40
44
  const f1 = report.overall.avgF1.toFixed(2);
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Shell delegation for the fetch-docs step.
3
+ *
4
+ * Isolates the execSync call so it can be replaced when the pipeline
5
+ * fully migrates to the DocFetcher port.
6
+ */
7
+ export interface ShellResult {
8
+ ok: boolean;
9
+ error?: string;
10
+ }
11
+ /**
12
+ * Run `pnpm fetch-docs` via shell.
13
+ *
14
+ * Returns a result object instead of throwing so the step can
15
+ * handle the failure uniformly.
16
+ */
17
+ export declare function runFetchDocsShell(rootDir: string, source?: string): ShellResult;
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Shell delegation for the fetch-docs step.
3
+ *
4
+ * Isolates the execSync call so it can be replaced when the pipeline
5
+ * fully migrates to the DocFetcher port.
6
+ */
7
+ import { execSync } from "child_process";
8
/**
 * Characters permitted in a `--source` value. The value is interpolated
 * into a shell command line, so anything that could be interpreted by the
 * shell (whitespace, quotes, `;`, `|`, `&`, `$`, backticks, …) is rejected,
 * as is a leading `-` that would be parsed as another option.
 */
const SAFE_FETCH_DOCS_SOURCE = /^[\w.\/@][\w.\/@:=+-]*$/;
/**
 * Run `pnpm fetch-docs` via shell.
 *
 * Returns a result object instead of throwing so the step can
 * handle the failure uniformly.
 *
 * @param rootDir - Working directory in which the command is executed.
 * @param source - Optional value forwarded as `--source <source>`. A value
 *   containing characters outside the allowlist yields `{ ok: false }`
 *   without running any command.
 * @returns `{ ok: true }` on success, otherwise `{ ok: false, error }` with
 *   the failure message.
 */
export function runFetchDocsShell(rootDir, source) {
    // Validate before building the command line — never hand untrusted
    // text to the shell.
    if (source && !SAFE_FETCH_DOCS_SOURCE.test(String(source))) {
        return {
            ok: false,
            error: `Invalid source value for fetch-docs: ${JSON.stringify(source)}`,
        };
    }
    try {
        const sourceArg = source ? ` --source ${source}` : "";
        execSync(`pnpm fetch-docs${sourceArg}`, {
            cwd: rootDir,
            env: process.env,
            stdio: "inherit",
        });
        return { ok: true };
    }
    catch (err) {
        return {
            ok: false,
            error: err instanceof Error ? err.message : String(err),
        };
    }
}
@@ -16,4 +16,5 @@ export declare class FetchDocsStep implements PipelineStep {
16
16
  check(): ValidationIssue[];
17
17
  execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
18
18
  cacheInputs(ctx: AppContext): string[];
19
+ cacheContext(ctx: AppContext): string[];
19
20
  }