@sanity/ailf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (530)
  1. package/README.md +89 -0
  2. package/bin/ailf.js +64 -0
  3. package/canonical/grader-references/README.md +88 -0
  4. package/canonical/grader-references/groq.yaml +234 -0
  5. package/canonical/grader-references/studio-setup.yaml +275 -0
  6. package/canonical/reference-solutions/.gitkeep +1 -0
  7. package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
  8. package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
  9. package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
  10. package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
  11. package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
  12. package/canonical/reference-solutions/groq/joins-references.ts +300 -0
  13. package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
  14. package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
  15. package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
  16. package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
  17. package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
  18. package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
  19. package/config/bigquery/README.md +74 -0
  20. package/config/bigquery/views/area_scores.sql +87 -0
  21. package/config/bigquery/views/reports.sql +49 -0
  22. package/config/features.yaml +116 -0
  23. package/config/models.yaml +115 -0
  24. package/config/prompts.yaml +75 -0
  25. package/config/rubrics.yaml +62 -0
  26. package/config/schedules.yaml +43 -0
  27. package/config/sinks.yaml +54 -0
  28. package/config/sources.yaml +51 -0
  29. package/config/thresholds.yaml +49 -0
  30. package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
  31. package/dist/_vendor/ailf-core/examples/index.js +285 -0
  32. package/dist/_vendor/ailf-core/index.d.ts +17 -0
  33. package/dist/_vendor/ailf-core/index.js +17 -0
  34. package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
  35. package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
  36. package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
  37. package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
  38. package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
  39. package/dist/_vendor/ailf-core/ports/context.js +14 -0
  40. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
  41. package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
  42. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
  43. package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
  44. package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
  45. package/dist/_vendor/ailf-core/ports/index.js +7 -0
  46. package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
  47. package/dist/_vendor/ailf-core/ports/logger.js +11 -0
  48. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
  49. package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
  50. package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
  51. package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
  52. package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
  53. package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
  54. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
  55. package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
  56. package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
  57. package/dist/_vendor/ailf-core/schemas/index.js +16 -0
  58. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
  59. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
  60. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
  61. package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
  62. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
  63. package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
  64. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
  65. package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
  66. package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
  67. package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
  68. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
  69. package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
  70. package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
  71. package/dist/_vendor/ailf-core/services/index.js +12 -0
  72. package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
  73. package/dist/_vendor/ailf-core/services/scoring.js +222 -0
  74. package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
  75. package/dist/_vendor/ailf-core/types/index.js +21 -0
  76. package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
  77. package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
  78. package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
  79. package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
  80. package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
  81. package/dist/_vendor/ailf-shared/document-ref.js +1 -0
  82. package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
  83. package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
  84. package/dist/_vendor/ailf-shared/index.d.ts +16 -0
  85. package/dist/_vendor/ailf-shared/index.js +16 -0
  86. package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
  87. package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
  88. package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
  89. package/dist/_vendor/ailf-shared/score-grades.js +23 -0
  90. package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
  91. package/dist/adapters/cache/content-lake-cache.js +59 -0
  92. package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
  93. package/dist/adapters/cache/filesystem-cache.js +54 -0
  94. package/dist/adapters/cache/index.d.ts +2 -0
  95. package/dist/adapters/cache/index.js +2 -0
  96. package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
  97. package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
  98. package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
  99. package/dist/adapters/config-sources/file-config-adapter.js +96 -0
  100. package/dist/adapters/config-sources/index.d.ts +2 -0
  101. package/dist/adapters/config-sources/index.js +2 -0
  102. package/dist/adapters/doc-fetchers/index.d.ts +1 -0
  103. package/dist/adapters/doc-fetchers/index.js +1 -0
  104. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
  105. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
  106. package/dist/adapters/eval-runners/index.d.ts +1 -0
  107. package/dist/adapters/eval-runners/index.js +1 -0
  108. package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
  109. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
  110. package/dist/adapters/index.d.ts +12 -0
  111. package/dist/adapters/index.js +12 -0
  112. package/dist/adapters/loggers/console-logger.d.ts +22 -0
  113. package/dist/adapters/loggers/console-logger.js +54 -0
  114. package/dist/adapters/loggers/index.d.ts +9 -0
  115. package/dist/adapters/loggers/index.js +9 -0
  116. package/dist/adapters/loggers/json-logger.d.ts +18 -0
  117. package/dist/adapters/loggers/json-logger.js +33 -0
  118. package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
  119. package/dist/adapters/loggers/quiet-logger.js +30 -0
  120. package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
  121. package/dist/adapters/task-sources/composite-task-source.js +59 -0
  122. package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
  123. package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
  124. package/dist/adapters/task-sources/index.d.ts +7 -0
  125. package/dist/adapters/task-sources/index.js +7 -0
  126. package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
  127. package/dist/adapters/task-sources/repo-schemas.js +234 -0
  128. package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
  129. package/dist/adapters/task-sources/repo-task-source.js +104 -0
  130. package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
  131. package/dist/adapters/task-sources/repo-trigger.js +153 -0
  132. package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
  133. package/dist/adapters/task-sources/repo-validation.js +164 -0
  134. package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
  135. package/dist/adapters/task-sources/yaml-task-source.js +136 -0
  136. package/dist/agent-observer/agentic-provider.d.ts +132 -0
  137. package/dist/agent-observer/agentic-provider.js +983 -0
  138. package/dist/agent-observer/classifier.d.ts +62 -0
  139. package/dist/agent-observer/classifier.js +269 -0
  140. package/dist/agent-observer/index.d.ts +7 -0
  141. package/dist/agent-observer/index.js +4 -0
  142. package/dist/agent-observer/pricing.d.ts +35 -0
  143. package/dist/agent-observer/pricing.js +82 -0
  144. package/dist/agent-observer/provider.d.ts +77 -0
  145. package/dist/agent-observer/provider.js +151 -0
  146. package/dist/agent-observer/proxy.d.ts +91 -0
  147. package/dist/agent-observer/proxy.js +321 -0
  148. package/dist/agent-observer/test-imports.d.ts +7 -0
  149. package/dist/agent-observer/test-imports.js +185 -0
  150. package/dist/agent-observer/types.d.ts +137 -0
  151. package/dist/agent-observer/types.js +16 -0
  152. package/dist/assertions/source-isolation.d.ts +72 -0
  153. package/dist/assertions/source-isolation.js +117 -0
  154. package/dist/cli.d.ts +24 -0
  155. package/dist/cli.js +199 -0
  156. package/dist/commands/agent-report.d.ts +5 -0
  157. package/dist/commands/agent-report.js +69 -0
  158. package/dist/commands/baseline.d.ts +9 -0
  159. package/dist/commands/baseline.js +141 -0
  160. package/dist/commands/cache.d.ts +13 -0
  161. package/dist/commands/cache.js +135 -0
  162. package/dist/commands/calculate-scores.d.ts +8 -0
  163. package/dist/commands/calculate-scores.js +48 -0
  164. package/dist/commands/compare.d.ts +8 -0
  165. package/dist/commands/compare.js +120 -0
  166. package/dist/commands/completion.d.ts +18 -0
  167. package/dist/commands/completion.js +260 -0
  168. package/dist/commands/coverage-audit.d.ts +7 -0
  169. package/dist/commands/coverage-audit.js +40 -0
  170. package/dist/commands/discovery-report.d.ts +10 -0
  171. package/dist/commands/discovery-report.js +44 -0
  172. package/dist/commands/eval.d.ts +9 -0
  173. package/dist/commands/eval.js +35 -0
  174. package/dist/commands/explain-handler.d.ts +34 -0
  175. package/dist/commands/explain-handler.js +719 -0
  176. package/dist/commands/fetch-docs.d.ts +8 -0
  177. package/dist/commands/fetch-docs.js +128 -0
  178. package/dist/commands/generate-configs.d.ts +8 -0
  179. package/dist/commands/generate-configs.js +46 -0
  180. package/dist/commands/grader/index.d.ts +11 -0
  181. package/dist/commands/grader/index.js +118 -0
  182. package/dist/commands/init.d.ts +19 -0
  183. package/dist/commands/init.js +150 -0
  184. package/dist/commands/interactive.d.ts +12 -0
  185. package/dist/commands/interactive.js +238 -0
  186. package/dist/commands/lookup-doc.d.ts +15 -0
  187. package/dist/commands/lookup-doc.js +84 -0
  188. package/dist/commands/measure-retrieval.d.ts +5 -0
  189. package/dist/commands/measure-retrieval.js +65 -0
  190. package/dist/commands/pipeline-action.d.ts +71 -0
  191. package/dist/commands/pipeline-action.js +305 -0
  192. package/dist/commands/pipeline.d.ts +62 -0
  193. package/dist/commands/pipeline.js +53 -0
  194. package/dist/commands/pr-comment.d.ts +8 -0
  195. package/dist/commands/pr-comment.js +47 -0
  196. package/dist/commands/publish.d.ts +26 -0
  197. package/dist/commands/publish.js +253 -0
  198. package/dist/commands/readiness-report.d.ts +10 -0
  199. package/dist/commands/readiness-report.js +104 -0
  200. package/dist/commands/shared/options.d.ts +29 -0
  201. package/dist/commands/shared/options.js +57 -0
  202. package/dist/commands/update-quality-scores.d.ts +5 -0
  203. package/dist/commands/update-quality-scores.js +20 -0
  204. package/dist/commands/validate-tasks.d.ts +16 -0
  205. package/dist/commands/validate-tasks.js +93 -0
  206. package/dist/commands/validate.d.ts +9 -0
  207. package/dist/commands/validate.js +73 -0
  208. package/dist/commands/webhook-server.d.ts +5 -0
  209. package/dist/commands/webhook-server.js +30 -0
  210. package/dist/commands/weekly-digest.d.ts +10 -0
  211. package/dist/commands/weekly-digest.js +104 -0
  212. package/dist/composition-root.d.ts +26 -0
  213. package/dist/composition-root.js +107 -0
  214. package/dist/interpolate.d.ts +26 -0
  215. package/dist/interpolate.js +70 -0
  216. package/dist/job-store.d.ts +104 -0
  217. package/dist/job-store.js +188 -0
  218. package/dist/lib/agent-behavior-report.d.ts +8 -0
  219. package/dist/lib/agent-behavior-report.js +185 -0
  220. package/dist/lib/baseline.d.ts +19 -0
  221. package/dist/lib/baseline.js +153 -0
  222. package/dist/lib/calculate-scores.d.ts +23 -0
  223. package/dist/lib/calculate-scores.js +42 -0
  224. package/dist/lib/compare.d.ts +18 -0
  225. package/dist/lib/compare.js +170 -0
  226. package/dist/lib/coverage-audit.d.ts +4 -0
  227. package/dist/lib/coverage-audit.js +42 -0
  228. package/dist/lib/discovery-report.d.ts +13 -0
  229. package/dist/lib/discovery-report.js +57 -0
  230. package/dist/lib/fetch-docs.d.ts +30 -0
  231. package/dist/lib/fetch-docs.js +171 -0
  232. package/dist/lib/generate-configs.d.ts +25 -0
  233. package/dist/lib/generate-configs.js +42 -0
  234. package/dist/lib/grader-api.d.ts +21 -0
  235. package/dist/lib/grader-api.js +34 -0
  236. package/dist/lib/grader-compare.d.ts +19 -0
  237. package/dist/lib/grader-compare.js +91 -0
  238. package/dist/lib/grader-consistency.d.ts +27 -0
  239. package/dist/lib/grader-consistency.js +79 -0
  240. package/dist/lib/grader-sensitivity.d.ts +19 -0
  241. package/dist/lib/grader-sensitivity.js +75 -0
  242. package/dist/lib/grader-validate.d.ts +19 -0
  243. package/dist/lib/grader-validate.js +78 -0
  244. package/dist/lib/measure-retrieval.d.ts +14 -0
  245. package/dist/lib/measure-retrieval.js +71 -0
  246. package/dist/lib/pr-comment.d.ts +16 -0
  247. package/dist/lib/pr-comment.js +28 -0
  248. package/dist/lib/readiness-report.d.ts +13 -0
  249. package/dist/lib/readiness-report.js +108 -0
  250. package/dist/lib/webhook-server.d.ts +11 -0
  251. package/dist/lib/webhook-server.js +24 -0
  252. package/dist/lib/weekly-digest.d.ts +24 -0
  253. package/dist/lib/weekly-digest.js +148 -0
  254. package/dist/orchestration/build-app-context.d.ts +27 -0
  255. package/dist/orchestration/build-app-context.js +81 -0
  256. package/dist/orchestration/build-step-sequence.d.ts +15 -0
  257. package/dist/orchestration/build-step-sequence.js +84 -0
  258. package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
  259. package/dist/orchestration/config-to-source-overrides.js +28 -0
  260. package/dist/orchestration/env-bridge.d.ts +21 -0
  261. package/dist/orchestration/env-bridge.js +66 -0
  262. package/dist/orchestration/index.d.ts +11 -0
  263. package/dist/orchestration/index.js +11 -0
  264. package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
  265. package/dist/orchestration/pipeline-orchestrator.js +153 -0
  266. package/dist/orchestration/step-runner.d.ts +20 -0
  267. package/dist/orchestration/step-runner.js +88 -0
  268. package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
  269. package/dist/orchestration/steps/calculate-scores-step.js +95 -0
  270. package/dist/orchestration/steps/callback-step.d.ts +24 -0
  271. package/dist/orchestration/steps/callback-step.js +76 -0
  272. package/dist/orchestration/steps/compare-step.d.ts +14 -0
  273. package/dist/orchestration/steps/compare-step.js +92 -0
  274. package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
  275. package/dist/orchestration/steps/discovery-report-step.js +55 -0
  276. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  277. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  278. package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
  279. package/dist/orchestration/steps/fetch-docs-step.js +135 -0
  280. package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
  281. package/dist/orchestration/steps/gap-analysis-step.js +136 -0
  282. package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
  283. package/dist/orchestration/steps/generate-configs-step.js +85 -0
  284. package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
  285. package/dist/orchestration/steps/grader-consistency-step.js +64 -0
  286. package/dist/orchestration/steps/index.d.ts +19 -0
  287. package/dist/orchestration/steps/index.js +19 -0
  288. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
  289. package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
  290. package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
  291. package/dist/orchestration/steps/publish-report-step.js +216 -0
  292. package/dist/orchestration/steps/readiness-step.d.ts +13 -0
  293. package/dist/orchestration/steps/readiness-step.js +91 -0
  294. package/dist/orchestration/steps/report-step.d.ts +12 -0
  295. package/dist/orchestration/steps/report-step.js +49 -0
  296. package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
  297. package/dist/orchestration/steps/run-eval-step.js +195 -0
  298. package/dist/orchestration/steps/validate-step.d.ts +12 -0
  299. package/dist/orchestration/steps/validate-step.js +41 -0
  300. package/dist/pipeline/agent-behavior-report.d.ts +53 -0
  301. package/dist/pipeline/agent-behavior-report.js +132 -0
  302. package/dist/pipeline/attribution.d.ts +47 -0
  303. package/dist/pipeline/attribution.js +226 -0
  304. package/dist/pipeline/baseline.d.ts +37 -0
  305. package/dist/pipeline/baseline.js +141 -0
  306. package/dist/pipeline/cache.d.ts +101 -0
  307. package/dist/pipeline/cache.js +283 -0
  308. package/dist/pipeline/calculate-scores.d.ts +102 -0
  309. package/dist/pipeline/calculate-scores.js +1128 -0
  310. package/dist/pipeline/callback-delivery.d.ts +50 -0
  311. package/dist/pipeline/callback-delivery.js +89 -0
  312. package/dist/pipeline/checks.d.ts +39 -0
  313. package/dist/pipeline/checks.js +280 -0
  314. package/dist/pipeline/classify-url.d.ts +61 -0
  315. package/dist/pipeline/classify-url.js +93 -0
  316. package/dist/pipeline/compare.d.ts +31 -0
  317. package/dist/pipeline/compare.js +208 -0
  318. package/dist/pipeline/coverage-audit.d.ts +39 -0
  319. package/dist/pipeline/coverage-audit.js +165 -0
  320. package/dist/pipeline/degradations.d.ts +85 -0
  321. package/dist/pipeline/degradations.js +242 -0
  322. package/dist/pipeline/discovery-report.d.ts +55 -0
  323. package/dist/pipeline/discovery-report.js +178 -0
  324. package/dist/pipeline/eval-constants.d.ts +68 -0
  325. package/dist/pipeline/eval-constants.js +111 -0
  326. package/dist/pipeline/eval-fingerprint.d.ts +66 -0
  327. package/dist/pipeline/eval-fingerprint.js +175 -0
  328. package/dist/pipeline/expand-tasks.d.ts +220 -0
  329. package/dist/pipeline/expand-tasks.js +421 -0
  330. package/dist/pipeline/failure-modes.d.ts +46 -0
  331. package/dist/pipeline/failure-modes.js +348 -0
  332. package/dist/pipeline/fetch-url-content.d.ts +44 -0
  333. package/dist/pipeline/fetch-url-content.js +93 -0
  334. package/dist/pipeline/gap-analysis.d.ts +48 -0
  335. package/dist/pipeline/gap-analysis.js +231 -0
  336. package/dist/pipeline/generate-configs.d.ts +72 -0
  337. package/dist/pipeline/generate-configs.js +395 -0
  338. package/dist/pipeline/grader-api.d.ts +49 -0
  339. package/dist/pipeline/grader-api.js +200 -0
  340. package/dist/pipeline/grader-compare-runner.d.ts +44 -0
  341. package/dist/pipeline/grader-compare-runner.js +301 -0
  342. package/dist/pipeline/grader-comparison.d.ts +111 -0
  343. package/dist/pipeline/grader-comparison.js +161 -0
  344. package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
  345. package/dist/pipeline/grader-consistency-runner.js +270 -0
  346. package/dist/pipeline/grader-consistency.d.ts +103 -0
  347. package/dist/pipeline/grader-consistency.js +146 -0
  348. package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
  349. package/dist/pipeline/grader-sensitivity-runner.js +282 -0
  350. package/dist/pipeline/grader-sensitivity.d.ts +94 -0
  351. package/dist/pipeline/grader-sensitivity.js +144 -0
  352. package/dist/pipeline/grader-validate-runner.d.ts +38 -0
  353. package/dist/pipeline/grader-validate-runner.js +229 -0
  354. package/dist/pipeline/grader-validation.d.ts +107 -0
  355. package/dist/pipeline/grader-validation.js +169 -0
  356. package/dist/pipeline/map-request-to-config.d.ts +19 -0
  357. package/dist/pipeline/map-request-to-config.js +80 -0
  358. package/dist/pipeline/measure-retrieval.d.ts +59 -0
  359. package/dist/pipeline/measure-retrieval.js +111 -0
  360. package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
  361. package/dist/pipeline/mirror-repo-tasks.js +350 -0
  362. package/dist/pipeline/plan-format.d.ts +33 -0
  363. package/dist/pipeline/plan-format.js +202 -0
  364. package/dist/pipeline/plan.d.ts +169 -0
  365. package/dist/pipeline/plan.js +708 -0
  366. package/dist/pipeline/pr-comment.d.ts +19 -0
  367. package/dist/pipeline/pr-comment.js +502 -0
  368. package/dist/pipeline/probe.d.ts +52 -0
  369. package/dist/pipeline/probe.js +390 -0
  370. package/dist/pipeline/provenance.d.ts +47 -0
  371. package/dist/pipeline/provenance.js +146 -0
  372. package/dist/pipeline/readiness-report.d.ts +87 -0
  373. package/dist/pipeline/readiness-report.js +205 -0
  374. package/dist/pipeline/release-classification.d.ts +54 -0
  375. package/dist/pipeline/release-classification.js +238 -0
  376. package/dist/pipeline/release-report.d.ts +37 -0
  377. package/dist/pipeline/release-report.js +222 -0
  378. package/dist/pipeline/repo-eval-comment.d.ts +37 -0
  379. package/dist/pipeline/repo-eval-comment.js +165 -0
  380. package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
  381. package/dist/pipeline/repo-threshold-evaluator.js +162 -0
  382. package/dist/pipeline/resolve-mappings.d.ts +35 -0
  383. package/dist/pipeline/resolve-mappings.js +72 -0
  384. package/dist/pipeline/retrieval-metrics.d.ts +39 -0
  385. package/dist/pipeline/retrieval-metrics.js +136 -0
  386. package/dist/pipeline/reverse-mapping.d.ts +67 -0
  387. package/dist/pipeline/reverse-mapping.js +88 -0
  388. package/dist/pipeline/schemas.d.ts +9 -0
  389. package/dist/pipeline/schemas.js +9 -0
  390. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  391. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  392. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  393. package/dist/pipeline/steps/compare-step.js +90 -0
  394. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  395. package/dist/pipeline/steps/eval-step.js +347 -0
  396. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  397. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  398. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  399. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  400. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  401. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  402. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  403. package/dist/pipeline/steps/publish-report-step.js +243 -0
  404. package/dist/pipeline/steps/report-step.d.ts +13 -0
  405. package/dist/pipeline/steps/report-step.js +56 -0
  406. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  407. package/dist/pipeline/steps/update-scores-step.js +42 -0
  408. package/dist/pipeline/targeted-loo.d.ts +88 -0
  409. package/dist/pipeline/targeted-loo.js +203 -0
  410. package/dist/pipeline/thresholds.d.ts +27 -0
  411. package/dist/pipeline/thresholds.js +245 -0
  412. package/dist/pipeline/types.d.ts +10 -0
  413. package/dist/pipeline/types.js +10 -0
  414. package/dist/pipeline/validate.d.ts +67 -0
  415. package/dist/pipeline/validate.js +406 -0
  416. package/dist/pipeline/webhook-server.d.ts +37 -0
  417. package/dist/pipeline/webhook-server.js +133 -0
  418. package/dist/report-store.d.ts +84 -0
  419. package/dist/report-store.js +208 -0
  420. package/dist/sanity/client.d.ts +38 -0
  421. package/dist/sanity/client.js +86 -0
  422. package/dist/sanity/portable-text.d.ts +11 -0
  423. package/dist/sanity/portable-text.js +211 -0
  424. package/dist/sanity/queries.d.ts +133 -0
  425. package/dist/sanity/queries.js +300 -0
  426. package/dist/schedules/digest.d.ts +116 -0
  427. package/dist/schedules/digest.js +156 -0
  428. package/dist/schedules/index.d.ts +12 -0
  429. package/dist/schedules/index.js +10 -0
  430. package/dist/schedules/loader.d.ts +31 -0
  431. package/dist/schedules/loader.js +73 -0
  432. package/dist/schedules/schema.d.ts +9 -0
  433. package/dist/schedules/schema.js +9 -0
  434. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  435. package/dist/scripts/agent-behavior-report.js +315 -0
  436. package/dist/scripts/baseline.d.ts +43 -0
  437. package/dist/scripts/baseline.js +267 -0
  438. package/dist/scripts/calculate-scores.d.ts +166 -0
  439. package/dist/scripts/calculate-scores.js +1296 -0
  440. package/dist/scripts/compare.d.ts +22 -0
  441. package/dist/scripts/compare.js +334 -0
  442. package/dist/scripts/coverage-audit.d.ts +44 -0
  443. package/dist/scripts/coverage-audit.js +209 -0
  444. package/dist/scripts/debug-eval.d.ts +19 -0
  445. package/dist/scripts/debug-eval.js +73 -0
  446. package/dist/scripts/discovery-report.d.ts +58 -0
  447. package/dist/scripts/discovery-report.js +250 -0
  448. package/dist/scripts/fetch-docs.d.ts +35 -0
  449. package/dist/scripts/fetch-docs.js +472 -0
  450. package/dist/scripts/generate-configs.d.ts +66 -0
  451. package/dist/scripts/generate-configs.js +459 -0
  452. package/dist/scripts/grader-api.d.ts +27 -0
  453. package/dist/scripts/grader-api.js +206 -0
  454. package/dist/scripts/grader-compare.d.ts +22 -0
  455. package/dist/scripts/grader-compare.js +368 -0
  456. package/dist/scripts/grader-consistency.d.ts +20 -0
  457. package/dist/scripts/grader-consistency.js +313 -0
  458. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  459. package/dist/scripts/grader-sensitivity.js +354 -0
  460. package/dist/scripts/grader-validate.d.ts +19 -0
  461. package/dist/scripts/grader-validate.js +267 -0
  462. package/dist/scripts/measure-retrieval.d.ts +10 -0
  463. package/dist/scripts/measure-retrieval.js +145 -0
  464. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
  465. package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
  466. package/dist/scripts/pipeline.d.ts +76 -0
  467. package/dist/scripts/pipeline.js +1031 -0
  468. package/dist/scripts/pr-comment.d.ts +10 -0
  469. package/dist/scripts/pr-comment.js +510 -0
  470. package/dist/scripts/readiness-report.d.ts +88 -0
  471. package/dist/scripts/readiness-report.js +342 -0
  472. package/dist/scripts/update-quality-scores.d.ts +15 -0
  473. package/dist/scripts/update-quality-scores.js +184 -0
  474. package/dist/scripts/validate-task-sources.d.ts +21 -0
  475. package/dist/scripts/validate-task-sources.js +210 -0
  476. package/dist/scripts/validate.d.ts +13 -0
  477. package/dist/scripts/validate.js +79 -0
  478. package/dist/scripts/webhook-server.d.ts +26 -0
  479. package/dist/scripts/webhook-server.js +147 -0
  480. package/dist/scripts/weekly-digest.d.ts +24 -0
  481. package/dist/scripts/weekly-digest.js +144 -0
  482. package/dist/sinks/bigquery/index.d.ts +131 -0
  483. package/dist/sinks/bigquery/index.js +222 -0
  484. package/dist/sinks/format-slack.d.ts +64 -0
  485. package/dist/sinks/format-slack.js +306 -0
  486. package/dist/sinks/index.d.ts +23 -0
  487. package/dist/sinks/index.js +18 -0
  488. package/dist/sinks/loader.d.ts +18 -0
  489. package/dist/sinks/loader.js +82 -0
  490. package/dist/sinks/retry.d.ts +24 -0
  491. package/dist/sinks/retry.js +52 -0
  492. package/dist/sinks/schema.d.ts +9 -0
  493. package/dist/sinks/schema.js +9 -0
  494. package/dist/sinks/slack/format.d.ts +65 -0
  495. package/dist/sinks/slack/format.js +327 -0
  496. package/dist/sinks/slack/index.d.ts +27 -0
  497. package/dist/sinks/slack/index.js +78 -0
  498. package/dist/sinks/slack-sink.d.ts +27 -0
  499. package/dist/sinks/slack-sink.js +78 -0
  500. package/dist/sinks/types.d.ts +59 -0
  501. package/dist/sinks/types.js +44 -0
  502. package/dist/sinks/webhook/index.d.ts +19 -0
  503. package/dist/sinks/webhook/index.js +50 -0
  504. package/dist/sinks/webhook-sink.d.ts +19 -0
  505. package/dist/sinks/webhook-sink.js +50 -0
  506. package/dist/sources.d.ts +104 -0
  507. package/dist/sources.js +292 -0
  508. package/dist/webhook/budget.d.ts +42 -0
  509. package/dist/webhook/budget.js +60 -0
  510. package/dist/webhook/debounce.d.ts +67 -0
  511. package/dist/webhook/debounce.js +76 -0
  512. package/dist/webhook/dispatch.d.ts +45 -0
  513. package/dist/webhook/dispatch.js +84 -0
  514. package/dist/webhook/eval-request-handler.d.ts +87 -0
  515. package/dist/webhook/eval-request-handler.js +181 -0
  516. package/dist/webhook/handler.d.ts +88 -0
  517. package/dist/webhook/handler.js +203 -0
  518. package/dist/webhook/index.d.ts +17 -0
  519. package/dist/webhook/index.js +12 -0
  520. package/dist/webhook/types.d.ts +109 -0
  521. package/dist/webhook/types.js +10 -0
  522. package/package.json +72 -0
  523. package/tasks/.expanded.agentic.yaml +51 -0
  524. package/tasks/.expanded.yaml +66 -0
  525. package/tasks/frameworks.yaml +98 -0
  526. package/tasks/functions.yaml +51 -0
  527. package/tasks/groq.yaml +216 -0
  528. package/tasks/nextjs-live.yaml +62 -0
  529. package/tasks/studio-setup.yaml +111 -0
  530. package/tasks/visual-editing.yaml +120 -0
@@ -0,0 +1,74 @@
1
+ /**
2
+ * Pipeline step: Measure grader consistency via replicated grading.
3
+ *
4
+ * This step is OPTIONAL — it only runs when --grader-replications N is passed.
5
+ * It re-runs grading assertions N additional times on the same model responses
6
+ * and measures score variance across replications.
7
+ *
8
+ * Preconditions: eval-results.json exists (model responses to re-grade)
9
+ * Postconditions: grader-consistency.json written to results/latest/
10
+ *
11
+ * Not cached: Each run involves fresh API calls to the grader model.
12
+ * The whole point is to measure variance, so caching would defeat the purpose.
13
+ */
14
+ import { execSync } from "child_process";
15
+ import { existsSync } from "fs";
16
+ import { dirname, resolve } from "path";
17
+ import { fileURLToPath } from "url";
18
+ import { checkResultsExist } from "../checks.js";
19
+ import { RESULTS_FILES } from "./eval-step.js";
20
+ const __dirname = dirname(fileURLToPath(import.meta.url));
21
+ const ROOT = resolve(__dirname, "..", "..", "..");
22
+ /**
23
+ * Run grader consistency analysis.
24
+ *
25
+ * @param replications Number of additional grading replications (default: 5)
26
+ * @param mode Eval mode — determines which results file to read
27
+ */
28
+ export function runGraderConsistency(replications = 5, mode = "baseline") {
29
+ const start = Date.now();
30
+ // For full mode, use baseline results for grader consistency analysis
31
+ const concreteMode = mode === "full" ? "baseline" : mode;
32
+ const resultsFile = RESULTS_FILES[concreteMode];
33
+ const resultsIssues = checkResultsExist(ROOT, resultsFile);
34
+ const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
35
+ if (resultsErrors.length > 0) {
36
+ return {
37
+ durationMs: Date.now() - start,
38
+ error: `Results missing: ${resultsErrors.map((e) => e.message).join("; ")}. Run eval first.`,
39
+ status: "failed",
40
+ };
41
+ }
42
+ // Execute
43
+ try {
44
+ execSync(`tsx src/lib/grader-consistency.ts --replications ${replications} --results ${resultsFile}`, {
45
+ cwd: ROOT,
46
+ env: process.env,
47
+ stdio: "inherit",
48
+ });
49
+ }
50
+ catch (err) {
51
+ const code = err !== null && typeof err === "object" && "status" in err
52
+ ? err.status
53
+ : 1;
54
+ return {
55
+ durationMs: Date.now() - start,
56
+ error: `grader-consistency failed with exit code ${code}`,
57
+ status: "failed",
58
+ };
59
+ }
60
+ // Postcondition: output file exists
61
+ const outputPath = resolve(ROOT, "results", "latest", "grader-consistency.json");
62
+ if (!existsSync(outputPath)) {
63
+ return {
64
+ durationMs: Date.now() - start,
65
+ error: "grader-consistency.json was not created",
66
+ status: "failed",
67
+ };
68
+ }
69
+ return {
70
+ durationMs: Date.now() - start,
71
+ status: "success",
72
+ summary: `Grader consistency analysis complete (${replications} replications)`,
73
+ };
74
+ }
@@ -0,0 +1,57 @@
1
+ /**
2
+ * Pipeline step: Publish evaluation report to the report store.
3
+ *
4
+ * This step wraps ScoreSummary + provenance into a Report, writes it to
5
+ * the Sanity Content Lake (system of record), optionally auto-compares
6
+ * against the most recent comparable baseline, and fans out to configured
7
+ * sinks (BigQuery, Slack, webhooks, etc.).
8
+ *
9
+ * Opt-in via `--publish` flag or `AILF_PUBLISH=1` environment variable.
10
+ * Without this flag, the pipeline writes results locally only (unchanged
11
+ * from current behavior).
12
+ *
13
+ * Design principles:
14
+ * - P1: Reports are immutable events (write-once to Sanity)
15
+ * - P5: Local-first (pipeline never fails because of a store write)
16
+ * - P6: Sinks are fire-and-forget (failures logged, not thrown)
17
+ *
18
+ * Preconditions: score-summary.json exists and is valid
19
+ * Postconditions: Report written to Sanity (best-effort), sinks notified
20
+ *
21
+ * @see docs/design-docs/report-store/architecture.md
22
+ * @see docs/design-docs/report-store/sink-architecture.md
23
+ */
24
+ import { type ProvenanceInput } from "../provenance.js";
25
+ import type { PromptfooUrlEntry, StepResult } from "../types.js";
26
+ export interface PublishOptions {
27
+ /** Whether this is a debug run; when true, `evalFingerprint` is dropped from the provenance input (debug runs don't store fingerprints) */
28
+ debug?: boolean;
29
+ /** Evaluation fingerprint override (computed externally by the pipeline); ignored when `debug` is true */
30
+ evalFingerprint?: string;
31
+ /** @deprecated Use `promptfooUrls` — kept for backward compatibility */
32
+ promptfooUrl?: string;
33
+ /** Per-mode Promptfoo share URLs */
34
+ promptfooUrls?: PromptfooUrlEntry[];
35
+ /** Override provenance input (for testing or custom workflows); spread last, so its fields win over the computed ones */
36
+ provenanceInput?: Partial<ProvenanceInput>;
37
+ /** Sanity dataset for report storage (independent of eval dataset); falls back to the AILF_REPORT_DATASET env var */
38
+ reportDataset?: string;
39
+ /** Sanity project ID for report storage (independent of eval project); falls back to the AILF_REPORT_PROJECT_ID env var */
40
+ reportProjectId?: string;
41
+ /** Sanity API token for writes; falls back to AILF_REPORT_SANITY_API_TOKEN, then SANITY_API_TOKEN */
42
+ sanityToken?: string;
43
+ /** Optional human-supplied tag, copied onto the report as `tag` */
44
+ tag?: string;
45
+ }
46
+ /**
47
+ * Run the publish-report pipeline step.
48
+ *
49
+ * 1. Read score-summary.json (a missing, invalid or unreadable file yields a failed StepResult)
50
+ * 2. Build provenance from pipeline context
51
+ * 3. Create Report with generated UUID v7 ID; its durationMs is measured from `pipelineStart` (epoch milliseconds)
52
+ * 4. Auto-compare against most recent comparable baseline
53
+ * 5. Write to Sanity Content Lake (system of record)
54
+ * 6. Fan out to configured sinks (fire-and-forget)
55
+ * 7. Return step result with report ID and sink outcomes
56
+ */
57
+ export declare function runPublishReport(pipelineStart: number, options?: PublishOptions): Promise<StepResult>;
@@ -0,0 +1,243 @@
1
+ /**
2
+ * Pipeline step: Publish evaluation report to the report store.
3
+ *
4
+ * This step wraps ScoreSummary + provenance into a Report, writes it to
5
+ * the Sanity Content Lake (system of record), optionally auto-compares
6
+ * against the most recent comparable baseline, and fans out to configured
7
+ * sinks (BigQuery, Slack, webhooks, etc.).
8
+ *
9
+ * Opt-in via `--publish` flag or `AILF_PUBLISH=1` environment variable.
10
+ * Without this flag, the pipeline writes results locally only (unchanged
11
+ * from current behavior).
12
+ *
13
+ * Design principles:
14
+ * - P1: Reports are immutable events (write-once to Sanity)
15
+ * - P5: Local-first (pipeline never fails because of a store write)
16
+ * - P6: Sinks are fire-and-forget (failures logged, not thrown)
17
+ *
18
+ * Preconditions: score-summary.json exists and is valid
19
+ * Postconditions: Report written to Sanity (best-effort), sinks notified
20
+ *
21
+ * @see docs/design-docs/report-store/architecture.md
22
+ * @see docs/design-docs/report-store/sink-architecture.md
23
+ */
24
+ import { readFileSync } from "fs";
25
+ import { dirname, resolve } from "path";
26
+ import { fileURLToPath } from "url";
27
+ import { checkScoreSummaryValid } from "../checks.js";
28
+ import { buildProvenance } from "../provenance.js";
29
+ import { generateReportId, ReportStore } from "../../report-store.js";
30
+ import { loadSinks } from "../../sinks/index.js";
31
+ import { withRetry } from "../../sinks/retry.js";
32
+ const __dirname = dirname(fileURLToPath(import.meta.url));
33
+ const ROOT = resolve(__dirname, "..", "..", "..");
/**
 * Run the publish-report pipeline step.
 *
 * 1. Read score-summary.json
 * 2. Build provenance from pipeline context
 * 3. Create Report with generated UUID v7 ID
 * 4. Auto-compare against most recent comparable baseline
 * 5. Write to Sanity Content Lake (system of record)
 * 6. Fan out to configured sinks (fire-and-forget)
 * 7. Return step result with report ID and sink outcomes
 *
 * Only a missing or unreadable score summary fails the step. Errors raised
 * while talking to the report store or the sinks are logged and degrade the
 * summary text instead of propagating, in line with design principles P5
 * (local-first) and P6 (fire-and-forget sinks).
 *
 * @param pipelineStart Epoch milliseconds at which the pipeline started; used
 *                      to compute the report's `durationMs`
 * @param options Publish options (token, dataset, project, tag, overrides)
 * @returns Step result; `summary` lists the store outcome, the baseline delta
 *          (when a comparison exists) and the sink success count
 */
export async function runPublishReport(pipelineStart, options = {}) {
    const start = Date.now();
    const describeError = (err) => (err instanceof Error ? err.message : String(err));
    // Precondition: score summary exists
    const summaryErrors = checkScoreSummaryValid(ROOT).filter((issue) => issue.severity === "error");
    if (summaryErrors.length > 0) {
        return {
            durationMs: Date.now() - start,
            error: `Score summary missing: ${summaryErrors.map((e) => e.message).join("; ")}`,
            status: "failed",
        };
    }
    // Read score summary
    let summary;
    try {
        const summaryPath = resolve(ROOT, "results", "latest", "score-summary.json");
        summary = JSON.parse(readFileSync(summaryPath, "utf-8"));
    }
    catch (err) {
        return {
            durationMs: Date.now() - start,
            error: `Failed to read score-summary.json: ${describeError(err)}`,
            status: "failed",
        };
    }
    // Build provenance
    const provenance = buildProvenance(buildProvenanceInput(summary, options));
    // Create report
    const now = new Date().toISOString();
    const reportId = generateReportId();
    const durationMs = Date.now() - pipelineStart;
    // Initialize report store — uses AILF_REPORT_* env vars, independent of
    // SANITY_DATASET/SANITY_PROJECT_ID which control doc evaluation.
    const token = options.sanityToken ??
        process.env.AILF_REPORT_SANITY_API_TOKEN ??
        process.env.SANITY_API_TOKEN;
    const dataset = options.reportDataset ?? process.env.AILF_REPORT_DATASET;
    const projectId = options.reportProjectId ?? process.env.AILF_REPORT_PROJECT_ID;
    let store;
    try {
        store = new ReportStore({ dataset, projectId, token });
    }
    catch (err) {
        // P5: a broken store configuration must not fail the pipeline
        console.warn(` ⚠️ Report store unavailable: ${describeError(err)}`);
    }
    // Auto-compare against most recent comparable baseline
    let comparison = null;
    if (store) {
        try {
            comparison = (await store.autoCompare(summary, provenance, now)) ?? null;
        }
        catch (err) {
            console.warn(` ⚠️ Baseline comparison failed: ${describeError(err)}`);
        }
    }
    const report = {
        comparison: comparison ?? undefined,
        completedAt: now,
        durationMs,
        id: reportId,
        provenance,
        summary,
        tag: options.tag,
    };
    // Write to Sanity (system of record — best-effort, P5)
    let sanityResult = null;
    if (store) {
        try {
            sanityResult = await store.write(report);
        }
        catch (err) {
            console.warn(` ⚠️ Sanity write failed: ${describeError(err)}`);
        }
    }
    // Load and run sinks (fire-and-forget, P6)
    let publishResult = { report, sinkResults: [] };
    try {
        publishResult = await runSinks(report);
    }
    catch (err) {
        console.warn(` ⚠️ Sink fan-out failed: ${describeError(err)}`);
    }
    // Build result summary
    const parts = [];
    if (sanityResult) {
        parts.push(`report:${sanityResult}`);
    }
    else {
        parts.push("Sanity write skipped (no token or unreachable)");
    }
    // Only report a delta when the comparison actually carries a numeric one
    const delta = comparison?.deltas?.overall;
    if (typeof delta === "number" && Number.isFinite(delta)) {
        const sign = delta >= 0 ? "+" : "";
        parts.push(`vs baseline: ${sign}${delta.toFixed(1)}`);
    }
    if (publishResult.sinkResults.length > 0) {
        const succeeded = publishResult.sinkResults.filter((r) => r.result.status === "success").length;
        const total = publishResult.sinkResults.length;
        parts.push(`sinks: ${succeeded}/${total}`);
    }
    return {
        durationMs: Date.now() - start,
        status: "success",
        summary: `Published — ${parts.join(", ")}`,
    };
}
124
+ // ---------------------------------------------------------------------------
125
+ // Provenance input builder
126
+ // ---------------------------------------------------------------------------
/**
 * Assemble provenance input from the score summary and pipeline context.
 *
 * Evaluated areas come from the summary's per-feature scores. The eval mode,
 * document IDs and task filter come from environment variables. Source
 * metadata prefers values recorded in the summary, then the environment, then
 * built-in defaults. `options.provenanceInput` is spread last and therefore
 * overrides any computed field.
 *
 * @param summary Parsed score-summary.json contents
 * @param options Publish options for the current run
 * @returns Input object for `buildProvenance`
 */
function buildProvenanceInput(summary, options) {
    // Comma-separated env value -> trimmed, non-empty entries (undefined when unset/empty)
    const parseList = (raw) => {
        if (!raw) {
            return undefined;
        }
        return raw
            .split(",")
            .map((entry) => entry.trim())
            .filter((entry) => entry.length > 0);
    };
    const recorded = summary.source;
    const baseUrl = recorded?.baseUrl ?? "https://www.sanity.io/docs";
    // Build source from summary metadata or env
    const source = {
        baseUrl,
        dataset: recorded?.dataset ?? process.env.SANITY_DATASET ?? "next",
        documentIds: [],
        llmsTxt: baseUrl + "/llms.txt",
        name: recorded?.name ?? "production",
        perspective: recorded?.perspective ?? process.env.SANITY_PERSPECTIVE,
        priorityDomain: "sanity.io",
        projectId: recorded?.projectId ?? process.env.SANITY_PROJECT_ID ?? "3do82whm",
        studioOrigin: "https://admin.sanity.io",
        urls: [],
    };
    // Debug runs evaluate only a subset of tests, so their fingerprint is
    // withheld to avoid misleading cache hits for full runs.
    const evalFingerprint = options.debug ? undefined : options.evalFingerprint;
    return {
        areas: summary.scores.map((score) => score.feature),
        evalFingerprint,
        mode: process.env.EVAL_MODE ?? "baseline",
        promptfooUrl: options.promptfooUrl,
        promptfooUrls: options.promptfooUrls,
        rootDir: ROOT,
        // Document IDs are set by pipeline.ts from --sanity-document flags
        sanityDocumentIds: parseList(process.env.SANITY_DOCUMENT_IDS),
        source,
        taskIds: parseList(process.env.EVAL_FILTER_TASKS),
        ...options.provenanceInput,
    };
}
181
+ // ---------------------------------------------------------------------------
182
+ // Sink runner
183
+ // ---------------------------------------------------------------------------
184
+ /**
185
+ * Fan out a report to all configured sinks.
186
+ *
187
+ * Sinks are loaded from config/sinks.yaml. Each sink is run with retry
188
+ * logic (3 attempts, exponential backoff). Failures are logged but never
189
+ * block the pipeline.
190
+ */
191
+ async function runSinks(report) {
192
+ const sinks = loadSinks();
193
+ const sinkResults = [];
194
+ if (sinks.length === 0) {
195
+ return { report, sinkResults };
196
+ }
197
+ // Health check all sinks first (non-blocking)
198
+ for (const sink of sinks) {
199
+ if (sink.healthCheck) {
200
+ try {
201
+ const health = await sink.healthCheck();
202
+ if (!health.healthy) {
203
+ console.warn(` ⚠️ Sink ${sink.name} health check failed: ${health.reason}`);
204
+ }
205
+ }
206
+ catch (err) {
207
+ console.warn(` ⚠️ Sink ${sink.name} health check error: ${err instanceof Error ? err.message : String(err)}`);
208
+ }
209
+ }
210
+ }
211
+ // Publish to all sinks in parallel (fire-and-forget with retries)
212
+ const settled = await Promise.allSettled(sinks.map(async (sink) => {
213
+ const result = await withRetry(() => sink.publish(report));
214
+ return { name: sink.name, result };
215
+ }));
216
+ for (const outcome of settled) {
217
+ if (outcome.status === "fulfilled") {
218
+ sinkResults.push(outcome.value);
219
+ const { name, result } = outcome.value;
220
+ if (result.status === "failed") {
221
+ console.warn(` ⚠️ Sink ${name} failed: ${result.error}`);
222
+ }
223
+ else if (result.status === "skipped") {
224
+ console.log(` ⏭️ Sink ${name} skipped: ${result.reason}`);
225
+ }
226
+ else {
227
+ console.log(` ✅ Sink ${name} delivered${result.detail ? ` (${result.detail})` : ""}`);
228
+ }
229
+ }
230
+ else {
231
+ // Promise.allSettled rejection — shouldn't happen with withRetry, but just in case
232
+ const error = outcome.reason instanceof Error
233
+ ? outcome.reason.message
234
+ : String(outcome.reason);
235
+ sinkResults.push({
236
+ name: "unknown",
237
+ result: { error, status: "failed" },
238
+ });
239
+ console.warn(` ⚠️ Sink delivery error: ${error}`);
240
+ }
241
+ }
242
+ return { report, sinkResults };
243
+ }
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Pipeline step: Generate PR comment / report from scores.
3
+ *
4
+ * Preconditions: score-summary.json exists
5
+ * Postconditions: report markdown written to `outputPath` (default: results/latest/pr-comment.md under the package root)
6
+ *
7
+ * Cache key: results/latest/score-summary.json
8
+ * Note: Report is always regenerated (not cached) since it may include
9
+ * dynamic data like Promptfoo URLs. The cache infrastructure is wired up
10
+ * for consistency but reports are cheap to generate.
11
+ */
12
+ import type { StepResult } from "../types.js";
13
+ export declare function runReport(outputPath?: string, promptfooUrl?: string): StepResult;
@@ -0,0 +1,56 @@
1
+ /**
2
+ * Pipeline step: Generate PR comment / report from scores.
3
+ *
4
+ * Preconditions: score-summary.json exists
5
+ * Postconditions: report markdown generated
6
+ *
7
+ * Cache key: results/latest/score-summary.json
8
+ * Note: Report is always regenerated (not cached) since it may include
9
+ * dynamic data like Promptfoo URLs. The cache infrastructure is wired up
10
+ * for consistency but reports are cheap to generate.
11
+ */
12
+ import { execSync } from "child_process";
13
+ import { dirname, resolve } from "path";
14
+ import { fileURLToPath } from "url";
15
+ import { checkScoreSummaryValid } from "../checks.js";
16
+ const __dirname = dirname(fileURLToPath(import.meta.url));
17
+ const ROOT = resolve(__dirname, "..", "..", "..");
18
+ const DEFAULT_REPORT_PATH = resolve(ROOT, "results/latest/pr-comment.md");
19
+ export function runReport(outputPath, promptfooUrl) {
20
+ const start = Date.now();
21
+ // Precondition: score summary exists
22
+ const summaryIssues = checkScoreSummaryValid(ROOT);
23
+ const summaryErrors = summaryIssues.filter((i) => i.severity === "error");
24
+ if (summaryErrors.length > 0) {
25
+ return {
26
+ durationMs: Date.now() - start,
27
+ error: `Score summary missing: ${summaryErrors.map((e) => e.message).join("; ")}`,
28
+ status: "failed",
29
+ };
30
+ }
31
+ // Always write to a file — use the caller's path or a default.
32
+ // This avoids dumping the full PR-comment markdown into the terminal.
33
+ const resolvedOutput = outputPath ?? DEFAULT_REPORT_PATH;
34
+ // Execute — reports are always regenerated (cheap, may include dynamic URLs)
35
+ try {
36
+ const outputArg = ` --output ${resolvedOutput}`;
37
+ const urlArg = promptfooUrl ? ` --promptfoo-url ${promptfooUrl}` : "";
38
+ execSync(`pnpm pr-comment${outputArg}${urlArg}`, {
39
+ cwd: ROOT,
40
+ env: process.env,
41
+ stdio: ["inherit", "ignore", "inherit"],
42
+ });
43
+ }
44
+ catch (err) {
45
+ return {
46
+ durationMs: Date.now() - start,
47
+ error: `pr-comment failed: ${err instanceof Error ? err.message : String(err)}`,
48
+ status: "failed",
49
+ };
50
+ }
51
+ return {
52
+ durationMs: Date.now() - start,
53
+ status: "success",
54
+ summary: `Report written to ${resolvedOutput}`,
55
+ };
56
+ }
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Pipeline step: Update QUALITY_SCORE.md from score summary.
3
+ *
4
+ * Preconditions: score-summary.json exists and is valid
5
+ * Postconditions: QUALITY_SCORE.md is updated with latest scores (delegated to scripts/update-quality-scores.js; its message is returned as the step summary or error)
6
+ *
7
+ * Note: Not cached — writing to a git-tracked file is intentionally
8
+ * always re-executed to ensure the file reflects the latest scores.
9
+ */
10
+ import type { StepResult } from "../types.js";
11
+ export declare function runUpdateQualityScores(): Promise<StepResult>;
@@ -0,0 +1,42 @@
1
+ /**
2
+ * Pipeline step: Update QUALITY_SCORE.md from score summary.
3
+ *
4
+ * Preconditions: score-summary.json exists and is valid
5
+ * Postconditions: QUALITY_SCORE.md is updated with latest scores
6
+ *
7
+ * Note: Not cached — writing to a git-tracked file is intentionally
8
+ * always re-executed to ensure the file reflects the latest scores.
9
+ */
10
+ import { dirname, resolve } from "path";
11
+ import { fileURLToPath } from "url";
12
+ import { checkScoreSummaryValid } from "../checks.js";
13
+ const __dirname = dirname(fileURLToPath(import.meta.url));
14
+ const ROOT = resolve(__dirname, "..", "..", "..");
/**
 * Update QUALITY_SCORE.md from the latest score summary.
 *
 * Validates the score summary, then lazily loads and runs
 * `updateQualityScores` from scripts/update-quality-scores.js. Every failure —
 * an invalid summary, a module that cannot be loaded, an exception thrown by
 * the updater, or an unsuccessful updater result — is reported as a failed
 * step result rather than a rejected promise, matching the other pipeline
 * steps.
 *
 * @returns Step result with `status` "success" or "failed", `durationMs`, and
 *          either `summary` or `error`
 */
export async function runUpdateQualityScores() {
    const start = Date.now();
    const fail = (error) => ({
        durationMs: Date.now() - start,
        error,
        status: "failed",
    });
    // Precondition: score summary exists and is valid
    const summaryErrors = checkScoreSummaryValid(ROOT).filter((issue) => issue.severity === "error");
    if (summaryErrors.length > 0) {
        return fail(`Score summary missing or invalid: ${summaryErrors.map((e) => e.message).join("; ")}`);
    }
    let result;
    try {
        // Dynamic import to avoid loading file-writing code at module parse time
        const { updateQualityScores } = await import("../../scripts/update-quality-scores.js");
        result = updateQualityScores();
    }
    catch (err) {
        return fail(`update-quality-scores failed: ${err instanceof Error ? err.message : String(err)}`);
    }
    if (!result.success) {
        return fail(result.message);
    }
    return {
        durationMs: Date.now() - start,
        status: "success",
        summary: result.message,
    };
}
@@ -0,0 +1,88 @@
1
+ /**
2
+ * pipeline/targeted-loo.ts
3
+ *
4
+ * Targeted leave-one-out (LOO) attribution for ambiguous cases.
5
+ *
6
+ * Phase 4d of the Scenario Matrix implementation.
7
+ *
8
+ * When correlation-based attribution (Phase 2c) identifies ambiguous tasks
9
+ * (2+ changed docs map to the same task), targeted LOO resolves the ambiguity
10
+ * by running per-document mini-evaluations to measure each document's
11
+ * marginal contribution.
12
+ *
13
+ * This module handles:
14
+ * - Cost estimation before execution (to enable user confirmation)
15
+ * - Result analysis after LOO runs complete
16
+ * - Integration with the existing AttributionReport
17
+ *
18
+ * The actual evaluation execution is handled by the pipeline orchestrator —
19
+ * this module is pure computation on inputs and outputs.
20
+ *
21
+ * @see docs/exec-plans/completed/scenario-matrix-implementation/phase-4-content-release-integration.md
22
+ * @see docs/design-docs/scenario-matrix/per-document-attribution.md
23
+ */
24
+ import type { AttributionReport, LOOCostEstimate, LOOResult, TaskAttribution } from "./types.js";
25
+ /**
26
+ * Analyze LOO evaluation results to compute marginal contributions.
27
+ *
28
+ * Given the full-release score and per-document revert scores (passed here as deltas: `fullReleaseDelta` and each entry's `revertedDelta`),
29
+ * calculates each document's marginal contribution as:
30
+ * marginal(doc) = fullReleaseScore - revertedScore(doc)
31
+ *
32
+ * @param taskId - The task being analyzed
33
+ * @param fullReleaseDelta - The total task delta from the full release
34
+ * @param revertResults - Per-document scores when that document is reverted
35
+ * @param noiseThreshold - Threshold for marking contributions as noise
36
+ * @param additionalCost - Actual cost incurred for the LOO evaluations
37
+ * @returns LOO result with per-document marginal contributions
38
+ */
39
+ export declare function analyzeLOOResults(taskId: string, fullReleaseDelta: number, revertResults: {
40
+ /** Score delta with this document reverted */
41
+ revertedDelta: number;
42
+ /** The document slug that was reverted */
43
+ slug: string;
44
+ }[], noiseThreshold: number, additionalCost: number): LOOResult;
45
+ /**
46
+ * Enrich an attribution report with LOO results for ambiguous tasks.
47
+ *
48
+ * Replaces the "ambiguous" classification with resolved per-document
49
+ * contributions for tasks that have LOO data.
50
+ *
51
+ * @param attribution - Original attribution report
52
+ * @param looResults - LOO results for ambiguous tasks
53
+ * @returns New attribution report with LOO data integrated
54
+ */
55
+ export declare function enrichAttributionWithLOO(attribution: AttributionReport, looResults: LOOResult[]): AttributionReport & {
56
+ looResults: LOOResult[];
57
+ };
58
+ /**
59
+ * Estimate the cost of running targeted LOO for ambiguous tasks.
60
+ *
61
+ * @param ambiguousTasks - Tasks identified as ambiguous
62
+ * @param testsPerTask - Number of tests per task (from task YAML)
63
+ * @param costPerTest - Estimated cost per test (default: $0.08)
64
+ * @returns Cost estimate with per-task breakdown
65
+ */
66
+ export declare function estimateLOOCost(ambiguousTasks: TaskAttribution[], testsPerTask: Record<string, number>, costPerTest?: number): LOOCostEstimate;
67
+ /**
68
+ * Identify ambiguous tasks that would benefit from targeted LOO.
69
+ *
70
+ * Filters the attribution report to find tasks where 2+ changed documents
71
+ * are in the canonical set and the delta is outside the noise floor.
72
+ *
73
+ * @param attribution - Attribution report from Phase 2c
74
+ * @returns Ambiguous task attributions suitable for LOO
75
+ */
76
+ export declare function findAmbiguousTasks(attribution: AttributionReport): TaskAttribution[];
77
+ /**
78
+ * Format a LOO cost estimate for console output (for user confirmation).
79
+ */
80
+ export declare function formatLOOCostEstimate(estimate: LOOCostEstimate): string;
81
+ /**
82
+ * Format LOO results for console output.
83
+ */
84
+ export declare function formatLOOResultsConsole(results: LOOResult[]): string;
85
+ /**
86
+ * Format LOO results as markdown.
87
+ */
88
+ export declare function formatLOOResultsMarkdown(results: LOOResult[]): string;