@sanity/ailf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (530)
  1. package/README.md +89 -0
  2. package/bin/ailf.js +64 -0
  3. package/canonical/grader-references/README.md +88 -0
  4. package/canonical/grader-references/groq.yaml +234 -0
  5. package/canonical/grader-references/studio-setup.yaml +275 -0
  6. package/canonical/reference-solutions/.gitkeep +1 -0
  7. package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
  8. package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
  9. package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
  10. package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
  11. package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
  12. package/canonical/reference-solutions/groq/joins-references.ts +300 -0
  13. package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
  14. package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
  15. package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
  16. package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
  17. package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
  18. package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
  19. package/config/bigquery/README.md +74 -0
  20. package/config/bigquery/views/area_scores.sql +87 -0
  21. package/config/bigquery/views/reports.sql +49 -0
  22. package/config/features.yaml +116 -0
  23. package/config/models.yaml +115 -0
  24. package/config/prompts.yaml +75 -0
  25. package/config/rubrics.yaml +62 -0
  26. package/config/schedules.yaml +43 -0
  27. package/config/sinks.yaml +54 -0
  28. package/config/sources.yaml +51 -0
  29. package/config/thresholds.yaml +49 -0
  30. package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
  31. package/dist/_vendor/ailf-core/examples/index.js +285 -0
  32. package/dist/_vendor/ailf-core/index.d.ts +17 -0
  33. package/dist/_vendor/ailf-core/index.js +17 -0
  34. package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
  35. package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
  36. package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
  37. package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
  38. package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
  39. package/dist/_vendor/ailf-core/ports/context.js +14 -0
  40. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
  41. package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
  42. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
  43. package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
  44. package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
  45. package/dist/_vendor/ailf-core/ports/index.js +7 -0
  46. package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
  47. package/dist/_vendor/ailf-core/ports/logger.js +11 -0
  48. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
  49. package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
  50. package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
  51. package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
  52. package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
  53. package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
  54. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
  55. package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
  56. package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
  57. package/dist/_vendor/ailf-core/schemas/index.js +16 -0
  58. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
  59. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
  60. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
  61. package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
  62. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
  63. package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
  64. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
  65. package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
  66. package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
  67. package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
  68. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
  69. package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
  70. package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
  71. package/dist/_vendor/ailf-core/services/index.js +12 -0
  72. package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
  73. package/dist/_vendor/ailf-core/services/scoring.js +222 -0
  74. package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
  75. package/dist/_vendor/ailf-core/types/index.js +21 -0
  76. package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
  77. package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
  78. package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
  79. package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
  80. package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
  81. package/dist/_vendor/ailf-shared/document-ref.js +1 -0
  82. package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
  83. package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
  84. package/dist/_vendor/ailf-shared/index.d.ts +16 -0
  85. package/dist/_vendor/ailf-shared/index.js +16 -0
  86. package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
  87. package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
  88. package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
  89. package/dist/_vendor/ailf-shared/score-grades.js +23 -0
  90. package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
  91. package/dist/adapters/cache/content-lake-cache.js +59 -0
  92. package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
  93. package/dist/adapters/cache/filesystem-cache.js +54 -0
  94. package/dist/adapters/cache/index.d.ts +2 -0
  95. package/dist/adapters/cache/index.js +2 -0
  96. package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
  97. package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
  98. package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
  99. package/dist/adapters/config-sources/file-config-adapter.js +96 -0
  100. package/dist/adapters/config-sources/index.d.ts +2 -0
  101. package/dist/adapters/config-sources/index.js +2 -0
  102. package/dist/adapters/doc-fetchers/index.d.ts +1 -0
  103. package/dist/adapters/doc-fetchers/index.js +1 -0
  104. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
  105. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
  106. package/dist/adapters/eval-runners/index.d.ts +1 -0
  107. package/dist/adapters/eval-runners/index.js +1 -0
  108. package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
  109. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
  110. package/dist/adapters/index.d.ts +12 -0
  111. package/dist/adapters/index.js +12 -0
  112. package/dist/adapters/loggers/console-logger.d.ts +22 -0
  113. package/dist/adapters/loggers/console-logger.js +54 -0
  114. package/dist/adapters/loggers/index.d.ts +9 -0
  115. package/dist/adapters/loggers/index.js +9 -0
  116. package/dist/adapters/loggers/json-logger.d.ts +18 -0
  117. package/dist/adapters/loggers/json-logger.js +33 -0
  118. package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
  119. package/dist/adapters/loggers/quiet-logger.js +30 -0
  120. package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
  121. package/dist/adapters/task-sources/composite-task-source.js +59 -0
  122. package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
  123. package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
  124. package/dist/adapters/task-sources/index.d.ts +7 -0
  125. package/dist/adapters/task-sources/index.js +7 -0
  126. package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
  127. package/dist/adapters/task-sources/repo-schemas.js +234 -0
  128. package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
  129. package/dist/adapters/task-sources/repo-task-source.js +104 -0
  130. package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
  131. package/dist/adapters/task-sources/repo-trigger.js +153 -0
  132. package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
  133. package/dist/adapters/task-sources/repo-validation.js +164 -0
  134. package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
  135. package/dist/adapters/task-sources/yaml-task-source.js +136 -0
  136. package/dist/agent-observer/agentic-provider.d.ts +132 -0
  137. package/dist/agent-observer/agentic-provider.js +983 -0
  138. package/dist/agent-observer/classifier.d.ts +62 -0
  139. package/dist/agent-observer/classifier.js +269 -0
  140. package/dist/agent-observer/index.d.ts +7 -0
  141. package/dist/agent-observer/index.js +4 -0
  142. package/dist/agent-observer/pricing.d.ts +35 -0
  143. package/dist/agent-observer/pricing.js +82 -0
  144. package/dist/agent-observer/provider.d.ts +77 -0
  145. package/dist/agent-observer/provider.js +151 -0
  146. package/dist/agent-observer/proxy.d.ts +91 -0
  147. package/dist/agent-observer/proxy.js +321 -0
  148. package/dist/agent-observer/test-imports.d.ts +7 -0
  149. package/dist/agent-observer/test-imports.js +185 -0
  150. package/dist/agent-observer/types.d.ts +137 -0
  151. package/dist/agent-observer/types.js +16 -0
  152. package/dist/assertions/source-isolation.d.ts +72 -0
  153. package/dist/assertions/source-isolation.js +117 -0
  154. package/dist/cli.d.ts +24 -0
  155. package/dist/cli.js +199 -0
  156. package/dist/commands/agent-report.d.ts +5 -0
  157. package/dist/commands/agent-report.js +69 -0
  158. package/dist/commands/baseline.d.ts +9 -0
  159. package/dist/commands/baseline.js +141 -0
  160. package/dist/commands/cache.d.ts +13 -0
  161. package/dist/commands/cache.js +135 -0
  162. package/dist/commands/calculate-scores.d.ts +8 -0
  163. package/dist/commands/calculate-scores.js +48 -0
  164. package/dist/commands/compare.d.ts +8 -0
  165. package/dist/commands/compare.js +120 -0
  166. package/dist/commands/completion.d.ts +18 -0
  167. package/dist/commands/completion.js +260 -0
  168. package/dist/commands/coverage-audit.d.ts +7 -0
  169. package/dist/commands/coverage-audit.js +40 -0
  170. package/dist/commands/discovery-report.d.ts +10 -0
  171. package/dist/commands/discovery-report.js +44 -0
  172. package/dist/commands/eval.d.ts +9 -0
  173. package/dist/commands/eval.js +35 -0
  174. package/dist/commands/explain-handler.d.ts +34 -0
  175. package/dist/commands/explain-handler.js +719 -0
  176. package/dist/commands/fetch-docs.d.ts +8 -0
  177. package/dist/commands/fetch-docs.js +128 -0
  178. package/dist/commands/generate-configs.d.ts +8 -0
  179. package/dist/commands/generate-configs.js +46 -0
  180. package/dist/commands/grader/index.d.ts +11 -0
  181. package/dist/commands/grader/index.js +118 -0
  182. package/dist/commands/init.d.ts +19 -0
  183. package/dist/commands/init.js +150 -0
  184. package/dist/commands/interactive.d.ts +12 -0
  185. package/dist/commands/interactive.js +238 -0
  186. package/dist/commands/lookup-doc.d.ts +15 -0
  187. package/dist/commands/lookup-doc.js +84 -0
  188. package/dist/commands/measure-retrieval.d.ts +5 -0
  189. package/dist/commands/measure-retrieval.js +65 -0
  190. package/dist/commands/pipeline-action.d.ts +71 -0
  191. package/dist/commands/pipeline-action.js +305 -0
  192. package/dist/commands/pipeline.d.ts +62 -0
  193. package/dist/commands/pipeline.js +53 -0
  194. package/dist/commands/pr-comment.d.ts +8 -0
  195. package/dist/commands/pr-comment.js +47 -0
  196. package/dist/commands/publish.d.ts +26 -0
  197. package/dist/commands/publish.js +253 -0
  198. package/dist/commands/readiness-report.d.ts +10 -0
  199. package/dist/commands/readiness-report.js +104 -0
  200. package/dist/commands/shared/options.d.ts +29 -0
  201. package/dist/commands/shared/options.js +57 -0
  202. package/dist/commands/update-quality-scores.d.ts +5 -0
  203. package/dist/commands/update-quality-scores.js +20 -0
  204. package/dist/commands/validate-tasks.d.ts +16 -0
  205. package/dist/commands/validate-tasks.js +93 -0
  206. package/dist/commands/validate.d.ts +9 -0
  207. package/dist/commands/validate.js +73 -0
  208. package/dist/commands/webhook-server.d.ts +5 -0
  209. package/dist/commands/webhook-server.js +30 -0
  210. package/dist/commands/weekly-digest.d.ts +10 -0
  211. package/dist/commands/weekly-digest.js +104 -0
  212. package/dist/composition-root.d.ts +26 -0
  213. package/dist/composition-root.js +107 -0
  214. package/dist/interpolate.d.ts +26 -0
  215. package/dist/interpolate.js +70 -0
  216. package/dist/job-store.d.ts +104 -0
  217. package/dist/job-store.js +188 -0
  218. package/dist/lib/agent-behavior-report.d.ts +8 -0
  219. package/dist/lib/agent-behavior-report.js +185 -0
  220. package/dist/lib/baseline.d.ts +19 -0
  221. package/dist/lib/baseline.js +153 -0
  222. package/dist/lib/calculate-scores.d.ts +23 -0
  223. package/dist/lib/calculate-scores.js +42 -0
  224. package/dist/lib/compare.d.ts +18 -0
  225. package/dist/lib/compare.js +170 -0
  226. package/dist/lib/coverage-audit.d.ts +4 -0
  227. package/dist/lib/coverage-audit.js +42 -0
  228. package/dist/lib/discovery-report.d.ts +13 -0
  229. package/dist/lib/discovery-report.js +57 -0
  230. package/dist/lib/fetch-docs.d.ts +30 -0
  231. package/dist/lib/fetch-docs.js +171 -0
  232. package/dist/lib/generate-configs.d.ts +25 -0
  233. package/dist/lib/generate-configs.js +42 -0
  234. package/dist/lib/grader-api.d.ts +21 -0
  235. package/dist/lib/grader-api.js +34 -0
  236. package/dist/lib/grader-compare.d.ts +19 -0
  237. package/dist/lib/grader-compare.js +91 -0
  238. package/dist/lib/grader-consistency.d.ts +27 -0
  239. package/dist/lib/grader-consistency.js +79 -0
  240. package/dist/lib/grader-sensitivity.d.ts +19 -0
  241. package/dist/lib/grader-sensitivity.js +75 -0
  242. package/dist/lib/grader-validate.d.ts +19 -0
  243. package/dist/lib/grader-validate.js +78 -0
  244. package/dist/lib/measure-retrieval.d.ts +14 -0
  245. package/dist/lib/measure-retrieval.js +71 -0
  246. package/dist/lib/pr-comment.d.ts +16 -0
  247. package/dist/lib/pr-comment.js +28 -0
  248. package/dist/lib/readiness-report.d.ts +13 -0
  249. package/dist/lib/readiness-report.js +108 -0
  250. package/dist/lib/webhook-server.d.ts +11 -0
  251. package/dist/lib/webhook-server.js +24 -0
  252. package/dist/lib/weekly-digest.d.ts +24 -0
  253. package/dist/lib/weekly-digest.js +148 -0
  254. package/dist/orchestration/build-app-context.d.ts +27 -0
  255. package/dist/orchestration/build-app-context.js +81 -0
  256. package/dist/orchestration/build-step-sequence.d.ts +15 -0
  257. package/dist/orchestration/build-step-sequence.js +84 -0
  258. package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
  259. package/dist/orchestration/config-to-source-overrides.js +28 -0
  260. package/dist/orchestration/env-bridge.d.ts +21 -0
  261. package/dist/orchestration/env-bridge.js +66 -0
  262. package/dist/orchestration/index.d.ts +11 -0
  263. package/dist/orchestration/index.js +11 -0
  264. package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
  265. package/dist/orchestration/pipeline-orchestrator.js +153 -0
  266. package/dist/orchestration/step-runner.d.ts +20 -0
  267. package/dist/orchestration/step-runner.js +88 -0
  268. package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
  269. package/dist/orchestration/steps/calculate-scores-step.js +95 -0
  270. package/dist/orchestration/steps/callback-step.d.ts +24 -0
  271. package/dist/orchestration/steps/callback-step.js +76 -0
  272. package/dist/orchestration/steps/compare-step.d.ts +14 -0
  273. package/dist/orchestration/steps/compare-step.js +92 -0
  274. package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
  275. package/dist/orchestration/steps/discovery-report-step.js +55 -0
  276. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  277. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  278. package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
  279. package/dist/orchestration/steps/fetch-docs-step.js +135 -0
  280. package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
  281. package/dist/orchestration/steps/gap-analysis-step.js +136 -0
  282. package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
  283. package/dist/orchestration/steps/generate-configs-step.js +85 -0
  284. package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
  285. package/dist/orchestration/steps/grader-consistency-step.js +64 -0
  286. package/dist/orchestration/steps/index.d.ts +19 -0
  287. package/dist/orchestration/steps/index.js +19 -0
  288. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
  289. package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
  290. package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
  291. package/dist/orchestration/steps/publish-report-step.js +216 -0
  292. package/dist/orchestration/steps/readiness-step.d.ts +13 -0
  293. package/dist/orchestration/steps/readiness-step.js +91 -0
  294. package/dist/orchestration/steps/report-step.d.ts +12 -0
  295. package/dist/orchestration/steps/report-step.js +49 -0
  296. package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
  297. package/dist/orchestration/steps/run-eval-step.js +195 -0
  298. package/dist/orchestration/steps/validate-step.d.ts +12 -0
  299. package/dist/orchestration/steps/validate-step.js +41 -0
  300. package/dist/pipeline/agent-behavior-report.d.ts +53 -0
  301. package/dist/pipeline/agent-behavior-report.js +132 -0
  302. package/dist/pipeline/attribution.d.ts +47 -0
  303. package/dist/pipeline/attribution.js +226 -0
  304. package/dist/pipeline/baseline.d.ts +37 -0
  305. package/dist/pipeline/baseline.js +141 -0
  306. package/dist/pipeline/cache.d.ts +101 -0
  307. package/dist/pipeline/cache.js +283 -0
  308. package/dist/pipeline/calculate-scores.d.ts +102 -0
  309. package/dist/pipeline/calculate-scores.js +1128 -0
  310. package/dist/pipeline/callback-delivery.d.ts +50 -0
  311. package/dist/pipeline/callback-delivery.js +89 -0
  312. package/dist/pipeline/checks.d.ts +39 -0
  313. package/dist/pipeline/checks.js +280 -0
  314. package/dist/pipeline/classify-url.d.ts +61 -0
  315. package/dist/pipeline/classify-url.js +93 -0
  316. package/dist/pipeline/compare.d.ts +31 -0
  317. package/dist/pipeline/compare.js +208 -0
  318. package/dist/pipeline/coverage-audit.d.ts +39 -0
  319. package/dist/pipeline/coverage-audit.js +165 -0
  320. package/dist/pipeline/degradations.d.ts +85 -0
  321. package/dist/pipeline/degradations.js +242 -0
  322. package/dist/pipeline/discovery-report.d.ts +55 -0
  323. package/dist/pipeline/discovery-report.js +178 -0
  324. package/dist/pipeline/eval-constants.d.ts +68 -0
  325. package/dist/pipeline/eval-constants.js +111 -0
  326. package/dist/pipeline/eval-fingerprint.d.ts +66 -0
  327. package/dist/pipeline/eval-fingerprint.js +175 -0
  328. package/dist/pipeline/expand-tasks.d.ts +220 -0
  329. package/dist/pipeline/expand-tasks.js +421 -0
  330. package/dist/pipeline/failure-modes.d.ts +46 -0
  331. package/dist/pipeline/failure-modes.js +348 -0
  332. package/dist/pipeline/fetch-url-content.d.ts +44 -0
  333. package/dist/pipeline/fetch-url-content.js +93 -0
  334. package/dist/pipeline/gap-analysis.d.ts +48 -0
  335. package/dist/pipeline/gap-analysis.js +231 -0
  336. package/dist/pipeline/generate-configs.d.ts +72 -0
  337. package/dist/pipeline/generate-configs.js +395 -0
  338. package/dist/pipeline/grader-api.d.ts +49 -0
  339. package/dist/pipeline/grader-api.js +200 -0
  340. package/dist/pipeline/grader-compare-runner.d.ts +44 -0
  341. package/dist/pipeline/grader-compare-runner.js +301 -0
  342. package/dist/pipeline/grader-comparison.d.ts +111 -0
  343. package/dist/pipeline/grader-comparison.js +161 -0
  344. package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
  345. package/dist/pipeline/grader-consistency-runner.js +270 -0
  346. package/dist/pipeline/grader-consistency.d.ts +103 -0
  347. package/dist/pipeline/grader-consistency.js +146 -0
  348. package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
  349. package/dist/pipeline/grader-sensitivity-runner.js +282 -0
  350. package/dist/pipeline/grader-sensitivity.d.ts +94 -0
  351. package/dist/pipeline/grader-sensitivity.js +144 -0
  352. package/dist/pipeline/grader-validate-runner.d.ts +38 -0
  353. package/dist/pipeline/grader-validate-runner.js +229 -0
  354. package/dist/pipeline/grader-validation.d.ts +107 -0
  355. package/dist/pipeline/grader-validation.js +169 -0
  356. package/dist/pipeline/map-request-to-config.d.ts +19 -0
  357. package/dist/pipeline/map-request-to-config.js +80 -0
  358. package/dist/pipeline/measure-retrieval.d.ts +59 -0
  359. package/dist/pipeline/measure-retrieval.js +111 -0
  360. package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
  361. package/dist/pipeline/mirror-repo-tasks.js +350 -0
  362. package/dist/pipeline/plan-format.d.ts +33 -0
  363. package/dist/pipeline/plan-format.js +202 -0
  364. package/dist/pipeline/plan.d.ts +169 -0
  365. package/dist/pipeline/plan.js +708 -0
  366. package/dist/pipeline/pr-comment.d.ts +19 -0
  367. package/dist/pipeline/pr-comment.js +502 -0
  368. package/dist/pipeline/probe.d.ts +52 -0
  369. package/dist/pipeline/probe.js +390 -0
  370. package/dist/pipeline/provenance.d.ts +47 -0
  371. package/dist/pipeline/provenance.js +146 -0
  372. package/dist/pipeline/readiness-report.d.ts +87 -0
  373. package/dist/pipeline/readiness-report.js +205 -0
  374. package/dist/pipeline/release-classification.d.ts +54 -0
  375. package/dist/pipeline/release-classification.js +238 -0
  376. package/dist/pipeline/release-report.d.ts +37 -0
  377. package/dist/pipeline/release-report.js +222 -0
  378. package/dist/pipeline/repo-eval-comment.d.ts +37 -0
  379. package/dist/pipeline/repo-eval-comment.js +165 -0
  380. package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
  381. package/dist/pipeline/repo-threshold-evaluator.js +162 -0
  382. package/dist/pipeline/resolve-mappings.d.ts +35 -0
  383. package/dist/pipeline/resolve-mappings.js +72 -0
  384. package/dist/pipeline/retrieval-metrics.d.ts +39 -0
  385. package/dist/pipeline/retrieval-metrics.js +136 -0
  386. package/dist/pipeline/reverse-mapping.d.ts +67 -0
  387. package/dist/pipeline/reverse-mapping.js +88 -0
  388. package/dist/pipeline/schemas.d.ts +9 -0
  389. package/dist/pipeline/schemas.js +9 -0
  390. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  391. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  392. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  393. package/dist/pipeline/steps/compare-step.js +90 -0
  394. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  395. package/dist/pipeline/steps/eval-step.js +347 -0
  396. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  397. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  398. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  399. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  400. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  401. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  402. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  403. package/dist/pipeline/steps/publish-report-step.js +243 -0
  404. package/dist/pipeline/steps/report-step.d.ts +13 -0
  405. package/dist/pipeline/steps/report-step.js +56 -0
  406. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  407. package/dist/pipeline/steps/update-scores-step.js +42 -0
  408. package/dist/pipeline/targeted-loo.d.ts +88 -0
  409. package/dist/pipeline/targeted-loo.js +203 -0
  410. package/dist/pipeline/thresholds.d.ts +27 -0
  411. package/dist/pipeline/thresholds.js +245 -0
  412. package/dist/pipeline/types.d.ts +10 -0
  413. package/dist/pipeline/types.js +10 -0
  414. package/dist/pipeline/validate.d.ts +67 -0
  415. package/dist/pipeline/validate.js +406 -0
  416. package/dist/pipeline/webhook-server.d.ts +37 -0
  417. package/dist/pipeline/webhook-server.js +133 -0
  418. package/dist/report-store.d.ts +84 -0
  419. package/dist/report-store.js +208 -0
  420. package/dist/sanity/client.d.ts +38 -0
  421. package/dist/sanity/client.js +86 -0
  422. package/dist/sanity/portable-text.d.ts +11 -0
  423. package/dist/sanity/portable-text.js +211 -0
  424. package/dist/sanity/queries.d.ts +133 -0
  425. package/dist/sanity/queries.js +300 -0
  426. package/dist/schedules/digest.d.ts +116 -0
  427. package/dist/schedules/digest.js +156 -0
  428. package/dist/schedules/index.d.ts +12 -0
  429. package/dist/schedules/index.js +10 -0
  430. package/dist/schedules/loader.d.ts +31 -0
  431. package/dist/schedules/loader.js +73 -0
  432. package/dist/schedules/schema.d.ts +9 -0
  433. package/dist/schedules/schema.js +9 -0
  434. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  435. package/dist/scripts/agent-behavior-report.js +315 -0
  436. package/dist/scripts/baseline.d.ts +43 -0
  437. package/dist/scripts/baseline.js +267 -0
  438. package/dist/scripts/calculate-scores.d.ts +166 -0
  439. package/dist/scripts/calculate-scores.js +1296 -0
  440. package/dist/scripts/compare.d.ts +22 -0
  441. package/dist/scripts/compare.js +334 -0
  442. package/dist/scripts/coverage-audit.d.ts +44 -0
  443. package/dist/scripts/coverage-audit.js +209 -0
  444. package/dist/scripts/debug-eval.d.ts +19 -0
  445. package/dist/scripts/debug-eval.js +73 -0
  446. package/dist/scripts/discovery-report.d.ts +58 -0
  447. package/dist/scripts/discovery-report.js +250 -0
  448. package/dist/scripts/fetch-docs.d.ts +35 -0
  449. package/dist/scripts/fetch-docs.js +472 -0
  450. package/dist/scripts/generate-configs.d.ts +66 -0
  451. package/dist/scripts/generate-configs.js +459 -0
  452. package/dist/scripts/grader-api.d.ts +27 -0
  453. package/dist/scripts/grader-api.js +206 -0
  454. package/dist/scripts/grader-compare.d.ts +22 -0
  455. package/dist/scripts/grader-compare.js +368 -0
  456. package/dist/scripts/grader-consistency.d.ts +20 -0
  457. package/dist/scripts/grader-consistency.js +313 -0
  458. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  459. package/dist/scripts/grader-sensitivity.js +354 -0
  460. package/dist/scripts/grader-validate.d.ts +19 -0
  461. package/dist/scripts/grader-validate.js +267 -0
  462. package/dist/scripts/measure-retrieval.d.ts +10 -0
  463. package/dist/scripts/measure-retrieval.js +145 -0
  464. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
  465. package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
  466. package/dist/scripts/pipeline.d.ts +76 -0
  467. package/dist/scripts/pipeline.js +1031 -0
  468. package/dist/scripts/pr-comment.d.ts +10 -0
  469. package/dist/scripts/pr-comment.js +510 -0
  470. package/dist/scripts/readiness-report.d.ts +88 -0
  471. package/dist/scripts/readiness-report.js +342 -0
  472. package/dist/scripts/update-quality-scores.d.ts +15 -0
  473. package/dist/scripts/update-quality-scores.js +184 -0
  474. package/dist/scripts/validate-task-sources.d.ts +21 -0
  475. package/dist/scripts/validate-task-sources.js +210 -0
  476. package/dist/scripts/validate.d.ts +13 -0
  477. package/dist/scripts/validate.js +79 -0
  478. package/dist/scripts/webhook-server.d.ts +26 -0
  479. package/dist/scripts/webhook-server.js +147 -0
  480. package/dist/scripts/weekly-digest.d.ts +24 -0
  481. package/dist/scripts/weekly-digest.js +144 -0
  482. package/dist/sinks/bigquery/index.d.ts +131 -0
  483. package/dist/sinks/bigquery/index.js +222 -0
  484. package/dist/sinks/format-slack.d.ts +64 -0
  485. package/dist/sinks/format-slack.js +306 -0
  486. package/dist/sinks/index.d.ts +23 -0
  487. package/dist/sinks/index.js +18 -0
  488. package/dist/sinks/loader.d.ts +18 -0
  489. package/dist/sinks/loader.js +82 -0
  490. package/dist/sinks/retry.d.ts +24 -0
  491. package/dist/sinks/retry.js +52 -0
  492. package/dist/sinks/schema.d.ts +9 -0
  493. package/dist/sinks/schema.js +9 -0
  494. package/dist/sinks/slack/format.d.ts +65 -0
  495. package/dist/sinks/slack/format.js +327 -0
  496. package/dist/sinks/slack/index.d.ts +27 -0
  497. package/dist/sinks/slack/index.js +78 -0
  498. package/dist/sinks/slack-sink.d.ts +27 -0
  499. package/dist/sinks/slack-sink.js +78 -0
  500. package/dist/sinks/types.d.ts +59 -0
  501. package/dist/sinks/types.js +44 -0
  502. package/dist/sinks/webhook/index.d.ts +19 -0
  503. package/dist/sinks/webhook/index.js +50 -0
  504. package/dist/sinks/webhook-sink.d.ts +19 -0
  505. package/dist/sinks/webhook-sink.js +50 -0
  506. package/dist/sources.d.ts +104 -0
  507. package/dist/sources.js +292 -0
  508. package/dist/webhook/budget.d.ts +42 -0
  509. package/dist/webhook/budget.js +60 -0
  510. package/dist/webhook/debounce.d.ts +67 -0
  511. package/dist/webhook/debounce.js +76 -0
  512. package/dist/webhook/dispatch.d.ts +45 -0
  513. package/dist/webhook/dispatch.js +84 -0
  514. package/dist/webhook/eval-request-handler.d.ts +87 -0
  515. package/dist/webhook/eval-request-handler.js +181 -0
  516. package/dist/webhook/handler.d.ts +88 -0
  517. package/dist/webhook/handler.js +203 -0
  518. package/dist/webhook/index.d.ts +17 -0
  519. package/dist/webhook/index.js +12 -0
  520. package/dist/webhook/types.d.ts +109 -0
  521. package/dist/webhook/types.js +10 -0
  522. package/package.json +72 -0
  523. package/tasks/.expanded.agentic.yaml +51 -0
  524. package/tasks/.expanded.yaml +66 -0
  525. package/tasks/frameworks.yaml +98 -0
  526. package/tasks/functions.yaml +51 -0
  527. package/tasks/groq.yaml +216 -0
  528. package/tasks/nextjs-live.yaml +62 -0
  529. package/tasks/studio-setup.yaml +111 -0
  530. package/tasks/visual-editing.yaml +120 -0
@@ -0,0 +1,203 @@
1
+ /**
2
+ * pipeline/targeted-loo.ts
3
+ *
4
+ * Targeted leave-one-out (LOO) attribution for ambiguous cases.
5
+ *
6
+ * Phase 4d of the Scenario Matrix implementation.
7
+ *
8
+ * When correlation-based attribution (Phase 2c) identifies ambiguous tasks
9
+ * (2+ changed docs map to the same task), targeted LOO resolves the ambiguity
10
+ * by running per-document mini-evaluations to measure each document's
11
+ * marginal contribution.
12
+ *
13
+ * This module handles:
14
+ * - Cost estimation before execution (to enable user confirmation)
15
+ * - Result analysis after LOO runs complete
16
+ * - Integration with the existing AttributionReport
17
+ *
18
+ * The actual evaluation execution is handled by the pipeline orchestrator —
19
+ * this module is pure computation on inputs and outputs.
20
+ *
21
+ * @see docs/exec-plans/completed/scenario-matrix-implementation/phase-4-content-release-integration.md
22
+ * @see docs/design-docs/scenario-matrix/per-document-attribution.md
23
+ */
24
+ // ---------------------------------------------------------------------------
25
+ // Constants
26
+ // ---------------------------------------------------------------------------
27
/**
 * Fallback per-test cost estimate (provider call and grader call together).
 * Used whenever a caller does not supply its own rate.
 */
const DEFAULT_COST_PER_TEST = 0.08;
/**
 * Relative tolerance (15%) allowed between the sum of the marginal
 * contributions and the full-release delta before they are reported as
 * not matching.
 */
const SUM_TOLERANCE = 0.15;
31
+ // ---------------------------------------------------------------------------
32
+ // Public API
33
+ // ---------------------------------------------------------------------------
34
/**
 * Convert leave-one-out evaluation results into per-document marginal
 * contributions for a single task.
 *
 * For each reverted document:
 *   marginal(doc) = fullReleaseDelta - revertedDelta(doc)
 *
 * The reported contribution is rounded to one decimal place, while the
 * noise-floor check is made against the unrounded value.
 *
 * @param taskId - The task being analyzed
 * @param fullReleaseDelta - The total task delta from the full release
 * @param revertResults - One `{ slug, revertedDelta }` entry per reverted document
 * @param noiseThreshold - Contributions with |marginal| at or below this are flagged as noise
 * @param additionalCost - Actual cost incurred for the LOO evaluations (passed through)
 * @returns `{ additionalCost, contributions, sumMatchesTotal, taskId }`, with
 *   contributions ordered by descending absolute marginal contribution
 */
export function analyzeLOOResults(taskId, fullReleaseDelta, revertResults, noiseThreshold, additionalCost) {
    const contributions = revertResults.map((entry) => {
        const { revertedDelta, slug } = entry;
        const rawMarginal = fullReleaseDelta - revertedDelta;
        return {
            marginalContribution: Math.round(rawMarginal * 10) / 10,
            slug,
            withinNoiseFloor: Math.abs(rawMarginal) <= noiseThreshold,
        };
    });
    // Add up the rounded contributions in their original order, before sorting.
    let contributionSum = 0;
    for (const contribution of contributions) {
        contributionSum += contribution.marginalContribution;
    }
    // A zero total delta has no meaningful relative error, so it always counts as a match.
    let sumMatchesTotal = true;
    if (fullReleaseDelta !== 0) {
        const relativeGap = Math.abs(contributionSum - fullReleaseDelta) / Math.abs(fullReleaseDelta);
        sumMatchesTotal = relativeGap <= SUM_TOLERANCE;
    }
    // Largest absolute contribution first.
    contributions.sort((left, right) => Math.abs(right.marginalContribution) - Math.abs(left.marginalContribution));
    return {
        additionalCost,
        contributions,
        sumMatchesTotal,
        taskId,
    };
}
70
/**
 * Attach targeted LOO results to an attribution report.
 *
 * Returns a shallow copy of `attribution` with a `looResults` property set
 * to the given array. The input report is not mutated, and its existing
 * fields (including each task's classification) are carried over unchanged;
 * the resolved per-document contributions are available via `looResults`.
 *
 * @param attribution - Original attribution report
 * @param looResults - LOO results for ambiguous tasks
 * @returns New attribution report object with `looResults` attached
 */
export function enrichAttributionWithLOO(attribution, looResults) {
    // The previous implementation also built a taskId -> result Map that was
    // never read; it has been dropped as dead code.
    return {
        ...attribution,
        looResults,
    };
}
90
/**
 * Estimate what targeted LOO would cost for a set of ambiguous tasks.
 *
 * Each task is priced as `documents × tests × costPerTest`, rounded to two
 * decimals. The total is the sum of those rounded per-task figures, rounded
 * again to two decimals.
 *
 * @param ambiguousTasks - Tasks identified as ambiguous (each with `taskId` and `attributedDocs`)
 * @param testsPerTask - Map of taskId to number of tests; tasks missing from the map are assumed to have 6
 * @param costPerTest - Estimated cost per test (default: DEFAULT_COST_PER_TEST)
 * @returns `{ perTask, totalEstimatedCost }`
 */
export function estimateLOOCost(ambiguousTasks, testsPerTask, costPerTest = DEFAULT_COST_PER_TEST) {
    const roundToCents = (amount) => Math.round(amount * 100) / 100;
    const perTask = [];
    let runningTotal = 0;
    for (const task of ambiguousTasks) {
        const numDocuments = task.attributedDocs.length;
        // Fall back to 6 tests when the task has no recorded test count.
        const numTests = testsPerTask[task.taskId] ?? 6;
        const estimatedCost = roundToCents(numDocuments * numTests * costPerTest);
        perTask.push({
            estimatedCost,
            numDocuments,
            numTests,
            taskId: task.taskId,
        });
        runningTotal += estimatedCost;
    }
    return {
        perTask,
        totalEstimatedCost: roundToCents(runningTotal),
    };
}
117
+ /**
118
+ * Identify ambiguous tasks that would benefit from targeted LOO.
119
+ *
120
+ * Filters the attribution report to find tasks where 2+ changed documents
121
+ * are in the canonical set and the delta is outside the noise floor.
122
+ *
123
+ * @param attribution - Attribution report from Phase 2c
124
+ * @returns Ambiguous task attributions suitable for LOO
125
+ */
126
+ export function findAmbiguousTasks(attribution) {
127
+ return attribution.attributions.filter((a) => a.classification === "ambiguous" && !a.withinNoiseFloor);
128
+ }
129
+ // ---------------------------------------------------------------------------
130
+ // Formatting
131
+ // ---------------------------------------------------------------------------
132
/**
 * Format a LOO cost estimate for console output (for user confirmation).
 *
 * Each task row shows the breakdown
 * `documents × ~tests × $rate/test = ~$cost`.
 *
 * @param estimate - Cost estimate as produced by `estimateLOOCost`
 *   (`perTask` rows plus `totalEstimatedCost`)
 * @param costPerTest - Per-test rate to display in each row. Pass the same
 *   rate that was given to `estimateLOOCost` so the printed equation agrees
 *   with the printed cost. Defaults to DEFAULT_COST_PER_TEST, which is also
 *   that function's default, so existing one-argument callers are unaffected.
 * @returns Multi-line string ending with a newline
 */
export function formatLOOCostEstimate(estimate, costPerTest = DEFAULT_COST_PER_TEST) {
    const lines = [];
    lines.push("💰 TARGETED LOO COST ESTIMATE");
    lines.push("");
    lines.push(` Targeted LOO for ${estimate.perTask.length} ambiguous task(s):`);
    lines.push("");
    for (const task of estimate.perTask) {
        // Previously the rate shown here was always the module default, even
        // when the estimate had been computed with a different rate.
        lines.push(` ${task.taskId}: ${task.numDocuments} documents × ~${task.numTests} tests × $${costPerTest}/test = ~$${task.estimatedCost.toFixed(2)}`);
    }
    lines.push("");
    lines.push(` Total additional cost: ~$${estimate.totalEstimatedCost.toFixed(2)}`);
    lines.push("");
    return lines.join("\n");
}
149
/**
 * Render LOO results as plain text for the console.
 *
 * For every task this lists each document's signed marginal contribution
 * (one decimal place), marks entries inside the noise floor, warns when the
 * contributions do not add up to the total delta, and prints the additional
 * cost incurred.
 *
 * @param results - LOO results, one entry per task
 * @returns Multi-line string ending with a newline
 */
export function formatLOOResultsConsole(results) {
    const output = ["🔬 TARGETED LOO RESULTS", ""];
    for (const result of results) {
        output.push(` ${result.taskId}:`);
        for (const entry of result.contributions) {
            let row = ` ${entry.slug}: `;
            // Negative numbers already carry their own sign.
            if (entry.marginalContribution >= 0) {
                row += "+";
            }
            row += entry.marginalContribution.toFixed(1);
            if (entry.withinNoiseFloor) {
                row += " (within noise)";
            }
            output.push(row);
        }
        if (!result.sumMatchesTotal) {
            output.push(" ⚠️ Contributions don't sum to total delta (interaction effects likely)");
        }
        output.push(` Additional cost: $${result.additionalCost.toFixed(2)}`, "");
    }
    return output.join("\n");
}
171
/**
 * Format LOO results as markdown.
 *
 * Each task gets a bold heading followed by one bullet per document showing
 * its signed marginal contribution and its share of the task's total
 * absolute contribution as a percentage. When every contribution for a task
 * is zero, each share is reported as 0% rather than dividing by zero.
 *
 * @param results - LOO results, one entry per task
 * @returns Markdown string ending with a newline
 */
export function formatLOOResultsMarkdown(results) {
    const lines = [];
    lines.push("#### 🔬 Targeted LOO Attribution");
    lines.push("");
    if (results.length === 0) {
        lines.push("No ambiguous tasks required LOO resolution.");
        lines.push("");
        return lines.join("\n");
    }
    for (const result of results) {
        lines.push(`**${result.taskId}:**`);
        lines.push("");
        // Computed once per task: the denominator for every row's percentage.
        const totalMagnitude = result.contributions.reduce((sum, c) => sum + Math.abs(c.marginalContribution), 0);
        for (const c of result.contributions) {
            const sign = c.marginalContribution >= 0 ? "+" : "";
            // A zero total would otherwise yield 0/0 = NaN and print "NaN%".
            const pct = totalMagnitude > 0
                ? Math.round((Math.abs(c.marginalContribution) / totalMagnitude) * 100)
                : 0;
            const noise = c.withinNoiseFloor ? " ⚠️" : "";
            lines.push(`- \`${c.slug}\`: ${sign}${c.marginalContribution.toFixed(1)} (${pct}%)${noise}`);
        }
        if (!result.sumMatchesTotal) {
            lines.push("> ⚠️ Marginal contributions don't sum to total delta due to interaction effects.");
        }
        lines.push(`- Additional cost: $${result.additionalCost.toFixed(2)}`);
        lines.push("");
    }
    return lines.join("\n");
}
@@ -0,0 +1,27 @@
1
+ /**
2
+ * pipeline/thresholds.ts
3
+ *
4
+ * Threshold evaluation engine — compares a ScoreSummary against configurable
5
+ * quality thresholds and produces a set of typed violations.
6
+ *
7
+ * Pure functions only: no I/O, no side effects. The caller (pipeline.ts,
8
+ * publish-report-step.ts) is responsible for loading the threshold config
9
+ * and score summary.
10
+ *
11
+ * @see docs/exec-plans/completed/scenario-matrix-implementation/phase-5-readiness-thresholds.md
12
+ */
13
+ import type { ThresholdConfig } from "./schemas.js";
14
+ import type { ComparisonReport, ScoreSummary, ThresholdEvaluation, ThresholdViolation } from "./types.js";
15
+ /**
16
+ * Evaluate regression thresholds against a comparison report.
17
+ *
18
+ * Returns violations for regressions that exceed configured thresholds.
19
+ */
20
+ export declare function evaluateRegressionThresholds(comparison: ComparisonReport, config: ThresholdConfig): ThresholdViolation[];
21
+ /**
22
+ * Evaluate a score summary against threshold configuration.
23
+ *
24
+ * Returns all violations sorted by severity (critical first), then by
25
+ * the magnitude of the threshold breach (largest gap first).
26
+ */
27
+ export declare function evaluateThresholds(scores: ScoreSummary, config: ThresholdConfig): ThresholdEvaluation;
@@ -0,0 +1,245 @@
1
+ /**
2
+ * pipeline/thresholds.ts
3
+ *
4
+ * Threshold evaluation engine — compares a ScoreSummary against configurable
5
+ * quality thresholds and produces a set of typed violations.
6
+ *
7
+ * Pure functions only: no I/O, no side effects. The caller (pipeline.ts,
8
+ * publish-report-step.ts) is responsible for loading the threshold config
9
+ * and score summary.
10
+ *
11
+ * @see docs/exec-plans/completed/scenario-matrix-implementation/phase-5-readiness-thresholds.md
12
+ */
13
+ // ---------------------------------------------------------------------------
14
+ // Severity priority for sorting (higher = more severe)
15
+ // ---------------------------------------------------------------------------
16
/** Numeric rank for each violation severity; a larger number is more severe. */
const SEVERITY_RANK = {
    info: 1,
    warning: 2,
    critical: 3,
};
21
/**
 * Check a comparison report for regressions beyond the configured limits.
 *
 * Three levels are inspected: the overall delta, each area's delta, and each
 * dimension delta inside every area. A violation is recorded whenever a
 * delta is strictly below its configured limit. Overall and per-area
 * violations take their severity from `classifyRegressionSeverity`;
 * per-dimension violations are always "warning".
 *
 * @param comparison - Comparison report providing `deltas.overall` and `areas`
 * @param config - Threshold configuration; its `regression` section supplies
 *   the `composite`, `per-area` and `per-dimension` limits
 * @returns Violations ordered by severity (most severe first), then by
 *   `actual` ascending (largest drop first). Empty when `config.regression`
 *   is not set.
 */
export function evaluateRegressionThresholds(comparison, config) {
    const limits = config.regression;
    if (!limits) {
        return [];
    }
    const found = [];
    // Overall composite regression
    const overallDelta = comparison.deltas.overall;
    if (overallDelta < limits.composite) {
        found.push({
            actual: overallDelta,
            area: "overall",
            description: `Overall score dropped by ${fmt(Math.abs(overallDelta))} (threshold: ${limits.composite})`,
            metric: "regression-composite",
            severity: classifyRegressionSeverity(overallDelta, config),
            threshold: limits.composite,
        });
    }
    for (const areaEntry of comparison.areas) {
        const areaName = areaEntry.area;
        // Area-level regression
        if (areaEntry.delta < limits["per-area"]) {
            found.push({
                actual: areaEntry.delta,
                area: areaName,
                description: `${areaName} dropped by ${fmt(Math.abs(areaEntry.delta))} (threshold: ${limits["per-area"]})`,
                metric: "regression-area",
                severity: classifyRegressionSeverity(areaEntry.delta, config),
                threshold: limits["per-area"],
            });
        }
        // Dimension-level regressions within this area
        for (const [dimName, dimInfo] of Object.entries(areaEntry.dimensions)) {
            if (!(dimInfo.delta < limits["per-dimension"])) {
                continue;
            }
            found.push({
                actual: dimInfo.delta,
                area: areaName,
                description: `${areaName} ${dimName} dropped by ${fmt(Math.abs(dimInfo.delta))} (threshold: ${limits["per-dimension"]})`,
                metric: `regression-${dimName}`,
                severity: "warning",
                threshold: limits["per-dimension"],
            });
        }
    }
    const bySeverityThenDrop = (a, b) => {
        const rankGap = SEVERITY_RANK[b.severity] - SEVERITY_RANK[a.severity];
        return rankGap !== 0 ? rankGap : a.actual - b.actual;
    };
    found.sort(bySeverityThenDrop);
    return found;
}
78
+ // ---------------------------------------------------------------------------
79
+ // Per-area evaluation
80
+ // ---------------------------------------------------------------------------
81
/**
 * Compare a score summary with the configured quality thresholds.
 *
 * The overall average score is checked against `config.defaults.composite`.
 * Each area score is then checked by `evaluateArea` against the defaults
 * merged with any override found under `config.areas` for that feature.
 *
 * @param scores - Score summary (`overall.avgScore` plus per-area `scores`)
 * @param config - Threshold configuration
 * @returns `{ maxSeverity, pass, violations }`. Violations are ordered most
 *   severe first, then by largest shortfall (`actual - threshold`, most
 *   negative first). `maxSeverity` is the first violation's severity, or
 *   "none" when there are no violations; `pass` is true only when there
 *   are none.
 */
export function evaluateThresholds(scores, config) {
    const collected = [];
    // Overall composite score is checked against the defaults only.
    const overall = scores.overall.avgScore;
    if (overall < config.defaults.composite) {
        collected.push({
            actual: overall,
            area: "overall",
            description: `Overall composite score ${fmt(overall)} is below threshold ${config.defaults.composite}`,
            metric: "composite",
            severity: classifySeverity(overall, config),
            threshold: config.defaults.composite,
        });
    }
    // Per-area checks, with area overrides layered over the defaults.
    for (const areaScore of scores.scores) {
        const effective = mergeDefaults(config.defaults, config.areas?.[areaScore.feature]);
        evaluateArea(areaScore, effective, config, collected);
    }
    const shortfall = (violation) => violation.actual - violation.threshold;
    collected.sort((a, b) => {
        const rankGap = SEVERITY_RANK[b.severity] - SEVERITY_RANK[a.severity];
        if (rankGap !== 0) {
            return rankGap;
        }
        // A more negative shortfall means a bigger breach and sorts earlier.
        return shortfall(a) - shortfall(b);
    });
    return {
        maxSeverity: collected.length === 0 ? "none" : collected[0].severity,
        pass: collected.length === 0,
        violations: collected,
    };
}
122
/**
 * Pick a severity for a regression delta.
 *
 * Only the warning tier's `regression-exceeds` limit is consulted:
 * - no `config.severity` section at all gives "warning"
 * - delta strictly below `severity.warning["regression-exceeds"]` gives "warning"
 * - anything else gives "info"
 */
function classifyRegressionSeverity(delta, config) {
    const severityConfig = config.severity;
    if (!severityConfig) {
        return "warning";
    }
    const warningLimit = severityConfig.warning?.["regression-exceeds"];
    const exceedsWarning = warningLimit !== undefined && delta < warningLimit;
    return exceedsWarning ? "warning" : "info";
}
135
+ // ---------------------------------------------------------------------------
136
+ // Severity classification
137
+ // ---------------------------------------------------------------------------
138
/**
 * Pick a severity for a score that fell below its threshold.
 *
 * The tiers are tried from most to least severe (critical, warning, info).
 * The first tier whose `composite-below` limit is defined and strictly
 * greater than the score wins. When there is no `config.severity` section,
 * or no tier matches, the result is "warning".
 */
function classifySeverity(score, config) {
    const severityConfig = config.severity;
    if (!severityConfig) {
        return "warning";
    }
    for (const tier of ["critical", "warning", "info"]) {
        const limit = severityConfig[tier]?.["composite-below"];
        if (limit !== undefined && score < limit) {
            return tier;
        }
    }
    return "warning";
}
160
/**
 * Check one area's score against its effective thresholds and append any
 * violations to `violations` (mutated in place; nothing is returned).
 *
 * Checks run in this order: composite score, the three dimension scores
 * (task-completion, code-correctness, doc-coverage), Doc Lift, ceiling.
 * A check whose threshold is undefined is skipped, except composite, which
 * is always compared.
 *
 * @param score - Area score entry (`feature`, `totalScore`, dimension scores,
 *   `docLift`, `negativeDocLift`, `ceilingScore`)
 * @param thresholds - Effective thresholds for this area
 * @param config - Full threshold configuration, used for severity classification
 * @param violations - Output array that receives the violation objects
 */
function evaluateArea(score, thresholds, config, violations) {
    const area = score.feature;
    const record = (metric, actual, threshold, severity, description) => {
        violations.push({ actual, area, description, metric, severity, threshold });
    };
    // Composite score
    const compositeFloor = thresholds.composite;
    if (score.totalScore < compositeFloor) {
        record("composite", score.totalScore, compositeFloor, classifySeverity(score.totalScore, config), `${area} composite score ${fmt(score.totalScore)} is below threshold ${compositeFloor}`);
    }
    // Per-dimension thresholds
    const dimensionFloors = thresholds.dimensions;
    if (dimensionFloors) {
        const actualByDimension = {
            "task-completion": score.taskCompletion,
            "code-correctness": score.codeCorrectness,
            "doc-coverage": score.docCoverage,
        };
        for (const [dimKey, actual] of Object.entries(actualByDimension)) {
            const floor = dimensionFloors[dimKey];
            if (floor === undefined || !(actual < floor)) {
                continue;
            }
            record(dimKey, actual, floor, classifySeverity(actual, config), `${area} ${dimKey} score ${fmt(actual)} is below threshold ${floor}`);
        }
    }
    // Doc Lift threshold (docs must not hurt)
    const docLiftFloor = thresholds["doc-lift"];
    if (docLiftFloor !== undefined && score.docLift < docLiftFloor) {
        // Critical only when the lift is flagged negative AND the config
        // enables the critical "negative-doc-lift" rule.
        const escalate = score.negativeDocLift && config.severity?.critical?.["negative-doc-lift"];
        const suffix = score.negativeDocLift ? " — docs are hurting performance" : "";
        record("doc-lift", score.docLift, docLiftFloor, escalate ? "critical" : "warning", `${area} Doc Lift ${fmt(score.docLift)} is below threshold ${docLiftFloor}${suffix}`);
    }
    // Ceiling threshold (doc quality floor)
    const ceilingFloor = thresholds.ceiling;
    if (ceilingFloor !== undefined && score.ceilingScore < ceilingFloor) {
        record("ceiling", score.ceilingScore, ceilingFloor, "warning", `${area} ceiling score ${fmt(score.ceilingScore)} is below threshold ${ceilingFloor} — documentation quality is insufficient`);
    }
}
222
+ // ---------------------------------------------------------------------------
223
+ // Helpers
224
+ // ---------------------------------------------------------------------------
225
/** Render a score for display as a whole number (rounded with Math.round). */
function fmt(n) {
    const rounded = Math.round(n);
    return `${rounded}`;
}
229
/**
 * Combine the default thresholds with an area's overrides.
 *
 * Without overrides, a shallow copy of `defaults` is returned. With
 * overrides, `ceiling`, `composite` and `doc-lift` each take the override
 * value unless it is null or undefined, and `dimensions` is the defaults'
 * dimensions with the override's dimensions spread on top.
 */
function mergeDefaults(defaults, overrides) {
    if (overrides) {
        const resolve = (key) => overrides[key] ?? defaults[key];
        const dimensions = { ...defaults.dimensions, ...overrides.dimensions };
        return {
            ceiling: resolve("ceiling"),
            composite: resolve("composite"),
            dimensions,
            "doc-lift": resolve("doc-lift"),
        };
    }
    return { ...defaults };
}
@@ -0,0 +1,10 @@
1
+ /**
2
+ * pipeline/types.ts — Re-export barrel
3
+ *
4
+ * All domain types now live in @sanity/ailf-core. This file re-exports
5
+ * them for backward compatibility — existing imports throughout
6
+ * packages/eval continue to work unchanged.
7
+ *
8
+ * @see packages/core/src/types/index.ts (canonical source)
9
+ */
10
+ export * from "../_vendor/ailf-core/index.d.ts";
@@ -0,0 +1,10 @@
1
+ /**
2
+ * pipeline/types.ts — Re-export barrel
3
+ *
4
+ * All domain types now live in @sanity/ailf-core. This file re-exports
5
+ * them for backward compatibility — existing imports throughout
6
+ * packages/eval continue to work unchanged.
7
+ *
8
+ * @see packages/core/src/types/index.ts (canonical source)
9
+ */
10
+ export * from "../_vendor/ailf-core/index.js";
@@ -0,0 +1,67 @@
1
+ /**
2
+ * pipeline/validate.ts
3
+ *
4
+ * Configuration validation for the evaluation pipeline. Checks that all YAML
5
+ * config files are consistent: every task has a canonical mapping, every
6
+ * mapping has a reference solution file, required files exist, etc.
7
+ *
8
+ * All individual validators are exported so they can be tested independently.
9
+ */
10
+ import type { ValidationIssue, ValidationResult } from "./types.js";
11
+ /**
12
+ * Run all validation checks and return a combined result.
13
+ * `valid` is true only if there are zero error-severity issues.
14
+ */
15
+ export declare function validateConfiguration(rootDir: string): ValidationResult;
16
+ /**
17
+ * Check that canonical context files exist. These are the per-task
18
+ * gold-retrieval contexts actually referenced by task definitions.
19
+ *
20
+ * Contexts are generated by fetch-docs and may not exist yet —
21
+ * returns warnings, not errors.
22
+ */
23
+ export declare function validateContexts(rootDir: string): ValidationIssue[];
24
+ /**
25
+ * Check that config/features.yaml exists, parses, and conforms to the Zod schema.
26
+ * Also cross-references covered features against actual task files for consistency.
27
+ *
28
+ * Returns warnings (not errors) if the file is missing — the feature registry
29
+ * is optional and doesn't block evaluation.
30
+ */
31
+ export declare function validateFeaturesYaml(rootDir: string): ValidationIssue[];
32
+ /**
33
+ * Check that config/models.yaml exists, parses, has at least one model with an id
34
+ * and label, and has a grader defined.
35
+ */
36
+ export declare function validateModelsYaml(rootDir: string): ValidationIssue[];
37
+ /**
38
+ * Check that reference solution files exist on disk for every task
39
+ * that declares a reference_solution path. Reads from inline task
40
+ * definitions via resolveMappings.
41
+ *
42
+ * NOTE: When tasks come from the Content Lake, reference solutions also
43
+ * live there (as ailf.referenceSolution documents). This validator only
44
+ * applies to YAML-based tasks with local file paths.
45
+ */
46
+ export declare function validateReferenceSolutions(rootDir: string): ValidationIssue[];
47
+ /**
48
+ * Check that config/rubrics.yaml exists, parses, and conforms to the Zod schema.
49
+ * Returns the validation issues found (the declared return type is
50
+ * ValidationIssue[]; this function does not return a set of template keys).
51
+ */
52
+ export declare function validateRubricsYaml(rootDir: string): ValidationIssue[];
53
+ /**
54
+ * Check that tasks/*.yaml files exist, parse, and conform to the Zod schema.
55
+ * Validates both the new single-definition format (with `id`) and the legacy
56
+ * paired format. Uses `TaskFileSchema` from schemas.ts for structural
57
+ * validation, plus cross-entry checks (duplicate IDs, docs path consistency).
58
+ */
59
+ export declare function validateTaskFiles(rootDir: string): ValidationIssue[];
60
+ /**
61
+ * Check that config/thresholds.yaml exists, parses, and conforms to the Zod schema.
62
+ *
63
+ * Returns warnings (not errors) if the file is missing — thresholds are
64
+ * optional and don't block evaluation. They only activate when
65
+ * `--readiness` or severity-aware sink routing is used.
66
+ */
67
+ export declare function validateThresholdsYaml(rootDir: string): ValidationIssue[];