@sanity/ailf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (530):
  1. package/README.md +89 -0
  2. package/bin/ailf.js +64 -0
  3. package/canonical/grader-references/README.md +88 -0
  4. package/canonical/grader-references/groq.yaml +234 -0
  5. package/canonical/grader-references/studio-setup.yaml +275 -0
  6. package/canonical/reference-solutions/.gitkeep +1 -0
  7. package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
  8. package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
  9. package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
  10. package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
  11. package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
  12. package/canonical/reference-solutions/groq/joins-references.ts +300 -0
  13. package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
  14. package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
  15. package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
  16. package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
  17. package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
  18. package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
  19. package/config/bigquery/README.md +74 -0
  20. package/config/bigquery/views/area_scores.sql +87 -0
  21. package/config/bigquery/views/reports.sql +49 -0
  22. package/config/features.yaml +116 -0
  23. package/config/models.yaml +115 -0
  24. package/config/prompts.yaml +75 -0
  25. package/config/rubrics.yaml +62 -0
  26. package/config/schedules.yaml +43 -0
  27. package/config/sinks.yaml +54 -0
  28. package/config/sources.yaml +51 -0
  29. package/config/thresholds.yaml +49 -0
  30. package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
  31. package/dist/_vendor/ailf-core/examples/index.js +285 -0
  32. package/dist/_vendor/ailf-core/index.d.ts +17 -0
  33. package/dist/_vendor/ailf-core/index.js +17 -0
  34. package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
  35. package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
  36. package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
  37. package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
  38. package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
  39. package/dist/_vendor/ailf-core/ports/context.js +14 -0
  40. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
  41. package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
  42. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
  43. package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
  44. package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
  45. package/dist/_vendor/ailf-core/ports/index.js +7 -0
  46. package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
  47. package/dist/_vendor/ailf-core/ports/logger.js +11 -0
  48. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
  49. package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
  50. package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
  51. package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
  52. package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
  53. package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
  54. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
  55. package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
  56. package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
  57. package/dist/_vendor/ailf-core/schemas/index.js +16 -0
  58. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
  59. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
  60. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
  61. package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
  62. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
  63. package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
  64. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
  65. package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
  66. package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
  67. package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
  68. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
  69. package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
  70. package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
  71. package/dist/_vendor/ailf-core/services/index.js +12 -0
  72. package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
  73. package/dist/_vendor/ailf-core/services/scoring.js +222 -0
  74. package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
  75. package/dist/_vendor/ailf-core/types/index.js +21 -0
  76. package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
  77. package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
  78. package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
  79. package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
  80. package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
  81. package/dist/_vendor/ailf-shared/document-ref.js +1 -0
  82. package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
  83. package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
  84. package/dist/_vendor/ailf-shared/index.d.ts +16 -0
  85. package/dist/_vendor/ailf-shared/index.js +16 -0
  86. package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
  87. package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
  88. package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
  89. package/dist/_vendor/ailf-shared/score-grades.js +23 -0
  90. package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
  91. package/dist/adapters/cache/content-lake-cache.js +59 -0
  92. package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
  93. package/dist/adapters/cache/filesystem-cache.js +54 -0
  94. package/dist/adapters/cache/index.d.ts +2 -0
  95. package/dist/adapters/cache/index.js +2 -0
  96. package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
  97. package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
  98. package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
  99. package/dist/adapters/config-sources/file-config-adapter.js +96 -0
  100. package/dist/adapters/config-sources/index.d.ts +2 -0
  101. package/dist/adapters/config-sources/index.js +2 -0
  102. package/dist/adapters/doc-fetchers/index.d.ts +1 -0
  103. package/dist/adapters/doc-fetchers/index.js +1 -0
  104. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
  105. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
  106. package/dist/adapters/eval-runners/index.d.ts +1 -0
  107. package/dist/adapters/eval-runners/index.js +1 -0
  108. package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
  109. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
  110. package/dist/adapters/index.d.ts +12 -0
  111. package/dist/adapters/index.js +12 -0
  112. package/dist/adapters/loggers/console-logger.d.ts +22 -0
  113. package/dist/adapters/loggers/console-logger.js +54 -0
  114. package/dist/adapters/loggers/index.d.ts +9 -0
  115. package/dist/adapters/loggers/index.js +9 -0
  116. package/dist/adapters/loggers/json-logger.d.ts +18 -0
  117. package/dist/adapters/loggers/json-logger.js +33 -0
  118. package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
  119. package/dist/adapters/loggers/quiet-logger.js +30 -0
  120. package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
  121. package/dist/adapters/task-sources/composite-task-source.js +59 -0
  122. package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
  123. package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
  124. package/dist/adapters/task-sources/index.d.ts +7 -0
  125. package/dist/adapters/task-sources/index.js +7 -0
  126. package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
  127. package/dist/adapters/task-sources/repo-schemas.js +234 -0
  128. package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
  129. package/dist/adapters/task-sources/repo-task-source.js +104 -0
  130. package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
  131. package/dist/adapters/task-sources/repo-trigger.js +153 -0
  132. package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
  133. package/dist/adapters/task-sources/repo-validation.js +164 -0
  134. package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
  135. package/dist/adapters/task-sources/yaml-task-source.js +136 -0
  136. package/dist/agent-observer/agentic-provider.d.ts +132 -0
  137. package/dist/agent-observer/agentic-provider.js +983 -0
  138. package/dist/agent-observer/classifier.d.ts +62 -0
  139. package/dist/agent-observer/classifier.js +269 -0
  140. package/dist/agent-observer/index.d.ts +7 -0
  141. package/dist/agent-observer/index.js +4 -0
  142. package/dist/agent-observer/pricing.d.ts +35 -0
  143. package/dist/agent-observer/pricing.js +82 -0
  144. package/dist/agent-observer/provider.d.ts +77 -0
  145. package/dist/agent-observer/provider.js +151 -0
  146. package/dist/agent-observer/proxy.d.ts +91 -0
  147. package/dist/agent-observer/proxy.js +321 -0
  148. package/dist/agent-observer/test-imports.d.ts +7 -0
  149. package/dist/agent-observer/test-imports.js +185 -0
  150. package/dist/agent-observer/types.d.ts +137 -0
  151. package/dist/agent-observer/types.js +16 -0
  152. package/dist/assertions/source-isolation.d.ts +72 -0
  153. package/dist/assertions/source-isolation.js +117 -0
  154. package/dist/cli.d.ts +24 -0
  155. package/dist/cli.js +199 -0
  156. package/dist/commands/agent-report.d.ts +5 -0
  157. package/dist/commands/agent-report.js +69 -0
  158. package/dist/commands/baseline.d.ts +9 -0
  159. package/dist/commands/baseline.js +141 -0
  160. package/dist/commands/cache.d.ts +13 -0
  161. package/dist/commands/cache.js +135 -0
  162. package/dist/commands/calculate-scores.d.ts +8 -0
  163. package/dist/commands/calculate-scores.js +48 -0
  164. package/dist/commands/compare.d.ts +8 -0
  165. package/dist/commands/compare.js +120 -0
  166. package/dist/commands/completion.d.ts +18 -0
  167. package/dist/commands/completion.js +260 -0
  168. package/dist/commands/coverage-audit.d.ts +7 -0
  169. package/dist/commands/coverage-audit.js +40 -0
  170. package/dist/commands/discovery-report.d.ts +10 -0
  171. package/dist/commands/discovery-report.js +44 -0
  172. package/dist/commands/eval.d.ts +9 -0
  173. package/dist/commands/eval.js +35 -0
  174. package/dist/commands/explain-handler.d.ts +34 -0
  175. package/dist/commands/explain-handler.js +719 -0
  176. package/dist/commands/fetch-docs.d.ts +8 -0
  177. package/dist/commands/fetch-docs.js +128 -0
  178. package/dist/commands/generate-configs.d.ts +8 -0
  179. package/dist/commands/generate-configs.js +46 -0
  180. package/dist/commands/grader/index.d.ts +11 -0
  181. package/dist/commands/grader/index.js +118 -0
  182. package/dist/commands/init.d.ts +19 -0
  183. package/dist/commands/init.js +150 -0
  184. package/dist/commands/interactive.d.ts +12 -0
  185. package/dist/commands/interactive.js +238 -0
  186. package/dist/commands/lookup-doc.d.ts +15 -0
  187. package/dist/commands/lookup-doc.js +84 -0
  188. package/dist/commands/measure-retrieval.d.ts +5 -0
  189. package/dist/commands/measure-retrieval.js +65 -0
  190. package/dist/commands/pipeline-action.d.ts +71 -0
  191. package/dist/commands/pipeline-action.js +305 -0
  192. package/dist/commands/pipeline.d.ts +62 -0
  193. package/dist/commands/pipeline.js +53 -0
  194. package/dist/commands/pr-comment.d.ts +8 -0
  195. package/dist/commands/pr-comment.js +47 -0
  196. package/dist/commands/publish.d.ts +26 -0
  197. package/dist/commands/publish.js +253 -0
  198. package/dist/commands/readiness-report.d.ts +10 -0
  199. package/dist/commands/readiness-report.js +104 -0
  200. package/dist/commands/shared/options.d.ts +29 -0
  201. package/dist/commands/shared/options.js +57 -0
  202. package/dist/commands/update-quality-scores.d.ts +5 -0
  203. package/dist/commands/update-quality-scores.js +20 -0
  204. package/dist/commands/validate-tasks.d.ts +16 -0
  205. package/dist/commands/validate-tasks.js +93 -0
  206. package/dist/commands/validate.d.ts +9 -0
  207. package/dist/commands/validate.js +73 -0
  208. package/dist/commands/webhook-server.d.ts +5 -0
  209. package/dist/commands/webhook-server.js +30 -0
  210. package/dist/commands/weekly-digest.d.ts +10 -0
  211. package/dist/commands/weekly-digest.js +104 -0
  212. package/dist/composition-root.d.ts +26 -0
  213. package/dist/composition-root.js +107 -0
  214. package/dist/interpolate.d.ts +26 -0
  215. package/dist/interpolate.js +70 -0
  216. package/dist/job-store.d.ts +104 -0
  217. package/dist/job-store.js +188 -0
  218. package/dist/lib/agent-behavior-report.d.ts +8 -0
  219. package/dist/lib/agent-behavior-report.js +185 -0
  220. package/dist/lib/baseline.d.ts +19 -0
  221. package/dist/lib/baseline.js +153 -0
  222. package/dist/lib/calculate-scores.d.ts +23 -0
  223. package/dist/lib/calculate-scores.js +42 -0
  224. package/dist/lib/compare.d.ts +18 -0
  225. package/dist/lib/compare.js +170 -0
  226. package/dist/lib/coverage-audit.d.ts +4 -0
  227. package/dist/lib/coverage-audit.js +42 -0
  228. package/dist/lib/discovery-report.d.ts +13 -0
  229. package/dist/lib/discovery-report.js +57 -0
  230. package/dist/lib/fetch-docs.d.ts +30 -0
  231. package/dist/lib/fetch-docs.js +171 -0
  232. package/dist/lib/generate-configs.d.ts +25 -0
  233. package/dist/lib/generate-configs.js +42 -0
  234. package/dist/lib/grader-api.d.ts +21 -0
  235. package/dist/lib/grader-api.js +34 -0
  236. package/dist/lib/grader-compare.d.ts +19 -0
  237. package/dist/lib/grader-compare.js +91 -0
  238. package/dist/lib/grader-consistency.d.ts +27 -0
  239. package/dist/lib/grader-consistency.js +79 -0
  240. package/dist/lib/grader-sensitivity.d.ts +19 -0
  241. package/dist/lib/grader-sensitivity.js +75 -0
  242. package/dist/lib/grader-validate.d.ts +19 -0
  243. package/dist/lib/grader-validate.js +78 -0
  244. package/dist/lib/measure-retrieval.d.ts +14 -0
  245. package/dist/lib/measure-retrieval.js +71 -0
  246. package/dist/lib/pr-comment.d.ts +16 -0
  247. package/dist/lib/pr-comment.js +28 -0
  248. package/dist/lib/readiness-report.d.ts +13 -0
  249. package/dist/lib/readiness-report.js +108 -0
  250. package/dist/lib/webhook-server.d.ts +11 -0
  251. package/dist/lib/webhook-server.js +24 -0
  252. package/dist/lib/weekly-digest.d.ts +24 -0
  253. package/dist/lib/weekly-digest.js +148 -0
  254. package/dist/orchestration/build-app-context.d.ts +27 -0
  255. package/dist/orchestration/build-app-context.js +81 -0
  256. package/dist/orchestration/build-step-sequence.d.ts +15 -0
  257. package/dist/orchestration/build-step-sequence.js +84 -0
  258. package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
  259. package/dist/orchestration/config-to-source-overrides.js +28 -0
  260. package/dist/orchestration/env-bridge.d.ts +21 -0
  261. package/dist/orchestration/env-bridge.js +66 -0
  262. package/dist/orchestration/index.d.ts +11 -0
  263. package/dist/orchestration/index.js +11 -0
  264. package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
  265. package/dist/orchestration/pipeline-orchestrator.js +153 -0
  266. package/dist/orchestration/step-runner.d.ts +20 -0
  267. package/dist/orchestration/step-runner.js +88 -0
  268. package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
  269. package/dist/orchestration/steps/calculate-scores-step.js +95 -0
  270. package/dist/orchestration/steps/callback-step.d.ts +24 -0
  271. package/dist/orchestration/steps/callback-step.js +76 -0
  272. package/dist/orchestration/steps/compare-step.d.ts +14 -0
  273. package/dist/orchestration/steps/compare-step.js +92 -0
  274. package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
  275. package/dist/orchestration/steps/discovery-report-step.js +55 -0
  276. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  277. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  278. package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
  279. package/dist/orchestration/steps/fetch-docs-step.js +135 -0
  280. package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
  281. package/dist/orchestration/steps/gap-analysis-step.js +136 -0
  282. package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
  283. package/dist/orchestration/steps/generate-configs-step.js +85 -0
  284. package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
  285. package/dist/orchestration/steps/grader-consistency-step.js +64 -0
  286. package/dist/orchestration/steps/index.d.ts +19 -0
  287. package/dist/orchestration/steps/index.js +19 -0
  288. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
  289. package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
  290. package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
  291. package/dist/orchestration/steps/publish-report-step.js +216 -0
  292. package/dist/orchestration/steps/readiness-step.d.ts +13 -0
  293. package/dist/orchestration/steps/readiness-step.js +91 -0
  294. package/dist/orchestration/steps/report-step.d.ts +12 -0
  295. package/dist/orchestration/steps/report-step.js +49 -0
  296. package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
  297. package/dist/orchestration/steps/run-eval-step.js +195 -0
  298. package/dist/orchestration/steps/validate-step.d.ts +12 -0
  299. package/dist/orchestration/steps/validate-step.js +41 -0
  300. package/dist/pipeline/agent-behavior-report.d.ts +53 -0
  301. package/dist/pipeline/agent-behavior-report.js +132 -0
  302. package/dist/pipeline/attribution.d.ts +47 -0
  303. package/dist/pipeline/attribution.js +226 -0
  304. package/dist/pipeline/baseline.d.ts +37 -0
  305. package/dist/pipeline/baseline.js +141 -0
  306. package/dist/pipeline/cache.d.ts +101 -0
  307. package/dist/pipeline/cache.js +283 -0
  308. package/dist/pipeline/calculate-scores.d.ts +102 -0
  309. package/dist/pipeline/calculate-scores.js +1128 -0
  310. package/dist/pipeline/callback-delivery.d.ts +50 -0
  311. package/dist/pipeline/callback-delivery.js +89 -0
  312. package/dist/pipeline/checks.d.ts +39 -0
  313. package/dist/pipeline/checks.js +280 -0
  314. package/dist/pipeline/classify-url.d.ts +61 -0
  315. package/dist/pipeline/classify-url.js +93 -0
  316. package/dist/pipeline/compare.d.ts +31 -0
  317. package/dist/pipeline/compare.js +208 -0
  318. package/dist/pipeline/coverage-audit.d.ts +39 -0
  319. package/dist/pipeline/coverage-audit.js +165 -0
  320. package/dist/pipeline/degradations.d.ts +85 -0
  321. package/dist/pipeline/degradations.js +242 -0
  322. package/dist/pipeline/discovery-report.d.ts +55 -0
  323. package/dist/pipeline/discovery-report.js +178 -0
  324. package/dist/pipeline/eval-constants.d.ts +68 -0
  325. package/dist/pipeline/eval-constants.js +111 -0
  326. package/dist/pipeline/eval-fingerprint.d.ts +66 -0
  327. package/dist/pipeline/eval-fingerprint.js +175 -0
  328. package/dist/pipeline/expand-tasks.d.ts +220 -0
  329. package/dist/pipeline/expand-tasks.js +421 -0
  330. package/dist/pipeline/failure-modes.d.ts +46 -0
  331. package/dist/pipeline/failure-modes.js +348 -0
  332. package/dist/pipeline/fetch-url-content.d.ts +44 -0
  333. package/dist/pipeline/fetch-url-content.js +93 -0
  334. package/dist/pipeline/gap-analysis.d.ts +48 -0
  335. package/dist/pipeline/gap-analysis.js +231 -0
  336. package/dist/pipeline/generate-configs.d.ts +72 -0
  337. package/dist/pipeline/generate-configs.js +395 -0
  338. package/dist/pipeline/grader-api.d.ts +49 -0
  339. package/dist/pipeline/grader-api.js +200 -0
  340. package/dist/pipeline/grader-compare-runner.d.ts +44 -0
  341. package/dist/pipeline/grader-compare-runner.js +301 -0
  342. package/dist/pipeline/grader-comparison.d.ts +111 -0
  343. package/dist/pipeline/grader-comparison.js +161 -0
  344. package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
  345. package/dist/pipeline/grader-consistency-runner.js +270 -0
  346. package/dist/pipeline/grader-consistency.d.ts +103 -0
  347. package/dist/pipeline/grader-consistency.js +146 -0
  348. package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
  349. package/dist/pipeline/grader-sensitivity-runner.js +282 -0
  350. package/dist/pipeline/grader-sensitivity.d.ts +94 -0
  351. package/dist/pipeline/grader-sensitivity.js +144 -0
  352. package/dist/pipeline/grader-validate-runner.d.ts +38 -0
  353. package/dist/pipeline/grader-validate-runner.js +229 -0
  354. package/dist/pipeline/grader-validation.d.ts +107 -0
  355. package/dist/pipeline/grader-validation.js +169 -0
  356. package/dist/pipeline/map-request-to-config.d.ts +19 -0
  357. package/dist/pipeline/map-request-to-config.js +80 -0
  358. package/dist/pipeline/measure-retrieval.d.ts +59 -0
  359. package/dist/pipeline/measure-retrieval.js +111 -0
  360. package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
  361. package/dist/pipeline/mirror-repo-tasks.js +350 -0
  362. package/dist/pipeline/plan-format.d.ts +33 -0
  363. package/dist/pipeline/plan-format.js +202 -0
  364. package/dist/pipeline/plan.d.ts +169 -0
  365. package/dist/pipeline/plan.js +708 -0
  366. package/dist/pipeline/pr-comment.d.ts +19 -0
  367. package/dist/pipeline/pr-comment.js +502 -0
  368. package/dist/pipeline/probe.d.ts +52 -0
  369. package/dist/pipeline/probe.js +390 -0
  370. package/dist/pipeline/provenance.d.ts +47 -0
  371. package/dist/pipeline/provenance.js +146 -0
  372. package/dist/pipeline/readiness-report.d.ts +87 -0
  373. package/dist/pipeline/readiness-report.js +205 -0
  374. package/dist/pipeline/release-classification.d.ts +54 -0
  375. package/dist/pipeline/release-classification.js +238 -0
  376. package/dist/pipeline/release-report.d.ts +37 -0
  377. package/dist/pipeline/release-report.js +222 -0
  378. package/dist/pipeline/repo-eval-comment.d.ts +37 -0
  379. package/dist/pipeline/repo-eval-comment.js +165 -0
  380. package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
  381. package/dist/pipeline/repo-threshold-evaluator.js +162 -0
  382. package/dist/pipeline/resolve-mappings.d.ts +35 -0
  383. package/dist/pipeline/resolve-mappings.js +72 -0
  384. package/dist/pipeline/retrieval-metrics.d.ts +39 -0
  385. package/dist/pipeline/retrieval-metrics.js +136 -0
  386. package/dist/pipeline/reverse-mapping.d.ts +67 -0
  387. package/dist/pipeline/reverse-mapping.js +88 -0
  388. package/dist/pipeline/schemas.d.ts +9 -0
  389. package/dist/pipeline/schemas.js +9 -0
  390. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  391. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  392. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  393. package/dist/pipeline/steps/compare-step.js +90 -0
  394. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  395. package/dist/pipeline/steps/eval-step.js +347 -0
  396. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  397. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  398. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  399. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  400. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  401. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  402. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  403. package/dist/pipeline/steps/publish-report-step.js +243 -0
  404. package/dist/pipeline/steps/report-step.d.ts +13 -0
  405. package/dist/pipeline/steps/report-step.js +56 -0
  406. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  407. package/dist/pipeline/steps/update-scores-step.js +42 -0
  408. package/dist/pipeline/targeted-loo.d.ts +88 -0
  409. package/dist/pipeline/targeted-loo.js +203 -0
  410. package/dist/pipeline/thresholds.d.ts +27 -0
  411. package/dist/pipeline/thresholds.js +245 -0
  412. package/dist/pipeline/types.d.ts +10 -0
  413. package/dist/pipeline/types.js +10 -0
  414. package/dist/pipeline/validate.d.ts +67 -0
  415. package/dist/pipeline/validate.js +406 -0
  416. package/dist/pipeline/webhook-server.d.ts +37 -0
  417. package/dist/pipeline/webhook-server.js +133 -0
  418. package/dist/report-store.d.ts +84 -0
  419. package/dist/report-store.js +208 -0
  420. package/dist/sanity/client.d.ts +38 -0
  421. package/dist/sanity/client.js +86 -0
  422. package/dist/sanity/portable-text.d.ts +11 -0
  423. package/dist/sanity/portable-text.js +211 -0
  424. package/dist/sanity/queries.d.ts +133 -0
  425. package/dist/sanity/queries.js +300 -0
  426. package/dist/schedules/digest.d.ts +116 -0
  427. package/dist/schedules/digest.js +156 -0
  428. package/dist/schedules/index.d.ts +12 -0
  429. package/dist/schedules/index.js +10 -0
  430. package/dist/schedules/loader.d.ts +31 -0
  431. package/dist/schedules/loader.js +73 -0
  432. package/dist/schedules/schema.d.ts +9 -0
  433. package/dist/schedules/schema.js +9 -0
  434. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  435. package/dist/scripts/agent-behavior-report.js +315 -0
  436. package/dist/scripts/baseline.d.ts +43 -0
  437. package/dist/scripts/baseline.js +267 -0
  438. package/dist/scripts/calculate-scores.d.ts +166 -0
  439. package/dist/scripts/calculate-scores.js +1296 -0
  440. package/dist/scripts/compare.d.ts +22 -0
  441. package/dist/scripts/compare.js +334 -0
  442. package/dist/scripts/coverage-audit.d.ts +44 -0
  443. package/dist/scripts/coverage-audit.js +209 -0
  444. package/dist/scripts/debug-eval.d.ts +19 -0
  445. package/dist/scripts/debug-eval.js +73 -0
  446. package/dist/scripts/discovery-report.d.ts +58 -0
  447. package/dist/scripts/discovery-report.js +250 -0
  448. package/dist/scripts/fetch-docs.d.ts +35 -0
  449. package/dist/scripts/fetch-docs.js +472 -0
  450. package/dist/scripts/generate-configs.d.ts +66 -0
  451. package/dist/scripts/generate-configs.js +459 -0
  452. package/dist/scripts/grader-api.d.ts +27 -0
  453. package/dist/scripts/grader-api.js +206 -0
  454. package/dist/scripts/grader-compare.d.ts +22 -0
  455. package/dist/scripts/grader-compare.js +368 -0
  456. package/dist/scripts/grader-consistency.d.ts +20 -0
  457. package/dist/scripts/grader-consistency.js +313 -0
  458. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  459. package/dist/scripts/grader-sensitivity.js +354 -0
  460. package/dist/scripts/grader-validate.d.ts +19 -0
  461. package/dist/scripts/grader-validate.js +267 -0
  462. package/dist/scripts/measure-retrieval.d.ts +10 -0
  463. package/dist/scripts/measure-retrieval.js +145 -0
  464. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
  465. package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
  466. package/dist/scripts/pipeline.d.ts +76 -0
  467. package/dist/scripts/pipeline.js +1031 -0
  468. package/dist/scripts/pr-comment.d.ts +10 -0
  469. package/dist/scripts/pr-comment.js +510 -0
  470. package/dist/scripts/readiness-report.d.ts +88 -0
  471. package/dist/scripts/readiness-report.js +342 -0
  472. package/dist/scripts/update-quality-scores.d.ts +15 -0
  473. package/dist/scripts/update-quality-scores.js +184 -0
  474. package/dist/scripts/validate-task-sources.d.ts +21 -0
  475. package/dist/scripts/validate-task-sources.js +210 -0
  476. package/dist/scripts/validate.d.ts +13 -0
  477. package/dist/scripts/validate.js +79 -0
  478. package/dist/scripts/webhook-server.d.ts +26 -0
  479. package/dist/scripts/webhook-server.js +147 -0
  480. package/dist/scripts/weekly-digest.d.ts +24 -0
  481. package/dist/scripts/weekly-digest.js +144 -0
  482. package/dist/sinks/bigquery/index.d.ts +131 -0
  483. package/dist/sinks/bigquery/index.js +222 -0
  484. package/dist/sinks/format-slack.d.ts +64 -0
  485. package/dist/sinks/format-slack.js +306 -0
  486. package/dist/sinks/index.d.ts +23 -0
  487. package/dist/sinks/index.js +18 -0
  488. package/dist/sinks/loader.d.ts +18 -0
  489. package/dist/sinks/loader.js +82 -0
  490. package/dist/sinks/retry.d.ts +24 -0
  491. package/dist/sinks/retry.js +52 -0
  492. package/dist/sinks/schema.d.ts +9 -0
  493. package/dist/sinks/schema.js +9 -0
  494. package/dist/sinks/slack/format.d.ts +65 -0
  495. package/dist/sinks/slack/format.js +327 -0
  496. package/dist/sinks/slack/index.d.ts +27 -0
  497. package/dist/sinks/slack/index.js +78 -0
  498. package/dist/sinks/slack-sink.d.ts +27 -0
  499. package/dist/sinks/slack-sink.js +78 -0
  500. package/dist/sinks/types.d.ts +59 -0
  501. package/dist/sinks/types.js +44 -0
  502. package/dist/sinks/webhook/index.d.ts +19 -0
  503. package/dist/sinks/webhook/index.js +50 -0
  504. package/dist/sinks/webhook-sink.d.ts +19 -0
  505. package/dist/sinks/webhook-sink.js +50 -0
  506. package/dist/sources.d.ts +104 -0
  507. package/dist/sources.js +292 -0
  508. package/dist/webhook/budget.d.ts +42 -0
  509. package/dist/webhook/budget.js +60 -0
  510. package/dist/webhook/debounce.d.ts +67 -0
  511. package/dist/webhook/debounce.js +76 -0
  512. package/dist/webhook/dispatch.d.ts +45 -0
  513. package/dist/webhook/dispatch.js +84 -0
  514. package/dist/webhook/eval-request-handler.d.ts +87 -0
  515. package/dist/webhook/eval-request-handler.js +181 -0
  516. package/dist/webhook/handler.d.ts +88 -0
  517. package/dist/webhook/handler.js +203 -0
  518. package/dist/webhook/index.d.ts +17 -0
  519. package/dist/webhook/index.js +12 -0
  520. package/dist/webhook/types.d.ts +109 -0
  521. package/dist/webhook/types.js +10 -0
  522. package/package.json +72 -0
  523. package/tasks/.expanded.agentic.yaml +51 -0
  524. package/tasks/.expanded.yaml +66 -0
  525. package/tasks/frameworks.yaml +98 -0
  526. package/tasks/functions.yaml +51 -0
  527. package/tasks/groq.yaml +216 -0
  528. package/tasks/nextjs-live.yaml +62 -0
  529. package/tasks/studio-setup.yaml +111 -0
  530. package/tasks/visual-editing.yaml +120 -0
@@ -0,0 +1,1082 @@
1
+ /**
2
+ * @sanity/ailf-core — Domain types
3
+ *
4
+ * Core types for the AI Literacy Framework evaluation pipeline.
5
+ * These define the contracts between pipeline steps, domain services,
6
+ * and port adapters.
7
+ *
8
+ * Extracted from packages/eval/src/pipeline/types.ts during the
9
+ * Ports & Adapters migration (Phase 0c). The original file is now a
10
+ * re-export barrel that preserves backward compatibility.
11
+ */
12
+ import type { ConcreteEvalMode as _ConcreteEvalMode, DocumentRef as _DocumentRef, EvalMode as _EvalMode } from "../../ailf-shared/index.d.ts";
13
+ export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "./scoring-input.js";
14
+ export type { DocumentRef } from "../../ailf-shared/index.d.ts";
15
+ type DocumentRef = _DocumentRef;
16
+ type EvalMode = _EvalMode;
17
+ type ConcreteEvalMode = _ConcreteEvalMode;
18
+ /** Aggregated retrieval metrics for a feature area */
19
+ export interface AreaRetrievalMetrics {
20
+ area: string;
21
+ avgF1: number;
22
+ avgPrecision: number;
23
+ avgRecall: number;
24
+ /** Unique canonical slugs that were never retrieved across all tasks */
25
+ invisibleDocs: string[];
26
+ /** Number of tasks with retrieval data */
27
+ taskCount: number;
28
+ tasks: TaskRetrievalMetrics[];
29
+ }
30
+ /** A parsed task-to-canonical-docs mapping */
31
+ export interface CanonicalMapping {
32
+ canonical_docs: {
33
+ slug: string;
34
+ reason: string;
35
+ }[];
36
+ description: string;
37
+ id: string;
38
+ reference_solution: string;
39
+ }
40
+ /** A document in a content release, classified by change type and evaluation tier */
41
+ export interface ClassifiedReleaseDocument {
42
+ /** Feature areas whose tasks reference this document (via canonical_docs) */
43
+ affectedAreas: string[];
44
+ /** Whether this is a new, updated, or removed document */
45
+ changeType: "new" | "removed" | "updated";
46
+ /** The document slug */
47
+ slug: string;
48
+ /** For new docs: Tier A (area has tasks) or Tier B (no tasks exist). "not-applicable" is presumably used for docs that are not new (updated/removed) — TODO confirm. */
49
+ tier: "A" | "B" | "not-applicable";
50
+ /** Whether this document maps to any existing task's canonical_docs */
51
+ tracked: boolean;
52
+ }
53
+ /** Coverage audit report */
54
+ export interface CoverageAuditReport {
55
+ /** Coverage percentage */
56
+ coveragePercent: number;
57
+ /** Covered features (with tasks) */
58
+ covered: (ProductFeature & {
59
+ actualTaskCount: number;
60
+ })[];
61
+ /** Timestamp of audit */
62
+ generatedAt: string;
63
+ /** Total features in registry */
64
+ totalFeatures: number;
65
+ /** Uncovered features (no tasks), sorted by priority */
66
+ uncovered: ProductFeature[];
67
+ }
68
+ /** Debug mode options for running a subset of tests */
69
+ export interface DebugOptions {
70
+ /** Enable debug mode (run subset of tests) */
71
+ enabled: boolean;
72
+ /** Number of tests to run (default: 2) */
73
+ firstN?: number;
74
+ /** Regex pattern to filter tests by description */
75
+ pattern?: string;
76
+ /** Random sample of N tests */
77
+ sample?: number;
78
+ }
79
+ export type { DimensionName, EvalMode, ConcreteEvalMode, } from "../../ailf-shared/index.d.ts";
80
+ export { FULL_MODE_SUBMODES } from "../../ailf-shared/index.d.ts";
81
+ /** A classified failure mode with confidence level */
82
+ export interface FailureMode {
83
+ /** How confident we are in this classification */
84
+ confidence: "high" | "low" | "medium";
85
+ /** The failure mode category */
86
+ mode: FailureModeType;
87
+ /** Source of the classification signal */
88
+ source: "ceiling" | "keyword" | "keyword+ceiling";
89
+ }
90
+ /** Summary report of failure modes across all judgments */
91
+ export interface FailureModeReport {
92
+ /** Per-area failure mode distribution */
93
+ byArea: Record<string, {
94
+ area: string;
95
+ modes: Record<FailureModeType, number>;
96
+ topMode: FailureModeType;
97
+ totalJudgments: number;
98
+ }>;
99
+ /** Percentage of judgments assigned a failure mode other than "unclassified" */
100
+ classificationRate: number;
101
+ /** Judgments that were classified */
102
+ classifiedJudgments: {
103
+ classification: FailureMode;
104
+ judgment: GraderJudgment;
105
+ }[];
106
+ /** Summary counts by failure mode */
107
+ summary: Record<FailureModeType, number>;
108
+ /** Total judgments analyzed */
109
+ totalJudgments: number;
110
+ }
111
+ /** Failure mode classification for a low-scoring judgment */
112
+ export type FailureModeType = "incorrect-docs" | "missing-docs" | "model-limitation" | "outdated-docs" | "poor-structure" | "unclassified";
113
+ /** Per-feature-area score breakdown */
114
+ export interface FeatureScore {
115
+ /**
116
+ * Actual score — agentic mode, model retrieves docs via tools.
117
+ * Only present when agentic evaluation data is available.
118
+ */
119
+ actualScore?: number;
120
+ /**
121
+ * Ceiling score — gold-standard docs injected directly.
122
+ * This is the theoretical maximum score for this area given the current docs.
123
+ */
124
+ ceilingScore: number;
125
+ codeCorrectness: number;
126
+ docCoverage: number;
127
+ /** Sanity documents used for this feature area's evaluation */
128
+ documents?: DocumentRef[];
129
+ /**
130
+ * Doc Lift — documentation quality contribution (ceiling − floor).
131
+ * Positive when docs help, negative when docs hurt (interference).
132
+ */
133
+ docLift: number;
134
+ /**
135
+ * Doc quality gap — room for documentation improvement (100 − ceiling).
136
+ * Lower is better.
137
+ */
138
+ docQualityGap: number;
139
+ feature: string;
140
+ /**
141
+ * Floor score — no docs, training data only.
142
+ * The model's inherent knowledge baseline.
143
+ */
144
+ floorScore: number;
145
+ /**
146
+ * Infrastructure efficiency — actual / ceiling (0.0–1.0).
147
+ * What fraction of documentation quality reaches agents through discovery?
148
+ * Null when ceiling ≤ 0, negative Doc Lift, or data is missing.
149
+ */
150
+ infrastructureEfficiency?: number | null;
151
+ /**
152
+ * Whether the retrieval gap is inverted — agents that can't find bad docs
153
+ * outperform those that can. Only true when Doc Lift is negative AND
154
+ * actual > ceiling. Signals a documentation quality problem being masked
155
+ * by retrieval failure. See evaluation-ceiling.md.
156
+ */
157
+ invertedRetrievalGap?: boolean;
158
+ /** Model identifier (e.g., "openai:gpt-4o"). Only present in per-model breakdowns. */
159
+ modelId?: string;
160
+ /**
161
+ * Whether Doc Lift is negative — docs hurt performance.
162
+ * When true, documentation is actively interfering with the model's
163
+ * training-data knowledge. See evaluation-ceiling.md.
164
+ */
165
+ negativeDocLift: boolean;
166
+ /**
167
+ * Retrieval gap — discovery/infrastructure loss (ceiling − actual).
168
+ * Positive means agents can't find all the good docs.
169
+ * Negative means agents outperform injected docs (inverted — see above).
170
+ * Only present when both ceiling and agentic data are available.
171
+ */
172
+ retrievalGap?: number;
173
+ taskCompletion: number;
174
+ testCount: number;
175
+ totalCost: number;
176
+ totalScore: number;
177
+ }
178
+ /** Scoping options to evaluate only specific areas or tasks */
179
+ export interface FilterOptions {
180
+ /** Feature areas to include (filename stems, e.g., ["groq", "frameworks"]) */
181
+ areas?: string[];
182
+ /** Specific task IDs to include (e.g., ["groq-blog-queries"]) */
183
+ taskIds?: string[];
184
+ }
185
+ /** Full gap analysis report */
186
+ export interface GapAnalysisReport {
187
+ /** All identified gaps, sorted by priority */
188
+ gaps: GapEstimate[];
189
+ /** Timestamp of analysis */
190
+ generatedAt: string;
191
+ /** Aggregate potential lift if all gaps were fixed */
192
+ totalPotentialLift: number;
193
+ }
194
+ /** A single gap with estimated impact if fixed */
195
+ export interface GapEstimate {
196
+ /** The tasks affected by this gap */
197
+ affectedTaskIds: string[];
198
+ /** The area this gap affects */
199
+ area: string;
200
+ /** The dimension(s) bottlenecked by this gap */
201
+ bottleneckDimensions: string[];
202
+ /** Confidence in the estimate */
203
+ confidence: "high" | "low" | "medium";
204
+ /** Current score of the bottleneck dimension(s) */
205
+ currentDimensionScores: Record<string, number>;
206
+ /** Estimated composite score lift if this gap is fully closed */
207
+ estimatedLift: number;
208
+ /** The failure mode causing the gap */
209
+ failureMode: FailureModeType;
210
+ /** Priority score (lift × affectedTaskCount) */
211
+ priority: number;
212
+ /** Specific remediation description */
213
+ remediation: string;
214
+ }
215
+ /** A single grader judgment — one per assertion per test */
216
+ export interface GraderJudgment {
217
+ /** The rubric template used (task-completion, code-correctness, doc-coverage) */
218
+ dimension: string;
219
+ /** The model that produced the response being graded */
220
+ modelId: string;
221
+ /** The grader's natural language reasoning */
222
+ reason: string;
223
+ /** The numeric score (0–100) */
224
+ score: number;
225
+ /** The task this judgment belongs to */
226
+ taskId: string;
227
+ }
228
+ /** Enriched grader judgment with canonical doc references, stored in reports */
229
+ export interface StoredJudgment extends GraderJudgment {
230
+ /** Canonical docs that the task expected the model to use */
231
+ canonicalDocs?: DocumentRef[];
232
+ }
233
+ /** Grader consistency diagnostics — does not affect scores, reported alongside */
234
+ export interface GraderReliability {
235
+ /** Inter-grader agreement (from multi-grader comparison) — Phase 3 */
236
+ agreement?: {
237
+ /** Models compared against the primary grader */
238
+ comparedModels: string[];
239
+ /** Pearson correlation with each compared model */
240
+ correlations: Record<string, number>;
241
+ /** Systematic bias (mean signed difference) per model */
242
+ bias: Record<string, number>;
243
+ /** Mean absolute difference per model */
244
+ meanAbsDiff: Record<string, number>;
245
+ /** Recommendation per candidate: comparable, divergent, or strongly-divergent */
246
+ recommendations: Record<string, string>;
247
+ };
248
+ /** Intra-grader consistency (from replicated grading runs) — Phase 1 */
249
+ consistency?: {
250
+ /** Average standard deviation across all grading judgments */
251
+ avgStdDev: number;
252
+ /** Max standard deviation observed (worst-case noise) */
253
+ maxStdDev: number;
254
+ /** Average score range (max - min) across judgments */
255
+ avgRange: number;
256
+ /** Number of replications per judgment */
257
+ replications: number;
258
+ /** Total judgments analyzed */
259
+ totalJudgments: number;
260
+ /** Recommended noise threshold for comparisons (2× max dimension σ) */
261
+ recommendedThreshold: number;
262
+ /** Per-dimension consistency */
263
+ perDimension: {
264
+ taskCompletion: {
265
+ avgStdDev: number;
266
+ maxStdDev: number;
267
+ };
268
+ codeCorrectness: {
269
+ avgStdDev: number;
270
+ maxStdDev: number;
271
+ };
272
+ docCoverage: {
273
+ avgStdDev: number;
274
+ maxStdDev: number;
275
+ };
276
+ };
277
+ };
278
+ /** Grader model used for this evaluation */
279
+ graderModel: string;
280
+ /** Sensitivity / discrimination power (from degradation testing) — Phase 4 */
281
+ sensitivity?: {
282
+ /** % of paired comparisons where grader ranked original higher */
283
+ concordanceRate: number;
284
+ /** Average score separation (original - degraded) */
285
+ avgSeparation: number;
286
+ /** Total paired comparisons analyzed */
287
+ totalPairs: number;
288
+ /** Per-dimension sensitivity */
289
+ perDimension: {
290
+ taskCompletion: {
291
+ concordanceRate: number;
292
+ avgSeparation: number;
293
+ };
294
+ codeCorrectness: {
295
+ concordanceRate: number;
296
+ avgSeparation: number;
297
+ };
298
+ docCoverage: {
299
+ concordanceRate: number;
300
+ avgSeparation: number;
301
+ };
302
+ };
303
+ };
304
+ /** Criterion validity (from human reference grades) — Phase 2 */
305
+ validity?: {
306
+ /** Mean Absolute Error vs. human grades */
307
+ mae: number;
308
+ /** Pearson correlation with human grades */
309
+ correlation: number;
310
+ /** Systematic bias (positive = grader scores higher than humans) */
311
+ bias: number;
312
+ /** Per-dimension correlation with human grades */
313
+ perDimension: {
314
+ taskCompletion: number;
315
+ codeCorrectness: number;
316
+ docCoverage: number;
317
+ };
318
+ /** Number of human-graded reference samples */
319
+ sampleSize: number;
320
+ /** Whether the grader passes the MAE threshold */
321
+ passesThreshold: boolean;
322
+ };
323
+ }
324
+ /** Cost estimate for targeted LOO before execution */
325
+ export interface LOOCostEstimate {
326
+ /** Breakdown per ambiguous task */
327
+ perTask: {
328
+ /** Estimated cost for this task's LOO */
329
+ estimatedCost: number;
330
+ /** Number of documents to evaluate */
331
+ numDocuments: number;
332
+ /** Number of tests per document */
333
+ numTests: number;
334
+ /** Task ID */
335
+ taskId: string;
336
+ }[];
337
+ /** Total estimated additional cost */
338
+ totalEstimatedCost: number;
339
+ }
340
+ /** Result of targeted leave-one-out attribution for an ambiguous task */
341
+ export interface LOOResult {
342
+ /** Additional eval cost incurred for this LOO analysis */
343
+ additionalCost: number;
344
+ /** Per-document marginal contributions */
345
+ contributions: {
346
+ /** Marginal score contribution of this document */
347
+ marginalContribution: number;
348
+ /** The document slug */
349
+ slug: string;
350
+ /** Whether the contribution is within grader noise floor */
351
+ withinNoiseFloor: boolean;
352
+ }[];
353
+ /** Whether contributions sum to approximately the total delta (±15%) */
354
+ sumMatchesTotal: boolean;
355
+ /** The task being analyzed */
356
+ taskId: string;
357
+ }
358
+ /** A parsed model entry from config/models.yaml */
359
+ export interface ModelEntry {
360
+ config?: Record<string, unknown>;
361
+ /** Environment variable gate — model is skipped when this var is unset */
362
+ env?: string;
363
+ id: string;
364
+ label: string;
365
+ modes?: string[];
366
+ }
367
+ /** Parsed config/models.yaml structure */
368
+ export interface ModelsConfig {
369
+ defaults: Record<string, unknown>;
370
+ grader: {
371
+ id: string;
372
+ label?: string;
373
+ };
374
+ /** Global max concurrency across all providers */
375
+ maxConcurrency?: number;
376
+ models: ModelEntry[];
377
+ }
378
+ /** Per-model score entry — one element per evaluated model */
379
+ export interface PerModelEntry {
380
+ /** Model label (human-readable name) */
381
+ label: string;
382
+ /** Provider model ID (e.g., "openai:chat:gpt-4o") */
383
+ modelId: string;
384
+ /** Model-level aggregates */
385
+ overall: {
386
+ avgDocLift: number;
387
+ avgScore: number;
388
+ /** Total cost for this model's tests */
389
+ cost?: number;
390
+ /** Number of tests run for this model */
391
+ testCount: number;
392
+ };
393
+ /** Per-area scores for this model */
394
+ scores: FeatureScore[];
395
+ }
396
+ /** Per-model score breakdown within a single evaluation run */
397
+ export type PerModelScores = PerModelEntry[];
398
+ /** Options for a pipeline run */
399
+ export interface PipelineOptions {
400
+ /** Debug mode — run a subset of tests for speed */
401
+ debug?: DebugOptions;
402
+ /** Run validation only, don't execute steps */
403
+ dryRun: boolean;
404
+ /** Scope evaluation to specific areas or tasks */
405
+ filter?: FilterOptions;
406
+ /** Which evaluation mode to run */
407
+ mode: EvalMode;
408
+ /** Bypass all pipeline caching (force re-execution of every step) */
409
+ noCache: boolean;
410
+ /** Skip the eval step (recalculate from existing results) */
411
+ skipEval: boolean;
412
+ /** Skip the documentation fetch step (reuse cached contexts) */
413
+ skipFetch: boolean;
414
+ /** Documentation source name (from config/sources.yaml) */
415
+ source?: string;
416
+ }
417
+ /** A Promptfoo share URL tagged with the evaluation mode that produced it. */
418
+ export interface PromptfooUrlEntry {
419
+ mode: ConcreteEvalMode;
420
+ url: string;
421
+ }
422
+ /**
423
+ * Mutable state bag shared between pipeline steps during execution.
424
+ *
425
+ * Flows through the orchestrator loop — producer steps write to it,
426
+ * consumer steps read from it. This eliminates filesystem round-trips
427
+ * for small inter-step coordination values (e.g., a reportId generated
428
+ * by PublishReportStep and consumed by CallbackStep).
429
+ *
430
+ * Large artifacts (score-summary.json, comparison-report.json) still
431
+ * live on the filesystem — PipelineState is for ephemeral values that
432
+ * only matter during a single pipeline execution.
433
+ */
434
+ export interface PipelineState {
435
+ /** Report ID generated by PublishReportStep, consumed by CallbackStep + orchestrator job update */
436
+ reportId?: string;
437
+ /** Eval fingerprint computed by RunEvalStep, consumed by PublishReportStep */
438
+ evalFingerprint?: string;
439
+ /** Promptfoo share URLs produced by RunEvalStep, consumed by PublishReportStep */
440
+ promptfooUrls?: PromptfooUrlEntry[];
441
+ }
442
+ /** Result of a full pipeline run */
443
+ export interface PipelineResult {
444
+ /** Cache hit/miss statistics for this run */
445
+ cache?: {
446
+ hits: number;
447
+ misses: number;
448
+ skipped: number;
449
+ total: number;
450
+ };
451
+ /** Total duration in milliseconds */
452
+ durationMs: number;
453
+ /** @deprecated Use `promptfooUrls` — kept for backward compatibility */
454
+ promptfooUrl?: string;
455
+ /** Per-mode Promptfoo share URLs (one per sub-eval that produced a shareable link) */
456
+ promptfooUrls?: PromptfooUrlEntry[];
457
+ /** Results per step */
458
+ steps: Record<string, StepResult>;
459
+ /** Overall success (all non-skipped steps succeeded) */
460
+ success: boolean;
461
+ /** Validation issues found (if any) */
462
+ validation: ValidationResult;
463
+ }
464
+ /** Result of a Tier B probe (new feature, no tasks) */
465
+ export interface ProbeResult {
466
+ /** API names the model extracted from the docs */
467
+ apiNamesExtracted: string[];
468
+ /** The documents evaluated */
469
+ documentSlugs: string[];
470
+ /** API names the model used that are NOT in the docs (hallucinations) */
471
+ hallucinatedApis: string[];
472
+ /** Whether the model produced any code output */
473
+ producedCode: boolean;
474
+ /** The model's raw output (for human review) */
475
+ rawOutput: string;
476
+ /** Suggested task descriptions based on document content analysis */
477
+ suggestedTasks: string[];
478
+ /** Qualitative assessment */
479
+ usability: "not-usable" | "partially-usable" | "usable";
480
+ }
481
+ /** A product feature in the registry */
482
+ export interface ProductFeature {
483
+ /** Maps to tasks/<area>.yaml */
484
+ area?: string;
485
+ /** Unique feature identifier */
486
+ id: string;
487
+ /** Human-readable name */
488
+ name: string;
489
+ /** Priority for evaluation coverage */
490
+ priority: "critical" | "high" | "low" | "medium";
491
+ /** Product sections this feature belongs to */
492
+ sections: string[];
493
+ /** Coverage status */
494
+ status: "covered" | "out-of-scope" | "planned" | "uncovered";
495
+ /** Number of evaluation tasks (if covered) */
496
+ taskCount?: number;
497
+ }
498
+ /** Full classification of a content release for evaluation */
499
+ export interface ReleaseClassification {
500
+ /** All classified documents in the release */
501
+ documents: ClassifiedReleaseDocument[];
502
+ /** Timestamp of classification */
503
+ generatedAt: string;
504
+ /** Summary counts */
505
+ summary: {
506
+ /** New documents in areas with existing tasks */
507
+ tierA: number;
508
+ /** New documents in areas without tasks */
509
+ tierB: number;
510
+ /** Total documents in the release */
511
+ total: number;
512
+ /** Documents tracked by at least one task */
513
+ tracked: number;
514
+ /** Updated documents (existing in canonical sets) */
515
+ updated: number;
516
+ /** Documents not tracked by any task */
517
+ untracked: number;
518
+ };
519
+ }
520
+ /** Cross-area release impact report */
521
+ export interface ReleaseImpactReport {
522
+ /** Per-area impact summaries */
523
+ areas: {
524
+ /** Area name */
525
+ area: string;
526
+ /** Overall area score delta */
527
+ delta: number;
528
+ /** Whether the area regressed (delta < -threshold) */
529
+ regressed: boolean;
530
+ /** Per-task deltas within this area */
531
+ tasks: {
532
+ /** Documents attributed to this delta */
533
+ attributedDocs: string[];
534
+ /** Task score delta */
535
+ delta: number;
536
+ /** Task ID */
537
+ taskId: string;
538
+ }[];
539
+ }[];
540
+ /** Areas evaluated but with no score change (within noise) */
541
+ confirmedUnchanged: string[];
542
+ /** Classified documents from the release */
543
+ documents: ClassifiedReleaseDocument[];
544
+ /** Timestamp of report */
545
+ generatedAt: string;
546
+ /** Areas not evaluated (no release documents match) */
547
+ notEvaluated: string[];
548
+ /** Overall score delta across all affected areas */
549
+ overallDelta: number;
550
+ /** Probe results for Tier B documents (if any) */
551
+ probes: ProbeResult[];
552
+ /** Areas that regressed (delta < -threshold) */
553
+ regressions: string[];
554
+ }
555
+ /** Top-level retrieval metrics (added to ScoreSummary when agentic data exists) */
556
+ export interface RetrievalMetrics {
557
+ /** Per-area retrieval metrics */
558
+ areas: AreaRetrievalMetrics[];
559
+ /** All canonical doc slugs that were never retrieved by any task */
560
+ globalInvisibleDocs: string[];
561
+ /** Overall averages across all areas */
562
+ overall: {
563
+ avgF1: number;
564
+ avgPrecision: number;
565
+ avgRecall: number;
566
+ };
567
+ }
568
+ /** Top-level score summary (the shape of score-summary.json) */
569
+ export interface ScoreSummary {
570
+ belowCritical: string[];
571
+ /**
572
+ * All Sanity documents used across the entire evaluation.
573
+ * De-duplicated union of documents from all areas/tasks.
574
+ * Populated during the gap-analysis enrichment step from the document manifest.
575
+ */
576
+ documentManifest?: DocumentRef[];
577
+ /**
578
+ * Which evaluation modes contributed data to this summary.
579
+ * - `'full'`: both baseline and agentic data present
580
+ * - `'baseline'`: floor + ceiling only (no agentic data)
581
+ * - `'agentic'`: actual only (no floor/ceiling data)
582
+ * - `'observed'`: observed mode data
583
+ * Absent in legacy summaries — treat as `'baseline'` for backward compat.
584
+ */
585
+ evaluationMode?: EvalMode;
586
+ /** Failure mode analysis (Phase 3a) — diagnostic breakdown of why scores are low */
587
+ failureModes?: FailureModeReport;
588
+ /**
589
+ * Low-scoring grader judgments (normalized score < 70) — the raw "red text"
590
+ * from the grader explaining why specific tests scored poorly.
591
+ * Scores are normalized to 0–100 scale before filtering.
592
+ * Each judgment includes the canonical docs the task expected.
593
+ */
594
+ lowScoringJudgments?: StoredJudgment[];
595
+ /** Gap analysis recommendations (Phase 3b) — prioritized remediation plan */
596
+ recommendations?: GapAnalysisReport;
597
+ /** Grader reliability diagnostics (does not affect scores) */
598
+ graderReliability?: GraderReliability;
599
+ lowestArea: string;
600
+ lowestScore: number;
601
+ /** Areas with negative Doc Lift — docs hurt performance. Empty when no area has a negative Doc Lift. */
602
+ negativeDocLiftAreas?: {
603
+ area: string;
604
+ docLift: number;
605
+ }[];
606
+ overall: {
607
+ /** Average actual (agentic) score across areas. Absent if no agentic data. */
608
+ avgActualScore?: number;
609
+ /** Average ceiling score across all areas */
610
+ avgCeilingScore: number;
611
+ avgScore: number;
612
+ avgDocLift: number;
613
+ /** Average doc quality gap across all areas (100 − ceiling) */
614
+ avgDocQualityGap: number;
615
+ /** Average floor score across all areas */
616
+ avgFloorScore: number;
617
+ /** Average infrastructure efficiency (actual / ceiling). Absent if no agentic data. */
618
+ avgInfrastructureEfficiency?: number;
619
+ /** Average retrieval gap (ceiling − actual). Absent if no agentic data. */
620
+ avgRetrievalGap?: number;
621
+ /** Count of areas with negative Doc Lift */
622
+ negativeDocLiftCount: number;
623
+ cost?: {
624
+ total: number;
625
+ perTest: number;
626
+ graderTotal: number;
627
+ graderModel?: string;
628
+ totalTokens: number;
629
+ };
630
+ };
631
+ /** Per-model score breakdowns (Tier 3 from metrics-design.md) */
632
+ perModel?: PerModelScores;
633
+ /** Retrieval metrics from agentic mode (Approach 4) */
634
+ retrievalMetrics?: RetrievalMetrics;
635
+ scores: FeatureScore[];
636
+ source?: {
637
+ name: string;
638
+ baseUrl: string;
639
+ dataset?: string;
640
+ /** Sanity release perspective ID (when evaluating content releases) */
641
+ perspective?: string;
642
+ projectId: string;
643
+ };
644
+ timestamp: string;
645
+ }
646
+ /** Result of a single pipeline step */
647
+ export type StepResult = {
648
+ status: "failed";
649
+ durationMs: number;
650
+ error: string;
651
+ } | {
652
+ status: "skipped";
653
+ reason: string;
654
+ } | {
655
+ status: "success";
656
+ durationMs: number;
657
+ summary: string;
658
+ };
659
+ /** Per-task attribution result — which documents caused a score change */
660
+ export interface TaskAttribution {
661
+ /** The feature area */
662
+ area: string;
663
+ /** Documents attributed (1 for unambiguous, N for ambiguous, 0 for uncorrelated) */
664
+ attributedDocs: string[];
665
+ /** Attribution classification */
666
+ classification: AttributionClass;
667
+ /** Score delta for this task (area-level delta used as proxy) */
668
+ delta: number;
669
+ /** The task ID */
670
+ taskId: string;
671
+ /** Whether delta is within grader noise floor */
672
+ withinNoiseFloor: boolean;
673
+ }
674
+ /** A parsed task entry from a tasks/*.yaml file (single-definition format) */
675
+ export interface TaskEntry {
676
+ assert?: unknown[];
677
+ /** Baseline generation options (new format only). */
678
+ baseline?: {
679
+ enabled?: boolean;
680
+ rubric?: "abbreviated" | "full" | "none";
681
+ };
682
+ description: string;
683
+ /** Explicit task ID — determines the canonical context filename. */
684
+ id?: string;
685
+ transform?: string;
686
+ vars?: Record<string, unknown>;
687
+ }
688
+ /** Retrieval metrics for a single task */
689
+ export interface TaskRetrievalMetrics {
690
+ /** Feature area (from task file) */
691
+ area: string;
692
+ /** Slugs expected (from canonical_docs) */
693
+ expected: string[];
694
+ /** F1 score: 2 * precision * recall / (precision + recall) */
695
+ f1: number;
696
+ /** Slugs in both retrieved and expected */
697
+ hits: string[];
698
+ /** Expected slugs the agent missed */
699
+ missed: string[];
700
+ /** Precision: |retrieved ∩ canonical| / |retrieved| */
701
+ precision: number;
702
+ /** Recall: |retrieved ∩ canonical| / |canonical| */
703
+ recall: number;
704
+ /** Slugs the agent actually retrieved */
705
+ retrieved: string[];
706
+ /** Task ID matching the task YAML definition */
707
+ taskId: string;
708
+ /** Retrieved slugs not in the expected set */
709
+ unexpected: string[];
710
+ }
711
+ /** A single validation finding — either an error (blocks execution) or a warning */
712
+ export interface ValidationIssue {
713
+ /** What went wrong */
714
+ message: string;
715
+ /** Optional file path relevant to the issue */
716
+ path?: string;
717
+ /** Error = blocks execution, warning = informational */
718
+ severity: ValidationSeverity;
719
+ /** Which step or check produced this finding */
720
+ source: string;
721
+ }
722
+ /** Result of a validation pass */
723
+ export interface ValidationResult {
724
+ /** All findings */
725
+ issues: ValidationIssue[];
726
+ /** True if there are no errors (warnings are OK) */
727
+ valid: boolean;
728
+ }
729
+ /** Severity of a validation finding */
730
+ export type ValidationSeverity = "error" | "warning";
731
+ /**
732
+ * Default noise threshold — re-exported from @sanity/ailf-shared.
733
+ * Named DEFAULT_NOISE_THRESHOLD here for backward compatibility with eval code
734
+ * that imports it from @sanity/ailf-core.
735
+ */
736
+ export { NOISE_THRESHOLD as DEFAULT_NOISE_THRESHOLD } from "../../ailf-shared/index.d.ts";
737
+ /** Per-area delta with dimension breakdowns */
738
+ export interface AreaDelta {
739
+ /** Actual (agentic) score delta. Absent if either run lacks agentic data. */
740
+ actualDelta?: number;
741
+ area: string;
742
+ /** Baseline total score */
743
+ baseline: number;
744
+ /** Ceiling score delta (doc quality change). If ceiling moved, doc content changed. */
745
+ ceilingDelta: number;
746
+ /** Change classification based on noise threshold */
747
+ change: ChangeClass;
748
+ /** Cost delta (if cost data is available) */
749
+ costDelta?: number;
750
+ /** Overall score delta (experiment − baseline) */
751
+ delta: number;
752
+ /** Per-dimension deltas */
753
+ dimensions: {
754
+ taskCompletion: {
755
+ baseline: number;
756
+ experiment: number;
757
+ delta: number;
758
+ };
759
+ codeCorrectness: {
760
+ baseline: number;
761
+ experiment: number;
762
+ delta: number;
763
+ };
764
+ docCoverage: {
765
+ baseline: number;
766
+ experiment: number;
767
+ delta: number;
768
+ };
769
+ };
770
+ /** Doc Lift delta */
771
+ docLiftDelta: number;
772
+ /** Experiment total score */
773
+ experiment: number;
774
+ /** Floor score delta (model knowledge change — should be ~0 between runs) */
775
+ floorDelta: number;
776
+ /** Infrastructure efficiency delta. Absent if either run lacks full decomposition data. */
777
+ infrastructureEfficiencyDelta?: number;
778
+ /** Retrieval gap delta. Positive = gap grew (worse), negative = gap shrank (better). */
779
+ retrievalGapDelta?: number;
780
+ }
781
+ /** Attribution classification for a task delta */
782
+ export type AttributionClass = "ambiguous" | "unambiguous" | "uncorrelated";
783
+ /** Complete attribution report for a set of changed documents */
784
+ export interface AttributionReport {
785
+ /** All task attributions */
786
+ attributions: TaskAttribution[];
787
+ /** Summary counts */
788
+ summary: {
789
+ ambiguous: number;
790
+ unambiguous: number;
791
+ uncorrelated: number;
792
+ withinNoise: number;
793
+ };
794
+ /** Changed documents that aren't in any task's canonical docs */
795
+ untrackedDocs: string[];
796
+ }
797
+ /** Classification of a feature area's score change */
798
+ export type ChangeClass = "improved" | "regressed" | "unchanged";
799
+ /** Options for the compare function */
800
+ export interface CompareOptions {
801
+ /** Grader consistency data — if provided, used to compute empirical noise threshold */
802
+ graderConsistency?: GraderConsistencyData;
803
+ /**
804
+ * Deltas within ±threshold are classified as "unchanged".
805
+ * Defaults to DEFAULT_NOISE_THRESHOLD (2).
806
+ * Overridden by graderConsistency.recommendedThreshold when available.
807
+ */
808
+ noiseThreshold?: number;
809
+ }
810
+ /**
811
+ * Structured comparison between two evaluation runs.
812
+ *
813
+ * This is the core comparison primitive (see BP5 in the evaluation roadmap).
814
+ * Every comparison scenario uses the same shape — what varies is what
815
+ * produced each ScoreSummary:
816
+ *
817
+ * | Scenario | Baseline | Experiment |
818
+ * |---------------------|---------------------|--------------------|
819
+ * | Doc improvement | Before rewrite | After rewrite |
820
+ * | Model comparison | Model A | Model B |
821
+ * | Model regression | Previous version | Current version |
822
+ * | Branch validation | Production | Branch deploy |
823
+ * | Infrastructure ROI | Naive agent | Optimized agent |
824
+ * | Training freshness | Without docs | With docs (DocLift)|
825
+ */
826
+ export interface ComparisonReport {
827
+ /** Detailed per-area breakdowns */
828
+ areas: AreaDelta[];
829
+ /** Per-document attribution (when changed docs are known) */
830
+ attribution?: AttributionReport;
831
+ /** The "before" or "control" summary */
832
+ baseline: ScoreSummary;
833
+ /** Aggregate deltas */
834
+ deltas: {
835
+ /** Overall score delta (experiment.avgScore − baseline.avgScore) */
836
+ overall: number;
837
+ /** Per-area total score deltas */
838
+ perArea: Record<string, number>;
839
+ /** Per-dimension average deltas (across all areas) */
840
+ perDimension: {
841
+ taskCompletion: number;
842
+ codeCorrectness: number;
843
+ docCoverage: number;
844
+ };
845
+ /** Doc Lift average delta */
846
+ docLift: number;
847
+ /** Cost delta (if both runs have cost data) */
848
+ cost?: number;
849
+ /** Per-model overall score deltas, one { delta, modelId } entry per model (when both summaries have per-model data) */
850
+ perModel?: {
851
+ delta: number;
852
+ modelId: string;
853
+ }[];
854
+ };
855
+ /** The "after" or "treatment" summary */
856
+ experiment: ScoreSummary;
857
+ /** When this comparison was generated (typed as a plain string, not the branded ISOTimestamp) */
858
+ generatedAt: string;
859
+ /** Areas that improved beyond the noise threshold */
860
+ improved: string[];
861
+ /** Areas present in only one of the two summaries */
862
+ mismatched: {
863
+ /** Areas in baseline but not experiment */
864
+ onlyInBaseline: string[];
865
+ /** Areas in experiment but not baseline */
866
+ onlyInExperiment: string[];
867
+ };
868
+ /** Noise threshold used for classification */
869
+ noiseThreshold: number;
870
+ /** Whether the noise threshold was derived from empirical grader consistency data */
871
+ noiseThresholdEmpirical: boolean;
872
+ /** Areas that regressed beyond the noise threshold */
873
+ regressed: string[];
874
+ /** Areas within the noise threshold */
875
+ unchanged: string[];
876
+ }
877
+ /** Confidence annotation for a single delta: the delta, its 95% CI half-width, the grader σ used, and a significance flag */
878
+ export interface ConfidenceAnnotation {
879
+ /** 95% confidence interval half-width (±this value) */
880
+ ci95: number;
881
+ /** The measured delta */
882
+ delta: number;
883
+ /** The grader σ used to compute this CI */
884
+ graderSigma: number;
885
+ /** Whether the delta exceeds the confidence interval (statistically significant) */
886
+ significant: boolean;
887
+ }
888
+ /**
889
+ * Grader consistency data for empirical confidence intervals.
890
+ * This is the subset of GraderConsistency needed for comparison annotation.
891
+ * Matches the shape produced by pipeline/grader-consistency.ts.
892
+ */
893
+ export interface GraderConsistencyData {
894
+ /** Per-dimension consistency metrics: an avgStdDev / maxStdDev pair for each of the three scoring dimensions */
895
+ perDimension: {
896
+ codeCorrectness: {
897
+ avgStdDev: number;
898
+ maxStdDev: number;
899
+ };
900
+ docCoverage: {
901
+ avgStdDev: number;
902
+ maxStdDev: number;
903
+ };
904
+ taskCompletion: {
905
+ avgStdDev: number;
906
+ maxStdDev: number;
907
+ };
908
+ };
909
+ /** Recommended noise threshold for comparisons (2× max dimension σ) */
910
+ recommendedThreshold: number;
911
+ }
912
+ /** ISO 8601 timestamp string, branded via `__brand` so a plain string is not assignable without a cast */
913
+ export type ISOTimestamp = string & {
914
+ readonly __brand: "ISOTimestamp";
915
+ };
916
+ /** Query parameters for finding comparable baseline reports; only `mode` is required */
917
+ export interface LineageQuery {
918
+ /** Match on feature areas (overlapping set) */
919
+ areas?: string[];
920
+ /** Only reports before this timestamp */
921
+ before?: ISOTimestamp;
922
+ /** Match on evaluation mode */
923
+ mode: EvalMode;
924
+ /** Match on models */
925
+ models?: string[];
926
+ /** Match on source (any subset of the ReportProvenance source fields) */
927
+ source?: Partial<ReportProvenance["source"]>;
928
+ }
929
+ /** Result of publishing a report to all sinks */
930
+ export interface PublishResult {
931
+ /** The auto-comparison (if a comparable baseline was found) */
932
+ comparison?: ComparisonReport;
933
+ /** The published report */
934
+ report: Report;
935
+ /** Per-sink results: each entry pairs a sink name with its SinkResult */
936
+ sinkResults: {
937
+ name: string;
938
+ result: SinkResult;
939
+ }[];
940
+ }
941
+ /** A published evaluation report — the atomic unit of the report store */
942
+ export interface Report {
943
+ /** Optional auto-comparison against the most recent comparable report */
944
+ comparison?: ComparisonReport;
945
+ /** When the evaluation completed */
946
+ completedAt: ISOTimestamp;
947
+ /** How long the pipeline took (milliseconds, per the field name) */
948
+ durationMs: number;
949
+ /** Stable unique identifier (UUID v7 for time-sortability) */
950
+ id: ReportId;
951
+ /** What produced this report */
952
+ provenance: ReportProvenance;
953
+ /** The full score summary */
954
+ summary: ScoreSummary;
955
+ /** Optional human-supplied label */
956
+ tag?: string;
957
+ }
958
+ /** Branded string type for report identifiers (UUID v7 for time-sortability); a plain string is not assignable without a cast */
959
+ export type ReportId = string & {
960
+ readonly __brand: "ReportId";
961
+ };
962
+ /** Full provenance metadata for an evaluation report */
963
+ export interface ReportProvenance {
964
+ /** Which feature areas were evaluated */
965
+ areas: string[];
966
+ /** Content hash of the documentation context at eval time */
967
+ contextHash?: string;
968
+ /**
969
+ * Evaluation fingerprint — SHA-256 of all inputs that affect eval output.
970
+ * Used for cross-environment cache lookup (CI → Content Lake).
971
+ * @see docs/design-docs/content-lake-eval-caching.md
972
+ */
973
+ evalFingerprint?: string;
974
+ /** Git metadata (when run from CI); `prNumber` is the only optional field */
975
+ git?: {
976
+ branch: string;
977
+ prNumber?: number;
978
+ repo: string;
979
+ sha: string;
980
+ };
981
+ /** Grader model used for scoring */
982
+ graderModel: string;
983
+ /** Evaluation mode */
984
+ mode: EvalMode;
985
+ /** Models under evaluation, each given as an id plus a label */
986
+ models: {
987
+ id: string;
988
+ label: string;
989
+ }[];
990
+ /** @deprecated Use `promptfooUrls` — kept for backward compatibility */
991
+ promptfooUrl?: string;
992
+ /** Per-mode Promptfoo share URLs (one per sub-eval that produced a shareable link) */
993
+ promptfooUrls?: PromptfooUrlEntry[];
994
+ /** Documentation source configuration; only `baseUrl` and `name` are required */
995
+ source: {
996
+ baseUrl: string;
997
+ dataset?: string;
998
+ name: string;
999
+ perspective?: string;
1000
+ projectId?: string;
1001
+ };
1002
+ /** Sanity document IDs that were targeted (if using --sanity-document) */
1003
+ targetDocuments?: string[];
1004
+ /** Which specific task IDs were evaluated (if scoped) */
1005
+ taskIds?: string[];
1006
+ /** What initiated this evaluation */
1007
+ trigger: ReportTrigger;
1008
+ }
1009
+ /** What triggered this evaluation — a union discriminated by `type` ("ci", "cross-repo", "manual", "scheduled", "webhook") */
1010
+ export type ReportTrigger = {
1011
+ type: "ci";
1012
+ runId: string;
1013
+ workflow: string;
1014
+ } | {
1015
+ type: "cross-repo";
1016
+ callerRef?: string;
1017
+ callerRepo: string;
1018
+ } | {
1019
+ type: "manual";
1020
+ } | {
1021
+ type: "scheduled";
1022
+ schedule: string;
1023
+ } | {
1024
+ type: "webhook";
1025
+ documentId?: string;
1026
+ source: string;
1027
+ };
1028
+ /** Health check result for a sink — discriminated by `healthy`; only the unhealthy variant carries a `reason` */
1029
+ export type SinkHealthStatus = {
1030
+ healthy: false;
1031
+ reason: string;
1032
+ } | {
1033
+ healthy: true;
1034
+ };
1035
+ /** Payload delivered to sinks — wraps a Report (in `report`) together with threshold context */
1036
+ export interface SinkPayload {
1037
+ /** Comparison report (if available) */
1038
+ comparison?: ComparisonReport;
1039
+ /** Maximum severity across all violations ("none" or a ThresholdSeverity value) */
1040
+ maxSeverity: "none" | ThresholdSeverity;
1041
+ /** The published report */
1042
+ report: Report;
1043
+ /** Threshold violations (empty if thresholds not configured) */
1044
+ violations: ThresholdViolation[];
1045
+ }
1046
+ /** Result of a sink publish attempt — discriminated by `status`: "failed" (with error), "skipped" (with reason), or "success" (optional detail) */
1047
+ export type SinkResult = {
1048
+ status: "failed";
1049
+ error: string;
1050
+ } | {
1051
+ status: "skipped";
1052
+ reason: string;
1053
+ } | {
1054
+ status: "success";
1055
+ detail?: string;
1056
+ };
1057
+ /** Result of evaluating all thresholds against a score summary */
1058
+ export interface ThresholdEvaluation {
1059
+ /** Maximum severity across all violations ("none" if no violations); same members as `"none" | ThresholdSeverity` */
1060
+ maxSeverity: "critical" | "info" | "none" | "warning";
1061
+ /** Whether all thresholds pass (no violations) */
1062
+ pass: boolean;
1063
+ /** All violations, sorted by severity (critical first) then by delta */
1064
+ violations: ThresholdViolation[];
1065
+ }
1066
+ /** Severity level for threshold violations: "critical", "info", or "warning" */
1067
+ export type ThresholdSeverity = "critical" | "info" | "warning";
1068
+ /** A single threshold violation — one dimension or metric exceeding its limit */
1069
+ export interface ThresholdViolation {
1070
+ /** The actual measured value */
1071
+ actual: number;
1072
+ /** The area (or "overall" for global thresholds) */
1073
+ area: string;
1074
+ /** Human-readable description of what was violated and why */
1075
+ description: string;
1076
+ /** The dimension or metric that violated (e.g., "composite", "task-completion", "doc-lift") */
1077
+ metric: string;
1078
+ /** Severity classification based on severity config rules */
1079
+ severity: ThresholdSeverity;
1080
+ /** The threshold value that was exceeded */
1081
+ threshold: number;
1082
+ }