@sanity/ailf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (530)
  1. package/README.md +89 -0
  2. package/bin/ailf.js +64 -0
  3. package/canonical/grader-references/README.md +88 -0
  4. package/canonical/grader-references/groq.yaml +234 -0
  5. package/canonical/grader-references/studio-setup.yaml +275 -0
  6. package/canonical/reference-solutions/.gitkeep +1 -0
  7. package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
  8. package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
  9. package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
  10. package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
  11. package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
  12. package/canonical/reference-solutions/groq/joins-references.ts +300 -0
  13. package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
  14. package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
  15. package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
  16. package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
  17. package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
  18. package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
  19. package/config/bigquery/README.md +74 -0
  20. package/config/bigquery/views/area_scores.sql +87 -0
  21. package/config/bigquery/views/reports.sql +49 -0
  22. package/config/features.yaml +116 -0
  23. package/config/models.yaml +115 -0
  24. package/config/prompts.yaml +75 -0
  25. package/config/rubrics.yaml +62 -0
  26. package/config/schedules.yaml +43 -0
  27. package/config/sinks.yaml +54 -0
  28. package/config/sources.yaml +51 -0
  29. package/config/thresholds.yaml +49 -0
  30. package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
  31. package/dist/_vendor/ailf-core/examples/index.js +285 -0
  32. package/dist/_vendor/ailf-core/index.d.ts +17 -0
  33. package/dist/_vendor/ailf-core/index.js +17 -0
  34. package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
  35. package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
  36. package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
  37. package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
  38. package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
  39. package/dist/_vendor/ailf-core/ports/context.js +14 -0
  40. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
  41. package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
  42. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
  43. package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
  44. package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
  45. package/dist/_vendor/ailf-core/ports/index.js +7 -0
  46. package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
  47. package/dist/_vendor/ailf-core/ports/logger.js +11 -0
  48. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
  49. package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
  50. package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
  51. package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
  52. package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
  53. package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
  54. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
  55. package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
  56. package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
  57. package/dist/_vendor/ailf-core/schemas/index.js +16 -0
  58. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
  59. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
  60. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
  61. package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
  62. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
  63. package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
  64. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
  65. package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
  66. package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
  67. package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
  68. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
  69. package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
  70. package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
  71. package/dist/_vendor/ailf-core/services/index.js +12 -0
  72. package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
  73. package/dist/_vendor/ailf-core/services/scoring.js +222 -0
  74. package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
  75. package/dist/_vendor/ailf-core/types/index.js +21 -0
  76. package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
  77. package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
  78. package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
  79. package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
  80. package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
  81. package/dist/_vendor/ailf-shared/document-ref.js +1 -0
  82. package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
  83. package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
  84. package/dist/_vendor/ailf-shared/index.d.ts +16 -0
  85. package/dist/_vendor/ailf-shared/index.js +16 -0
  86. package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
  87. package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
  88. package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
  89. package/dist/_vendor/ailf-shared/score-grades.js +23 -0
  90. package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
  91. package/dist/adapters/cache/content-lake-cache.js +59 -0
  92. package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
  93. package/dist/adapters/cache/filesystem-cache.js +54 -0
  94. package/dist/adapters/cache/index.d.ts +2 -0
  95. package/dist/adapters/cache/index.js +2 -0
  96. package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
  97. package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
  98. package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
  99. package/dist/adapters/config-sources/file-config-adapter.js +96 -0
  100. package/dist/adapters/config-sources/index.d.ts +2 -0
  101. package/dist/adapters/config-sources/index.js +2 -0
  102. package/dist/adapters/doc-fetchers/index.d.ts +1 -0
  103. package/dist/adapters/doc-fetchers/index.js +1 -0
  104. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
  105. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
  106. package/dist/adapters/eval-runners/index.d.ts +1 -0
  107. package/dist/adapters/eval-runners/index.js +1 -0
  108. package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
  109. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
  110. package/dist/adapters/index.d.ts +12 -0
  111. package/dist/adapters/index.js +12 -0
  112. package/dist/adapters/loggers/console-logger.d.ts +22 -0
  113. package/dist/adapters/loggers/console-logger.js +54 -0
  114. package/dist/adapters/loggers/index.d.ts +9 -0
  115. package/dist/adapters/loggers/index.js +9 -0
  116. package/dist/adapters/loggers/json-logger.d.ts +18 -0
  117. package/dist/adapters/loggers/json-logger.js +33 -0
  118. package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
  119. package/dist/adapters/loggers/quiet-logger.js +30 -0
  120. package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
  121. package/dist/adapters/task-sources/composite-task-source.js +59 -0
  122. package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
  123. package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
  124. package/dist/adapters/task-sources/index.d.ts +7 -0
  125. package/dist/adapters/task-sources/index.js +7 -0
  126. package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
  127. package/dist/adapters/task-sources/repo-schemas.js +234 -0
  128. package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
  129. package/dist/adapters/task-sources/repo-task-source.js +104 -0
  130. package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
  131. package/dist/adapters/task-sources/repo-trigger.js +153 -0
  132. package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
  133. package/dist/adapters/task-sources/repo-validation.js +164 -0
  134. package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
  135. package/dist/adapters/task-sources/yaml-task-source.js +136 -0
  136. package/dist/agent-observer/agentic-provider.d.ts +132 -0
  137. package/dist/agent-observer/agentic-provider.js +983 -0
  138. package/dist/agent-observer/classifier.d.ts +62 -0
  139. package/dist/agent-observer/classifier.js +269 -0
  140. package/dist/agent-observer/index.d.ts +7 -0
  141. package/dist/agent-observer/index.js +4 -0
  142. package/dist/agent-observer/pricing.d.ts +35 -0
  143. package/dist/agent-observer/pricing.js +82 -0
  144. package/dist/agent-observer/provider.d.ts +77 -0
  145. package/dist/agent-observer/provider.js +151 -0
  146. package/dist/agent-observer/proxy.d.ts +91 -0
  147. package/dist/agent-observer/proxy.js +321 -0
  148. package/dist/agent-observer/test-imports.d.ts +7 -0
  149. package/dist/agent-observer/test-imports.js +185 -0
  150. package/dist/agent-observer/types.d.ts +137 -0
  151. package/dist/agent-observer/types.js +16 -0
  152. package/dist/assertions/source-isolation.d.ts +72 -0
  153. package/dist/assertions/source-isolation.js +117 -0
  154. package/dist/cli.d.ts +24 -0
  155. package/dist/cli.js +199 -0
  156. package/dist/commands/agent-report.d.ts +5 -0
  157. package/dist/commands/agent-report.js +69 -0
  158. package/dist/commands/baseline.d.ts +9 -0
  159. package/dist/commands/baseline.js +141 -0
  160. package/dist/commands/cache.d.ts +13 -0
  161. package/dist/commands/cache.js +135 -0
  162. package/dist/commands/calculate-scores.d.ts +8 -0
  163. package/dist/commands/calculate-scores.js +48 -0
  164. package/dist/commands/compare.d.ts +8 -0
  165. package/dist/commands/compare.js +120 -0
  166. package/dist/commands/completion.d.ts +18 -0
  167. package/dist/commands/completion.js +260 -0
  168. package/dist/commands/coverage-audit.d.ts +7 -0
  169. package/dist/commands/coverage-audit.js +40 -0
  170. package/dist/commands/discovery-report.d.ts +10 -0
  171. package/dist/commands/discovery-report.js +44 -0
  172. package/dist/commands/eval.d.ts +9 -0
  173. package/dist/commands/eval.js +35 -0
  174. package/dist/commands/explain-handler.d.ts +34 -0
  175. package/dist/commands/explain-handler.js +719 -0
  176. package/dist/commands/fetch-docs.d.ts +8 -0
  177. package/dist/commands/fetch-docs.js +128 -0
  178. package/dist/commands/generate-configs.d.ts +8 -0
  179. package/dist/commands/generate-configs.js +46 -0
  180. package/dist/commands/grader/index.d.ts +11 -0
  181. package/dist/commands/grader/index.js +118 -0
  182. package/dist/commands/init.d.ts +19 -0
  183. package/dist/commands/init.js +150 -0
  184. package/dist/commands/interactive.d.ts +12 -0
  185. package/dist/commands/interactive.js +238 -0
  186. package/dist/commands/lookup-doc.d.ts +15 -0
  187. package/dist/commands/lookup-doc.js +84 -0
  188. package/dist/commands/measure-retrieval.d.ts +5 -0
  189. package/dist/commands/measure-retrieval.js +65 -0
  190. package/dist/commands/pipeline-action.d.ts +71 -0
  191. package/dist/commands/pipeline-action.js +305 -0
  192. package/dist/commands/pipeline.d.ts +62 -0
  193. package/dist/commands/pipeline.js +53 -0
  194. package/dist/commands/pr-comment.d.ts +8 -0
  195. package/dist/commands/pr-comment.js +47 -0
  196. package/dist/commands/publish.d.ts +26 -0
  197. package/dist/commands/publish.js +253 -0
  198. package/dist/commands/readiness-report.d.ts +10 -0
  199. package/dist/commands/readiness-report.js +104 -0
  200. package/dist/commands/shared/options.d.ts +29 -0
  201. package/dist/commands/shared/options.js +57 -0
  202. package/dist/commands/update-quality-scores.d.ts +5 -0
  203. package/dist/commands/update-quality-scores.js +20 -0
  204. package/dist/commands/validate-tasks.d.ts +16 -0
  205. package/dist/commands/validate-tasks.js +93 -0
  206. package/dist/commands/validate.d.ts +9 -0
  207. package/dist/commands/validate.js +73 -0
  208. package/dist/commands/webhook-server.d.ts +5 -0
  209. package/dist/commands/webhook-server.js +30 -0
  210. package/dist/commands/weekly-digest.d.ts +10 -0
  211. package/dist/commands/weekly-digest.js +104 -0
  212. package/dist/composition-root.d.ts +26 -0
  213. package/dist/composition-root.js +107 -0
  214. package/dist/interpolate.d.ts +26 -0
  215. package/dist/interpolate.js +70 -0
  216. package/dist/job-store.d.ts +104 -0
  217. package/dist/job-store.js +188 -0
  218. package/dist/lib/agent-behavior-report.d.ts +8 -0
  219. package/dist/lib/agent-behavior-report.js +185 -0
  220. package/dist/lib/baseline.d.ts +19 -0
  221. package/dist/lib/baseline.js +153 -0
  222. package/dist/lib/calculate-scores.d.ts +23 -0
  223. package/dist/lib/calculate-scores.js +42 -0
  224. package/dist/lib/compare.d.ts +18 -0
  225. package/dist/lib/compare.js +170 -0
  226. package/dist/lib/coverage-audit.d.ts +4 -0
  227. package/dist/lib/coverage-audit.js +42 -0
  228. package/dist/lib/discovery-report.d.ts +13 -0
  229. package/dist/lib/discovery-report.js +57 -0
  230. package/dist/lib/fetch-docs.d.ts +30 -0
  231. package/dist/lib/fetch-docs.js +171 -0
  232. package/dist/lib/generate-configs.d.ts +25 -0
  233. package/dist/lib/generate-configs.js +42 -0
  234. package/dist/lib/grader-api.d.ts +21 -0
  235. package/dist/lib/grader-api.js +34 -0
  236. package/dist/lib/grader-compare.d.ts +19 -0
  237. package/dist/lib/grader-compare.js +91 -0
  238. package/dist/lib/grader-consistency.d.ts +27 -0
  239. package/dist/lib/grader-consistency.js +79 -0
  240. package/dist/lib/grader-sensitivity.d.ts +19 -0
  241. package/dist/lib/grader-sensitivity.js +75 -0
  242. package/dist/lib/grader-validate.d.ts +19 -0
  243. package/dist/lib/grader-validate.js +78 -0
  244. package/dist/lib/measure-retrieval.d.ts +14 -0
  245. package/dist/lib/measure-retrieval.js +71 -0
  246. package/dist/lib/pr-comment.d.ts +16 -0
  247. package/dist/lib/pr-comment.js +28 -0
  248. package/dist/lib/readiness-report.d.ts +13 -0
  249. package/dist/lib/readiness-report.js +108 -0
  250. package/dist/lib/webhook-server.d.ts +11 -0
  251. package/dist/lib/webhook-server.js +24 -0
  252. package/dist/lib/weekly-digest.d.ts +24 -0
  253. package/dist/lib/weekly-digest.js +148 -0
  254. package/dist/orchestration/build-app-context.d.ts +27 -0
  255. package/dist/orchestration/build-app-context.js +81 -0
  256. package/dist/orchestration/build-step-sequence.d.ts +15 -0
  257. package/dist/orchestration/build-step-sequence.js +84 -0
  258. package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
  259. package/dist/orchestration/config-to-source-overrides.js +28 -0
  260. package/dist/orchestration/env-bridge.d.ts +21 -0
  261. package/dist/orchestration/env-bridge.js +66 -0
  262. package/dist/orchestration/index.d.ts +11 -0
  263. package/dist/orchestration/index.js +11 -0
  264. package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
  265. package/dist/orchestration/pipeline-orchestrator.js +153 -0
  266. package/dist/orchestration/step-runner.d.ts +20 -0
  267. package/dist/orchestration/step-runner.js +88 -0
  268. package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
  269. package/dist/orchestration/steps/calculate-scores-step.js +95 -0
  270. package/dist/orchestration/steps/callback-step.d.ts +24 -0
  271. package/dist/orchestration/steps/callback-step.js +76 -0
  272. package/dist/orchestration/steps/compare-step.d.ts +14 -0
  273. package/dist/orchestration/steps/compare-step.js +92 -0
  274. package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
  275. package/dist/orchestration/steps/discovery-report-step.js +55 -0
  276. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  277. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  278. package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
  279. package/dist/orchestration/steps/fetch-docs-step.js +135 -0
  280. package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
  281. package/dist/orchestration/steps/gap-analysis-step.js +136 -0
  282. package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
  283. package/dist/orchestration/steps/generate-configs-step.js +85 -0
  284. package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
  285. package/dist/orchestration/steps/grader-consistency-step.js +64 -0
  286. package/dist/orchestration/steps/index.d.ts +19 -0
  287. package/dist/orchestration/steps/index.js +19 -0
  288. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
  289. package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
  290. package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
  291. package/dist/orchestration/steps/publish-report-step.js +216 -0
  292. package/dist/orchestration/steps/readiness-step.d.ts +13 -0
  293. package/dist/orchestration/steps/readiness-step.js +91 -0
  294. package/dist/orchestration/steps/report-step.d.ts +12 -0
  295. package/dist/orchestration/steps/report-step.js +49 -0
  296. package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
  297. package/dist/orchestration/steps/run-eval-step.js +195 -0
  298. package/dist/orchestration/steps/validate-step.d.ts +12 -0
  299. package/dist/orchestration/steps/validate-step.js +41 -0
  300. package/dist/pipeline/agent-behavior-report.d.ts +53 -0
  301. package/dist/pipeline/agent-behavior-report.js +132 -0
  302. package/dist/pipeline/attribution.d.ts +47 -0
  303. package/dist/pipeline/attribution.js +226 -0
  304. package/dist/pipeline/baseline.d.ts +37 -0
  305. package/dist/pipeline/baseline.js +141 -0
  306. package/dist/pipeline/cache.d.ts +101 -0
  307. package/dist/pipeline/cache.js +283 -0
  308. package/dist/pipeline/calculate-scores.d.ts +102 -0
  309. package/dist/pipeline/calculate-scores.js +1128 -0
  310. package/dist/pipeline/callback-delivery.d.ts +50 -0
  311. package/dist/pipeline/callback-delivery.js +89 -0
  312. package/dist/pipeline/checks.d.ts +39 -0
  313. package/dist/pipeline/checks.js +280 -0
  314. package/dist/pipeline/classify-url.d.ts +61 -0
  315. package/dist/pipeline/classify-url.js +93 -0
  316. package/dist/pipeline/compare.d.ts +31 -0
  317. package/dist/pipeline/compare.js +208 -0
  318. package/dist/pipeline/coverage-audit.d.ts +39 -0
  319. package/dist/pipeline/coverage-audit.js +165 -0
  320. package/dist/pipeline/degradations.d.ts +85 -0
  321. package/dist/pipeline/degradations.js +242 -0
  322. package/dist/pipeline/discovery-report.d.ts +55 -0
  323. package/dist/pipeline/discovery-report.js +178 -0
  324. package/dist/pipeline/eval-constants.d.ts +68 -0
  325. package/dist/pipeline/eval-constants.js +111 -0
  326. package/dist/pipeline/eval-fingerprint.d.ts +66 -0
  327. package/dist/pipeline/eval-fingerprint.js +175 -0
  328. package/dist/pipeline/expand-tasks.d.ts +220 -0
  329. package/dist/pipeline/expand-tasks.js +421 -0
  330. package/dist/pipeline/failure-modes.d.ts +46 -0
  331. package/dist/pipeline/failure-modes.js +348 -0
  332. package/dist/pipeline/fetch-url-content.d.ts +44 -0
  333. package/dist/pipeline/fetch-url-content.js +93 -0
  334. package/dist/pipeline/gap-analysis.d.ts +48 -0
  335. package/dist/pipeline/gap-analysis.js +231 -0
  336. package/dist/pipeline/generate-configs.d.ts +72 -0
  337. package/dist/pipeline/generate-configs.js +395 -0
  338. package/dist/pipeline/grader-api.d.ts +49 -0
  339. package/dist/pipeline/grader-api.js +200 -0
  340. package/dist/pipeline/grader-compare-runner.d.ts +44 -0
  341. package/dist/pipeline/grader-compare-runner.js +301 -0
  342. package/dist/pipeline/grader-comparison.d.ts +111 -0
  343. package/dist/pipeline/grader-comparison.js +161 -0
  344. package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
  345. package/dist/pipeline/grader-consistency-runner.js +270 -0
  346. package/dist/pipeline/grader-consistency.d.ts +103 -0
  347. package/dist/pipeline/grader-consistency.js +146 -0
  348. package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
  349. package/dist/pipeline/grader-sensitivity-runner.js +282 -0
  350. package/dist/pipeline/grader-sensitivity.d.ts +94 -0
  351. package/dist/pipeline/grader-sensitivity.js +144 -0
  352. package/dist/pipeline/grader-validate-runner.d.ts +38 -0
  353. package/dist/pipeline/grader-validate-runner.js +229 -0
  354. package/dist/pipeline/grader-validation.d.ts +107 -0
  355. package/dist/pipeline/grader-validation.js +169 -0
  356. package/dist/pipeline/map-request-to-config.d.ts +19 -0
  357. package/dist/pipeline/map-request-to-config.js +80 -0
  358. package/dist/pipeline/measure-retrieval.d.ts +59 -0
  359. package/dist/pipeline/measure-retrieval.js +111 -0
  360. package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
  361. package/dist/pipeline/mirror-repo-tasks.js +350 -0
  362. package/dist/pipeline/plan-format.d.ts +33 -0
  363. package/dist/pipeline/plan-format.js +202 -0
  364. package/dist/pipeline/plan.d.ts +169 -0
  365. package/dist/pipeline/plan.js +708 -0
  366. package/dist/pipeline/pr-comment.d.ts +19 -0
  367. package/dist/pipeline/pr-comment.js +502 -0
  368. package/dist/pipeline/probe.d.ts +52 -0
  369. package/dist/pipeline/probe.js +390 -0
  370. package/dist/pipeline/provenance.d.ts +47 -0
  371. package/dist/pipeline/provenance.js +146 -0
  372. package/dist/pipeline/readiness-report.d.ts +87 -0
  373. package/dist/pipeline/readiness-report.js +205 -0
  374. package/dist/pipeline/release-classification.d.ts +54 -0
  375. package/dist/pipeline/release-classification.js +238 -0
  376. package/dist/pipeline/release-report.d.ts +37 -0
  377. package/dist/pipeline/release-report.js +222 -0
  378. package/dist/pipeline/repo-eval-comment.d.ts +37 -0
  379. package/dist/pipeline/repo-eval-comment.js +165 -0
  380. package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
  381. package/dist/pipeline/repo-threshold-evaluator.js +162 -0
  382. package/dist/pipeline/resolve-mappings.d.ts +35 -0
  383. package/dist/pipeline/resolve-mappings.js +72 -0
  384. package/dist/pipeline/retrieval-metrics.d.ts +39 -0
  385. package/dist/pipeline/retrieval-metrics.js +136 -0
  386. package/dist/pipeline/reverse-mapping.d.ts +67 -0
  387. package/dist/pipeline/reverse-mapping.js +88 -0
  388. package/dist/pipeline/schemas.d.ts +9 -0
  389. package/dist/pipeline/schemas.js +9 -0
  390. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  391. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  392. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  393. package/dist/pipeline/steps/compare-step.js +90 -0
  394. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  395. package/dist/pipeline/steps/eval-step.js +347 -0
  396. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  397. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  398. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  399. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  400. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  401. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  402. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  403. package/dist/pipeline/steps/publish-report-step.js +243 -0
  404. package/dist/pipeline/steps/report-step.d.ts +13 -0
  405. package/dist/pipeline/steps/report-step.js +56 -0
  406. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  407. package/dist/pipeline/steps/update-scores-step.js +42 -0
  408. package/dist/pipeline/targeted-loo.d.ts +88 -0
  409. package/dist/pipeline/targeted-loo.js +203 -0
  410. package/dist/pipeline/thresholds.d.ts +27 -0
  411. package/dist/pipeline/thresholds.js +245 -0
  412. package/dist/pipeline/types.d.ts +10 -0
  413. package/dist/pipeline/types.js +10 -0
  414. package/dist/pipeline/validate.d.ts +67 -0
  415. package/dist/pipeline/validate.js +406 -0
  416. package/dist/pipeline/webhook-server.d.ts +37 -0
  417. package/dist/pipeline/webhook-server.js +133 -0
  418. package/dist/report-store.d.ts +84 -0
  419. package/dist/report-store.js +208 -0
  420. package/dist/sanity/client.d.ts +38 -0
  421. package/dist/sanity/client.js +86 -0
  422. package/dist/sanity/portable-text.d.ts +11 -0
  423. package/dist/sanity/portable-text.js +211 -0
  424. package/dist/sanity/queries.d.ts +133 -0
  425. package/dist/sanity/queries.js +300 -0
  426. package/dist/schedules/digest.d.ts +116 -0
  427. package/dist/schedules/digest.js +156 -0
  428. package/dist/schedules/index.d.ts +12 -0
  429. package/dist/schedules/index.js +10 -0
  430. package/dist/schedules/loader.d.ts +31 -0
  431. package/dist/schedules/loader.js +73 -0
  432. package/dist/schedules/schema.d.ts +9 -0
  433. package/dist/schedules/schema.js +9 -0
  434. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  435. package/dist/scripts/agent-behavior-report.js +315 -0
  436. package/dist/scripts/baseline.d.ts +43 -0
  437. package/dist/scripts/baseline.js +267 -0
  438. package/dist/scripts/calculate-scores.d.ts +166 -0
  439. package/dist/scripts/calculate-scores.js +1296 -0
  440. package/dist/scripts/compare.d.ts +22 -0
  441. package/dist/scripts/compare.js +334 -0
  442. package/dist/scripts/coverage-audit.d.ts +44 -0
  443. package/dist/scripts/coverage-audit.js +209 -0
  444. package/dist/scripts/debug-eval.d.ts +19 -0
  445. package/dist/scripts/debug-eval.js +73 -0
  446. package/dist/scripts/discovery-report.d.ts +58 -0
  447. package/dist/scripts/discovery-report.js +250 -0
  448. package/dist/scripts/fetch-docs.d.ts +35 -0
  449. package/dist/scripts/fetch-docs.js +472 -0
  450. package/dist/scripts/generate-configs.d.ts +66 -0
  451. package/dist/scripts/generate-configs.js +459 -0
  452. package/dist/scripts/grader-api.d.ts +27 -0
  453. package/dist/scripts/grader-api.js +206 -0
  454. package/dist/scripts/grader-compare.d.ts +22 -0
  455. package/dist/scripts/grader-compare.js +368 -0
  456. package/dist/scripts/grader-consistency.d.ts +20 -0
  457. package/dist/scripts/grader-consistency.js +313 -0
  458. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  459. package/dist/scripts/grader-sensitivity.js +354 -0
  460. package/dist/scripts/grader-validate.d.ts +19 -0
  461. package/dist/scripts/grader-validate.js +267 -0
  462. package/dist/scripts/measure-retrieval.d.ts +10 -0
  463. package/dist/scripts/measure-retrieval.js +145 -0
  464. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
  465. package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
  466. package/dist/scripts/pipeline.d.ts +76 -0
  467. package/dist/scripts/pipeline.js +1031 -0
  468. package/dist/scripts/pr-comment.d.ts +10 -0
  469. package/dist/scripts/pr-comment.js +510 -0
  470. package/dist/scripts/readiness-report.d.ts +88 -0
  471. package/dist/scripts/readiness-report.js +342 -0
  472. package/dist/scripts/update-quality-scores.d.ts +15 -0
  473. package/dist/scripts/update-quality-scores.js +184 -0
  474. package/dist/scripts/validate-task-sources.d.ts +21 -0
  475. package/dist/scripts/validate-task-sources.js +210 -0
  476. package/dist/scripts/validate.d.ts +13 -0
  477. package/dist/scripts/validate.js +79 -0
  478. package/dist/scripts/webhook-server.d.ts +26 -0
  479. package/dist/scripts/webhook-server.js +147 -0
  480. package/dist/scripts/weekly-digest.d.ts +24 -0
  481. package/dist/scripts/weekly-digest.js +144 -0
  482. package/dist/sinks/bigquery/index.d.ts +131 -0
  483. package/dist/sinks/bigquery/index.js +222 -0
  484. package/dist/sinks/format-slack.d.ts +64 -0
  485. package/dist/sinks/format-slack.js +306 -0
  486. package/dist/sinks/index.d.ts +23 -0
  487. package/dist/sinks/index.js +18 -0
  488. package/dist/sinks/loader.d.ts +18 -0
  489. package/dist/sinks/loader.js +82 -0
  490. package/dist/sinks/retry.d.ts +24 -0
  491. package/dist/sinks/retry.js +52 -0
  492. package/dist/sinks/schema.d.ts +9 -0
  493. package/dist/sinks/schema.js +9 -0
  494. package/dist/sinks/slack/format.d.ts +65 -0
  495. package/dist/sinks/slack/format.js +327 -0
  496. package/dist/sinks/slack/index.d.ts +27 -0
  497. package/dist/sinks/slack/index.js +78 -0
  498. package/dist/sinks/slack-sink.d.ts +27 -0
  499. package/dist/sinks/slack-sink.js +78 -0
  500. package/dist/sinks/types.d.ts +59 -0
  501. package/dist/sinks/types.js +44 -0
  502. package/dist/sinks/webhook/index.d.ts +19 -0
  503. package/dist/sinks/webhook/index.js +50 -0
  504. package/dist/sinks/webhook-sink.d.ts +19 -0
  505. package/dist/sinks/webhook-sink.js +50 -0
  506. package/dist/sources.d.ts +104 -0
  507. package/dist/sources.js +292 -0
  508. package/dist/webhook/budget.d.ts +42 -0
  509. package/dist/webhook/budget.js +60 -0
  510. package/dist/webhook/debounce.d.ts +67 -0
  511. package/dist/webhook/debounce.js +76 -0
  512. package/dist/webhook/dispatch.d.ts +45 -0
  513. package/dist/webhook/dispatch.js +84 -0
  514. package/dist/webhook/eval-request-handler.d.ts +87 -0
  515. package/dist/webhook/eval-request-handler.js +181 -0
  516. package/dist/webhook/handler.d.ts +88 -0
  517. package/dist/webhook/handler.js +203 -0
  518. package/dist/webhook/index.d.ts +17 -0
  519. package/dist/webhook/index.js +12 -0
  520. package/dist/webhook/types.d.ts +109 -0
  521. package/dist/webhook/types.js +10 -0
  522. package/package.json +72 -0
  523. package/tasks/.expanded.agentic.yaml +51 -0
  524. package/tasks/.expanded.yaml +66 -0
  525. package/tasks/frameworks.yaml +98 -0
  526. package/tasks/functions.yaml +51 -0
  527. package/tasks/groq.yaml +216 -0
  528. package/tasks/nextjs-live.yaml +62 -0
  529. package/tasks/studio-setup.yaml +111 -0
  530. package/tasks/visual-editing.yaml +120 -0
@@ -0,0 +1,301 @@
1
+ /**
2
+ * pipeline/grader-compare-runner.ts
3
+ *
4
+ * Orchestration module for inter-grader comparison (Phase 3).
5
+ *
6
+ * Reads eval results, extracts grading judgments, re-grades each with
7
+ * candidate models, and calls `compareGraders()` from the pure
8
+ * computation module.
9
+ *
10
+ * Migrated from lib/grader-compare.ts — no process.argv, no process.exit(),
11
+ * no module-level constants. Accepts rootDir as parameter.
12
+ *
13
+ * @see docs/exec-plans/completed/grader-reliability.md — Phase 3
14
+ */
15
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
16
+ import { join } from "path";
17
+ import { load } from "js-yaml";
18
+ import { compareGraders, } from "./grader-comparison.js";
19
+ import { classifyCorrelation } from "./grader-validation.js";
20
+ import { gradeOnce } from "./grader-api.js";
21
+ // ---------------------------------------------------------------------------
22
+ // Internal helpers
23
+ // ---------------------------------------------------------------------------
24
/**
 * Map a grading component result to one of the three scoring dimensions.
 *
 * An explicit `assertion.metadata.dimension` tag takes precedence. When a tag
 * is present but not one of the known values, the component is treated as
 * unclassifiable and the rubric text is NOT consulted. Without a tag, the
 * rubric text (`assertion.value`) is matched case-insensitively against a few
 * known phrases.
 *
 * @param component A component result carrying an optional `assertion`
 * @returns "taskCompletion", "codeCorrectness", "docCoverage", or `null` when
 *   the component cannot be classified
 */
function classifyDimension(component) {
  const assertion = component?.assertion;
  const tagged = assertion?.metadata?.dimension;
  if (tagged) {
    switch (tagged) {
      case "code-correctness":
        return "codeCorrectness";
      case "doc-coverage":
        return "docCoverage";
      case "task-completion":
        return "taskCompletion";
      default:
        return null;
    }
  }
  // The rubric value is not guaranteed to be a string; calling toLowerCase()
  // on anything else would throw, so such components are unclassifiable.
  const rawValue = assertion?.value;
  if (typeof rawValue !== "string") {
    return null;
  }
  const value = rawValue.toLowerCase();
  if (value.includes("task completion")) {
    return "taskCompletion";
  }
  if (value.includes("code correctness")) {
    return "codeCorrectness";
  }
  if (value.includes("documentation coverage") || value.includes("hallucinate")) {
    return "docCoverage";
  }
  return null;
}
47
/**
 * Guess the feature area of a test from its description.
 *
 * Matching is case-insensitive. Rules are checked from top to bottom and the
 * first one that matches wins, so earlier areas shadow later ones. Note that
 * "groq" only matches at the very start of the description, while every other
 * keyword may appear anywhere.
 *
 * @param description Test description text
 * @returns The feature-area slug, or "other" when no rule matches
 */
function detectFeatureArea(description) {
  const text = description.toLowerCase();
  const rules = [
    { area: "studio-setup", matched: text.includes("studio") },
    {
      area: "visual-editing",
      matched: ["visual", "presentation", "live preview"].some((kw) => text.includes(kw)),
    },
    {
      area: "functions",
      matched: ["function", "webhook"].some((kw) => text.includes(kw)),
    },
    { area: "groq", matched: text.startsWith("groq") },
    {
      area: "nextjs-live",
      matched: ["next", "app router"].some((kw) => text.includes(kw)),
    },
    {
      area: "frameworks",
      matched: ["remix", "nuxt", "svelte"].some((kw) => text.includes(kw)),
    },
  ];
  const firstHit = rules.find((rule) => rule.matched);
  return firstHit ? firstHit.area : "other";
}
67
/**
 * Extract llm-rubric judgments from eval results.
 *
 * Only "gold" (with-docs) tests are included, i.e. result rows whose
 * `vars.docs` is a non-blank string. Within those rows, one judgment is
 * produced per `llm-rubric` assertion that has a non-empty string rubric and
 * can be classified into a scoring dimension.
 *
 * A file that does not have the expected `results.results` array, or a row
 * without a `componentResults` array, contributes no judgments instead of
 * raising a TypeError.
 *
 * @param file Parsed eval-results JSON
 * @returns Array of `{ area, description, dimension, responseText, rubricText }`
 */
function extractJudgments(file) {
  const judgments = [];
  const rows = file?.results?.results;
  if (!Array.isArray(rows)) {
    return judgments;
  }
  for (const result of rows) {
    if (!result?.gradingResult) {
      continue;
    }
    const description = result.testCase?.description ?? "unknown";
    // Gold tests only: docs must be a string with visible content.
    const docs = result.vars?.docs;
    const hasDocs = typeof docs === "string" && docs.trim().length > 0;
    if (!hasDocs) {
      continue;
    }
    const components = result.gradingResult.componentResults;
    if (!Array.isArray(components)) {
      continue;
    }
    const area = detectFeatureArea(description);
    for (const comp of components) {
      if (comp?.assertion?.type !== "llm-rubric") {
        continue;
      }
      // Validate the rubric text before classifying: classification reads the
      // same value and must never see a non-string rubric.
      const rubricText = typeof comp.assertion.value === "string" ? comp.assertion.value : "";
      if (!rubricText) {
        continue;
      }
      const dimension = classifyDimension(comp);
      if (!dimension) {
        continue;
      }
      judgments.push({
        area,
        description,
        dimension,
        responseText: result.response?.output ?? "",
        rubricText,
      });
    }
  }
  return judgments;
}
101
/**
 * Resolve the baseline grader and the candidate graders to compare it with.
 *
 * The baseline is read from the `grader` entry of
 * `<rootDir>/config/models.yaml`, with built-in defaults for a missing id or
 * label. Candidates are `candidateOverrides` when that list is non-empty;
 * otherwise they come from the file's `grader-candidates` entry, with the
 * label defaulting to the part of the id after the last ":".
 *
 * @param rootDir Root directory that contains `config/models.yaml`
 * @param candidateOverrides Optional candidates that replace the configured ones
 * @returns `{ baseline, candidates }`
 * @throws Error if config/models.yaml does not exist
 */
function loadConfig(rootDir, candidateOverrides) {
  const configFile = join(rootDir, "config", "models.yaml");
  if (!existsSync(configFile)) {
    throw new Error(`config/models.yaml not found at ${configFile}`);
  }
  const parsed = load(readFileSync(configFile, "utf-8"));
  const graderEntry = parsed?.grader;
  const baseline = {
    id: graderEntry?.id ?? "openai:gpt-5",
    label: graderEntry?.label ?? "GPT-5 (grader)",
  };
  // Explicit overrides win over whatever the config file lists.
  if (candidateOverrides && candidateOverrides.length > 0) {
    return { baseline, candidates: candidateOverrides };
  }
  const listed = parsed?.["grader-candidates"] ?? [];
  const candidates = listed.map((entry) => ({
    id: entry.id,
    label: entry.label ?? entry.id.split(":").pop() ?? entry.id,
  }));
  return { baseline, candidates };
}
129
+ // ---------------------------------------------------------------------------
130
+ // Report formatting (pure)
131
+ // ---------------------------------------------------------------------------
132
/**
 * Render a grader comparison result as a plain-text report.
 *
 * The report has a header (baseline, candidates, timestamp), one section per
 * pairwise comparison with overall metrics and a per-dimension table, and a
 * closing list of recommendations. Nothing is printed; the caller decides
 * what to do with the returned text.
 *
 * @param result Comparison result to render
 * @returns The report as a single newline-joined string
 */
export function formatComparisonReport(result) {
  const heavyRule = "=".repeat(80);
  const lightRule = "-".repeat(80);
  const out = [
    heavyRule,
    " GRADER COMPARISON REPORT",
    heavyRule,
    "",
    ` Baseline: ${result.baselineGrader}`,
    ` Candidates: ${result.candidateGraders.join(", ")}`,
    ` Generated: ${result.generatedAt}`,
    "",
  ];
  for (const pair of result.pairwise) {
    // Describe the direction of the bias from the candidate's point of view.
    let biasSign = "";
    let biasNote = "no bias";
    if (pair.bias > 0) {
      biasSign = "+";
      biasNote = "candidate scores higher";
    }
    else if (pair.bias < 0) {
      biasNote = "candidate scores lower";
    }
    out.push(lightRule);
    out.push(`${pair.graderA} vs ${pair.graderB}`);
    out.push(lightRule);
    out.push("");
    out.push(` Correlation: r=${pair.correlation} (${classifyCorrelation(pair.correlation)})`);
    out.push(` Bias: ${biasSign}${pair.bias} (${biasNote})`);
    out.push(` Mean |Δ|: ${pair.meanAbsDiff} points`);
    out.push("");
    // Per-dimension table: header, separator, then one row per dimension.
    out.push("| Dimension | Correlation | Bias | Mean |Δ| | Count |");
    out.push("|------------------|-------------|--------|---------|-------|");
    const rows = [
      { label: "Task Completion", stats: pair.perDimension.taskCompletion },
      { label: "Code Correctness", stats: pair.perDimension.codeCorrectness },
      { label: "Doc Coverage", stats: pair.perDimension.docCoverage },
    ];
    for (const row of rows) {
      const stats = row.stats;
      const signedBias = stats.bias > 0 ? `+${stats.bias}` : `${stats.bias}`;
      const cells = [
        row.label.padEnd(16),
        `r=${String(stats.correlation).padStart(9)}`,
        signedBias.padStart(6),
        String(stats.meanAbsDiff).padStart(7),
        String(stats.count).padStart(5),
      ];
      out.push(`| ${cells.join(" | ")} |`);
    }
    out.push("");
  }
  out.push(lightRule, "RECOMMENDATIONS", lightRule, "");
  // Any verdict other than the two listed here gets the "❌" marker.
  const icons = new Map([
    ["comparable", "✅"],
    ["divergent", "⚠️"],
  ]);
  for (const rec of result.recommendations) {
    const icon = icons.get(rec.recommendation) ?? "❌";
    out.push(` ${icon} ${rec.modelId}: ${rec.recommendation}`);
    out.push(` ${rec.reason}`);
  }
  out.push("");
  return out.join("\n");
}
188
+ // ---------------------------------------------------------------------------
189
+ // Main runner
190
+ // ---------------------------------------------------------------------------
191
/**
 * Run an inter-grader comparison end to end.
 *
 * Steps, in order: resolve the baseline and candidate graders, load the eval
 * results file, extract the gradable judgments, grade every judgment with the
 * baseline and then with each candidate (one model at a time), compare the
 * score sets, print the result in the requested format, and write it as JSON
 * to the output path. Progress and a rough cost estimate are logged along the
 * way.
 *
 * `options.resultsPath` is joined onto `rootDir`, whereas `options.outputPath`
 * is used exactly as given. Both fall back to files under
 * `<rootDir>/results/latest`.
 *
 * @param options Runner options: `rootDir`, optional `format` ("table" by
 *   default; any other value prints JSON), optional `resultsPath`,
 *   `outputPath` and `candidates`
 * @returns The comparison result that was also written to disk
 * @throws Error if results file not found, no candidates configured, or no judgments found
 */
export async function runGraderCompare(options) {
  const { rootDir, format = "table" } = options;
  const resultsSegments = options.resultsPath
    ? [options.resultsPath]
    : ["results", "latest", "eval-results.json"];
  const resultsPath = join(rootDir, ...resultsSegments);
  console.log("=== Grader Comparison ===\n");
  // Resolve graders first so a missing configuration fails before any file parsing.
  const { baseline, candidates } = loadConfig(rootDir, options.candidates);
  if (candidates.length === 0) {
    throw new Error("No candidate graders configured. Add grader-candidates to config/models.yaml or pass --candidate.");
  }
  if (!existsSync(resultsPath)) {
    throw new Error(`Results file not found: ${resultsPath}. Run 'pnpm eval' first.`);
  }
  const evalFile = JSON.parse(readFileSync(resultsPath, "utf-8"));
  const judgments = extractJudgments(evalFile);
  const candidateLabels = candidates.map((c) => c.label).join(", ");
  console.log(` Baseline: ${baseline.label} (${baseline.id})`);
  console.log(` Candidates: ${candidateLabels}`);
  console.log(` Judgments: ${judgments.length}`);
  if (judgments.length === 0) {
    throw new Error("No gradable judgments found in results.");
  }
  // Every judgment is graded once per model: the baseline plus each candidate.
  const modelCount = 1 + candidates.length;
  const totalCalls = judgments.length * modelCount;
  const estimatedCost = totalCalls * 0.005;
  console.log(` API calls: ${totalCalls} (${judgments.length} × ${modelCount} models)`);
  console.log(` Est. cost: ~$${estimatedCost.toFixed(2)}`);
  console.log();
  console.log(` Grading with baseline: ${baseline.label}...`);
  const baselineScoreSet = {
    label: baseline.label,
    modelId: baseline.id,
    scores: await gradeJudgments(judgments, baseline.id),
  };
  const candidateScoreSets = [];
  for (const candidate of candidates) {
    console.log(` Grading with candidate: ${candidate.label}...`);
    const scores = await gradeJudgments(judgments, candidate.id);
    candidateScoreSets.push({ label: candidate.label, modelId: candidate.id, scores });
  }
  console.log();
  const result = compareGraders(baselineScoreSet, candidateScoreSets);
  const serialized = JSON.stringify(result, null, 2);
  if (format === "table") {
    console.log(formatComparisonReport(result));
  }
  else {
    console.log(serialized);
  }
  // Persist the JSON result, creating the parent directory when needed.
  const outPath = options.outputPath ??
    join(rootDir, "results", "latest", "grader-comparison.json");
  mkdirSync(join(outPath, ".."), { recursive: true });
  writeFileSync(outPath, serialized);
  console.log(`\n 📄 Results written to ${outPath}`);
  return result;
}
267
+ // ---------------------------------------------------------------------------
268
+ // Grading helper
269
+ // ---------------------------------------------------------------------------
270
/**
 * Grade every judgment with one grader model, one call at a time.
 *
 * A progress line is rewritten in place after every tenth judgment and after
 * the last one. Judgments for which the grading call yields `null` are
 * counted as failures and left out of the result, so the returned array can
 * be shorter than the input.
 *
 * @param judgments Judgments to grade
 * @param graderModel Model ID passed to the grading call
 * @returns One `{ area, dimension, score, taskId }` entry per successfully
 *   graded judgment, where `taskId` is the judgment's description
 */
async function gradeJudgments(judgments, graderModel) {
  const total = judgments.length;
  const graded = [];
  let failures = 0;
  for (const [index, item] of judgments.entries()) {
    const score = await gradeOnce(graderModel, item.responseText, item.rubricText);
    const done = index + 1;
    const atCheckpoint = done % 10 === 0 || done === total;
    if (atCheckpoint) {
      const pct = Math.round((done / total) * 100);
      process.stdout.write(`\r Progress: ${done}/${total} (${pct}%)`);
    }
    if (score === null) {
      failures += 1;
      continue;
    }
    graded.push({
      area: item.area,
      dimension: item.dimension,
      score,
      taskId: item.description,
    });
  }
  // Terminate the in-place progress line.
  console.log();
  if (failures > 0) {
    console.log(` ⚠ ${failures} grading calls failed (excluded)`);
  }
  return graded;
}
@@ -0,0 +1,111 @@
1
+ /**
2
+ * pipeline/grader-comparison.ts
3
+ *
4
+ * Pure computation module for inter-grader comparison.
5
+ *
6
+ * Takes score data from multiple grader models (each having graded the same
7
+ * set of responses) and produces a comparison matrix with correlations,
8
+ * bias measurements, and per-area deltas between the baseline grader and each candidate.
9
+ *
10
+ * This module has NO side effects — no file I/O, no API calls.
11
+ * It operates on pre-collected data only.
12
+ *
13
+ * @see docs/exec-plans/completed/grader-reliability.md — Phase 3
14
+ */
15
+ import type { ComparisonReport, ScoreSummary } from "./types.js";
16
/**
 * Agreement metrics between two graders, restricted to the judgments of a
 * single scoring dimension.
 */
export interface DimensionPairComparison {
  /** How many paired observations these metrics are based on */
  count: number;
  /** Pearson correlation of the two graders' scores within this dimension */
  correlation: number;
  /** Systematic (signed) score difference within this dimension */
  bias: number;
  /** Mean of the absolute score differences within this dimension */
  meanAbsDiff: number;
}
27
/**
 * Complete result of comparing one or more candidate graders against a
 * single baseline grader.
 */
export interface GraderComparison {
  /** Model ID of the grader every candidate is measured against */
  baselineGrader: string;
  /** Model IDs of the candidate graders */
  candidateGraders: string[];
  /** Timestamp recording when the comparison was produced */
  generatedAt: string;
  /** One baseline-versus-candidate comparison per candidate */
  pairwise: GraderPairComparison[];
  /** One recommendation summary per candidate */
  recommendations: GraderRecommendation[];
  /** Optional ScoreSummary-level reports built with the compare() primitive */
  scoreSummaryComparisons?: ComparisonReport[];
}
42
/**
 * Agreement metrics for one pair of grader models, overall and broken down
 * by scoring dimension.
 */
export interface GraderPairComparison {
  /** Model ID of grader A */
  graderA: string;
  /** Model ID of grader B */
  graderB: string;
  /** Pearson correlation between the scores of the two graders */
  correlation: number;
  /** Mean of (B score - A score); a positive value means B grades higher */
  bias: number;
  /** Mean of the absolute differences between paired scores */
  meanAbsDiff: number;
  /** The same metrics computed separately for each scoring dimension */
  perDimension: {
    taskCompletion: DimensionPairComparison;
    codeCorrectness: DimensionPairComparison;
    docCoverage: DimensionPairComparison;
  };
}
61
/** Verdict on whether a candidate grader agrees with the baseline */
export interface GraderRecommendation {
  /** Model ID of the candidate grader this verdict applies to */
  modelId: string;
  /** The verdict itself, from closest agreement to furthest */
  recommendation: "comparable" | "divergent" | "strongly-divergent";
  /** Brief justification for the verdict */
  reason: string;
}
70
/** A single score given by one grader to one (task, dimension) judgment */
export interface GraderScore {
  /** Identifier of the task that was graded (e.g., "groq-blog-queries") */
  taskId: string;
  /** Feature area the task belongs to (e.g., "groq") */
  area: string;
  /** Scoring dimension the score applies to */
  dimension: "codeCorrectness" | "docCoverage" | "taskCompletion";
  /** The grader's score, on a 0–100 scale */
  score: number;
}
81
/** Everything one grader model scored, together with the model's identity */
export interface GraderScoreSet {
  /** Grader model ID (e.g., "openai:gpt-5-2025-08-07") */
  modelId: string;
  /** Human-readable name of the grader model */
  label: string;
  /** Every score this grader produced */
  scores: GraderScore[];
}
90
/**
 * Add ScoreSummary-level comparison reports to an existing grader comparison.
 *
 * Each candidate summary is compared with the baseline summary through the
 * existing compare() primitive, and the resulting ComparisonReport objects
 * are attached as `scoreSummaryComparisons`.
 *
 * @param comparison Grader comparison to extend (as produced by compareGraders)
 * @param baselineSummary ScoreSummary obtained with the baseline grader
 * @param candidateSummaries ScoreSummaries obtained with each candidate grader
 * @returns The comparison augmented with `scoreSummaryComparisons`
 */
export declare function attachScoreSummaryComparisons(comparison: GraderComparison, baselineSummary: ScoreSummary, candidateSummaries: ScoreSummary[]): GraderComparison;
102
/**
 * Compare each candidate grader with the baseline grader.
 *
 * Main entry point of this module; it works purely on the score sets it is
 * given.
 *
 * @param baseline Scores produced by the baseline grader
 * @param candidates Scores produced by the candidate grader(s)
 * @returns A GraderComparison holding pairwise metrics and recommendations
 */
export declare function compareGraders(baseline: GraderScoreSet, candidates: GraderScoreSet[]): GraderComparison;
@@ -0,0 +1,161 @@
1
+ /**
2
+ * pipeline/grader-comparison.ts
3
+ *
4
+ * Pure computation module for inter-grader comparison.
5
+ *
6
+ * Takes score data from multiple grader models (each having graded the same
7
+ * set of responses) and produces a comparison matrix with correlations,
8
+ * bias measurements, and per-area deltas between the baseline grader and each candidate.
9
+ *
10
+ * This module has NO side effects — no file I/O, no API calls.
11
+ * It operates on pre-collected data only.
12
+ *
13
+ * @see docs/exec-plans/completed/grader-reliability.md — Phase 3
14
+ */
15
+ import { compare } from "./compare.js";
16
+ import { pearsonCorrelation } from "./grader-validation.js";
17
+ // ---------------------------------------------------------------------------
18
+ // Pure computation
19
+ // ---------------------------------------------------------------------------
20
/**
 * Add ScoreSummary-level comparison reports to an existing grader comparison.
 *
 * Every candidate summary is run through compare() against the baseline
 * summary, in input order. The input comparison is not modified; a shallow
 * copy carrying the extra `scoreSummaryComparisons` field is returned.
 *
 * @param comparison Grader comparison to extend (as produced by compareGraders)
 * @param baselineSummary ScoreSummary obtained with the baseline grader
 * @param candidateSummaries ScoreSummaries obtained with each candidate grader
 * @returns A copy of `comparison` augmented with `scoreSummaryComparisons`
 */
export function attachScoreSummaryComparisons(comparison, baselineSummary, candidateSummaries) {
  const againstBaseline = (summary) => compare(baselineSummary, summary);
  const reports = candidateSummaries.map((summary) => againstBaseline(summary));
  return { ...comparison, scoreSummaryComparisons: reports };
}
38
/**
 * Compare each candidate grader with the baseline grader.
 *
 * Main entry point of this module. Each candidate yields one pairwise
 * comparison, and each pairwise comparison yields one recommendation. The
 * result is stamped with the current time as an ISO string.
 *
 * @param baseline Scores produced by the baseline grader
 * @param candidates Scores produced by the candidate grader(s)
 * @returns A GraderComparison holding pairwise metrics and recommendations
 */
export function compareGraders(baseline, candidates) {
  const againstBaseline = (candidate) => comparePair(baseline, candidate);
  const pairwise = candidates.map((candidate) => againstBaseline(candidate));
  const recommendations = pairwise.map((pair) => makeRecommendation(pair));
  const candidateGraders = candidates.map((candidate) => candidate.modelId);
  return {
    baselineGrader: baseline.modelId,
    candidateGraders,
    generatedAt: new Date().toISOString(),
    pairwise,
    recommendations,
  };
}
58
+ // ---------------------------------------------------------------------------
59
+ // Internal helpers
60
+ // ---------------------------------------------------------------------------
61
/**
 * Compare a single pair of graders on their raw judgment scores.
 *
 * Scores are paired by their (taskId, dimension) key. When several scores
 * share a key (for example when the same task description is graded more
 * than once), they are paired in order of occurrence: the first A score with
 * the first B score for that key, the second with the second, and so on.
 * Scores without a partner in the other grader are ignored.
 *
 * @param a Score set of grader A (reported as `graderA`)
 * @param b Score set of grader B (reported as `graderB`)
 * @returns Overall and per-dimension bias, correlation and mean absolute
 *   difference over the paired scores
 */
function comparePair(a, b) {
  // Collect grader B's scores per key, in input order. A single-valued lookup
  // would keep only the last score per key and pair it with every A score
  // sharing that key.
  const bScoresByKey = new Map();
  for (const s of b.scores) {
    const key = `${s.taskId}::${s.dimension}`;
    const bucket = bScoresByKey.get(key);
    if (bucket) {
      bucket.push(s.score);
    }
    else {
      bScoresByKey.set(key, [s.score]);
    }
  }
  // Number of B scores already consumed per key.
  const consumed = new Map();
  const pairedA = [];
  const pairedB = [];
  const dimPairsA = {
    codeCorrectness: [],
    docCoverage: [],
    taskCompletion: [],
  };
  const dimPairsB = {
    codeCorrectness: [],
    docCoverage: [],
    taskCompletion: [],
  };
  for (const sA of a.scores) {
    const key = `${sA.taskId}::${sA.dimension}`;
    const bucket = bScoresByKey.get(key);
    const position = consumed.get(key) ?? 0;
    if (!bucket || position >= bucket.length) {
      continue;
    }
    consumed.set(key, position + 1);
    const scoreB = bucket[position];
    if (scoreB === undefined) {
      continue;
    }
    pairedA.push(sA.score);
    pairedB.push(scoreB);
    dimPairsA[sA.dimension].push(sA.score);
    dimPairsB[sA.dimension].push(scoreB);
  }
  return {
    bias: computeBias(pairedA, pairedB),
    correlation: safeCorrelation(pairedA, pairedB),
    graderA: a.modelId,
    graderB: b.modelId,
    meanAbsDiff: computeMeanAbsDiff(pairedA, pairedB),
    perDimension: {
      codeCorrectness: computeDimensionPair(dimPairsA.codeCorrectness, dimPairsB.codeCorrectness),
      docCoverage: computeDimensionPair(dimPairsA.docCoverage, dimPairsB.docCoverage),
      taskCompletion: computeDimensionPair(dimPairsA.taskCompletion, dimPairsB.taskCompletion),
    },
  };
}
104
/**
 * Mean signed difference (B - A) over paired scores, rounded to one decimal.
 * A positive result means B scores higher; an empty `a` yields 0.
 */
function computeBias(a, b) {
  const pairCount = a.length;
  if (pairCount === 0) {
    return 0;
  }
  let signedTotal = 0;
  for (let i = 0; i < b.length; i++) {
    signedTotal += b[i] - a[i];
  }
  const mean = signedTotal / pairCount;
  return Math.round(mean * 10) / 10;
}
111
/**
 * Build the comparison metrics for one scoring dimension from the paired
 * scores of that dimension; `count` is the number of scores in `scoresA`.
 */
function computeDimensionPair(scoresA, scoresB) {
  const bias = computeBias(scoresA, scoresB);
  const correlation = safeCorrelation(scoresA, scoresB);
  const count = scoresA.length;
  const meanAbsDiff = computeMeanAbsDiff(scoresA, scoresB);
  return { bias, correlation, count, meanAbsDiff };
}
120
/**
 * Mean absolute difference between paired scores, rounded to one decimal.
 * An empty `a` yields 0.
 */
function computeMeanAbsDiff(a, b) {
  const pairCount = a.length;
  if (pairCount === 0) {
    return 0;
  }
  let distanceTotal = 0;
  for (let i = 0; i < b.length; i++) {
    distanceTotal += Math.abs(b[i] - a[i]);
  }
  const mean = distanceTotal / pairCount;
  return Math.round(mean * 10) / 10;
}
127
/**
 * Classify a grader pair as comparable, divergent, or strongly-divergent.
 *
 * Thresholds:
 *   comparable:         r >= 0.9, |bias| <= 5 and MAD <= 8
 *   divergent:          r >= 0.7, |bias| <= 10 and MAD <= 15
 *   strongly-divergent: everything else
 *
 * The reason text labels the correlation by its own band (High >= 0.9,
 * Moderate >= 0.7, otherwise Low), so a pair that is downgraded because of
 * bias or MAD alone is not described as having a low correlation.
 *
 * @param pair Pairwise comparison; the verdict is issued for `pair.graderB`
 * @returns `{ modelId, reason, recommendation }`
 */
function makeRecommendation(pair) {
  const r = pair.correlation;
  const absBias = Math.abs(pair.bias);
  const mad = pair.meanAbsDiff;
  if (r >= 0.9 && absBias <= 5 && mad <= 8) {
    return {
      modelId: pair.graderB,
      reason: `High correlation (r=${r}), low bias (${pair.bias}), low MAD (${mad})`,
      recommendation: "comparable",
    };
  }
  const strength = r >= 0.9 ? "High" : r >= 0.7 ? "Moderate" : "Low";
  const withinDivergentLimits = r >= 0.7 && absBias <= 10 && mad <= 15;
  return {
    modelId: pair.graderB,
    reason: `${strength} correlation (r=${r}), bias=${pair.bias}, MAD=${mad}`,
    recommendation: withinDivergentLimits ? "divergent" : "strongly-divergent",
  };
}
156
/**
 * Pearson correlation of two paired score lists, rounded to two decimals.
 *
 * Returns 0 when there are fewer than two pairs. Also returns 0 when the
 * underlying computation yields a non-finite value, so NaN never reaches the
 * report or the recommendation thresholds.
 * NOTE(review): pearsonCorrelation is defined elsewhere; presumably it can
 * produce NaN for zero-variance input — confirm.
 */
function safeCorrelation(a, b) {
  if (a.length < 2) {
    return 0;
  }
  const r = pearsonCorrelation(a, b);
  if (!Number.isFinite(r)) {
    return 0;
  }
  return Math.round(r * 100) / 100;
}
@@ -0,0 +1,60 @@
1
+ /**
2
+ * grader-consistency-runner.ts
3
+ *
4
+ * Orchestration logic for grader consistency analysis.
5
+ *
6
+ * Reads existing eval results, re-runs ONLY the grading assertions N additional
7
+ * times with the configured grader model, and analyzes score variance.
8
+ *
9
+ * This does NOT re-run the models under test — it only re-grades the same
10
+ * responses. Cost is low: ~$0.005 per grading call × N replications.
11
+ *
12
+ * Migrated from lib/grader-consistency.ts — no process.argv, no process.exit(),
13
+ * no module-level constants.
14
+ *
15
+ * @see docs/exec-plans/completed/grader-reliability.md — Phase 1
16
+ */
17
+ import type { RawPromptfooFile } from "./calculate-scores.js";
18
+ import { type GraderConsistency } from "./grader-consistency.js";
19
+ import type { DimensionName } from "./types.js";
20
/** Settings accepted by the grader consistency runner. */
export interface GraderConsistencyRunnerOptions {
  /** Root directory of the eval package (for output paths) */
  rootDir: string;
  /** Location of the eval-results.json file to re-grade */
  resultsPath: string;
  /** How many additional grading replications to run (default: 5) */
  replications: number;
}
29
/** One rubric judgment taken from an eval run, ready to be re-graded. */
interface GradingJudgment {
  /** Description of the task the judgment belongs to */
  description: string;
  /** Feature area of the task */
  area: string;
  /** Scoring dimension the rubric measures */
  dimension: DimensionName;
  /** Provider, i.e. the model under test, when known */
  providerId?: string;
  /** Text of the model's response that is being graded */
  responseText: string;
  /** Rubric text, taken from the assertion value */
  rubricText: string;
  /** Score the judgment received in the original eval run */
  originalScore: number;
}
45
/**
 * Collect every llm-rubric grading judgment contained in the eval results.
 * To keep the analysis focused, only gold (with-docs) tests are considered.
 */
export declare function extractGradingJudgments(file: RawPromptfooFile): GradingJudgment[];
50
/**
 * Format a grader consistency result as a report string.
 *
 * @param result Consistency analysis result to format
 * @param graderModel Grader model the analysis was run with — presumably
 *   named in the report (implementation not visible here; confirm)
 * @returns The formatted report
 */
export declare function formatConsistencyReport(result: GraderConsistency, graderModel: string): string;
51
/**
 * Execute the grader consistency analysis.
 *
 * Loads the eval results, re-grades every judgment N times, and stores the
 * resulting consistency report in results/latest/grader-consistency.json.
 *
 * @param options Runner settings (results path, replication count, root directory)
 * @returns The consistency analysis result
 * @throws Error if results file not found, replications < 2, or no judgments found
 */
export declare function runGraderConsistency(options: GraderConsistencyRunnerOptions): Promise<GraderConsistency>;
60
+ export {};