@sanity/ailf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (530)
  1. package/README.md +89 -0
  2. package/bin/ailf.js +64 -0
  3. package/canonical/grader-references/README.md +88 -0
  4. package/canonical/grader-references/groq.yaml +234 -0
  5. package/canonical/grader-references/studio-setup.yaml +275 -0
  6. package/canonical/reference-solutions/.gitkeep +1 -0
  7. package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
  8. package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
  9. package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
  10. package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
  11. package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
  12. package/canonical/reference-solutions/groq/joins-references.ts +300 -0
  13. package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
  14. package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
  15. package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
  16. package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
  17. package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
  18. package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
  19. package/config/bigquery/README.md +74 -0
  20. package/config/bigquery/views/area_scores.sql +87 -0
  21. package/config/bigquery/views/reports.sql +49 -0
  22. package/config/features.yaml +116 -0
  23. package/config/models.yaml +115 -0
  24. package/config/prompts.yaml +75 -0
  25. package/config/rubrics.yaml +62 -0
  26. package/config/schedules.yaml +43 -0
  27. package/config/sinks.yaml +54 -0
  28. package/config/sources.yaml +51 -0
  29. package/config/thresholds.yaml +49 -0
  30. package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
  31. package/dist/_vendor/ailf-core/examples/index.js +285 -0
  32. package/dist/_vendor/ailf-core/index.d.ts +17 -0
  33. package/dist/_vendor/ailf-core/index.js +17 -0
  34. package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
  35. package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
  36. package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
  37. package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
  38. package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
  39. package/dist/_vendor/ailf-core/ports/context.js +14 -0
  40. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
  41. package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
  42. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
  43. package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
  44. package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
  45. package/dist/_vendor/ailf-core/ports/index.js +7 -0
  46. package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
  47. package/dist/_vendor/ailf-core/ports/logger.js +11 -0
  48. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
  49. package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
  50. package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
  51. package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
  52. package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
  53. package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
  54. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
  55. package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
  56. package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
  57. package/dist/_vendor/ailf-core/schemas/index.js +16 -0
  58. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
  59. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
  60. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
  61. package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
  62. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
  63. package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
  64. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
  65. package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
  66. package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
  67. package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
  68. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
  69. package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
  70. package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
  71. package/dist/_vendor/ailf-core/services/index.js +12 -0
  72. package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
  73. package/dist/_vendor/ailf-core/services/scoring.js +222 -0
  74. package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
  75. package/dist/_vendor/ailf-core/types/index.js +21 -0
  76. package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
  77. package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
  78. package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
  79. package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
  80. package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
  81. package/dist/_vendor/ailf-shared/document-ref.js +1 -0
  82. package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
  83. package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
  84. package/dist/_vendor/ailf-shared/index.d.ts +16 -0
  85. package/dist/_vendor/ailf-shared/index.js +16 -0
  86. package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
  87. package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
  88. package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
  89. package/dist/_vendor/ailf-shared/score-grades.js +23 -0
  90. package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
  91. package/dist/adapters/cache/content-lake-cache.js +59 -0
  92. package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
  93. package/dist/adapters/cache/filesystem-cache.js +54 -0
  94. package/dist/adapters/cache/index.d.ts +2 -0
  95. package/dist/adapters/cache/index.js +2 -0
  96. package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
  97. package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
  98. package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
  99. package/dist/adapters/config-sources/file-config-adapter.js +96 -0
  100. package/dist/adapters/config-sources/index.d.ts +2 -0
  101. package/dist/adapters/config-sources/index.js +2 -0
  102. package/dist/adapters/doc-fetchers/index.d.ts +1 -0
  103. package/dist/adapters/doc-fetchers/index.js +1 -0
  104. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
  105. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
  106. package/dist/adapters/eval-runners/index.d.ts +1 -0
  107. package/dist/adapters/eval-runners/index.js +1 -0
  108. package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
  109. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
  110. package/dist/adapters/index.d.ts +12 -0
  111. package/dist/adapters/index.js +12 -0
  112. package/dist/adapters/loggers/console-logger.d.ts +22 -0
  113. package/dist/adapters/loggers/console-logger.js +54 -0
  114. package/dist/adapters/loggers/index.d.ts +9 -0
  115. package/dist/adapters/loggers/index.js +9 -0
  116. package/dist/adapters/loggers/json-logger.d.ts +18 -0
  117. package/dist/adapters/loggers/json-logger.js +33 -0
  118. package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
  119. package/dist/adapters/loggers/quiet-logger.js +30 -0
  120. package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
  121. package/dist/adapters/task-sources/composite-task-source.js +59 -0
  122. package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
  123. package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
  124. package/dist/adapters/task-sources/index.d.ts +7 -0
  125. package/dist/adapters/task-sources/index.js +7 -0
  126. package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
  127. package/dist/adapters/task-sources/repo-schemas.js +234 -0
  128. package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
  129. package/dist/adapters/task-sources/repo-task-source.js +104 -0
  130. package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
  131. package/dist/adapters/task-sources/repo-trigger.js +153 -0
  132. package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
  133. package/dist/adapters/task-sources/repo-validation.js +164 -0
  134. package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
  135. package/dist/adapters/task-sources/yaml-task-source.js +136 -0
  136. package/dist/agent-observer/agentic-provider.d.ts +132 -0
  137. package/dist/agent-observer/agentic-provider.js +983 -0
  138. package/dist/agent-observer/classifier.d.ts +62 -0
  139. package/dist/agent-observer/classifier.js +269 -0
  140. package/dist/agent-observer/index.d.ts +7 -0
  141. package/dist/agent-observer/index.js +4 -0
  142. package/dist/agent-observer/pricing.d.ts +35 -0
  143. package/dist/agent-observer/pricing.js +82 -0
  144. package/dist/agent-observer/provider.d.ts +77 -0
  145. package/dist/agent-observer/provider.js +151 -0
  146. package/dist/agent-observer/proxy.d.ts +91 -0
  147. package/dist/agent-observer/proxy.js +321 -0
  148. package/dist/agent-observer/test-imports.d.ts +7 -0
  149. package/dist/agent-observer/test-imports.js +185 -0
  150. package/dist/agent-observer/types.d.ts +137 -0
  151. package/dist/agent-observer/types.js +16 -0
  152. package/dist/assertions/source-isolation.d.ts +72 -0
  153. package/dist/assertions/source-isolation.js +117 -0
  154. package/dist/cli.d.ts +24 -0
  155. package/dist/cli.js +199 -0
  156. package/dist/commands/agent-report.d.ts +5 -0
  157. package/dist/commands/agent-report.js +69 -0
  158. package/dist/commands/baseline.d.ts +9 -0
  159. package/dist/commands/baseline.js +141 -0
  160. package/dist/commands/cache.d.ts +13 -0
  161. package/dist/commands/cache.js +135 -0
  162. package/dist/commands/calculate-scores.d.ts +8 -0
  163. package/dist/commands/calculate-scores.js +48 -0
  164. package/dist/commands/compare.d.ts +8 -0
  165. package/dist/commands/compare.js +120 -0
  166. package/dist/commands/completion.d.ts +18 -0
  167. package/dist/commands/completion.js +260 -0
  168. package/dist/commands/coverage-audit.d.ts +7 -0
  169. package/dist/commands/coverage-audit.js +40 -0
  170. package/dist/commands/discovery-report.d.ts +10 -0
  171. package/dist/commands/discovery-report.js +44 -0
  172. package/dist/commands/eval.d.ts +9 -0
  173. package/dist/commands/eval.js +35 -0
  174. package/dist/commands/explain-handler.d.ts +34 -0
  175. package/dist/commands/explain-handler.js +719 -0
  176. package/dist/commands/fetch-docs.d.ts +8 -0
  177. package/dist/commands/fetch-docs.js +128 -0
  178. package/dist/commands/generate-configs.d.ts +8 -0
  179. package/dist/commands/generate-configs.js +46 -0
  180. package/dist/commands/grader/index.d.ts +11 -0
  181. package/dist/commands/grader/index.js +118 -0
  182. package/dist/commands/init.d.ts +19 -0
  183. package/dist/commands/init.js +150 -0
  184. package/dist/commands/interactive.d.ts +12 -0
  185. package/dist/commands/interactive.js +238 -0
  186. package/dist/commands/lookup-doc.d.ts +15 -0
  187. package/dist/commands/lookup-doc.js +84 -0
  188. package/dist/commands/measure-retrieval.d.ts +5 -0
  189. package/dist/commands/measure-retrieval.js +65 -0
  190. package/dist/commands/pipeline-action.d.ts +71 -0
  191. package/dist/commands/pipeline-action.js +305 -0
  192. package/dist/commands/pipeline.d.ts +62 -0
  193. package/dist/commands/pipeline.js +53 -0
  194. package/dist/commands/pr-comment.d.ts +8 -0
  195. package/dist/commands/pr-comment.js +47 -0
  196. package/dist/commands/publish.d.ts +26 -0
  197. package/dist/commands/publish.js +253 -0
  198. package/dist/commands/readiness-report.d.ts +10 -0
  199. package/dist/commands/readiness-report.js +104 -0
  200. package/dist/commands/shared/options.d.ts +29 -0
  201. package/dist/commands/shared/options.js +57 -0
  202. package/dist/commands/update-quality-scores.d.ts +5 -0
  203. package/dist/commands/update-quality-scores.js +20 -0
  204. package/dist/commands/validate-tasks.d.ts +16 -0
  205. package/dist/commands/validate-tasks.js +93 -0
  206. package/dist/commands/validate.d.ts +9 -0
  207. package/dist/commands/validate.js +73 -0
  208. package/dist/commands/webhook-server.d.ts +5 -0
  209. package/dist/commands/webhook-server.js +30 -0
  210. package/dist/commands/weekly-digest.d.ts +10 -0
  211. package/dist/commands/weekly-digest.js +104 -0
  212. package/dist/composition-root.d.ts +26 -0
  213. package/dist/composition-root.js +107 -0
  214. package/dist/interpolate.d.ts +26 -0
  215. package/dist/interpolate.js +70 -0
  216. package/dist/job-store.d.ts +104 -0
  217. package/dist/job-store.js +188 -0
  218. package/dist/lib/agent-behavior-report.d.ts +8 -0
  219. package/dist/lib/agent-behavior-report.js +185 -0
  220. package/dist/lib/baseline.d.ts +19 -0
  221. package/dist/lib/baseline.js +153 -0
  222. package/dist/lib/calculate-scores.d.ts +23 -0
  223. package/dist/lib/calculate-scores.js +42 -0
  224. package/dist/lib/compare.d.ts +18 -0
  225. package/dist/lib/compare.js +170 -0
  226. package/dist/lib/coverage-audit.d.ts +4 -0
  227. package/dist/lib/coverage-audit.js +42 -0
  228. package/dist/lib/discovery-report.d.ts +13 -0
  229. package/dist/lib/discovery-report.js +57 -0
  230. package/dist/lib/fetch-docs.d.ts +30 -0
  231. package/dist/lib/fetch-docs.js +171 -0
  232. package/dist/lib/generate-configs.d.ts +25 -0
  233. package/dist/lib/generate-configs.js +42 -0
  234. package/dist/lib/grader-api.d.ts +21 -0
  235. package/dist/lib/grader-api.js +34 -0
  236. package/dist/lib/grader-compare.d.ts +19 -0
  237. package/dist/lib/grader-compare.js +91 -0
  238. package/dist/lib/grader-consistency.d.ts +27 -0
  239. package/dist/lib/grader-consistency.js +79 -0
  240. package/dist/lib/grader-sensitivity.d.ts +19 -0
  241. package/dist/lib/grader-sensitivity.js +75 -0
  242. package/dist/lib/grader-validate.d.ts +19 -0
  243. package/dist/lib/grader-validate.js +78 -0
  244. package/dist/lib/measure-retrieval.d.ts +14 -0
  245. package/dist/lib/measure-retrieval.js +71 -0
  246. package/dist/lib/pr-comment.d.ts +16 -0
  247. package/dist/lib/pr-comment.js +28 -0
  248. package/dist/lib/readiness-report.d.ts +13 -0
  249. package/dist/lib/readiness-report.js +108 -0
  250. package/dist/lib/webhook-server.d.ts +11 -0
  251. package/dist/lib/webhook-server.js +24 -0
  252. package/dist/lib/weekly-digest.d.ts +24 -0
  253. package/dist/lib/weekly-digest.js +148 -0
  254. package/dist/orchestration/build-app-context.d.ts +27 -0
  255. package/dist/orchestration/build-app-context.js +81 -0
  256. package/dist/orchestration/build-step-sequence.d.ts +15 -0
  257. package/dist/orchestration/build-step-sequence.js +84 -0
  258. package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
  259. package/dist/orchestration/config-to-source-overrides.js +28 -0
  260. package/dist/orchestration/env-bridge.d.ts +21 -0
  261. package/dist/orchestration/env-bridge.js +66 -0
  262. package/dist/orchestration/index.d.ts +11 -0
  263. package/dist/orchestration/index.js +11 -0
  264. package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
  265. package/dist/orchestration/pipeline-orchestrator.js +153 -0
  266. package/dist/orchestration/step-runner.d.ts +20 -0
  267. package/dist/orchestration/step-runner.js +88 -0
  268. package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
  269. package/dist/orchestration/steps/calculate-scores-step.js +95 -0
  270. package/dist/orchestration/steps/callback-step.d.ts +24 -0
  271. package/dist/orchestration/steps/callback-step.js +76 -0
  272. package/dist/orchestration/steps/compare-step.d.ts +14 -0
  273. package/dist/orchestration/steps/compare-step.js +92 -0
  274. package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
  275. package/dist/orchestration/steps/discovery-report-step.js +55 -0
  276. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  277. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  278. package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
  279. package/dist/orchestration/steps/fetch-docs-step.js +135 -0
  280. package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
  281. package/dist/orchestration/steps/gap-analysis-step.js +136 -0
  282. package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
  283. package/dist/orchestration/steps/generate-configs-step.js +85 -0
  284. package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
  285. package/dist/orchestration/steps/grader-consistency-step.js +64 -0
  286. package/dist/orchestration/steps/index.d.ts +19 -0
  287. package/dist/orchestration/steps/index.js +19 -0
  288. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
  289. package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
  290. package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
  291. package/dist/orchestration/steps/publish-report-step.js +216 -0
  292. package/dist/orchestration/steps/readiness-step.d.ts +13 -0
  293. package/dist/orchestration/steps/readiness-step.js +91 -0
  294. package/dist/orchestration/steps/report-step.d.ts +12 -0
  295. package/dist/orchestration/steps/report-step.js +49 -0
  296. package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
  297. package/dist/orchestration/steps/run-eval-step.js +195 -0
  298. package/dist/orchestration/steps/validate-step.d.ts +12 -0
  299. package/dist/orchestration/steps/validate-step.js +41 -0
  300. package/dist/pipeline/agent-behavior-report.d.ts +53 -0
  301. package/dist/pipeline/agent-behavior-report.js +132 -0
  302. package/dist/pipeline/attribution.d.ts +47 -0
  303. package/dist/pipeline/attribution.js +226 -0
  304. package/dist/pipeline/baseline.d.ts +37 -0
  305. package/dist/pipeline/baseline.js +141 -0
  306. package/dist/pipeline/cache.d.ts +101 -0
  307. package/dist/pipeline/cache.js +283 -0
  308. package/dist/pipeline/calculate-scores.d.ts +102 -0
  309. package/dist/pipeline/calculate-scores.js +1128 -0
  310. package/dist/pipeline/callback-delivery.d.ts +50 -0
  311. package/dist/pipeline/callback-delivery.js +89 -0
  312. package/dist/pipeline/checks.d.ts +39 -0
  313. package/dist/pipeline/checks.js +280 -0
  314. package/dist/pipeline/classify-url.d.ts +61 -0
  315. package/dist/pipeline/classify-url.js +93 -0
  316. package/dist/pipeline/compare.d.ts +31 -0
  317. package/dist/pipeline/compare.js +208 -0
  318. package/dist/pipeline/coverage-audit.d.ts +39 -0
  319. package/dist/pipeline/coverage-audit.js +165 -0
  320. package/dist/pipeline/degradations.d.ts +85 -0
  321. package/dist/pipeline/degradations.js +242 -0
  322. package/dist/pipeline/discovery-report.d.ts +55 -0
  323. package/dist/pipeline/discovery-report.js +178 -0
  324. package/dist/pipeline/eval-constants.d.ts +68 -0
  325. package/dist/pipeline/eval-constants.js +111 -0
  326. package/dist/pipeline/eval-fingerprint.d.ts +66 -0
  327. package/dist/pipeline/eval-fingerprint.js +175 -0
  328. package/dist/pipeline/expand-tasks.d.ts +220 -0
  329. package/dist/pipeline/expand-tasks.js +421 -0
  330. package/dist/pipeline/failure-modes.d.ts +46 -0
  331. package/dist/pipeline/failure-modes.js +348 -0
  332. package/dist/pipeline/fetch-url-content.d.ts +44 -0
  333. package/dist/pipeline/fetch-url-content.js +93 -0
  334. package/dist/pipeline/gap-analysis.d.ts +48 -0
  335. package/dist/pipeline/gap-analysis.js +231 -0
  336. package/dist/pipeline/generate-configs.d.ts +72 -0
  337. package/dist/pipeline/generate-configs.js +395 -0
  338. package/dist/pipeline/grader-api.d.ts +49 -0
  339. package/dist/pipeline/grader-api.js +200 -0
  340. package/dist/pipeline/grader-compare-runner.d.ts +44 -0
  341. package/dist/pipeline/grader-compare-runner.js +301 -0
  342. package/dist/pipeline/grader-comparison.d.ts +111 -0
  343. package/dist/pipeline/grader-comparison.js +161 -0
  344. package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
  345. package/dist/pipeline/grader-consistency-runner.js +270 -0
  346. package/dist/pipeline/grader-consistency.d.ts +103 -0
  347. package/dist/pipeline/grader-consistency.js +146 -0
  348. package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
  349. package/dist/pipeline/grader-sensitivity-runner.js +282 -0
  350. package/dist/pipeline/grader-sensitivity.d.ts +94 -0
  351. package/dist/pipeline/grader-sensitivity.js +144 -0
  352. package/dist/pipeline/grader-validate-runner.d.ts +38 -0
  353. package/dist/pipeline/grader-validate-runner.js +229 -0
  354. package/dist/pipeline/grader-validation.d.ts +107 -0
  355. package/dist/pipeline/grader-validation.js +169 -0
  356. package/dist/pipeline/map-request-to-config.d.ts +19 -0
  357. package/dist/pipeline/map-request-to-config.js +80 -0
  358. package/dist/pipeline/measure-retrieval.d.ts +59 -0
  359. package/dist/pipeline/measure-retrieval.js +111 -0
  360. package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
  361. package/dist/pipeline/mirror-repo-tasks.js +350 -0
  362. package/dist/pipeline/plan-format.d.ts +33 -0
  363. package/dist/pipeline/plan-format.js +202 -0
  364. package/dist/pipeline/plan.d.ts +169 -0
  365. package/dist/pipeline/plan.js +708 -0
  366. package/dist/pipeline/pr-comment.d.ts +19 -0
  367. package/dist/pipeline/pr-comment.js +502 -0
  368. package/dist/pipeline/probe.d.ts +52 -0
  369. package/dist/pipeline/probe.js +390 -0
  370. package/dist/pipeline/provenance.d.ts +47 -0
  371. package/dist/pipeline/provenance.js +146 -0
  372. package/dist/pipeline/readiness-report.d.ts +87 -0
  373. package/dist/pipeline/readiness-report.js +205 -0
  374. package/dist/pipeline/release-classification.d.ts +54 -0
  375. package/dist/pipeline/release-classification.js +238 -0
  376. package/dist/pipeline/release-report.d.ts +37 -0
  377. package/dist/pipeline/release-report.js +222 -0
  378. package/dist/pipeline/repo-eval-comment.d.ts +37 -0
  379. package/dist/pipeline/repo-eval-comment.js +165 -0
  380. package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
  381. package/dist/pipeline/repo-threshold-evaluator.js +162 -0
  382. package/dist/pipeline/resolve-mappings.d.ts +35 -0
  383. package/dist/pipeline/resolve-mappings.js +72 -0
  384. package/dist/pipeline/retrieval-metrics.d.ts +39 -0
  385. package/dist/pipeline/retrieval-metrics.js +136 -0
  386. package/dist/pipeline/reverse-mapping.d.ts +67 -0
  387. package/dist/pipeline/reverse-mapping.js +88 -0
  388. package/dist/pipeline/schemas.d.ts +9 -0
  389. package/dist/pipeline/schemas.js +9 -0
  390. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  391. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  392. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  393. package/dist/pipeline/steps/compare-step.js +90 -0
  394. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  395. package/dist/pipeline/steps/eval-step.js +347 -0
  396. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  397. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  398. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  399. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  400. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  401. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  402. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  403. package/dist/pipeline/steps/publish-report-step.js +243 -0
  404. package/dist/pipeline/steps/report-step.d.ts +13 -0
  405. package/dist/pipeline/steps/report-step.js +56 -0
  406. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  407. package/dist/pipeline/steps/update-scores-step.js +42 -0
  408. package/dist/pipeline/targeted-loo.d.ts +88 -0
  409. package/dist/pipeline/targeted-loo.js +203 -0
  410. package/dist/pipeline/thresholds.d.ts +27 -0
  411. package/dist/pipeline/thresholds.js +245 -0
  412. package/dist/pipeline/types.d.ts +10 -0
  413. package/dist/pipeline/types.js +10 -0
  414. package/dist/pipeline/validate.d.ts +67 -0
  415. package/dist/pipeline/validate.js +406 -0
  416. package/dist/pipeline/webhook-server.d.ts +37 -0
  417. package/dist/pipeline/webhook-server.js +133 -0
  418. package/dist/report-store.d.ts +84 -0
  419. package/dist/report-store.js +208 -0
  420. package/dist/sanity/client.d.ts +38 -0
  421. package/dist/sanity/client.js +86 -0
  422. package/dist/sanity/portable-text.d.ts +11 -0
  423. package/dist/sanity/portable-text.js +211 -0
  424. package/dist/sanity/queries.d.ts +133 -0
  425. package/dist/sanity/queries.js +300 -0
  426. package/dist/schedules/digest.d.ts +116 -0
  427. package/dist/schedules/digest.js +156 -0
  428. package/dist/schedules/index.d.ts +12 -0
  429. package/dist/schedules/index.js +10 -0
  430. package/dist/schedules/loader.d.ts +31 -0
  431. package/dist/schedules/loader.js +73 -0
  432. package/dist/schedules/schema.d.ts +9 -0
  433. package/dist/schedules/schema.js +9 -0
  434. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  435. package/dist/scripts/agent-behavior-report.js +315 -0
  436. package/dist/scripts/baseline.d.ts +43 -0
  437. package/dist/scripts/baseline.js +267 -0
  438. package/dist/scripts/calculate-scores.d.ts +166 -0
  439. package/dist/scripts/calculate-scores.js +1296 -0
  440. package/dist/scripts/compare.d.ts +22 -0
  441. package/dist/scripts/compare.js +334 -0
  442. package/dist/scripts/coverage-audit.d.ts +44 -0
  443. package/dist/scripts/coverage-audit.js +209 -0
  444. package/dist/scripts/debug-eval.d.ts +19 -0
  445. package/dist/scripts/debug-eval.js +73 -0
  446. package/dist/scripts/discovery-report.d.ts +58 -0
  447. package/dist/scripts/discovery-report.js +250 -0
  448. package/dist/scripts/fetch-docs.d.ts +35 -0
  449. package/dist/scripts/fetch-docs.js +472 -0
  450. package/dist/scripts/generate-configs.d.ts +66 -0
  451. package/dist/scripts/generate-configs.js +459 -0
  452. package/dist/scripts/grader-api.d.ts +27 -0
  453. package/dist/scripts/grader-api.js +206 -0
  454. package/dist/scripts/grader-compare.d.ts +22 -0
  455. package/dist/scripts/grader-compare.js +368 -0
  456. package/dist/scripts/grader-consistency.d.ts +20 -0
  457. package/dist/scripts/grader-consistency.js +313 -0
  458. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  459. package/dist/scripts/grader-sensitivity.js +354 -0
  460. package/dist/scripts/grader-validate.d.ts +19 -0
  461. package/dist/scripts/grader-validate.js +267 -0
  462. package/dist/scripts/measure-retrieval.d.ts +10 -0
  463. package/dist/scripts/measure-retrieval.js +145 -0
  464. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
  465. package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
  466. package/dist/scripts/pipeline.d.ts +76 -0
  467. package/dist/scripts/pipeline.js +1031 -0
  468. package/dist/scripts/pr-comment.d.ts +10 -0
  469. package/dist/scripts/pr-comment.js +510 -0
  470. package/dist/scripts/readiness-report.d.ts +88 -0
  471. package/dist/scripts/readiness-report.js +342 -0
  472. package/dist/scripts/update-quality-scores.d.ts +15 -0
  473. package/dist/scripts/update-quality-scores.js +184 -0
  474. package/dist/scripts/validate-task-sources.d.ts +21 -0
  475. package/dist/scripts/validate-task-sources.js +210 -0
  476. package/dist/scripts/validate.d.ts +13 -0
  477. package/dist/scripts/validate.js +79 -0
  478. package/dist/scripts/webhook-server.d.ts +26 -0
  479. package/dist/scripts/webhook-server.js +147 -0
  480. package/dist/scripts/weekly-digest.d.ts +24 -0
  481. package/dist/scripts/weekly-digest.js +144 -0
  482. package/dist/sinks/bigquery/index.d.ts +131 -0
  483. package/dist/sinks/bigquery/index.js +222 -0
  484. package/dist/sinks/format-slack.d.ts +64 -0
  485. package/dist/sinks/format-slack.js +306 -0
  486. package/dist/sinks/index.d.ts +23 -0
  487. package/dist/sinks/index.js +18 -0
  488. package/dist/sinks/loader.d.ts +18 -0
  489. package/dist/sinks/loader.js +82 -0
  490. package/dist/sinks/retry.d.ts +24 -0
  491. package/dist/sinks/retry.js +52 -0
  492. package/dist/sinks/schema.d.ts +9 -0
  493. package/dist/sinks/schema.js +9 -0
  494. package/dist/sinks/slack/format.d.ts +65 -0
  495. package/dist/sinks/slack/format.js +327 -0
  496. package/dist/sinks/slack/index.d.ts +27 -0
  497. package/dist/sinks/slack/index.js +78 -0
  498. package/dist/sinks/slack-sink.d.ts +27 -0
  499. package/dist/sinks/slack-sink.js +78 -0
  500. package/dist/sinks/types.d.ts +59 -0
  501. package/dist/sinks/types.js +44 -0
  502. package/dist/sinks/webhook/index.d.ts +19 -0
  503. package/dist/sinks/webhook/index.js +50 -0
  504. package/dist/sinks/webhook-sink.d.ts +19 -0
  505. package/dist/sinks/webhook-sink.js +50 -0
  506. package/dist/sources.d.ts +104 -0
  507. package/dist/sources.js +292 -0
  508. package/dist/webhook/budget.d.ts +42 -0
  509. package/dist/webhook/budget.js +60 -0
  510. package/dist/webhook/debounce.d.ts +67 -0
  511. package/dist/webhook/debounce.js +76 -0
  512. package/dist/webhook/dispatch.d.ts +45 -0
  513. package/dist/webhook/dispatch.js +84 -0
  514. package/dist/webhook/eval-request-handler.d.ts +87 -0
  515. package/dist/webhook/eval-request-handler.js +181 -0
  516. package/dist/webhook/handler.d.ts +88 -0
  517. package/dist/webhook/handler.js +203 -0
  518. package/dist/webhook/index.d.ts +17 -0
  519. package/dist/webhook/index.js +12 -0
  520. package/dist/webhook/types.d.ts +109 -0
  521. package/dist/webhook/types.js +10 -0
  522. package/package.json +72 -0
  523. package/tasks/.expanded.agentic.yaml +51 -0
  524. package/tasks/.expanded.yaml +66 -0
  525. package/tasks/frameworks.yaml +98 -0
  526. package/tasks/functions.yaml +51 -0
  527. package/tasks/groq.yaml +216 -0
  528. package/tasks/nextjs-live.yaml +62 -0
  529. package/tasks/studio-setup.yaml +111 -0
  530. package/tasks/visual-editing.yaml +120 -0
@@ -0,0 +1,282 @@
1
+ /**
2
+ * pipeline/grader-sensitivity-runner.ts
3
+ *
4
+ * Orchestration module for grader sensitivity (discrimination power) testing
5
+ * (Phase 4).
6
+ *
7
+ * Discovers canonical reference solutions, applies programmatic degradations,
8
+ * grades each original/degraded pair, and calls `analyzeSensitivity()` from
9
+ * the pure computation module.
10
+ *
11
+ * Migrated from lib/grader-sensitivity.ts — no process.argv, no process.exit(),
12
+ * no module-level constants. Accepts rootDir as parameter.
13
+ *
14
+ * @see docs/exec-plans/completed/grader-reliability.md — Phase 4
15
+ */
16
+ import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync, } from "fs";
17
+ import { basename, join } from "path";
18
+ import { DEGRADATION_STRATEGIES } from "./degradations.js";
19
+ import { gradeOnce, loadGraderModel } from "./grader-api.js";
20
+ import { analyzeSensitivity, } from "./grader-sensitivity.js";
21
+ // ---------------------------------------------------------------------------
22
+ // Internal helpers
23
+ // ---------------------------------------------------------------------------
24
/**
 * Discover canonical reference solutions on disk.
 *
 * Scans `<rootDir>/canonical/reference-solutions/<area>/` and reads every
 * entry whose name ends in `.ts`, `.tsx`, `.js` or `.jsx`. Areas and files are
 * visited in sorted order, so the result order is deterministic.
 *
 * @param rootDir     Directory that contains the `canonical/` folder.
 * @param areaFilter  Optional area (sub-directory) name; when set, only that
 *                    area is scanned.
 * @returns One `{ area, content, sourcePath }` entry per source file, or an
 *          empty array when the reference-solutions directory does not exist.
 * @throws Error when the directory exists but holds no (matching) area
 *         sub-directory.
 */
function discoverReferenceSolutions(rootDir, areaFilter) {
    const refsDir = join(rootDir, "canonical", "reference-solutions");
    if (!existsSync(refsDir)) {
        // Reference solutions now live in the Content Lake (ailf.referenceSolution).
        // Return empty array when local files don't exist.
        return [];
    }
    const sourceExtensions = [".ts", ".tsx", ".js", ".jsx"];
    const areas = readdirSync(refsDir, { withFileTypes: true })
        .filter((entry) => entry.isDirectory())
        .map((entry) => entry.name)
        .filter((name) => !areaFilter || name === areaFilter)
        .sort();
    if (areas.length === 0) {
        throw new Error(areaFilter
            ? `No reference solutions found for area "${areaFilter}".`
            : "No reference solution directories found.");
    }
    const solutions = [];
    for (const area of areas) {
        const areaDir = join(refsDir, area);
        const files = readdirSync(areaDir, { withFileTypes: true })
            // A sub-directory whose name happens to end in a source extension
            // (e.g. "nested.ts/") must be skipped: readFileSync would throw
            // EISDIR on it. Symlinks are still accepted, as before.
            .filter((entry) => !entry.isDirectory() &&
            sourceExtensions.some((ext) => entry.name.endsWith(ext)))
            .map((entry) => entry.name)
            .sort();
        for (const file of files) {
            const content = readFileSync(join(areaDir, file), "utf-8");
            solutions.push({
                area,
                content,
                // Always forward slashes, independent of the platform separator.
                sourcePath: `canonical/reference-solutions/${area}/${file}`,
            });
        }
    }
    return solutions;
}
66
/**
 * Build the original/degraded comparison pairs.
 *
 * Every reference solution is run through every entry of
 * DEGRADATION_STRATEGIES. A strategy whose `apply()` returns the input
 * unchanged produces no pair for that solution.
 *
 * @param solutions  Reference solutions (`content` and `sourcePath` are read).
 * @returns Pairs of `{ degradation, degraded, original, sourcePath }`, ordered
 *          by solution first and strategy second.
 */
function generateDegradedPairs(solutions) {
    const result = [];
    for (const solution of solutions) {
        const original = solution.content;
        for (const degradation of DEGRADATION_STRATEGIES) {
            const mutated = degradation.apply(original);
            if (mutated === original) {
                // The strategy did not change this file, so there is nothing to compare.
                continue;
            }
            result.push({
                degradation,
                degraded: mutated,
                original,
                sourcePath: solution.sourcePath,
            });
        }
    }
    return result;
}
88
+ // ---------------------------------------------------------------------------
89
+ // Report formatting (pure)
90
+ // ---------------------------------------------------------------------------
91
/**
 * Render a GraderSensitivityResult as a plain-text report.
 *
 * Sections, in order: overall metrics, per-dimension table, on-/off-target
 * comparison, per-degradation table (only when `byDegradation` is non-empty)
 * and up to five entries of `failedPairs` (only when it is non-empty).
 *
 * Header, separator and data rows of each table are generated from a single
 * list of column widths, so the header always lines up with the rows.
 *
 * Returns a string — does NOT print to console.
 *
 * @param result  Analysis result to format.
 * @returns The report as one newline-joined string.
 */
export function formatSensitivityReport(result) {
    const lines = [];
    // First column is left-aligned text; all other columns are right-aligned values.
    const row = (cells, widths) => "| " +
        cells
            .map((cell, i) => i === 0
            ? String(cell).padEnd(widths[i])
            : String(cell).padStart(widths[i]))
            .join(" | ") +
        " |";
    // Each column is its width plus one space of padding on either side.
    const separator = (widths) => "|" + widths.map((w) => "-".repeat(w + 2)).join("|") + "|";
    const section = (title) => {
        lines.push("-".repeat(80));
        lines.push(title);
        lines.push("-".repeat(80));
        lines.push("");
    };
    lines.push("=".repeat(80));
    lines.push(" GRADER SENSITIVITY REPORT");
    lines.push("=".repeat(80));
    lines.push("");
    lines.push(` Grader: ${result.graderModel}`);
    lines.push(` Total pairs: ${result.totalPairs}`);
    lines.push("");
    // Overall metrics
    section("OVERALL");
    lines.push(` Concordance: ${result.concordanceRate}%`);
    lines.push(` Avg separation: ${result.avgSeparation} points`);
    lines.push("");
    // Per-dimension table
    section("PER-DIMENSION SENSITIVITY");
    const dimWidths = [16, 11, 7, 5, 5];
    lines.push(row(["Dimension", "Concordance", "Avg Sep", "Tied", "Pairs"], dimWidths));
    lines.push(separator(dimWidths));
    const dims = [
        { data: result.perDimension.taskCompletion, name: "Task Completion" },
        { data: result.perDimension.codeCorrectness, name: "Code Correctness" },
        { data: result.perDimension.docCoverage, name: "Doc Coverage" },
    ];
    for (const { data, name } of dims) {
        lines.push(row([
            name,
            data.concordanceRate + "%",
            data.avgSeparation,
            data.tiedRate + "%",
            data.pairCount,
        ], dimWidths));
    }
    lines.push("");
    // Cross-dimension analysis
    section("CROSS-DIMENSION SENSITIVITY");
    const { onTarget, offTarget } = result.crossDimension;
    lines.push(` On-target: ${onTarget.concordanceRate}% concordance, ${onTarget.avgSeparation} avg sep (${onTarget.pairCount} pairs)`);
    lines.push(` Off-target: ${offTarget.concordanceRate}% concordance, ${offTarget.avgSeparation} avg sep (${offTarget.pairCount} pairs)`);
    lines.push("");
    // Per-degradation breakdown
    if (result.byDegradation.length > 0) {
        section("PER-DEGRADATION TYPE");
        const degWidths = [44, 11, 7, 5];
        lines.push(row(["Degradation", "Concordance", "Avg Sep", "Pairs"], degWidths));
        lines.push(separator(degWidths));
        for (const deg of result.byDegradation) {
            lines.push(row([
                // Truncate long descriptions so the column width is never exceeded.
                deg.description.slice(0, degWidths[0]),
                deg.concordanceRate + "%",
                deg.avgSeparation,
                deg.pairCount,
            ], degWidths));
        }
        lines.push("");
    }
    // Failed pairs: at most the first five entries, in the order supplied.
    const topN = Math.min(5, result.failedPairs.length);
    if (topN > 0) {
        section(`TOP ${topN} REVERSED PAIRS (grader ranked degraded higher)`);
        for (let i = 0; i < topN; i++) {
            const p = result.failedPairs[i];
            const delta = p.degradedScore - p.originalScore;
            lines.push(` ${i + 1}. ${basename(p.sourcePath)} — ${p.dimension}`);
            lines.push(` Original=${p.originalScore}, Degraded=${p.degradedScore} (Δ=+${delta})`);
            lines.push(` Degradation: ${p.degradationDescription}`);
        }
        lines.push("");
    }
    return lines.join("\n");
}
172
+ // ---------------------------------------------------------------------------
173
+ // Main runner
174
+ // ---------------------------------------------------------------------------
175
/**
 * Run the grader sensitivity analysis end to end.
 *
 * Steps: resolve the grader model, discover reference solutions, build
 * degraded pairs, grade the original and the degraded text of every pair on
 * each of the three dimensions (sequentially), analyze the collected scores,
 * print the report (table or JSON) and write the result as JSON to
 * `options.outputPath` or `<rootDir>/results/latest/grader-sensitivity.json`.
 *
 * Comparisons where either grading call returns `null` are counted as failed
 * and excluded from the analysis.
 *
 * @param options  `rootDir` (required), optional `areaFilter`, `format`
 *                 ("table" by default; any other value prints JSON) and
 *                 `outputPath`.
 * @returns The analysis result produced by `analyzeSensitivity()`.
 * @throws Error if no degraded pairs were generated or every grading failed.
 */
export async function runGraderSensitivity(options) {
    const { rootDir, areaFilter, format = "table" } = options;
    console.log("=== Grader Sensitivity Analysis ===\n");
    // Which grader model is being measured
    const grader = loadGraderModel(rootDir);
    console.log(` Grader: ${grader.label} (${grader.id})`);
    // Reference solutions on disk
    const solutions = discoverReferenceSolutions(rootDir, areaFilter);
    console.log(` Solutions: ${solutions.length} reference files`);
    if (areaFilter) {
        console.log(` Area filter: ${areaFilter}`);
    }
    // Solutions × degradation strategies
    const degradedPairs = generateDegradedPairs(solutions);
    console.log(` Pairs: ${degradedPairs.length} (solutions × degradations)`);
    if (degradedPairs.length === 0) {
        throw new Error("No degraded pairs generated. Check reference solutions.");
    }
    const dimensions = ["taskCompletion", "codeCorrectness", "docCoverage"];
    // One comparison = one pair graded on one dimension; each comparison costs
    // two grader calls (original + degraded).
    const comparisonCount = degradedPairs.length * dimensions.length;
    const totalCalls = comparisonCount * 2;
    const estimatedCost = totalCalls * 0.005;
    console.log(` API calls: ${totalCalls}`);
    console.log(` Est. cost: ~$${estimatedCost.toFixed(2)}`);
    console.log();
    console.log(" Grading original and degraded pairs...");
    const rubricByDimension = {
        codeCorrectness: "Evaluate code correctness: Does the code follow best practices, use correct APIs, and avoid anti-patterns? Score 0–100.",
        docCoverage: "Evaluate documentation coverage: Does the response demonstrate knowledge from official documentation? Are imports, API names, and configuration correct per the docs? Score 0–100.",
        taskCompletion: "Evaluate task completion: Does the response fully implement the requested feature? Are all requirements addressed? Score 0–100.",
    };
    const sensitivityPairs = [];
    let completed = 0;
    let failed = 0;
    for (const pair of degradedPairs) {
        // The area is the path segment right after "reference-solutions".
        const segments = pair.sourcePath.split("/");
        const anchor = segments.indexOf("reference-solutions");
        const area = (anchor === -1 ? undefined : segments[anchor + 1]) ?? "unknown";
        for (const dimension of dimensions) {
            const rubric = rubricByDimension[dimension] ?? `Evaluate ${dimension}. Score 0–100.`;
            // Original first, then degraded — strictly one call at a time.
            const originalScore = await gradeOnce(grader.id, pair.original, rubric);
            const degradedScore = await gradeOnce(grader.id, pair.degraded, rubric);
            completed += 1;
            if (completed % 10 === 0 || completed === comparisonCount) {
                const pct = Math.round((completed / comparisonCount) * 100);
                process.stdout.write(`\r Progress: ${completed}/${comparisonCount} (${pct}%)`);
            }
            if (originalScore === null || degradedScore === null) {
                failed += 1;
                continue;
            }
            sensitivityPairs.push({
                area,
                degradationDescription: pair.degradation.description,
                degradedScore,
                dimension,
                originalScore,
                sourcePath: pair.sourcePath,
                targetDimension: pair.degradation.targetDimension,
            });
        }
    }
    console.log(); // terminate the in-place progress line
    if (failed > 0) {
        console.log(` ⚠ ${failed} grading pairs failed (excluded)`);
    }
    console.log();
    if (sensitivityPairs.length === 0) {
        throw new Error("No sensitivity pairs to analyze. All grading calls failed.");
    }
    const result = analyzeSensitivity(sensitivityPairs, grader.id);
    const serialized = JSON.stringify(result, null, 2);
    // Console output: table report, or the raw JSON for any other format value.
    console.log(format === "table" ? formatSensitivityReport(result) : serialized);
    // Persist the result as JSON regardless of the console format.
    const outPath = options.outputPath ??
        join(rootDir, "results", "latest", "grader-sensitivity.json");
    mkdirSync(join(outPath, ".."), { recursive: true });
    writeFileSync(outPath, serialized);
    console.log(`\n 📄 Results written to ${outPath}`);
    return result;
}
@@ -0,0 +1,94 @@
1
+ /**
2
+ * pipeline/grader-sensitivity.ts
3
+ *
4
+ * Pure computation module for measuring grader sensitivity (discrimination power).
5
+ *
6
+ * Takes paired grading results — original (good) and degraded (bad) versions
7
+ * of the same response graded by the same grader — and computes:
8
+ * - Concordance rate: % of pairs where the grader ranked the original higher
9
+ * - Score separation: average score difference between good and bad
10
+ * - Per-dimension sensitivity
11
+ *
12
+ * This module has NO side effects — no file I/O, no API calls.
13
+ *
14
+ * @see docs/exec-plans/completed/grader-reliability.md — Phase 4
15
+ */
16
+ /** Sensitivity metrics for one degradation type (pairs are grouped by their degradation description) */
17
+ export interface DegradationSensitivity {
18
+ /** Mean of (original score - degraded score) over this degradation's pairs, rounded to 1 decimal */
19
+ avgSeparation: number;
20
+ /** Percentage (0-100) of this degradation's pairs where the original scored strictly higher */
21
+ concordanceRate: number;
22
+ /** Description of the degradation; also the key the pairs were grouped by */
23
+ description: string;
24
+ /** Number of pairs using this degradation */
25
+ pairCount: number;
26
+ /** Which dimension this degradation targeted (taken from the first pair in the group) */
27
+ targetDimension: "codeCorrectness" | "docCoverage" | "taskCompletion";
28
+ }
29
+ /** Sensitivity metrics for one subset of pairs; every field is 0 when the subset is empty */
30
+ export interface DimensionSensitivity {
31
+ /** Mean of (original - degraded) score, rounded to 1 decimal; negative when degraded versions score higher on average */
32
+ avgSeparation: number;
33
+ /** % of pairs (0-100, 1 decimal) where the grader ranked the original strictly higher; ties do not count */
34
+ concordanceRate: number;
35
+ /** Number of paired comparisons */
36
+ pairCount: number;
37
+ /** % of pairs (0-100, 1 decimal) where scores were exactly equal */
38
+ tiedRate: number;
39
+ }
40
+ /** Full sensitivity analysis result, as returned by analyzeSensitivity() */
41
+ export interface GraderSensitivityResult {
42
+ /** Average score separation (original - degraded) across all pairs */
43
+ avgSeparation: number;
44
+ /** Per-degradation-type sensitivity, sorted by concordance rate ascending (worst discrimination first) */
45
+ byDegradation: DegradationSensitivity[];
46
+ /** Overall concordance rate (% of pairs where the original scored strictly higher; ties count as not concordant) */
47
+ concordanceRate: number;
48
+ /** Cross-dimension sensitivity: does degradation in one dimension affect the scores of others? */
49
+ crossDimension: {
50
+ /** Metrics over pairs whose graded dimension equals the degradation's targetDimension */
51
+ onTarget: DimensionSensitivity;
52
+ /** Metrics over pairs whose graded dimension differs from targetDimension (does the score still drop?) */
53
+ offTarget: DimensionSensitivity;
54
+ };
55
+ /** Pairs where the grader gave a HIGHER score to the degraded version, largest reversal first */
56
+ failedPairs: SensitivityPair[];
57
+ /** When this analysis was generated (ISO 8601 string from Date.toISOString()) */
58
+ generatedAt: string;
59
+ /** Grader model used (the ID passed to analyzeSensitivity()) */
60
+ graderModel: string;
61
+ /** Sensitivity metrics split by the dimension that was graded (not the dimension that was targeted) */
62
+ perDimension: {
63
+ taskCompletion: DimensionSensitivity;
64
+ codeCorrectness: DimensionSensitivity;
65
+ docCoverage: DimensionSensitivity;
66
+ };
67
+ /** Total paired comparisons analyzed */
68
+ totalPairs: number;
69
+ }
70
+ /** A single paired comparison: the original and the degraded response, both graded on one dimension */
71
+ export interface SensitivityPair {
72
+ /** Feature area (e.g., "groq") */
73
+ area: string;
74
+ /** What degradation was applied; pairs with the same description are grouped together in the per-degradation breakdown */
75
+ degradationDescription: string;
76
+ /** Score assigned to the degraded version (0–100) */
77
+ degradedScore: number;
78
+ /** Which dimension this judgment measures (the dimension that was graded) */
79
+ dimension: "codeCorrectness" | "docCoverage" | "taskCompletion";
80
+ /** Score assigned to the original version (0–100) */
81
+ originalScore: number;
82
+ /** Source file of the reference solution */
83
+ sourcePath: string;
84
+ /** Which dimension the degradation targeted; the pair is "on-target" when this equals `dimension` */
85
+ targetDimension: "codeCorrectness" | "docCoverage" | "taskCompletion";
86
+ }
87
+ /**
88
+ * Analyze sensitivity from a set of paired grading results.
89
+ * An empty `pairs` array yields a result with all metrics zero and no failed pairs.
90
+ * @param pairs Array of paired comparisons (original vs. degraded); the array itself is not modified
91
+ * @param graderModel Grader model ID, copied into the result for the report
92
+ * @returns Full sensitivity analysis with overall, per-dimension, on-/off-target and per-degradation metrics
93
+ */
94
+ export declare function analyzeSensitivity(pairs: SensitivityPair[], graderModel: string): GraderSensitivityResult;
@@ -0,0 +1,144 @@
1
+ /**
2
+ * pipeline/grader-sensitivity.ts
3
+ *
4
+ * Pure computation module for measuring grader sensitivity (discrimination power).
5
+ *
6
+ * Takes paired grading results — original (good) and degraded (bad) versions
7
+ * of the same response graded by the same grader — and computes:
8
+ * - Concordance rate: % of pairs where the grader ranked the original higher
9
+ * - Score separation: average score difference between good and bad
10
+ * - Per-dimension sensitivity
11
+ *
12
+ * This module has NO side effects — no file I/O, no API calls.
13
+ *
14
+ * @see docs/exec-plans/completed/grader-reliability.md — Phase 4
15
+ */
16
+ // ---------------------------------------------------------------------------
17
+ // Pure computation
18
+ // ---------------------------------------------------------------------------
19
/**
 * Compute the full sensitivity analysis for a set of graded pairs.
 *
 * Metrics are computed over all pairs, per graded dimension, for on-target
 * vs. off-target pairs, and per degradation description. Pairs where the
 * degraded version scored higher are returned as `failedPairs`, largest
 * reversal first. The input array is not modified.
 *
 * @param pairs       Array of paired comparisons (original vs. degraded)
 * @param graderModel Grader model ID for the report
 * @returns Full sensitivity analysis; an all-zero result when `pairs` is empty
 */
export function analyzeSensitivity(pairs, graderModel) {
    if (pairs.length === 0) {
        return emptyResult(graderModel);
    }
    const metricsWhere = (predicate) => computeMetrics(pairs.filter(predicate));
    // Overall concordance and separation
    const overall = computeMetrics(pairs);
    // Split by the dimension that was graded, not the one that was targeted
    const perDimension = {
        codeCorrectness: metricsWhere((p) => p.dimension === "codeCorrectness"),
        docCoverage: metricsWhere((p) => p.dimension === "docCoverage"),
        taskCompletion: metricsWhere((p) => p.dimension === "taskCompletion"),
    };
    // On-target: graded dimension is the one the degradation aimed at
    const crossDimension = {
        offTarget: metricsWhere((p) => p.dimension !== p.targetDimension),
        onTarget: metricsWhere((p) => p.dimension === p.targetDimension),
    };
    const byDegradation = computeByDegradation(pairs);
    // Reversed pairs; filter() copies, so sorting does not touch the input
    const reversalSize = (p) => p.degradedScore - p.originalScore;
    const failedPairs = pairs
        .filter((p) => p.degradedScore > p.originalScore)
        .sort((a, b) => reversalSize(b) - reversalSize(a));
    return {
        avgSeparation: overall.avgSeparation,
        byDegradation,
        concordanceRate: overall.concordanceRate,
        crossDimension,
        failedPairs,
        generatedAt: new Date().toISOString(),
        graderModel,
        perDimension,
        totalPairs: pairs.length,
    };
}
67
+ // ---------------------------------------------------------------------------
68
+ // Internal helpers
69
+ // ---------------------------------------------------------------------------
70
/**
 * Bucket pairs by `degradationDescription` and compute metrics per bucket.
 *
 * Each bucket's `targetDimension` is taken from its first pair. The returned
 * list is sorted by concordance rate, lowest (worst discrimination) first.
 */
function computeByDegradation(pairs) {
    const byDescription = new Map();
    for (const pair of pairs) {
        const bucket = byDescription.get(pair.degradationDescription);
        if (bucket) {
            bucket.push(pair);
        }
        else {
            byDescription.set(pair.degradationDescription, [pair]);
        }
    }
    const summaries = Array.from(byDescription, ([description, members]) => {
        const { avgSeparation, concordanceRate, pairCount } = computeMetrics(members);
        return {
            avgSeparation,
            concordanceRate,
            description,
            pairCount,
            targetDimension: members[0].targetDimension,
        };
    });
    return summaries.sort((left, right) => left.concordanceRate - right.concordanceRate);
}
94
/**
 * Compute average separation, concordance rate and tied rate for a set of pairs.
 *
 * A pair is concordant when the original scored strictly higher than the
 * degraded version, and tied when both scores are equal. Rates are
 * percentages of the set size. An empty set yields all zeros.
 */
function computeMetrics(pairs) {
    const pairCount = pairs.length;
    if (pairCount === 0) {
        return { avgSeparation: 0, concordanceRate: 0, pairCount: 0, tiedRate: 0 };
    }
    let totalSeparation = 0;
    let concordant = 0;
    let tied = 0;
    for (const { originalScore, degradedScore } of pairs) {
        totalSeparation += originalScore - degradedScore;
        if (originalScore > degradedScore) {
            concordant += 1;
        }
        else if (originalScore === degradedScore) {
            tied += 1;
        }
    }
    const asPercent = (count) => round((count / pairCount) * 100);
    return {
        avgSeparation: round(totalSeparation / pairCount),
        concordanceRate: asPercent(concordant),
        pairCount,
        tiedRate: asPercent(tied),
    };
}
117
/**
 * Build the all-zero result returned when there are no pairs to analyze.
 *
 * Every metrics slot gets its own object. Sharing one object between the
 * slots would let a caller that mutates e.g. `perDimension.docCoverage`
 * silently change the other four as well.
 *
 * @param graderModel Grader model ID to record in the result
 * @returns Result with zeroed metrics, empty lists and a fresh timestamp
 */
function emptyResult(graderModel) {
    const zeroMetrics = () => ({
        avgSeparation: 0,
        concordanceRate: 0,
        pairCount: 0,
        tiedRate: 0,
    });
    return {
        avgSeparation: 0,
        byDegradation: [],
        concordanceRate: 0,
        crossDimension: { offTarget: zeroMetrics(), onTarget: zeroMetrics() },
        failedPairs: [],
        generatedAt: new Date().toISOString(),
        graderModel,
        perDimension: {
            codeCorrectness: zeroMetrics(),
            docCoverage: zeroMetrics(),
            taskCompletion: zeroMetrics(),
        },
        totalPairs: 0,
    };
}
141
/** Round a number to one decimal place (scale by 10, Math.round, scale back). */
function round(n) {
    const tenths = Math.round(n * 10);
    return tenths / 10;
}
@@ -0,0 +1,38 @@
1
+ /**
2
+ * pipeline/grader-validate-runner.ts
3
+ *
4
+ * Orchestration module for grader validation against human reference grades
5
+ * (Phase 2).
6
+ *
7
+ * Loads human-graded reference samples from canonical/grader-references/,
8
+ * runs the grader model on each sample, and compares against human scores
9
+ * using `validateGrader()` from the pure computation module.
10
+ *
11
+ * Migrated from lib/grader-validate.ts — no process.argv, no process.exit(),
12
+ * no module-level constants. Accepts rootDir as parameter.
13
+ *
14
+ * @see docs/exec-plans/completed/grader-reliability.md — Phase 2
15
+ */
16
+ import { type GraderValidation } from "./grader-validation.js";
17
+ export interface GraderValidateRunnerOptions {
18
+ /** Grader model to validate; when omitted, defaults to loadGraderModel(rootDir).id */
19
+ graderModel?: string;
20
+ /** MAE threshold for pass/fail (default: 10). NOTE(review): MAE is presumably the mean absolute error vs. the human scores — confirm in grader-validation */
21
+ maeThreshold?: number;
22
+ /** Root directory of the eval package (required; this module reads no module-level constants or process.argv) */
23
+ rootDir: string;
24
+ }
25
+ /**
26
+ * Format a GraderValidation result as a human-readable table report.
27
+ * Returns the report as a string — does NOT print to console; printing is left to the caller.
28
+ */
29
+ export declare function formatValidationReport(result: GraderValidation): string;
30
+ /**
31
+ * Run grader validation against human reference grades.
32
+ *
33
+ * Loads human-graded samples, grades each with the grader model,
34
+ * and computes validation metrics (MAE, correlation, bias).
35
+ * Resolves to the resulting GraderValidation.
36
+ * @throws Error if no reference grades found, or no grades to analyze
37
+ */
38
+ export declare function runGraderValidate(options: GraderValidateRunnerOptions): Promise<GraderValidation>;