@sanity/ailf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (530) hide show
  1. package/README.md +89 -0
  2. package/bin/ailf.js +64 -0
  3. package/canonical/grader-references/README.md +88 -0
  4. package/canonical/grader-references/groq.yaml +234 -0
  5. package/canonical/grader-references/studio-setup.yaml +275 -0
  6. package/canonical/reference-solutions/.gitkeep +1 -0
  7. package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
  8. package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
  9. package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
  10. package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
  11. package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
  12. package/canonical/reference-solutions/groq/joins-references.ts +300 -0
  13. package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
  14. package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
  15. package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
  16. package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
  17. package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
  18. package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
  19. package/config/bigquery/README.md +74 -0
  20. package/config/bigquery/views/area_scores.sql +87 -0
  21. package/config/bigquery/views/reports.sql +49 -0
  22. package/config/features.yaml +116 -0
  23. package/config/models.yaml +115 -0
  24. package/config/prompts.yaml +75 -0
  25. package/config/rubrics.yaml +62 -0
  26. package/config/schedules.yaml +43 -0
  27. package/config/sinks.yaml +54 -0
  28. package/config/sources.yaml +51 -0
  29. package/config/thresholds.yaml +49 -0
  30. package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
  31. package/dist/_vendor/ailf-core/examples/index.js +285 -0
  32. package/dist/_vendor/ailf-core/index.d.ts +17 -0
  33. package/dist/_vendor/ailf-core/index.js +17 -0
  34. package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
  35. package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
  36. package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
  37. package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
  38. package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
  39. package/dist/_vendor/ailf-core/ports/context.js +14 -0
  40. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
  41. package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
  42. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
  43. package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
  44. package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
  45. package/dist/_vendor/ailf-core/ports/index.js +7 -0
  46. package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
  47. package/dist/_vendor/ailf-core/ports/logger.js +11 -0
  48. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
  49. package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
  50. package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
  51. package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
  52. package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
  53. package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
  54. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
  55. package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
  56. package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
  57. package/dist/_vendor/ailf-core/schemas/index.js +16 -0
  58. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
  59. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
  60. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
  61. package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
  62. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
  63. package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
  64. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
  65. package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
  66. package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
  67. package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
  68. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
  69. package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
  70. package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
  71. package/dist/_vendor/ailf-core/services/index.js +12 -0
  72. package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
  73. package/dist/_vendor/ailf-core/services/scoring.js +222 -0
  74. package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
  75. package/dist/_vendor/ailf-core/types/index.js +21 -0
  76. package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
  77. package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
  78. package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
  79. package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
  80. package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
  81. package/dist/_vendor/ailf-shared/document-ref.js +1 -0
  82. package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
  83. package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
  84. package/dist/_vendor/ailf-shared/index.d.ts +16 -0
  85. package/dist/_vendor/ailf-shared/index.js +16 -0
  86. package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
  87. package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
  88. package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
  89. package/dist/_vendor/ailf-shared/score-grades.js +23 -0
  90. package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
  91. package/dist/adapters/cache/content-lake-cache.js +59 -0
  92. package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
  93. package/dist/adapters/cache/filesystem-cache.js +54 -0
  94. package/dist/adapters/cache/index.d.ts +2 -0
  95. package/dist/adapters/cache/index.js +2 -0
  96. package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
  97. package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
  98. package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
  99. package/dist/adapters/config-sources/file-config-adapter.js +96 -0
  100. package/dist/adapters/config-sources/index.d.ts +2 -0
  101. package/dist/adapters/config-sources/index.js +2 -0
  102. package/dist/adapters/doc-fetchers/index.d.ts +1 -0
  103. package/dist/adapters/doc-fetchers/index.js +1 -0
  104. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
  105. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
  106. package/dist/adapters/eval-runners/index.d.ts +1 -0
  107. package/dist/adapters/eval-runners/index.js +1 -0
  108. package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
  109. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
  110. package/dist/adapters/index.d.ts +12 -0
  111. package/dist/adapters/index.js +12 -0
  112. package/dist/adapters/loggers/console-logger.d.ts +22 -0
  113. package/dist/adapters/loggers/console-logger.js +54 -0
  114. package/dist/adapters/loggers/index.d.ts +9 -0
  115. package/dist/adapters/loggers/index.js +9 -0
  116. package/dist/adapters/loggers/json-logger.d.ts +18 -0
  117. package/dist/adapters/loggers/json-logger.js +33 -0
  118. package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
  119. package/dist/adapters/loggers/quiet-logger.js +30 -0
  120. package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
  121. package/dist/adapters/task-sources/composite-task-source.js +59 -0
  122. package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
  123. package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
  124. package/dist/adapters/task-sources/index.d.ts +7 -0
  125. package/dist/adapters/task-sources/index.js +7 -0
  126. package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
  127. package/dist/adapters/task-sources/repo-schemas.js +234 -0
  128. package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
  129. package/dist/adapters/task-sources/repo-task-source.js +104 -0
  130. package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
  131. package/dist/adapters/task-sources/repo-trigger.js +153 -0
  132. package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
  133. package/dist/adapters/task-sources/repo-validation.js +164 -0
  134. package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
  135. package/dist/adapters/task-sources/yaml-task-source.js +136 -0
  136. package/dist/agent-observer/agentic-provider.d.ts +132 -0
  137. package/dist/agent-observer/agentic-provider.js +983 -0
  138. package/dist/agent-observer/classifier.d.ts +62 -0
  139. package/dist/agent-observer/classifier.js +269 -0
  140. package/dist/agent-observer/index.d.ts +7 -0
  141. package/dist/agent-observer/index.js +4 -0
  142. package/dist/agent-observer/pricing.d.ts +35 -0
  143. package/dist/agent-observer/pricing.js +82 -0
  144. package/dist/agent-observer/provider.d.ts +77 -0
  145. package/dist/agent-observer/provider.js +151 -0
  146. package/dist/agent-observer/proxy.d.ts +91 -0
  147. package/dist/agent-observer/proxy.js +321 -0
  148. package/dist/agent-observer/test-imports.d.ts +7 -0
  149. package/dist/agent-observer/test-imports.js +185 -0
  150. package/dist/agent-observer/types.d.ts +137 -0
  151. package/dist/agent-observer/types.js +16 -0
  152. package/dist/assertions/source-isolation.d.ts +72 -0
  153. package/dist/assertions/source-isolation.js +117 -0
  154. package/dist/cli.d.ts +24 -0
  155. package/dist/cli.js +199 -0
  156. package/dist/commands/agent-report.d.ts +5 -0
  157. package/dist/commands/agent-report.js +69 -0
  158. package/dist/commands/baseline.d.ts +9 -0
  159. package/dist/commands/baseline.js +141 -0
  160. package/dist/commands/cache.d.ts +13 -0
  161. package/dist/commands/cache.js +135 -0
  162. package/dist/commands/calculate-scores.d.ts +8 -0
  163. package/dist/commands/calculate-scores.js +48 -0
  164. package/dist/commands/compare.d.ts +8 -0
  165. package/dist/commands/compare.js +120 -0
  166. package/dist/commands/completion.d.ts +18 -0
  167. package/dist/commands/completion.js +260 -0
  168. package/dist/commands/coverage-audit.d.ts +7 -0
  169. package/dist/commands/coverage-audit.js +40 -0
  170. package/dist/commands/discovery-report.d.ts +10 -0
  171. package/dist/commands/discovery-report.js +44 -0
  172. package/dist/commands/eval.d.ts +9 -0
  173. package/dist/commands/eval.js +35 -0
  174. package/dist/commands/explain-handler.d.ts +34 -0
  175. package/dist/commands/explain-handler.js +719 -0
  176. package/dist/commands/fetch-docs.d.ts +8 -0
  177. package/dist/commands/fetch-docs.js +128 -0
  178. package/dist/commands/generate-configs.d.ts +8 -0
  179. package/dist/commands/generate-configs.js +46 -0
  180. package/dist/commands/grader/index.d.ts +11 -0
  181. package/dist/commands/grader/index.js +118 -0
  182. package/dist/commands/init.d.ts +19 -0
  183. package/dist/commands/init.js +150 -0
  184. package/dist/commands/interactive.d.ts +12 -0
  185. package/dist/commands/interactive.js +238 -0
  186. package/dist/commands/lookup-doc.d.ts +15 -0
  187. package/dist/commands/lookup-doc.js +84 -0
  188. package/dist/commands/measure-retrieval.d.ts +5 -0
  189. package/dist/commands/measure-retrieval.js +65 -0
  190. package/dist/commands/pipeline-action.d.ts +71 -0
  191. package/dist/commands/pipeline-action.js +305 -0
  192. package/dist/commands/pipeline.d.ts +62 -0
  193. package/dist/commands/pipeline.js +53 -0
  194. package/dist/commands/pr-comment.d.ts +8 -0
  195. package/dist/commands/pr-comment.js +47 -0
  196. package/dist/commands/publish.d.ts +26 -0
  197. package/dist/commands/publish.js +253 -0
  198. package/dist/commands/readiness-report.d.ts +10 -0
  199. package/dist/commands/readiness-report.js +104 -0
  200. package/dist/commands/shared/options.d.ts +29 -0
  201. package/dist/commands/shared/options.js +57 -0
  202. package/dist/commands/update-quality-scores.d.ts +5 -0
  203. package/dist/commands/update-quality-scores.js +20 -0
  204. package/dist/commands/validate-tasks.d.ts +16 -0
  205. package/dist/commands/validate-tasks.js +93 -0
  206. package/dist/commands/validate.d.ts +9 -0
  207. package/dist/commands/validate.js +73 -0
  208. package/dist/commands/webhook-server.d.ts +5 -0
  209. package/dist/commands/webhook-server.js +30 -0
  210. package/dist/commands/weekly-digest.d.ts +10 -0
  211. package/dist/commands/weekly-digest.js +104 -0
  212. package/dist/composition-root.d.ts +26 -0
  213. package/dist/composition-root.js +107 -0
  214. package/dist/interpolate.d.ts +26 -0
  215. package/dist/interpolate.js +70 -0
  216. package/dist/job-store.d.ts +104 -0
  217. package/dist/job-store.js +188 -0
  218. package/dist/lib/agent-behavior-report.d.ts +8 -0
  219. package/dist/lib/agent-behavior-report.js +185 -0
  220. package/dist/lib/baseline.d.ts +19 -0
  221. package/dist/lib/baseline.js +153 -0
  222. package/dist/lib/calculate-scores.d.ts +23 -0
  223. package/dist/lib/calculate-scores.js +42 -0
  224. package/dist/lib/compare.d.ts +18 -0
  225. package/dist/lib/compare.js +170 -0
  226. package/dist/lib/coverage-audit.d.ts +4 -0
  227. package/dist/lib/coverage-audit.js +42 -0
  228. package/dist/lib/discovery-report.d.ts +13 -0
  229. package/dist/lib/discovery-report.js +57 -0
  230. package/dist/lib/fetch-docs.d.ts +30 -0
  231. package/dist/lib/fetch-docs.js +171 -0
  232. package/dist/lib/generate-configs.d.ts +25 -0
  233. package/dist/lib/generate-configs.js +42 -0
  234. package/dist/lib/grader-api.d.ts +21 -0
  235. package/dist/lib/grader-api.js +34 -0
  236. package/dist/lib/grader-compare.d.ts +19 -0
  237. package/dist/lib/grader-compare.js +91 -0
  238. package/dist/lib/grader-consistency.d.ts +27 -0
  239. package/dist/lib/grader-consistency.js +79 -0
  240. package/dist/lib/grader-sensitivity.d.ts +19 -0
  241. package/dist/lib/grader-sensitivity.js +75 -0
  242. package/dist/lib/grader-validate.d.ts +19 -0
  243. package/dist/lib/grader-validate.js +78 -0
  244. package/dist/lib/measure-retrieval.d.ts +14 -0
  245. package/dist/lib/measure-retrieval.js +71 -0
  246. package/dist/lib/pr-comment.d.ts +16 -0
  247. package/dist/lib/pr-comment.js +28 -0
  248. package/dist/lib/readiness-report.d.ts +13 -0
  249. package/dist/lib/readiness-report.js +108 -0
  250. package/dist/lib/webhook-server.d.ts +11 -0
  251. package/dist/lib/webhook-server.js +24 -0
  252. package/dist/lib/weekly-digest.d.ts +24 -0
  253. package/dist/lib/weekly-digest.js +148 -0
  254. package/dist/orchestration/build-app-context.d.ts +27 -0
  255. package/dist/orchestration/build-app-context.js +81 -0
  256. package/dist/orchestration/build-step-sequence.d.ts +15 -0
  257. package/dist/orchestration/build-step-sequence.js +84 -0
  258. package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
  259. package/dist/orchestration/config-to-source-overrides.js +28 -0
  260. package/dist/orchestration/env-bridge.d.ts +21 -0
  261. package/dist/orchestration/env-bridge.js +66 -0
  262. package/dist/orchestration/index.d.ts +11 -0
  263. package/dist/orchestration/index.js +11 -0
  264. package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
  265. package/dist/orchestration/pipeline-orchestrator.js +153 -0
  266. package/dist/orchestration/step-runner.d.ts +20 -0
  267. package/dist/orchestration/step-runner.js +88 -0
  268. package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
  269. package/dist/orchestration/steps/calculate-scores-step.js +95 -0
  270. package/dist/orchestration/steps/callback-step.d.ts +24 -0
  271. package/dist/orchestration/steps/callback-step.js +76 -0
  272. package/dist/orchestration/steps/compare-step.d.ts +14 -0
  273. package/dist/orchestration/steps/compare-step.js +92 -0
  274. package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
  275. package/dist/orchestration/steps/discovery-report-step.js +55 -0
  276. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  277. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  278. package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
  279. package/dist/orchestration/steps/fetch-docs-step.js +135 -0
  280. package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
  281. package/dist/orchestration/steps/gap-analysis-step.js +136 -0
  282. package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
  283. package/dist/orchestration/steps/generate-configs-step.js +85 -0
  284. package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
  285. package/dist/orchestration/steps/grader-consistency-step.js +64 -0
  286. package/dist/orchestration/steps/index.d.ts +19 -0
  287. package/dist/orchestration/steps/index.js +19 -0
  288. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
  289. package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
  290. package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
  291. package/dist/orchestration/steps/publish-report-step.js +216 -0
  292. package/dist/orchestration/steps/readiness-step.d.ts +13 -0
  293. package/dist/orchestration/steps/readiness-step.js +91 -0
  294. package/dist/orchestration/steps/report-step.d.ts +12 -0
  295. package/dist/orchestration/steps/report-step.js +49 -0
  296. package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
  297. package/dist/orchestration/steps/run-eval-step.js +195 -0
  298. package/dist/orchestration/steps/validate-step.d.ts +12 -0
  299. package/dist/orchestration/steps/validate-step.js +41 -0
  300. package/dist/pipeline/agent-behavior-report.d.ts +53 -0
  301. package/dist/pipeline/agent-behavior-report.js +132 -0
  302. package/dist/pipeline/attribution.d.ts +47 -0
  303. package/dist/pipeline/attribution.js +226 -0
  304. package/dist/pipeline/baseline.d.ts +37 -0
  305. package/dist/pipeline/baseline.js +141 -0
  306. package/dist/pipeline/cache.d.ts +101 -0
  307. package/dist/pipeline/cache.js +283 -0
  308. package/dist/pipeline/calculate-scores.d.ts +102 -0
  309. package/dist/pipeline/calculate-scores.js +1128 -0
  310. package/dist/pipeline/callback-delivery.d.ts +50 -0
  311. package/dist/pipeline/callback-delivery.js +89 -0
  312. package/dist/pipeline/checks.d.ts +39 -0
  313. package/dist/pipeline/checks.js +280 -0
  314. package/dist/pipeline/classify-url.d.ts +61 -0
  315. package/dist/pipeline/classify-url.js +93 -0
  316. package/dist/pipeline/compare.d.ts +31 -0
  317. package/dist/pipeline/compare.js +208 -0
  318. package/dist/pipeline/coverage-audit.d.ts +39 -0
  319. package/dist/pipeline/coverage-audit.js +165 -0
  320. package/dist/pipeline/degradations.d.ts +85 -0
  321. package/dist/pipeline/degradations.js +242 -0
  322. package/dist/pipeline/discovery-report.d.ts +55 -0
  323. package/dist/pipeline/discovery-report.js +178 -0
  324. package/dist/pipeline/eval-constants.d.ts +68 -0
  325. package/dist/pipeline/eval-constants.js +111 -0
  326. package/dist/pipeline/eval-fingerprint.d.ts +66 -0
  327. package/dist/pipeline/eval-fingerprint.js +175 -0
  328. package/dist/pipeline/expand-tasks.d.ts +220 -0
  329. package/dist/pipeline/expand-tasks.js +421 -0
  330. package/dist/pipeline/failure-modes.d.ts +46 -0
  331. package/dist/pipeline/failure-modes.js +348 -0
  332. package/dist/pipeline/fetch-url-content.d.ts +44 -0
  333. package/dist/pipeline/fetch-url-content.js +93 -0
  334. package/dist/pipeline/gap-analysis.d.ts +48 -0
  335. package/dist/pipeline/gap-analysis.js +231 -0
  336. package/dist/pipeline/generate-configs.d.ts +72 -0
  337. package/dist/pipeline/generate-configs.js +395 -0
  338. package/dist/pipeline/grader-api.d.ts +49 -0
  339. package/dist/pipeline/grader-api.js +200 -0
  340. package/dist/pipeline/grader-compare-runner.d.ts +44 -0
  341. package/dist/pipeline/grader-compare-runner.js +301 -0
  342. package/dist/pipeline/grader-comparison.d.ts +111 -0
  343. package/dist/pipeline/grader-comparison.js +161 -0
  344. package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
  345. package/dist/pipeline/grader-consistency-runner.js +270 -0
  346. package/dist/pipeline/grader-consistency.d.ts +103 -0
  347. package/dist/pipeline/grader-consistency.js +146 -0
  348. package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
  349. package/dist/pipeline/grader-sensitivity-runner.js +282 -0
  350. package/dist/pipeline/grader-sensitivity.d.ts +94 -0
  351. package/dist/pipeline/grader-sensitivity.js +144 -0
  352. package/dist/pipeline/grader-validate-runner.d.ts +38 -0
  353. package/dist/pipeline/grader-validate-runner.js +229 -0
  354. package/dist/pipeline/grader-validation.d.ts +107 -0
  355. package/dist/pipeline/grader-validation.js +169 -0
  356. package/dist/pipeline/map-request-to-config.d.ts +19 -0
  357. package/dist/pipeline/map-request-to-config.js +80 -0
  358. package/dist/pipeline/measure-retrieval.d.ts +59 -0
  359. package/dist/pipeline/measure-retrieval.js +111 -0
  360. package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
  361. package/dist/pipeline/mirror-repo-tasks.js +350 -0
  362. package/dist/pipeline/plan-format.d.ts +33 -0
  363. package/dist/pipeline/plan-format.js +202 -0
  364. package/dist/pipeline/plan.d.ts +169 -0
  365. package/dist/pipeline/plan.js +708 -0
  366. package/dist/pipeline/pr-comment.d.ts +19 -0
  367. package/dist/pipeline/pr-comment.js +502 -0
  368. package/dist/pipeline/probe.d.ts +52 -0
  369. package/dist/pipeline/probe.js +390 -0
  370. package/dist/pipeline/provenance.d.ts +47 -0
  371. package/dist/pipeline/provenance.js +146 -0
  372. package/dist/pipeline/readiness-report.d.ts +87 -0
  373. package/dist/pipeline/readiness-report.js +205 -0
  374. package/dist/pipeline/release-classification.d.ts +54 -0
  375. package/dist/pipeline/release-classification.js +238 -0
  376. package/dist/pipeline/release-report.d.ts +37 -0
  377. package/dist/pipeline/release-report.js +222 -0
  378. package/dist/pipeline/repo-eval-comment.d.ts +37 -0
  379. package/dist/pipeline/repo-eval-comment.js +165 -0
  380. package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
  381. package/dist/pipeline/repo-threshold-evaluator.js +162 -0
  382. package/dist/pipeline/resolve-mappings.d.ts +35 -0
  383. package/dist/pipeline/resolve-mappings.js +72 -0
  384. package/dist/pipeline/retrieval-metrics.d.ts +39 -0
  385. package/dist/pipeline/retrieval-metrics.js +136 -0
  386. package/dist/pipeline/reverse-mapping.d.ts +67 -0
  387. package/dist/pipeline/reverse-mapping.js +88 -0
  388. package/dist/pipeline/schemas.d.ts +9 -0
  389. package/dist/pipeline/schemas.js +9 -0
  390. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  391. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  392. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  393. package/dist/pipeline/steps/compare-step.js +90 -0
  394. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  395. package/dist/pipeline/steps/eval-step.js +347 -0
  396. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  397. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  398. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  399. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  400. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  401. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  402. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  403. package/dist/pipeline/steps/publish-report-step.js +243 -0
  404. package/dist/pipeline/steps/report-step.d.ts +13 -0
  405. package/dist/pipeline/steps/report-step.js +56 -0
  406. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  407. package/dist/pipeline/steps/update-scores-step.js +42 -0
  408. package/dist/pipeline/targeted-loo.d.ts +88 -0
  409. package/dist/pipeline/targeted-loo.js +203 -0
  410. package/dist/pipeline/thresholds.d.ts +27 -0
  411. package/dist/pipeline/thresholds.js +245 -0
  412. package/dist/pipeline/types.d.ts +10 -0
  413. package/dist/pipeline/types.js +10 -0
  414. package/dist/pipeline/validate.d.ts +67 -0
  415. package/dist/pipeline/validate.js +406 -0
  416. package/dist/pipeline/webhook-server.d.ts +37 -0
  417. package/dist/pipeline/webhook-server.js +133 -0
  418. package/dist/report-store.d.ts +84 -0
  419. package/dist/report-store.js +208 -0
  420. package/dist/sanity/client.d.ts +38 -0
  421. package/dist/sanity/client.js +86 -0
  422. package/dist/sanity/portable-text.d.ts +11 -0
  423. package/dist/sanity/portable-text.js +211 -0
  424. package/dist/sanity/queries.d.ts +133 -0
  425. package/dist/sanity/queries.js +300 -0
  426. package/dist/schedules/digest.d.ts +116 -0
  427. package/dist/schedules/digest.js +156 -0
  428. package/dist/schedules/index.d.ts +12 -0
  429. package/dist/schedules/index.js +10 -0
  430. package/dist/schedules/loader.d.ts +31 -0
  431. package/dist/schedules/loader.js +73 -0
  432. package/dist/schedules/schema.d.ts +9 -0
  433. package/dist/schedules/schema.js +9 -0
  434. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  435. package/dist/scripts/agent-behavior-report.js +315 -0
  436. package/dist/scripts/baseline.d.ts +43 -0
  437. package/dist/scripts/baseline.js +267 -0
  438. package/dist/scripts/calculate-scores.d.ts +166 -0
  439. package/dist/scripts/calculate-scores.js +1296 -0
  440. package/dist/scripts/compare.d.ts +22 -0
  441. package/dist/scripts/compare.js +334 -0
  442. package/dist/scripts/coverage-audit.d.ts +44 -0
  443. package/dist/scripts/coverage-audit.js +209 -0
  444. package/dist/scripts/debug-eval.d.ts +19 -0
  445. package/dist/scripts/debug-eval.js +73 -0
  446. package/dist/scripts/discovery-report.d.ts +58 -0
  447. package/dist/scripts/discovery-report.js +250 -0
  448. package/dist/scripts/fetch-docs.d.ts +35 -0
  449. package/dist/scripts/fetch-docs.js +472 -0
  450. package/dist/scripts/generate-configs.d.ts +66 -0
  451. package/dist/scripts/generate-configs.js +459 -0
  452. package/dist/scripts/grader-api.d.ts +27 -0
  453. package/dist/scripts/grader-api.js +206 -0
  454. package/dist/scripts/grader-compare.d.ts +22 -0
  455. package/dist/scripts/grader-compare.js +368 -0
  456. package/dist/scripts/grader-consistency.d.ts +20 -0
  457. package/dist/scripts/grader-consistency.js +313 -0
  458. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  459. package/dist/scripts/grader-sensitivity.js +354 -0
  460. package/dist/scripts/grader-validate.d.ts +19 -0
  461. package/dist/scripts/grader-validate.js +267 -0
  462. package/dist/scripts/measure-retrieval.d.ts +10 -0
  463. package/dist/scripts/measure-retrieval.js +145 -0
  464. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
  465. package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
  466. package/dist/scripts/pipeline.d.ts +76 -0
  467. package/dist/scripts/pipeline.js +1031 -0
  468. package/dist/scripts/pr-comment.d.ts +10 -0
  469. package/dist/scripts/pr-comment.js +510 -0
  470. package/dist/scripts/readiness-report.d.ts +88 -0
  471. package/dist/scripts/readiness-report.js +342 -0
  472. package/dist/scripts/update-quality-scores.d.ts +15 -0
  473. package/dist/scripts/update-quality-scores.js +184 -0
  474. package/dist/scripts/validate-task-sources.d.ts +21 -0
  475. package/dist/scripts/validate-task-sources.js +210 -0
  476. package/dist/scripts/validate.d.ts +13 -0
  477. package/dist/scripts/validate.js +79 -0
  478. package/dist/scripts/webhook-server.d.ts +26 -0
  479. package/dist/scripts/webhook-server.js +147 -0
  480. package/dist/scripts/weekly-digest.d.ts +24 -0
  481. package/dist/scripts/weekly-digest.js +144 -0
  482. package/dist/sinks/bigquery/index.d.ts +131 -0
  483. package/dist/sinks/bigquery/index.js +222 -0
  484. package/dist/sinks/format-slack.d.ts +64 -0
  485. package/dist/sinks/format-slack.js +306 -0
  486. package/dist/sinks/index.d.ts +23 -0
  487. package/dist/sinks/index.js +18 -0
  488. package/dist/sinks/loader.d.ts +18 -0
  489. package/dist/sinks/loader.js +82 -0
  490. package/dist/sinks/retry.d.ts +24 -0
  491. package/dist/sinks/retry.js +52 -0
  492. package/dist/sinks/schema.d.ts +9 -0
  493. package/dist/sinks/schema.js +9 -0
  494. package/dist/sinks/slack/format.d.ts +65 -0
  495. package/dist/sinks/slack/format.js +327 -0
  496. package/dist/sinks/slack/index.d.ts +27 -0
  497. package/dist/sinks/slack/index.js +78 -0
  498. package/dist/sinks/slack-sink.d.ts +27 -0
  499. package/dist/sinks/slack-sink.js +78 -0
  500. package/dist/sinks/types.d.ts +59 -0
  501. package/dist/sinks/types.js +44 -0
  502. package/dist/sinks/webhook/index.d.ts +19 -0
  503. package/dist/sinks/webhook/index.js +50 -0
  504. package/dist/sinks/webhook-sink.d.ts +19 -0
  505. package/dist/sinks/webhook-sink.js +50 -0
  506. package/dist/sources.d.ts +104 -0
  507. package/dist/sources.js +292 -0
  508. package/dist/webhook/budget.d.ts +42 -0
  509. package/dist/webhook/budget.js +60 -0
  510. package/dist/webhook/debounce.d.ts +67 -0
  511. package/dist/webhook/debounce.js +76 -0
  512. package/dist/webhook/dispatch.d.ts +45 -0
  513. package/dist/webhook/dispatch.js +84 -0
  514. package/dist/webhook/eval-request-handler.d.ts +87 -0
  515. package/dist/webhook/eval-request-handler.js +181 -0
  516. package/dist/webhook/handler.d.ts +88 -0
  517. package/dist/webhook/handler.js +203 -0
  518. package/dist/webhook/index.d.ts +17 -0
  519. package/dist/webhook/index.js +12 -0
  520. package/dist/webhook/types.d.ts +109 -0
  521. package/dist/webhook/types.js +10 -0
  522. package/package.json +72 -0
  523. package/tasks/.expanded.agentic.yaml +51 -0
  524. package/tasks/.expanded.yaml +66 -0
  525. package/tasks/frameworks.yaml +98 -0
  526. package/tasks/functions.yaml +51 -0
  527. package/tasks/groq.yaml +216 -0
  528. package/tasks/nextjs-live.yaml +62 -0
  529. package/tasks/studio-setup.yaml +111 -0
  530. package/tasks/visual-editing.yaml +120 -0
@@ -0,0 +1,53 @@
1
+ /**
2
+ * pipeline/agent-behavior-report.ts
3
+ *
4
+ * Pure analysis functions for agent behavior observation reports.
5
+ * No I/O, no process.env, no process.argv — all data is passed in.
6
+ */
7
+ import type { AgentBehaviorSummary } from "../agent-observer/types.js";
8
+ export interface PromptfooResults { // Flat shape: the TestResult list sits directly under `results`.
9
+ results: TestResult[];
10
+ }
11
+ export interface PromptfooResultsEnvelope { // Nested shape: the TestResult list sits one level deeper, under `results.results`.
12
+ results: {
13
+ results: TestResult[];
14
+ };
15
+ }
16
+ export interface TestResult { // Element type of the array that analyzeResults() accepts.
17
+ description: string; // presumably the text handed to detectFeatureArea() — TODO confirm against analyzeResults()
18
+ metadata?: Record<string, unknown>; // optional; values are untyped and the expected keys are not declared here
19
+ response: {
20
+ output: string;
21
+ };
22
+ vars: Record<string, string>;
23
+ }
24
+ export interface TaskBehavior { // Pairs one task's description and feature with its AgentBehaviorSummary.
25
+ behavior: AgentBehaviorSummary; // type imported from ../agent-observer/types.js
26
+ description: string;
27
+ feature: string; // presumably a detectFeatureArea() result (a CANONICAL_DOC_MAP key or "other") — TODO confirm
28
+ hasDocs: boolean; // NOTE(review): exact meaning is not visible in this declaration — confirm against analyzeResults()
29
+ }
30
+ export interface FeatureAnalysis { // Per-feature entry: a feature name, its TaskBehavior list, and summary fields.
31
+ allDocSlugs: string[];
32
+ allExternalDomains: string[];
33
+ allSearchQueries: string[];
34
+ avgDocPages: number; // presumably averaged over `tasks` — TODO confirm how the avg* fields are computed
35
+ avgNetworkMs: number;
36
+ avgSearches: number;
37
+ canonicalCoverage: number; // NOTE(review): scale (fraction vs. percent) is not visible here — confirm
38
+ canonicalSlugs: string[]; // presumably the CANONICAL_DOC_MAP entry for `feature` — TODO confirm
39
+ feature: string;
40
+ tasks: TaskBehavior[];
41
+ }
42
+ export interface AnalysisResult { // Return type of analyzeResults().
43
+ features: FeatureAnalysis[];
44
+ hasData: boolean; // NOTE(review): the condition that sets this is not visible here — confirm against analyzeResults()
45
+ tasks: TaskBehavior[];
46
+ }
47
+ export declare const CANONICAL_DOC_MAP: Record<string, string[]>; // String key -> list of strings; the implementation keys it by feature area with doc slugs as values.
48
+ export declare function detectFeatureArea(description: string): string; // Returns a feature-area name for a description; the implementation returns "other" when no keyword matches.
49
+ /**
50
+ * Analyze pre-parsed test results for agent behavior patterns.
51
+ * Caller is responsible for reading/parsing the file — this function is pure.
+ *
+ * @param results - Test results that the caller has already parsed.
+ * @returns An AnalysisResult (its `features`, `hasData` and `tasks` fields).
52
+ */
53
+ export declare function analyzeResults(results: TestResult[]): AnalysisResult;
@@ -0,0 +1,132 @@
1
+ // ---------------------------------------------------------------------------
2
+ // Canonical doc mapping
3
+ // ---------------------------------------------------------------------------
4
/**
 * Canonical doc mapping: feature-area name -> expected doc slugs.
 * Lists the docs a well-informed agent *should* visit for tasks in each area.
 */
export const CANONICAL_DOC_MAP = {
    frameworks: ["remix", "nuxt", "svelte", "astro", "gatsby", "client-libraries"],
    functions: ["functions", "webhooks", "groq-powered-webhooks", "event-driven", "automations"],
    "nextjs-live": ["next-js", "live-content-api", "content-source-maps", "app-router", "groq", "client-libraries"],
    "studio-setup": ["studio", "schema-types", "structure-builder", "configuration", "plugins"],
    "visual-editing": ["visual-editing", "presentation", "preview", "overlays", "loaders"],
};
45
+ // ---------------------------------------------------------------------------
46
+ // Feature area detection
47
+ // ---------------------------------------------------------------------------
48
/**
 * Map a task description to a feature-area label using case-insensitive
 * keyword matching.
 *
 * Rules are checked in a fixed order and the first match wins, so a
 * description mentioning both "studio" and "visual" is labelled
 * "studio-setup".
 *
 * @param description - Free-text task description
 * @returns One of "studio-setup", "visual-editing", "functions",
 *   "nextjs-live", "frameworks", or "other" when nothing matches or the
 *   description is not a string
 */
export function detectFeatureArea(description) {
    // A result without a usable description cannot be classified; treat it
    // as unmatched instead of throwing on `.toLowerCase()`.
    if (typeof description !== "string") {
        return "other";
    }
    const desc = description.toLowerCase();
    const mentions = (...keywords) => keywords.some((keyword) => desc.includes(keyword));
    if (mentions("studio")) {
        return "studio-setup";
    }
    if (mentions("visual", "presentation", "live preview")) {
        return "visual-editing";
    }
    if (mentions("function", "webhook")) {
        return "functions";
    }
    if (mentions("next", "app router")) {
        return "nextjs-live";
    }
    if (mentions("remix", "nuxt", "svelte")) {
        return "frameworks";
    }
    return "other";
}
66
+ // ---------------------------------------------------------------------------
67
+ // Analysis
68
+ // ---------------------------------------------------------------------------
69
/**
 * Analyze pre-parsed test results for agent behavior patterns.
 * Caller is responsible for reading/parsing the file — this function is pure.
 *
 * Results without `metadata.agentBehaviorSummary` are skipped. The remaining
 * results are labelled with a feature area (via `detectFeatureArea`), grouped
 * by that label, and aggregated per group.
 *
 * @param results - Test results to analyze
 * @returns `{ features, hasData, tasks }`. `features` is sorted by feature
 *   name. When no result carries a behavior summary, `hasData` is false and
 *   both arrays are empty.
 */
export function analyzeResults(results) {
    const tasks = [];
    for (const result of results) {
        const behavior = result.metadata?.agentBehaviorSummary;
        if (!behavior) {
            continue;
        }
        // `vars` comes from parsed JSON: it may be absent, and `docs` may not
        // be a string. Either case counts as "no docs" rather than throwing.
        const docs = result.vars?.docs;
        tasks.push({
            behavior,
            description: result.description,
            feature: detectFeatureArea(result.description),
            hasDocs: typeof docs === "string" && docs.trim().length > 0,
        });
    }
    if (tasks.length === 0) {
        return { features: [], hasData: false, tasks: [] };
    }
    // Group by feature
    const byFeature = new Map();
    for (const task of tasks) {
        const group = byFeature.get(task.feature);
        if (group) {
            group.push(task);
        }
        else {
            byFeature.set(task.feature, [task]);
        }
    }
    const unique = (group, pick) => [...new Set(group.flatMap(pick))];
    // Groups are never empty, so dividing by the length is safe
    const average = (group, pick) => group.reduce((sum, task) => sum + pick(task), 0) / group.length;
    const features = [...byFeature]
        .map(([feature, featureTasks]) => {
            const allDocSlugs = unique(featureTasks, (t) => t.behavior.docSlugsVisited);
            const canonicalSlugs = CANONICAL_DOC_MAP[feature] || [];
            // Substring match: a canonical slug counts as covered when any
            // visited slug contains it
            const matchedCanonical = canonicalSlugs.filter((slug) => allDocSlugs.some((visited) => visited.includes(slug)));
            return {
                allDocSlugs,
                allExternalDomains: unique(featureTasks, (t) => t.behavior.externalDomains),
                allSearchQueries: unique(featureTasks, (t) => t.behavior.uniqueSearchQueries),
                avgDocPages: average(featureTasks, (t) => t.behavior.docPagesVisited),
                avgNetworkMs: average(featureTasks, (t) => t.behavior.totalNetworkMs),
                avgSearches: average(featureTasks, (t) => t.behavior.searchesPerformed),
                canonicalCoverage: canonicalSlugs.length > 0
                    ? matchedCanonical.length / canonicalSlugs.length
                    : 0,
                canonicalSlugs,
                feature,
                tasks: featureTasks,
            };
        })
        .sort((a, b) => a.feature.localeCompare(b.feature));
    return { features, hasData: true, tasks };
}
@@ -0,0 +1,47 @@
1
+ /**
2
+ * pipeline/attribution.ts
3
+ *
4
+ * Per-document attribution for score changes.
5
+ *
6
+ * Given a ComparisonReport, a list of changed document slugs, and the
7
+ * resolved task mappings, this module classifies each task's score delta
8
+ * as unambiguous (one changed doc), ambiguous (multiple changed docs),
9
+ * or uncorrelated (no changed docs in the task's canonical set).
10
+ *
11
+ * This is Phase 1 of the hybrid attribution approach — zero additional
12
+ * eval cost, purely correlation-based. Phase 2 (targeted LOO) can be
13
+ * layered on top for ambiguous cases when higher precision is needed.
14
+ *
15
+ * @see docs/design-docs/scenario-matrix/per-document-attribution.md
16
+ * @see docs/exec-plans/completed/scenario-matrix-implementation/phase-2-impact-scenarios.md
17
+ */
18
+ import type { AttributionReport, ComparisonReport } from "./types.js";
19
+ import type { ResolvedMappings } from "./resolve-mappings.js";
20
/**
 * Attribute score changes to individual documents.
 *
 * For each area in the comparison report, cross-references the task's
 * canonical_docs with the changed slugs to classify the attribution:
 *
 * - **unambiguous**: exactly 1 changed doc in the task's canonical set
 * - **ambiguous**: 2+ changed docs in the task's canonical set
 * - **uncorrelated**: 0 changed docs in the task's canonical set
 *
 * Tasks with deltas within the noise floor are still attributed but
 * flagged as `withinNoiseFloor: true`.
 *
 * The delta recorded for a task is the delta of its area: every task in an
 * area carries the same value.
 *
 * @param comparison - The structured comparison between before/after runs
 * @param changedSlugs - Document slugs that changed
 * @param mappings - Resolved task-to-canonical-docs mappings
 * @param noiseThreshold - Deltas within ±this value are marked as noise
 * @returns Attribution report with per-task classifications
 */
export declare function attributeChanges(comparison: ComparisonReport, changedSlugs: string[], mappings: ResolvedMappings, noiseThreshold: number): AttributionReport;
40
/**
 * Format an attribution report for console output.
 *
 * @param report - Attribution report to render
 * @returns Newline-joined plain text with one section per classification
 *   plus a list of untracked documents; empty sections are omitted
 */
export declare function formatAttributionConsole(report: AttributionReport): string;
44
/**
 * Format an attribution report as markdown for PR comments.
 *
 * @param report - Attribution report to render
 * @returns Newline-joined markdown: a heading, a summary line, an
 *   attribution table, and a list of untracked documents
 */
export declare function formatAttributionMarkdown(report: AttributionReport): string;
@@ -0,0 +1,226 @@
1
+ /**
2
+ * pipeline/attribution.ts
3
+ *
4
+ * Per-document attribution for score changes.
5
+ *
6
+ * Given a ComparisonReport, a list of changed document slugs, and the
7
+ * resolved task mappings, this module classifies each task's score delta
8
+ * as unambiguous (one changed doc), ambiguous (multiple changed docs),
9
+ * or uncorrelated (no changed docs in the task's canonical set).
10
+ *
11
+ * This is Phase 1 of the hybrid attribution approach — zero additional
12
+ * eval cost, purely correlation-based. Phase 2 (targeted LOO) can be
13
+ * layered on top for ambiguous cases when higher precision is needed.
14
+ *
15
+ * @see docs/design-docs/scenario-matrix/per-document-attribution.md
16
+ * @see docs/exec-plans/completed/scenario-matrix-implementation/phase-2-impact-scenarios.md
17
+ */
18
+ // ---------------------------------------------------------------------------
19
+ // Public API
20
+ // ---------------------------------------------------------------------------
21
/**
 * Attribute score changes to individual documents.
 *
 * For each area in the comparison report, cross-references the task's
 * canonical_docs with the changed slugs to classify the attribution:
 *
 * - **unambiguous**: exactly 1 changed doc in the task's canonical set
 * - **ambiguous**: 2+ changed docs in the task's canonical set
 * - **uncorrelated**: 0 changed docs in the task's canonical set
 *
 * Tasks with deltas within the noise floor are still attributed but
 * flagged as `withinNoiseFloor: true`.
 *
 * The delta recorded for a task is the delta of its area: every task in an
 * area carries the same value. Areas that have no tasks in the mappings are
 * skipped.
 *
 * @param comparison - The structured comparison between before/after runs
 * @param changedSlugs - Document slugs that changed (duplicates are tolerated)
 * @param mappings - Resolved task-to-canonical-docs mappings
 * @param noiseThreshold - Deltas within ±this value are marked as noise
 * @returns Attribution report with per-task classifications, summary counts,
 *   and the sorted, de-duplicated changed slugs that no task tracks
 */
export function attributeChanges(comparison, changedSlugs, mappings, noiseThreshold) {
    const changedSet = new Set(changedSlugs);
    // Flatten the mappings: task id -> { area, slugs }. A task id that occurs
    // more than once keeps its first position and takes the last definition.
    const taskCanonicalDocs = new Map();
    for (const [area, config] of Object.entries(mappings.feature_areas)) {
        for (const task of config.tasks) {
            taskCanonicalDocs.set(task.id, {
                area,
                slugs: task.canonical_docs.map((d) => d.slug),
            });
        }
    }
    // Index tasks by area once so each area lookup below is a single Map hit
    const tasksByArea = new Map();
    for (const entry of taskCanonicalDocs) {
        const area = entry[1].area;
        const bucket = tasksByArea.get(area);
        if (bucket) {
            bucket.push(entry);
        }
        else {
            tasksByArea.set(area, [entry]);
        }
    }
    const attributions = [];
    for (const areaDelta of comparison.areas) {
        const areaTasks = tasksByArea.get(areaDelta.area);
        if (!areaTasks) {
            // Area exists in comparison but has no tasks in mappings
            continue;
        }
        const withinNoiseFloor = Math.abs(areaDelta.delta) <= noiseThreshold;
        for (const [taskId, taskInfo] of areaTasks) {
            const attributedDocs = taskInfo.slugs.filter((s) => changedSet.has(s));
            attributions.push({
                area: taskInfo.area,
                attributedDocs,
                classification: classifyAttribution(attributedDocs.length),
                delta: areaDelta.delta,
                taskId,
                withinNoiseFloor,
            });
        }
    }
    // Untracked documents: changed slugs not in ANY task's canonical docs.
    // Iterating the Set reports a slug once even if the caller repeated it.
    const allTrackedSlugs = new Set([...taskCanonicalDocs.values()].flatMap((info) => info.slugs));
    const untrackedDocs = [...changedSet]
        .filter((s) => !allTrackedSlugs.has(s))
        .sort();
    // Summary counts in a single pass; the classification strings match the
    // summary keys
    const summary = { ambiguous: 0, unambiguous: 0, uncorrelated: 0, withinNoise: 0 };
    for (const attribution of attributions) {
        summary[attribution.classification] += 1;
        if (attribution.withinNoiseFloor) {
            summary.withinNoise += 1;
        }
    }
    return {
        attributions,
        summary,
        untrackedDocs,
    };
}
104
+ // ---------------------------------------------------------------------------
105
+ // Formatting
106
+ // ---------------------------------------------------------------------------
107
/**
 * Render an attribution report as plain text for the console.
 *
 * Emits a title followed by one section per classification (unambiguous,
 * ambiguous, uncorrelated) and a list of untracked documents. Empty sections
 * are left out, and uncorrelated tasks inside the noise floor are not listed.
 *
 * @param report - Attribution report to render
 * @returns The rendered lines joined with newlines
 */
export function formatAttributionConsole(report) {
    const signed = (delta) => {
        const rounded = Math.round(delta);
        return delta > 0 ? `+${rounded}` : String(rounded);
    };
    const noiseSuffix = (entry) => (entry.withinNoiseFloor ? " ⚠️ within noise" : "");
    const out = ["📋 PER-DOCUMENT ATTRIBUTION", ""];
    // Append a titled section, or nothing at all when it has no rows
    const addSection = (heading, rows) => {
        if (rows.length === 0) {
            return;
        }
        out.push(heading, ...rows, "");
    };
    const unambiguous = report.attributions.filter((a) => a.classification === "unambiguous");
    addSection(`Unambiguous (${unambiguous.length} tasks):`, unambiguous.map((a) => ` ${a.taskId}: ${signed(a.delta)} → ${a.attributedDocs[0]}${noiseSuffix(a)}`));
    const ambiguous = report.attributions.filter((a) => a.classification === "ambiguous");
    addSection(`Ambiguous (${ambiguous.length} tasks):`, ambiguous.map((a) => ` ${a.taskId}: ${signed(a.delta)} → {${a.attributedDocs.join(", ")}}${noiseSuffix(a)}`));
    const uncorrelated = report.attributions.filter((a) => a.classification === "uncorrelated" && !a.withinNoiseFloor);
    addSection(`Uncorrelated (${uncorrelated.length} tasks):`, uncorrelated.map((a) => ` ${a.taskId}: ${signed(a.delta)} → likely grader variance (no changed docs in canonical set)`));
    addSection("Untracked documents:", report.untrackedDocs.map((doc) => ` ${doc} — not in any task's canonical_docs`));
    return out.join("\n");
}
156
/**
 * Render an attribution report as markdown for PR comments.
 *
 * Output is a heading, a bold summary line built from the non-zero summary
 * counts, an attribution table (only when at least one delta is non-zero),
 * and a bullet list of untracked documents. A report with neither
 * attributions nor untracked documents yields a short placeholder message.
 *
 * @param report - Attribution report to render
 * @returns The rendered lines joined with newlines
 */
export function formatAttributionMarkdown(report) {
    const md = ["### 📋 Per-Document Attribution", ""];
    if (report.attributions.length === 0 && report.untrackedDocs.length === 0) {
        md.push("No attribution data available.", "");
        return md.join("\n");
    }
    // Summary
    const counts = report.summary;
    const headline = [
        [counts.unambiguous, "unambiguous"],
        [counts.ambiguous, "ambiguous"],
        [counts.uncorrelated, "uncorrelated"],
        [counts.withinNoise, "within noise"],
    ]
        .filter(([count]) => count > 0)
        .map(([count, label]) => `${count} ${label}`);
    if (headline.length > 0) {
        md.push(`**${headline.join(" · ")}**`, "");
    }
    // Attribution table
    const icons = new Map([
        ["unambiguous", "✅"],
        ["ambiguous", "🟡"],
    ]);
    if (report.attributions.some((a) => a.delta !== 0)) {
        md.push("| Task | Area | Delta | Attribution | Documents |");
        md.push("|------|------|-------|-------------|-----------|");
        for (const entry of report.attributions) {
            // Rows with no movement and no matching docs carry no information
            if (entry.delta === 0 && entry.classification === "uncorrelated") {
                continue;
            }
            const rounded = Math.round(entry.delta);
            const deltaCell = (entry.delta > 0 ? `+${rounded}` : String(rounded)) +
                (entry.withinNoiseFloor ? " ⚠️" : "");
            const icon = icons.get(entry.classification) ?? "❓";
            const docsCell = entry.attributedDocs.length > 0
                ? entry.attributedDocs.map((slug) => `\`${slug}\``).join(", ")
                : "—";
            md.push(`| ${entry.taskId} | ${entry.area} | ${deltaCell} | ${icon} ${entry.classification} | ${docsCell} |`);
        }
        md.push("");
    }
    // Untracked documents
    if (report.untrackedDocs.length > 0) {
        md.push("**Untracked documents** (not in any task's canonical_docs):", "");
        md.push(...report.untrackedDocs.map((slug) => `- \`${slug}\``));
        md.push("");
    }
    return md.join("\n");
}
216
+ // ---------------------------------------------------------------------------
217
+ // Helpers
218
+ // ---------------------------------------------------------------------------
219
/**
 * Turn a count of matching changed documents into a classification label:
 * 0 is "uncorrelated", 1 is "unambiguous", anything else is "ambiguous".
 */
function classifyAttribution(matchCount) {
    switch (matchCount) {
        case 0:
            return "uncorrelated";
        case 1:
            return "unambiguous";
        default:
            return "ambiguous";
    }
}
@@ -0,0 +1,37 @@
1
+ /**
2
+ * pipeline/baseline.ts
3
+ *
4
+ * Filesystem-backed helpers for managing historical baseline snapshots of
5
+ * evaluation scores. All functions accept `rootDir` as their first parameter
6
+ * and perform no process.env / process.argv access.
7
+ */
8
/** Summary of one saved baseline file, as returned by `listBaselines`. */
export interface BaselineMetadata {
    /** Number of entries in the baseline's `scores` array. */
    areaCount: number;
    /** The baseline's `overall.avgScore`, rounded to the nearest integer. */
    avgScore: number;
    /** File name within `results/baselines`. */
    filename: string;
    /** `overall.cost.graderTotal` from the baseline, when present. */
    graderCost?: number;
    /** Tag stored in the baseline's `baselineMeta`, when present. */
    tag?: string;
    /** The baseline's `timestamp` field. */
    timestamp: string;
    /** `overall.cost.total` from the baseline, when present. */
    totalCost?: number;
}
17
/** Outcome of `compareBaseline`. */
export interface CompareResult {
    /** Per-feature comparisons, sorted by delta descending; set on success. */
    comparisons?: ScoreComparison[];
    /** Human-readable status or failure reason. */
    message: string;
    /** Difference between the rounded current and rounded baseline average scores; set on success. */
    overallDelta?: number;
    /** False when the current summary or the baseline could not be found. */
    success: boolean;
}
23
/** Baseline-versus-current comparison for a single feature. */
export interface ScoreComparison {
    /** Score in the baseline; 0 when the feature is absent from the baseline. */
    baseline: number;
    /** Baseline cost; omitted when not greater than 0. */
    costBaseline?: number;
    /** Current cost; omitted when not greater than 0. */
    costCurrent?: number;
    /** Current cost minus baseline cost; omitted when neither cost is greater than 0. */
    costDelta?: number;
    /** Current score; 0 when the feature is absent from the current summary. */
    current: number;
    /** `current` minus `baseline`. */
    delta: number;
    /** Feature name the comparison refers to. */
    feature: string;
}
32
/**
 * Compare `results/latest/score-summary.json` under `rootDir` with a saved
 * baseline from `results/baselines`.
 *
 * @param rootDir - Directory that contains the `results` folder
 * @param baselineFile - Baseline file name; defaults to the first entry
 *   returned by `listBaselines`
 * @returns A result with `success: false` and a message when the current
 *   summary, any baseline, or the requested baseline file is missing
 */
export declare function compareBaseline(rootDir: string, baselineFile?: string): CompareResult;
33
/**
 * List metadata for every `.json` file in `<rootDir>/results/baselines`,
 * ordered by file name descending. Returns an empty array when the
 * directory does not exist.
 */
export declare function listBaselines(rootDir: string): BaselineMetadata[];
34
/**
 * Copy the current `results/latest/score-summary.json` into
 * `results/baselines` as a new timestamped baseline file.
 *
 * @param rootDir - Directory that contains the `results` folder
 * @param tag - Optional label appended to the file name and stored in the
 *   baseline's metadata
 * @returns `success: false` with a message when no score summary exists
 */
export declare function saveBaseline(rootDir: string, tag?: string): {
    success: boolean;
    message: string;
};
@@ -0,0 +1,141 @@
1
+ /**
2
+ * pipeline/baseline.ts
3
+ *
4
+ * Filesystem-backed helpers for managing historical baseline snapshots of
5
+ * evaluation scores. All functions accept `rootDir` as their first parameter
6
+ * and perform no process.env / process.argv access.
7
+ */
8
+ import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
9
+ import { join } from "path";
10
+ // ---------------------------------------------------------------------------
11
+ // Compare
12
+ // ---------------------------------------------------------------------------
13
/**
 * Compare the current score summary against a saved baseline.
 *
 * Reads `results/latest/score-summary.json` and a baseline file from
 * `results/baselines` (both under `rootDir`) and produces one comparison per
 * feature. A feature present on only one side is compared against a score
 * of 0 on the other. Cost fields follow the same rule for every row: a cost
 * is reported only when greater than 0, and the cost delta only when at
 * least one side has a cost.
 *
 * @param rootDir - Directory that contains the `results` folder
 * @param baselineFile - Baseline file name; defaults to the first entry
 *   returned by `listBaselines`
 * @returns On success, the comparisons sorted by delta descending plus the
 *   overall delta (difference of the rounded average scores). Otherwise
 *   `success: false` with a message when the current summary, any baseline,
 *   or the requested baseline file is missing.
 */
export function compareBaseline(rootDir, baselineFile) {
    const baselinesDir = join(rootDir, "results", "baselines");
    const scoreSummaryPath = join(rootDir, "results", "latest", "score-summary.json");
    if (!existsSync(scoreSummaryPath)) {
        return {
            message: "No current score-summary.json found.",
            success: false,
        };
    }
    // Find baseline to compare against
    const baselines = listBaselines(rootDir);
    if (baselines.length === 0) {
        return {
            message: "No baselines saved yet. Run 'pnpm baseline:save' first.",
            success: false,
        };
    }
    const targetFile = baselineFile ?? baselines[0].filename;
    const baselinePath = join(baselinesDir, targetFile);
    if (!existsSync(baselinePath)) {
        return {
            message: `Baseline file not found: ${targetFile}`,
            success: false,
        };
    }
    const current = JSON.parse(readFileSync(scoreSummaryPath, "utf-8"));
    const baseline = JSON.parse(readFileSync(baselinePath, "utf-8"));
    // One builder for every row keeps the score and cost rules identical
    // for shared, new, and removed features.
    const toComparison = (feature, baseScore, currentScore, baseCost, currentCost) => ({
        baseline: baseScore,
        costBaseline: baseCost > 0 ? baseCost : undefined,
        costCurrent: currentCost > 0 ? currentCost : undefined,
        costDelta: currentCost > 0 || baseCost > 0 ? currentCost - baseCost : undefined,
        current: currentScore,
        delta: currentScore - baseScore,
        feature,
    });
    const baselineByFeature = new Map(baseline.scores.map((s) => [s.feature, s]));
    const comparisons = current.scores.map((s) => {
        const base = baselineByFeature.get(s.feature);
        return toComparison(s.feature, base?.totalScore ?? 0, s.totalScore, base?.totalCost ?? 0, s.totalCost ?? 0);
    });
    // Areas in the baseline but not in the current summary
    const currentFeatures = new Set(current.scores.map((s) => s.feature));
    for (const [feature, base] of baselineByFeature) {
        if (!currentFeatures.has(feature)) {
            comparisons.push(toComparison(feature, base.totalScore ?? 0, 0, base.totalCost ?? 0, 0));
        }
    }
    comparisons.sort((a, b) => b.delta - a.delta);
    const overallDelta = Math.round(current.overall.avgScore) - Math.round(baseline.overall.avgScore);
    return {
        comparisons,
        message: `Compared against ${targetFile}`,
        overallDelta,
        success: true,
    };
}
76
+ // ---------------------------------------------------------------------------
77
+ // List
78
+ // ---------------------------------------------------------------------------
79
/**
 * Read every `.json` file in `<rootDir>/results/baselines` and summarize it.
 *
 * @param rootDir - Directory that contains the `results` folder
 * @returns One metadata record per baseline file, ordered by file name
 *   descending (file names start with a timestamp, so newest first); an
 *   empty array when the directory does not exist
 */
export function listBaselines(rootDir) {
    const dir = join(rootDir, "results", "baselines");
    if (!existsSync(dir)) {
        return [];
    }
    // Descending code-unit order, i.e. the reverse of the default sort
    const newestFirst = readdirSync(dir)
        .filter((name) => name.endsWith(".json"))
        .sort((a, b) => (a < b ? 1 : a > b ? -1 : 0));
    const entries = [];
    for (const filename of newestFirst) {
        const parsed = JSON.parse(readFileSync(join(dir, filename), "utf-8"));
        const { baselineMeta, overall, scores, timestamp } = parsed;
        entries.push({
            areaCount: scores.length,
            avgScore: Math.round(overall.avgScore),
            filename,
            graderCost: overall.cost?.graderTotal,
            tag: baselineMeta?.tag,
            timestamp,
            totalCost: overall.cost?.total,
        });
    }
    return entries;
}
102
+ // ---------------------------------------------------------------------------
103
+ // Save
104
+ // ---------------------------------------------------------------------------
105
/**
 * Save the current score summary as a new baseline snapshot.
 *
 * Reads `results/latest/score-summary.json` under `rootDir` and writes a
 * copy, extended with a `baselineMeta` object, to `results/baselines`. The
 * summary is checked before anything is written, so an unusable summary
 * never becomes a baseline file.
 *
 * @param rootDir - Directory that contains the `results` folder
 * @param tag - Optional label. It is stored verbatim in `baselineMeta.tag`;
 *   for the file name it is lower-cased and every character other than
 *   a-z, 0-9 and "-" is replaced with "-".
 * @returns `success: false` with a message when the summary is missing or
 *   lacks `scores` / `overall`; otherwise `success: true` and a message
 *   naming the file that was written
 */
export function saveBaseline(rootDir, tag) {
    const baselinesDir = join(rootDir, "results", "baselines");
    const scoreSummaryPath = join(rootDir, "results", "latest", "score-summary.json");
    if (!existsSync(scoreSummaryPath)) {
        return {
            message: "No score-summary.json found. Run 'pnpm calculate-scores' first.",
            success: false,
        };
    }
    const raw = readFileSync(scoreSummaryPath, "utf-8");
    const summary = JSON.parse(raw);
    // Validate before writing: listing baselines reads `scores.length` and
    // `overall.avgScore` from every saved file, so a snapshot without them
    // would break all later listings and comparisons.
    const hasScores = Array.isArray(summary?.scores);
    const hasOverall = typeof summary?.overall === "object" && summary.overall !== null;
    if (!hasScores || !hasOverall) {
        return {
            message: "score-summary.json is missing 'scores' or 'overall'. Run 'pnpm calculate-scores' again.",
            success: false,
        };
    }
    mkdirSync(baselinesDir, { recursive: true });
    // Generate filename: YYYYMMDD_HH_mm_ss[_tag].json (UTC, from toISOString).
    // The format is kept as-is so new files sort correctly with existing ones.
    const now = new Date();
    const datePart = now
        .toISOString()
        .slice(0, 19)
        .replace(/[T:]/g, "_")
        .replace(/-/g, "");
    const tagPart = tag
        ? `_${tag.replace(/[^a-z0-9-]/gi, "-").toLowerCase()}`
        : "";
    const filename = `${datePart}${tagPart}.json`;
    const baseline = {
        ...summary,
        baselineMeta: {
            savedAt: now.toISOString(),
            // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string tag should be treated as no tag
            tag: tag || undefined,
        },
    };
    writeFileSync(join(baselinesDir, filename), JSON.stringify(baseline, null, 2));
    return {
        message: `Saved baseline to results/baselines/${filename} (avg: ${Math.round(summary.overall.avgScore)}, ${summary.scores.length} areas)`,
        success: true,
    };
}