@sanity/ailf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (530) hide show
  1. package/README.md +89 -0
  2. package/bin/ailf.js +64 -0
  3. package/canonical/grader-references/README.md +88 -0
  4. package/canonical/grader-references/groq.yaml +234 -0
  5. package/canonical/grader-references/studio-setup.yaml +275 -0
  6. package/canonical/reference-solutions/.gitkeep +1 -0
  7. package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
  8. package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
  9. package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
  10. package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
  11. package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
  12. package/canonical/reference-solutions/groq/joins-references.ts +300 -0
  13. package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
  14. package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
  15. package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
  16. package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
  17. package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
  18. package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
  19. package/config/bigquery/README.md +74 -0
  20. package/config/bigquery/views/area_scores.sql +87 -0
  21. package/config/bigquery/views/reports.sql +49 -0
  22. package/config/features.yaml +116 -0
  23. package/config/models.yaml +115 -0
  24. package/config/prompts.yaml +75 -0
  25. package/config/rubrics.yaml +62 -0
  26. package/config/schedules.yaml +43 -0
  27. package/config/sinks.yaml +54 -0
  28. package/config/sources.yaml +51 -0
  29. package/config/thresholds.yaml +49 -0
  30. package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
  31. package/dist/_vendor/ailf-core/examples/index.js +285 -0
  32. package/dist/_vendor/ailf-core/index.d.ts +17 -0
  33. package/dist/_vendor/ailf-core/index.js +17 -0
  34. package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
  35. package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
  36. package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
  37. package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
  38. package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
  39. package/dist/_vendor/ailf-core/ports/context.js +14 -0
  40. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
  41. package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
  42. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
  43. package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
  44. package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
  45. package/dist/_vendor/ailf-core/ports/index.js +7 -0
  46. package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
  47. package/dist/_vendor/ailf-core/ports/logger.js +11 -0
  48. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
  49. package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
  50. package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
  51. package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
  52. package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
  53. package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
  54. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
  55. package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
  56. package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
  57. package/dist/_vendor/ailf-core/schemas/index.js +16 -0
  58. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
  59. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
  60. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
  61. package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
  62. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
  63. package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
  64. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
  65. package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
  66. package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
  67. package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
  68. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
  69. package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
  70. package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
  71. package/dist/_vendor/ailf-core/services/index.js +12 -0
  72. package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
  73. package/dist/_vendor/ailf-core/services/scoring.js +222 -0
  74. package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
  75. package/dist/_vendor/ailf-core/types/index.js +21 -0
  76. package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
  77. package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
  78. package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
  79. package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
  80. package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
  81. package/dist/_vendor/ailf-shared/document-ref.js +1 -0
  82. package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
  83. package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
  84. package/dist/_vendor/ailf-shared/index.d.ts +16 -0
  85. package/dist/_vendor/ailf-shared/index.js +16 -0
  86. package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
  87. package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
  88. package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
  89. package/dist/_vendor/ailf-shared/score-grades.js +23 -0
  90. package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
  91. package/dist/adapters/cache/content-lake-cache.js +59 -0
  92. package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
  93. package/dist/adapters/cache/filesystem-cache.js +54 -0
  94. package/dist/adapters/cache/index.d.ts +2 -0
  95. package/dist/adapters/cache/index.js +2 -0
  96. package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
  97. package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
  98. package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
  99. package/dist/adapters/config-sources/file-config-adapter.js +96 -0
  100. package/dist/adapters/config-sources/index.d.ts +2 -0
  101. package/dist/adapters/config-sources/index.js +2 -0
  102. package/dist/adapters/doc-fetchers/index.d.ts +1 -0
  103. package/dist/adapters/doc-fetchers/index.js +1 -0
  104. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
  105. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
  106. package/dist/adapters/eval-runners/index.d.ts +1 -0
  107. package/dist/adapters/eval-runners/index.js +1 -0
  108. package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
  109. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
  110. package/dist/adapters/index.d.ts +12 -0
  111. package/dist/adapters/index.js +12 -0
  112. package/dist/adapters/loggers/console-logger.d.ts +22 -0
  113. package/dist/adapters/loggers/console-logger.js +54 -0
  114. package/dist/adapters/loggers/index.d.ts +9 -0
  115. package/dist/adapters/loggers/index.js +9 -0
  116. package/dist/adapters/loggers/json-logger.d.ts +18 -0
  117. package/dist/adapters/loggers/json-logger.js +33 -0
  118. package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
  119. package/dist/adapters/loggers/quiet-logger.js +30 -0
  120. package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
  121. package/dist/adapters/task-sources/composite-task-source.js +59 -0
  122. package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
  123. package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
  124. package/dist/adapters/task-sources/index.d.ts +7 -0
  125. package/dist/adapters/task-sources/index.js +7 -0
  126. package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
  127. package/dist/adapters/task-sources/repo-schemas.js +234 -0
  128. package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
  129. package/dist/adapters/task-sources/repo-task-source.js +104 -0
  130. package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
  131. package/dist/adapters/task-sources/repo-trigger.js +153 -0
  132. package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
  133. package/dist/adapters/task-sources/repo-validation.js +164 -0
  134. package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
  135. package/dist/adapters/task-sources/yaml-task-source.js +136 -0
  136. package/dist/agent-observer/agentic-provider.d.ts +132 -0
  137. package/dist/agent-observer/agentic-provider.js +983 -0
  138. package/dist/agent-observer/classifier.d.ts +62 -0
  139. package/dist/agent-observer/classifier.js +269 -0
  140. package/dist/agent-observer/index.d.ts +7 -0
  141. package/dist/agent-observer/index.js +4 -0
  142. package/dist/agent-observer/pricing.d.ts +35 -0
  143. package/dist/agent-observer/pricing.js +82 -0
  144. package/dist/agent-observer/provider.d.ts +77 -0
  145. package/dist/agent-observer/provider.js +151 -0
  146. package/dist/agent-observer/proxy.d.ts +91 -0
  147. package/dist/agent-observer/proxy.js +321 -0
  148. package/dist/agent-observer/test-imports.d.ts +7 -0
  149. package/dist/agent-observer/test-imports.js +185 -0
  150. package/dist/agent-observer/types.d.ts +137 -0
  151. package/dist/agent-observer/types.js +16 -0
  152. package/dist/assertions/source-isolation.d.ts +72 -0
  153. package/dist/assertions/source-isolation.js +117 -0
  154. package/dist/cli.d.ts +24 -0
  155. package/dist/cli.js +199 -0
  156. package/dist/commands/agent-report.d.ts +5 -0
  157. package/dist/commands/agent-report.js +69 -0
  158. package/dist/commands/baseline.d.ts +9 -0
  159. package/dist/commands/baseline.js +141 -0
  160. package/dist/commands/cache.d.ts +13 -0
  161. package/dist/commands/cache.js +135 -0
  162. package/dist/commands/calculate-scores.d.ts +8 -0
  163. package/dist/commands/calculate-scores.js +48 -0
  164. package/dist/commands/compare.d.ts +8 -0
  165. package/dist/commands/compare.js +120 -0
  166. package/dist/commands/completion.d.ts +18 -0
  167. package/dist/commands/completion.js +260 -0
  168. package/dist/commands/coverage-audit.d.ts +7 -0
  169. package/dist/commands/coverage-audit.js +40 -0
  170. package/dist/commands/discovery-report.d.ts +10 -0
  171. package/dist/commands/discovery-report.js +44 -0
  172. package/dist/commands/eval.d.ts +9 -0
  173. package/dist/commands/eval.js +35 -0
  174. package/dist/commands/explain-handler.d.ts +34 -0
  175. package/dist/commands/explain-handler.js +719 -0
  176. package/dist/commands/fetch-docs.d.ts +8 -0
  177. package/dist/commands/fetch-docs.js +128 -0
  178. package/dist/commands/generate-configs.d.ts +8 -0
  179. package/dist/commands/generate-configs.js +46 -0
  180. package/dist/commands/grader/index.d.ts +11 -0
  181. package/dist/commands/grader/index.js +118 -0
  182. package/dist/commands/init.d.ts +19 -0
  183. package/dist/commands/init.js +150 -0
  184. package/dist/commands/interactive.d.ts +12 -0
  185. package/dist/commands/interactive.js +238 -0
  186. package/dist/commands/lookup-doc.d.ts +15 -0
  187. package/dist/commands/lookup-doc.js +84 -0
  188. package/dist/commands/measure-retrieval.d.ts +5 -0
  189. package/dist/commands/measure-retrieval.js +65 -0
  190. package/dist/commands/pipeline-action.d.ts +71 -0
  191. package/dist/commands/pipeline-action.js +305 -0
  192. package/dist/commands/pipeline.d.ts +62 -0
  193. package/dist/commands/pipeline.js +53 -0
  194. package/dist/commands/pr-comment.d.ts +8 -0
  195. package/dist/commands/pr-comment.js +47 -0
  196. package/dist/commands/publish.d.ts +26 -0
  197. package/dist/commands/publish.js +253 -0
  198. package/dist/commands/readiness-report.d.ts +10 -0
  199. package/dist/commands/readiness-report.js +104 -0
  200. package/dist/commands/shared/options.d.ts +29 -0
  201. package/dist/commands/shared/options.js +57 -0
  202. package/dist/commands/update-quality-scores.d.ts +5 -0
  203. package/dist/commands/update-quality-scores.js +20 -0
  204. package/dist/commands/validate-tasks.d.ts +16 -0
  205. package/dist/commands/validate-tasks.js +93 -0
  206. package/dist/commands/validate.d.ts +9 -0
  207. package/dist/commands/validate.js +73 -0
  208. package/dist/commands/webhook-server.d.ts +5 -0
  209. package/dist/commands/webhook-server.js +30 -0
  210. package/dist/commands/weekly-digest.d.ts +10 -0
  211. package/dist/commands/weekly-digest.js +104 -0
  212. package/dist/composition-root.d.ts +26 -0
  213. package/dist/composition-root.js +107 -0
  214. package/dist/interpolate.d.ts +26 -0
  215. package/dist/interpolate.js +70 -0
  216. package/dist/job-store.d.ts +104 -0
  217. package/dist/job-store.js +188 -0
  218. package/dist/lib/agent-behavior-report.d.ts +8 -0
  219. package/dist/lib/agent-behavior-report.js +185 -0
  220. package/dist/lib/baseline.d.ts +19 -0
  221. package/dist/lib/baseline.js +153 -0
  222. package/dist/lib/calculate-scores.d.ts +23 -0
  223. package/dist/lib/calculate-scores.js +42 -0
  224. package/dist/lib/compare.d.ts +18 -0
  225. package/dist/lib/compare.js +170 -0
  226. package/dist/lib/coverage-audit.d.ts +4 -0
  227. package/dist/lib/coverage-audit.js +42 -0
  228. package/dist/lib/discovery-report.d.ts +13 -0
  229. package/dist/lib/discovery-report.js +57 -0
  230. package/dist/lib/fetch-docs.d.ts +30 -0
  231. package/dist/lib/fetch-docs.js +171 -0
  232. package/dist/lib/generate-configs.d.ts +25 -0
  233. package/dist/lib/generate-configs.js +42 -0
  234. package/dist/lib/grader-api.d.ts +21 -0
  235. package/dist/lib/grader-api.js +34 -0
  236. package/dist/lib/grader-compare.d.ts +19 -0
  237. package/dist/lib/grader-compare.js +91 -0
  238. package/dist/lib/grader-consistency.d.ts +27 -0
  239. package/dist/lib/grader-consistency.js +79 -0
  240. package/dist/lib/grader-sensitivity.d.ts +19 -0
  241. package/dist/lib/grader-sensitivity.js +75 -0
  242. package/dist/lib/grader-validate.d.ts +19 -0
  243. package/dist/lib/grader-validate.js +78 -0
  244. package/dist/lib/measure-retrieval.d.ts +14 -0
  245. package/dist/lib/measure-retrieval.js +71 -0
  246. package/dist/lib/pr-comment.d.ts +16 -0
  247. package/dist/lib/pr-comment.js +28 -0
  248. package/dist/lib/readiness-report.d.ts +13 -0
  249. package/dist/lib/readiness-report.js +108 -0
  250. package/dist/lib/webhook-server.d.ts +11 -0
  251. package/dist/lib/webhook-server.js +24 -0
  252. package/dist/lib/weekly-digest.d.ts +24 -0
  253. package/dist/lib/weekly-digest.js +148 -0
  254. package/dist/orchestration/build-app-context.d.ts +27 -0
  255. package/dist/orchestration/build-app-context.js +81 -0
  256. package/dist/orchestration/build-step-sequence.d.ts +15 -0
  257. package/dist/orchestration/build-step-sequence.js +84 -0
  258. package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
  259. package/dist/orchestration/config-to-source-overrides.js +28 -0
  260. package/dist/orchestration/env-bridge.d.ts +21 -0
  261. package/dist/orchestration/env-bridge.js +66 -0
  262. package/dist/orchestration/index.d.ts +11 -0
  263. package/dist/orchestration/index.js +11 -0
  264. package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
  265. package/dist/orchestration/pipeline-orchestrator.js +153 -0
  266. package/dist/orchestration/step-runner.d.ts +20 -0
  267. package/dist/orchestration/step-runner.js +88 -0
  268. package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
  269. package/dist/orchestration/steps/calculate-scores-step.js +95 -0
  270. package/dist/orchestration/steps/callback-step.d.ts +24 -0
  271. package/dist/orchestration/steps/callback-step.js +76 -0
  272. package/dist/orchestration/steps/compare-step.d.ts +14 -0
  273. package/dist/orchestration/steps/compare-step.js +92 -0
  274. package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
  275. package/dist/orchestration/steps/discovery-report-step.js +55 -0
  276. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  277. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  278. package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
  279. package/dist/orchestration/steps/fetch-docs-step.js +135 -0
  280. package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
  281. package/dist/orchestration/steps/gap-analysis-step.js +136 -0
  282. package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
  283. package/dist/orchestration/steps/generate-configs-step.js +85 -0
  284. package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
  285. package/dist/orchestration/steps/grader-consistency-step.js +64 -0
  286. package/dist/orchestration/steps/index.d.ts +19 -0
  287. package/dist/orchestration/steps/index.js +19 -0
  288. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
  289. package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
  290. package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
  291. package/dist/orchestration/steps/publish-report-step.js +216 -0
  292. package/dist/orchestration/steps/readiness-step.d.ts +13 -0
  293. package/dist/orchestration/steps/readiness-step.js +91 -0
  294. package/dist/orchestration/steps/report-step.d.ts +12 -0
  295. package/dist/orchestration/steps/report-step.js +49 -0
  296. package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
  297. package/dist/orchestration/steps/run-eval-step.js +195 -0
  298. package/dist/orchestration/steps/validate-step.d.ts +12 -0
  299. package/dist/orchestration/steps/validate-step.js +41 -0
  300. package/dist/pipeline/agent-behavior-report.d.ts +53 -0
  301. package/dist/pipeline/agent-behavior-report.js +132 -0
  302. package/dist/pipeline/attribution.d.ts +47 -0
  303. package/dist/pipeline/attribution.js +226 -0
  304. package/dist/pipeline/baseline.d.ts +37 -0
  305. package/dist/pipeline/baseline.js +141 -0
  306. package/dist/pipeline/cache.d.ts +101 -0
  307. package/dist/pipeline/cache.js +283 -0
  308. package/dist/pipeline/calculate-scores.d.ts +102 -0
  309. package/dist/pipeline/calculate-scores.js +1128 -0
  310. package/dist/pipeline/callback-delivery.d.ts +50 -0
  311. package/dist/pipeline/callback-delivery.js +89 -0
  312. package/dist/pipeline/checks.d.ts +39 -0
  313. package/dist/pipeline/checks.js +280 -0
  314. package/dist/pipeline/classify-url.d.ts +61 -0
  315. package/dist/pipeline/classify-url.js +93 -0
  316. package/dist/pipeline/compare.d.ts +31 -0
  317. package/dist/pipeline/compare.js +208 -0
  318. package/dist/pipeline/coverage-audit.d.ts +39 -0
  319. package/dist/pipeline/coverage-audit.js +165 -0
  320. package/dist/pipeline/degradations.d.ts +85 -0
  321. package/dist/pipeline/degradations.js +242 -0
  322. package/dist/pipeline/discovery-report.d.ts +55 -0
  323. package/dist/pipeline/discovery-report.js +178 -0
  324. package/dist/pipeline/eval-constants.d.ts +68 -0
  325. package/dist/pipeline/eval-constants.js +111 -0
  326. package/dist/pipeline/eval-fingerprint.d.ts +66 -0
  327. package/dist/pipeline/eval-fingerprint.js +175 -0
  328. package/dist/pipeline/expand-tasks.d.ts +220 -0
  329. package/dist/pipeline/expand-tasks.js +421 -0
  330. package/dist/pipeline/failure-modes.d.ts +46 -0
  331. package/dist/pipeline/failure-modes.js +348 -0
  332. package/dist/pipeline/fetch-url-content.d.ts +44 -0
  333. package/dist/pipeline/fetch-url-content.js +93 -0
  334. package/dist/pipeline/gap-analysis.d.ts +48 -0
  335. package/dist/pipeline/gap-analysis.js +231 -0
  336. package/dist/pipeline/generate-configs.d.ts +72 -0
  337. package/dist/pipeline/generate-configs.js +395 -0
  338. package/dist/pipeline/grader-api.d.ts +49 -0
  339. package/dist/pipeline/grader-api.js +200 -0
  340. package/dist/pipeline/grader-compare-runner.d.ts +44 -0
  341. package/dist/pipeline/grader-compare-runner.js +301 -0
  342. package/dist/pipeline/grader-comparison.d.ts +111 -0
  343. package/dist/pipeline/grader-comparison.js +161 -0
  344. package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
  345. package/dist/pipeline/grader-consistency-runner.js +270 -0
  346. package/dist/pipeline/grader-consistency.d.ts +103 -0
  347. package/dist/pipeline/grader-consistency.js +146 -0
  348. package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
  349. package/dist/pipeline/grader-sensitivity-runner.js +282 -0
  350. package/dist/pipeline/grader-sensitivity.d.ts +94 -0
  351. package/dist/pipeline/grader-sensitivity.js +144 -0
  352. package/dist/pipeline/grader-validate-runner.d.ts +38 -0
  353. package/dist/pipeline/grader-validate-runner.js +229 -0
  354. package/dist/pipeline/grader-validation.d.ts +107 -0
  355. package/dist/pipeline/grader-validation.js +169 -0
  356. package/dist/pipeline/map-request-to-config.d.ts +19 -0
  357. package/dist/pipeline/map-request-to-config.js +80 -0
  358. package/dist/pipeline/measure-retrieval.d.ts +59 -0
  359. package/dist/pipeline/measure-retrieval.js +111 -0
  360. package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
  361. package/dist/pipeline/mirror-repo-tasks.js +350 -0
  362. package/dist/pipeline/plan-format.d.ts +33 -0
  363. package/dist/pipeline/plan-format.js +202 -0
  364. package/dist/pipeline/plan.d.ts +169 -0
  365. package/dist/pipeline/plan.js +708 -0
  366. package/dist/pipeline/pr-comment.d.ts +19 -0
  367. package/dist/pipeline/pr-comment.js +502 -0
  368. package/dist/pipeline/probe.d.ts +52 -0
  369. package/dist/pipeline/probe.js +390 -0
  370. package/dist/pipeline/provenance.d.ts +47 -0
  371. package/dist/pipeline/provenance.js +146 -0
  372. package/dist/pipeline/readiness-report.d.ts +87 -0
  373. package/dist/pipeline/readiness-report.js +205 -0
  374. package/dist/pipeline/release-classification.d.ts +54 -0
  375. package/dist/pipeline/release-classification.js +238 -0
  376. package/dist/pipeline/release-report.d.ts +37 -0
  377. package/dist/pipeline/release-report.js +222 -0
  378. package/dist/pipeline/repo-eval-comment.d.ts +37 -0
  379. package/dist/pipeline/repo-eval-comment.js +165 -0
  380. package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
  381. package/dist/pipeline/repo-threshold-evaluator.js +162 -0
  382. package/dist/pipeline/resolve-mappings.d.ts +35 -0
  383. package/dist/pipeline/resolve-mappings.js +72 -0
  384. package/dist/pipeline/retrieval-metrics.d.ts +39 -0
  385. package/dist/pipeline/retrieval-metrics.js +136 -0
  386. package/dist/pipeline/reverse-mapping.d.ts +67 -0
  387. package/dist/pipeline/reverse-mapping.js +88 -0
  388. package/dist/pipeline/schemas.d.ts +9 -0
  389. package/dist/pipeline/schemas.js +9 -0
  390. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  391. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  392. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  393. package/dist/pipeline/steps/compare-step.js +90 -0
  394. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  395. package/dist/pipeline/steps/eval-step.js +347 -0
  396. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  397. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  398. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  399. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  400. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  401. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  402. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  403. package/dist/pipeline/steps/publish-report-step.js +243 -0
  404. package/dist/pipeline/steps/report-step.d.ts +13 -0
  405. package/dist/pipeline/steps/report-step.js +56 -0
  406. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  407. package/dist/pipeline/steps/update-scores-step.js +42 -0
  408. package/dist/pipeline/targeted-loo.d.ts +88 -0
  409. package/dist/pipeline/targeted-loo.js +203 -0
  410. package/dist/pipeline/thresholds.d.ts +27 -0
  411. package/dist/pipeline/thresholds.js +245 -0
  412. package/dist/pipeline/types.d.ts +10 -0
  413. package/dist/pipeline/types.js +10 -0
  414. package/dist/pipeline/validate.d.ts +67 -0
  415. package/dist/pipeline/validate.js +406 -0
  416. package/dist/pipeline/webhook-server.d.ts +37 -0
  417. package/dist/pipeline/webhook-server.js +133 -0
  418. package/dist/report-store.d.ts +84 -0
  419. package/dist/report-store.js +208 -0
  420. package/dist/sanity/client.d.ts +38 -0
  421. package/dist/sanity/client.js +86 -0
  422. package/dist/sanity/portable-text.d.ts +11 -0
  423. package/dist/sanity/portable-text.js +211 -0
  424. package/dist/sanity/queries.d.ts +133 -0
  425. package/dist/sanity/queries.js +300 -0
  426. package/dist/schedules/digest.d.ts +116 -0
  427. package/dist/schedules/digest.js +156 -0
  428. package/dist/schedules/index.d.ts +12 -0
  429. package/dist/schedules/index.js +10 -0
  430. package/dist/schedules/loader.d.ts +31 -0
  431. package/dist/schedules/loader.js +73 -0
  432. package/dist/schedules/schema.d.ts +9 -0
  433. package/dist/schedules/schema.js +9 -0
  434. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  435. package/dist/scripts/agent-behavior-report.js +315 -0
  436. package/dist/scripts/baseline.d.ts +43 -0
  437. package/dist/scripts/baseline.js +267 -0
  438. package/dist/scripts/calculate-scores.d.ts +166 -0
  439. package/dist/scripts/calculate-scores.js +1296 -0
  440. package/dist/scripts/compare.d.ts +22 -0
  441. package/dist/scripts/compare.js +334 -0
  442. package/dist/scripts/coverage-audit.d.ts +44 -0
  443. package/dist/scripts/coverage-audit.js +209 -0
  444. package/dist/scripts/debug-eval.d.ts +19 -0
  445. package/dist/scripts/debug-eval.js +73 -0
  446. package/dist/scripts/discovery-report.d.ts +58 -0
  447. package/dist/scripts/discovery-report.js +250 -0
  448. package/dist/scripts/fetch-docs.d.ts +35 -0
  449. package/dist/scripts/fetch-docs.js +472 -0
  450. package/dist/scripts/generate-configs.d.ts +66 -0
  451. package/dist/scripts/generate-configs.js +459 -0
  452. package/dist/scripts/grader-api.d.ts +27 -0
  453. package/dist/scripts/grader-api.js +206 -0
  454. package/dist/scripts/grader-compare.d.ts +22 -0
  455. package/dist/scripts/grader-compare.js +368 -0
  456. package/dist/scripts/grader-consistency.d.ts +20 -0
  457. package/dist/scripts/grader-consistency.js +313 -0
  458. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  459. package/dist/scripts/grader-sensitivity.js +354 -0
  460. package/dist/scripts/grader-validate.d.ts +19 -0
  461. package/dist/scripts/grader-validate.js +267 -0
  462. package/dist/scripts/measure-retrieval.d.ts +10 -0
  463. package/dist/scripts/measure-retrieval.js +145 -0
  464. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
  465. package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
  466. package/dist/scripts/pipeline.d.ts +76 -0
  467. package/dist/scripts/pipeline.js +1031 -0
  468. package/dist/scripts/pr-comment.d.ts +10 -0
  469. package/dist/scripts/pr-comment.js +510 -0
  470. package/dist/scripts/readiness-report.d.ts +88 -0
  471. package/dist/scripts/readiness-report.js +342 -0
  472. package/dist/scripts/update-quality-scores.d.ts +15 -0
  473. package/dist/scripts/update-quality-scores.js +184 -0
  474. package/dist/scripts/validate-task-sources.d.ts +21 -0
  475. package/dist/scripts/validate-task-sources.js +210 -0
  476. package/dist/scripts/validate.d.ts +13 -0
  477. package/dist/scripts/validate.js +79 -0
  478. package/dist/scripts/webhook-server.d.ts +26 -0
  479. package/dist/scripts/webhook-server.js +147 -0
  480. package/dist/scripts/weekly-digest.d.ts +24 -0
  481. package/dist/scripts/weekly-digest.js +144 -0
  482. package/dist/sinks/bigquery/index.d.ts +131 -0
  483. package/dist/sinks/bigquery/index.js +222 -0
  484. package/dist/sinks/format-slack.d.ts +64 -0
  485. package/dist/sinks/format-slack.js +306 -0
  486. package/dist/sinks/index.d.ts +23 -0
  487. package/dist/sinks/index.js +18 -0
  488. package/dist/sinks/loader.d.ts +18 -0
  489. package/dist/sinks/loader.js +82 -0
  490. package/dist/sinks/retry.d.ts +24 -0
  491. package/dist/sinks/retry.js +52 -0
  492. package/dist/sinks/schema.d.ts +9 -0
  493. package/dist/sinks/schema.js +9 -0
  494. package/dist/sinks/slack/format.d.ts +65 -0
  495. package/dist/sinks/slack/format.js +327 -0
  496. package/dist/sinks/slack/index.d.ts +27 -0
  497. package/dist/sinks/slack/index.js +78 -0
  498. package/dist/sinks/slack-sink.d.ts +27 -0
  499. package/dist/sinks/slack-sink.js +78 -0
  500. package/dist/sinks/types.d.ts +59 -0
  501. package/dist/sinks/types.js +44 -0
  502. package/dist/sinks/webhook/index.d.ts +19 -0
  503. package/dist/sinks/webhook/index.js +50 -0
  504. package/dist/sinks/webhook-sink.d.ts +19 -0
  505. package/dist/sinks/webhook-sink.js +50 -0
  506. package/dist/sources.d.ts +104 -0
  507. package/dist/sources.js +292 -0
  508. package/dist/webhook/budget.d.ts +42 -0
  509. package/dist/webhook/budget.js +60 -0
  510. package/dist/webhook/debounce.d.ts +67 -0
  511. package/dist/webhook/debounce.js +76 -0
  512. package/dist/webhook/dispatch.d.ts +45 -0
  513. package/dist/webhook/dispatch.js +84 -0
  514. package/dist/webhook/eval-request-handler.d.ts +87 -0
  515. package/dist/webhook/eval-request-handler.js +181 -0
  516. package/dist/webhook/handler.d.ts +88 -0
  517. package/dist/webhook/handler.js +203 -0
  518. package/dist/webhook/index.d.ts +17 -0
  519. package/dist/webhook/index.js +12 -0
  520. package/dist/webhook/types.d.ts +109 -0
  521. package/dist/webhook/types.js +10 -0
  522. package/package.json +72 -0
  523. package/tasks/.expanded.agentic.yaml +51 -0
  524. package/tasks/.expanded.yaml +66 -0
  525. package/tasks/frameworks.yaml +98 -0
  526. package/tasks/functions.yaml +51 -0
  527. package/tasks/groq.yaml +216 -0
  528. package/tasks/nextjs-live.yaml +62 -0
  529. package/tasks/studio-setup.yaml +111 -0
  530. package/tasks/visual-editing.yaml +120 -0
@@ -0,0 +1,242 @@
1
+ /**
2
+ * pipeline/degradations.ts
3
+ *
4
+ * Programmatic code degradation strategies for sensitivity testing.
5
+ *
6
+ * Each strategy takes a "good" reference solution (string) and returns a
7
+ * "bad" version that should score lower on a specific dimension:
8
+ *
9
+ * - Task Completion: remove key functional sections
10
+ * - Code Correctness: introduce anti-patterns and deprecated APIs
11
+ * - Doc Coverage: strip documentation references, add hallucinated details
12
+ *
13
+ * These are deterministic, pure functions — no randomness, no side effects.
14
+ *
15
+ * @see docs/exec-plans/completed/grader-reliability.md — Phase 4
16
+ */
17
+ // ---------------------------------------------------------------------------
18
+ // Task Completion degradations
19
+ // ---------------------------------------------------------------------------
20
/**
 * Task Completion degradation: truncate the source to its first half.
 *
 * Keeps the first `floor(lineCount / 2)` lines, then appends a blank line and
 * a placeholder comment saying the rest of the implementation is missing.
 */
export const removeBottomHalf = {
  apply(source) {
    const allLines = source.split("\n");
    const cutoff = Math.floor(allLines.length / 2);
    const truncated = [
      ...allLines.slice(0, cutoff),
      "",
      "// ... (remaining implementation not provided)",
    ];
    return truncated.join("\n");
  },
  description: "Remove bottom half of code — missing key functionality",
  targetDimension: "taskCompletion",
};
36
/**
 * Task Completion degradation: drop every line that begins an export.
 *
 * A line is removed when its trimmed text starts with `export ` or with
 * `export` followed by optional whitespace and `{`. Filtering is strictly
 * line-based: continuation lines of a multi-line exported declaration are
 * kept as they are.
 */
export const removeExports = {
  apply(source) {
    const isExportLine = (text) =>
      text.startsWith("export ") || /^export\s*\{/.test(text);
    const survivors = [];
    for (const row of source.split("\n")) {
      if (!isExportLine(row.trim())) {
        survivors.push(row);
      }
    }
    return survivors.join("\n");
  },
  description: "Remove export statements — incomplete public API",
  targetDimension: "taskCompletion",
};
59
/**
 * Task Completion degradation: turn brace-delimited bodies into stubs.
 *
 * The pattern matches an opening `{` that is followed (after optional
 * whitespace) by a newline, then lazily everything up to the first line that
 * consists of optional whitespace and `}`. This is purely textual, so it is
 * not limited to function bodies.
 *
 * Spans with fewer than three non-blank lines are returned unchanged (this
 * keeps most small object literals intact). Longer spans are replaced with a
 * TODO comment and a `throw`.
 */
export const stubFunctions = {
  apply(source) {
    const blockPattern = /(\{)\s*\n([\s\S]*?)(\n\s*\})/g;
    const toStub = (whole, opener, inner, closer) => {
      let meaningfulLines = 0;
      for (const row of inner.split("\n")) {
        if (row.trim().length > 0) {
          meaningfulLines += 1;
        }
      }
      // Short bodies are left as they were.
      return meaningfulLines < 3
        ? whole
        : `${opener}\n // TODO: implement\n throw new Error("Not implemented")\n${closer}`;
    };
    return source.replace(blockPattern, toStub);
  },
  description: "Replace function bodies with TODO stubs — no implementation",
  targetDimension: "taskCompletion",
};
79
+ // ---------------------------------------------------------------------------
80
+ // Code Correctness degradations
81
+ // ---------------------------------------------------------------------------
82
/**
 * Code Correctness degradation: swap modern Sanity patterns for outdated ones.
 *
 * Applies an ordered list of regex rewrites to the source:
 * - named `createClient` imports from `@sanity/client` / `next-sanity`
 *   become default `sanityClient` imports,
 * - the `sanity` import that brings in `defineType` becomes a comment, and
 *   `defineType({` / `defineField({` calls become plain `({`,
 * - any `apiVersion` string is pinned to "2021-03-25",
 * - `useCdn: true` is flipped to `useCdn: false`.
 */
export const introduceDeprecatedAPIs = {
  apply(source) {
    // Order matters only in that each rewrite sees the output of the previous one.
    const rewrites = [
      [
        /import\s*\{\s*createClient\s*\}\s*from\s*["']@sanity\/client["']/g,
        'import sanityClient from "@sanity/client" // deprecated default import',
      ],
      [
        /import\s*\{\s*createClient\s*\}\s*from\s*["']next-sanity["']/g,
        'import sanityClient from "next-sanity" // deprecated default import',
      ],
      [
        /import\s*\{[^}]*defineType[^}]*\}\s*from\s*["']sanity["']/g,
        "// Using plain objects instead of defineType/defineField",
      ],
      [/defineType\(\{/g, "({"],
      [/defineField\(\{/g, "({"],
      [
        /apiVersion:\s*["'][^"']+["']/g,
        'apiVersion: "2021-03-25" // outdated API version',
      ],
      [/useCdn:\s*true/g, "useCdn: false // unnecessary"],
    ];
    return rewrites.reduce(
      (text, [pattern, replacement]) => text.replace(pattern, replacement),
      source,
    );
  },
  description: "Replace modern APIs with deprecated patterns",
  targetDimension: "codeCorrectness",
};
105
/**
 * Code Correctness degradation: corrupt GROQ syntax in plausible-looking ways.
 *
 * Rewrites, in order:
 * - every `->` becomes `.`,
 * - numeric slices `[a...b]` become `[a:b]`,
 * - `references(` becomes `refs(`,
 * - `| order(` becomes `.sort(`.
 *
 * The replacements are applied to the whole source text, not only to query
 * strings.
 */
export const introduceGroqErrors = {
  apply(source) {
    return source
      .replace(/->/g, ".")
      .replace(/\[(\d+)\.\.\.(\d+)\]/g, "[$1:$2]")
      .replace(/references\(/g, "refs(")
      .replace(/\|\s*order\(/g, ".sort(");
  },
  description: "Introduce GROQ syntax errors — plausible but broken queries",
  targetDimension: "codeCorrectness",
};
125
/**
 * Code Correctness degradation: erase TypeScript typing in favour of `any`.
 *
 * Four textual passes, each working on the output of the previous one:
 * 1. delete `interface` / `type` declarations (optionally exported), from the
 *    keyword to the next line that starts with `}`;
 * 2. replace annotations whose type starts with an uppercase letter
 *    (optionally an array and/or a union tail) with `: any`;
 * 3. rewrite `Promise<...>` to `Promise<any>`;
 * 4. drop remaining `<Type>` / `<Type[]>` generic arguments.
 */
export const removeTypes = {
  apply(source) {
    const withoutDeclarations = source.replace(
      /^(?:export\s+)?(?:interface|type)\s+\w+[\s\S]*?^\}/gm,
      "",
    );
    const withAnyAnnotations = withoutDeclarations.replace(
      /:\s*[A-Z]\w+(?:\[\])?(?:\s*\|[^,)]+)?/g,
      ": any",
    );
    const withAnyPromises = withAnyAnnotations.replace(
      /Promise<[^>]+>/g,
      "Promise<any>",
    );
    return withAnyPromises.replace(/<[A-Z]\w+(?:\[\])?>/g, "");
  },
  description: "Strip TypeScript types and use 'any' — works but anti-pattern",
  targetDimension: "codeCorrectness",
};
145
+ // ---------------------------------------------------------------------------
146
+ // Doc Coverage degradations
147
+ // ---------------------------------------------------------------------------
148
/**
 * Doc Coverage degradation: remove comments from the source.
 *
 * Steps:
 * 1. drop whole lines whose trimmed text starts with `//`, `*`, `/**` or
 *    `*\/` (single-line comments and JSDoc block lines);
 * 2. strip trailing `// ...` comments from the remaining lines. The comment
 *    text may not contain a quote character, which keeps `//` inside quoted
 *    strings (e.g. URLs) intact;
 * 3. strip any remaining `/* ... *\/` comments;
 * 4. collapse runs of three or more newlines into one blank line.
 *
 * Step 2 is restricted to a single line. Without excluding `\n`, the
 * character class would greedily run past the end of the comment and delete
 * the following lines of real code.
 */
export const stripComments = {
  apply(source) {
    const isCommentLine = (text) =>
      text.startsWith("//") ||
      text.startsWith("*") ||
      text.startsWith("/**") ||
      text.startsWith("*/");
    return (
      source
        .split("\n")
        .filter((line) => !isCommentLine(line.trim()))
        .join("\n")
        // Remove remaining inline comments (same line only)
        .replace(/\s*\/\/[^"'\n]*$/gm, "")
        // Remove multi-line comments
        .replace(/\/\*[\s\S]*?\*\//g, "")
        // Clean up excessive blank lines
        .replace(/\n{3,}/g, "\n\n")
    );
  },
  description: "Remove all comments — no evidence of documentation usage",
  targetDimension: "docCoverage",
};
179
/**
 * Doc Coverage degradation: prepend a fixed preamble of invented API usage.
 *
 * The preamble is a constant list of lines (made-up client methods and a
 * made-up caching package) joined with newlines and placed in front of the
 * original source.
 */
export const addHallucinations = {
  apply(source) {
    const preamble = [
      "\n// Enable real-time sync (recommended for production)",
      'const sync = client.enableRealTimeSync({ mode: "aggressive" })',
      "",
      "// Configure auto-indexing for faster queries",
      'client.configureIndex({ fields: ["title", "slug"], type: "fulltext" })',
      "",
      "// Use built-in caching middleware",
      'import { withSanityCache } from "@sanity/cache-middleware"',
      "const cachedClient = withSanityCache(client, { ttl: 3600 })",
      "",
    ];
    // Joining the source as the final element yields preamble + "\n" + source.
    return [...preamble, source].join("\n");
  },
  description: "Add hallucinated APIs — confident but wrong information",
  targetDimension: "docCoverage",
};
202
/**
 * Doc Coverage degradation: point imports at plausible but wrong packages.
 *
 * Each entry maps a `from "<package>"` specifier (single or double quoted)
 * to a substitute specifier. Entries are applied in order and match the
 * whole specifier, so `sanity` does not match `sanity/structure`.
 */
export const wrongImports = {
  apply(source) {
    const substitutions = [
      [/from\s*["']@sanity\/client["']/g, 'from "sanity-client"'],
      [/from\s*["']next-sanity["']/g, 'from "@next/sanity"'],
      [/from\s*["']sanity["']/g, 'from "@sanity/core"'],
      [/from\s*["']sanity\/presentation["']/g, 'from "@sanity/presentation-tool"'],
      [/from\s*["']sanity\/structure["']/g, 'from "@sanity/desk-tool"'],
      [/from\s*["']sanity\/functions["']/g, 'from "@sanity/serverless"'],
    ];
    let rewritten = source;
    for (const [specifier, substitute] of substitutions) {
      rewritten = rewritten.replace(specifier, substitute);
    }
    return rewritten;
  },
  description: "Replace imports with wrong packages — guessing at API surface",
  targetDimension: "docCoverage",
};
221
+ // ---------------------------------------------------------------------------
222
+ // Strategy registry
223
+ // ---------------------------------------------------------------------------
224
+ /** All available degradation strategies, grouped by target dimension */
225
+ export const DEGRADATION_STRATEGIES = [
226
+ // Task Completion
227
+ removeBottomHalf,
228
+ removeExports,
229
+ stubFunctions,
230
+ // Code Correctness
231
+ introduceDeprecatedAPIs,
232
+ introduceGroqErrors,
233
+ removeTypes,
234
+ // Doc Coverage
235
+ stripComments,
236
+ addHallucinations,
237
+ wrongImports,
238
+ ];
239
/**
 * Select the registered degradation strategies whose `targetDimension`
 * strictly equals the given dimension, in registry order.
 */
export function getStrategiesForDimension(dimension) {
  const matching = [];
  for (const strategy of DEGRADATION_STRATEGIES) {
    if (strategy.targetDimension === dimension) {
      matching.push(strategy);
    }
  }
  return matching;
}
@@ -0,0 +1,55 @@
1
+ /**
2
+ * pipeline/discovery-report.ts
3
+ *
4
+ * Generates an agent discoverability report from agentic mode retrieval
5
+ * metrics. Reads score-summary.json (which contains `retrievalMetrics`
6
+ * from agentic evaluation) and produces a markdown report showing:
7
+ *
8
+ * - Retrieval summary (recall, precision, F1)
9
+ * - Per-area retrieval breakdown
10
+ * - Invisible documents (never retrieved by any task)
11
+ * - Recommendations for improving discoverability
12
+ *
13
+ * All functions accept rootDir as a parameter — no module-level constants.
14
+ * No process.argv parsing. No env var fallbacks.
15
+ *
16
+ * Phase 5c of the Scenario Matrix implementation (Scenarios 4.1 and 4.2).
17
+ *
18
+ * @see docs/design-docs/retrieval-metrics.md
19
+ */
20
+ import type { AreaRetrievalMetrics, RetrievalMetrics, ScoreSummary } from "./types.js";
21
+ export interface DiscoveryReport {
22
+ /** All areas included in the report (after filtering) */
23
+ areas: AreaRetrievalMetrics[];
24
+ /** Base URL from the score summary source config */
25
+ baseUrl: string | undefined;
26
+ /** Document slugs that were never retrieved by any task */
27
+ invisibleDocs: InvisibleDoc[];
28
+ /** Overall retrieval metrics */
29
+ overall: RetrievalMetrics["overall"];
30
+ /** Actionable recommendations */
31
+ recommendations: string[];
32
+ /** ISO timestamp of the source evaluation */
33
+ timestamp: string;
34
+ /** Total canonical docs across included areas */
35
+ totalCanonicalDocs: number;
36
+ /** Total hits (canonical docs successfully retrieved) */
37
+ totalHits: number;
38
+ }
39
+ export interface InvisibleDoc {
40
+ /** Tasks that reference this document via canonical_docs */
41
+ referencedBy: string[];
42
+ /** The document slug */
43
+ slug: string;
44
+ }
45
+ /**
46
+ * Format a discovery report as markdown.
47
+ */
48
+ export declare function formatDiscoveryMarkdown(report: DiscoveryReport): string;
49
+ /**
50
+ * Generate a structured discovery report from a score summary.
51
+ *
52
+ * @param summary - Parsed score-summary.json
53
+ * @param areaFilter - Optional area names to include (all if empty)
54
+ */
55
+ export declare function generateDiscoveryReport(summary: ScoreSummary, areaFilter?: string[]): DiscoveryReport;
@@ -0,0 +1,178 @@
1
+ // ---------------------------------------------------------------------------
2
+ // Core logic (exported for testing)
3
+ // ---------------------------------------------------------------------------
4
/**
 * Render a discovery report as a markdown document.
 *
 * Sections, in order: title (plus the base URL when present), a retrieval
 * summary table, a per-area table sorted by area name (only when there are
 * areas), the list of invisible documents (only when non-empty) and a
 * numbered list of recommendations (only when non-empty).
 *
 * @param report - Structured discovery report.
 * @returns Markdown text, lines joined with "\n".
 */
export function formatDiscoveryMarkdown(report) {
  const out = ["## 🔍 Agent Discoverability Report", ""];
  if (report.baseUrl) {
    out.push(`**Base URL:** ${report.baseUrl}`);
  }
  out.push("**Mode:** Agentic", "");

  // Retrieval summary table
  out.push(
    "### Retrieval Summary",
    "",
    "| Metric | Value |",
    "|---|---|",
    `| Recall (canonical docs found) | ${pct(report.overall.avgRecall)} (${report.totalHits}/${report.totalCanonicalDocs}) |`,
    `| Precision (relevant docs fetched) | ${pct(report.overall.avgPrecision)} |`,
    `| F1 Score | ${report.overall.avgF1.toFixed(2)} |`,
    `| Invisible docs | ${report.invisibleDocs.length} |`,
    "",
  );

  // Per-area breakdown
  if (report.areas.length > 0) {
    out.push(
      "### Per-Area Breakdown",
      "",
      "| Area | Recall | Precision | F1 | Tasks |",
      "|---|---|---|---|---|",
    );
    for (const row of sortedAreas(report.areas)) {
      out.push(
        `| ${row.area} | ${pct(row.avgRecall)} | ${pct(row.avgPrecision)} | ${row.avgF1.toFixed(2)} | ${row.taskCount} |`,
      );
    }
    out.push("");
  }

  // Invisible documents
  if (report.invisibleDocs.length > 0) {
    out.push("### Invisible Documents (never retrieved by any task)", "");
    for (const entry of report.invisibleDocs) {
      out.push(`- \`${entry.slug}\` — referenced by ${entry.referencedBy.join(", ")}`);
    }
    out.push("");
  }

  // Recommendations, numbered from 1
  if (report.recommendations.length > 0) {
    out.push("### Recommendations", "");
    report.recommendations.forEach((text, index) => {
      out.push(`${index + 1}. ${text}`);
    });
    out.push("");
  }

  return out.join("\n");
}
59
/**
 * Build a structured discovery report from a parsed score summary.
 *
 * When `areaFilter` is non-empty, only areas whose name is listed are kept
 * and the overall metrics are recomputed for that subset. When the filter
 * keeps every area, or no filter is given, `retrievalMetrics.overall` is
 * reused as-is. Totals count distinct slugs across the included tasks.
 *
 * @param summary - Parsed score-summary.json
 * @param areaFilter - Optional area names to include (all if empty)
 * @returns The assembled report.
 * @throws {Error} If the summary has no `retrievalMetrics`, or if a non-empty
 *   filter matches no area.
 */
export function generateDiscoveryReport(summary, areaFilter) {
  const metrics = summary.retrievalMetrics;
  if (!metrics) {
    throw new Error(
      "score-summary.json does not contain retrievalMetrics. " +
        "Run an agentic evaluation first: pnpm pipeline -- --mode agentic",
    );
  }

  // Apply area filter
  const filtering = Boolean(areaFilter && areaFilter.length > 0);
  const areas = filtering
    ? metrics.areas.filter(({ area }) => areaFilter.includes(area))
    : metrics.areas;
  if (filtering && areas.length === 0) {
    const available = metrics.areas.map(({ area }) => area).join(", ");
    throw new Error(
      `No retrieval data found for area(s): ${areaFilter.join(", ")}. ` +
        `Available areas: ${available}`,
    );
  }

  // Only recompute when the filter actually narrowed the area list
  const narrowed = areas.length !== metrics.areas.length;
  const overall = narrowed ? computeOverall(areas) : metrics.overall;

  const invisibleDocs = buildInvisibleDocs(areas);

  // Distinct expected / hit slugs across every included task
  const tasks = areas.flatMap(({ tasks: areaTasks }) => areaTasks);
  const totalCanonicalDocs = new Set(tasks.flatMap(({ expected }) => expected)).size;
  const totalHits = new Set(tasks.flatMap(({ hits }) => hits)).size;

  const recommendations = generateRecommendations(invisibleDocs, areas, overall);

  return {
    areas,
    baseUrl: summary.source?.baseUrl,
    invisibleDocs,
    overall,
    recommendations,
    timestamp: summary.timestamp,
    totalCanonicalDocs,
    totalHits,
  };
}
104
+ // ---------------------------------------------------------------------------
105
+ // Helpers (alphabetical for perfectionist/sort-modules)
106
+ // ---------------------------------------------------------------------------
107
/**
 * Collect the documents that no task in the given areas ever retrieved.
 *
 * A slug counts as invisible when it appears in some task's `missed` list
 * and in no task's `retrieved` list across all supplied areas. Each
 * invisible slug is paired with the sorted, de-duplicated ids of the tasks
 * that missed it.
 *
 * @param areas - Area metrics; each has `tasks` with `taskId`, `missed` and
 *   `retrieved` arrays.
 * @returns Entries `{ referencedBy, slug }`, ordered by number of referencing
 *   tasks (descending).
 */
function buildInvisibleDocs(areas) {
  // Build the global "retrieved by anyone" set once instead of rescanning
  // every task's `retrieved` array for each missed slug.
  const retrievedAnywhere = new Set();
  for (const area of areas) {
    for (const task of area.tasks) {
      for (const slug of task.retrieved) {
        retrievedAnywhere.add(slug);
      }
    }
  }

  // Map each globally invisible slug to the tasks that reference it
  const slugToTasks = new Map();
  for (const area of areas) {
    for (const task of area.tasks) {
      for (const slug of task.missed) {
        if (retrievedAnywhere.has(slug)) {
          continue;
        }
        let referencing = slugToTasks.get(slug);
        if (referencing === undefined) {
          referencing = new Set();
          slugToTasks.set(slug, referencing);
        }
        referencing.add(task.taskId);
      }
    }
  }

  return [...slugToTasks.entries()]
    .map(([slug, tasks]) => ({
      referencedBy: [...tasks].sort(),
      slug,
    }))
    .sort((a, b) => b.referencedBy.length - a.referencedBy.length);
}
131
/**
 * Average recall, precision and F1 across areas, weighting each area by its
 * task count.
 *
 * @param areas - Areas with `avgRecall`, `avgPrecision`, `avgF1`, `taskCount`.
 * @returns `{ avgF1, avgPrecision, avgRecall }`; all zero when there are no
 *   areas or the total task count is zero.
 */
function computeOverall(areas) {
  let totalTasks = 0;
  let recallSum = 0;
  let precisionSum = 0;
  let f1Sum = 0;
  for (const area of areas) {
    totalTasks += area.taskCount;
    recallSum += area.avgRecall * area.taskCount;
    precisionSum += area.avgPrecision * area.taskCount;
    f1Sum += area.avgF1 * area.taskCount;
  }
  // Covers both "no areas" and "areas without tasks"; avoids dividing by zero
  if (areas.length === 0 || totalTasks === 0) {
    return { avgF1: 0, avgPrecision: 0, avgRecall: 0 };
  }
  return {
    avgF1: f1Sum / totalTasks,
    avgPrecision: precisionSum / totalTasks,
    avgRecall: recallSum / totalTasks,
  };
}
145
/**
 * Derive human-readable recommendations from the report data.
 *
 * Emits, in order: an "add to llms.txt" item for up to five invisible docs
 * that have at least one referencing task, one cross-linking item when any
 * invisible docs exist, one item per area with recall below 0.5, one item
 * per area with precision below 0.5, and a final item when the overall F1
 * is below 0.6.
 *
 * @param invisibleDocs - Entries with `slug` and `referencedBy`.
 * @param areas - Area metrics with `area`, `avgRecall`, `avgPrecision`.
 * @param overall - Overall metrics; only `avgF1` is read.
 * @returns Recommendation strings.
 */
function generateRecommendations(invisibleDocs, areas, overall) {
  const referenced = invisibleDocs.filter(({ referencedBy }) => referencedBy.length > 0);
  const llmsTxtItems = referenced.slice(0, 5).map(({ slug, referencedBy }) => {
    const noun = referencedBy.length === 1 ? "task" : "tasks";
    return `Add \`${slug}\` to llms.txt (referenced by ${referencedBy.length} ${noun})`;
  });

  const crossLinkItems =
    invisibleDocs.length > 0
      ? [
          `Improve cross-linking to ${invisibleDocs.length} invisible document${invisibleDocs.length === 1 ? "" : "s"}`,
        ]
      : [];

  const lowRecallItems = areas
    .filter(({ avgRecall }) => avgRecall < 0.5)
    .map(
      ({ area, avgRecall }) =>
        `Investigate low recall in \`${area}\` (${pct(avgRecall)}) — agents miss most canonical docs`,
    );

  const lowPrecisionItems = areas
    .filter(({ avgPrecision }) => avgPrecision < 0.5)
    .map(
      ({ area, avgPrecision }) =>
        `Review search relevance for \`${area}\` (precision ${pct(avgPrecision)}) — agents fetch many irrelevant docs`,
    );

  const overallItems =
    overall.avgF1 < 0.6
      ? ["Overall F1 is below 0.60 — consider a documentation restructure for agent accessibility"]
      : [];

  return [
    ...llmsTxtItems,
    ...crossLinkItems,
    ...lowRecallItems,
    ...lowPrecisionItems,
    ...overallItems,
  ];
}
173
/** Format a 0–1 ratio as a whole-number percentage string, e.g. 0.5 → "50%". */
function pct(value) {
  const rounded = Math.round(value * 100);
  return String(rounded) + "%";
}
176
/** Return a copy of the areas ordered by area name (`localeCompare`); the input is not mutated. */
function sortedAreas(areas) {
  const byName = (left, right) => left.area.localeCompare(right.area);
  const copy = Array.from(areas);
  copy.sort(byName);
  return copy;
}
@@ -0,0 +1,68 @@
1
+ /**
2
+ * Constants and types shared across evaluation steps.
3
+ *
4
+ * Extracted from pipeline/steps/eval-step.ts so that the legacy step
5
+ * files can be deleted while tests and other modules retain access
6
+ * to these shared definitions.
7
+ */
8
+ import type { ConcreteEvalMode, DebugOptions, FilterOptions, StepResult } from "../_vendor/ailf-core/index.d.ts";
9
+ /** Promptfoo config file per concrete eval mode */
10
+ export declare const CONFIG_FILES: Record<ConcreteEvalMode, string>;
11
+ /** Each mode writes eval results to a different file (set in the config's outputPath) */
12
+ export declare const RESULTS_FILES: Record<ConcreteEvalMode, string>;
13
+ /** Extended step result that carries cache metadata for downstream steps */
14
+ export interface EvalStepResult {
15
+ /** The computed eval fingerprint (for publishing in provenance) */
16
+ evalFingerprint?: string;
17
+ /** Whether this result came from a remote cache hit */
18
+ remoteCacheHit?: boolean;
19
+ /** The step result */
20
+ stepResult: StepResult;
21
+ }
22
+ /** Options for the remote cache (Content Lake fingerprint lookup) */
23
+ export interface RemoteCacheOptions {
24
+ /** Whether this is a debug run (debug runs don't use remote cache) */
25
+ debug?: boolean;
26
+ /** Filter options used for fingerprint computation */
27
+ filter?: FilterOptions;
28
+ /** Grader model identifier from models.yaml */
29
+ graderModel: string;
30
+ /** Disable remote cache lookup (--no-remote-cache) */
31
+ noRemoteCache?: boolean;
32
+ /** Sanity API token for reading cached reports */
33
+ sanityToken?: string;
34
+ }
35
+ /** Minimal shape of a raw Promptfoo result entry for error scanning */
36
+ export interface RawResult {
37
+ description?: string;
38
+ error?: string;
39
+ gradingResult?: null | {
40
+ pass: boolean;
41
+ };
42
+ provider?: {
43
+ id?: string;
44
+ label?: string;
45
+ };
46
+ success?: boolean;
47
+ testCase?: {
48
+ description?: string;
49
+ };
50
+ }
51
+ /**
52
+ * Build promptfoo filter flags from debug options.
53
+ */
54
+ export declare function buildFilterFlags(debug?: DebugOptions): string;
55
+ /**
56
+ * Extract the Promptfoo share URL from the eval results JSON.
57
+ *
58
+ * Promptfoo writes a `shareableUrl` field into the results file when
59
+ * `PROMPTFOO_API_KEY` is set.
60
+ */
61
+ export declare function extractShareUrl(resultsPath: string): string | undefined;
62
+ /**
63
+ * Read the eval results JSON and produce a human-readable summary of any
64
+ * errored or failed tests.
65
+ *
66
+ * Returns null if there are no errors/failures worth reporting.
67
+ */
68
+ export declare function scanResultsForErrors(resultsPath: string): null | string;
@@ -0,0 +1,111 @@
1
+ /**
2
+ * Constants and types shared across evaluation steps.
3
+ *
4
+ * Extracted from pipeline/steps/eval-step.ts so that the legacy step
5
+ * files can be deleted while tests and other modules retain access
6
+ * to these shared definitions.
7
+ */
8
+ import { existsSync, readFileSync } from "fs";
9
+ /** Promptfoo config file per concrete eval mode */
10
+ export const CONFIG_FILES = {
11
+ agentic: "promptfooconfig.agentic.yaml",
12
+ baseline: "promptfooconfig.yaml",
13
+ observed: "promptfooconfig.observed.yaml",
14
+ };
15
+ /** Each mode writes eval results to a different file (set in the config's outputPath) */
16
+ export const RESULTS_FILES = {
17
+ agentic: "results/latest/eval-results-agentic.json",
18
+ baseline: "results/latest/eval-results.json",
19
+ observed: "results/latest/eval-results-observed.json",
20
+ };
21
/**
 * Build promptfoo filter flags from debug options.
 *
 * Returns an empty string unless `debug.enabled` is truthy. Otherwise the
 * result is a space-prefixed list of flags built from `pattern`, `sample`
 * and `firstN` (each only when truthy), falling back to
 * `--filter-first-n 2` when none of them is set.
 *
 * The pattern is emitted as a single-quoted shell word. Embedded single
 * quotes are escaped as `'\''` so a pattern such as `it's` cannot end the
 * quoting early and inject extra shell syntax.
 *
 * @param debug - Optional debug options (`enabled`, `pattern`, `sample`, `firstN`).
 * @returns Flag string with a leading space, or "" when debugging is off.
 */
export function buildFilterFlags(debug) {
  if (!debug?.enabled) {
    return "";
  }
  const flags = [];
  if (debug.pattern) {
    const quotedPattern = String(debug.pattern).replaceAll("'", "'\\''");
    flags.push(`--filter-pattern '${quotedPattern}'`);
  }
  if (debug.sample) {
    flags.push(`--filter-sample ${debug.sample}`);
  }
  if (debug.firstN) {
    flags.push(`--filter-first-n ${debug.firstN}`);
  }
  // Default: first 2 tests when no other filters specified
  if (flags.length === 0) {
    flags.push("--filter-first-n 2");
  }
  return " " + flags.join(" ");
}
43
/**
 * Read the Promptfoo share URL out of an eval results file.
 *
 * Looks for a top-level `shareableUrl` field in the JSON at `resultsPath`.
 * This is best-effort: a missing file, unreadable file, invalid JSON or
 * absent/null field all yield `undefined` rather than an error.
 *
 * @param resultsPath - Path to the eval results JSON file.
 * @returns The share URL, or `undefined` when unavailable.
 */
export function extractShareUrl(resultsPath) {
  if (existsSync(resultsPath)) {
    try {
      const { shareableUrl } = JSON.parse(readFileSync(resultsPath, "utf-8"));
      return shareableUrl ?? undefined;
    } catch {
      // Deliberately swallowed: the share URL is optional.
    }
  }
  return undefined;
}
61
/**
 * Summarise errored entries from an eval results file as a boxed text block.
 *
 * Reads `results.results` from the JSON at `resultsPath` and treats every
 * entry whose `gradingResult` is exactly `null` as errored. For each, it
 * reports the provider (label, else id), the test description and the error
 * text truncated to 200 characters. An extra warning is appended when at
 * least 25% of all results errored.
 *
 * @param resultsPath - Path to the eval results JSON file.
 * @returns The formatted summary, or `null` when the file is missing,
 *   unparsable, has no results array, or contains no errored entries.
 */
export function scanResultsForErrors(resultsPath) {
  if (!existsSync(resultsPath)) {
    return null;
  }
  let parsed;
  try {
    parsed = JSON.parse(readFileSync(resultsPath, "utf-8"));
  } catch {
    // Unreadable or malformed results: nothing to report.
    return null;
  }
  const results = parsed?.results?.results;
  if (!Array.isArray(results)) {
    return null;
  }

  const describeError = (rawError) => {
    if (!rawError) {
      return "Provider returned no scorable result";
    }
    const text = typeof rawError === "string" ? rawError : JSON.stringify(rawError);
    return text.slice(0, 200);
  };

  const errored = results
    .filter((entry) => entry.gradingResult === null)
    .map((entry) => ({
      description: entry.testCase?.description ?? entry.description ?? "unknown",
      error: describeError(entry.error),
      provider: entry.provider?.label ?? entry.provider?.id ?? "unknown",
    }));
  if (errored.length === 0) {
    return null;
  }

  const total = results.length;
  const errorRate = Math.round((errored.length / total) * 100);
  const highRateWarning =
    errorRate >= 25
      ? [
          ` │`,
          ` │ 🔥 High error rate (${errorRate}%) — check API keys, rate limits,`,
          ` │ or model availability. Errored results are excluded from scoring.`,
        ]
      : [];

  return [
    ` ┌─────────────────────────────────────────────────────────────`,
    ` │ ⚠️ ${errored.length} of ${total} eval result(s) errored (no gradingResult)`,
    ` │`,
    ...errored.flatMap((item) => [
      ` │ ✗ [${item.provider}] ${item.description}`,
      ` │ → ${item.error}`,
    ]),
    ...highRateWarning,
    ` └─────────────────────────────────────────────────────────────`,
  ].join("\n");
}