@sanity/ailf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (530) hide show
  1. package/README.md +89 -0
  2. package/bin/ailf.js +64 -0
  3. package/canonical/grader-references/README.md +88 -0
  4. package/canonical/grader-references/groq.yaml +234 -0
  5. package/canonical/grader-references/studio-setup.yaml +275 -0
  6. package/canonical/reference-solutions/.gitkeep +1 -0
  7. package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
  8. package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
  9. package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
  10. package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
  11. package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
  12. package/canonical/reference-solutions/groq/joins-references.ts +300 -0
  13. package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
  14. package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
  15. package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
  16. package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
  17. package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
  18. package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
  19. package/config/bigquery/README.md +74 -0
  20. package/config/bigquery/views/area_scores.sql +87 -0
  21. package/config/bigquery/views/reports.sql +49 -0
  22. package/config/features.yaml +116 -0
  23. package/config/models.yaml +115 -0
  24. package/config/prompts.yaml +75 -0
  25. package/config/rubrics.yaml +62 -0
  26. package/config/schedules.yaml +43 -0
  27. package/config/sinks.yaml +54 -0
  28. package/config/sources.yaml +51 -0
  29. package/config/thresholds.yaml +49 -0
  30. package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
  31. package/dist/_vendor/ailf-core/examples/index.js +285 -0
  32. package/dist/_vendor/ailf-core/index.d.ts +17 -0
  33. package/dist/_vendor/ailf-core/index.js +17 -0
  34. package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
  35. package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
  36. package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
  37. package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
  38. package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
  39. package/dist/_vendor/ailf-core/ports/context.js +14 -0
  40. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
  41. package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
  42. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
  43. package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
  44. package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
  45. package/dist/_vendor/ailf-core/ports/index.js +7 -0
  46. package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
  47. package/dist/_vendor/ailf-core/ports/logger.js +11 -0
  48. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
  49. package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
  50. package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
  51. package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
  52. package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
  53. package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
  54. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
  55. package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
  56. package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
  57. package/dist/_vendor/ailf-core/schemas/index.js +16 -0
  58. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
  59. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
  60. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
  61. package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
  62. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
  63. package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
  64. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
  65. package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
  66. package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
  67. package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
  68. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
  69. package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
  70. package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
  71. package/dist/_vendor/ailf-core/services/index.js +12 -0
  72. package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
  73. package/dist/_vendor/ailf-core/services/scoring.js +222 -0
  74. package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
  75. package/dist/_vendor/ailf-core/types/index.js +21 -0
  76. package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
  77. package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
  78. package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
  79. package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
  80. package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
  81. package/dist/_vendor/ailf-shared/document-ref.js +1 -0
  82. package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
  83. package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
  84. package/dist/_vendor/ailf-shared/index.d.ts +16 -0
  85. package/dist/_vendor/ailf-shared/index.js +16 -0
  86. package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
  87. package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
  88. package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
  89. package/dist/_vendor/ailf-shared/score-grades.js +23 -0
  90. package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
  91. package/dist/adapters/cache/content-lake-cache.js +59 -0
  92. package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
  93. package/dist/adapters/cache/filesystem-cache.js +54 -0
  94. package/dist/adapters/cache/index.d.ts +2 -0
  95. package/dist/adapters/cache/index.js +2 -0
  96. package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
  97. package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
  98. package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
  99. package/dist/adapters/config-sources/file-config-adapter.js +96 -0
  100. package/dist/adapters/config-sources/index.d.ts +2 -0
  101. package/dist/adapters/config-sources/index.js +2 -0
  102. package/dist/adapters/doc-fetchers/index.d.ts +1 -0
  103. package/dist/adapters/doc-fetchers/index.js +1 -0
  104. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
  105. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
  106. package/dist/adapters/eval-runners/index.d.ts +1 -0
  107. package/dist/adapters/eval-runners/index.js +1 -0
  108. package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
  109. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
  110. package/dist/adapters/index.d.ts +12 -0
  111. package/dist/adapters/index.js +12 -0
  112. package/dist/adapters/loggers/console-logger.d.ts +22 -0
  113. package/dist/adapters/loggers/console-logger.js +54 -0
  114. package/dist/adapters/loggers/index.d.ts +9 -0
  115. package/dist/adapters/loggers/index.js +9 -0
  116. package/dist/adapters/loggers/json-logger.d.ts +18 -0
  117. package/dist/adapters/loggers/json-logger.js +33 -0
  118. package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
  119. package/dist/adapters/loggers/quiet-logger.js +30 -0
  120. package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
  121. package/dist/adapters/task-sources/composite-task-source.js +59 -0
  122. package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
  123. package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
  124. package/dist/adapters/task-sources/index.d.ts +7 -0
  125. package/dist/adapters/task-sources/index.js +7 -0
  126. package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
  127. package/dist/adapters/task-sources/repo-schemas.js +234 -0
  128. package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
  129. package/dist/adapters/task-sources/repo-task-source.js +104 -0
  130. package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
  131. package/dist/adapters/task-sources/repo-trigger.js +153 -0
  132. package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
  133. package/dist/adapters/task-sources/repo-validation.js +164 -0
  134. package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
  135. package/dist/adapters/task-sources/yaml-task-source.js +136 -0
  136. package/dist/agent-observer/agentic-provider.d.ts +132 -0
  137. package/dist/agent-observer/agentic-provider.js +983 -0
  138. package/dist/agent-observer/classifier.d.ts +62 -0
  139. package/dist/agent-observer/classifier.js +269 -0
  140. package/dist/agent-observer/index.d.ts +7 -0
  141. package/dist/agent-observer/index.js +4 -0
  142. package/dist/agent-observer/pricing.d.ts +35 -0
  143. package/dist/agent-observer/pricing.js +82 -0
  144. package/dist/agent-observer/provider.d.ts +77 -0
  145. package/dist/agent-observer/provider.js +151 -0
  146. package/dist/agent-observer/proxy.d.ts +91 -0
  147. package/dist/agent-observer/proxy.js +321 -0
  148. package/dist/agent-observer/test-imports.d.ts +7 -0
  149. package/dist/agent-observer/test-imports.js +185 -0
  150. package/dist/agent-observer/types.d.ts +137 -0
  151. package/dist/agent-observer/types.js +16 -0
  152. package/dist/assertions/source-isolation.d.ts +72 -0
  153. package/dist/assertions/source-isolation.js +117 -0
  154. package/dist/cli.d.ts +24 -0
  155. package/dist/cli.js +199 -0
  156. package/dist/commands/agent-report.d.ts +5 -0
  157. package/dist/commands/agent-report.js +69 -0
  158. package/dist/commands/baseline.d.ts +9 -0
  159. package/dist/commands/baseline.js +141 -0
  160. package/dist/commands/cache.d.ts +13 -0
  161. package/dist/commands/cache.js +135 -0
  162. package/dist/commands/calculate-scores.d.ts +8 -0
  163. package/dist/commands/calculate-scores.js +48 -0
  164. package/dist/commands/compare.d.ts +8 -0
  165. package/dist/commands/compare.js +120 -0
  166. package/dist/commands/completion.d.ts +18 -0
  167. package/dist/commands/completion.js +260 -0
  168. package/dist/commands/coverage-audit.d.ts +7 -0
  169. package/dist/commands/coverage-audit.js +40 -0
  170. package/dist/commands/discovery-report.d.ts +10 -0
  171. package/dist/commands/discovery-report.js +44 -0
  172. package/dist/commands/eval.d.ts +9 -0
  173. package/dist/commands/eval.js +35 -0
  174. package/dist/commands/explain-handler.d.ts +34 -0
  175. package/dist/commands/explain-handler.js +719 -0
  176. package/dist/commands/fetch-docs.d.ts +8 -0
  177. package/dist/commands/fetch-docs.js +128 -0
  178. package/dist/commands/generate-configs.d.ts +8 -0
  179. package/dist/commands/generate-configs.js +46 -0
  180. package/dist/commands/grader/index.d.ts +11 -0
  181. package/dist/commands/grader/index.js +118 -0
  182. package/dist/commands/init.d.ts +19 -0
  183. package/dist/commands/init.js +150 -0
  184. package/dist/commands/interactive.d.ts +12 -0
  185. package/dist/commands/interactive.js +238 -0
  186. package/dist/commands/lookup-doc.d.ts +15 -0
  187. package/dist/commands/lookup-doc.js +84 -0
  188. package/dist/commands/measure-retrieval.d.ts +5 -0
  189. package/dist/commands/measure-retrieval.js +65 -0
  190. package/dist/commands/pipeline-action.d.ts +71 -0
  191. package/dist/commands/pipeline-action.js +305 -0
  192. package/dist/commands/pipeline.d.ts +62 -0
  193. package/dist/commands/pipeline.js +53 -0
  194. package/dist/commands/pr-comment.d.ts +8 -0
  195. package/dist/commands/pr-comment.js +47 -0
  196. package/dist/commands/publish.d.ts +26 -0
  197. package/dist/commands/publish.js +253 -0
  198. package/dist/commands/readiness-report.d.ts +10 -0
  199. package/dist/commands/readiness-report.js +104 -0
  200. package/dist/commands/shared/options.d.ts +29 -0
  201. package/dist/commands/shared/options.js +57 -0
  202. package/dist/commands/update-quality-scores.d.ts +5 -0
  203. package/dist/commands/update-quality-scores.js +20 -0
  204. package/dist/commands/validate-tasks.d.ts +16 -0
  205. package/dist/commands/validate-tasks.js +93 -0
  206. package/dist/commands/validate.d.ts +9 -0
  207. package/dist/commands/validate.js +73 -0
  208. package/dist/commands/webhook-server.d.ts +5 -0
  209. package/dist/commands/webhook-server.js +30 -0
  210. package/dist/commands/weekly-digest.d.ts +10 -0
  211. package/dist/commands/weekly-digest.js +104 -0
  212. package/dist/composition-root.d.ts +26 -0
  213. package/dist/composition-root.js +107 -0
  214. package/dist/interpolate.d.ts +26 -0
  215. package/dist/interpolate.js +70 -0
  216. package/dist/job-store.d.ts +104 -0
  217. package/dist/job-store.js +188 -0
  218. package/dist/lib/agent-behavior-report.d.ts +8 -0
  219. package/dist/lib/agent-behavior-report.js +185 -0
  220. package/dist/lib/baseline.d.ts +19 -0
  221. package/dist/lib/baseline.js +153 -0
  222. package/dist/lib/calculate-scores.d.ts +23 -0
  223. package/dist/lib/calculate-scores.js +42 -0
  224. package/dist/lib/compare.d.ts +18 -0
  225. package/dist/lib/compare.js +170 -0
  226. package/dist/lib/coverage-audit.d.ts +4 -0
  227. package/dist/lib/coverage-audit.js +42 -0
  228. package/dist/lib/discovery-report.d.ts +13 -0
  229. package/dist/lib/discovery-report.js +57 -0
  230. package/dist/lib/fetch-docs.d.ts +30 -0
  231. package/dist/lib/fetch-docs.js +171 -0
  232. package/dist/lib/generate-configs.d.ts +25 -0
  233. package/dist/lib/generate-configs.js +42 -0
  234. package/dist/lib/grader-api.d.ts +21 -0
  235. package/dist/lib/grader-api.js +34 -0
  236. package/dist/lib/grader-compare.d.ts +19 -0
  237. package/dist/lib/grader-compare.js +91 -0
  238. package/dist/lib/grader-consistency.d.ts +27 -0
  239. package/dist/lib/grader-consistency.js +79 -0
  240. package/dist/lib/grader-sensitivity.d.ts +19 -0
  241. package/dist/lib/grader-sensitivity.js +75 -0
  242. package/dist/lib/grader-validate.d.ts +19 -0
  243. package/dist/lib/grader-validate.js +78 -0
  244. package/dist/lib/measure-retrieval.d.ts +14 -0
  245. package/dist/lib/measure-retrieval.js +71 -0
  246. package/dist/lib/pr-comment.d.ts +16 -0
  247. package/dist/lib/pr-comment.js +28 -0
  248. package/dist/lib/readiness-report.d.ts +13 -0
  249. package/dist/lib/readiness-report.js +108 -0
  250. package/dist/lib/webhook-server.d.ts +11 -0
  251. package/dist/lib/webhook-server.js +24 -0
  252. package/dist/lib/weekly-digest.d.ts +24 -0
  253. package/dist/lib/weekly-digest.js +148 -0
  254. package/dist/orchestration/build-app-context.d.ts +27 -0
  255. package/dist/orchestration/build-app-context.js +81 -0
  256. package/dist/orchestration/build-step-sequence.d.ts +15 -0
  257. package/dist/orchestration/build-step-sequence.js +84 -0
  258. package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
  259. package/dist/orchestration/config-to-source-overrides.js +28 -0
  260. package/dist/orchestration/env-bridge.d.ts +21 -0
  261. package/dist/orchestration/env-bridge.js +66 -0
  262. package/dist/orchestration/index.d.ts +11 -0
  263. package/dist/orchestration/index.js +11 -0
  264. package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
  265. package/dist/orchestration/pipeline-orchestrator.js +153 -0
  266. package/dist/orchestration/step-runner.d.ts +20 -0
  267. package/dist/orchestration/step-runner.js +88 -0
  268. package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
  269. package/dist/orchestration/steps/calculate-scores-step.js +95 -0
  270. package/dist/orchestration/steps/callback-step.d.ts +24 -0
  271. package/dist/orchestration/steps/callback-step.js +76 -0
  272. package/dist/orchestration/steps/compare-step.d.ts +14 -0
  273. package/dist/orchestration/steps/compare-step.js +92 -0
  274. package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
  275. package/dist/orchestration/steps/discovery-report-step.js +55 -0
  276. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  277. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  278. package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
  279. package/dist/orchestration/steps/fetch-docs-step.js +135 -0
  280. package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
  281. package/dist/orchestration/steps/gap-analysis-step.js +136 -0
  282. package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
  283. package/dist/orchestration/steps/generate-configs-step.js +85 -0
  284. package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
  285. package/dist/orchestration/steps/grader-consistency-step.js +64 -0
  286. package/dist/orchestration/steps/index.d.ts +19 -0
  287. package/dist/orchestration/steps/index.js +19 -0
  288. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
  289. package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
  290. package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
  291. package/dist/orchestration/steps/publish-report-step.js +216 -0
  292. package/dist/orchestration/steps/readiness-step.d.ts +13 -0
  293. package/dist/orchestration/steps/readiness-step.js +91 -0
  294. package/dist/orchestration/steps/report-step.d.ts +12 -0
  295. package/dist/orchestration/steps/report-step.js +49 -0
  296. package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
  297. package/dist/orchestration/steps/run-eval-step.js +195 -0
  298. package/dist/orchestration/steps/validate-step.d.ts +12 -0
  299. package/dist/orchestration/steps/validate-step.js +41 -0
  300. package/dist/pipeline/agent-behavior-report.d.ts +53 -0
  301. package/dist/pipeline/agent-behavior-report.js +132 -0
  302. package/dist/pipeline/attribution.d.ts +47 -0
  303. package/dist/pipeline/attribution.js +226 -0
  304. package/dist/pipeline/baseline.d.ts +37 -0
  305. package/dist/pipeline/baseline.js +141 -0
  306. package/dist/pipeline/cache.d.ts +101 -0
  307. package/dist/pipeline/cache.js +283 -0
  308. package/dist/pipeline/calculate-scores.d.ts +102 -0
  309. package/dist/pipeline/calculate-scores.js +1128 -0
  310. package/dist/pipeline/callback-delivery.d.ts +50 -0
  311. package/dist/pipeline/callback-delivery.js +89 -0
  312. package/dist/pipeline/checks.d.ts +39 -0
  313. package/dist/pipeline/checks.js +280 -0
  314. package/dist/pipeline/classify-url.d.ts +61 -0
  315. package/dist/pipeline/classify-url.js +93 -0
  316. package/dist/pipeline/compare.d.ts +31 -0
  317. package/dist/pipeline/compare.js +208 -0
  318. package/dist/pipeline/coverage-audit.d.ts +39 -0
  319. package/dist/pipeline/coverage-audit.js +165 -0
  320. package/dist/pipeline/degradations.d.ts +85 -0
  321. package/dist/pipeline/degradations.js +242 -0
  322. package/dist/pipeline/discovery-report.d.ts +55 -0
  323. package/dist/pipeline/discovery-report.js +178 -0
  324. package/dist/pipeline/eval-constants.d.ts +68 -0
  325. package/dist/pipeline/eval-constants.js +111 -0
  326. package/dist/pipeline/eval-fingerprint.d.ts +66 -0
  327. package/dist/pipeline/eval-fingerprint.js +175 -0
  328. package/dist/pipeline/expand-tasks.d.ts +220 -0
  329. package/dist/pipeline/expand-tasks.js +421 -0
  330. package/dist/pipeline/failure-modes.d.ts +46 -0
  331. package/dist/pipeline/failure-modes.js +348 -0
  332. package/dist/pipeline/fetch-url-content.d.ts +44 -0
  333. package/dist/pipeline/fetch-url-content.js +93 -0
  334. package/dist/pipeline/gap-analysis.d.ts +48 -0
  335. package/dist/pipeline/gap-analysis.js +231 -0
  336. package/dist/pipeline/generate-configs.d.ts +72 -0
  337. package/dist/pipeline/generate-configs.js +395 -0
  338. package/dist/pipeline/grader-api.d.ts +49 -0
  339. package/dist/pipeline/grader-api.js +200 -0
  340. package/dist/pipeline/grader-compare-runner.d.ts +44 -0
  341. package/dist/pipeline/grader-compare-runner.js +301 -0
  342. package/dist/pipeline/grader-comparison.d.ts +111 -0
  343. package/dist/pipeline/grader-comparison.js +161 -0
  344. package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
  345. package/dist/pipeline/grader-consistency-runner.js +270 -0
  346. package/dist/pipeline/grader-consistency.d.ts +103 -0
  347. package/dist/pipeline/grader-consistency.js +146 -0
  348. package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
  349. package/dist/pipeline/grader-sensitivity-runner.js +282 -0
  350. package/dist/pipeline/grader-sensitivity.d.ts +94 -0
  351. package/dist/pipeline/grader-sensitivity.js +144 -0
  352. package/dist/pipeline/grader-validate-runner.d.ts +38 -0
  353. package/dist/pipeline/grader-validate-runner.js +229 -0
  354. package/dist/pipeline/grader-validation.d.ts +107 -0
  355. package/dist/pipeline/grader-validation.js +169 -0
  356. package/dist/pipeline/map-request-to-config.d.ts +19 -0
  357. package/dist/pipeline/map-request-to-config.js +80 -0
  358. package/dist/pipeline/measure-retrieval.d.ts +59 -0
  359. package/dist/pipeline/measure-retrieval.js +111 -0
  360. package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
  361. package/dist/pipeline/mirror-repo-tasks.js +350 -0
  362. package/dist/pipeline/plan-format.d.ts +33 -0
  363. package/dist/pipeline/plan-format.js +202 -0
  364. package/dist/pipeline/plan.d.ts +169 -0
  365. package/dist/pipeline/plan.js +708 -0
  366. package/dist/pipeline/pr-comment.d.ts +19 -0
  367. package/dist/pipeline/pr-comment.js +502 -0
  368. package/dist/pipeline/probe.d.ts +52 -0
  369. package/dist/pipeline/probe.js +390 -0
  370. package/dist/pipeline/provenance.d.ts +47 -0
  371. package/dist/pipeline/provenance.js +146 -0
  372. package/dist/pipeline/readiness-report.d.ts +87 -0
  373. package/dist/pipeline/readiness-report.js +205 -0
  374. package/dist/pipeline/release-classification.d.ts +54 -0
  375. package/dist/pipeline/release-classification.js +238 -0
  376. package/dist/pipeline/release-report.d.ts +37 -0
  377. package/dist/pipeline/release-report.js +222 -0
  378. package/dist/pipeline/repo-eval-comment.d.ts +37 -0
  379. package/dist/pipeline/repo-eval-comment.js +165 -0
  380. package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
  381. package/dist/pipeline/repo-threshold-evaluator.js +162 -0
  382. package/dist/pipeline/resolve-mappings.d.ts +35 -0
  383. package/dist/pipeline/resolve-mappings.js +72 -0
  384. package/dist/pipeline/retrieval-metrics.d.ts +39 -0
  385. package/dist/pipeline/retrieval-metrics.js +136 -0
  386. package/dist/pipeline/reverse-mapping.d.ts +67 -0
  387. package/dist/pipeline/reverse-mapping.js +88 -0
  388. package/dist/pipeline/schemas.d.ts +9 -0
  389. package/dist/pipeline/schemas.js +9 -0
  390. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  391. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  392. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  393. package/dist/pipeline/steps/compare-step.js +90 -0
  394. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  395. package/dist/pipeline/steps/eval-step.js +347 -0
  396. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  397. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  398. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  399. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  400. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  401. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  402. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  403. package/dist/pipeline/steps/publish-report-step.js +243 -0
  404. package/dist/pipeline/steps/report-step.d.ts +13 -0
  405. package/dist/pipeline/steps/report-step.js +56 -0
  406. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  407. package/dist/pipeline/steps/update-scores-step.js +42 -0
  408. package/dist/pipeline/targeted-loo.d.ts +88 -0
  409. package/dist/pipeline/targeted-loo.js +203 -0
  410. package/dist/pipeline/thresholds.d.ts +27 -0
  411. package/dist/pipeline/thresholds.js +245 -0
  412. package/dist/pipeline/types.d.ts +10 -0
  413. package/dist/pipeline/types.js +10 -0
  414. package/dist/pipeline/validate.d.ts +67 -0
  415. package/dist/pipeline/validate.js +406 -0
  416. package/dist/pipeline/webhook-server.d.ts +37 -0
  417. package/dist/pipeline/webhook-server.js +133 -0
  418. package/dist/report-store.d.ts +84 -0
  419. package/dist/report-store.js +208 -0
  420. package/dist/sanity/client.d.ts +38 -0
  421. package/dist/sanity/client.js +86 -0
  422. package/dist/sanity/portable-text.d.ts +11 -0
  423. package/dist/sanity/portable-text.js +211 -0
  424. package/dist/sanity/queries.d.ts +133 -0
  425. package/dist/sanity/queries.js +300 -0
  426. package/dist/schedules/digest.d.ts +116 -0
  427. package/dist/schedules/digest.js +156 -0
  428. package/dist/schedules/index.d.ts +12 -0
  429. package/dist/schedules/index.js +10 -0
  430. package/dist/schedules/loader.d.ts +31 -0
  431. package/dist/schedules/loader.js +73 -0
  432. package/dist/schedules/schema.d.ts +9 -0
  433. package/dist/schedules/schema.js +9 -0
  434. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  435. package/dist/scripts/agent-behavior-report.js +315 -0
  436. package/dist/scripts/baseline.d.ts +43 -0
  437. package/dist/scripts/baseline.js +267 -0
  438. package/dist/scripts/calculate-scores.d.ts +166 -0
  439. package/dist/scripts/calculate-scores.js +1296 -0
  440. package/dist/scripts/compare.d.ts +22 -0
  441. package/dist/scripts/compare.js +334 -0
  442. package/dist/scripts/coverage-audit.d.ts +44 -0
  443. package/dist/scripts/coverage-audit.js +209 -0
  444. package/dist/scripts/debug-eval.d.ts +19 -0
  445. package/dist/scripts/debug-eval.js +73 -0
  446. package/dist/scripts/discovery-report.d.ts +58 -0
  447. package/dist/scripts/discovery-report.js +250 -0
  448. package/dist/scripts/fetch-docs.d.ts +35 -0
  449. package/dist/scripts/fetch-docs.js +472 -0
  450. package/dist/scripts/generate-configs.d.ts +66 -0
  451. package/dist/scripts/generate-configs.js +459 -0
  452. package/dist/scripts/grader-api.d.ts +27 -0
  453. package/dist/scripts/grader-api.js +206 -0
  454. package/dist/scripts/grader-compare.d.ts +22 -0
  455. package/dist/scripts/grader-compare.js +368 -0
  456. package/dist/scripts/grader-consistency.d.ts +20 -0
  457. package/dist/scripts/grader-consistency.js +313 -0
  458. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  459. package/dist/scripts/grader-sensitivity.js +354 -0
  460. package/dist/scripts/grader-validate.d.ts +19 -0
  461. package/dist/scripts/grader-validate.js +267 -0
  462. package/dist/scripts/measure-retrieval.d.ts +10 -0
  463. package/dist/scripts/measure-retrieval.js +145 -0
  464. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
  465. package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
  466. package/dist/scripts/pipeline.d.ts +76 -0
  467. package/dist/scripts/pipeline.js +1031 -0
  468. package/dist/scripts/pr-comment.d.ts +10 -0
  469. package/dist/scripts/pr-comment.js +510 -0
  470. package/dist/scripts/readiness-report.d.ts +88 -0
  471. package/dist/scripts/readiness-report.js +342 -0
  472. package/dist/scripts/update-quality-scores.d.ts +15 -0
  473. package/dist/scripts/update-quality-scores.js +184 -0
  474. package/dist/scripts/validate-task-sources.d.ts +21 -0
  475. package/dist/scripts/validate-task-sources.js +210 -0
  476. package/dist/scripts/validate.d.ts +13 -0
  477. package/dist/scripts/validate.js +79 -0
  478. package/dist/scripts/webhook-server.d.ts +26 -0
  479. package/dist/scripts/webhook-server.js +147 -0
  480. package/dist/scripts/weekly-digest.d.ts +24 -0
  481. package/dist/scripts/weekly-digest.js +144 -0
  482. package/dist/sinks/bigquery/index.d.ts +131 -0
  483. package/dist/sinks/bigquery/index.js +222 -0
  484. package/dist/sinks/format-slack.d.ts +64 -0
  485. package/dist/sinks/format-slack.js +306 -0
  486. package/dist/sinks/index.d.ts +23 -0
  487. package/dist/sinks/index.js +18 -0
  488. package/dist/sinks/loader.d.ts +18 -0
  489. package/dist/sinks/loader.js +82 -0
  490. package/dist/sinks/retry.d.ts +24 -0
  491. package/dist/sinks/retry.js +52 -0
  492. package/dist/sinks/schema.d.ts +9 -0
  493. package/dist/sinks/schema.js +9 -0
  494. package/dist/sinks/slack/format.d.ts +65 -0
  495. package/dist/sinks/slack/format.js +327 -0
  496. package/dist/sinks/slack/index.d.ts +27 -0
  497. package/dist/sinks/slack/index.js +78 -0
  498. package/dist/sinks/slack-sink.d.ts +27 -0
  499. package/dist/sinks/slack-sink.js +78 -0
  500. package/dist/sinks/types.d.ts +59 -0
  501. package/dist/sinks/types.js +44 -0
  502. package/dist/sinks/webhook/index.d.ts +19 -0
  503. package/dist/sinks/webhook/index.js +50 -0
  504. package/dist/sinks/webhook-sink.d.ts +19 -0
  505. package/dist/sinks/webhook-sink.js +50 -0
  506. package/dist/sources.d.ts +104 -0
  507. package/dist/sources.js +292 -0
  508. package/dist/webhook/budget.d.ts +42 -0
  509. package/dist/webhook/budget.js +60 -0
  510. package/dist/webhook/debounce.d.ts +67 -0
  511. package/dist/webhook/debounce.js +76 -0
  512. package/dist/webhook/dispatch.d.ts +45 -0
  513. package/dist/webhook/dispatch.js +84 -0
  514. package/dist/webhook/eval-request-handler.d.ts +87 -0
  515. package/dist/webhook/eval-request-handler.js +181 -0
  516. package/dist/webhook/handler.d.ts +88 -0
  517. package/dist/webhook/handler.js +203 -0
  518. package/dist/webhook/index.d.ts +17 -0
  519. package/dist/webhook/index.js +12 -0
  520. package/dist/webhook/types.d.ts +109 -0
  521. package/dist/webhook/types.js +10 -0
  522. package/package.json +72 -0
  523. package/tasks/.expanded.agentic.yaml +51 -0
  524. package/tasks/.expanded.yaml +66 -0
  525. package/tasks/frameworks.yaml +98 -0
  526. package/tasks/functions.yaml +51 -0
  527. package/tasks/groq.yaml +216 -0
  528. package/tasks/nextjs-live.yaml +62 -0
  529. package/tasks/studio-setup.yaml +111 -0
  530. package/tasks/visual-editing.yaml +120 -0
@@ -0,0 +1,1128 @@
1
+ /**
2
+ * pipeline/calculate-scores.ts
3
+ *
4
+ * Reads Promptfoo evaluation output and computes the AI Literacy Score
5
+ * for each feature area. Each dimension is scored on a uniform 0–100 scale:
6
+ *
7
+ * Task Completion (0–100) — Can the LLM implement the feature?
8
+ * Code Correctness (0–100) — Is the code idiomatic and correct?
9
+ * Doc Coverage (0–100) — Did docs provide the needed info?
10
+ *
11
+ * Dimensions are combined into a weighted composite (0–100) using weights
12
+ * from config/rubrics.yaml (default: Task×0.50 + Code×0.25 + Docs×0.25).
13
+ *
14
+ * Additionally compares with-docs vs without-docs scores to calculate
15
+ * the "Doc Lift" — how much documentation helps vs parametric knowledge.
16
+ *
17
+ * When tests are run with the InstrumentedProvider (agent-observer),
18
+ * this script also aggregates and reports agent behavior data: which
19
+ * documentation pages were visited, what searches were performed, and
20
+ * overall network activity patterns.
21
+ *
22
+ * All functions accept rootDir as a parameter — no module-level constants.
23
+ * No process.argv parsing. No env var fallbacks.
24
+ *
25
+ * @see docs/exec-plans/active/eliminate-lib-layer.md
26
+ */
27
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
28
+ import { join } from "path";
29
+ import { calculateCost } from "../agent-observer/pricing.js";
30
+ import { checkResultsExist } from "./checks.js";
31
+ import { loadRubricTemplates } from "./expand-tasks.js";
32
+ import { loadSource } from "../sources.js";
33
+ import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
34
+ import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
35
+ // Re-export pure functions from core for backward compatibility.
36
+ // Existing imports from this file continue to work unchanged.
37
+ export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
/**
 * Score the evaluation results separately for every model found in the
 * Promptfoo output.
 *
 * Results are grouped by `providerId`, falling back to `providerLabel` and
 * then to the literal "unknown". The display label prefers `providerLabel`,
 * then `providerId`, then "unknown"; the first result seen for a model
 * decides its label.
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @param weights Dimension weights, forwarded unchanged to scoreResults
 * @returns Array with one `{ label, modelId, overall, scores }` entry per
 * model, or null when at most one model is present (the breakdown would
 * only repeat the overall scores).
 */
export function calculateScoresPerModel(resultsPath, weights) {
  // Bucket the normalized results by the model that produced them.
  const groups = {};
  for (const entry of readAndNormalizeResults(resultsPath)) {
    const modelId = entry.providerId ?? entry.providerLabel ?? "unknown";
    if (!groups[modelId]) {
      groups[modelId] = {
        label: entry.providerLabel ?? entry.providerId ?? "unknown",
        results: [],
      };
    }
    groups[modelId].results.push(entry);
  }
  if (Object.keys(groups).length <= 1) {
    return null;
  }
  // Small fold helpers shared by the aggregate fields below.
  const total = (items, pick) => items.reduce((acc, item) => acc + pick(item), 0);
  const mean = (items, pick) => items.length > 0 ? total(items, pick) / items.length : 0;
  return Object.entries(groups).map(([modelId, group]) => {
    const scores = scoreResults(group.results, weights, modelId);
    const cost = total(scores, (s) => s.totalCost);
    return {
      label: group.label,
      modelId,
      overall: {
        avgDocLift: mean(scores, (s) => s.docLift),
        avgScore: mean(scores, (s) => s.totalScore),
        // A zero cost is reported as "no cost data".
        cost: cost > 0 ? cost : undefined,
        testCount: total(scores, (s) => s.testCount),
      },
      scores,
    };
  });
}
91
+ // classifyRubric, detectFeatureArea — imported from @sanity/ailf-core above
92
+ // ---------------------------------------------------------------------------
93
+ // URL extraction from assertion metadata
94
+ // ---------------------------------------------------------------------------
/**
 * Collect one judgment record per classified `llm-rubric` assertion found in
 * the evaluation results.
 *
 * Each record carries the hyphenated dimension name, the model id, the
 * grader's reason text, the parsed score and the task description. When the
 * assertion's `reason` is a JSON document with a string `reason` field, that
 * inner string is used; otherwise the raw reason is kept.
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @returns Array of `{ dimension, modelId, reason, score, taskId }` objects
 */
export function extractGraderJudgments(resultsPath) {
  // Internal dimension names -> hyphenated form used in the output.
  const hyphenated = {
    codeCorrectness: "code-correctness",
    docCoverage: "doc-coverage",
    taskCompletion: "task-completion",
  };
  // Unwrap `{"reason": "..."}` payloads; anything else is returned untouched.
  const unwrapReason = (raw) => {
    if (!raw) {
      return raw;
    }
    try {
      const decoded = JSON.parse(raw);
      return typeof decoded.reason === "string" ? decoded.reason : raw;
    }
    catch {
      // Not JSON (or not an object) — keep the raw reason
      return raw;
    }
  };
  return readAndNormalizeResults(resultsPath).flatMap((result) => {
    const modelId = result.providerId ?? result.providerLabel ?? "unknown";
    const collected = [];
    for (const comp of result.gradingResult.componentResults) {
      if (comp.assertion?.type !== "llm-rubric") {
        continue;
      }
      const kind = classifyRubric(comp);
      if (!kind) {
        continue;
      }
      const score = parseRubricScore(comp);
      collected.push({
        dimension: hyphenated[kind] ?? kind,
        modelId,
        reason: unwrapReason(comp.reason ?? ""),
        score,
        taskId: result.description,
      });
    }
    return collected;
  });
}
152
+ /**
153
+ * Finds the URL-extraction assertion result in a test's componentResults
154
+ * and parses the structured JSON from its `reason` field.
155
+ */
156
+ // extractUrlMetadata, parseRubricScore — imported from @sanity/ailf-core above
/**
 * Summarize agent behavior per feature area.
 *
 * Only results that carry a behavior summary contribute. For each feature
 * the numeric fields are averaged and the slug / domain / query lists are
 * de-duplicated. The output is sorted by feature name.
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @returns One summary object per feature, or null when no result carries
 * behavior data
 */
function aggregateAgentBehavior(resultsPath) {
  const grouped = {};
  for (const result of readAndNormalizeResults(resultsPath)) {
    const feature = detectFeatureArea(result.description);
    const behavior = extractAgentBehavior(result);
    if (!behavior) {
      continue;
    }
    if (!grouped[feature]) {
      grouped[feature] = [];
    }
    grouped[feature].push(behavior);
  }
  const features = Object.keys(grouped);
  if (features.length === 0) {
    return null;
  }
  const summaries = features.map((feature) => {
    const behaviors = grouped[feature];
    const divisor = behaviors.length || 1;
    const average = (pick) => behaviors.reduce((sum, b) => sum + pick(b), 0) / divisor;
    const distinct = (pick) => [...new Set(behaviors.flatMap(pick))];
    return {
      avgDocPagesVisited: average((b) => b.docPagesVisited),
      avgNetworkTimeMs: average((b) => b.totalNetworkMs),
      avgSearchesPerformed: average((b) => b.searchesPerformed),
      docSlugsVisited: distinct((b) => b.docSlugsVisited),
      externalDomains: distinct((b) => b.externalDomains),
      feature,
      searchQueries: distinct((b) => b.uniqueSearchQueries),
      tasksWithBehaviorData: behaviors.length,
    };
  });
  summaries.sort((left, right) => left.feature.localeCompare(right.feature));
  return summaries;
}
/**
 * Compute aggregate source isolation metrics for a results file.
 *
 * Every URL listed under `metadata.agentBehavior.docPageVisits` of every
 * result is collected and handed to analyzeSourceIsolation together with the
 * allowed origins.
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @param allowedOriginsParam Allowed origins supplied by the caller
 * @returns The analyzeSourceIsolation result, or null when no origins were
 * supplied or no doc page visit was recorded
 */
function aggregateSourceIsolation(resultsPath, allowedOriginsParam) {
  if (!allowedOriginsParam || allowedOriginsParam.length === 0) {
    return null;
  }
  const visits = readAndNormalizeResults(resultsPath).flatMap((result) => {
    const pages = result.metadata?.agentBehavior?.docPageVisits;
    // Keep only the URL of each visit; other visit fields are not needed.
    return pages ? [...pages].map((visit) => ({ url: visit.url })) : [];
  });
  return visits.length === 0
    ? null
    : analyzeSourceIsolation(visits, allowedOriginsParam);
}
233
+ // ---------------------------------------------------------------------------
234
+ // Feature area detection
235
+ // ---------------------------------------------------------------------------
/**
 * Tally referenced URLs per feature area, split into the "gold" variant
 * (result has non-blank `vars.docs`) and the "baseline" variant.
 *
 * A feature gets an entry as soon as one of its results is seen, but a
 * result only counts towards `testCount` / `urls` when URL metadata could be
 * extracted from it. The output is sorted by feature name.
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @returns Array of `{ feature, baseline, gold }` where each variant is
 * `{ testCount, urls }` and `urls` maps a URL to its occurrence count
 */
function aggregateUrlReferences(resultsPath) {
  const tallies = {};
  for (const result of readAndNormalizeResults(resultsPath)) {
    const feature = detectFeatureArea(result.description);
    if (!tallies[feature]) {
      tallies[feature] = {
        baseline: { testCount: 0, urls: {} },
        gold: { testCount: 0, urls: {} },
      };
    }
    const { docs } = result.vars;
    const variant = docs && docs.trim().length > 0 ? "gold" : "baseline";
    const bucket = tallies[feature][variant];
    const meta = extractUrlMetadata(result);
    if (!meta) {
      continue;
    }
    bucket.testCount += 1;
    for (const url of meta.sanityUrls) {
      bucket.urls[url] = (bucket.urls[url] || 0) + 1;
    }
  }
  const rows = Object.entries(tallies).map(([feature, variants]) => ({
    feature,
    ...variants,
  }));
  rows.sort((left, right) => left.feature.localeCompare(right.feature));
  return rows;
}
268
+ // ---------------------------------------------------------------------------
269
+ // Score calculation
270
+ // ---------------------------------------------------------------------------
/**
 * Build the source verification section of the score summary.
 *
 * Combines the pipeline configuration (mode, search mode, allowed origins,
 * source base URL) with the URL fetch metadata stored in
 * `<root>/contexts/url-fetch.json`, when that file exists.
 *
 * The metadata file is treated as best-effort input: an unreadable file,
 * malformed JSON, or a JSON value that is not a plain object is ignored, and
 * a missing / non-array `fetchedUrls` is reported as an empty list instead of
 * throwing.
 *
 * @param root Project root directory
 * @param source Resolved source (only `baseUrl` is read); may be undefined
 * @param verificationCtx Optional `{ mode, searchMode, allowedOrigins }`
 * @returns `{ mode, source }` plus `allowedOrigins`, `searchMode` and
 * `urlFetch` when the corresponding data is available
 */
function buildSourceVerification(root, source, verificationCtx) {
  const mode = verificationCtx?.mode ?? "baseline";
  const sourceUrl = source?.baseUrl ?? "default";
  const searchMode = verificationCtx?.searchMode;
  const allowedOrigins = verificationCtx?.allowedOrigins;
  // Read URL fetch metadata if it exists
  let urlFetch;
  const urlFetchPath = join(root, "contexts", "url-fetch.json");
  if (existsSync(urlFetchPath)) {
    try {
      const parsed = JSON.parse(readFileSync(urlFetchPath, "utf-8"));
      // Only a plain object can carry the fields read below.
      if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
        urlFetch = parsed;
      }
    }
    catch {
      // Unreadable file or malformed JSON — skip
    }
  }
  const fetchedUrls = Array.isArray(urlFetch?.fetchedUrls)
    ? urlFetch.fetchedUrls
    : [];
  return {
    ...(allowedOrigins && { allowedOrigins }),
    mode,
    ...(searchMode && { searchMode }),
    source: sourceUrl,
    ...(urlFetch && {
      urlFetch: {
        failures: urlFetch.failures,
        // Keep only the fields the report needs from each fetch record.
        fetchedUrls: fetchedUrls.map((f) => ({
          method: f.method,
          status: f.status,
          url: f.url,
        })),
        totalFailed: urlFetch.totalFailed,
        totalFetched: urlFetch.totalFetched,
      },
    }),
  };
}
313
+ // ---------------------------------------------------------------------------
314
+ // Agent behavior aggregation
315
+ // ---------------------------------------------------------------------------
/**
 * Score every result in the file together, without splitting by model.
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @param weights Dimension weights, forwarded unchanged to scoreResults
 * @returns The per-feature scores produced by scoreResults
 */
function calculateScores(resultsPath, weights) {
  return scoreResults(readAndNormalizeResults(resultsPath), weights);
}
/**
 * Return the `agentBehaviorSummary` stored in a test result's metadata.
 *
 * @param test A normalized test result
 * @returns The summary, or null when the result has no metadata or no
 * (truthy) `agentBehaviorSummary`
 */
function extractAgentBehavior(test) {
  const summary = test.metadata?.agentBehaviorSummary;
  return summary ? summary : null;
}
/**
 * Extract grader (assertion) token usage and cost from a raw Promptfoo
 * results file.
 *
 * Token counts are read from `results.stats.tokenUsage.assertions`. The
 * grader model is taken from `config.defaultTest.options.rubricProvider`,
 * falling back to `config.defaultTest.options.provider`. The provider may be
 * a plain string ("openai:gpt-5-2025-08-07") or an object with a string `id`;
 * any other shape is treated as "model unknown" rather than crashing.
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @returns `{ completionTokens, cost, model, promptTokens, totalTokens }`,
 * or null when no assertion tokens were recorded. `cost` is 0 and `model` is
 * undefined when the grader model cannot be determined.
 */
function extractGraderCost(resultsPath) {
  const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
  const assertions = file.results?.stats?.tokenUsage?.assertions;
  if (!assertions || assertions.total === 0) {
    return null;
  }
  // Extract grader model from config (string or `{ id }` object form)
  const graderOptions = file.config?.defaultTest?.options;
  const configured = graderOptions?.rubricProvider ?? graderOptions?.provider;
  let graderModel;
  if (typeof configured === "string") {
    graderModel = configured;
  }
  else if (typeof configured?.id === "string") {
    graderModel = configured.id;
  }
  // Strip the "vendor:" prefix to get the bare model name
  const modelName = graderModel?.replace(/^[^:]+:/, "");
  const cost = modelName
    ? calculateCost(modelName, assertions.prompt, assertions.completion)
    : 0;
  return {
    completionTokens: assertions.completion,
    cost,
    model: graderModel,
    promptTokens: assertions.prompt,
    totalTokens: assertions.total,
  };
}
/**
 * Print the agent behavior section of the report to stdout.
 *
 * Output order: banner, summary table (one row per feature), the doc pages
 * visited per feature, the search queries per feature (section omitted when
 * no feature has any), and the de-duplicated external domains (section
 * omitted when empty).
 *
 * @param agentBehavior Array of per-feature behavior summaries
 */
function printAgentBehaviorReport(agentBehavior) {
  const rule = "-".repeat(80);
  console.log(rule);
  console.log("AGENT BEHAVIOR OBSERVATION");
  console.log(rule);
  console.log();
  // Summary table
  console.log("| Feature Area | Tests | Doc Pages | Searches | Net (ms) |");
  console.log("|---------------------|-------|-----------|----------|----------|");
  agentBehavior.forEach((entry) => {
    const cells = [
      entry.feature.padEnd(19),
      entry.tasksWithBehaviorData.toString().padStart(5),
      entry.avgDocPagesVisited.toFixed(1).padStart(9),
      entry.avgSearchesPerformed.toFixed(1).padStart(8),
      Math.round(entry.avgNetworkTimeMs).toString().padStart(8),
    ];
    console.log(`| ${cells.join(" | ")} |`);
  });
  console.log();
  // Doc pages visited
  console.log(" Doc pages visited:");
  for (const { feature, docSlugsVisited } of agentBehavior) {
    if (docSlugsVisited.length > 0) {
      console.log(` ${feature}:`);
      docSlugsVisited.forEach((slug) => {
        console.log(` - /docs/${slug}`);
      });
    }
    else {
      console.log(` ${feature}: (none)`);
    }
  }
  console.log();
  // Search queries
  const withQueries = agentBehavior.filter((entry) => entry.searchQueries.length > 0);
  if (withQueries.length > 0) {
    console.log(" Search queries:");
    for (const { feature, searchQueries } of withQueries) {
      console.log(` ${feature}:`);
      searchQueries.forEach((query) => {
        console.log(` - "${query}"`);
      });
    }
    console.log();
  }
  // External domains
  const domains = new Set(agentBehavior.flatMap((entry) => entry.externalDomains));
  if (domains.size > 0) {
    console.log(" External domains contacted:");
    for (const domain of domains) {
      console.log(` - ${domain}`);
    }
    console.log();
  }
}
425
+ // ---------------------------------------------------------------------------
426
+ // Report
427
+ // ---------------------------------------------------------------------------
428
+ // ---------------------------------------------------------------------------
429
+ // Grader cost extraction
430
+ // ---------------------------------------------------------------------------
/**
 * Read a raw Promptfoo output file and normalize each result into a flat
 * record: `description` and `vars` are pulled up from `testCase` when needed,
 * `cost` defaults to 0, and the provider id / label are exposed separately.
 *
 * Results without a grading result (null OR missing — Promptfoo leaves it
 * empty when a test errors before grading) are dropped, and a warning listing
 * each dropped result is written to stderr. Dropping them here keeps callers
 * from dereferencing `gradingResult.componentResults` on an empty value.
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @returns Array of normalized results that have a grading result
 * @throws TypeError when the file contains no `results` array
 * @throws When the file cannot be read or is not valid JSON
 */
function readAndNormalizeResults(resultsPath) {
  const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
  // The results array sits either under `results.results` or `results`.
  const wrapper = file.results ?? file;
  if (!Array.isArray(wrapper?.results)) {
    throw new TypeError(`No results array found in ${resultsPath}`);
  }
  const all = wrapper.results.map((r) => ({
    cost: r.cost ?? 0,
    description: r.testCase?.description ?? "unknown",
    error: r.error,
    gradingResult: r.gradingResult,
    metadata: r.metadata,
    provider: r.provider?.label ?? r.provider?.id,
    providerId: r.provider?.id,
    providerLabel: r.provider?.label,
    response: r.response,
    vars: r.vars ?? r.testCase?.vars ?? {},
  }));
  // `== null` matches both null and undefined grading results.
  const isGraded = (r) => r.gradingResult != null;
  const valid = all.filter(isGraded);
  const skipped = all.length - valid.length;
  if (skipped > 0) {
    console.warn(` ⚠ Skipping ${skipped} of ${all.length} result(s) with null gradingResult (errored tests):`);
    for (const r of all) {
      if (isGraded(r)) {
        continue;
      }
      const providerLabel = r.provider ? `[${r.provider}] ` : "";
      // `error` is normally a string; coerce so a structured error cannot
      // break the warning itself.
      const errorMsg = r.error
        ? String(r.error).slice(0, 150)
        : "unknown error (no error field in result)";
      console.warn(` ✗ ${providerLabel}"${r.description}" — ${errorMsg}`);
    }
  }
  return valid;
}
/**
 * Shared scoring routine: turns a list of normalized test results into one
 * score record per feature area, sorted by feature name.
 *
 * Results with non-blank `vars.docs` form the "with docs" group, all others
 * the "without docs" baseline.
 *  - With docs: the `llm-rubric` scores are summed per dimension, divided by
 *    the number of with-docs tests, and combined with the weights into the
 *    composite (`ceilingScore` / `totalScore`).
 *  - Without docs: every `llm-rubric` score is averaged, unweighted, into
 *    `floorScore`.
 *
 * @param results Pre-filtered (valid) test results
 * @param weights Dimension weights; missing keys default to 0.5 / 0.25 / 0.25
 * @param modelId Optional model identifier copied onto each record
 * @returns Array of per-feature score records
 */
function scoreResults(results, weights, modelId) {
  const taskWeight = weights["task-completion"] ?? 0.5;
  const codeWeight = weights["code-correctness"] ?? 0.25;
  const docWeight = weights["doc-coverage"] ?? 0.25;
  // Split the results per feature into docs / no-docs groups.
  const grouped = {};
  for (const result of results) {
    const feature = detectFeatureArea(result.description);
    if (!grouped[feature]) {
      grouped[feature] = { withDocs: [], withoutDocs: [] };
    }
    const { docs } = result.vars;
    const target = docs && docs.trim().length > 0 ? "withDocs" : "withoutDocs";
    grouped[feature][target].push(result);
  }
  const scores = Object.entries(grouped).map(([feature, { withDocs, withoutDocs }]) => {
    let cost = 0;
    // --- With docs: per-dimension sums ---
    let taskSum = 0;
    let codeSum = 0;
    let docSum = 0;
    for (const test of withDocs) {
      cost += test.cost;
      for (const comp of test.gradingResult.componentResults) {
        if (comp.assertion?.type !== "llm-rubric") {
          continue;
        }
        const value = parseRubricScore(comp);
        switch (classifyRubric(comp)) {
          case "taskCompletion":
            taskSum += value;
            break;
          case "codeCorrectness":
            codeSum += value;
            break;
          case "docCoverage":
            docSum += value;
            break;
          default:
            break;
        }
      }
    }
    // Guard the division when a feature has baseline tests only.
    const divisor = withDocs.length || 1;
    const avgTask = taskSum / divisor;
    const avgCode = codeSum / divisor;
    const avgDoc = docSum / divisor;
    const composite = avgTask * taskWeight + avgCode * codeWeight + avgDoc * docWeight;
    // --- Without docs: flat average over all rubric scores ---
    let baselineSum = 0;
    let baselineRubrics = 0;
    for (const test of withoutDocs) {
      cost += test.cost;
      for (const comp of test.gradingResult.componentResults) {
        if (comp.assertion?.type !== "llm-rubric") {
          continue;
        }
        baselineSum += parseRubricScore(comp);
        baselineRubrics += 1;
      }
    }
    const ceilingScore = Math.round(composite);
    const floorScore = Math.round(baselineRubrics > 0 ? baselineSum / baselineRubrics : 0);
    const docLift = ceilingScore - floorScore;
    return {
      ceilingScore,
      codeCorrectness: Math.round(avgCode),
      docCoverage: Math.round(avgDoc),
      docLift,
      docQualityGap: 100 - ceilingScore,
      feature,
      floorScore,
      ...(modelId && { modelId }),
      negativeDocLift: docLift < 0,
      taskCompletion: Math.round(avgTask),
      testCount: withDocs.length,
      totalCost: cost,
      totalScore: ceilingScore,
    };
  });
  return scores.sort((left, right) => left.feature.localeCompare(right.feature));
}
564
+ // ---------------------------------------------------------------------------
565
+ // Agentic scoring — all results are "actual" (agent retrieves docs via tools)
566
+ // ---------------------------------------------------------------------------
/**
 * Score agentic evaluation results.
 *
 * Unlike scoreResults, no docs / no-docs split is made: every result of a
 * feature counts towards its "actual" score. The `llm-rubric` scores are
 * summed per dimension, divided by the number of results for the feature,
 * and combined with the weights into `actualScore`.
 *
 * @param resultsPath Path to the agentic Promptfoo results JSON file
 * @param weights Dimension weights; missing keys default to 0.5 / 0.25 / 0.25
 * @returns Record keyed by feature area with `{ actualScore,
 * codeCorrectness, docCoverage, taskCompletion, testCount, totalCost }`
 */
export function scoreAgenticResults(resultsPath, weights) {
  const taskWeight = weights["task-completion"] ?? 0.5;
  const codeWeight = weights["code-correctness"] ?? 0.25;
  const docWeight = weights["doc-coverage"] ?? 0.25;
  // Group by feature area
  const grouped = {};
  for (const result of readAndNormalizeResults(resultsPath)) {
    const feature = detectFeatureArea(result.description);
    if (!grouped[feature]) {
      grouped[feature] = [];
    }
    grouped[feature].push(result);
  }
  const entries = {};
  for (const feature of Object.keys(grouped)) {
    const tests = grouped[feature];
    let taskSum = 0;
    let codeSum = 0;
    let docSum = 0;
    let cost = 0;
    for (const test of tests) {
      cost += test.cost;
      for (const comp of test.gradingResult.componentResults) {
        if (comp.assertion?.type !== "llm-rubric") {
          continue;
        }
        const value = parseRubricScore(comp);
        switch (classifyRubric(comp)) {
          case "taskCompletion":
            taskSum += value;
            break;
          case "codeCorrectness":
            codeSum += value;
            break;
          case "docCoverage":
            docSum += value;
            break;
          default:
            break;
        }
      }
    }
    const divisor = tests.length || 1;
    const avgTask = taskSum / divisor;
    const avgCode = codeSum / divisor;
    const avgDoc = docSum / divisor;
    entries[feature] = {
      actualScore: Math.round(avgTask * taskWeight + avgCode * codeWeight + avgDoc * docWeight),
      codeCorrectness: Math.round(avgCode),
      docCoverage: Math.round(avgDoc),
      taskCompletion: Math.round(avgTask),
      testCount: tests.length,
      totalCost: cost,
    };
  }
  return entries;
}
626
+ // ---------------------------------------------------------------------------
627
+ // Score merging — combine baseline floor/ceiling with agentic actual
628
+ // ---------------------------------------------------------------------------
629
+ /**
630
+ * Merge baseline FeatureScore[] with agentic actual scores to produce
631
+ * the full three-layer decomposition.
632
+ *
633
+ * The merge is per feature area. For each area:
634
+ * - If baseline data exists: floor, ceiling, docLift, docQualityGap are populated
635
+ * - If agentic data exists: actualScore is populated
636
+ * - If both exist: retrievalGap and infrastructureEfficiency are computed
637
+ *
638
+ * @param baselineScores Floor/ceiling scores from baseline evaluation (may be empty)
639
+ * @param agenticScores Actual scores from agentic evaluation (may be empty)
640
+ */
641
+ // mergeScores — imported from @sanity/ailf-core above
642
+ const CRITICAL_THRESHOLD = 40;
/**
 * Compute the scores for a results directory, print the report, and persist
 * the summary files.
 *
 * Reads `options.resultsPath` (default
 * `<rootDir>/results/latest/eval-results.json`) and, in "full" mode,
 * `<rootDir>/results/latest/eval-results-agentic.json` when that file exists.
 * Writes `score-summary.json` and — if any judgments were extracted —
 * `grader-judgments.json` into `<rootDir>/results/latest`.
 *
 * Terminates the process with exit code 1 when results validation reports an
 * error, or when the summary lists any area in `belowCritical`.
 *
 * @param options.rootDir Project root directory
 * @param options.source Source name, used for lookup and display
 * @param options.resolvedSource Pre-resolved source; wins over name lookup
 * @param options.mode Evaluation mode; defaults to "baseline"
 * @param options.resultsPath Override for the baseline results file
 * @param options.allowedOrigins Allowed origins for source isolation
 * @param options.searchMode Search mode recorded in the verification data
 */
export function calculateAndWriteScores(options) {
  const ROOT = options.rootDir;
  const sourceName = options.source;
  // Pre-resolved source wins over name-based lookup
  let source = options.resolvedSource;
  if (!source) {
    try {
      source = loadSource(sourceName);
    }
    catch {
      console.warn(` [warn] Could not load source "${sourceName}", proceeding without source metadata`);
    }
  }
  // Determine mode — controls which result files are read
  const mode = options.mode ?? "baseline";
  const baselineResultsPath = options.resultsPath ?? join(ROOT, "results", "latest", "eval-results.json");
  // Agentic results path (only used in full mode)
  const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
  // Validate baseline results file
  const resultsIssues = checkResultsExist(ROOT, baselineResultsPath);
  const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
  if (resultsErrors.length > 0) {
    console.error("❌ Results validation failed:");
    for (const e of resultsErrors) {
      console.error(` ERROR: ${e.message}`);
      if (e.path) {
        console.error(` at ${e.path}`);
      }
    }
    console.error("\nRun 'pnpm eval' first to generate results, then 'pnpm calculate-scores'.");
    process.exit(1);
  }
  console.log(`Reading results from: ${baselineResultsPath}`);
  if (source) {
    console.log(`Source: ${sourceName} (${source.baseUrl})`);
  }
  // Load dimension weights from rubrics.yaml
  const rubricConfig = loadRubricTemplates(ROOT);
  const baselineScores = calculateScores(baselineResultsPath, rubricConfig.weights);
  const perModel = calculateScoresPerModel(baselineResultsPath, rubricConfig.weights);
  const urlRefs = aggregateUrlReferences(baselineResultsPath);
  const sourceVerification = buildSourceVerification(ROOT, source, {
    allowedOrigins: options.allowedOrigins,
    mode,
    searchMode: options.searchMode,
  });
  let graderCost = extractGraderCost(baselineResultsPath);
  // Decide once whether agentic results take part, so the scoring and the
  // judgment extraction below cannot disagree.
  const useAgenticResults = mode === "full" && existsSync(agenticResultsPath);
  let scores;
  let agentBehavior = null;
  let sourceIsolation = null;
  let evaluationMode;
  if (useAgenticResults) {
    // Full mode: merge baseline floor/ceiling with agentic actual scores
    console.log(`\nReading agentic results from: ${agenticResultsPath}`);
    const agenticScores = scoreAgenticResults(agenticResultsPath, rubricConfig.weights);
    scores = mergeScores(baselineScores, agenticScores);
    evaluationMode = "full";
    // Aggregate agent behavior and source isolation from agentic results
    agentBehavior = aggregateAgentBehavior(agenticResultsPath);
    sourceIsolation = aggregateSourceIsolation(agenticResultsPath, options.allowedOrigins);
    // Merge grader costs from both files. If only the agentic file recorded
    // grader usage, report that instead of dropping it.
    const agenticGraderCost = extractGraderCost(agenticResultsPath);
    if (graderCost && agenticGraderCost) {
      graderCost = {
        ...graderCost,
        completionTokens: graderCost.completionTokens + agenticGraderCost.completionTokens,
        cost: graderCost.cost + agenticGraderCost.cost,
        promptTokens: graderCost.promptTokens + agenticGraderCost.promptTokens,
        totalTokens: graderCost.totalTokens + agenticGraderCost.totalTokens,
      };
    }
    else if (agenticGraderCost) {
      graderCost = agenticGraderCost;
    }
  }
  else {
    if (mode === "full") {
      // Make the silent downgrade visible to whoever reads the log.
      console.warn(` [warn] Agentic results not found at ${agenticResultsPath}, reporting baseline scores only`);
    }
    scores = baselineScores;
    agentBehavior = aggregateAgentBehavior(baselineResultsPath);
    sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options.allowedOrigins);
    evaluationMode = mode === "agentic" || mode === "observed" ? mode : "baseline";
  }
  const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode);
  // Persist
  const outDir = join(ROOT, "results", "latest");
  mkdirSync(outDir, { recursive: true });
  writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
  console.log("Score summary written to results/latest/score-summary.json");
  // Extract and persist grader judgments (Phase 3a: failure mode extraction)
  const judgments = extractGraderJudgments(baselineResultsPath);
  // In full mode, also extract judgments from agentic results
  if (useAgenticResults) {
    judgments.push(...extractGraderJudgments(agenticResultsPath));
  }
  if (judgments.length > 0) {
    writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
    console.log(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
  }
  // Exit with non-zero if any area below critical threshold
  if (summary.belowCritical.length > 0) {
    process.exit(1);
  }
}
/**
 * Print the per-model section of the report to stdout.
 *
 * Output order: banner, a model summary table sorted by average score
 * (highest first), a per-feature table for every model, and a
 * cost-per-quality-point list for the models that report a positive cost.
 *
 * Every section shows a model by its label, falling back to the model id
 * when the label is empty.
 *
 * @param perModel Array of `{ label, modelId, overall, scores }` entries
 */
function printPerModelReport(perModel) {
  // Single source of truth for the name shown in every section.
  const displayNameOf = (entry) => entry.label || entry.modelId;
  console.log("-".repeat(80));
  console.log("PER-MODEL BREAKDOWN");
  console.log("-".repeat(80));
  console.log();
  // Model summary table
  const h = "| Model | Avg Score | Avg Lift | Tests | Cost |";
  const sep = "|--------------------------------|-----------|----------|-------|----------|";
  console.log(h);
  console.log(sep);
  // Sort a copy so the caller's array keeps its order.
  const sorted = [...perModel].sort((a, b) => b.overall.avgScore - a.overall.avgScore);
  for (const entry of sorted) {
    const costStr = entry.overall.cost
      ? `$${entry.overall.cost.toFixed(4)}`
      : "—";
    const liftStr = entry.overall.avgDocLift >= 0
      ? `+${entry.overall.avgDocLift.toFixed(1)}`
      : entry.overall.avgDocLift.toFixed(1);
    console.log(`| ${displayNameOf(entry).padEnd(30)} | ` +
      `${entry.overall.avgScore.toFixed(1).padStart(9)} | ` +
      `${liftStr.padStart(8)} | ` +
      `${entry.overall.testCount.toString().padStart(5)} | ` +
      `${costStr.padStart(8)} |`);
  }
  console.log();
  // Per-model × per-area breakdown
  for (const entry of sorted) {
    console.log(` ${displayNameOf(entry)} (${entry.modelId}):`);
    const areaH = " | Feature Area | Task | Code | Docs | Total | Lift |";
    const areaSep = " |---------------------|------|------|------|-------|------|";
    console.log(areaH);
    console.log(areaSep);
    for (const s of entry.scores) {
      const lift = s.docLift >= 0 ? `+${s.docLift}` : `${s.docLift}`;
      console.log(` | ${s.feature.padEnd(19)} | ` +
        `${s.taskCompletion.toString().padStart(4)} | ` +
        `${s.codeCorrectness.toString().padStart(4)} | ` +
        `${s.docCoverage.toString().padStart(4)} | ` +
        `${s.totalScore.toString().padStart(5)} | ` +
        `${lift.padStart(4)} |`);
    }
    console.log();
  }
  // Cost-per-quality-point
  const modelsWithCost = sorted.filter((e) => e.overall.cost && e.overall.cost > 0);
  if (modelsWithCost.length > 0) {
    console.log(" Cost per quality point:");
    for (const entry of modelsWithCost) {
      // A zero average score would divide by zero; report 0 instead.
      const costPerPoint = entry.overall.avgScore > 0
        ? (entry.overall.cost ?? 0) / entry.overall.avgScore
        : 0;
      console.log(` ${displayNameOf(entry)}: $${costPerPoint.toFixed(6)}/point (score: ${entry.overall.avgScore.toFixed(1)}, cost: $${(entry.overall.cost ?? 0).toFixed(4)})`);
    }
    console.log();
  }
}
805
+ // ---------------------------------------------------------------------------
806
+ // Main
807
+ // ---------------------------------------------------------------------------
808
/**
 * Prints the full score report to stdout and returns the summary object
 * built from the same data.
 *
 * Sections printed, in order: score table, OKR status, ceiling decomposition,
 * three-layer decomposition (only when some score has `actualScore`), cost
 * summary (only when provider or grader cost is positive), per-model
 * breakdown (when `perModel` is given), URL references, agent behavior (when
 * non-empty) and source verification (when either source argument is given).
 *
 * NOTE: `scores` must be non-empty. The lowest-area lookup uses `reduce`
 * without a seed value, which throws a TypeError on an empty array.
 *
 * @param {Array<object>} scores - Per-feature-area rows. Read for `feature`,
 *   `taskCompletion`, `codeCorrectness`, `docCoverage`, `totalScore`,
 *   `floorScore`, `ceilingScore`, `docLift`, `docQualityGap`,
 *   `negativeDocLift`, `totalCost`, `testCount`, and the optional
 *   `actualScore`, `retrievalGap`, `infrastructureEfficiency`,
 *   `invertedRetrievalGap`.
 * @param {Array<object>} urlRefs - Passed to `printUrlReport` and returned
 *   as `urlReferences`.
 * @param {object} [source] - Read for `baseUrl`, `dataset`, `name`,
 *   `perspective`, `projectId`; `name` defaults to "default".
 * @param {Array<object>} [agentBehavior] - Per-area agent behavior stats.
 * @param {object} [graderCost] - Read for `cost`, `model`, `totalTokens`.
 * @param {Array<object>} [perModel] - Passed to `printPerModelReport`.
 * @param {object} [sourceIsolation] - Read for `isolationScore`, `onOrigin`,
 *   `offOrigin`, `total`, `offOriginUrls`, `originBreakdown`.
 * @param {object} [sourceVerification] - Read for `source`, `mode`,
 *   `allowedOrigins`, `searchMode`, `urlFetch`.
 * @param {*} [evaluationMode] - Copied into the summary when truthy.
 * @returns {object} Summary with `belowCritical`, `lowestArea`,
 *   `lowestScore`, `overall` aggregates, `scores`, `source`, `timestamp`,
 *   `urlReferences`, plus the optional inputs that were supplied.
 */
function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode) {
  const rule = "-".repeat(80);
  // Prefix non-negative values with "+", leave negatives as they are.
  const signed = (value, text = `${value}`) => (value >= 0 ? `+${text}` : text);
  const row = (cells) => `| ${cells.join(" | ")} |`;
  const average = (pick) => scores.reduce((sum, s) => sum + pick(s), 0) / scores.length;
  // Header cell padded to the 19-character feature column used by the rows.
  const featureHeader = "Feature Area".padEnd(19);

  console.log(`\n${"=".repeat(80)}`);
  console.log(" SANITY AI LITERACY SCORE REPORT");
  console.log("=".repeat(80));
  console.log();

  // Score table
  console.log(row([featureHeader, "Task", "Code", "Docs", "Total", "w/o Docs", "Doc Lift"]));
  console.log("|---------------------|------|------|------|-------|----------|----------|");
  for (const s of scores) {
    const status = s.totalScore < CRITICAL_THRESHOLD ? "!!" : "ok";
    console.log(row([
      // Status marker (2) + space + name (16) fills the 19-character column.
      `${status} ${s.feature.padEnd(16)}`,
      s.taskCompletion.toString().padStart(4),
      s.codeCorrectness.toString().padStart(4),
      s.docCoverage.toString().padStart(4),
      s.totalScore.toString().padStart(5),
      s.floorScore.toString().padStart(8),
      signed(s.docLift).padStart(8),
    ]));
  }
  console.log();

  // OKR status
  const belowCritical = scores.filter((s) => s.totalScore < CRITICAL_THRESHOLD);
  const lowestScore = scores.reduce((min, s) => (s.totalScore < min.totalScore ? s : min));
  const avgScore = average((s) => s.totalScore);
  const avgLift = average((s) => s.docLift);
  const avgCeilingScore = average((s) => s.ceilingScore);
  const avgFloorScore = average((s) => s.floorScore);
  const avgDocQualityGap = average((s) => s.docQualityGap);
  const negativeDocLiftScores = scores.filter((s) => s.negativeDocLift);
  const negativeDocLiftAreas = negativeDocLiftScores.map((s) => ({
    area: s.feature,
    docLift: s.docLift,
  }));
  console.log(rule);
  console.log("OKR STATUS");
  console.log(rule);
  console.log();
  if (belowCritical.length === 0) {
    // Report the threshold actually applied rather than a hard-coded number.
    console.log(` KR1: PASS -- All areas above critical threshold (>=${CRITICAL_THRESHOLD})`);
  } else {
    console.log(" KR1: FAIL -- Areas below critical threshold:");
    belowCritical.forEach((s) => console.log(` - ${s.feature}: ${s.totalScore}`));
  }
  console.log();
  console.log(` Lowest area: ${lowestScore.feature} (${lowestScore.totalScore})`);
  console.log(` Target: +15 points improvement`);
  console.log();
  console.log(` Avg score: ${avgScore.toFixed(1)}`);
  // Sign is derived from the value; a fixed "+" would render "+-3.0".
  console.log(` Avg doc lift: ${signed(avgLift, avgLift.toFixed(1))} points`);
  console.log(` (Doc lift = how much docs help vs parametric knowledge alone)`);
  console.log();

  // Ceiling decomposition
  console.log(rule);
  console.log("CEILING DECOMPOSITION");
  console.log(rule);
  console.log();
  console.log(row([featureHeader, "Floor", "Ceiling", "Doc Lift", "Quality Gap"]));
  console.log("|---------------------|-------|---------|----------|-------------|");
  for (const s of scores) {
    const liftFlag = s.negativeDocLift ? " 🚨" : "";
    console.log(row([
      s.feature.padEnd(19),
      s.floorScore.toString().padStart(5),
      s.ceilingScore.toString().padStart(7),
      `${signed(s.docLift).padStart(8)}${liftFlag}`,
      s.docQualityGap.toString().padStart(11),
    ]));
  }
  console.log();
  if (negativeDocLiftScores.length > 0) {
    console.log(" 🚨 NEGATIVE DOC LIFT DETECTED:");
    // Iterate the filtered rows directly; no per-row lookup by feature name.
    for (const s of negativeDocLiftScores) {
      console.log(` ${s.feature}: Doc Lift = ${s.docLift} (floor: ${s.floorScore}, ceiling: ${s.ceilingScore})`);
    }
    console.log(" Documentation is HURTING model performance for these areas.");
    console.log(" See docs/design-docs/scenario-matrix/evaluation-ceiling.md");
    console.log();
  } else {
    console.log(" ✅ No areas with negative Doc Lift detected.");
    console.log();
  }

  // Three-layer decomposition (only when actual scores are present)
  const hasActualScores = scores.some((s) => s.actualScore !== undefined);
  if (hasActualScores) {
    console.log(rule);
    console.log("THREE-LAYER DECOMPOSITION (floor → ceiling → actual)");
    console.log(rule);
    console.log();
    console.log(row([featureHeader, "Floor", "Ceiling", "Actual", "Doc Lift", "Ret. Gap", "Infra %"]));
    console.log("|---------------------|-------|---------|--------|----------|----------|---------|");
    for (const s of scores) {
      const actualStr = s.actualScore !== undefined ? s.actualScore.toString() : "—";
      const gapStr = s.retrievalGap !== undefined ? signed(s.retrievalGap) : "—";
      const infraStr = s.infrastructureEfficiency != null
        ? `${Math.round(s.infrastructureEfficiency * 100)}%`
        : "—";
      const flag = s.invertedRetrievalGap ? " 🔄" : "";
      console.log(row([
        s.feature.padEnd(19),
        s.floorScore.toString().padStart(5),
        s.ceilingScore.toString().padStart(7),
        actualStr.padStart(6),
        signed(s.docLift).padStart(8),
        (gapStr + flag).padStart(8),
        infraStr.padStart(7),
      ]));
    }
    console.log();
    console.log(" Doc Lift = ceiling − floor | Ret. Gap = ceiling − actual | Infra = actual / ceiling");
    console.log(" 🔄 = inverted retrieval gap (agents avoid bad docs → higher actual than ceiling)");
    console.log();
  }

  // Cost summary
  const totalCost = scores.reduce((sum, s) => sum + s.totalCost, 0);
  const totalTests = scores.reduce((sum, s) => sum + s.testCount, 0);
  const graderCostTotal = graderCost?.cost ?? 0;
  const combinedCost = totalCost + graderCostTotal;
  const hasCost = totalCost > 0 || graderCostTotal > 0;
  // Divide by 1 when there are no tests so the per-test figure stays finite.
  const costPerTest = combinedCost / (totalTests || 1);
  if (hasCost) {
    console.log(rule);
    console.log("COST SUMMARY");
    console.log(rule);
    console.log();
    console.log(` Provider cost: $${totalCost.toFixed(4)}`);
    if (graderCostTotal > 0) {
      const graderLabel = graderCost?.model ?? "unknown";
      console.log(` Grader cost: $${graderCostTotal.toFixed(4)} (${graderLabel}, ${(graderCost?.totalTokens ?? 0).toLocaleString()} tokens)`);
    }
    console.log(` Total cost: $${combinedCost.toFixed(4)}`);
    console.log(` Avg cost per test: $${costPerTest.toFixed(4)}`);
    console.log();
    console.log(row([featureHeader, "Tests", "Cost".padEnd(8), "Avg/Test"]));
    console.log("|---------------------|-------|----------|----------|");
    for (const s of scores) {
      const avgCost = s.testCount > 0 ? s.totalCost / s.testCount : 0;
      console.log(row([
        s.feature.padEnd(19),
        s.testCount.toString().padStart(5),
        `$${s.totalCost.toFixed(4).padStart(7)}`,
        `$${avgCost.toFixed(4).padStart(7)}`,
      ]));
    }
    console.log();
  }

  // Per-model breakdown
  if (perModel) {
    printPerModelReport(perModel);
  }
  // URL References
  printUrlReport(urlRefs);
  // Agent Behavior (only present when run with instrumented provider)
  const hasAgentBehavior = Boolean(agentBehavior && agentBehavior.length > 0);
  if (hasAgentBehavior) {
    printAgentBehaviorReport(agentBehavior);
  }

  // Source verification (unified report for all modes)
  if (sourceVerification || sourceIsolation) {
    console.log(rule);
    console.log("📋 SOURCE VERIFICATION");
    console.log(rule);
    if (sourceVerification) {
      console.log(` Source: ${sourceVerification.source}`);
      console.log(` Mode: ${sourceVerification.mode}`);
      if (sourceVerification.allowedOrigins) {
        console.log(` Sandbox: ${sourceVerification.allowedOrigins.join(", ")}`);
      }
      if (sourceVerification.searchMode) {
        console.log(` Search: ${sourceVerification.searchMode}`);
      }
      // URL fetch results (baseline mode with direct URLs)
      if (sourceVerification.urlFetch) {
        const uf = sourceVerification.urlFetch;
        console.log();
        console.log(` URL fetch: ${uf.totalFetched} fetched, ${uf.totalFailed} failed`);
        for (const f of uf.fetchedUrls) {
          console.log(` ✅ ${f.url} (via ${f.method})`);
        }
        for (const f of uf.failures) {
          // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means no error info
          console.log(` ⚠️ ${f.url}: ${f.error || "unknown error"}`);
        }
      }
    }
    // Agentic isolation score
    if (sourceIsolation) {
      const pct = Math.round(sourceIsolation.isolationScore * 100);
      const icon = sourceIsolation.offOrigin === 0 ? "✅" : "⚠️";
      console.log();
      console.log(` Agent isolation: ${icon} ${pct}% (${sourceIsolation.onOrigin}/${sourceIsolation.total} on-origin)`);
      if (sourceIsolation.offOrigin > 0) {
        console.log(` Off-origin fetches: ${sourceIsolation.offOrigin}`);
        // Only the first ten off-origin URLs are listed.
        for (const url of sourceIsolation.offOriginUrls.slice(0, 10)) {
          console.log(` • ${url}`);
        }
      }
      const originCounts = Object.entries(sourceIsolation.originBreakdown);
      if (originCounts.length > 0) {
        console.log(" Origin breakdown:");
        for (const [origin, count] of originCounts.sort((a, b) => b[1] - a[1])) {
          console.log(` ${origin}: ${count}`);
        }
      }
    }
    console.log();
  }

  // Build overall agent behavior stats for summary
  const behaviorMean = (pick) => agentBehavior.reduce((sum, ab) => sum + pick(ab), 0) / agentBehavior.length;
  const overallAgentBehavior = hasAgentBehavior
    ? {
        avgDocPagesVisited: behaviorMean((ab) => ab.avgDocPagesVisited),
        avgNetworkTimeMs: behaviorMean((ab) => ab.avgNetworkTimeMs),
        avgSearchesPerformed: behaviorMean((ab) => ab.avgSearchesPerformed),
        testsWithBehaviorData: agentBehavior.reduce((sum, ab) => sum + ab.tasksWithBehaviorData, 0),
        totalUniqueDocSlugs: new Set(agentBehavior.flatMap((ab) => ab.docSlugsVisited)).size,
        totalUniqueSearchQueries: new Set(agentBehavior.flatMap((ab) => ab.searchQueries)).size,
      }
    : undefined;

  // Compute aggregate metrics from actual scores (when agentic data present).
  // Each average covers only the rows that carry the field; undefined if none.
  const meanOfPresent = (isPresent, pick) => {
    const present = scores.filter(isPresent);
    return present.length > 0
      ? present.reduce((sum, s) => sum + (pick(s) ?? 0), 0) / present.length
      : undefined;
  };
  const avgActualScore = meanOfPresent((s) => s.actualScore !== undefined, (s) => s.actualScore);
  const avgRetrievalGap = meanOfPresent((s) => s.retrievalGap !== undefined, (s) => s.retrievalGap);
  const avgInfrastructureEfficiency = meanOfPresent((s) => s.infrastructureEfficiency != null, (s) => s.infrastructureEfficiency);

  return {
    agentBehavior: agentBehavior ?? undefined,
    belowCritical: belowCritical.map((s) => s.feature),
    ...(evaluationMode && { evaluationMode }),
    lowestArea: lowestScore.feature,
    lowestScore: lowestScore.totalScore,
    ...(negativeDocLiftAreas.length > 0 && { negativeDocLiftAreas }),
    overall: {
      agentBehavior: overallAgentBehavior,
      ...(avgActualScore !== undefined && { avgActualScore }),
      avgCeilingScore,
      avgDocLift: avgLift,
      avgDocQualityGap,
      avgFloorScore,
      ...(avgInfrastructureEfficiency !== undefined && {
        avgInfrastructureEfficiency,
      }),
      ...(avgRetrievalGap !== undefined && { avgRetrievalGap }),
      avgScore,
      cost: hasCost
        ? {
            graderModel: graderCost?.model,
            graderTotal: graderCostTotal,
            perTest: costPerTest,
            total: combinedCost,
            totalTokens: graderCost?.totalTokens ?? 0,
          }
        : undefined,
      negativeDocLiftCount: negativeDocLiftAreas.length,
    },
    scores,
    source: source
      ? {
          baseUrl: source.baseUrl,
          dataset: source.dataset,
          name: source.name ?? "default",
          perspective: source.perspective,
          projectId: source.projectId,
        }
      : undefined,
    ...(perModel && { perModel }),
    ...(sourceIsolation && { sourceIsolation }),
    ...(sourceVerification && { sourceVerification }),
    timestamp: new Date().toISOString(),
    urlReferences: urlRefs,
  };
}
1101
/**
 * Prints the "URL REFERENCES" section of the report to stdout.
 *
 * For every feature area, the gold URLs are listed first and the baseline
 * URLs second (each baseline line is tagged "[parametric]"). Within a group,
 * URLs are ordered by reference count, highest first, and a count above one
 * is appended as "(N tests)". An area with no URLs in either group gets a
 * single "no URLs referenced" line.
 *
 * @param {Array<object>} urlRefs - Entries read for `feature`, `gold.urls`
 *   and `baseline.urls`, where each `urls` object maps a URL to a count.
 * @returns {void}
 */
function printUrlReport(urlRefs) {
  const divider = "-".repeat(80);
  // Turn a URL -> count object into [url, count] pairs, largest count first.
  const rankByCount = (urlCounts) =>
    Object.entries(urlCounts).sort(([, left], [, right]) => right - left);
  // Print one group (heading plus its URL lines); prints nothing if empty.
  const emitGroup = (heading, pairs, trailer) => {
    if (pairs.length === 0) {
      return;
    }
    console.log(heading);
    pairs.forEach(([url, count]) => {
      const tests = count > 1 ? ` (${count} tests)` : "";
      console.log(` ${url}${tests}${trailer}`);
    });
  };

  console.log(divider);
  console.log("URL REFERENCES");
  console.log(divider);
  console.log();

  for (const ref of urlRefs) {
    const goldPairs = rankByCount(ref.gold.urls);
    const baselinePairs = rankByCount(ref.baseline.urls);
    emitGroup(` ${ref.feature} (gold):`, goldPairs, "");
    emitGroup(` ${ref.feature} (baseline):`, baselinePairs, " [parametric]");
    if (goldPairs.length + baselinePairs.length === 0) {
      console.log(` ${ref.feature}: no URLs referenced`);
    }
    console.log();
  }
}