@sanity/ailf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (530)
  1. package/README.md +89 -0
  2. package/bin/ailf.js +64 -0
  3. package/canonical/grader-references/README.md +88 -0
  4. package/canonical/grader-references/groq.yaml +234 -0
  5. package/canonical/grader-references/studio-setup.yaml +275 -0
  6. package/canonical/reference-solutions/.gitkeep +1 -0
  7. package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
  8. package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
  9. package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
  10. package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
  11. package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
  12. package/canonical/reference-solutions/groq/joins-references.ts +300 -0
  13. package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
  14. package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
  15. package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
  16. package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
  17. package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
  18. package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
  19. package/config/bigquery/README.md +74 -0
  20. package/config/bigquery/views/area_scores.sql +87 -0
  21. package/config/bigquery/views/reports.sql +49 -0
  22. package/config/features.yaml +116 -0
  23. package/config/models.yaml +115 -0
  24. package/config/prompts.yaml +75 -0
  25. package/config/rubrics.yaml +62 -0
  26. package/config/schedules.yaml +43 -0
  27. package/config/sinks.yaml +54 -0
  28. package/config/sources.yaml +51 -0
  29. package/config/thresholds.yaml +49 -0
  30. package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
  31. package/dist/_vendor/ailf-core/examples/index.js +285 -0
  32. package/dist/_vendor/ailf-core/index.d.ts +17 -0
  33. package/dist/_vendor/ailf-core/index.js +17 -0
  34. package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
  35. package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
  36. package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
  37. package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
  38. package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
  39. package/dist/_vendor/ailf-core/ports/context.js +14 -0
  40. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
  41. package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
  42. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
  43. package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
  44. package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
  45. package/dist/_vendor/ailf-core/ports/index.js +7 -0
  46. package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
  47. package/dist/_vendor/ailf-core/ports/logger.js +11 -0
  48. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
  49. package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
  50. package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
  51. package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
  52. package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
  53. package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
  54. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
  55. package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
  56. package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
  57. package/dist/_vendor/ailf-core/schemas/index.js +16 -0
  58. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
  59. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
  60. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
  61. package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
  62. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
  63. package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
  64. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
  65. package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
  66. package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
  67. package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
  68. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
  69. package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
  70. package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
  71. package/dist/_vendor/ailf-core/services/index.js +12 -0
  72. package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
  73. package/dist/_vendor/ailf-core/services/scoring.js +222 -0
  74. package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
  75. package/dist/_vendor/ailf-core/types/index.js +21 -0
  76. package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
  77. package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
  78. package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
  79. package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
  80. package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
  81. package/dist/_vendor/ailf-shared/document-ref.js +1 -0
  82. package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
  83. package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
  84. package/dist/_vendor/ailf-shared/index.d.ts +16 -0
  85. package/dist/_vendor/ailf-shared/index.js +16 -0
  86. package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
  87. package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
  88. package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
  89. package/dist/_vendor/ailf-shared/score-grades.js +23 -0
  90. package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
  91. package/dist/adapters/cache/content-lake-cache.js +59 -0
  92. package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
  93. package/dist/adapters/cache/filesystem-cache.js +54 -0
  94. package/dist/adapters/cache/index.d.ts +2 -0
  95. package/dist/adapters/cache/index.js +2 -0
  96. package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
  97. package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
  98. package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
  99. package/dist/adapters/config-sources/file-config-adapter.js +96 -0
  100. package/dist/adapters/config-sources/index.d.ts +2 -0
  101. package/dist/adapters/config-sources/index.js +2 -0
  102. package/dist/adapters/doc-fetchers/index.d.ts +1 -0
  103. package/dist/adapters/doc-fetchers/index.js +1 -0
  104. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
  105. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
  106. package/dist/adapters/eval-runners/index.d.ts +1 -0
  107. package/dist/adapters/eval-runners/index.js +1 -0
  108. package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
  109. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
  110. package/dist/adapters/index.d.ts +12 -0
  111. package/dist/adapters/index.js +12 -0
  112. package/dist/adapters/loggers/console-logger.d.ts +22 -0
  113. package/dist/adapters/loggers/console-logger.js +54 -0
  114. package/dist/adapters/loggers/index.d.ts +9 -0
  115. package/dist/adapters/loggers/index.js +9 -0
  116. package/dist/adapters/loggers/json-logger.d.ts +18 -0
  117. package/dist/adapters/loggers/json-logger.js +33 -0
  118. package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
  119. package/dist/adapters/loggers/quiet-logger.js +30 -0
  120. package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
  121. package/dist/adapters/task-sources/composite-task-source.js +59 -0
  122. package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
  123. package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
  124. package/dist/adapters/task-sources/index.d.ts +7 -0
  125. package/dist/adapters/task-sources/index.js +7 -0
  126. package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
  127. package/dist/adapters/task-sources/repo-schemas.js +234 -0
  128. package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
  129. package/dist/adapters/task-sources/repo-task-source.js +104 -0
  130. package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
  131. package/dist/adapters/task-sources/repo-trigger.js +153 -0
  132. package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
  133. package/dist/adapters/task-sources/repo-validation.js +164 -0
  134. package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
  135. package/dist/adapters/task-sources/yaml-task-source.js +136 -0
  136. package/dist/agent-observer/agentic-provider.d.ts +132 -0
  137. package/dist/agent-observer/agentic-provider.js +983 -0
  138. package/dist/agent-observer/classifier.d.ts +62 -0
  139. package/dist/agent-observer/classifier.js +269 -0
  140. package/dist/agent-observer/index.d.ts +7 -0
  141. package/dist/agent-observer/index.js +4 -0
  142. package/dist/agent-observer/pricing.d.ts +35 -0
  143. package/dist/agent-observer/pricing.js +82 -0
  144. package/dist/agent-observer/provider.d.ts +77 -0
  145. package/dist/agent-observer/provider.js +151 -0
  146. package/dist/agent-observer/proxy.d.ts +91 -0
  147. package/dist/agent-observer/proxy.js +321 -0
  148. package/dist/agent-observer/test-imports.d.ts +7 -0
  149. package/dist/agent-observer/test-imports.js +185 -0
  150. package/dist/agent-observer/types.d.ts +137 -0
  151. package/dist/agent-observer/types.js +16 -0
  152. package/dist/assertions/source-isolation.d.ts +72 -0
  153. package/dist/assertions/source-isolation.js +117 -0
  154. package/dist/cli.d.ts +24 -0
  155. package/dist/cli.js +199 -0
  156. package/dist/commands/agent-report.d.ts +5 -0
  157. package/dist/commands/agent-report.js +69 -0
  158. package/dist/commands/baseline.d.ts +9 -0
  159. package/dist/commands/baseline.js +141 -0
  160. package/dist/commands/cache.d.ts +13 -0
  161. package/dist/commands/cache.js +135 -0
  162. package/dist/commands/calculate-scores.d.ts +8 -0
  163. package/dist/commands/calculate-scores.js +48 -0
  164. package/dist/commands/compare.d.ts +8 -0
  165. package/dist/commands/compare.js +120 -0
  166. package/dist/commands/completion.d.ts +18 -0
  167. package/dist/commands/completion.js +260 -0
  168. package/dist/commands/coverage-audit.d.ts +7 -0
  169. package/dist/commands/coverage-audit.js +40 -0
  170. package/dist/commands/discovery-report.d.ts +10 -0
  171. package/dist/commands/discovery-report.js +44 -0
  172. package/dist/commands/eval.d.ts +9 -0
  173. package/dist/commands/eval.js +35 -0
  174. package/dist/commands/explain-handler.d.ts +34 -0
  175. package/dist/commands/explain-handler.js +719 -0
  176. package/dist/commands/fetch-docs.d.ts +8 -0
  177. package/dist/commands/fetch-docs.js +128 -0
  178. package/dist/commands/generate-configs.d.ts +8 -0
  179. package/dist/commands/generate-configs.js +46 -0
  180. package/dist/commands/grader/index.d.ts +11 -0
  181. package/dist/commands/grader/index.js +118 -0
  182. package/dist/commands/init.d.ts +19 -0
  183. package/dist/commands/init.js +150 -0
  184. package/dist/commands/interactive.d.ts +12 -0
  185. package/dist/commands/interactive.js +238 -0
  186. package/dist/commands/lookup-doc.d.ts +15 -0
  187. package/dist/commands/lookup-doc.js +84 -0
  188. package/dist/commands/measure-retrieval.d.ts +5 -0
  189. package/dist/commands/measure-retrieval.js +65 -0
  190. package/dist/commands/pipeline-action.d.ts +71 -0
  191. package/dist/commands/pipeline-action.js +305 -0
  192. package/dist/commands/pipeline.d.ts +62 -0
  193. package/dist/commands/pipeline.js +53 -0
  194. package/dist/commands/pr-comment.d.ts +8 -0
  195. package/dist/commands/pr-comment.js +47 -0
  196. package/dist/commands/publish.d.ts +26 -0
  197. package/dist/commands/publish.js +253 -0
  198. package/dist/commands/readiness-report.d.ts +10 -0
  199. package/dist/commands/readiness-report.js +104 -0
  200. package/dist/commands/shared/options.d.ts +29 -0
  201. package/dist/commands/shared/options.js +57 -0
  202. package/dist/commands/update-quality-scores.d.ts +5 -0
  203. package/dist/commands/update-quality-scores.js +20 -0
  204. package/dist/commands/validate-tasks.d.ts +16 -0
  205. package/dist/commands/validate-tasks.js +93 -0
  206. package/dist/commands/validate.d.ts +9 -0
  207. package/dist/commands/validate.js +73 -0
  208. package/dist/commands/webhook-server.d.ts +5 -0
  209. package/dist/commands/webhook-server.js +30 -0
  210. package/dist/commands/weekly-digest.d.ts +10 -0
  211. package/dist/commands/weekly-digest.js +104 -0
  212. package/dist/composition-root.d.ts +26 -0
  213. package/dist/composition-root.js +107 -0
  214. package/dist/interpolate.d.ts +26 -0
  215. package/dist/interpolate.js +70 -0
  216. package/dist/job-store.d.ts +104 -0
  217. package/dist/job-store.js +188 -0
  218. package/dist/lib/agent-behavior-report.d.ts +8 -0
  219. package/dist/lib/agent-behavior-report.js +185 -0
  220. package/dist/lib/baseline.d.ts +19 -0
  221. package/dist/lib/baseline.js +153 -0
  222. package/dist/lib/calculate-scores.d.ts +23 -0
  223. package/dist/lib/calculate-scores.js +42 -0
  224. package/dist/lib/compare.d.ts +18 -0
  225. package/dist/lib/compare.js +170 -0
  226. package/dist/lib/coverage-audit.d.ts +4 -0
  227. package/dist/lib/coverage-audit.js +42 -0
  228. package/dist/lib/discovery-report.d.ts +13 -0
  229. package/dist/lib/discovery-report.js +57 -0
  230. package/dist/lib/fetch-docs.d.ts +30 -0
  231. package/dist/lib/fetch-docs.js +171 -0
  232. package/dist/lib/generate-configs.d.ts +25 -0
  233. package/dist/lib/generate-configs.js +42 -0
  234. package/dist/lib/grader-api.d.ts +21 -0
  235. package/dist/lib/grader-api.js +34 -0
  236. package/dist/lib/grader-compare.d.ts +19 -0
  237. package/dist/lib/grader-compare.js +91 -0
  238. package/dist/lib/grader-consistency.d.ts +27 -0
  239. package/dist/lib/grader-consistency.js +79 -0
  240. package/dist/lib/grader-sensitivity.d.ts +19 -0
  241. package/dist/lib/grader-sensitivity.js +75 -0
  242. package/dist/lib/grader-validate.d.ts +19 -0
  243. package/dist/lib/grader-validate.js +78 -0
  244. package/dist/lib/measure-retrieval.d.ts +14 -0
  245. package/dist/lib/measure-retrieval.js +71 -0
  246. package/dist/lib/pr-comment.d.ts +16 -0
  247. package/dist/lib/pr-comment.js +28 -0
  248. package/dist/lib/readiness-report.d.ts +13 -0
  249. package/dist/lib/readiness-report.js +108 -0
  250. package/dist/lib/webhook-server.d.ts +11 -0
  251. package/dist/lib/webhook-server.js +24 -0
  252. package/dist/lib/weekly-digest.d.ts +24 -0
  253. package/dist/lib/weekly-digest.js +148 -0
  254. package/dist/orchestration/build-app-context.d.ts +27 -0
  255. package/dist/orchestration/build-app-context.js +81 -0
  256. package/dist/orchestration/build-step-sequence.d.ts +15 -0
  257. package/dist/orchestration/build-step-sequence.js +84 -0
  258. package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
  259. package/dist/orchestration/config-to-source-overrides.js +28 -0
  260. package/dist/orchestration/env-bridge.d.ts +21 -0
  261. package/dist/orchestration/env-bridge.js +66 -0
  262. package/dist/orchestration/index.d.ts +11 -0
  263. package/dist/orchestration/index.js +11 -0
  264. package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
  265. package/dist/orchestration/pipeline-orchestrator.js +153 -0
  266. package/dist/orchestration/step-runner.d.ts +20 -0
  267. package/dist/orchestration/step-runner.js +88 -0
  268. package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
  269. package/dist/orchestration/steps/calculate-scores-step.js +95 -0
  270. package/dist/orchestration/steps/callback-step.d.ts +24 -0
  271. package/dist/orchestration/steps/callback-step.js +76 -0
  272. package/dist/orchestration/steps/compare-step.d.ts +14 -0
  273. package/dist/orchestration/steps/compare-step.js +92 -0
  274. package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
  275. package/dist/orchestration/steps/discovery-report-step.js +55 -0
  276. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  277. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  278. package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
  279. package/dist/orchestration/steps/fetch-docs-step.js +135 -0
  280. package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
  281. package/dist/orchestration/steps/gap-analysis-step.js +136 -0
  282. package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
  283. package/dist/orchestration/steps/generate-configs-step.js +85 -0
  284. package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
  285. package/dist/orchestration/steps/grader-consistency-step.js +64 -0
  286. package/dist/orchestration/steps/index.d.ts +19 -0
  287. package/dist/orchestration/steps/index.js +19 -0
  288. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
  289. package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
  290. package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
  291. package/dist/orchestration/steps/publish-report-step.js +216 -0
  292. package/dist/orchestration/steps/readiness-step.d.ts +13 -0
  293. package/dist/orchestration/steps/readiness-step.js +91 -0
  294. package/dist/orchestration/steps/report-step.d.ts +12 -0
  295. package/dist/orchestration/steps/report-step.js +49 -0
  296. package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
  297. package/dist/orchestration/steps/run-eval-step.js +195 -0
  298. package/dist/orchestration/steps/validate-step.d.ts +12 -0
  299. package/dist/orchestration/steps/validate-step.js +41 -0
  300. package/dist/pipeline/agent-behavior-report.d.ts +53 -0
  301. package/dist/pipeline/agent-behavior-report.js +132 -0
  302. package/dist/pipeline/attribution.d.ts +47 -0
  303. package/dist/pipeline/attribution.js +226 -0
  304. package/dist/pipeline/baseline.d.ts +37 -0
  305. package/dist/pipeline/baseline.js +141 -0
  306. package/dist/pipeline/cache.d.ts +101 -0
  307. package/dist/pipeline/cache.js +283 -0
  308. package/dist/pipeline/calculate-scores.d.ts +102 -0
  309. package/dist/pipeline/calculate-scores.js +1128 -0
  310. package/dist/pipeline/callback-delivery.d.ts +50 -0
  311. package/dist/pipeline/callback-delivery.js +89 -0
  312. package/dist/pipeline/checks.d.ts +39 -0
  313. package/dist/pipeline/checks.js +280 -0
  314. package/dist/pipeline/classify-url.d.ts +61 -0
  315. package/dist/pipeline/classify-url.js +93 -0
  316. package/dist/pipeline/compare.d.ts +31 -0
  317. package/dist/pipeline/compare.js +208 -0
  318. package/dist/pipeline/coverage-audit.d.ts +39 -0
  319. package/dist/pipeline/coverage-audit.js +165 -0
  320. package/dist/pipeline/degradations.d.ts +85 -0
  321. package/dist/pipeline/degradations.js +242 -0
  322. package/dist/pipeline/discovery-report.d.ts +55 -0
  323. package/dist/pipeline/discovery-report.js +178 -0
  324. package/dist/pipeline/eval-constants.d.ts +68 -0
  325. package/dist/pipeline/eval-constants.js +111 -0
  326. package/dist/pipeline/eval-fingerprint.d.ts +66 -0
  327. package/dist/pipeline/eval-fingerprint.js +175 -0
  328. package/dist/pipeline/expand-tasks.d.ts +220 -0
  329. package/dist/pipeline/expand-tasks.js +421 -0
  330. package/dist/pipeline/failure-modes.d.ts +46 -0
  331. package/dist/pipeline/failure-modes.js +348 -0
  332. package/dist/pipeline/fetch-url-content.d.ts +44 -0
  333. package/dist/pipeline/fetch-url-content.js +93 -0
  334. package/dist/pipeline/gap-analysis.d.ts +48 -0
  335. package/dist/pipeline/gap-analysis.js +231 -0
  336. package/dist/pipeline/generate-configs.d.ts +72 -0
  337. package/dist/pipeline/generate-configs.js +395 -0
  338. package/dist/pipeline/grader-api.d.ts +49 -0
  339. package/dist/pipeline/grader-api.js +200 -0
  340. package/dist/pipeline/grader-compare-runner.d.ts +44 -0
  341. package/dist/pipeline/grader-compare-runner.js +301 -0
  342. package/dist/pipeline/grader-comparison.d.ts +111 -0
  343. package/dist/pipeline/grader-comparison.js +161 -0
  344. package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
  345. package/dist/pipeline/grader-consistency-runner.js +270 -0
  346. package/dist/pipeline/grader-consistency.d.ts +103 -0
  347. package/dist/pipeline/grader-consistency.js +146 -0
  348. package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
  349. package/dist/pipeline/grader-sensitivity-runner.js +282 -0
  350. package/dist/pipeline/grader-sensitivity.d.ts +94 -0
  351. package/dist/pipeline/grader-sensitivity.js +144 -0
  352. package/dist/pipeline/grader-validate-runner.d.ts +38 -0
  353. package/dist/pipeline/grader-validate-runner.js +229 -0
  354. package/dist/pipeline/grader-validation.d.ts +107 -0
  355. package/dist/pipeline/grader-validation.js +169 -0
  356. package/dist/pipeline/map-request-to-config.d.ts +19 -0
  357. package/dist/pipeline/map-request-to-config.js +80 -0
  358. package/dist/pipeline/measure-retrieval.d.ts +59 -0
  359. package/dist/pipeline/measure-retrieval.js +111 -0
  360. package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
  361. package/dist/pipeline/mirror-repo-tasks.js +350 -0
  362. package/dist/pipeline/plan-format.d.ts +33 -0
  363. package/dist/pipeline/plan-format.js +202 -0
  364. package/dist/pipeline/plan.d.ts +169 -0
  365. package/dist/pipeline/plan.js +708 -0
  366. package/dist/pipeline/pr-comment.d.ts +19 -0
  367. package/dist/pipeline/pr-comment.js +502 -0
  368. package/dist/pipeline/probe.d.ts +52 -0
  369. package/dist/pipeline/probe.js +390 -0
  370. package/dist/pipeline/provenance.d.ts +47 -0
  371. package/dist/pipeline/provenance.js +146 -0
  372. package/dist/pipeline/readiness-report.d.ts +87 -0
  373. package/dist/pipeline/readiness-report.js +205 -0
  374. package/dist/pipeline/release-classification.d.ts +54 -0
  375. package/dist/pipeline/release-classification.js +238 -0
  376. package/dist/pipeline/release-report.d.ts +37 -0
  377. package/dist/pipeline/release-report.js +222 -0
  378. package/dist/pipeline/repo-eval-comment.d.ts +37 -0
  379. package/dist/pipeline/repo-eval-comment.js +165 -0
  380. package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
  381. package/dist/pipeline/repo-threshold-evaluator.js +162 -0
  382. package/dist/pipeline/resolve-mappings.d.ts +35 -0
  383. package/dist/pipeline/resolve-mappings.js +72 -0
  384. package/dist/pipeline/retrieval-metrics.d.ts +39 -0
  385. package/dist/pipeline/retrieval-metrics.js +136 -0
  386. package/dist/pipeline/reverse-mapping.d.ts +67 -0
  387. package/dist/pipeline/reverse-mapping.js +88 -0
  388. package/dist/pipeline/schemas.d.ts +9 -0
  389. package/dist/pipeline/schemas.js +9 -0
  390. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  391. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  392. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  393. package/dist/pipeline/steps/compare-step.js +90 -0
  394. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  395. package/dist/pipeline/steps/eval-step.js +347 -0
  396. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  397. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  398. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  399. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  400. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  401. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  402. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  403. package/dist/pipeline/steps/publish-report-step.js +243 -0
  404. package/dist/pipeline/steps/report-step.d.ts +13 -0
  405. package/dist/pipeline/steps/report-step.js +56 -0
  406. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  407. package/dist/pipeline/steps/update-scores-step.js +42 -0
  408. package/dist/pipeline/targeted-loo.d.ts +88 -0
  409. package/dist/pipeline/targeted-loo.js +203 -0
  410. package/dist/pipeline/thresholds.d.ts +27 -0
  411. package/dist/pipeline/thresholds.js +245 -0
  412. package/dist/pipeline/types.d.ts +10 -0
  413. package/dist/pipeline/types.js +10 -0
  414. package/dist/pipeline/validate.d.ts +67 -0
  415. package/dist/pipeline/validate.js +406 -0
  416. package/dist/pipeline/webhook-server.d.ts +37 -0
  417. package/dist/pipeline/webhook-server.js +133 -0
  418. package/dist/report-store.d.ts +84 -0
  419. package/dist/report-store.js +208 -0
  420. package/dist/sanity/client.d.ts +38 -0
  421. package/dist/sanity/client.js +86 -0
  422. package/dist/sanity/portable-text.d.ts +11 -0
  423. package/dist/sanity/portable-text.js +211 -0
  424. package/dist/sanity/queries.d.ts +133 -0
  425. package/dist/sanity/queries.js +300 -0
  426. package/dist/schedules/digest.d.ts +116 -0
  427. package/dist/schedules/digest.js +156 -0
  428. package/dist/schedules/index.d.ts +12 -0
  429. package/dist/schedules/index.js +10 -0
  430. package/dist/schedules/loader.d.ts +31 -0
  431. package/dist/schedules/loader.js +73 -0
  432. package/dist/schedules/schema.d.ts +9 -0
  433. package/dist/schedules/schema.js +9 -0
  434. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  435. package/dist/scripts/agent-behavior-report.js +315 -0
  436. package/dist/scripts/baseline.d.ts +43 -0
  437. package/dist/scripts/baseline.js +267 -0
  438. package/dist/scripts/calculate-scores.d.ts +166 -0
  439. package/dist/scripts/calculate-scores.js +1296 -0
  440. package/dist/scripts/compare.d.ts +22 -0
  441. package/dist/scripts/compare.js +334 -0
  442. package/dist/scripts/coverage-audit.d.ts +44 -0
  443. package/dist/scripts/coverage-audit.js +209 -0
  444. package/dist/scripts/debug-eval.d.ts +19 -0
  445. package/dist/scripts/debug-eval.js +73 -0
  446. package/dist/scripts/discovery-report.d.ts +58 -0
  447. package/dist/scripts/discovery-report.js +250 -0
  448. package/dist/scripts/fetch-docs.d.ts +35 -0
  449. package/dist/scripts/fetch-docs.js +472 -0
  450. package/dist/scripts/generate-configs.d.ts +66 -0
  451. package/dist/scripts/generate-configs.js +459 -0
  452. package/dist/scripts/grader-api.d.ts +27 -0
  453. package/dist/scripts/grader-api.js +206 -0
  454. package/dist/scripts/grader-compare.d.ts +22 -0
  455. package/dist/scripts/grader-compare.js +368 -0
  456. package/dist/scripts/grader-consistency.d.ts +20 -0
  457. package/dist/scripts/grader-consistency.js +313 -0
  458. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  459. package/dist/scripts/grader-sensitivity.js +354 -0
  460. package/dist/scripts/grader-validate.d.ts +19 -0
  461. package/dist/scripts/grader-validate.js +267 -0
  462. package/dist/scripts/measure-retrieval.d.ts +10 -0
  463. package/dist/scripts/measure-retrieval.js +145 -0
  464. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
  465. package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
  466. package/dist/scripts/pipeline.d.ts +76 -0
  467. package/dist/scripts/pipeline.js +1031 -0
  468. package/dist/scripts/pr-comment.d.ts +10 -0
  469. package/dist/scripts/pr-comment.js +510 -0
  470. package/dist/scripts/readiness-report.d.ts +88 -0
  471. package/dist/scripts/readiness-report.js +342 -0
  472. package/dist/scripts/update-quality-scores.d.ts +15 -0
  473. package/dist/scripts/update-quality-scores.js +184 -0
  474. package/dist/scripts/validate-task-sources.d.ts +21 -0
  475. package/dist/scripts/validate-task-sources.js +210 -0
  476. package/dist/scripts/validate.d.ts +13 -0
  477. package/dist/scripts/validate.js +79 -0
  478. package/dist/scripts/webhook-server.d.ts +26 -0
  479. package/dist/scripts/webhook-server.js +147 -0
  480. package/dist/scripts/weekly-digest.d.ts +24 -0
  481. package/dist/scripts/weekly-digest.js +144 -0
  482. package/dist/sinks/bigquery/index.d.ts +131 -0
  483. package/dist/sinks/bigquery/index.js +222 -0
  484. package/dist/sinks/format-slack.d.ts +64 -0
  485. package/dist/sinks/format-slack.js +306 -0
  486. package/dist/sinks/index.d.ts +23 -0
  487. package/dist/sinks/index.js +18 -0
  488. package/dist/sinks/loader.d.ts +18 -0
  489. package/dist/sinks/loader.js +82 -0
  490. package/dist/sinks/retry.d.ts +24 -0
  491. package/dist/sinks/retry.js +52 -0
  492. package/dist/sinks/schema.d.ts +9 -0
  493. package/dist/sinks/schema.js +9 -0
  494. package/dist/sinks/slack/format.d.ts +65 -0
  495. package/dist/sinks/slack/format.js +327 -0
  496. package/dist/sinks/slack/index.d.ts +27 -0
  497. package/dist/sinks/slack/index.js +78 -0
  498. package/dist/sinks/slack-sink.d.ts +27 -0
  499. package/dist/sinks/slack-sink.js +78 -0
  500. package/dist/sinks/types.d.ts +59 -0
  501. package/dist/sinks/types.js +44 -0
  502. package/dist/sinks/webhook/index.d.ts +19 -0
  503. package/dist/sinks/webhook/index.js +50 -0
  504. package/dist/sinks/webhook-sink.d.ts +19 -0
  505. package/dist/sinks/webhook-sink.js +50 -0
  506. package/dist/sources.d.ts +104 -0
  507. package/dist/sources.js +292 -0
  508. package/dist/webhook/budget.d.ts +42 -0
  509. package/dist/webhook/budget.js +60 -0
  510. package/dist/webhook/debounce.d.ts +67 -0
  511. package/dist/webhook/debounce.js +76 -0
  512. package/dist/webhook/dispatch.d.ts +45 -0
  513. package/dist/webhook/dispatch.js +84 -0
  514. package/dist/webhook/eval-request-handler.d.ts +87 -0
  515. package/dist/webhook/eval-request-handler.js +181 -0
  516. package/dist/webhook/handler.d.ts +88 -0
  517. package/dist/webhook/handler.js +203 -0
  518. package/dist/webhook/index.d.ts +17 -0
  519. package/dist/webhook/index.js +12 -0
  520. package/dist/webhook/types.d.ts +109 -0
  521. package/dist/webhook/types.js +10 -0
  522. package/package.json +72 -0
  523. package/tasks/.expanded.agentic.yaml +51 -0
  524. package/tasks/.expanded.yaml +66 -0
  525. package/tasks/frameworks.yaml +98 -0
  526. package/tasks/functions.yaml +51 -0
  527. package/tasks/groq.yaml +216 -0
  528. package/tasks/nextjs-live.yaml +62 -0
  529. package/tasks/studio-setup.yaml +111 -0
  530. package/tasks/visual-editing.yaml +120 -0
@@ -0,0 +1,395 @@
1
+ /**
2
+ * pipeline/generate-configs.ts
3
+ *
4
+ * Reads config/models.yaml (the central model registry) and generates all
5
+ * promptfoo config files with the correct provider entries.
6
+ *
7
+ * This keeps model definitions in one place — add a model to config/models.yaml
8
+ * and run `pnpm generate-configs` to propagate it to all eval modes.
9
+ *
10
+ * Generated configs:
11
+ * - promptfooconfig.yaml (baseline: with-docs vs without-docs)
12
+ * - promptfooconfig.observed.yaml (instrumented HTTP recording)
13
+ * - promptfooconfig.agentic.yaml (agentic tool-calling: naive vs optimized)
14
+ *
15
+ * All functions accept rootDir as a parameter — no module-level constants.
16
+ * No process.argv parsing. No env var fallbacks. Callers provide typed options.
17
+ *
18
+ * @see config/models.yaml — the central model registry
19
+ * @see docs/exec-plans/active/eliminate-lib-layer.md
20
+ */
21
+ import { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "../_vendor/ailf-core/index.js";
22
+ import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
23
+ import { resolve } from "path";
24
+ import { dump, load } from "js-yaml";
25
+ import { expandTaskDefinitions, loadAndExpandTasks } from "./expand-tasks.js";
26
+ import { validateModelsYaml } from "./validate.js";
27
+ import { loadSource } from "../sources.js";
28
+ // Re-export pure functions from core for backward compatibility.
29
+ // Tests and other modules that previously imported from lib/generate-configs
30
+ // can import from pipeline/generate-configs instead.
31
+ export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "../_vendor/ailf-core/index.js";
32
+ // ---------------------------------------------------------------------------
33
+ // Helpers
34
+ // ---------------------------------------------------------------------------
35
/**
 * List the task definition files found directly under `<rootDir>/tasks`.
 *
 * Only `.yaml` / `.yml` files whose name does not start with a dot are
 * returned (dot-prefixed files such as `.expanded.yaml` are skipped). The
 * result is sorted by file name and each entry is rendered as a
 * `file://tasks/<name>` reference.
 *
 * @param rootDir - Directory that contains the `tasks/` folder.
 * @returns Sorted `file://tasks/...` references; empty when `tasks/` is absent.
 */
export function discoverTaskFiles(rootDir) {
    const tasksDir = resolve(rootDir, "tasks");
    // A missing tasks/ directory is not an error — tasks may come from Content Lake instead.
    if (!existsSync(tasksDir)) {
        return [];
    }
    const yamlNames = [];
    for (const name of readdirSync(tasksDir)) {
        if (name.startsWith(".")) {
            continue;
        }
        if (name.endsWith(".yaml") || name.endsWith(".yml")) {
            yamlNames.push(name);
        }
    }
    yamlNames.sort();
    return yamlNames.map((name) => `file://tasks/${name}`);
}
46
+ /** Load prompt templates from config/prompts.yaml. Throws if missing or malformed. */
47
+ export function loadPrompts(rootDir) {
48
+ const promptsPath = resolve(rootDir, "config", "prompts.yaml");
49
+ if (!existsSync(promptsPath)) {
50
+ throw new Error(`config/prompts.yaml not found at ${promptsPath}. This file is required — it defines the prompt templates for all evaluation modes.`);
51
+ }
52
+ const raw = readFileSync(promptsPath, "utf-8");
53
+ const data = load(raw);
54
+ const toPrompt = (entry) => ({
55
+ id: entry.id,
56
+ label: entry.label,
57
+ raw: entry.template,
58
+ });
59
+ if (!data["with-docs"] || !data["without-docs"] || !data["agentic"]) {
60
+ const missing = ["with-docs", "without-docs", "agentic"].filter((k) => !data[k]);
61
+ throw new Error(`config/prompts.yaml is missing required keys: ${missing.join(", ")}. Each prompt must have id, label, and template fields.`);
62
+ }
63
+ return {
64
+ agentic: toPrompt(data["agentic"]),
65
+ withDocs: toPrompt(data["with-docs"]),
66
+ withoutDocs: toPrompt(data["without-docs"]),
67
+ };
68
+ }
69
+ function loadModels(rootDir) {
70
+ const raw = readFileSync(resolve(rootDir, "config", "models.yaml"), "utf-8");
71
+ return load(raw);
72
+ }
73
+ // ---------------------------------------------------------------------------
74
+ // Shared components
75
+ // ---------------------------------------------------------------------------
76
/**
 * Body of the inline JavaScript assertion used by URL_EXTRACTION_ASSERT.
 *
 * It collects the distinct http(s) URLs found in `output`, splits them into
 * those containing 'sanity.io' and the rest, and reports the lists and counts
 * as a JSON string in `reason`. It always returns pass: true / score: 1.
 */
const URL_EXTRACTION_SCRIPT = `const urlPattern = /https?:\\/\\/[^\\s\\)\\"\\'\\\`>]+/g;
const urls = [...new Set((output.match(urlPattern) || []))];
const sanityUrls = urls.filter(u => u.includes('sanity.io'));
return {
pass: true,
score: 1,
reason: JSON.stringify({
sanityUrls,
otherUrls: urls.filter(u => !u.includes('sanity.io')),
totalUrlCount: urls.length,
sanityUrlCount: sanityUrls.length
})
};`;
/**
 * URL extraction assertion with weight 0: it records which URLs a response
 * mentions without ever failing the test.
 */
const URL_EXTRACTION_ASSERT = {
    type: "javascript",
    value: URL_EXTRACTION_SCRIPT,
    weight: 0,
};
93
/** Script that implements the source-isolation check. */
const SOURCE_ISOLATION_SCRIPT = "file://dist/assertions/source-isolation.js";
/**
 * Source isolation assertion — advisory (weight: 0).
 *
 * Checks that the agentic provider only fetched docs from allowed origins.
 * It is added to the agentic config only when origin sandboxing is active.
 */
const SOURCE_ISOLATION_ASSERT = {
    metadata: { dimension: "source-isolation" },
    type: "javascript",
    value: SOURCE_ISOLATION_SCRIPT,
    weight: 0,
};
104
+ // ---------------------------------------------------------------------------
105
+ // Config generators
106
+ // ---------------------------------------------------------------------------
107
/**
 * Build the promptfoo config object for the agentic evaluation mode.
 *
 * One provider entry is emitted per model matching "agentic-naive", followed
 * by one per model matching "agentic-optimized". All of them use the
 * agentic-provider script and share the same documentation-source settings.
 *
 * @param models - Parsed models registry (`models`, `defaults`, `grader`, optional `maxConcurrency`).
 * @param tests - Test file references to place under `tests`.
 * @param prompts - Loaded prompts; only `prompts.agentic` is used.
 * @param source - Optional documentation source; when absent no source settings are injected.
 * @param searchMode - Search mode; omitted from the provider config when it resolves to "open".
 * @param allowedOrigins - When non-empty, the source-isolation assertion is added to `defaultTest`.
 * @returns The config object for promptfooconfig.agentic.yaml.
 */
function generateAgenticConfig(models, tests, prompts, source, searchMode, allowedOrigins) {
    // Documentation-source settings injected into every provider. Entries are
    // collected in a fixed order so the generated YAML key order stays stable.
    const effectiveSearchMode = searchMode ?? "open";
    const sourceSettings = [];
    if (source) {
        if (source.allowedOrigins?.length) {
            sourceSettings.push(["allowedOrigins", source.allowedOrigins]);
        }
        sourceSettings.push(["docBaseUrl", source.baseUrl]);
        if (source.headers && Object.keys(source.headers).length > 0) {
            sourceSettings.push(["customHeaders", source.headers]);
        }
        sourceSettings.push(["llmsTxtUrl", source.llmsTxt]);
        if (source.priorityDomain) {
            sourceSettings.push(["priorityDomain", source.priorityDomain]);
        }
        // Tool access control: only a non-default search mode is written out.
        if (effectiveSearchMode !== "open") {
            sourceSettings.push(["searchMode", effectiveSearchMode]);
        }
    }
    const sourceConfig = Object.fromEntries(sourceSettings);
    // [agentMode written to the provider config, eval mode used for filtering, label suffix]
    const agentVariants = [
        ["naive", "agentic-naive", "Naive Agent"],
        ["optimized", "agentic-optimized", "Optimized Agent"],
    ];
    const providers = agentVariants.flatMap(([agentMode, evalMode, suffix]) => models.models
        .filter((m) => modelMatchesMode(m, evalMode))
        .map((model) => {
        const modelName = extractModelName(model.id);
        const provider = extractProvider(model.id);
        return {
            config: {
                ...mergeConfig(models.defaults, model.config, {
                    agentMode,
                    maxToolRounds: models.defaults.maxToolRounds ?? 5,
                    model: modelName,
                    provider,
                }),
                ...sourceConfig,
                observe: true,
                observerOptions: models.defaults.observerOptions ?? {},
            },
            id: "file://dist/agent-observer/agentic-provider.js",
            label: `${model.label} (${suffix})`,
        };
    }));
    // The source-isolation assertion is only meaningful when origin sandboxing is active.
    const defaultAssertions = allowedOrigins?.length ? { assert: [SOURCE_ISOLATION_ASSERT] } : {};
    const concurrency = models.maxConcurrency
        ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
        : {};
    return {
        commandLineOptions: { table: false },
        defaultTest: {
            ...defaultAssertions,
            options: {
                provider: models.grader.id,
                rubricProvider: models.grader.id,
            },
        },
        description: "Sanity AI Literacy Evaluation — Agentic (naive vs optimized)",
        ...concurrency,
        outputPath: "results/latest/eval-results-agentic.json",
        prompts: [prompts.agentic],
        providers,
        tests,
    };
}
192
/**
 * Build the promptfoo config object for the baseline evaluation mode.
 *
 * Every model matching the "baseline" mode becomes a provider that uses the
 * model's own id, with its config merged over the registry defaults. Both the
 * with-docs and without-docs prompts are included, and the URL extraction
 * assertion is attached to every test via `defaultTest`.
 *
 * @param models - Parsed models registry (`models`, `defaults`, `grader`, optional `maxConcurrency`).
 * @param tests - Test file references to place under `tests`.
 * @param prompts - Loaded prompts; `withDocs` and `withoutDocs` are used.
 * @returns The config object for promptfooconfig.yaml.
 */
function generateBaselineConfig(models, tests, prompts) {
    const providers = [];
    for (const model of models.models) {
        if (!modelMatchesMode(model, "baseline")) {
            continue;
        }
        providers.push({
            config: mergeConfig(models.defaults, model.config),
            id: model.id,
            label: model.label,
        });
    }
    const graderOptions = {
        provider: models.grader.id,
        rubricProvider: models.grader.id,
    };
    // evaluateOptions is only emitted when a concurrency limit is configured.
    const concurrency = models.maxConcurrency
        ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
        : {};
    return {
        commandLineOptions: { table: false },
        defaultTest: {
            assert: [URL_EXTRACTION_ASSERT],
            options: graderOptions,
        },
        description: "Sanity AI Literacy Evaluation — Baseline",
        ...concurrency,
        outputPath: "results/latest/eval-results.json",
        prompts: [prompts.withDocs, prompts.withoutDocs],
        providers,
        tests,
    };
}
218
/**
 * Build the promptfoo config object for the observed evaluation mode.
 *
 * Every model matching the "observed" mode is routed through the
 * agent-observer provider script. The provider config is the model config
 * merged over the defaults, plus the bare model name, `observe: true`, and
 * `recordOptions` taken from `defaults.observerOptions` (empty when unset).
 *
 * @param models - Parsed models registry (`models`, `defaults`, `grader`, optional `maxConcurrency`).
 * @param tests - Test file references to place under `tests`.
 * @param prompts - Loaded prompts; `withDocs` and `withoutDocs` are used.
 * @returns The config object for promptfooconfig.observed.yaml.
 */
function generateObservedConfig(models, tests, prompts) {
    const providers = [];
    for (const model of models.models) {
        if (!modelMatchesMode(model, "observed")) {
            continue;
        }
        const modelName = extractModelName(model.id);
        providers.push({
            config: {
                ...mergeConfig(models.defaults, model.config),
                modelName,
                observe: true,
                recordOptions: models.defaults.observerOptions ?? {},
            },
            id: "file://dist/agent-observer/provider.js",
            label: `${model.label} (Observed)`,
        });
    }
    const graderOptions = {
        provider: models.grader.id,
        rubricProvider: models.grader.id,
    };
    // evaluateOptions is only emitted when a concurrency limit is configured.
    const concurrency = models.maxConcurrency
        ? { evaluateOptions: { maxConcurrency: models.maxConcurrency } }
        : {};
    return {
        commandLineOptions: { table: false },
        defaultTest: { options: graderOptions },
        description: "Sanity AI Literacy Evaluation — Observed",
        ...concurrency,
        outputPath: "results/latest/eval-results-observed.json",
        prompts: [prompts.withDocs, prompts.withoutDocs],
        providers,
        tests,
    };
}
251
+ // ---------------------------------------------------------------------------
252
+ // Main entry point
253
+ // ---------------------------------------------------------------------------
254
/**
 * Generate Promptfoo config files from models.yaml + task definitions.
 *
 * All parameters are passed via the typed options object — no process.argv
 * parsing or env var fallbacks. Callers (command handlers, orchestration
 * steps) are responsible for resolving options from their own context.
 *
 * Options read here:
 * - `rootDir` — base directory for config/, tasks/ and the generated files.
 * - `filter` — optional `{ areas, taskIds }`; only applied on the YAML path.
 * - `tasks` — optional pre-loaded task definitions; when present they are
 *   expanded directly and tasks/*.yaml is not read.
 * - `resolvedSource` / `source` — a pre-resolved doc source, or a source name
 *   to look up; the pre-resolved value wins.
 * - `searchMode`, `allowedOrigins` — forwarded to the agentic config.
 *
 * Side effects: writes tasks/.expanded.yaml, tasks/.expanded.agentic.yaml and
 * the three promptfooconfig*.yaml files under `rootDir`, logs progress to the
 * console, and calls `process.exit(1)` when config/models.yaml has validation
 * errors. A source that fails to load is reported as a warning and generation
 * continues without it.
 */
export function generateConfigs(options) {
    const { rootDir } = options;
    // Validate config/models.yaml before generating configs
    const modelIssues = validateModelsYaml(rootDir);
    const modelErrors = modelIssues.filter((i) => i.severity === "error");
    if (modelErrors.length > 0) {
        console.error("❌ config/models.yaml validation failed:");
        for (const e of modelErrors) {
            console.error(` ERROR: ${e.message}`);
            if (e.path) {
                console.error(` at ${e.path}`);
            }
        }
        console.error("\nFix config/models.yaml before generating configs. Run 'pnpm validate' for details.");
        process.exit(1);
    }
    console.log("Loading config/models.yaml...");
    const models = loadModels(rootDir);
    const activeModels = models.models.filter((m) => m.id && m.label);
    console.log(` Found ${activeModels.length} active model(s):`);
    for (const m of activeModels) {
        // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty array join → "all"
        const modes = m.modes?.join(", ") || "all";
        console.log(` - ${m.label} (${m.id}) → [${modes}]`);
    }
    // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty label falls through to id
    console.log(` Grader: ${models.grader.label || models.grader.id}`);
    // Build filter from options
    const filter = options.filter?.areas || options.filter?.taskIds
        ? options.filter
        : undefined;
    // Expand tasks — use TaskDefinition[] from TaskSource when provided,
    // otherwise fall back to loading from tasks/*.yaml files.
    let entries;
    let agenticEntries;
    if (options.tasks) {
        // TaskSource path — tasks already loaded and filtered by the adapter
        const baselineResult = expandTaskDefinitions(options.tasks, rootDir, "baseline");
        entries = baselineResult.entries;
        console.log(` Expanded ${baselineResult.stats.totalTasks} task(s) → ${baselineResult.stats.expandedTotal} test entries (from TaskSource)`);
        const agenticResult = expandTaskDefinitions(options.tasks, rootDir, "agentic");
        agenticEntries = agenticResult.entries;
        console.log(` Agentic: ${agenticResult.stats.expandedTotal} entries (gold only, no baseline)`);
    }
    else {
        // Legacy path — read from tasks/*.yaml files
        const { entries: baselineEntries, stats } = loadAndExpandTasks(rootDir, filter, "baseline");
        entries = baselineEntries;
        console.log(` Expanded ${stats.singleDefinitions} task(s) → ${stats.expandedTotal} test entries`);
        if (stats.legacyEntries > 0) {
            console.log(` ⚠ ${stats.legacyEntries} legacy (paired) entries passed through unchanged`);
        }
        if (filter) {
            const parts = [];
            if (filter.areas) {
                parts.push(`areas: ${filter.areas.join(", ")}`);
            }
            if (filter.taskIds) {
                parts.push(`tasks: ${filter.taskIds.join(", ")}`);
            }
            console.log(` Scoped to: ${parts.join("; ")}`);
        }
        const { entries: agenticFromYaml, stats: agenticStats } = loadAndExpandTasks(rootDir, filter, "agentic");
        agenticEntries = agenticFromYaml;
        console.log(` Agentic: ${agenticStats.expandedTotal} entries (gold only, no baseline)`);
    }
    // Both expanded task files are serialized with the same YAML settings.
    const yamlDumpOptions = {
        forceQuotes: false,
        lineWidth: 120,
        noRefs: true,
        quotingType: "'",
    };
    // Write expanded tasks to generated files for Promptfoo to consume
    const expandedPath = resolve(rootDir, "tasks", ".expanded.yaml");
    const expandedYaml = dump(entries, yamlDumpOptions);
    writeFileSync(expandedPath, `# .expanded.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n${expandedYaml}`, "utf-8");
    console.log(` ✓ tasks/.expanded.yaml (${entries.length} entries)`);
    const agenticExpandedPath = resolve(rootDir, "tasks", ".expanded.agentic.yaml");
    const agenticExpandedYaml = dump(agenticEntries, yamlDumpOptions);
    writeFileSync(agenticExpandedPath, `# .expanded.agentic.yaml\n#\n# AUTO-GENERATED — do not edit directly.\n# Gold entries only (no baseline) for agentic evaluation mode.\n# Source: tasks/*.yaml (single-definition format)\n# Run: pnpm generate-configs\n\n${agenticExpandedYaml}`, "utf-8");
    console.log(` ✓ tasks/.expanded.agentic.yaml (${agenticEntries.length} entries)`);
    const taskFiles = ["file://tasks/.expanded.yaml"];
    const agenticTaskFiles = ["file://tasks/.expanded.agentic.yaml"];
    // Load prompt templates
    const prompts = loadPrompts(rootDir);
    console.log(` Loaded prompts: ${Object.keys(prompts).join(", ")}`);
    // Load optional documentation source configuration
    // Pre-resolved source wins over name-based lookup
    let source = options.resolvedSource;
    const sourceName = options.source;
    if (!source && sourceName) {
        console.log(`\nLoading source: ${sourceName}`);
        try {
            source = loadSource(sourceName);
        }
        catch (err) {
            // Best effort: a source that cannot be loaded only produces a warning.
            const msg = err instanceof Error ? err.message : String(err);
            console.warn(`\n⚠ Failed to load source "${sourceName}": ${msg}`);
        }
    }
    if (source) {
        console.log(` Base URL: ${source.baseUrl}`);
        console.log(` Dataset: ${source.dataset}`);
        if (source.allowedOrigins?.length) {
            console.log(` Allowed origins: ${source.allowedOrigins.join(", ")}`);
        }
    }
    console.log("\nGenerating configs...");
    writeConfig(rootDir, "promptfooconfig.yaml", generateBaselineConfig(models, taskFiles, prompts), `# promptfooconfig.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`);
    writeConfig(rootDir, "promptfooconfig.observed.yaml", generateObservedConfig(models, taskFiles, prompts), `# promptfooconfig.observed.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`);
    writeConfig(rootDir, "promptfooconfig.agentic.yaml", generateAgenticConfig(models, agenticTaskFiles, prompts, source, options.searchMode, options.allowedOrigins), `# promptfooconfig.agentic.yaml\n#\n# AUTO-GENERATED from config/models.yaml — do not edit directly.\n# Run: pnpm generate-configs\n`);
    console.log("\nDone! Configs are ready.");
    if (source) {
        // A pre-resolved source may arrive without a name; identify it by its
        // base URL instead of printing "undefined".
        const sourceLabel = sourceName ? sourceName : source.baseUrl;
        console.log(` (using doc source: ${sourceLabel})`);
    }
}
381
+ // ---------------------------------------------------------------------------
382
+ // File writing
383
+ // ---------------------------------------------------------------------------
384
/**
 * Serialize a config object to YAML and write it to `<rootDir>/<filename>`.
 *
 * The file content is `header`, a newline, then the YAML dump. A confirmation
 * line naming the file is logged afterwards.
 *
 * @param rootDir - Directory the file is written into.
 * @param filename - Name of the file, relative to `rootDir`.
 * @param config - Config object to serialize.
 * @param header - Comment header placed before the YAML.
 */
function writeConfig(rootDir, filename, config, header) {
    const body = dump(config, {
        forceQuotes: false,
        lineWidth: 120,
        noRefs: true,
        quotingType: "'",
    });
    writeFileSync(resolve(rootDir, filename), `${header}\n${body}`, "utf-8");
    console.log(` ✓ ${filename}`);
}
@@ -0,0 +1,49 @@
1
+ /**
2
+ * grader-api.ts
3
+ *
4
+ * Shared utility for calling LLM grading APIs from grader scripts.
5
+ *
6
+ * Dispatches to the correct provider API (OpenAI, Anthropic) based on the
7
+ * grader model prefix. Reads the appropriate API key from environment.
8
+ *
9
+ * Also exports `loadGraderModel()` to resolve the grader from
10
+ * `config/models.yaml`.
11
+ *
12
+ * Migrated from lib/grader-api.ts — no module-level side effects, no
13
+ * process.exit(), accepts rootDir as parameter for file-based operations.
14
+ */
15
/** Connection details for one grader call, as produced by `resolveProvider()`. */
interface ProviderConfig {
    /** API key for the provider. */
    apiKey: string;
    /** Endpoint URL that grading requests are sent to. */
    baseUrl: string;
    /** Model name with the provider prefix removed. */
    modelName: string;
}
20
/**
 * Score a response against a rubric with a single grader-model call.
 *
 * The provider API is chosen from the prefix of `graderModel`.
 *
 * @param graderModel - Promptfoo-style model ID of the grader.
 * @param responseText - The response being evaluated.
 * @param rubricText - The rubric to grade against.
 * @returns A numeric score (0–100), or `null` if the call or parse fails.
 */
export declare function gradeOnce(graderModel: string, responseText: string, rubricText: string): Promise<number | null>;
27
/**
 * Resolve the grader model from `config/models.yaml`.
 *
 * Falls back to `openai:gpt-5` if no grader is configured.
 *
 * @param rootDir - Directory that contains the `config/` folder.
 * @returns The grader's model ID and human-readable label.
 * @throws Error if config/models.yaml is not found
 */
export declare function loadGraderModel(rootDir: string): {
    id: string;
    label: string;
};
38
/**
 * Extract a numeric score from raw grader output.
 *
 * @param content - Text returned by the grader model.
 * @returns The parsed score, or `null` when no number can be found.
 */
export declare function extractScore(content: string): number | null;
39
/**
 * Turn a Promptfoo-style model ID into the provider config used for grading.
 *
 * Supported formats:
 * - `openai:chat:gpt-5.2` → OpenAI, model = `gpt-5.2`
 * - `openai:gpt-5` → OpenAI, model = `gpt-5`
 * - `anthropic:messages:claude-opus-4-5-20251101` → Anthropic, model = `claude-opus-4-5-20251101`
 * - `anthropic:claude-sonnet-4` → Anthropic, model = `claude-sonnet-4`
 *
 * @param graderModel - Model ID whose first `:`-separated segment names the provider.
 */
export declare function resolveProvider(graderModel: string): ProviderConfig;
49
+ export {};
@@ -0,0 +1,200 @@
1
+ /**
2
+ * grader-api.ts
3
+ *
4
+ * Shared utility for calling LLM grading APIs from grader scripts.
5
+ *
6
+ * Dispatches to the correct provider API (OpenAI, Anthropic) based on the
7
+ * grader model prefix. Reads the appropriate API key from environment.
8
+ *
9
+ * Also exports `loadGraderModel()` to resolve the grader from
10
+ * `config/models.yaml`.
11
+ *
12
+ * Migrated from lib/grader-api.ts — no module-level side effects, no
13
+ * process.exit(), accepts rootDir as parameter for file-based operations.
14
+ */
15
+ import { existsSync, readFileSync } from "fs";
16
+ import { join } from "path";
17
+ import { load } from "js-yaml";
18
+ // ---------------------------------------------------------------------------
19
+ // Public API
20
+ // ---------------------------------------------------------------------------
21
/**
 * Score a response against a rubric with a single grader-model call.
 *
 * The provider API is chosen from the prefix of `graderModel`. Only the first
 * 8000 characters of `responseText` are sent to the grader.
 *
 * Provider resolution happens before the guarded section, so errors raised by
 * `resolveProvider()` (unsupported provider, missing API key) propagate to the
 * caller. Failures of the call itself are logged and turned into `null`.
 *
 * @param graderModel - Promptfoo-style model ID of the grader.
 * @param responseText - The response being evaluated.
 * @param rubricText - The rubric to grade against.
 * @returns The extracted score, or `null` if the call or parse fails.
 */
export async function gradeOnce(graderModel, responseText, rubricText) {
    const config = resolveProvider(graderModel);
    const prompt = [
        "You are evaluating an AI assistant's response. Grade the response according to the following rubric.",
        "",
        "## Response to evaluate:",
        `${responseText.slice(0, 8000)}`,
        "",
        "## Rubric:",
        `${rubricText}`,
        "",
    ].join("\n");
    try {
        let content;
        switch (graderModel.split(":")[0]) {
            case "anthropic":
                content = await callAnthropic(config, prompt);
                break;
            case "openai":
                content = await callOpenAI(config, prompt);
                break;
            default:
                // resolveProvider already throws for unknown providers, but just in case
                return null;
        }
        // null means the provider call already reported an API error.
        if (content === null) {
            return null;
        }
        const score = extractScore(content);
        if (score === null) {
            console.error(` ⚠ Could not parse grader response: ${content.slice(0, 100)}`);
        }
        return score;
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.error(` ⚠ Grader call failed: ${reason}`);
        return null;
    }
}
63
+ /**
64
+ * Load the grader model from `config/models.yaml`.
65
+ * Returns both the model ID and human-readable label.
66
+ * Falls back to `openai:gpt-5` if not configured.
67
+ *
68
+ * @throws Error if config/models.yaml is not found
69
+ */
70
+ export function loadGraderModel(rootDir) {
71
+ const modelsPath = join(rootDir, "config", "models.yaml");
72
+ if (!existsSync(modelsPath)) {
73
+ throw new Error(`config/models.yaml not found at ${modelsPath}`);
74
+ }
75
+ const raw = readFileSync(modelsPath, "utf-8");
76
+ const data = load(raw);
77
+ return {
78
+ id: data?.grader?.id ?? "openai:gpt-5",
79
+ label: data?.grader?.label ?? "GPT-5 (grader)",
80
+ };
81
+ }
82
+ // ---------------------------------------------------------------------------
83
+ // Score extraction (exported for testing)
84
+ // ---------------------------------------------------------------------------
85
/**
 * Extract a numeric score from raw grader output.
 *
 * Strategies, in order:
 * 1. The whole text is JSON with a numeric `score` field → that value.
 * 2. The text contains a JSON-style `"score": <number>` pair somewhere (for
 *    example inside a Markdown code fence or after introductory prose) → that
 *    number. Without this step, a digit in the surrounding text or in the
 *    `reason` field would be mistaken for the score.
 * 3. Otherwise the first run of digits in the text, parsed as an integer.
 *
 * @param content - Text returned by the grader model.
 * @returns The extracted score, or `null` when the text contains no number.
 */
export function extractScore(content) {
    // Try JSON parse first: {"score": 85, "reason": "..."}
    try {
        const parsed = JSON.parse(content);
        if (typeof parsed === "object" &&
            parsed !== null &&
            "score" in parsed &&
            typeof parsed.score === "number") {
            return parsed.score;
        }
    }
    catch {
        // Not JSON — fall through
    }
    // JSON embedded in other text: prefer the value attached to the "score" key.
    const keyed = content.match(/"score"\s*:\s*(\d+(?:\.\d+)?)/);
    if (keyed) {
        return Number(keyed[1]);
    }
    // Fallback: extract first bare number
    const match = content.match(/(\d+)/);
    if (match) {
        return parseInt(match[1], 10);
    }
    return null;
}
105
+ // ---------------------------------------------------------------------------
106
+ // Provider resolution (exported for testing)
107
+ // ---------------------------------------------------------------------------
108
/**
 * Turn a Promptfoo-style model ID into the provider config used for grading.
 *
 * Supported formats:
 * - `openai:chat:gpt-5.2` → OpenAI, model = `gpt-5.2`
 * - `openai:gpt-5` → OpenAI, model = `gpt-5`
 * - `anthropic:messages:claude-opus-4-5-20251101` → Anthropic, model = `claude-opus-4-5-20251101`
 * - `anthropic:claude-sonnet-4` → Anthropic, model = `claude-sonnet-4`
 *
 * The API key is read from `ANTHROPIC_API_KEY` or `OPENAI_API_KEY`.
 *
 * @param graderModel - Model ID whose first `:`-separated segment names the provider.
 * @returns `{ apiKey, baseUrl, modelName }` for the provider.
 * @throws Error if the provider is not supported or its API key is not set.
 */
export function resolveProvider(graderModel) {
    const [provider, ...rest] = graderModel.split(":");
    let modelName;
    let envVar;
    let baseUrl;
    if (provider === "anthropic") {
        // Drop the API-type segment only when it is literally "messages":
        // "anthropic:messages:claude-opus-4-5" → "claude-opus-4-5"
        // "anthropic:claude-sonnet-4" → "claude-sonnet-4"
        const hasApiSegment = rest.length >= 2 && rest[0] === "messages";
        modelName = (hasApiSegment ? rest.slice(1) : rest).join(":");
        envVar = "ANTHROPIC_API_KEY";
        baseUrl = "https://api.anthropic.com/v1/messages";
    }
    else if (provider === "openai") {
        // With three or more segments the second one is always dropped:
        // "openai:chat:gpt-5.2" → "gpt-5.2", "openai:gpt-5" → "gpt-5"
        modelName = rest.length >= 2 ? rest.slice(1).join(":") : rest[0];
        envVar = "OPENAI_API_KEY";
        baseUrl = "https://api.openai.com/v1/chat/completions";
    }
    else {
        throw new Error(`Unsupported grader provider "${provider}" in model "${graderModel}". ` +
            "Supported: openai, anthropic.");
    }
    const apiKey = process.env[envVar];
    if (!apiKey) {
        throw new Error(`${envVar} not set. Required for grader model: ` + graderModel);
    }
    return { apiKey, baseUrl, modelName };
}
152
+ // ---------------------------------------------------------------------------
153
+ // Provider-specific API calls
154
+ // ---------------------------------------------------------------------------
155
/**
 * Send one user message to the Anthropic endpoint in `config` and return the
 * text of the reply.
 *
 * @param config - Provider config (`baseUrl`, `apiKey`, `modelName`).
 * @param prompt - Prompt sent as the single user message.
 * @returns The text of the first `text` content block, `""` when the reply has
 *   none, or `null` when the HTTP status is not OK (the error is logged).
 */
async function callAnthropic(config, prompt) {
    const payload = {
        max_tokens: 256,
        messages: [{ content: prompt, role: "user" }],
        model: config.modelName,
        temperature: 0.2,
    };
    const headers = {
        "anthropic-version": "2023-06-01",
        "Content-Type": "application/json",
        "x-api-key": config.apiKey,
    };
    const response = await fetch(config.baseUrl, {
        body: JSON.stringify(payload),
        headers,
        method: "POST",
    });
    if (!response.ok) {
        // Only the first 200 characters of the error body are logged.
        const errorBody = await response.text();
        console.error(` ⚠ Grader API error (Anthropic): ${response.status} ${errorBody.slice(0, 200)}`);
        return null;
    }
    const data = (await response.json());
    const firstText = data.content?.find((block) => block.type === "text");
    return firstText?.text ?? "";
}
179
/**
 * Send one user message to the OpenAI endpoint in `config` and return the
 * text of the reply.
 *
 * @param config - Provider config (`baseUrl`, `apiKey`, `modelName`).
 * @param prompt - Prompt sent as the single user message.
 * @returns The first choice's message content, `""` when it is absent, or
 *   `null` when the HTTP status is not OK (the error is logged).
 */
async function callOpenAI(config, prompt) {
    const payload = {
        max_tokens: 256,
        messages: [{ content: prompt, role: "user" }],
        model: config.modelName,
        temperature: 0.2,
    };
    const headers = {
        Authorization: `Bearer ${config.apiKey}`,
        "Content-Type": "application/json",
    };
    const response = await fetch(config.baseUrl, {
        body: JSON.stringify(payload),
        headers,
        method: "POST",
    });
    if (!response.ok) {
        // Only the first 200 characters of the error body are logged.
        const errorBody = await response.text();
        console.error(` ⚠ Grader API error (OpenAI): ${response.status} ${errorBody.slice(0, 200)}`);
        return null;
    }
    const data = (await response.json());
    const firstChoice = data.choices?.[0];
    return firstChoice?.message?.content ?? "";
}
@@ -0,0 +1,44 @@
1
+ /**
2
+ * pipeline/grader-compare-runner.ts
3
+ *
4
+ * Orchestration module for inter-grader comparison (Phase 3).
5
+ *
6
+ * Reads eval results, extracts grading judgments, re-grades each with
7
+ * candidate models, and calls `compareGraders()` from the pure
8
+ * computation module.
9
+ *
10
+ * Migrated from lib/grader-compare.ts — no process.argv, no process.exit(),
11
+ * no module-level constants. Accepts rootDir as parameter.
12
+ *
13
+ * @see docs/exec-plans/completed/grader-reliability.md — Phase 3
14
+ */
15
+ import { type GraderComparison } from "./grader-comparison.js";
16
/** Options accepted by `runGraderCompare()`. */
export interface GraderCompareRunnerOptions {
    /** Candidate grader models (ID plus display label) compared against the baseline grader. */
    candidates: Array<{
        id: string;
        label: string;
    }>;
    /** Output format of the report. */
    format?: "json" | "table";
    /** Where to write the comparison; defaults to results/latest/grader-comparison.json. */
    outputPath?: string;
    /** Eval results to read; defaults to results/latest/eval-results.json. */
    resultsPath?: string;
    /** Root directory of the eval package. */
    rootDir: string;
}
31
/**
 * Render a GraderComparison as a human-readable table report.
 *
 * Nothing is printed to the console; the report is handed back to the caller.
 *
 * @param result - Comparison to render.
 * @returns The formatted report text.
 */
export declare function formatComparisonReport(result: GraderComparison): string;
36
/**
 * Run inter-grader comparison.
 *
 * Reads the eval results, grades each judgment with the baseline grader and
 * with every candidate grader, and produces a comparison report.
 *
 * @param options - Candidates, paths and output format for the run.
 * @returns The computed comparison.
 * @throws Error if results file not found, no candidates configured, or no judgments found
 */
export declare function runGraderCompare(options: GraderCompareRunnerOptions): Promise<GraderComparison>;