@sanity/ailf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (530)
  1. package/README.md +89 -0
  2. package/bin/ailf.js +64 -0
  3. package/canonical/grader-references/README.md +88 -0
  4. package/canonical/grader-references/groq.yaml +234 -0
  5. package/canonical/grader-references/studio-setup.yaml +275 -0
  6. package/canonical/reference-solutions/.gitkeep +1 -0
  7. package/canonical/reference-solutions/frameworks/nuxt.ts +119 -0
  8. package/canonical/reference-solutions/frameworks/remix.tsx +100 -0
  9. package/canonical/reference-solutions/functions/publish-webhook.ts +60 -0
  10. package/canonical/reference-solutions/groq/advanced-filtering.ts +379 -0
  11. package/canonical/reference-solutions/groq/blog-queries.ts +137 -0
  12. package/canonical/reference-solutions/groq/joins-references.ts +300 -0
  13. package/canonical/reference-solutions/nextjs/app-router-integration.tsx +128 -0
  14. package/canonical/reference-solutions/studio-setup/blog-schema.ts +143 -0
  15. package/canonical/reference-solutions/studio-setup/custom-tool.tsx +78 -0
  16. package/canonical/reference-solutions/visual-editing/live-preview.tsx +137 -0
  17. package/canonical/reference-solutions/visual-editing/presentation-nextjs.tsx +130 -0
  18. package/config/airbyte/ai_literacy_framework.connector.yaml +639 -0
  19. package/config/bigquery/README.md +74 -0
  20. package/config/bigquery/views/area_scores.sql +87 -0
  21. package/config/bigquery/views/reports.sql +49 -0
  22. package/config/features.yaml +116 -0
  23. package/config/models.yaml +115 -0
  24. package/config/prompts.yaml +75 -0
  25. package/config/rubrics.yaml +62 -0
  26. package/config/schedules.yaml +43 -0
  27. package/config/sinks.yaml +54 -0
  28. package/config/sources.yaml +51 -0
  29. package/config/thresholds.yaml +49 -0
  30. package/dist/_vendor/ailf-core/examples/index.d.ts +190 -0
  31. package/dist/_vendor/ailf-core/examples/index.js +285 -0
  32. package/dist/_vendor/ailf-core/index.d.ts +17 -0
  33. package/dist/_vendor/ailf-core/index.js +17 -0
  34. package/dist/_vendor/ailf-core/ports/cache-store.d.ts +72 -0
  35. package/dist/_vendor/ailf-core/ports/cache-store.js +17 -0
  36. package/dist/_vendor/ailf-core/ports/config-source.d.ts +33 -0
  37. package/dist/_vendor/ailf-core/ports/config-source.js +15 -0
  38. package/dist/_vendor/ailf-core/ports/context.d.ts +172 -0
  39. package/dist/_vendor/ailf-core/ports/context.js +14 -0
  40. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +131 -0
  41. package/dist/_vendor/ailf-core/ports/doc-fetcher.js +12 -0
  42. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +24 -0
  43. package/dist/_vendor/ailf-core/ports/eval-runner.js +8 -0
  44. package/dist/_vendor/ailf-core/ports/index.d.ts +15 -0
  45. package/dist/_vendor/ailf-core/ports/index.js +7 -0
  46. package/dist/_vendor/ailf-core/ports/logger.d.ts +36 -0
  47. package/dist/_vendor/ailf-core/ports/logger.js +11 -0
  48. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +46 -0
  49. package/dist/_vendor/ailf-core/ports/pipeline-step.js +8 -0
  50. package/dist/_vendor/ailf-core/ports/task-source.d.ts +159 -0
  51. package/dist/_vendor/ailf-core/ports/task-source.js +72 -0
  52. package/dist/_vendor/ailf-core/schemas/callback-payload.d.ts +24 -0
  53. package/dist/_vendor/ailf-core/schemas/callback-payload.js +29 -0
  54. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +55 -0
  55. package/dist/_vendor/ailf-core/schemas/eval-config.js +78 -0
  56. package/dist/_vendor/ailf-core/schemas/index.d.ts +16 -0
  57. package/dist/_vendor/ailf-core/schemas/index.js +16 -0
  58. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +125 -0
  59. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +67 -0
  60. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +531 -0
  61. package/dist/_vendor/ailf-core/schemas/pipeline.js +318 -0
  62. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +68 -0
  63. package/dist/_vendor/ailf-core/schemas/schedules.js +74 -0
  64. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +207 -0
  65. package/dist/_vendor/ailf-core/schemas/sinks.js +108 -0
  66. package/dist/_vendor/ailf-core/services/comparison-formatters.d.ts +18 -0
  67. package/dist/_vendor/ailf-core/services/comparison-formatters.js +189 -0
  68. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +41 -0
  69. package/dist/_vendor/ailf-core/services/config-helpers.js +86 -0
  70. package/dist/_vendor/ailf-core/services/index.d.ts +12 -0
  71. package/dist/_vendor/ailf-core/services/index.js +12 -0
  72. package/dist/_vendor/ailf-core/services/scoring.d.ts +49 -0
  73. package/dist/_vendor/ailf-core/services/scoring.js +222 -0
  74. package/dist/_vendor/ailf-core/types/index.d.ts +1082 -0
  75. package/dist/_vendor/ailf-core/types/index.js +21 -0
  76. package/dist/_vendor/ailf-core/types/scoring-input.d.ts +54 -0
  77. package/dist/_vendor/ailf-core/types/scoring-input.js +9 -0
  78. package/dist/_vendor/ailf-shared/dimension-names.d.ts +21 -0
  79. package/dist/_vendor/ailf-shared/dimension-names.js +27 -0
  80. package/dist/_vendor/ailf-shared/document-ref.d.ts +29 -0
  81. package/dist/_vendor/ailf-shared/document-ref.js +1 -0
  82. package/dist/_vendor/ailf-shared/eval-modes.d.ts +12 -0
  83. package/dist/_vendor/ailf-shared/eval-modes.js +8 -0
  84. package/dist/_vendor/ailf-shared/index.d.ts +16 -0
  85. package/dist/_vendor/ailf-shared/index.js +16 -0
  86. package/dist/_vendor/ailf-shared/noise-threshold.d.ts +9 -0
  87. package/dist/_vendor/ailf-shared/noise-threshold.js +9 -0
  88. package/dist/_vendor/ailf-shared/score-grades.d.ts +17 -0
  89. package/dist/_vendor/ailf-shared/score-grades.js +23 -0
  90. package/dist/adapters/cache/content-lake-cache.d.ts +24 -0
  91. package/dist/adapters/cache/content-lake-cache.js +59 -0
  92. package/dist/adapters/cache/filesystem-cache.d.ts +18 -0
  93. package/dist/adapters/cache/filesystem-cache.js +54 -0
  94. package/dist/adapters/cache/index.d.ts +2 -0
  95. package/dist/adapters/cache/index.js +2 -0
  96. package/dist/adapters/config-sources/cli-config-adapter.d.ts +17 -0
  97. package/dist/adapters/config-sources/cli-config-adapter.js +23 -0
  98. package/dist/adapters/config-sources/file-config-adapter.d.ts +26 -0
  99. package/dist/adapters/config-sources/file-config-adapter.js +96 -0
  100. package/dist/adapters/config-sources/index.d.ts +2 -0
  101. package/dist/adapters/config-sources/index.js +2 -0
  102. package/dist/adapters/doc-fetchers/index.d.ts +1 -0
  103. package/dist/adapters/doc-fetchers/index.js +1 -0
  104. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +76 -0
  105. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +620 -0
  106. package/dist/adapters/eval-runners/index.d.ts +1 -0
  107. package/dist/adapters/eval-runners/index.js +1 -0
  108. package/dist/adapters/eval-runners/promptfoo-eval-adapter.d.ts +14 -0
  109. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +63 -0
  110. package/dist/adapters/index.d.ts +12 -0
  111. package/dist/adapters/index.js +12 -0
  112. package/dist/adapters/loggers/console-logger.d.ts +22 -0
  113. package/dist/adapters/loggers/console-logger.js +54 -0
  114. package/dist/adapters/loggers/index.d.ts +9 -0
  115. package/dist/adapters/loggers/index.js +9 -0
  116. package/dist/adapters/loggers/json-logger.d.ts +18 -0
  117. package/dist/adapters/loggers/json-logger.js +33 -0
  118. package/dist/adapters/loggers/quiet-logger.d.ts +16 -0
  119. package/dist/adapters/loggers/quiet-logger.js +30 -0
  120. package/dist/adapters/task-sources/composite-task-source.d.ts +20 -0
  121. package/dist/adapters/task-sources/composite-task-source.js +59 -0
  122. package/dist/adapters/task-sources/content-lake-task-source.d.ts +20 -0
  123. package/dist/adapters/task-sources/content-lake-task-source.js +219 -0
  124. package/dist/adapters/task-sources/index.d.ts +7 -0
  125. package/dist/adapters/task-sources/index.js +7 -0
  126. package/dist/adapters/task-sources/repo-schemas.d.ts +245 -0
  127. package/dist/adapters/task-sources/repo-schemas.js +234 -0
  128. package/dist/adapters/task-sources/repo-task-source.d.ts +22 -0
  129. package/dist/adapters/task-sources/repo-task-source.js +104 -0
  130. package/dist/adapters/task-sources/repo-trigger.d.ts +52 -0
  131. package/dist/adapters/task-sources/repo-trigger.js +153 -0
  132. package/dist/adapters/task-sources/repo-validation.d.ts +49 -0
  133. package/dist/adapters/task-sources/repo-validation.js +164 -0
  134. package/dist/adapters/task-sources/yaml-task-source.d.ts +18 -0
  135. package/dist/adapters/task-sources/yaml-task-source.js +136 -0
  136. package/dist/agent-observer/agentic-provider.d.ts +132 -0
  137. package/dist/agent-observer/agentic-provider.js +983 -0
  138. package/dist/agent-observer/classifier.d.ts +62 -0
  139. package/dist/agent-observer/classifier.js +269 -0
  140. package/dist/agent-observer/index.d.ts +7 -0
  141. package/dist/agent-observer/index.js +4 -0
  142. package/dist/agent-observer/pricing.d.ts +35 -0
  143. package/dist/agent-observer/pricing.js +82 -0
  144. package/dist/agent-observer/provider.d.ts +77 -0
  145. package/dist/agent-observer/provider.js +151 -0
  146. package/dist/agent-observer/proxy.d.ts +91 -0
  147. package/dist/agent-observer/proxy.js +321 -0
  148. package/dist/agent-observer/test-imports.d.ts +7 -0
  149. package/dist/agent-observer/test-imports.js +185 -0
  150. package/dist/agent-observer/types.d.ts +137 -0
  151. package/dist/agent-observer/types.js +16 -0
  152. package/dist/assertions/source-isolation.d.ts +72 -0
  153. package/dist/assertions/source-isolation.js +117 -0
  154. package/dist/cli.d.ts +24 -0
  155. package/dist/cli.js +199 -0
  156. package/dist/commands/agent-report.d.ts +5 -0
  157. package/dist/commands/agent-report.js +69 -0
  158. package/dist/commands/baseline.d.ts +9 -0
  159. package/dist/commands/baseline.js +141 -0
  160. package/dist/commands/cache.d.ts +13 -0
  161. package/dist/commands/cache.js +135 -0
  162. package/dist/commands/calculate-scores.d.ts +8 -0
  163. package/dist/commands/calculate-scores.js +48 -0
  164. package/dist/commands/compare.d.ts +8 -0
  165. package/dist/commands/compare.js +120 -0
  166. package/dist/commands/completion.d.ts +18 -0
  167. package/dist/commands/completion.js +260 -0
  168. package/dist/commands/coverage-audit.d.ts +7 -0
  169. package/dist/commands/coverage-audit.js +40 -0
  170. package/dist/commands/discovery-report.d.ts +10 -0
  171. package/dist/commands/discovery-report.js +44 -0
  172. package/dist/commands/eval.d.ts +9 -0
  173. package/dist/commands/eval.js +35 -0
  174. package/dist/commands/explain-handler.d.ts +34 -0
  175. package/dist/commands/explain-handler.js +719 -0
  176. package/dist/commands/fetch-docs.d.ts +8 -0
  177. package/dist/commands/fetch-docs.js +128 -0
  178. package/dist/commands/generate-configs.d.ts +8 -0
  179. package/dist/commands/generate-configs.js +46 -0
  180. package/dist/commands/grader/index.d.ts +11 -0
  181. package/dist/commands/grader/index.js +118 -0
  182. package/dist/commands/init.d.ts +19 -0
  183. package/dist/commands/init.js +150 -0
  184. package/dist/commands/interactive.d.ts +12 -0
  185. package/dist/commands/interactive.js +238 -0
  186. package/dist/commands/lookup-doc.d.ts +15 -0
  187. package/dist/commands/lookup-doc.js +84 -0
  188. package/dist/commands/measure-retrieval.d.ts +5 -0
  189. package/dist/commands/measure-retrieval.js +65 -0
  190. package/dist/commands/pipeline-action.d.ts +71 -0
  191. package/dist/commands/pipeline-action.js +305 -0
  192. package/dist/commands/pipeline.d.ts +62 -0
  193. package/dist/commands/pipeline.js +53 -0
  194. package/dist/commands/pr-comment.d.ts +8 -0
  195. package/dist/commands/pr-comment.js +47 -0
  196. package/dist/commands/publish.d.ts +26 -0
  197. package/dist/commands/publish.js +253 -0
  198. package/dist/commands/readiness-report.d.ts +10 -0
  199. package/dist/commands/readiness-report.js +104 -0
  200. package/dist/commands/shared/options.d.ts +29 -0
  201. package/dist/commands/shared/options.js +57 -0
  202. package/dist/commands/update-quality-scores.d.ts +5 -0
  203. package/dist/commands/update-quality-scores.js +20 -0
  204. package/dist/commands/validate-tasks.d.ts +16 -0
  205. package/dist/commands/validate-tasks.js +93 -0
  206. package/dist/commands/validate.d.ts +9 -0
  207. package/dist/commands/validate.js +73 -0
  208. package/dist/commands/webhook-server.d.ts +5 -0
  209. package/dist/commands/webhook-server.js +30 -0
  210. package/dist/commands/weekly-digest.d.ts +10 -0
  211. package/dist/commands/weekly-digest.js +104 -0
  212. package/dist/composition-root.d.ts +26 -0
  213. package/dist/composition-root.js +107 -0
  214. package/dist/interpolate.d.ts +26 -0
  215. package/dist/interpolate.js +70 -0
  216. package/dist/job-store.d.ts +104 -0
  217. package/dist/job-store.js +188 -0
  218. package/dist/lib/agent-behavior-report.d.ts +8 -0
  219. package/dist/lib/agent-behavior-report.js +185 -0
  220. package/dist/lib/baseline.d.ts +19 -0
  221. package/dist/lib/baseline.js +153 -0
  222. package/dist/lib/calculate-scores.d.ts +23 -0
  223. package/dist/lib/calculate-scores.js +42 -0
  224. package/dist/lib/compare.d.ts +18 -0
  225. package/dist/lib/compare.js +170 -0
  226. package/dist/lib/coverage-audit.d.ts +4 -0
  227. package/dist/lib/coverage-audit.js +42 -0
  228. package/dist/lib/discovery-report.d.ts +13 -0
  229. package/dist/lib/discovery-report.js +57 -0
  230. package/dist/lib/fetch-docs.d.ts +30 -0
  231. package/dist/lib/fetch-docs.js +171 -0
  232. package/dist/lib/generate-configs.d.ts +25 -0
  233. package/dist/lib/generate-configs.js +42 -0
  234. package/dist/lib/grader-api.d.ts +21 -0
  235. package/dist/lib/grader-api.js +34 -0
  236. package/dist/lib/grader-compare.d.ts +19 -0
  237. package/dist/lib/grader-compare.js +91 -0
  238. package/dist/lib/grader-consistency.d.ts +27 -0
  239. package/dist/lib/grader-consistency.js +79 -0
  240. package/dist/lib/grader-sensitivity.d.ts +19 -0
  241. package/dist/lib/grader-sensitivity.js +75 -0
  242. package/dist/lib/grader-validate.d.ts +19 -0
  243. package/dist/lib/grader-validate.js +78 -0
  244. package/dist/lib/measure-retrieval.d.ts +14 -0
  245. package/dist/lib/measure-retrieval.js +71 -0
  246. package/dist/lib/pr-comment.d.ts +16 -0
  247. package/dist/lib/pr-comment.js +28 -0
  248. package/dist/lib/readiness-report.d.ts +13 -0
  249. package/dist/lib/readiness-report.js +108 -0
  250. package/dist/lib/webhook-server.d.ts +11 -0
  251. package/dist/lib/webhook-server.js +24 -0
  252. package/dist/lib/weekly-digest.d.ts +24 -0
  253. package/dist/lib/weekly-digest.js +148 -0
  254. package/dist/orchestration/build-app-context.d.ts +27 -0
  255. package/dist/orchestration/build-app-context.js +81 -0
  256. package/dist/orchestration/build-step-sequence.d.ts +15 -0
  257. package/dist/orchestration/build-step-sequence.js +84 -0
  258. package/dist/orchestration/config-to-source-overrides.d.ts +9 -0
  259. package/dist/orchestration/config-to-source-overrides.js +28 -0
  260. package/dist/orchestration/env-bridge.d.ts +21 -0
  261. package/dist/orchestration/env-bridge.js +66 -0
  262. package/dist/orchestration/index.d.ts +11 -0
  263. package/dist/orchestration/index.js +11 -0
  264. package/dist/orchestration/pipeline-orchestrator.d.ts +24 -0
  265. package/dist/orchestration/pipeline-orchestrator.js +153 -0
  266. package/dist/orchestration/step-runner.d.ts +20 -0
  267. package/dist/orchestration/step-runner.js +88 -0
  268. package/dist/orchestration/steps/calculate-scores-step.d.ts +13 -0
  269. package/dist/orchestration/steps/calculate-scores-step.js +95 -0
  270. package/dist/orchestration/steps/callback-step.d.ts +24 -0
  271. package/dist/orchestration/steps/callback-step.js +76 -0
  272. package/dist/orchestration/steps/compare-step.d.ts +14 -0
  273. package/dist/orchestration/steps/compare-step.js +92 -0
  274. package/dist/orchestration/steps/discovery-report-step.d.ts +13 -0
  275. package/dist/orchestration/steps/discovery-report-step.js +55 -0
  276. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  277. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  278. package/dist/orchestration/steps/fetch-docs-step.d.ts +14 -0
  279. package/dist/orchestration/steps/fetch-docs-step.js +135 -0
  280. package/dist/orchestration/steps/gap-analysis-step.d.ts +16 -0
  281. package/dist/orchestration/steps/gap-analysis-step.js +136 -0
  282. package/dist/orchestration/steps/generate-configs-step.d.ts +14 -0
  283. package/dist/orchestration/steps/generate-configs-step.js +85 -0
  284. package/dist/orchestration/steps/grader-consistency-step.d.ts +13 -0
  285. package/dist/orchestration/steps/grader-consistency-step.js +64 -0
  286. package/dist/orchestration/steps/index.d.ts +19 -0
  287. package/dist/orchestration/steps/index.js +19 -0
  288. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +21 -0
  289. package/dist/orchestration/steps/mirror-repo-tasks-step.js +94 -0
  290. package/dist/orchestration/steps/publish-report-step.d.ts +26 -0
  291. package/dist/orchestration/steps/publish-report-step.js +216 -0
  292. package/dist/orchestration/steps/readiness-step.d.ts +13 -0
  293. package/dist/orchestration/steps/readiness-step.js +91 -0
  294. package/dist/orchestration/steps/report-step.d.ts +12 -0
  295. package/dist/orchestration/steps/report-step.js +49 -0
  296. package/dist/orchestration/steps/run-eval-step.d.ts +17 -0
  297. package/dist/orchestration/steps/run-eval-step.js +195 -0
  298. package/dist/orchestration/steps/validate-step.d.ts +12 -0
  299. package/dist/orchestration/steps/validate-step.js +41 -0
  300. package/dist/pipeline/agent-behavior-report.d.ts +53 -0
  301. package/dist/pipeline/agent-behavior-report.js +132 -0
  302. package/dist/pipeline/attribution.d.ts +47 -0
  303. package/dist/pipeline/attribution.js +226 -0
  304. package/dist/pipeline/baseline.d.ts +37 -0
  305. package/dist/pipeline/baseline.js +141 -0
  306. package/dist/pipeline/cache.d.ts +101 -0
  307. package/dist/pipeline/cache.js +283 -0
  308. package/dist/pipeline/calculate-scores.d.ts +102 -0
  309. package/dist/pipeline/calculate-scores.js +1128 -0
  310. package/dist/pipeline/callback-delivery.d.ts +50 -0
  311. package/dist/pipeline/callback-delivery.js +89 -0
  312. package/dist/pipeline/checks.d.ts +39 -0
  313. package/dist/pipeline/checks.js +280 -0
  314. package/dist/pipeline/classify-url.d.ts +61 -0
  315. package/dist/pipeline/classify-url.js +93 -0
  316. package/dist/pipeline/compare.d.ts +31 -0
  317. package/dist/pipeline/compare.js +208 -0
  318. package/dist/pipeline/coverage-audit.d.ts +39 -0
  319. package/dist/pipeline/coverage-audit.js +165 -0
  320. package/dist/pipeline/degradations.d.ts +85 -0
  321. package/dist/pipeline/degradations.js +242 -0
  322. package/dist/pipeline/discovery-report.d.ts +55 -0
  323. package/dist/pipeline/discovery-report.js +178 -0
  324. package/dist/pipeline/eval-constants.d.ts +68 -0
  325. package/dist/pipeline/eval-constants.js +111 -0
  326. package/dist/pipeline/eval-fingerprint.d.ts +66 -0
  327. package/dist/pipeline/eval-fingerprint.js +175 -0
  328. package/dist/pipeline/expand-tasks.d.ts +220 -0
  329. package/dist/pipeline/expand-tasks.js +421 -0
  330. package/dist/pipeline/failure-modes.d.ts +46 -0
  331. package/dist/pipeline/failure-modes.js +348 -0
  332. package/dist/pipeline/fetch-url-content.d.ts +44 -0
  333. package/dist/pipeline/fetch-url-content.js +93 -0
  334. package/dist/pipeline/gap-analysis.d.ts +48 -0
  335. package/dist/pipeline/gap-analysis.js +231 -0
  336. package/dist/pipeline/generate-configs.d.ts +72 -0
  337. package/dist/pipeline/generate-configs.js +395 -0
  338. package/dist/pipeline/grader-api.d.ts +49 -0
  339. package/dist/pipeline/grader-api.js +200 -0
  340. package/dist/pipeline/grader-compare-runner.d.ts +44 -0
  341. package/dist/pipeline/grader-compare-runner.js +301 -0
  342. package/dist/pipeline/grader-comparison.d.ts +111 -0
  343. package/dist/pipeline/grader-comparison.js +161 -0
  344. package/dist/pipeline/grader-consistency-runner.d.ts +60 -0
  345. package/dist/pipeline/grader-consistency-runner.js +270 -0
  346. package/dist/pipeline/grader-consistency.d.ts +103 -0
  347. package/dist/pipeline/grader-consistency.js +146 -0
  348. package/dist/pipeline/grader-sensitivity-runner.d.ts +40 -0
  349. package/dist/pipeline/grader-sensitivity-runner.js +282 -0
  350. package/dist/pipeline/grader-sensitivity.d.ts +94 -0
  351. package/dist/pipeline/grader-sensitivity.js +144 -0
  352. package/dist/pipeline/grader-validate-runner.d.ts +38 -0
  353. package/dist/pipeline/grader-validate-runner.js +229 -0
  354. package/dist/pipeline/grader-validation.d.ts +107 -0
  355. package/dist/pipeline/grader-validation.js +169 -0
  356. package/dist/pipeline/map-request-to-config.d.ts +19 -0
  357. package/dist/pipeline/map-request-to-config.js +80 -0
  358. package/dist/pipeline/measure-retrieval.d.ts +59 -0
  359. package/dist/pipeline/measure-retrieval.js +111 -0
  360. package/dist/pipeline/mirror-repo-tasks.d.ts +86 -0
  361. package/dist/pipeline/mirror-repo-tasks.js +350 -0
  362. package/dist/pipeline/plan-format.d.ts +33 -0
  363. package/dist/pipeline/plan-format.js +202 -0
  364. package/dist/pipeline/plan.d.ts +169 -0
  365. package/dist/pipeline/plan.js +708 -0
  366. package/dist/pipeline/pr-comment.d.ts +19 -0
  367. package/dist/pipeline/pr-comment.js +502 -0
  368. package/dist/pipeline/probe.d.ts +52 -0
  369. package/dist/pipeline/probe.js +390 -0
  370. package/dist/pipeline/provenance.d.ts +47 -0
  371. package/dist/pipeline/provenance.js +146 -0
  372. package/dist/pipeline/readiness-report.d.ts +87 -0
  373. package/dist/pipeline/readiness-report.js +205 -0
  374. package/dist/pipeline/release-classification.d.ts +54 -0
  375. package/dist/pipeline/release-classification.js +238 -0
  376. package/dist/pipeline/release-report.d.ts +37 -0
  377. package/dist/pipeline/release-report.js +222 -0
  378. package/dist/pipeline/repo-eval-comment.d.ts +37 -0
  379. package/dist/pipeline/repo-eval-comment.js +165 -0
  380. package/dist/pipeline/repo-threshold-evaluator.d.ts +89 -0
  381. package/dist/pipeline/repo-threshold-evaluator.js +162 -0
  382. package/dist/pipeline/resolve-mappings.d.ts +35 -0
  383. package/dist/pipeline/resolve-mappings.js +72 -0
  384. package/dist/pipeline/retrieval-metrics.d.ts +39 -0
  385. package/dist/pipeline/retrieval-metrics.js +136 -0
  386. package/dist/pipeline/reverse-mapping.d.ts +67 -0
  387. package/dist/pipeline/reverse-mapping.js +88 -0
  388. package/dist/pipeline/schemas.d.ts +9 -0
  389. package/dist/pipeline/schemas.js +9 -0
  390. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  391. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  392. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  393. package/dist/pipeline/steps/compare-step.js +90 -0
  394. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  395. package/dist/pipeline/steps/eval-step.js +347 -0
  396. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  397. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  398. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  399. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  400. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  401. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  402. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  403. package/dist/pipeline/steps/publish-report-step.js +243 -0
  404. package/dist/pipeline/steps/report-step.d.ts +13 -0
  405. package/dist/pipeline/steps/report-step.js +56 -0
  406. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  407. package/dist/pipeline/steps/update-scores-step.js +42 -0
  408. package/dist/pipeline/targeted-loo.d.ts +88 -0
  409. package/dist/pipeline/targeted-loo.js +203 -0
  410. package/dist/pipeline/thresholds.d.ts +27 -0
  411. package/dist/pipeline/thresholds.js +245 -0
  412. package/dist/pipeline/types.d.ts +10 -0
  413. package/dist/pipeline/types.js +10 -0
  414. package/dist/pipeline/validate.d.ts +67 -0
  415. package/dist/pipeline/validate.js +406 -0
  416. package/dist/pipeline/webhook-server.d.ts +37 -0
  417. package/dist/pipeline/webhook-server.js +133 -0
  418. package/dist/report-store.d.ts +84 -0
  419. package/dist/report-store.js +208 -0
  420. package/dist/sanity/client.d.ts +38 -0
  421. package/dist/sanity/client.js +86 -0
  422. package/dist/sanity/portable-text.d.ts +11 -0
  423. package/dist/sanity/portable-text.js +211 -0
  424. package/dist/sanity/queries.d.ts +133 -0
  425. package/dist/sanity/queries.js +300 -0
  426. package/dist/schedules/digest.d.ts +116 -0
  427. package/dist/schedules/digest.js +156 -0
  428. package/dist/schedules/index.d.ts +12 -0
  429. package/dist/schedules/index.js +10 -0
  430. package/dist/schedules/loader.d.ts +31 -0
  431. package/dist/schedules/loader.js +73 -0
  432. package/dist/schedules/schema.d.ts +9 -0
  433. package/dist/schedules/schema.js +9 -0
  434. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  435. package/dist/scripts/agent-behavior-report.js +315 -0
  436. package/dist/scripts/baseline.d.ts +43 -0
  437. package/dist/scripts/baseline.js +267 -0
  438. package/dist/scripts/calculate-scores.d.ts +166 -0
  439. package/dist/scripts/calculate-scores.js +1296 -0
  440. package/dist/scripts/compare.d.ts +22 -0
  441. package/dist/scripts/compare.js +334 -0
  442. package/dist/scripts/coverage-audit.d.ts +44 -0
  443. package/dist/scripts/coverage-audit.js +209 -0
  444. package/dist/scripts/debug-eval.d.ts +19 -0
  445. package/dist/scripts/debug-eval.js +73 -0
  446. package/dist/scripts/discovery-report.d.ts +58 -0
  447. package/dist/scripts/discovery-report.js +250 -0
  448. package/dist/scripts/fetch-docs.d.ts +35 -0
  449. package/dist/scripts/fetch-docs.js +472 -0
  450. package/dist/scripts/generate-configs.d.ts +66 -0
  451. package/dist/scripts/generate-configs.js +459 -0
  452. package/dist/scripts/grader-api.d.ts +27 -0
  453. package/dist/scripts/grader-api.js +206 -0
  454. package/dist/scripts/grader-compare.d.ts +22 -0
  455. package/dist/scripts/grader-compare.js +368 -0
  456. package/dist/scripts/grader-consistency.d.ts +20 -0
  457. package/dist/scripts/grader-consistency.js +313 -0
  458. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  459. package/dist/scripts/grader-sensitivity.js +354 -0
  460. package/dist/scripts/grader-validate.d.ts +19 -0
  461. package/dist/scripts/grader-validate.js +267 -0
  462. package/dist/scripts/measure-retrieval.d.ts +10 -0
  463. package/dist/scripts/measure-retrieval.js +145 -0
  464. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +24 -0
  465. package/dist/scripts/migrate-tasks-to-content-lake.js +327 -0
  466. package/dist/scripts/pipeline.d.ts +76 -0
  467. package/dist/scripts/pipeline.js +1031 -0
  468. package/dist/scripts/pr-comment.d.ts +10 -0
  469. package/dist/scripts/pr-comment.js +510 -0
  470. package/dist/scripts/readiness-report.d.ts +88 -0
  471. package/dist/scripts/readiness-report.js +342 -0
  472. package/dist/scripts/update-quality-scores.d.ts +15 -0
  473. package/dist/scripts/update-quality-scores.js +184 -0
  474. package/dist/scripts/validate-task-sources.d.ts +21 -0
  475. package/dist/scripts/validate-task-sources.js +210 -0
  476. package/dist/scripts/validate.d.ts +13 -0
  477. package/dist/scripts/validate.js +79 -0
  478. package/dist/scripts/webhook-server.d.ts +26 -0
  479. package/dist/scripts/webhook-server.js +147 -0
  480. package/dist/scripts/weekly-digest.d.ts +24 -0
  481. package/dist/scripts/weekly-digest.js +144 -0
  482. package/dist/sinks/bigquery/index.d.ts +131 -0
  483. package/dist/sinks/bigquery/index.js +222 -0
  484. package/dist/sinks/format-slack.d.ts +64 -0
  485. package/dist/sinks/format-slack.js +306 -0
  486. package/dist/sinks/index.d.ts +23 -0
  487. package/dist/sinks/index.js +18 -0
  488. package/dist/sinks/loader.d.ts +18 -0
  489. package/dist/sinks/loader.js +82 -0
  490. package/dist/sinks/retry.d.ts +24 -0
  491. package/dist/sinks/retry.js +52 -0
  492. package/dist/sinks/schema.d.ts +9 -0
  493. package/dist/sinks/schema.js +9 -0
  494. package/dist/sinks/slack/format.d.ts +65 -0
  495. package/dist/sinks/slack/format.js +327 -0
  496. package/dist/sinks/slack/index.d.ts +27 -0
  497. package/dist/sinks/slack/index.js +78 -0
  498. package/dist/sinks/slack-sink.d.ts +27 -0
  499. package/dist/sinks/slack-sink.js +78 -0
  500. package/dist/sinks/types.d.ts +59 -0
  501. package/dist/sinks/types.js +44 -0
  502. package/dist/sinks/webhook/index.d.ts +19 -0
  503. package/dist/sinks/webhook/index.js +50 -0
  504. package/dist/sinks/webhook-sink.d.ts +19 -0
  505. package/dist/sinks/webhook-sink.js +50 -0
  506. package/dist/sources.d.ts +104 -0
  507. package/dist/sources.js +292 -0
  508. package/dist/webhook/budget.d.ts +42 -0
  509. package/dist/webhook/budget.js +60 -0
  510. package/dist/webhook/debounce.d.ts +67 -0
  511. package/dist/webhook/debounce.js +76 -0
  512. package/dist/webhook/dispatch.d.ts +45 -0
  513. package/dist/webhook/dispatch.js +84 -0
  514. package/dist/webhook/eval-request-handler.d.ts +87 -0
  515. package/dist/webhook/eval-request-handler.js +181 -0
  516. package/dist/webhook/handler.d.ts +88 -0
  517. package/dist/webhook/handler.js +203 -0
  518. package/dist/webhook/index.d.ts +17 -0
  519. package/dist/webhook/index.js +12 -0
  520. package/dist/webhook/types.d.ts +109 -0
  521. package/dist/webhook/types.js +10 -0
  522. package/package.json +72 -0
  523. package/tasks/.expanded.agentic.yaml +51 -0
  524. package/tasks/.expanded.yaml +66 -0
  525. package/tasks/frameworks.yaml +98 -0
  526. package/tasks/functions.yaml +51 -0
  527. package/tasks/groq.yaml +216 -0
  528. package/tasks/nextjs-live.yaml +62 -0
  529. package/tasks/studio-setup.yaml +111 -0
  530. package/tasks/visual-editing.yaml +120 -0
@@ -0,0 +1,66 @@
1
+ /**
2
+ * pipeline/eval-fingerprint.ts
3
+ *
4
+ * Computes a deterministic fingerprint of all inputs that affect evaluation
5
+ * output. Used for cross-environment cache lookup: when running in CI, the
6
+ * pipeline can query the Sanity Content Lake for a previous report with an
7
+ * identical fingerprint and skip the expensive eval step.
8
+ *
9
+ * The fingerprint captures everything that would change evaluation results:
10
+ * - Evaluation mode (baseline, observed, agentic)
11
+ * - Model configuration (which models, their settings)
12
+ * - Grader model identity (different graders score differently)
13
+ * - Prompt templates (different instructions → different outputs)
14
+ * - Rubric templates (different criteria → different scores)
15
+ * - Task definitions (what's being evaluated)
16
+ * - Reference solutions (used by grader assertions)
17
+ * - Documentation content (the docs being evaluated — the primary variable)
18
+ * - Filter flags (which subset of tasks is included)
19
+ *
20
+ * The fingerprint intentionally EXCLUDES:
21
+ * - Source name/URL (content matters, not origin)
22
+ * - Git metadata (informational, not eval-affecting)
23
+ * - Trigger type (manual vs CI → same inputs → same results)
24
+ * - Report tags (human labels)
25
+ *
26
+ * @see docs/design-docs/content-lake-eval-caching.md
27
+ */
28
+ import type { EvalMode, FilterOptions } from "./types.js";
29
+ /** Inputs to `computeEvalFingerprint()`: the non-file context values plus the root directory whose files are hashed. */
30
+ export interface FingerprintInput {
31
+ /** Filter options (areas, taskIds). Non-empty lists are sorted and added to the hash as context strings; `areas` additionally restricts which task files are hashed. */
32
+ filter?: FilterOptions;
33
+ /** Grader model identifier (e.g., "anthropic:messages:claude-opus-4-5-20251101"); added to the hash as a context string. */
34
+ graderModel: string;
35
+ /** Evaluation mode; added to the hash as a context string. */
36
+ mode: EvalMode;
37
+ /** Path to the packages/eval root directory; `config/`, `tasks/`, `canonical/reference-solutions/` and `contexts/canonical/` are resolved against it. */
38
+ rootDir: string;
39
+ }
40
+ /**
41
+ * Collect all file paths that contribute to the evaluation fingerprint.
42
+ *
43
+ * This is similar to `getStepInputPaths()` in `cache.ts` but is more
44
+ * comprehensive and explicitly designed for cross-environment cache keys:
45
+ *
46
+ * - Includes `config/prompts.yaml` and `config/rubrics.yaml` directly
47
+ * (the local cache only includes them indirectly via generated configs)
48
+ * - Includes `config/models.yaml` (model configuration)
49
+ * - Includes task definitions and reference solutions
50
+ * - Includes the actual documentation content (contexts/canonical/*.md)
51
+ * - Applies the area filter to task files only; reference solutions and context files are always included in full
52
+ */
53
+ export declare function collectFingerprintInputPaths(rootDir: string, filter?: FilterOptions): string[];
54
+ /**
55
+ * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
56
+ *
57
+ * The fingerprint is content-addressed: identical inputs always produce
58
+ * the same fingerprint, regardless of the environment (local, CI, etc.).
59
+ *
60
+ * Reuses the existing `hashFiles()` from `cache.ts` to hash file content,
61
+ * and adds non-file context (mode, grader model, filter flags) as
62
+ * additional context strings.
63
+ *
64
+ * @returns SHA-256 hex string (64 characters)
65
+ */
66
+ export declare function computeEvalFingerprint(input: FingerprintInput): string;
@@ -0,0 +1,175 @@
1
+ /**
2
+ * pipeline/eval-fingerprint.ts
3
+ *
4
+ * Computes a deterministic fingerprint of all inputs that affect evaluation
5
+ * output. Used for cross-environment cache lookup: when running in CI, the
6
+ * pipeline can query the Sanity Content Lake for a previous report with an
7
+ * identical fingerprint and skip the expensive eval step.
8
+ *
9
+ * The fingerprint captures everything that would change evaluation results:
10
+ * - Evaluation mode (baseline, observed, agentic)
11
+ * - Model configuration (which models, their settings)
12
+ * - Grader model identity (different graders score differently)
13
+ * - Prompt templates (different instructions → different outputs)
14
+ * - Rubric templates (different criteria → different scores)
15
+ * - Task definitions (what's being evaluated)
16
+ * - Reference solutions (used by grader assertions)
17
+ * - Documentation content (the docs being evaluated — the primary variable)
18
+ * - Filter flags (which subset of tasks is included)
19
+ *
20
+ * The fingerprint intentionally EXCLUDES:
21
+ * - Source name/URL (content matters, not origin)
22
+ * - Git metadata (informational, not eval-affecting)
23
+ * - Trigger type (manual vs CI → same inputs → same results)
24
+ * - Report tags (human labels)
25
+ *
26
+ * @see docs/design-docs/content-lake-eval-caching.md
27
+ */
28
+ import { existsSync, readdirSync, statSync } from "fs";
29
+ import { join, resolve } from "path";
30
+ import { hashFiles } from "./cache.js";
31
+ // ---------------------------------------------------------------------------
32
+ // Constants
33
+ // ---------------------------------------------------------------------------
34
+ /**
35
+ * Version prefix for the fingerprint hash. Bumping this invalidates all
36
+ * existing fingerprints in the Content Lake without needing to clear the
37
+ * store. Change this when adding new inputs to the hash.
38
+ */
39
+ const FINGERPRINT_VERSION = "eval-fingerprint-v1";
40
+ /**
41
+ * Collect all file paths that contribute to the evaluation fingerprint.
42
+ *
43
+ * This is similar to `getStepInputPaths()` in `cache.ts` but is more
44
+ * comprehensive and explicitly designed for cross-environment cache keys:
45
+ *
46
+ * - Includes `config/prompts.yaml` and `config/rubrics.yaml` directly
47
+ * (the local cache only includes them indirectly via generated configs)
48
+ * - Includes `config/models.yaml` (model configuration)
49
+ * - Includes task definitions and reference solutions
50
+ * - Includes the actual documentation content (contexts/canonical/*.md)
51
+ * - Respects filter flags to only include relevant files
52
+ */
53
+ export function collectFingerprintInputPaths(rootDir, filter) {
54
+ const r = (rel) => resolve(rootDir, rel);
55
+ const paths = [];
56
+ // -----------------------------------------------------------------------
57
+ // Config files — always included
58
+ // -----------------------------------------------------------------------
59
+ const configFiles = [
60
+ "config/models.yaml",
61
+ "config/prompts.yaml",
62
+ "config/rubrics.yaml",
63
+ ];
64
+ for (const f of configFiles) {
65
+ const p = r(f);
66
+ if (existsSync(p))
67
+ paths.push(p);
68
+ }
69
+ // -----------------------------------------------------------------------
70
+ // Task files — filtered if --area is set
71
+ // -----------------------------------------------------------------------
72
+ const tasksDir = r("tasks");
73
+ if (existsSync(tasksDir)) {
74
+ const taskFiles = readdirSync(tasksDir)
75
+ .filter((f) => f.endsWith(".yaml") || f.endsWith(".yml"))
76
+ .filter((f) => !f.startsWith(".")); // exclude .expanded.yaml
77
+ for (const f of taskFiles) {
78
+ // If area filter is set, only include matching task files
79
+ if (filter?.areas && filter.areas.length > 0) {
80
+ const stem = f.replace(/\.ya?ml$/, "");
81
+ if (!filter.areas.includes(stem))
82
+ continue;
83
+ }
84
+ paths.push(join(tasksDir, f));
85
+ }
86
+ }
87
+ // -----------------------------------------------------------------------
88
+ // Reference solutions — all included (they're referenced by tasks)
89
+ // -----------------------------------------------------------------------
90
+ const refDir = r("canonical/reference-solutions");
91
+ if (existsSync(refDir)) {
92
+ collectFilesRecursive(refDir, paths);
93
+ }
94
+ // -----------------------------------------------------------------------
95
+ // Canonical context files — the documentation content being evaluated
96
+ // This is the KEY differentiator from the local cache (which doesn't
97
+ // include Sanity document content in the fetch-docs cache key).
98
+ // -----------------------------------------------------------------------
99
+ const canonicalDir = r("contexts/canonical");
100
+ if (existsSync(canonicalDir)) {
101
+ const contextFiles = readdirSync(canonicalDir)
102
+ .filter((f) => f.endsWith(".md"))
103
+ .sort();
104
+ for (const f of contextFiles) {
105
+ // If area or task filter is set, we include all context files anyway
106
+ // because context filenames map to task IDs, and task-to-area mapping
107
+ // requires reading the YAML. It's safer to include all — a superset
108
+ // doesn't cause false cache hits, only potential false misses when
109
+ // a non-matching context changes. This is acceptable: the filter
110
+ // flags in the context strings differentiate the fingerprints.
111
+ paths.push(join(canonicalDir, f));
112
+ }
113
+ }
114
+ return paths;
115
+ }
/**
 * Produce the SHA-256 fingerprint for one evaluation run.
 *
 * Two kinds of input are combined:
 *   - context strings: the fingerprint version, the evaluation mode, the
 *     grader model and, when present and non-empty, the sorted area and
 *     task-ID filters;
 *   - file content: every path returned by `collectFingerprintInputPaths()`.
 *
 * Both are handed to `hashFiles()` from `cache.ts`, which performs the
 * actual hashing. Identical inputs therefore yield the same fingerprint
 * regardless of the environment (local, CI, etc.).
 *
 * @returns SHA-256 hex string (64 characters)
 */
export function computeEvalFingerprint(input) {
    const areas = input.filter?.areas ?? [];
    const taskIds = input.filter?.taskIds ?? [];
    // Non-file inputs. Filter lists are copied before sorting so the
    // caller's arrays are left untouched, and are only contributed when
    // non-empty so scoped runs get a distinct fingerprint.
    const contextStrings = [
        FINGERPRINT_VERSION,
        `mode:${input.mode}`,
        `grader:${input.graderModel}`,
        ...(areas.length > 0 ? [`areas:${[...areas].sort().join(",")}`] : []),
        ...(taskIds.length > 0 ? [`tasks:${[...taskIds].sort().join(",")}`] : []),
    ];
    // File inputs: everything on disk that affects eval output.
    const inputPaths = collectFingerprintInputPaths(input.rootDir, input.filter);
    return hashFiles(inputPaths, contextStrings);
}
154
+ // ---------------------------------------------------------------------------
155
+ // Helpers
156
+ // ---------------------------------------------------------------------------
/**
 * Recursively collect all file paths under a directory, appending them to
 * `paths` in place.
 *
 * Entries whose name starts with '.' (hidden files and directories) are
 * skipped. Each directory's entries are visited in sorted name order:
 * `readdirSync` gives no ordering guarantee, and the collected list feeds
 * a fingerprint that must come out the same in every environment.
 *
 * @param dir - Directory to walk
 * @param paths - Output array; discovered file paths are pushed onto it
 */
function collectFilesRecursive(dir, paths) {
    const entries = readdirSync(dir).sort();
    for (const entry of entries) {
        if (entry.startsWith("."))
            continue;
        const fullPath = join(dir, entry);
        const stat = statSync(fullPath);
        if (stat.isDirectory()) {
            collectFilesRecursive(fullPath, paths);
        }
        else if (stat.isFile()) {
            paths.push(fullPath);
        }
    }
}
@@ -0,0 +1,220 @@
1
+ /**
2
+ * pipeline/expand-tasks.ts
3
+ *
4
+ * Reads task YAML files in the single-definition format and expands each
5
+ * task into gold + baseline Promptfoo test entries. This eliminates the
6
+ * manual duplication where every task had to be written twice.
7
+ *
8
+ * Rubric templates from config/rubrics.yaml are resolved at expansion time:
9
+ * tasks specify `template` + `criteria`, and the expander assembles
10
+ * the full rubric text by injecting criteria into the template.
11
+ *
12
+ * Structured dimension metadata (Approach 5):
13
+ * When a rubric template has a `dimension` field, the resolved assertion
14
+ * includes `metadata.dimension` and `metadata.maxScore`. This flows through
15
+ * Promptfoo into component results, allowing the scoring engine to classify
16
+ * rubrics structurally instead of via heuristic string matching.
17
+ * See docs/design-docs/structured-dimensions.md.
18
+ *
19
+ * Single-definition format:
20
+ * - id: groq-blog-queries
21
+ * description: "GROQ - Blog queries with filtering and pagination"
22
+ * doc_coverage: true
23
+ * vars:
24
+ * task: |
25
+ * Write GROQ queries for a Sanity blog application: ...
26
+ * docs: file://contexts/canonical/groq-blog-queries.md
27
+ * assert:
28
+ * - type: llm-rubric
29
+ * template: task-completion
30
+ * criteria:
31
+ * - GROQ filter with _type == "post"
32
+ * - Projection with aliased slug field
33
+ * - type: contains-any
34
+ * value: ["client.fetch", "createClient"]
35
+ * baseline:
36
+ * enabled: true
37
+ * rubric: abbreviated
38
+ *
39
+ * Expands to:
40
+ * 1. Gold entry — uses vars.docs as-is, resolves templates, appends doc-coverage
41
+ * 2. Baseline entry — sets docs: "", adds transform, uses abbreviated rubric
42
+ */
43
+ import type { TaskDefinition } from "../_vendor/ailf-core/index.d.ts";
44
+ import { type RubricConfig } from "./schemas.js";
45
+ import type { FilterOptions } from "./types.js";
46
+ /** Any assertion entry (templated or value-based). */
47
+ export type AssertEntry = TemplatedAssert | ValueAssert;
48
+ /** The output format — a Promptfoo-compatible test entry. */
49
+ export interface ExpandedTestEntry {
50
+ assert?: ValueAssert[];
51
+ description: string;
52
+ /**
53
+ * Promptfoo prompt filter — restricts which prompts this test runs against.
54
+ * Matches on prompt `id` or `label`. When absent, all prompts are allowed
55
+ * (Promptfoo's default cartesian product behavior).
56
+ *
57
+ * Gold entries use `['with-docs']` (ceiling measurement).
58
+ * Baseline entries use `['without-docs']` (floor measurement).
59
+ *
60
+ * See: evaluation-ceiling.md for the floor/ceiling/actual decomposition.
61
+ */
62
+ prompts?: string[];
63
+ vars: Record<string, unknown>;
64
+ }
65
+ /** A legacy task entry (the old paired format without an `id` field). */
66
+ export interface LegacyTaskEntry {
67
+ assert?: AssertEntry[];
68
+ description: string;
69
+ transform?: string;
70
+ vars?: Record<string, unknown>;
71
+ }
72
+ /** A single task definition in the new format (input). */
73
+ export interface SingleTaskDefinition {
74
+ /** Grading assertions (applied to gold; optionally abbreviated for baseline). */
75
+ assert: AssertEntry[];
76
+ /** Baseline generation options. */
77
+ baseline?: {
78
+ /** Whether to generate a baseline variant. Default: true. */
79
+ enabled?: boolean;
80
+ /** Rubric mode: 'full' copies all asserts, 'abbreviated' generates a
81
+ * summary rubric, 'none' omits rubric asserts. Default: 'abbreviated'. */
82
+ rubric?: "abbreviated" | "full" | "none";
83
+ };
84
+ /** Human-readable description of what this task tests. */
85
+ description: string;
86
+ /** Opt-in: auto-generate a documentation coverage rubric for gold. */
87
+ doc_coverage?: boolean;
88
+ /** Explicit task ID — determines the canonical context filename. */
89
+ id: string;
90
+ /** Template variables: task prompt and docs path. */
91
+ vars: {
92
+ task: string;
93
+ docs: string;
94
+ [key: string]: unknown;
95
+ };
96
+ }
97
+ /** A templated assertion — references a rubric template. */
98
+ export interface TemplatedAssert {
99
+ criteria: string[];
100
+ template: string;
101
+ type: "llm-rubric";
102
+ weight?: number;
103
+ }
104
+ /** A standard assertion with a value. */
105
+ export interface ValueAssert {
106
+ [key: string]: unknown;
107
+ type: string;
108
+ value?: unknown;
109
+ weight?: number;
110
+ }
111
+ /**
112
+ * Assemble a full rubric text string from a template and criteria.
113
+ *
114
+ * Output format:
115
+ * {header}
116
+ * - {scale[0]}
117
+ * - {scale[1]}
118
+ * ...
119
+ *
120
+ * {criteria_label}
121
+ * - {criteria[0]}
122
+ * - {criteria[1]}
123
+ * ...
124
+ *
125
+ * {footer}
126
+ */
127
+ export declare function assembleRubric(templateKey: string, criteria: string[], rubricConfig: RubricConfig): string;
128
+ /**
129
+ * Build baseline assertions based on the rubric mode.
130
+ *
131
+ * - 'full': Copy all assertions as-is
132
+ * - 'abbreviated': Keep only the first llm-rubric (task completion) with
133
+ * a shortened prompt, plus all non-rubric assertions
134
+ * - 'none': No assertions at all
135
+ */
136
+ export declare function buildBaselineAsserts(goldAsserts: ValueAssert[], mode: "abbreviated" | "full" | "none"): ValueAssert[];
137
+ /**
138
+ * Clear the cached rubric config. Used in tests.
139
+ */
140
+ export declare function clearRubricCache(): void;
141
+ /**
142
+ * Expand a single task definition into gold + baseline Promptfoo test entries.
143
+ * Returns 1 entry (gold only) if baseline is disabled or `mode` is `'agentic'`, or 2 entries otherwise.
144
+ *
145
+ * Resolves templated assertions and appends doc-coverage if opted in.
146
+ *
147
+ * @param mode - Controls which entries are generated and how:
148
+ * - `'baseline'` (default): Gold + baseline entries with `prompts` filter
149
+ * to prevent cartesian product with multiple prompts. Gold entries get
150
+ * `prompts: ['with-docs']`, baseline entries get `prompts: ['without-docs']`.
151
+ * - `'agentic'`: Gold entries only, no `prompts` filter (agentic mode has
152
+ * a single prompt that doesn't use `{{docs}}`; baseline entries would be
153
+ * pure waste — identical prompts, wasted API calls).
154
+ */
155
+ export declare function expandTask(task: SingleTaskDefinition, rubricConfig: RubricConfig, mode?: "agentic" | "baseline"): ExpandedTestEntry[];
156
+ /**
157
+ * Expand an array of TaskDefinition[] (from any TaskSource adapter) into
158
+ * Promptfoo-compatible test entries. This is the TaskSource-aware counterpart
159
+ * of loadAndExpandTasks() — it skips YAML file I/O and works directly with
160
+ * the canonical domain type.
161
+ *
162
+ * @param tasks - Task definitions from any TaskSource adapter
163
+ * @param rootDir - Eval package root (needed to load rubric templates)
164
+ * @param mode - Expansion mode: 'baseline' (gold + baseline) or 'agentic' (gold only)
165
+ * @returns Expanded test entries and statistics
166
+ */
167
+ export declare function expandTaskDefinitions(tasks: TaskDefinition[], rootDir: string, mode?: "agentic" | "baseline"): {
168
+ entries: ExpandedTestEntry[];
169
+ stats: {
170
+ totalTasks: number;
171
+ expandedTotal: number;
172
+ };
173
+ };
174
+ /**
175
+ * Extract all task IDs from task files. Only works with the new
176
+ * single-definition format entries (those that have an `id` field).
177
+ */
178
+ export declare function extractTaskIds(rootDir: string): string[];
179
+ /**
180
+ * Type guard: checks if an entry is in the new single-definition format.
181
+ * The distinguishing feature is the presence of an `id` field.
182
+ */
183
+ export declare function isSingleTaskDefinition(entry: unknown): entry is SingleTaskDefinition;
184
+ /**
185
+ * Type guard: checks if an assertion uses the templated format.
186
+ */
187
+ export declare function isTemplatedAssert(entry: AssertEntry): entry is TemplatedAssert;
188
+ /**
189
+ * Load and expand all task files from the tasks/ directory.
190
+ * Supports both the new single-definition format (has `id`) and the legacy
191
+ * paired format (no `id`). Legacy entries pass through unchanged.
192
+ *
193
+ * @param mode - Controls expansion behavior:
194
+ * - `'baseline'` (default): Gold + baseline entries with prompt filters.
195
+ * - `'agentic'`: Gold entries only, no prompt filters.
196
+ *
197
+ * Returns the expanded entries grouped by source file.
198
+ */
199
+ export declare function loadAndExpandTasks(rootDir: string, filter?: FilterOptions, mode?: "agentic" | "baseline"): {
200
+ /** All expanded test entries, in order. */
201
+ entries: ExpandedTestEntry[];
202
+ /** Statistics about what was processed. */
203
+ stats: {
204
+ totalFiles: number;
205
+ singleDefinitions: number;
206
+ legacyEntries: number;
207
+ expandedTotal: number;
208
+ };
209
+ };
210
+ /**
211
+ * Load and validate config/rubrics.yaml from the given root directory.
212
+ * Caches the result for subsequent calls with the same rootDir.
213
+ */
214
+ export declare function loadRubricTemplates(rootDir: string): RubricConfig;
215
+ /**
216
+ * Resolve a single assertion: if it's templated, assemble the rubric text
217
+ * and attach structured dimension metadata when the template has a
218
+ * `dimension` field. Otherwise, pass through unchanged.
219
+ */
220
+ export declare function resolveAssert(entry: AssertEntry, rubricConfig: RubricConfig): ValueAssert;