@sanity/ailf 1.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (499)
  1. package/README.md +0 -1
  2. package/canonical/grader-references/README.md +2 -2
  3. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  4. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  5. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  6. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  7. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  8. package/config/features.ts +1 -1
  9. package/config/models.ts +29 -12
  10. package/config/sources.ts +1 -1
  11. package/config/thresholds.ts +1 -1
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  13. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  17. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  18. package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
  19. package/dist/_vendor/ailf-core/config-helpers.js +51 -2
  20. package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
  21. package/dist/_vendor/ailf-core/examples/index.js +213 -94
  22. package/dist/_vendor/ailf-core/index.d.ts +3 -2
  23. package/dist/_vendor/ailf-core/index.js +2 -1
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  25. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  27. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  28. package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
  29. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  30. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  31. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  32. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  33. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  34. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  35. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  40. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  41. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  42. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  43. package/dist/_vendor/ailf-core/services/index.js +1 -1
  44. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  47. package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  50. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  51. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  52. package/dist/adapters/api-client/remediation.js +2 -2
  53. package/dist/adapters/config-sources/file-config-adapter.js +7 -1
  54. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  55. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  56. package/dist/adapters/index.d.ts +0 -1
  57. package/dist/adapters/index.js +0 -1
  58. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  59. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  60. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  61. package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
  62. package/dist/adapters/task-sources/index.d.ts +3 -4
  63. package/dist/adapters/task-sources/index.js +3 -4
  64. package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
  65. package/dist/adapters/task-sources/repo-schemas.js +228 -20
  66. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  67. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  68. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  69. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  70. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  71. package/dist/adapters/task-sources/repo-validation.js +126 -5
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
  73. package/dist/adapters/task-sources/task-file-loader.js +21 -7
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/coverage-audit.js +3 -1
  95. package/dist/commands/explain-handler.d.ts +1 -1
  96. package/dist/commands/explain-handler.js +37 -8
  97. package/dist/commands/fetch-docs.js +1 -0
  98. package/dist/commands/generate-configs.d.ts +3 -3
  99. package/dist/commands/generate-configs.js +20 -8
  100. package/dist/commands/init.d.ts +5 -4
  101. package/dist/commands/init.js +190 -25
  102. package/dist/commands/pipeline-action.d.ts +7 -1
  103. package/dist/commands/pipeline-action.js +43 -19
  104. package/dist/commands/pipeline.d.ts +6 -1
  105. package/dist/commands/pipeline.js +7 -2
  106. package/dist/commands/pr-comment.js +1 -0
  107. package/dist/commands/publish.js +1 -0
  108. package/dist/commands/shared/help.js +2 -2
  109. package/dist/commands/update-quality-scores.d.ts +5 -0
  110. package/dist/commands/update-quality-scores.js +20 -0
  111. package/dist/commands/validate-tasks.d.ts +2 -2
  112. package/dist/commands/validate-tasks.js +26 -15
  113. package/dist/composition-root.d.ts +15 -4
  114. package/dist/composition-root.js +100 -55
  115. package/dist/config/features.ts +23 -0
  116. package/dist/config/models.ts +100 -0
  117. package/dist/config/prompts.ts +16 -0
  118. package/dist/config/rubrics.ts +225 -0
  119. package/dist/config/schedules.ts +47 -0
  120. package/dist/config/sinks.ts +37 -0
  121. package/dist/config/sources.ts +21 -0
  122. package/dist/config/thresholds.ts +61 -0
  123. package/dist/index.d.ts +41 -0
  124. package/dist/index.js +48 -0
  125. package/dist/lib/agent-behavior-report.d.ts +8 -0
  126. package/dist/lib/agent-behavior-report.js +185 -0
  127. package/dist/lib/baseline.d.ts +19 -0
  128. package/dist/lib/baseline.js +153 -0
  129. package/dist/lib/calculate-scores.d.ts +23 -0
  130. package/dist/lib/calculate-scores.js +42 -0
  131. package/dist/lib/compare.d.ts +18 -0
  132. package/dist/lib/compare.js +170 -0
  133. package/dist/lib/coverage-audit.d.ts +4 -0
  134. package/dist/lib/coverage-audit.js +42 -0
  135. package/dist/lib/discovery-report.d.ts +13 -0
  136. package/dist/lib/discovery-report.js +57 -0
  137. package/dist/lib/fetch-docs.d.ts +30 -0
  138. package/dist/lib/fetch-docs.js +171 -0
  139. package/dist/lib/generate-configs.d.ts +25 -0
  140. package/dist/lib/generate-configs.js +42 -0
  141. package/dist/lib/grader-api.d.ts +21 -0
  142. package/dist/lib/grader-api.js +34 -0
  143. package/dist/lib/grader-compare.d.ts +19 -0
  144. package/dist/lib/grader-compare.js +91 -0
  145. package/dist/lib/grader-consistency.d.ts +27 -0
  146. package/dist/lib/grader-consistency.js +79 -0
  147. package/dist/lib/grader-sensitivity.d.ts +19 -0
  148. package/dist/lib/grader-sensitivity.js +75 -0
  149. package/dist/lib/grader-validate.d.ts +19 -0
  150. package/dist/lib/grader-validate.js +78 -0
  151. package/dist/lib/measure-retrieval.d.ts +14 -0
  152. package/dist/lib/measure-retrieval.js +71 -0
  153. package/dist/lib/pr-comment.d.ts +16 -0
  154. package/dist/lib/pr-comment.js +28 -0
  155. package/dist/lib/readiness-report.d.ts +13 -0
  156. package/dist/lib/readiness-report.js +108 -0
  157. package/dist/lib/webhook-server.d.ts +11 -0
  158. package/dist/lib/webhook-server.js +24 -0
  159. package/dist/lib/weekly-digest.d.ts +24 -0
  160. package/dist/lib/weekly-digest.js +148 -0
  161. package/dist/orchestration/build-app-context.js +13 -0
  162. package/dist/orchestration/build-step-sequence.js +4 -2
  163. package/dist/orchestration/cache-context.d.ts +23 -0
  164. package/dist/orchestration/cache-context.js +43 -0
  165. package/dist/orchestration/env-bridge.d.ts +21 -0
  166. package/dist/orchestration/env-bridge.js +66 -0
  167. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  168. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  169. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  170. package/dist/orchestration/step-runner.js +5 -1
  171. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  172. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  173. package/dist/orchestration/steps/callback-step.js +10 -1
  174. package/dist/orchestration/steps/compare-step.js +6 -3
  175. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  176. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  177. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  178. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  179. package/dist/orchestration/steps/fetch-docs-step.js +32 -19
  180. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  181. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  182. package/dist/orchestration/steps/generate-configs-step.js +77 -26
  183. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  184. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  185. package/dist/orchestration/steps/publish-report-step.js +19 -0
  186. package/dist/orchestration/steps/readiness-step.js +8 -3
  187. package/dist/orchestration/steps/report-step.js +17 -4
  188. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  189. package/dist/orchestration/steps/run-eval-step.js +51 -31
  190. package/dist/pipeline/agent-behavior-report.js +6 -0
  191. package/dist/pipeline/attribution.d.ts +1 -1
  192. package/dist/pipeline/attribution.js +1 -1
  193. package/dist/pipeline/cache.js +29 -15
  194. package/dist/pipeline/calculate-scores.d.ts +2 -0
  195. package/dist/pipeline/calculate-scores.js +70 -33
  196. package/dist/pipeline/chronic-failures.d.ts +55 -0
  197. package/dist/pipeline/chronic-failures.js +110 -0
  198. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  199. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  200. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  201. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
  202. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  203. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  204. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  205. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  206. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  207. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  208. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  209. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  210. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  211. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  212. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  213. package/dist/pipeline/compiler/config-loader.js +42 -2
  214. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  215. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  216. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  217. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  218. package/dist/pipeline/compiler/index.d.ts +2 -5
  219. package/dist/pipeline/compiler/index.js +2 -5
  220. package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
  221. package/dist/pipeline/compiler/literacy-bridge.js +2 -2
  222. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  223. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  224. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  225. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  226. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  227. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  228. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
  229. package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
  230. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  231. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  232. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  233. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  234. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  235. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  236. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  237. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  238. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  239. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  240. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  241. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  242. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  245. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  246. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  247. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  248. package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
  249. package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
  250. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  251. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  252. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  253. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  254. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  255. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  256. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  257. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  258. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  259. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  260. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  261. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  262. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  263. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  264. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  265. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  266. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  267. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  268. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  269. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  270. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  271. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  272. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  273. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  274. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
  275. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  276. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  277. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  278. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  279. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  280. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  281. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  282. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  283. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  284. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
  285. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  286. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  287. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  288. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  289. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
  290. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
  291. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  292. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
  293. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  294. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
  295. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  296. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  297. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
  298. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
  299. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
  300. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  301. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  302. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  303. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  304. package/dist/pipeline/compiler/preset-loader.js +99 -0
  305. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
  306. package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
  307. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  308. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  309. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  310. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  311. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  312. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  313. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  314. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  315. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  316. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  317. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  318. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  319. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  320. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  321. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  322. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  323. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  324. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  325. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  326. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  327. package/dist/pipeline/compiler/task-bridge.js +92 -0
  328. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  329. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  330. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  331. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  332. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  333. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  334. package/dist/pipeline/coverage-audit.d.ts +1 -1
  335. package/dist/pipeline/coverage-audit.js +1 -1
  336. package/dist/pipeline/degradations.d.ts +1 -1
  337. package/dist/pipeline/degradations.js +1 -1
  338. package/dist/pipeline/expand-tasks.d.ts +2 -2
  339. package/dist/pipeline/expand-tasks.js +2 -2
  340. package/dist/pipeline/failure-modes.d.ts +1 -1
  341. package/dist/pipeline/failure-modes.js +13 -1
  342. package/dist/pipeline/gap-analysis.d.ts +1 -1
  343. package/dist/pipeline/gap-analysis.js +3 -1
  344. package/dist/pipeline/generate-configs.d.ts +2 -2
  345. package/dist/pipeline/generate-configs.js +16 -9
  346. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  347. package/dist/pipeline/grader-compare-runner.js +7 -1
  348. package/dist/pipeline/grader-comparison.d.ts +1 -1
  349. package/dist/pipeline/grader-comparison.js +1 -1
  350. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  351. package/dist/pipeline/grader-consistency-runner.js +7 -1
  352. package/dist/pipeline/grader-consistency.d.ts +1 -1
  353. package/dist/pipeline/grader-consistency.js +1 -1
  354. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  355. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  356. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  357. package/dist/pipeline/grader-sensitivity.js +1 -1
  358. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  359. package/dist/pipeline/grader-validate-runner.js +2 -2
  360. package/dist/pipeline/grader-validation.d.ts +1 -1
  361. package/dist/pipeline/grader-validation.js +1 -1
  362. package/dist/pipeline/map-request-to-config.js +16 -2
  363. package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
  364. package/dist/pipeline/mirror-repo-tasks.js +10 -10
  365. package/dist/pipeline/plan-format.d.ts +1 -1
  366. package/dist/pipeline/plan-format.js +1 -1
  367. package/dist/pipeline/plan.d.ts +1 -1
  368. package/dist/pipeline/plan.js +68 -30
  369. package/dist/pipeline/probe.d.ts +1 -1
  370. package/dist/pipeline/probe.js +1 -1
  371. package/dist/pipeline/readiness-report.d.ts +2 -2
  372. package/dist/pipeline/readiness-report.js +2 -2
  373. package/dist/pipeline/release-classification.d.ts +1 -1
  374. package/dist/pipeline/release-classification.js +1 -1
  375. package/dist/pipeline/release-report.d.ts +1 -1
  376. package/dist/pipeline/release-report.js +1 -1
  377. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  378. package/dist/pipeline/repo-eval-comment.js +1 -1
  379. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  380. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  381. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  382. package/dist/pipeline/resolve-mappings.js +44 -44
  383. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  384. package/dist/pipeline/retrieval-metrics.js +28 -20
  385. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  386. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  387. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  388. package/dist/pipeline/steps/compare-step.js +90 -0
  389. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  390. package/dist/pipeline/steps/eval-step.js +347 -0
  391. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  392. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  393. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  394. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  395. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  396. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  397. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  398. package/dist/pipeline/steps/publish-report-step.js +243 -0
  399. package/dist/pipeline/steps/report-step.d.ts +13 -0
  400. package/dist/pipeline/steps/report-step.js +56 -0
  401. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  402. package/dist/pipeline/steps/update-scores-step.js +42 -0
  403. package/dist/pipeline/targeted-loo.d.ts +1 -1
  404. package/dist/pipeline/targeted-loo.js +1 -1
  405. package/dist/pipeline/thresholds.d.ts +1 -1
  406. package/dist/pipeline/thresholds.js +1 -1
  407. package/dist/pipeline/validate.js +13 -0
  408. package/dist/report-store.d.ts +17 -0
  409. package/dist/report-store.js +24 -0
  410. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  411. package/dist/scripts/agent-behavior-report.js +315 -0
  412. package/dist/scripts/baseline.d.ts +43 -0
  413. package/dist/scripts/baseline.js +267 -0
  414. package/dist/scripts/calculate-scores.d.ts +166 -0
  415. package/dist/scripts/calculate-scores.js +1296 -0
  416. package/dist/scripts/compare.d.ts +22 -0
  417. package/dist/scripts/compare.js +334 -0
  418. package/dist/scripts/coverage-audit.d.ts +44 -0
  419. package/dist/scripts/coverage-audit.js +209 -0
  420. package/dist/scripts/debug-eval.d.ts +19 -0
  421. package/dist/scripts/debug-eval.js +73 -0
  422. package/dist/scripts/discovery-report.d.ts +58 -0
  423. package/dist/scripts/discovery-report.js +250 -0
  424. package/dist/scripts/fetch-docs.d.ts +35 -0
  425. package/dist/scripts/fetch-docs.js +472 -0
  426. package/dist/scripts/generate-configs.d.ts +66 -0
  427. package/dist/scripts/generate-configs.js +459 -0
  428. package/dist/scripts/grader-api.d.ts +27 -0
  429. package/dist/scripts/grader-api.js +206 -0
  430. package/dist/scripts/grader-compare.d.ts +22 -0
  431. package/dist/scripts/grader-compare.js +368 -0
  432. package/dist/scripts/grader-consistency.d.ts +20 -0
  433. package/dist/scripts/grader-consistency.js +313 -0
  434. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  435. package/dist/scripts/grader-sensitivity.js +354 -0
  436. package/dist/scripts/grader-validate.d.ts +19 -0
  437. package/dist/scripts/grader-validate.js +267 -0
  438. package/dist/scripts/measure-retrieval.d.ts +10 -0
  439. package/dist/scripts/measure-retrieval.js +145 -0
  440. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  441. package/dist/scripts/migrate-task-mode.js +1 -1
  442. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  443. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  444. package/dist/scripts/pipeline.d.ts +76 -0
  445. package/dist/scripts/pipeline.js +1031 -0
  446. package/dist/scripts/pr-comment.d.ts +10 -0
  447. package/dist/scripts/pr-comment.js +510 -0
  448. package/dist/scripts/readiness-report.d.ts +88 -0
  449. package/dist/scripts/readiness-report.js +342 -0
  450. package/dist/scripts/update-quality-scores.d.ts +15 -0
  451. package/dist/scripts/update-quality-scores.js +184 -0
  452. package/dist/scripts/validate-task-sources.d.ts +1 -1
  453. package/dist/scripts/validate-task-sources.js +1 -1
  454. package/dist/scripts/validate.d.ts +13 -0
  455. package/dist/scripts/validate.js +79 -0
  456. package/dist/scripts/webhook-server.d.ts +26 -0
  457. package/dist/scripts/webhook-server.js +147 -0
  458. package/dist/scripts/weekly-digest.d.ts +24 -0
  459. package/dist/scripts/weekly-digest.js +144 -0
  460. package/dist/sinks/format-slack.d.ts +64 -0
  461. package/dist/sinks/format-slack.js +306 -0
  462. package/dist/sinks/slack-sink.d.ts +27 -0
  463. package/dist/sinks/slack-sink.js +78 -0
  464. package/dist/sinks/types.d.ts +1 -1
  465. package/dist/sinks/types.js +1 -1
  466. package/dist/sinks/webhook-sink.d.ts +19 -0
  467. package/dist/sinks/webhook-sink.js +50 -0
  468. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  469. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  470. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  471. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  472. package/dist/tasks/literacy/functions.task.ts +70 -0
  473. package/dist/tasks/literacy/groq.task.ts +259 -0
  474. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  475. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  476. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  477. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  478. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  479. package/package.json +32 -24
  480. package/tasks/.expanded.agentic.yaml +280 -0
  481. package/tasks/.expanded.yaml +565 -0
  482. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  483. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  484. package/tasks/literacy/content-lake.task.ts +181 -0
  485. package/tasks/literacy/frameworks.task.ts +1 -0
  486. package/tasks/literacy/functions.task.ts +1 -0
  487. package/tasks/literacy/groq.task.ts +1 -0
  488. package/tasks/literacy/image-handling.task.ts +95 -0
  489. package/tasks/literacy/nextjs-live.task.ts +2 -1
  490. package/tasks/literacy/portable-text.task.ts +169 -0
  491. package/tasks/literacy/studio-setup.task.ts +5 -2
  492. package/tasks/literacy/visual-editing.task.ts +1 -0
  493. package/LICENSE +0 -21
  494. package/tasks/frameworks.yaml +0 -98
  495. package/tasks/functions.yaml +0 -51
  496. package/tasks/groq.yaml +0 -216
  497. package/tasks/nextjs-live.yaml +0 -62
  498. package/tasks/studio-setup.yaml +0 -111
  499. package/tasks/visual-editing.yaml +0 -120
@@ -0,0 +1,225 @@
1
+ /**
2
+ * rubrics.ts — Centralized rubric templates for LLM grading assertions.
3
+ *
4
+ * Tasks reference these templates by key and provide only their unique
5
+ * criteria bullet points. The pipeline assembles the full rubric text
6
+ * at expansion time.
7
+ *
8
+ * @see docs/design-docs/structured-dimensions.md
9
+ * @see docs/design-docs/uniform-dimension-scoring.md
10
+ */
11
+
12
+ import { defineRubrics } from "../_vendor/ailf-core/index.js"
13
+
14
+ export default defineRubrics({
15
+ templates: {
16
+ // ── Core literacy dimensions ────────────────────────────
17
+ "task-completion": {
18
+ dimension: "task-completion",
19
+ header: "Score task completion from 0 to 100:",
20
+ scale: [
21
+ "0: Couldn't attempt — missing critical information",
22
+ "20: Attempted but fundamentally wrong approach",
23
+ "50: Partial implementation — major functional gaps",
24
+ "80: Mostly complete — minor issues or missing edge cases",
25
+ "100: Fully functional code — works as expected",
26
+ ],
27
+ criteria_label: "Must demonstrate:",
28
+ },
29
+ "code-correctness": {
30
+ dimension: "code-correctness",
31
+ header: "Score code correctness from 0 to 100:",
32
+ scale: [
33
+ "0: Broken code, syntax errors, or deprecated APIs",
34
+ "30: Works but uses anti-patterns or inefficient approaches",
35
+ "50: Works but not idiomatic",
36
+ "80: Follows most best practices",
37
+ "100: Follows all best practices, idiomatic implementation",
38
+ ],
39
+ criteria_label: "Check for:",
40
+ },
41
+ "doc-coverage": {
42
+ dimension: "doc-coverage",
43
+ header: "Score documentation coverage from 0 to 100:",
44
+ scale: [
45
+ "0: Had to hallucinate/guess most implementation details",
46
+ "30: Significant gaps — filled with assumptions",
47
+ "50: Some gaps — inferred from partial information",
48
+ "80: Minor gaps — almost everything was documented",
49
+ "100: Complete coverage — all necessary info was in docs",
50
+ ],
51
+ },
52
+
53
+ // ── MCP server dimensions ───────────────────────────────
54
+ "mcp-input-validation": {
55
+ dimension: "input-validation",
56
+ header: "Score MCP tool input correctness from 0 to 100:",
57
+ scale: [
58
+ "0: Completely wrong tool inputs — missing or invalid parameters",
59
+ "25: Attempted correct inputs but with significant errors",
60
+ "50: Partially correct — some required fields present, others wrong",
61
+ "75: Mostly correct — minor issues with optional parameters",
62
+ "100: Perfect tool inputs — all parameters correct and well-formed",
63
+ ],
64
+ criteria_label: "Evaluate:",
65
+ },
66
+ "mcp-output-correctness": {
67
+ dimension: "output-correctness",
68
+ header: "Score MCP tool output handling from 0 to 100:",
69
+ scale: [
70
+ "0: Completely failed to use tool outputs",
71
+ "25: Used outputs but misinterpreted the data",
72
+ "50: Partially correct interpretation of tool outputs",
73
+ "75: Mostly correct — minor misuse of returned data",
74
+ "100: Perfect output handling — correctly interpreted all tool responses",
75
+ ],
76
+ criteria_label: "Check for:",
77
+ },
78
+ "mcp-error-handling": {
79
+ dimension: "error-handling",
80
+ header: "Score MCP error handling from 0 to 100:",
81
+ scale: [
82
+ "0: Crashed or ignored errors entirely",
83
+ "25: Acknowledged error but took wrong corrective action",
84
+ "50: Partial error handling — recovered from some errors",
85
+ "75: Good error handling — recovered gracefully with minor issues",
86
+ "100: Excellent — handled all errors appropriately with clear messaging",
87
+ ],
88
+ criteria_label: "Evaluate:",
89
+ },
90
+ "mcp-security": {
91
+ dimension: "security",
92
+ header: "Score MCP security compliance from 0 to 100:",
93
+ scale: [
94
+ "0: Attempted unauthorized operations or leaked sensitive data",
95
+ "25: Made questionable tool calls outside the expected scope",
96
+ "50: Stayed within scope but with minor security concerns",
97
+ "75: Good security — no unauthorized access, minor concerns",
98
+ "100: Perfect security — only used authorized tools with safe inputs",
99
+ ],
100
+ criteria_label: "Check for:",
101
+ },
102
+
103
+ // ── Knowledge probe dimensions ──────────────────────────
104
+ "factual-correctness": {
105
+ dimension: "factual-correctness",
106
+ header: "Score factual correctness from 0 to 100:",
107
+ scale: [
108
+ "0: Completely wrong — fabricated facts, hallucinated APIs, incorrect syntax",
109
+ "25: Mostly wrong — some real concepts but significant factual errors",
110
+ "50: Mixed — some correct facts alongside notable inaccuracies",
111
+ "75: Mostly correct — minor inaccuracies or imprecisions",
112
+ "100: Fully correct — all statements are accurate and verifiable",
113
+ ],
114
+ criteria_label: "Verify:",
115
+ },
116
+ completeness: {
117
+ dimension: "completeness",
118
+ header: "Score knowledge completeness from 0 to 100:",
119
+ scale: [
120
+ "0: Superficial — only knows the name, no substantive knowledge",
121
+ "25: Minimal — knows basic concepts but misses most important details",
122
+ "50: Partial — covers some key aspects but has significant gaps",
123
+ "75: Good coverage — covers most key aspects with minor gaps",
124
+ "100: Comprehensive — thorough coverage of all important aspects",
125
+ ],
126
+ criteria_label: "Check coverage of:",
127
+ },
128
+ currency: {
129
+ dimension: "currency",
130
+ header: "Score knowledge currency (up-to-dateness) from 0 to 100:",
131
+ scale: [
132
+ "0: Severely outdated — references deprecated APIs or removed features",
133
+ "25: Mostly outdated — aware of old version but not recent changes",
134
+ "50: Partially current — knows some recent changes but misses others",
135
+ "75: Mostly current — knows recent API but minor gaps on latest features",
136
+ "100: Fully current — references latest APIs, patterns, and best practices",
137
+ ],
138
+ criteria_label: "Check for:",
139
+ },
140
+
141
+ // ── Agent harness dimensions ────────────────────────────
142
+ "process-quality": {
143
+ dimension: "process-quality",
144
+ header:
145
+ "Score the agent's process quality from 0 to 100 (advisory — does not gate pass/fail):",
146
+ scale: [
147
+ "0: Chaotic process — random tool calls, no planning, no error handling",
148
+ "25: Poor process — some structure but significant inefficiencies",
149
+ "50: Adequate process — gets the job done but not efficiently",
150
+ "75: Good process — reads before writing, handles errors, incremental changes",
151
+ "100: Excellent process — optimal tool usage, clear planning, graceful recovery",
152
+ ],
153
+ criteria_label: "Evaluate:",
154
+ },
155
+ "agent-output": {
156
+ dimension: "agent-output",
157
+ header: "Score the agent's final output from 0 to 100:",
158
+ scale: [
159
+ "0: No useful output produced — task completely failed",
160
+ "25: Partial output — attempted but with fundamental errors",
161
+ "50: Usable output — works but with significant issues",
162
+ "75: Good output — mostly correct with minor issues",
163
+ "100: Excellent output — fully correct, clean, and complete",
164
+ ],
165
+ criteria_label: "Check for:",
166
+ },
167
+ "agent-tool-usage": {
168
+ dimension: "tool-usage",
169
+ header: "Score the agent's tool usage from 0 to 100:",
170
+ scale: [
171
+ "0: Completely wrong tool usage — called wrong tools or with invalid inputs",
172
+ "25: Poor tool usage — correct tools but wrong parameters or sequencing",
173
+ "50: Adequate — correct tools and basic parameters, some inefficiency",
174
+ "75: Good — efficient tool usage with proper error handling",
175
+ "100: Excellent — optimal tool selection, correct inputs, minimal redundancy",
176
+ ],
177
+ criteria_label: "Evaluate:",
178
+ },
179
+ },
180
+
181
+ // ── Named scoring profiles ────────────────────────────────
182
+ profiles: {
183
+ default: {
184
+ "task-completion": 0.5,
185
+ "code-correctness": 0.25,
186
+ "doc-coverage": 0.25,
187
+ },
188
+ "output-only": {
189
+ "task-completion": 0.6,
190
+ "code-correctness": 0.4,
191
+ },
192
+ "mcp-behavior": {
193
+ "input-validation": 0.25,
194
+ "output-correctness": 0.35,
195
+ "error-handling": 0.25,
196
+ security: 0.15,
197
+ },
198
+ "knowledge-probe": {
199
+ "factual-correctness": 0.45,
200
+ completeness: 0.35,
201
+ currency: 0.2,
202
+ },
203
+ "agent-harness": {
204
+ "agent-output": 0.45,
205
+ "tool-usage": 0.4,
206
+ "process-quality": 0.15,
207
+ },
208
+ },
209
+
210
+ // ── Mode-to-profile bindings ──────────────────────────────
211
+ // Literacy mode uses variant sub-keys because 'baseline' and 'agentic'
212
+ // are scoring variants within the same canonical mode, not separate modes.
213
+ "mode-profiles": {
214
+ "literacy": {
215
+ baseline: { gold: "default", baseline: "output-only" },
216
+ agentic: { gold: "default" },
217
+ },
218
+ "mcp-server": { gold: "mcp-behavior" },
219
+ "knowledge-probe": { gold: "knowledge-probe" },
220
+ "agent-harness": { gold: "agent-harness" },
221
+ },
222
+
223
+ footer:
224
+ 'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
225
+ })
@@ -0,0 +1,47 @@
1
+ /**
2
+ * schedules.ts — Scheduled evaluation configuration.
3
+ *
4
+ * Each schedule defines a recurring pipeline run with its own source,
5
+ * mode, and delivery preferences. The GitHub Actions cron workflow reads
6
+ * this config to determine which evaluations to run and when.
7
+ *
8
+ * Cron expressions use UTC timezone (GitHub Actions standard).
9
+ *
10
+ * @see docs/design-docs/report-store/implementation.md — Phase 5
11
+ */
12
+
13
+ import { defineSchedules, env } from "../_vendor/ailf-core/index.js"
14
+
15
+ export default defineSchedules({
16
+ schedules: [
17
+ // Daily baseline — track score trends against production docs
18
+ {
19
+ name: "daily-baseline",
20
+ cron: "0 2 * * *", // 2:00 AM UTC, every day
21
+ mode: "baseline",
22
+ source: "production",
23
+ publish: true,
24
+ compare: true,
25
+ enabled: true,
26
+ },
27
+
28
+ // Weekly full decomposition — complete floor/ceiling/actual report
29
+ {
30
+ name: "weekly-full",
31
+ cron: "0 3 * * 0", // 3:00 AM UTC, every Sunday
32
+ mode: "full",
33
+ source: "production",
34
+ publish: true,
35
+ compare: true,
36
+ enabled: true,
37
+ },
38
+ ],
39
+
40
+ // Digest — aggregates reports into periodic summaries
41
+ digest: {
42
+ enabled: true,
43
+ cron: "0 9 * * 1", // 9:00 AM UTC, every Monday
44
+ lookbackDays: 7,
45
+ slackWebhookUrl: env("SLACK_WEBHOOK_URL", ""),
46
+ },
47
+ })
@@ -0,0 +1,37 @@
1
+ /**
2
+ * sinks.ts — Report delivery sink configuration.
3
+ *
4
+ * Sinks receive published evaluation reports and deliver them to external
5
+ * systems (BigQuery, Slack, GitHub, webhooks, etc.).
6
+ *
7
+ * Sinks are fire-and-forget: a sink failure is logged but never blocks
8
+ * the pipeline. The Sanity Content Lake is the system of record.
9
+ *
10
+ * Sinks activate only when their required environment variables are present.
11
+ * A developer running locally with no env vars gets zero sinks.
12
+ *
13
+ * @see docs/design-docs/report-store/sink-architecture.md
14
+ */
15
+
16
+ import { defineSinks } from "../_vendor/ailf-core/index.js"
17
+
18
+ export default defineSinks({
19
+ sinks: [
20
+ // All sinks are currently disabled — every entry below is commented out.
21
+ // Uncomment and configure as needed:
22
+ // BigQuery — disabled; Airbyte ELT is the primary delivery mechanism
23
+ // { type: "bigquery", enabled: false, project: env("BIGQUERY_PROJECT", "data-platform-302218"),
24
+ // dataset: env("BIGQUERY_DATASET", "ailf"), credentials: env("GOOGLE_APPLICATION_CREDENTIALS", "") },
25
+ // Slack — regression alerts to configured channels
26
+ // { type: "slack", enabled: true, webhookUrl: env("SLACK_WEBHOOK_URL", ""),
27
+ // channel: "#docs-ai-literacy",
28
+ // routing: { critical: "#docs-alerts", warning: "#docs-team",
29
+ // regression: "#docs-team", digest: "#docs-weekly" } },
30
+ // GitHub PR comments
31
+ // { type: "github-comment", enabled: false, token: env("GITHUB_TOKEN", "") },
32
+ // Webhook — generic HTTP relay
33
+ // { type: "webhook", enabled: false, url: env("AILF_WEBHOOK_URL", ""),
34
+ // headers: { Authorization: `Bearer ${env("AILF_WEBHOOK_TOKEN", "")}` },
35
+ // routing: { critical: true } },
36
+ ],
37
+ })
@@ -0,0 +1,21 @@
1
+ /**
2
+ * sources.ts — Documentation source definitions for AILF evaluations.
3
+ *
4
+ * Default sources (production, branch, local) are provided by the
5
+ * sanity-literacy preset registered in the composition root. This file
6
+ * exists as an override point — any sources defined here are merged with
7
+ * (and take precedence over) preset-provided sources.
8
+ *
9
+ * To add a custom source, define it here:
10
+ *
11
+ * export default defineSources([
12
+ * { name: "my-docs", baseUrl: "https://docs.example.com", ... },
13
+ * ])
14
+ *
15
+ * @see packages/eval/src/pipeline/compiler/presets/sanity-literacy.ts
16
+ * @see docs/archive/exec-plans/dynamic-doc-sources.md
17
+ */
18
+
19
+ import { defineSources } from "../_vendor/ailf-core/index.js"
20
+
21
+ export default defineSources([])
@@ -0,0 +1,61 @@
1
+ /**
2
+ * thresholds.ts — Quality thresholds for readiness gates and regression alerts.
3
+ *
4
+ * Used by:
5
+ * - `npx @sanity/ailf pipeline --readiness` (launch readiness checklist)
6
+ * - `npx @sanity/ailf pipeline --publish` (severity-aware sink routing)
7
+ * - `npx @sanity/ailf pipeline --compare` (regression alerting)
8
+ *
9
+ * @see docs/archive/exec-plans/scenario-matrix-implementation/phase-5-readiness-thresholds.md
10
+ */
11
+
12
+ import { defineThresholds } from "../_vendor/ailf-core/index.js"
13
+
14
+ export default defineThresholds({
15
+ // Global defaults (apply to all areas unless overridden)
16
+ defaults: {
17
+ composite: 50,
18
+ dimensions: {
19
+ "task-completion": 40,
20
+ "code-correctness": 30,
21
+ "doc-coverage": 30,
22
+ },
23
+ "doc-lift": 0, // docs must not hurt
24
+ ceiling: 40, // minimum ceiling score (doc quality floor)
25
+ },
26
+
27
+ // Per-area overrides (inherit from defaults, override specific values)
28
+ areas: {
29
+ groq: {
30
+ composite: 60, // GROQ is critical — higher bar
31
+ dimensions: {
32
+ "task-completion": 50,
33
+ },
34
+ },
35
+ // "visual-editing": {
36
+ // composite: 45, // currently at 36, set achievable near-term target
37
+ // },
38
+ },
39
+
40
+ // Regression thresholds (for comparison reports)
41
+ regression: {
42
+ composite: -3, // alert if composite drops more than 3 points
43
+ "per-area": -5, // alert if any area drops more than 5 points
44
+ "per-dimension": -8, // alert if any dimension drops more than 8 points
45
+ },
46
+
47
+ // Severity classification
48
+ severity: {
49
+ critical: {
50
+ "composite-below": 30,
51
+ "negative-doc-lift": true,
52
+ },
53
+ warning: {
54
+ "composite-below": 50,
55
+ "regression-exceeds": -3,
56
+ },
57
+ info: {
58
+ "composite-below": 60,
59
+ },
60
+ },
61
+ })
@@ -0,0 +1,41 @@
1
+ /**
2
+ * @sanity/ailf — Public API for the AI Literacy Framework.
3
+ *
4
+ * This module is the entry point for external consumers who import from
5
+ * `@sanity/ailf`. It re-exports the authoring API needed to write task
6
+ * definitions and configuration files, and to validate task YAML.
7
+ *
8
+ * ## Task authoring
9
+ *
10
+ * ```typescript
11
+ * import { defineTask } from "@sanity/ailf"
12
+ *
13
+ * export default defineTask({
14
+ * id: "groq-projection-basics",
15
+ * mode: "literacy",
16
+ * title: "GROQ Projection Basics",
17
+ * area: "groq",
18
+ * prompt: { text: "Write GROQ queries..." },
19
+ * assertions: [
20
+ * { type: "llm-rubric", template: "task-completion", criteria: ["..."] },
21
+ * ],
22
+ * })
23
+ * ```
24
+ *
25
+ * ## Configuration authoring
26
+ *
27
+ * ```typescript
28
+ * import { defineConfig, env } from "@sanity/ailf"
29
+ *
30
+ * export default defineConfig({
31
+ * projectId: env("SANITY_PROJECT_ID"),
32
+ * dataset: env("SANITY_DATASET"),
33
+ * })
34
+ * ```
35
+ */
36
+ export { defineConfig, defineFeatures, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./_vendor/ailf-core/index.d.ts";
37
+ export type { PricingEntry, PromptEntry, SourceEntry } from "./_vendor/ailf-core/index.d.ts";
38
+ export { env } from "./_vendor/ailf-core/index.d.ts";
39
+ export type { AgentHarnessTaskDefinition, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./_vendor/ailf-core/index.d.ts";
40
+ export { CanonicalTaskFileSchema, CanonicalTaskSchema, CURATED_ASSERTION_TYPES, detectLegacyFieldNames, parseCanonicalTaskFile, RUBRIC_TEMPLATE_NAMES, type CanonicalTask, type CuratedAssertionType, type RubricTemplateName, } from "./adapters/task-sources/repo-schemas.js";
41
+ export { formatValidationResult, validateCanonicalTasks, type ValidationMessage, type ValidationResult, } from "./adapters/task-sources/repo-validation.js";
package/dist/index.js ADDED
@@ -0,0 +1,48 @@
1
+ /**
2
+ * @sanity/ailf — Public API for the AI Literacy Framework.
3
+ *
4
+ * This module is the entry point for external consumers who import from
5
+ * `@sanity/ailf`. It re-exports the authoring API needed to write task
6
+ * definitions and configuration files, and to validate task YAML.
7
+ *
8
+ * ## Task authoring
9
+ *
10
+ * ```typescript
11
+ * import { defineTask } from "@sanity/ailf"
12
+ *
13
+ * export default defineTask({
14
+ * id: "groq-projection-basics",
15
+ * mode: "literacy",
16
+ * title: "GROQ Projection Basics",
17
+ * area: "groq",
18
+ * prompt: { text: "Write GROQ queries..." },
19
+ * assertions: [
20
+ * { type: "llm-rubric", template: "task-completion", criteria: ["..."] },
21
+ * ],
22
+ * })
23
+ * ```
24
+ *
25
+ * ## Configuration authoring
26
+ *
27
+ * ```typescript
28
+ * import { defineConfig, env } from "@sanity/ailf"
29
+ *
30
+ * export default defineConfig({
31
+ * projectId: env("SANITY_PROJECT_ID"),
32
+ * dataset: env("SANITY_DATASET"),
33
+ * })
34
+ * ```
35
+ */
36
+ // ---------------------------------------------------------------------------
37
+ // Configuration helpers (define* identity functions for typed authoring)
38
+ // ---------------------------------------------------------------------------
39
+ export { defineConfig, defineFeatures, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./_vendor/ailf-core/index.js";
40
+ // ---------------------------------------------------------------------------
41
+ // Environment helper
42
+ // ---------------------------------------------------------------------------
43
+ export { env } from "./_vendor/ailf-core/index.js";
44
+ // ---------------------------------------------------------------------------
45
+ // Validation — for programmatic validation of task YAML
46
+ // ---------------------------------------------------------------------------
47
+ export { CanonicalTaskFileSchema, CanonicalTaskSchema, CURATED_ASSERTION_TYPES, detectLegacyFieldNames, parseCanonicalTaskFile, RUBRIC_TEMPLATE_NAMES, } from "./adapters/task-sources/repo-schemas.js";
48
+ export { formatValidationResult, validateCanonicalTasks, } from "./adapters/task-sources/repo-validation.js";
@@ -0,0 +1,8 @@
1
+ /**
2
+ * lib/agent-behavior-report.ts — DEPRECATED re-export shim.
3
+ * @deprecated Import from ../pipeline/agent-behavior-report.js instead.
4
+ */
5
+ import "dotenv/config";
6
+ export { analyzeResults, CANONICAL_DOC_MAP, detectFeatureArea, } from "../pipeline/agent-behavior-report.js";
7
+ export type { AnalysisResult, FeatureAnalysis, TaskBehavior, TestResult, } from "../pipeline/agent-behavior-report.js";
8
+ /**
   * Loads an evaluation results JSON file, prints the agent behavior report
   * to stdout, and writes a JSON summary named `agent-behavior-report.json`.
   *
   * @param resultsPathArg Optional path to the results file. When omitted, the
   *   implementation falls back to `process.argv[2]` and then to
   *   `results/latest/eval-results.json` under its root directory.
   */
  export declare function main(resultsPathArg?: string): void;
@@ -0,0 +1,185 @@
1
+ /**
2
+ * lib/agent-behavior-report.ts — DEPRECATED re-export shim.
3
+ * @deprecated Import from ../pipeline/agent-behavior-report.js instead.
4
+ */
5
+ // oxlint-disable-next-line import/no-unassigned-import -- side-effect: loads .env into process.env
6
+ import "dotenv/config";
7
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
import { dirname, join } from "path";
import { fileURLToPath } from "url";
export { analyzeResults, CANONICAL_DOC_MAP, detectFeatureArea, } from "../pipeline/agent-behavior-report.js";
import { analyzeResults, } from "../pipeline/agent-behavior-report.js";
11
/**
 * CLI entry point: loads an evaluation results file, prints the agent
 * behavior report to stdout, and persists a JSON summary to
 * `results/latest/agent-behavior-report.json` under ROOT (two directories
 * above this module).
 *
 * The results path is resolved in order: `resultsPathArg`, then
 * `process.argv[2]`, then `<ROOT>/results/latest/eval-results.json`.
 *
 * Exits the process with code 1 when the results file is missing or carries
 * no `results` payload, and with code 0 when the analysis reports no agent
 * behavior data.
 *
 * @param {string} [resultsPathArg] - Explicit path to an eval results JSON file.
 * @returns {void}
 */
export function main(resultsPathArg) {
    // fileURLToPath (instead of URL#pathname) decodes percent-escapes such as
    // "%20" and produces a valid drive-letter path on Windows.
    const ROOT = join(dirname(fileURLToPath(import.meta.url)), "..", "..");
    const resultsPath = resultsPathArg ??
        process.argv[2] ??
        join(ROOT, "results", "latest", "eval-results.json");
    if (!existsSync(resultsPath)) {
        console.error(`Results file not found: ${resultsPath}`);
        console.error("Run an evaluation first: pnpm eval:observed");
        process.exit(1);
    }
    console.log(`Reading results from: ${resultsPath}`);
    console.log();
    const json = JSON.parse(readFileSync(resultsPath, "utf-8"));
    // `results` is either the array itself or an object wrapping it under a
    // nested `results` key.
    const rawResults = Array.isArray(json?.results)
        ? json.results
        : json?.results?.results;
    if (rawResults == null) {
        // Fail with a readable message instead of a TypeError on property access.
        console.error(`No "results" payload found in: ${resultsPath}`);
        process.exit(1);
    }
    const analysis = analyzeResults(rawResults);
    if (!analysis.hasData) {
        console.log("No agent behavior data found in the results.");
        console.log("Make sure you ran the evaluation with the observed config:");
        console.log(" pnpm eval:observed");
        process.exit(0);
    }
    printReport(analysis);
    // Persist detailed report as JSON
    const outDir = join(ROOT, "results", "latest");
    mkdirSync(outDir, { recursive: true });
    const reportData = {
        features: analysis.features.map((f) => ({
            avgDocPages: f.avgDocPages,
            avgNetworkMs: f.avgNetworkMs,
            avgSearches: f.avgSearches,
            canonicalCoverage: f.canonicalCoverage,
            canonicalSlugs: f.canonicalSlugs,
            docSlugsVisited: f.allDocSlugs,
            externalDomains: f.allExternalDomains,
            feature: f.feature,
            searchQueries: f.allSearchQueries,
            taskCount: f.tasks.length,
        })),
        tasks: analysis.tasks.map((t) => ({
            behavior: t.behavior,
            description: t.description,
            feature: t.feature,
            hasDocs: t.hasDocs,
        })),
        timestamp: new Date().toISOString(),
        totalTasks: analysis.tasks.length,
    };
    writeFileSync(join(outDir, "agent-behavior-report.json"), JSON.stringify(reportData, null, 2));
    console.log("Agent behavior report written to results/latest/agent-behavior-report.json");
}
63
+ // ---------------------------------------------------------------------------
64
+ // Report output (kept in shim for backward compat)
65
+ // ---------------------------------------------------------------------------
66
+ function printReport(analysis) {
67
+ console.log("=".repeat(80));
68
+ console.log(" AGENT BEHAVIOR OBSERVATION REPORT");
69
+ console.log("=".repeat(80));
70
+ console.log();
71
+ // ---- Overview table ----
72
+ console.log("OVERVIEW BY FEATURE AREA");
73
+ console.log("-".repeat(80));
74
+ const h = "| Feature Area | Tasks | Avg Docs | Avg Search | Avg Net(ms) | Canon% |";
75
+ const sep = "|---------------------|-------|----------|------------|-------------|--------|";
76
+ console.log(h);
77
+ console.log(sep);
78
+ for (const f of analysis.features) {
79
+ console.log(`| ${f.feature.padEnd(19)} | ` +
80
+ `${f.tasks.length.toString().padStart(5)} | ` +
81
+ `${f.avgDocPages.toFixed(1).padStart(8)} | ` +
82
+ `${f.avgSearches.toFixed(1).padStart(10)} | ` +
83
+ `${Math.round(f.avgNetworkMs).toString().padStart(11)} | ` +
84
+ `${(f.canonicalCoverage * 100).toFixed(0).padStart(5)}% |`);
85
+ }
86
+ console.log();
87
+ // ---- Canonical coverage breakdown ----
88
+ console.log("CANONICAL DOCUMENTATION COVERAGE");
89
+ console.log("-".repeat(80));
90
+ console.log();
91
+ for (const f of analysis.features) {
92
+ console.log(` ${f.feature} (${(f.canonicalCoverage * 100).toFixed(0)}% canonical coverage):`);
93
+ if (f.canonicalSlugs.length === 0) {
94
+ console.log(" (no canonical docs defined)");
95
+ }
96
+ else {
97
+ for (const slug of f.canonicalSlugs) {
98
+ const found = f.allDocSlugs.some((visited) => visited.includes(slug));
99
+ const marker = found ? "[x]" : "[ ]";
100
+ console.log(` ${marker} ${slug}`);
101
+ }
102
+ }
103
+ if (f.allDocSlugs.length > 0) {
104
+ const nonCanonical = f.allDocSlugs.filter((slug) => !f.canonicalSlugs.some((c) => slug.includes(c)));
105
+ if (nonCanonical.length > 0) {
106
+ console.log(" Additional docs visited:");
107
+ for (const slug of nonCanonical) {
108
+ console.log(` + ${slug}`);
109
+ }
110
+ }
111
+ }
112
+ console.log();
113
+ }
114
+ // ---- Search strategy ----
115
+ const allSearches = analysis.features.flatMap((f) => f.allSearchQueries);
116
+ if (allSearches.length > 0) {
117
+ console.log("SEARCH STRATEGY");
118
+ console.log("-".repeat(80));
119
+ console.log();
120
+ for (const f of analysis.features) {
121
+ if (f.allSearchQueries.length === 0)
122
+ continue;
123
+ console.log(` ${f.feature}:`);
124
+ for (const q of f.allSearchQueries) {
125
+ console.log(` -> "${q}"`);
126
+ }
127
+ }
128
+ console.log();
129
+ }
130
+ // ---- Per-task detail ----
131
+ console.log("PER-TASK DETAIL");
132
+ console.log("-".repeat(80));
133
+ console.log();
134
+ for (const f of analysis.features) {
135
+ console.log(` ## ${f.feature}`);
136
+ console.log();
137
+ for (const t of f.tasks) {
138
+ const variant = t.hasDocs ? "[gold]" : "[baseline]";
139
+ console.log(` ${variant} ${t.description}`);
140
+ console.log(` Requests: ${t.behavior.totalRequests} | ` +
141
+ `Doc pages: ${t.behavior.docPagesVisited} | ` +
142
+ `Searches: ${t.behavior.searchesPerformed} | ` +
143
+ `External: ${t.behavior.externalRequestCount}`);
144
+ if (t.behavior.docSlugsVisited.length > 0) {
145
+ console.log(` Docs: ${t.behavior.docSlugsVisited.join(", ")}`);
146
+ }
147
+ if (t.behavior.uniqueSearchQueries.length > 0) {
148
+ console.log(` Queries: ${t.behavior.uniqueSearchQueries.map((q) => `"${q}"`).join(", ")}`);
149
+ }
150
+ console.log();
151
+ }
152
+ }
153
+ // ---- External domains ----
154
+ const allDomains = [
155
+ ...new Set(analysis.features.flatMap((f) => f.allExternalDomains)),
156
+ ];
157
+ if (allDomains.length > 0) {
158
+ console.log("EXTERNAL DOMAINS");
159
+ console.log("-".repeat(80));
160
+ console.log();
161
+ for (const d of allDomains) {
162
+ console.log(` - ${d}`);
163
+ }
164
+ console.log();
165
+ }
166
+ // ---- Summary stats ----
167
+ console.log("OVERALL STATISTICS");
168
+ console.log("-".repeat(80));
169
+ console.log();
170
+ const totalTasks = analysis.tasks.length;
171
+ const tasksUsingDocs = analysis.tasks.filter((t) => t.behavior.usedDocs).length;
172
+ const tasksUsingSearch = analysis.tasks.filter((t) => t.behavior.usedSearch).length;
173
+ const avgCanonical = analysis.features.reduce((s, f) => s + f.canonicalCoverage, 0) /
174
+ (analysis.features.length || 1);
175
+ console.log(` Total tasks observed: ${totalTasks}`);
176
+ console.log(` Tasks that used docs: ${tasksUsingDocs}/${totalTasks} (${((tasksUsingDocs / totalTasks) * 100).toFixed(0)}%)`);
177
+ console.log(` Tasks that used search: ${tasksUsingSearch}/${totalTasks} (${((tasksUsingSearch / totalTasks) * 100).toFixed(0)}%)`);
178
+ console.log(` Avg canonical coverage: ${(avgCanonical * 100).toFixed(1)}%`);
179
+ console.log();
180
+ }
181
// Run main() only when this file is the script handed to the runtime; the
// check matches the invoked script path (process.argv[1]) against the source
// and compiled file names, so importing the module does not trigger it.
const isDirectInvocation = [
    "agent-behavior-report.ts",
    "agent-behavior-report.js",
].some((scriptName) => process.argv[1]?.endsWith(scriptName));
if (isDirectInvocation) {
    main();
}