@sanity/ailf 0.5.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377) hide show
  1. package/README.md +0 -1
  2. package/config/features.ts +23 -0
  3. package/config/models.ts +95 -0
  4. package/config/prompts.ts +16 -0
  5. package/config/rubrics.ts +225 -0
  6. package/config/schedules.ts +47 -0
  7. package/config/sinks.ts +37 -0
  8. package/config/sources.ts +21 -0
  9. package/config/thresholds.ts +61 -0
  10. package/dist/_vendor/ailf-core/config-helpers.d.ts +171 -0
  11. package/dist/_vendor/ailf-core/config-helpers.js +170 -0
  12. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  13. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  14. package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
  15. package/dist/_vendor/ailf-core/examples/index.js +25 -0
  16. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  17. package/dist/_vendor/ailf-core/index.js +5 -0
  18. package/dist/_vendor/ailf-core/ports/context.d.ts +17 -2
  19. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  20. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  21. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  22. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  23. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  24. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  25. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +8 -2
  26. package/dist/_vendor/ailf-core/schemas/eval-config.js +17 -2
  27. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +9 -3
  28. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +8 -1
  29. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -31
  30. package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -9
  31. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  32. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  33. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  34. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  35. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  36. package/dist/_vendor/ailf-core/services/index.js +2 -1
  37. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  38. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  39. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  40. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  41. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  42. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  43. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  44. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +332 -0
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  47. package/dist/_vendor/ailf-core/types/index.d.ts +45 -83
  48. package/dist/_vendor/ailf-core/types/index.js +8 -1
  49. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +257 -0
  50. package/dist/_vendor/ailf-core/types/plugin-registry.js +185 -0
  51. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  52. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  53. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  54. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  55. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  56. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  57. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  58. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  59. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  60. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  61. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  62. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  63. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  64. package/dist/_vendor/ailf-shared/index.js +0 -1
  65. package/dist/adapters/api-client/build-request.js +14 -13
  66. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  67. package/dist/adapters/config-sources/file-config-adapter.js +39 -12
  68. package/dist/adapters/config-sources/index.d.ts +2 -0
  69. package/dist/adapters/config-sources/index.js +1 -0
  70. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  71. package/dist/adapters/config-sources/ts-config-loader.js +141 -0
  72. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  73. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  74. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  75. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  76. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  77. package/dist/adapters/task-sources/content-lake-task-source.js +35 -39
  78. package/dist/adapters/task-sources/index.d.ts +3 -2
  79. package/dist/adapters/task-sources/index.js +3 -2
  80. package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
  81. package/dist/adapters/task-sources/repo-schemas.js +227 -19
  82. package/dist/adapters/task-sources/repo-task-source.d.ts +16 -12
  83. package/dist/adapters/task-sources/repo-task-source.js +92 -80
  84. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  85. package/dist/adapters/task-sources/repo-validation.js +126 -5
  86. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  87. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  88. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  89. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  90. package/dist/cli.js +0 -2
  91. package/dist/commands/baseline.js +4 -1
  92. package/dist/commands/calculate-scores.js +1 -1
  93. package/dist/commands/coverage-audit.js +9 -1
  94. package/dist/commands/explain-handler.js +25 -23
  95. package/dist/commands/fetch-docs.js +3 -2
  96. package/dist/commands/generate-configs.js +1 -1
  97. package/dist/commands/init.d.ts +6 -4
  98. package/dist/commands/init.js +302 -23
  99. package/dist/commands/interactive.js +11 -7
  100. package/dist/commands/pipeline-action.d.ts +2 -0
  101. package/dist/commands/pipeline-action.js +16 -6
  102. package/dist/commands/pipeline.d.ts +1 -0
  103. package/dist/commands/pipeline.js +4 -2
  104. package/dist/commands/pr-comment.js +1 -1
  105. package/dist/commands/publish.js +2 -2
  106. package/dist/commands/readiness-report.js +13 -6
  107. package/dist/commands/validate-tasks.d.ts +2 -2
  108. package/dist/commands/validate-tasks.js +26 -15
  109. package/dist/composition-root.d.ts +13 -1
  110. package/dist/composition-root.js +99 -4
  111. package/dist/index.d.ts +41 -0
  112. package/dist/index.js +48 -0
  113. package/dist/orchestration/build-app-context.js +1 -0
  114. package/dist/orchestration/build-step-sequence.js +28 -8
  115. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  116. package/dist/orchestration/steps/fetch-docs-step.js +8 -7
  117. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  118. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  119. package/dist/orchestration/steps/generate-configs-step.js +261 -51
  120. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  121. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  122. package/dist/orchestration/steps/readiness-step.js +5 -6
  123. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  124. package/dist/orchestration/steps/run-eval-step.js +8 -7
  125. package/dist/pipeline/cache.d.ts +1 -1
  126. package/dist/pipeline/cache.js +36 -8
  127. package/dist/pipeline/calculate-scores.d.ts +2 -4
  128. package/dist/pipeline/calculate-scores.js +43 -113
  129. package/dist/pipeline/checks.js +2 -2
  130. package/dist/pipeline/compare.js +8 -8
  131. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  132. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  133. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  134. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  135. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  136. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  137. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  138. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  139. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  140. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +392 -0
  141. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  142. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  143. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  144. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  145. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  146. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +404 -0
  147. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  148. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  149. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  150. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  151. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  152. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  153. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  154. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  155. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  156. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  157. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  158. package/dist/pipeline/compiler/config-loader.js +111 -0
  159. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  160. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  161. package/dist/pipeline/compiler/hash.d.ts +11 -0
  162. package/dist/pipeline/compiler/hash.js +18 -0
  163. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  164. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  165. package/dist/pipeline/compiler/index.d.ts +29 -0
  166. package/dist/pipeline/compiler/index.js +45 -0
  167. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  168. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  169. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  170. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  171. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  172. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  173. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  174. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  175. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
  176. package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
  177. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  178. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  179. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  180. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  181. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  182. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  183. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  184. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  185. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  186. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  187. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  188. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  189. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  190. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  191. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  192. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  193. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  194. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  195. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  196. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  197. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  198. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  199. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  200. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  201. package/dist/pipeline/compiler/mode-handlers/index.d.ts +15 -0
  202. package/dist/pipeline/compiler/mode-handlers/index.js +19 -0
  203. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  204. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  205. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  206. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  207. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  208. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  209. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  210. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  211. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  212. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  213. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  214. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  215. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  216. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  217. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  218. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  219. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  220. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  221. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  222. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  223. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  224. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  225. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  226. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  227. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  228. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  229. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  230. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  231. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  232. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  233. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  234. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  235. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  236. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
  237. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  238. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  239. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  240. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  241. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  242. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
  243. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
  245. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  246. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  247. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
  248. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
  249. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
  250. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  251. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  252. package/dist/pipeline/compiler/preset-loader.js +99 -0
  253. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  254. package/dist/pipeline/compiler/presets/index.js +8 -0
  255. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +42 -0
  256. package/dist/pipeline/compiler/presets/sanity-literacy.js +208 -0
  257. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  258. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  259. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  260. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  261. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  262. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  263. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  264. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  265. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  266. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  267. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  268. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  269. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  270. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  271. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  272. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  273. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  274. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  275. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  276. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  277. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  278. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  279. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  280. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  281. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  282. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  283. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  284. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  285. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  286. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  287. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  288. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  289. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  290. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  291. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  292. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  293. package/dist/pipeline/coverage-audit.d.ts +15 -5
  294. package/dist/pipeline/coverage-audit.js +41 -22
  295. package/dist/pipeline/eval-constants.d.ts +16 -6
  296. package/dist/pipeline/eval-constants.js +25 -4
  297. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  298. package/dist/pipeline/eval-fingerprint.js +8 -9
  299. package/dist/pipeline/expand-tasks.d.ts +19 -10
  300. package/dist/pipeline/expand-tasks.js +34 -28
  301. package/dist/pipeline/gap-analysis.d.ts +1 -1
  302. package/dist/pipeline/gap-analysis.js +2 -2
  303. package/dist/pipeline/generate-configs.d.ts +22 -4
  304. package/dist/pipeline/generate-configs.js +53 -24
  305. package/dist/pipeline/grader-api.d.ts +3 -3
  306. package/dist/pipeline/grader-api.js +5 -12
  307. package/dist/pipeline/grader-compare-runner.js +20 -27
  308. package/dist/pipeline/grader-comparison.d.ts +4 -8
  309. package/dist/pipeline/grader-comparison.js +11 -17
  310. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  311. package/dist/pipeline/grader-consistency-runner.js +16 -20
  312. package/dist/pipeline/grader-consistency.d.ts +6 -10
  313. package/dist/pipeline/grader-consistency.js +13 -32
  314. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  315. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  316. package/dist/pipeline/grader-sensitivity.js +10 -10
  317. package/dist/pipeline/grader-validate-runner.js +7 -5
  318. package/dist/pipeline/grader-validation.d.ts +2 -6
  319. package/dist/pipeline/grader-validation.js +14 -22
  320. package/dist/pipeline/map-request-to-config.js +7 -1
  321. package/dist/pipeline/mirror-repo-tasks.d.ts +13 -13
  322. package/dist/pipeline/mirror-repo-tasks.js +22 -21
  323. package/dist/pipeline/normalize-mode.d.ts +49 -0
  324. package/dist/pipeline/normalize-mode.js +64 -0
  325. package/dist/pipeline/plan.d.ts +5 -2
  326. package/dist/pipeline/plan.js +134 -78
  327. package/dist/pipeline/pr-comment.js +2 -0
  328. package/dist/pipeline/profile-resolution.d.ts +22 -14
  329. package/dist/pipeline/profile-resolution.js +41 -19
  330. package/dist/pipeline/provenance.d.ts +2 -2
  331. package/dist/pipeline/provenance.js +12 -17
  332. package/dist/pipeline/release-report.js +4 -4
  333. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  334. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  335. package/dist/pipeline/rubric-loader.d.ts +20 -0
  336. package/dist/pipeline/rubric-loader.js +37 -0
  337. package/dist/pipeline/validate.d.ts +4 -4
  338. package/dist/pipeline/validate.js +64 -53
  339. package/dist/schedules/loader.js +18 -8
  340. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  341. package/dist/scripts/migrate-task-mode.js +85 -0
  342. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  343. package/dist/scripts/validate-task-sources.d.ts +1 -1
  344. package/dist/scripts/validate-task-sources.js +15 -15
  345. package/dist/sinks/loader.js +5 -7
  346. package/dist/sources.d.ts +7 -7
  347. package/dist/sources.js +22 -24
  348. package/dist/webhook/dispatch.js +2 -1
  349. package/package.json +15 -4
  350. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  351. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  352. package/tasks/literacy/frameworks.task.ts +128 -0
  353. package/tasks/literacy/functions.task.ts +69 -0
  354. package/tasks/literacy/groq.task.ts +258 -0
  355. package/tasks/literacy/nextjs-live.task.ts +75 -0
  356. package/tasks/literacy/studio-setup.task.ts +131 -0
  357. package/tasks/literacy/visual-editing.task.ts +146 -0
  358. package/config/features.yaml +0 -116
  359. package/config/models.yaml +0 -116
  360. package/config/prompts.yaml +0 -75
  361. package/config/rubrics.yaml +0 -81
  362. package/config/schedules.yaml +0 -43
  363. package/config/sinks.yaml +0 -54
  364. package/config/sources.yaml +0 -51
  365. package/config/thresholds.yaml +0 -49
  366. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  367. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  368. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  369. package/dist/_vendor/ailf-tasks/index.js +0 -16
  370. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  371. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  372. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  373. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  374. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  375. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  376. package/dist/agent-observer/test-imports.d.ts +0 -7
  377. package/dist/agent-observer/test-imports.js +0 -185
@@ -5,12 +5,14 @@
5
5
  * task files. The generated files are ready-to-edit starting points —
6
6
  * not live evaluation tasks.
7
7
  *
8
- * YAML output (default) preserves the inline comments from the source
9
- * YAML files in packages/core/examples/. JSON output is a plain
10
- * serialization of the parsed data no comments.
8
+ * TypeScript output (default) uses define* helpers from @sanity/ailf-core
9
+ * for full IDE autocomplete and type checking. YAML output preserves
10
+ * inline comments from the source files. JSON output is a plain
11
+ * serialization of the parsed data.
11
12
  *
12
13
  * Usage:
13
- * ailf init # YAML output (default)
14
+ * ailf init # TypeScript output (default)
15
+ * ailf init --output-format yaml # YAML output
14
16
  * ailf init --output-format json # JSON output
15
17
  * ailf init --force # overwrite existing files
16
18
  * ailf init --path ./my-dir # target a specific directory
@@ -18,16 +20,17 @@
18
20
  import { Command } from "commander";
19
21
  import { existsSync, mkdirSync, writeFileSync } from "fs";
20
22
  import { resolve, relative } from "path";
21
- import { ailfConfigData, ailfConfigYaml, taskYamlFiles, TASK_FILE_NAMES, allTaskData, workflowYaml, } from "../_vendor/ailf-core/index.js";
23
+ import { ailfConfigData, ailfConfigYaml, ailfConfigTs, taskYamlFiles, taskTsFiles, TASK_FILE_NAMES, TASK_TS_FILE_NAMES, allTaskData, workflowYaml, } from "../_vendor/ailf-core/index.js";
22
24
  // ---------------------------------------------------------------------------
23
25
  // Command factory
24
26
  // ---------------------------------------------------------------------------
25
27
  export function createInitCommand() {
26
28
  return new Command("init")
27
29
  .description("Initialize a directory for AI Literacy Framework evaluation")
28
- .option("--output-format <fmt>", 'Output format for generated files: "yaml" (default) or "json"', "yaml")
30
+ .option("--output-format <fmt>", 'Output format for generated files: "ts" (default), "yaml", or "json"', "ts")
29
31
  .option("--force", "Overwrite existing files", false)
30
32
  .option("--path <dir>", "Target directory (default: current directory)", ".")
33
+ .option("--mode <mode>", "Scaffold for a specific mode: literacy, mcp-server, custom (default: all modes)")
31
34
  .action(async (opts) => {
32
35
  await runInit(opts);
33
36
  });
@@ -55,8 +58,13 @@ function rel(from, to) {
55
58
  // Init logic
56
59
  // ---------------------------------------------------------------------------
57
60
  async function runInit(opts) {
58
- const format = opts.outputFormat === "json" ? "json" : "yaml";
59
- const ext = format === "json" ? ".json" : ".yaml";
61
+ const validFormats = new Set(["ts", "yaml", "json"]);
62
+ if (!validFormats.has(opts.outputFormat)) {
63
+ console.error(` ✗ Invalid output format "${opts.outputFormat}". Valid options: ts, yaml, json`);
64
+ process.exitCode = 1;
65
+ return;
66
+ }
67
+ const format = opts.outputFormat;
60
68
  const force = opts.force;
61
69
  // Resolve target from the caller's actual working directory
62
70
  const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
@@ -72,24 +80,103 @@ async function runInit(opts) {
72
80
  console.log(` ✓ Created ${rel(targetDir, tasksDir)}/`);
73
81
  const written = [];
74
82
  const skipped = [];
75
- // 2. Write .ailf/config.yaml (or .json)
76
- // YAML: raw string passthrough (preserves comments)
77
- // JSON: serialize the parsed data
78
- const configPath = resolve(ailfDir, `config${ext}`);
79
- const configContent = format === "yaml"
80
- ? ailfConfigYaml
81
- : JSON.stringify(ailfConfigData, null, 2) + "\n";
82
- if (writeIfNew(configPath, configContent, force)) {
83
- written.push(rel(targetDir, configPath));
83
+ // 2. Write project config
84
+ if (format === "ts") {
85
+ // TypeScript: ailf.config.ts with defineConfig helper
86
+ const configPath = resolve(ailfDir, "ailf.config.ts");
87
+ if (writeIfNew(configPath, ailfConfigTs, force)) {
88
+ written.push(rel(targetDir, configPath));
89
+ }
90
+ else {
91
+ skipped.push(rel(targetDir, configPath));
92
+ }
93
+ }
94
+ else if (format === "yaml") {
95
+ // YAML: raw string passthrough (preserves comments)
96
+ const configPath = resolve(ailfDir, "config.yaml");
97
+ if (writeIfNew(configPath, ailfConfigYaml, force)) {
98
+ written.push(rel(targetDir, configPath));
99
+ }
100
+ else {
101
+ skipped.push(rel(targetDir, configPath));
102
+ }
84
103
  }
85
104
  else {
86
- skipped.push(rel(targetDir, configPath));
105
+ // JSON: serialize the parsed data
106
+ const configPath = resolve(ailfDir, "config.json");
107
+ const content = JSON.stringify(ailfConfigData, null, 2) + "\n";
108
+ if (writeIfNew(configPath, content, force)) {
109
+ written.push(rel(targetDir, configPath));
110
+ }
111
+ else {
112
+ skipped.push(rel(targetDir, configPath));
113
+ }
87
114
  }
88
115
  // 3. Write example tasks to .ailf/tasks/
89
- // YAML: raw string passthrough (preserves comments)
90
- // JSON: serialize individual task data
91
- if (format === "yaml") {
92
- // Each task is its own commented YAML file — write as-is
116
+ const modeFilter = opts.mode;
117
+ const isCustomMode = modeFilter === "custom";
118
+ if (format === "ts") {
119
+ // TypeScript: *.task.ts files with defineTask helper
120
+ // Default (no --mode): write literacy examples + draft MCP/probe examples
121
+ // --mode literacy: only literacy examples
122
+ // --mode mcp-server: only MCP examples (active, not draft)
123
+ // --mode custom: only a custom example task
124
+ if (!modeFilter || modeFilter === "literacy") {
125
+ for (const stem of TASK_TS_FILE_NAMES) {
126
+ const taskPath = resolve(tasksDir, `${stem}.task.ts`);
127
+ const content = taskTsFiles[stem];
128
+ if (writeIfNew(taskPath, content, force)) {
129
+ written.push(rel(targetDir, taskPath));
130
+ }
131
+ else {
132
+ skipped.push(rel(targetDir, taskPath));
133
+ }
134
+ }
135
+ }
136
+ // Draft examples for other modes (default init only)
137
+ if (!modeFilter) {
138
+ const mcpPath = resolve(tasksDir, "example-mcp-tool-usage.task.ts");
139
+ if (writeIfNew(mcpPath, MCP_DRAFT_TASK_TS, force)) {
140
+ written.push(rel(targetDir, mcpPath));
141
+ }
142
+ else {
143
+ skipped.push(rel(targetDir, mcpPath));
144
+ }
145
+ const probePath = resolve(tasksDir, "example-knowledge-probe.task.ts");
146
+ if (writeIfNew(probePath, PROBE_DRAFT_TASK_TS, force)) {
147
+ written.push(rel(targetDir, probePath));
148
+ }
149
+ else {
150
+ skipped.push(rel(targetDir, probePath));
151
+ }
152
+ }
153
+ // MCP-only init
154
+ if (modeFilter === "mcp-server") {
155
+ const mcpContent = MCP_DRAFT_TASK_TS.replace('status: "draft",', '// status: "active", // Activated — this task runs in evaluations');
156
+ const mcpPath = resolve(tasksDir, "example-mcp-tool-usage.task.ts");
157
+ if (writeIfNew(mcpPath, mcpContent, force)) {
158
+ written.push(rel(targetDir, mcpPath));
159
+ }
160
+ else {
161
+ skipped.push(rel(targetDir, mcpPath));
162
+ }
163
+ }
164
+ // Custom preset scaffold
165
+ if (isCustomMode) {
166
+ const customTaskPath = resolve(tasksDir, "example-custom.task.ts");
167
+ // Reuse the GROQ literacy task as a starting point
168
+ if (taskTsFiles[TASK_TS_FILE_NAMES[0]]) {
169
+ if (writeIfNew(customTaskPath, taskTsFiles[TASK_TS_FILE_NAMES[0]], force)) {
170
+ written.push(rel(targetDir, customTaskPath));
171
+ }
172
+ else {
173
+ skipped.push(rel(targetDir, customTaskPath));
174
+ }
175
+ }
176
+ }
177
+ }
178
+ else if (format === "yaml") {
179
+ // YAML: raw string passthrough (preserves comments)
93
180
  for (const stem of TASK_FILE_NAMES) {
94
181
  const taskPath = resolve(tasksDir, `${stem}.yaml`);
95
182
  const content = taskYamlFiles[stem];
@@ -118,6 +205,16 @@ async function runInit(opts) {
118
205
  }
119
206
  }
120
207
  }
208
+ // 3b. Write custom preset scaffold (--mode custom only)
209
+ if (isCustomMode && format === "ts") {
210
+ const presetPath = resolve(ailfDir, "preset.ts");
211
+ if (writeIfNew(presetPath, CUSTOM_PRESET_TS, force)) {
212
+ written.push(rel(targetDir, presetPath));
213
+ }
214
+ else {
215
+ skipped.push(rel(targetDir, presetPath));
216
+ }
217
+ }
121
218
  // 4. Write .gitignore in .ailf/ (keep results out of version control)
122
219
  const gitignorePath = resolve(ailfDir, ".gitignore");
123
220
  const gitignoreContent = `# AILF generated files\nresults/\ncontexts/\n`;
@@ -150,18 +247,25 @@ async function runInit(opts) {
150
247
  console.log(` ⊘ Skipped ${f} (already exists, use --force to overwrite)`);
151
248
  }
152
249
  }
250
+ const taskExt = format === "ts" ? ".task.ts" : format === "yaml" ? ".yaml" : ".json";
153
251
  console.log();
154
252
  console.log(" Next steps:");
155
253
  console.log();
156
254
  console.log(` 1. Edit the example tasks in ${rel(targetDir, tasksDir)}/ — update`);
157
255
  console.log(" slugs and prompts for your documentation");
158
- console.log(" 2. Validate locally: npx @sanity/ailf@latest validate-tasks .ailf/tasks/");
256
+ console.log(` 2. Validate locally: npx @sanity/ailf@latest validate-tasks .ailf/tasks/`);
159
257
  console.log(" 3. Add two GitHub Actions secrets");
160
258
  console.log(" (Settings → Secrets and variables → Actions):");
161
259
  console.log(" • AILF_API_KEY — your API key (starts with ailf_live_sk_)");
162
260
  console.log(" • NPM_TOKEN — npm token with read access to @sanity scope");
163
261
  console.log(" 4. Push — the workflow at .github/workflows/ailf-eval.yml runs");
164
262
  console.log(" automatically on PRs");
263
+ if (format === "ts") {
264
+ console.log();
265
+ console.log(` 💡 TypeScript tasks (${taskExt}) give you full IDE autocomplete`);
266
+ console.log(" via defineTask() from @sanity/ailf-core. YAML and JSON are");
267
+ console.log(" also supported — re-run with --output-format yaml if preferred.");
268
+ }
165
269
  console.log();
166
270
  console.log(" 🔑 Retrieve secrets from 1Password (Sanity employees):");
167
271
  console.log();
@@ -177,3 +281,178 @@ async function runInit(opts) {
177
281
  console.log(" AILF_API_KEY=... npx @sanity/ailf@latest pipeline --remote --debug");
178
282
  console.log();
179
283
  }
284
+ // ---------------------------------------------------------------------------
285
+ // Draft example templates for non-literacy modes
286
+ // ---------------------------------------------------------------------------
287
/**
 * Scaffold source for the draft MCP-server example task.
 *
 * runInit() writes this text to `example-mcp-tool-usage.task.ts` in the
 * tasks directory. When the mode filter is "mcp-server" it first replaces
 * the exact substring `status: "draft",` with a commented-out "active"
 * marker, so that substring must remain present in the template.
 *
 * The generated file imports from the `@sanity/ailf-core` specifier — the
 * same one the init "Next steps" output names. A path relative to this
 * CLI's own dist/ layout (`../_vendor/...`) cannot resolve from the
 * directory the file is written into.
 */
const MCP_DRAFT_TASK_TS = `/**
 * Example Task: MCP Server tool-use evaluation (DRAFT).
 *
 * Tests whether an LLM can correctly discover and invoke Sanity MCP server
 * tools. Connects to the hosted Sanity MCP server at https://mcp.sanity.io.
 *
 * Prerequisites:
 *   - A Sanity API token with read access (for token-based auth)
 *   - Or: OAuth authentication will be prompted on first connect
 *
 * Authentication options:
 *   1. Token-based: set SANITY_API_TOKEN env var
 *   2. OAuth: the server prompts for login on first connect
 *
 * Setup: npx sanity@latest mcp configure
 * Docs: https://www.sanity.io/docs/ai/mcp-server
 *
 * This task is a DRAFT — it won't run unless activated or explicitly targeted.
 * To activate: change status to "active" or remove the status field.
 */

import { defineTask } from "@sanity/ailf-core"

export default defineTask({
  mode: "mcp-server",
  id: "example-mcp-tool-usage",
  title: "MCP tool discovery and invocation",
  description: "Example — tests Sanity MCP server tool-use (draft)",
  area: "mcp",

  // ── Server configuration ────────────────────────────────────
  // The Sanity MCP server is hosted remotely at https://mcp.sanity.io.
  // Authentication via API token header or OAuth.
  //
  // For token auth, set SANITY_API_TOKEN in your environment.
  serverConfig: {
    transport: "streamable-http",
    url: "https://mcp.sanity.io",
    env: {
      SANITY_API_TOKEN: process.env.SANITY_API_TOKEN ?? "",
    },
  },

  prompt: {
    text: \`Use the available MCP tools to query all documents of type "article"
in the Sanity dataset. Return the title and slug for each document.
Limit results to 5 documents.\`,
  },

  assertions: [
    {
      type: "llm-rubric",
      template: "mcp-input-validation",
      criteria: [
        "Correctly identifies the query_documents tool",
        "Passes a valid GROQ query to filter by document type",
        "Requests only the needed fields (title, slug)",
      ],
    },
  ],

  status: "draft",
})
`;
351
/**
 * Scaffold source for the draft knowledge-probe example task.
 *
 * runInit() writes this text to `example-knowledge-probe.task.ts` in the
 * tasks directory when no mode filter is given.
 *
 * The generated file imports from the `@sanity/ailf-core` specifier — the
 * same one the init "Next steps" output names. A path relative to this
 * CLI's own dist/ layout (`../_vendor/...`) cannot resolve from the
 * directory the file is written into.
 */
const PROBE_DRAFT_TASK_TS = `/**
 * Example Task: Knowledge probe baseline (DRAFT).
 *
 * Tests what the model knows about a topic without providing documentation.
 * Used to establish a baseline for comparison with literacy evaluations.
 * This task is a DRAFT — it won't run unless activated or explicitly targeted.
 *
 * To activate: change status to "active" or remove the status field.
 */

import { defineTask } from "@sanity/ailf-core"

export default defineTask({
  mode: "knowledge-probe",
  id: "example-knowledge-probe",
  title: "Model knowledge of GROQ syntax",
  description: "Example — probes baseline model knowledge (draft)",
  area: "groq",

  prompt: {
    text: \`Explain the GROQ query language used by Sanity. Cover:
1. Basic query syntax and projections
2. How to filter and sort results
3. Common patterns for fetching related documents
Provide working code examples.\`,
  },

  assertions: [
    {
      type: "llm-rubric",
      template: "task-completion",
      criteria: [
        "Demonstrates understanding of GROQ query syntax",
        "Shows filtering and projection patterns",
        "Code examples use valid GROQ syntax",
      ],
    },
  ],

  status: "draft",
})
`;
393
/**
 * Scaffold source for the custom preset file.
 *
 * runInit() writes this text to `preset.ts` in the .ailf directory when the
 * custom mode is selected and the output format is "ts".
 *
 * The generated file imports from the `@sanity/ailf-core` specifier rather
 * than a path relative to this CLI's own dist/ layout (`../_vendor/...`),
 * which cannot resolve from the directory the file is written into.
 */
const CUSTOM_PRESET_TS = `/**
 * Custom preset — your domain-specific evaluation configuration.
 *
 * This preset targets the "literacy" mode base and inherits its evaluation
 * methodology (rubrics, scoring profiles, prompt templates). You only need
 * to provide domain-specific configuration: where your docs live, what
 * features to track, and how to fetch documentation.
 *
 * To use a different mode (e.g., "mcp-server"), change the mode field.
 * Available built-in modes: literacy, mcp-server, knowledge-probe, agent-harness.
 *
 * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/PRESETS.md
 */

import { definePreset } from "@sanity/ailf-core"

export default definePreset({
  name: "my-docs-evaluation",
  manifest: {
    name: "my-docs-evaluation",
    version: "1.0.0",
    description: "Documentation literacy evaluation for my project.",
    pluginApiVersion: 1,
  },

  // Target the literacy mode base — inherits rubrics, scoring, prompts.
  // Change to "mcp-server" to evaluate MCP tool usage instead.
  mode: "literacy",

  // Source definitions — where your documentation lives.
  sourceDefs: [
    {
      name: "production",
      baseUrl: "https://docs.example.com",
      // projectId: "your-sanity-project-id",
      // dataset: "production",
    },
  ],

  // Feature registry — what product features you're tracking coverage for.
  featureDefs: {
    features: [
      {
        id: "getting-started",
        name: "Getting Started Guide",
        sections: ["guides"],
        status: "covered",
        area: "guides",
        priority: "critical",
      },
      {
        id: "api-reference",
        name: "API Reference",
        sections: ["reference"],
        status: "uncovered",
        priority: "high",
      },
    ],
  },

  // Optional: override mode base rubrics, scoring, or prompts here.
  // rubricTemplates: [{ ... }],
  // scoringProfiles: { ... },
  // promptTemplates: { ... },
})
`;
@@ -9,6 +9,10 @@
9
9
  * Uses @inquirer/prompts for a clean, modern terminal UI.
10
10
  */
11
11
  import { Command } from "commander";
12
+ import { LiteracyVariant } from "../pipeline/normalize-mode.js";
13
+ // CLI command name for the baseline snapshot management subcommand.
14
+ // Defined as a constant to avoid scattering the literal string across routing code.
15
+ const BASELINE_CMD = "baseline";
12
16
  export function createInteractiveCommand() {
13
17
  return new Command("interactive")
14
18
  .description("Guided wizard for common evaluation workflows")
@@ -65,7 +69,7 @@ async function runInteractiveWizard() {
65
69
  {
66
70
  description: "Save, compare, or list historical score snapshots",
67
71
  name: "Manage baselines",
68
- value: "baseline",
72
+ value: BASELINE_CMD,
69
73
  },
70
74
  {
71
75
  description: "Weekly evaluation trends and area summaries",
@@ -93,7 +97,7 @@ async function runInteractiveWizard() {
93
97
  });
94
98
  return { args: dryRun ? ["--dry-run"] : [], command: "weekly-digest" };
95
99
  }
96
- if (workflow === "baseline") {
100
+ if (workflow === BASELINE_CMD) {
97
101
  const subcommand = await select({
98
102
  choices: [
99
103
  { name: "Save current scores", value: "save" },
@@ -102,7 +106,7 @@ async function runInteractiveWizard() {
102
106
  ],
103
107
  message: "Baseline operation:",
104
108
  });
105
- return { args: [subcommand], command: "baseline" };
109
+ return { args: [subcommand], command: BASELINE_CMD };
106
110
  }
107
111
  if (workflow === "grader") {
108
112
  const subcommand = await select({
@@ -140,22 +144,22 @@ async function runInteractiveWizard() {
140
144
  {
141
145
  description: "Evaluate with pre-fetched documentation context",
142
146
  name: "Baseline (with docs vs without docs)",
143
- value: "baseline",
147
+ value: LiteracyVariant.STANDARD,
144
148
  },
145
149
  {
146
150
  description: "Baseline + record HTTP request patterns",
147
151
  name: "Observed (instrumented)",
148
- value: "observed",
152
+ value: LiteracyVariant.OBSERVED,
149
153
  },
150
154
  {
151
155
  description: "Agent searches for docs itself via web tools",
152
156
  name: "Agentic (agent-driven retrieval)",
153
- value: "agentic",
157
+ value: LiteracyVariant.AGENTIC,
154
158
  },
155
159
  ],
156
160
  message: "Evaluation mode:",
157
161
  });
158
- if (mode !== "baseline") {
162
+ if (mode !== LiteracyVariant.STANDARD) {
159
163
  args.push("--mode", mode);
160
164
  }
161
165
  // Step 3: Area scoping
@@ -31,6 +31,8 @@ export interface ResolvedOptions {
31
31
  headerArgs: string[];
32
32
  impactSummary?: ImpactSummary;
33
33
  mode: EvalMode;
34
+ /** Literacy variant — taken from an explicit --variant flag, otherwise inferred when the user passes a legacy mode name */
35
+ variant?: string;
34
36
  noAutoScope: boolean;
35
37
  noCache: boolean;
36
38
  noRemoteCache: boolean;
@@ -14,6 +14,7 @@ import { existsSync, readFileSync, writeFileSync } from "fs";
14
14
  import { dirname, resolve } from "path";
15
15
  import { fileURLToPath } from "url";
16
16
  import { classifyUrls } from "../pipeline/classify-url.js";
17
+ import { normalizeMode } from "../pipeline/normalize-mode.js";
17
18
  import { assessImpact, buildReverseMapping, } from "../pipeline/reverse-mapping.js";
18
19
  import { buildAppContext } from "../orchestration/build-app-context.js";
19
20
  import { buildStepSequence } from "../orchestration/build-step-sequence.js";
@@ -23,9 +24,8 @@ import { parseRepoConfig, } from "../adapters/task-sources/repo-schemas.js";
23
24
  const __dirname = dirname(fileURLToPath(import.meta.url));
24
25
  const ROOT = resolve(__dirname, "..", "..");
25
26
  // ---------------------------------------------------------------------------
26
- // Valid modes & search modes
27
+ // Valid search modes
27
28
  // ---------------------------------------------------------------------------
28
- const VALID_MODES = ["baseline", "observed", "agentic", "full"];
29
29
  const VALID_SEARCH_MODES = ["open", "origin-only", "off"];
30
30
  /**
31
31
  * Pure option resolution — computes ResolvedOptions from CLI flags without
@@ -36,10 +36,19 @@ const VALID_SEARCH_MODES = ["open", "origin-only", "off"];
36
36
  export function computeResolvedOptions(opts) {
37
37
  // Resolve paths relative to the caller's cwd, not the eval package root
38
38
  const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
39
- // Validate mode
40
- const mode = opts.mode;
41
- if (!VALID_MODES.includes(mode)) {
42
- console.error(`❌ Invalid mode "${opts.mode}". Must be one of: ${VALID_MODES.join(", ")}`);
39
+ // Validate + normalize mode via the single boundary function.
40
+ // normalizeMode() maps legacy variant names (baseline, agentic, etc.)
41
+ // to canonical mode "literacy" + variant, and throws on invalid input.
42
+ let mode;
43
+ let variant;
44
+ try {
45
+ const normalized = normalizeMode(opts.mode);
46
+ mode = normalized.mode;
47
+ // Explicit --variant flag takes precedence over what normalizeMode inferred
48
+ variant = opts.variant ?? normalized.variant;
49
+ }
50
+ catch (err) {
51
+ console.error(`❌ ${err instanceof Error ? err.message : String(err)}`);
43
52
  process.exit(1);
44
53
  }
45
54
  // Debug options — any sub-flag (--debug-n, --debug-pattern, --debug-sample)
@@ -220,6 +229,7 @@ export function computeResolvedOptions(opts) {
220
229
  headerArgs,
221
230
  impactSummary,
222
231
  mode,
232
+ variant,
223
233
  noAutoScope: opts.autoScope === false,
224
234
  noCache: !opts.cache,
225
235
  noRemoteCache: opts.remoteCache === false,
@@ -35,6 +35,7 @@ export interface PipelineCliOptions {
35
35
  header: string[];
36
36
  headers: string[];
37
37
  mode: string;
38
+ variant?: string;
38
39
  output?: string;
39
40
  promptfooUrl?: string;
40
41
  publish?: boolean;
@@ -8,11 +8,13 @@
8
8
  * @see docs/CLI.md for the full flag reference.
9
9
  */
10
10
  import { Command } from "commander";
11
+ import { LiteracyVariant } from "../pipeline/normalize-mode.js";
11
12
  import { addAgenticOptions, addDebugOptions, addSanitySourceOptions, } from "./shared/options.js";
12
13
  export function createPipelineCommand() {
13
14
  const cmd = new Command("pipeline")
14
15
  .description("Run the full evaluation pipeline")
15
- .option("-m, --mode <mode>", "Evaluation mode: full (default floor + ceiling + actual), baseline (floor + ceiling only), agentic (actual only), observed", "full")
16
+ .option("-m, --mode <mode>", "Evaluation mode: literacy (default), mcp-server, agent-harness, knowledge-probe, custom. Legacy aliases (baseline, agentic, observed, full) are accepted and normalized to literacy + variant.", LiteracyVariant.FULL)
17
+ .option("--variant <variant>", "Literacy variant: full (default — standard + agentic), baseline (standard only), agentic (agentic only), observed. Only applies to --mode literacy.")
16
18
  .option("-s, --source <name>", "Documentation source name (from sources.yaml)")
17
19
  .option("-n, --dry-run", "Validate configuration only, no execution", false)
18
20
  .option("--skip-fetch", "Reuse cached documentation contexts", false)
@@ -44,7 +46,7 @@ export function createPipelineCommand() {
44
46
  .option("--publish-tag <tag>", "Label for published report")
45
47
  .option("--report-dataset <name>", "Sanity dataset for report store")
46
48
  .option("--report-project <id>", "Sanity project ID for report store")
47
- .option("--config <path>", "Load pipeline config from a JSON/YAML file (overrides most CLI flags)")
49
+ .option("--config <path>", "Load pipeline config from a TS/JS/YAML/JSON file (overrides most CLI flags)")
48
50
  .option("-o, --output <path>", "Write PR comment markdown to file")
49
51
  .option("--promptfoo-url <url>", "Promptfoo share URL for report")
50
52
  .option("--task-source <type>", "Task definition source: content-lake (default — Sanity Content Lake), repo (repo tasks only, no Content Lake merge), yaml (tasks/*.yaml files, legacy)", "content-lake")
@@ -20,7 +20,7 @@ export function createPrCommentCommand() {
20
20
  try {
21
21
  const ctx = createAppContext({
22
22
  rootDir: ROOT,
23
- mode: "baseline",
23
+ mode: "literacy",
24
24
  noAutoScope: false,
25
25
  skipFetch: true,
26
26
  skipEval: true,
@@ -52,7 +52,7 @@ export function createPublishCommand() {
52
52
  */
53
53
  function buildProvenanceFromSummary(summary) {
54
54
  const areas = summary.scores.map((s) => s.feature);
55
- const mode = (process.env.EVAL_MODE ?? "baseline");
55
+ const mode = (process.env.EVAL_MODE ?? "literacy");
56
56
  const source = {
57
57
  baseUrl: summary.source?.baseUrl ?? "https://www.sanity.io/docs",
58
58
  dataset: summary.source?.dataset ?? process.env.SANITY_DATASET ?? "next",
@@ -83,7 +83,7 @@ async function runPublishCommand(summaryPath, opts) {
83
83
  compareEnabled: false,
84
84
  discoveryReportEnabled: false,
85
85
  gapAnalysisEnabled: false,
86
- mode: "baseline",
86
+ mode: "literacy",
87
87
  noAutoScope: false,
88
88
  noCache: true,
89
89
  noRemoteCache: true,
@@ -10,14 +10,14 @@ import { Command } from "commander";
10
10
  import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
11
11
  import { dirname, join, resolve } from "path";
12
12
  import { fileURLToPath } from "url";
13
- import { load } from "js-yaml";
13
+ import { ConfigNotFoundError, loadConfigFile, } from "../pipeline/compiler/config-loader.js";
14
14
  import { formatReadinessMarkdown, generateReadinessReport, } from "../pipeline/readiness-report.js";
15
15
  import { ThresholdConfigSchema, } from "../pipeline/schemas.js";
16
16
  const __dirname = dirname(fileURLToPath(import.meta.url));
17
17
  const ROOT = resolve(__dirname, "..", "..");
18
18
  const SCORE_SUMMARY_PATH = join(ROOT, "results", "latest", "score-summary.json");
19
19
  const GAP_ANALYSIS_PATH = join(ROOT, "results", "latest", "gap-analysis.json");
20
- const THRESHOLDS_PATH = join(ROOT, "config", "thresholds.yaml");
20
+ // thresholds loaded via loadConfigFile below
21
21
  const BASELINES_DIR = join(ROOT, "results", "baselines");
22
22
  export function createReadinessReportCommand() {
23
23
  return new Command("readiness-report")
@@ -33,12 +33,19 @@ export function createReadinessReportCommand() {
33
33
  }
34
34
  const scoreSummary = JSON.parse(readFileSync(SCORE_SUMMARY_PATH, "utf-8"));
35
35
  // Load threshold config
36
- if (!existsSync(THRESHOLDS_PATH)) {
37
- console.error(`❌ Threshold config not found at ${THRESHOLDS_PATH}.`);
36
+ let parsedThresholds;
37
+ try {
38
+ parsedThresholds = loadConfigFile("thresholds", ROOT).data;
39
+ }
40
+ catch (err) {
41
+ if (err instanceof ConfigNotFoundError) {
42
+ console.error("❌ Threshold config not found in config/.");
43
+ }
44
+ else {
45
+ console.error(`❌ Failed to load threshold config: ${err instanceof Error ? err.message : err}`);
46
+ }
38
47
  process.exit(1);
39
48
  }
40
- const rawThresholds = readFileSync(THRESHOLDS_PATH, "utf-8");
41
- const parsedThresholds = load(rawThresholds);
42
49
  const thresholdResult = ThresholdConfigSchema.safeParse(parsedThresholds);
43
50
  if (!thresholdResult.success) {
44
51
  const messages = thresholdResult.error.issues
@@ -1,7 +1,7 @@
1
1
  /**
2
- * validate-tasks command — standalone validation of repo-based task YAML files.
2
+ * validate-tasks command — standalone validation of task files.
3
3
  *
4
- * Validates .ailf/tasks/*.yaml files against the RepoTaskSchema without
4
+ * Validates .ailf/tasks/*.yaml files against the CanonicalTaskSchema without
5
5
  * running the full pipeline. Useful for pre-commit hooks and CI checks
6
6
  * in external repos.
7
7
  *