@sanity/ailf 0.4.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290)
  1. package/config/features.ts +23 -0
  2. package/config/models.ts +83 -0
  3. package/config/prompts.ts +16 -0
  4. package/config/rubrics.ts +225 -0
  5. package/config/schedules.ts +47 -0
  6. package/config/sinks.ts +37 -0
  7. package/config/sources.ts +21 -0
  8. package/config/thresholds.ts +61 -0
  9. package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
  10. package/dist/_vendor/ailf-core/config-helpers.js +150 -0
  11. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  12. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  13. package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
  14. package/dist/_vendor/ailf-core/examples/index.js +10 -10
  15. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  16. package/dist/_vendor/ailf-core/index.js +5 -0
  17. package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
  18. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  19. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  20. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  21. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  22. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  23. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  24. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
  25. package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
  26. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
  27. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
  28. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +32 -31
  29. package/dist/_vendor/ailf-core/schemas/pipeline.js +52 -12
  30. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  31. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  32. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  33. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  34. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  35. package/dist/_vendor/ailf-core/services/index.js +2 -1
  36. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  37. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  38. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  39. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  40. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  41. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  42. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  43. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  44. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  46. package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
  47. package/dist/_vendor/ailf-core/types/index.js +8 -1
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
  50. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  51. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  52. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  53. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  54. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  55. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  56. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  57. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  58. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  59. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  60. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  61. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  62. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  63. package/dist/_vendor/ailf-shared/index.js +0 -1
  64. package/dist/adapters/api-client/build-request.js +14 -13
  65. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  66. package/dist/adapters/config-sources/file-config-adapter.js +38 -12
  67. package/dist/adapters/config-sources/index.d.ts +2 -0
  68. package/dist/adapters/config-sources/index.js +1 -0
  69. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  70. package/dist/adapters/config-sources/ts-config-loader.js +133 -0
  71. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  72. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  73. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  74. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  75. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  76. package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
  77. package/dist/adapters/task-sources/index.d.ts +1 -0
  78. package/dist/adapters/task-sources/index.js +1 -0
  79. package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
  80. package/dist/adapters/task-sources/repo-task-source.js +69 -16
  81. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  82. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  83. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  84. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  85. package/dist/cli.js +0 -2
  86. package/dist/commands/baseline.js +4 -1
  87. package/dist/commands/calculate-scores.js +1 -1
  88. package/dist/commands/coverage-audit.js +7 -1
  89. package/dist/commands/explain-handler.js +25 -23
  90. package/dist/commands/fetch-docs.js +3 -2
  91. package/dist/commands/generate-configs.js +1 -1
  92. package/dist/commands/interactive.js +11 -7
  93. package/dist/commands/pipeline-action.d.ts +2 -0
  94. package/dist/commands/pipeline-action.js +16 -6
  95. package/dist/commands/pipeline.d.ts +1 -0
  96. package/dist/commands/pipeline.js +4 -2
  97. package/dist/commands/pr-comment.js +1 -1
  98. package/dist/commands/publish.js +2 -2
  99. package/dist/commands/readiness-report.js +13 -6
  100. package/dist/composition-root.d.ts +1 -1
  101. package/dist/composition-root.js +67 -4
  102. package/dist/orchestration/build-app-context.js +1 -0
  103. package/dist/orchestration/build-step-sequence.js +24 -6
  104. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  105. package/dist/orchestration/steps/fetch-docs-step.js +6 -4
  106. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  107. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  108. package/dist/orchestration/steps/generate-configs-step.js +245 -51
  109. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  110. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  111. package/dist/orchestration/steps/readiness-step.js +5 -6
  112. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  113. package/dist/orchestration/steps/run-eval-step.js +8 -7
  114. package/dist/pipeline/cache.d.ts +1 -1
  115. package/dist/pipeline/cache.js +36 -8
  116. package/dist/pipeline/calculate-scores.d.ts +5 -7
  117. package/dist/pipeline/calculate-scores.js +74 -153
  118. package/dist/pipeline/checks.js +2 -2
  119. package/dist/pipeline/compare.js +8 -8
  120. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  121. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  122. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  123. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  124. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  125. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  126. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  127. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  128. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  129. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
  130. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  131. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  132. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  133. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  134. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  135. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
  136. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  137. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  138. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  139. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  140. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  141. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  142. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  143. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  144. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  145. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  146. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  147. package/dist/pipeline/compiler/config-loader.js +111 -0
  148. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  149. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  150. package/dist/pipeline/compiler/hash.d.ts +11 -0
  151. package/dist/pipeline/compiler/hash.js +18 -0
  152. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  153. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  154. package/dist/pipeline/compiler/index.d.ts +29 -0
  155. package/dist/pipeline/compiler/index.js +45 -0
  156. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  157. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  158. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  159. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  160. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  161. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  162. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  163. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  164. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  165. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  166. package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
  167. package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
  168. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  169. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  170. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  171. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  172. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  173. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
  174. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
  175. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
  176. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  177. package/dist/pipeline/compiler/presets/index.js +8 -0
  178. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
  179. package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
  180. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  181. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  182. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  183. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  184. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  185. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  186. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  187. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  188. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  189. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  190. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  191. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  192. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  193. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  194. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  195. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  196. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  197. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  198. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  199. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  200. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  201. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  202. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  203. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  204. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  205. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  206. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  207. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  208. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  209. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  210. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  211. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  212. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  213. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  214. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  215. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  216. package/dist/pipeline/coverage-audit.d.ts +15 -5
  217. package/dist/pipeline/coverage-audit.js +41 -22
  218. package/dist/pipeline/eval-constants.d.ts +16 -6
  219. package/dist/pipeline/eval-constants.js +25 -4
  220. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  221. package/dist/pipeline/eval-fingerprint.js +8 -9
  222. package/dist/pipeline/expand-tasks.d.ts +23 -14
  223. package/dist/pipeline/expand-tasks.js +37 -31
  224. package/dist/pipeline/gap-analysis.d.ts +1 -1
  225. package/dist/pipeline/gap-analysis.js +2 -2
  226. package/dist/pipeline/generate-configs.d.ts +22 -4
  227. package/dist/pipeline/generate-configs.js +53 -24
  228. package/dist/pipeline/grader-api.d.ts +3 -3
  229. package/dist/pipeline/grader-api.js +5 -12
  230. package/dist/pipeline/grader-compare-runner.js +20 -27
  231. package/dist/pipeline/grader-comparison.d.ts +4 -8
  232. package/dist/pipeline/grader-comparison.js +11 -17
  233. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  234. package/dist/pipeline/grader-consistency-runner.js +18 -21
  235. package/dist/pipeline/grader-consistency.d.ts +6 -10
  236. package/dist/pipeline/grader-consistency.js +13 -32
  237. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  238. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  239. package/dist/pipeline/grader-sensitivity.js +10 -10
  240. package/dist/pipeline/grader-validate-runner.js +7 -5
  241. package/dist/pipeline/grader-validation.d.ts +2 -6
  242. package/dist/pipeline/grader-validation.js +14 -22
  243. package/dist/pipeline/map-request-to-config.js +6 -1
  244. package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
  245. package/dist/pipeline/mirror-repo-tasks.js +16 -15
  246. package/dist/pipeline/normalize-mode.d.ts +49 -0
  247. package/dist/pipeline/normalize-mode.js +64 -0
  248. package/dist/pipeline/plan.d.ts +5 -2
  249. package/dist/pipeline/plan.js +134 -78
  250. package/dist/pipeline/pr-comment.js +2 -0
  251. package/dist/pipeline/profile-resolution.d.ts +47 -0
  252. package/dist/pipeline/profile-resolution.js +91 -0
  253. package/dist/pipeline/provenance.d.ts +2 -2
  254. package/dist/pipeline/provenance.js +12 -17
  255. package/dist/pipeline/release-report.js +4 -4
  256. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  257. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  258. package/dist/pipeline/rubric-loader.d.ts +20 -0
  259. package/dist/pipeline/rubric-loader.js +37 -0
  260. package/dist/pipeline/validate.d.ts +4 -4
  261. package/dist/pipeline/validate.js +64 -53
  262. package/dist/schedules/loader.js +18 -8
  263. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  264. package/dist/scripts/migrate-task-mode.js +85 -0
  265. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  266. package/dist/scripts/validate-task-sources.d.ts +1 -1
  267. package/dist/scripts/validate-task-sources.js +15 -15
  268. package/dist/sinks/loader.js +5 -7
  269. package/dist/sources.d.ts +7 -7
  270. package/dist/sources.js +22 -24
  271. package/dist/webhook/dispatch.js +2 -1
  272. package/package.json +6 -3
  273. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  274. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  275. package/tasks/literacy/frameworks.task.ts +128 -0
  276. package/tasks/literacy/functions.task.ts +69 -0
  277. package/tasks/literacy/groq.task.ts +258 -0
  278. package/tasks/literacy/nextjs-live.task.ts +75 -0
  279. package/tasks/literacy/studio-setup.task.ts +131 -0
  280. package/tasks/literacy/visual-editing.task.ts +146 -0
  281. package/config/features.yaml +0 -116
  282. package/config/models.yaml +0 -116
  283. package/config/prompts.yaml +0 -75
  284. package/config/rubrics.yaml +0 -62
  285. package/config/schedules.yaml +0 -43
  286. package/config/sinks.yaml +0 -54
  287. package/config/sources.yaml +0 -51
  288. package/config/thresholds.yaml +0 -49
  289. package/dist/agent-observer/test-imports.d.ts +0 -7
  290. package/dist/agent-observer/test-imports.js +0 -185
@@ -23,6 +23,7 @@ import { TASK_FILE_NAMES } from "../_vendor/ailf-core/index.js";
23
23
  import { buildPipelinePlan, buildSimpleCommandPlan, } from "../pipeline/plan.js";
24
24
  import { formatPlanConsole, formatPlanJson } from "../pipeline/plan-format.js";
25
25
  import { computeResolvedOptions } from "./pipeline-action.js";
26
+ import { LiteracyVariant } from "../pipeline/normalize-mode.js";
26
27
  // ---------------------------------------------------------------------------
27
28
  // Registry
28
29
  // ---------------------------------------------------------------------------
@@ -84,8 +85,8 @@ const EXPLAIN_REGISTRY = {
84
85
  filesCreated: ["results/latest/score-summary.json"],
85
86
  filesRead: [
86
87
  "results/latest/eval-results.json",
87
- "config/rubrics.yaml",
88
- "config/models.yaml",
88
+ "config/rubrics.ts",
89
+ "config/models.ts",
89
90
  ],
90
91
  steps: [
91
92
  {
@@ -138,12 +139,12 @@ const EXPLAIN_REGISTRY = {
138
139
  },
139
140
  "coverage-audit": {
140
141
  description: "Cross-reference feature registry against evaluation tasks for coverage gaps",
141
- filesRead: ["config/features.yaml", "tasks/*.yaml"],
142
+ filesRead: ["config/features.ts", "tasks/*.{yaml,task.ts,task.js}"],
142
143
  steps: [
143
144
  {
144
145
  cacheStatus: "miss",
145
146
  name: "Load feature registry",
146
- reason: "Parse config/features.yaml for product feature list",
147
+ reason: "Parse config/features.ts for product feature list",
147
148
  willRun: true,
148
149
  },
149
150
  {
@@ -201,7 +202,7 @@ const EXPLAIN_REGISTRY = {
201
202
  "fetch-docs": {
202
203
  description: "Fetch documentation from Sanity CMS and generate canonical context files",
203
204
  filesCreated: ["contexts/canonical/*.md"],
204
- filesRead: ["config/sources.yaml", "config/models.yaml"],
205
+ filesRead: ["config/sources.ts", "config/models.ts"],
205
206
  steps: [
206
207
  {
207
208
  cacheStatus: "miss",
@@ -224,7 +225,7 @@ const EXPLAIN_REGISTRY = {
224
225
  ],
225
226
  },
226
227
  "generate-configs": {
227
- description: "Generate Promptfoo config files from models.yaml and task definitions",
228
+ description: "Generate Promptfoo config files from models.ts and task definitions",
228
229
  filesCreated: [
229
230
  "promptfooconfig.yaml",
230
231
  "promptfooconfig.observed.yaml",
@@ -232,16 +233,16 @@ const EXPLAIN_REGISTRY = {
232
233
  "tasks/.expanded.yaml",
233
234
  ],
234
235
  filesRead: [
235
- "config/models.yaml",
236
- "config/prompts.yaml",
237
- "config/rubrics.yaml",
238
- "config/sources.yaml",
236
+ "config/models.ts",
237
+ "config/prompts.ts",
238
+ "config/rubrics.ts",
239
+ "config/sources.ts",
239
240
  ],
240
241
  steps: [
241
242
  {
242
243
  cacheStatus: "miss",
243
244
  name: "Load models",
244
- reason: "Parse config/models.yaml for active model list",
245
+ reason: "Parse config/models.ts for active model list",
245
246
  willRun: true,
246
247
  },
247
248
  {
@@ -262,7 +263,7 @@ const EXPLAIN_REGISTRY = {
262
263
  description: "Grader reliability tools (consistency, compare, sensitivity, validate)",
263
264
  filesRead: [
264
265
  "results/latest/eval-results.json",
265
- "config/rubrics.yaml",
266
+ "config/rubrics.ts",
266
267
  "canonical/reference-solutions/",
267
268
  ],
268
269
  steps: [
@@ -369,7 +370,7 @@ const EXPLAIN_REGISTRY = {
369
370
  filesRead: [
370
371
  "results/latest/score-summary.json",
371
372
  "results/latest/gap-analysis.json",
372
- "config/thresholds.yaml",
373
+ "config/thresholds.ts",
373
374
  "results/baselines/",
374
375
  ],
375
376
  filesCreated: ["results/latest/readiness-report.md"],
@@ -377,7 +378,7 @@ const EXPLAIN_REGISTRY = {
377
378
  {
378
379
  cacheStatus: "miss",
379
380
  name: "Load scores + thresholds",
380
- reason: "Read score-summary.json and thresholds.yaml for gate evaluation",
381
+ reason: "Read score-summary.json and thresholds.ts for gate evaluation",
381
382
  willRun: true,
382
383
  },
383
384
  {
@@ -395,18 +396,18 @@ const EXPLAIN_REGISTRY = {
395
396
  ],
396
397
  },
397
398
  validate: {
398
- description: "Validate all YAML config files, task definitions, reference solutions, and environment",
399
+ description: "Validate all config files, task definitions, reference solutions, and environment",
399
400
  filesRead: [
400
- "config/models.yaml",
401
- "config/rubrics.yaml",
402
- "config/features.yaml",
403
- "config/thresholds.yaml",
401
+ "config/models.ts",
402
+ "config/rubrics.ts",
403
+ "config/features.ts",
404
+ "config/thresholds.ts",
404
405
  ],
405
406
  steps: [
406
407
  {
407
408
  cacheStatus: "miss",
408
409
  name: "Validate configuration",
409
- reason: "Parse all YAML configs through Zod schemas, cross-reference mappings",
410
+ reason: "Parse all config files through Zod schemas, cross-reference mappings",
410
411
  willRun: true,
411
412
  },
412
413
  {
@@ -454,12 +455,12 @@ const EXPLAIN_REGISTRY = {
454
455
  },
455
456
  "weekly-digest": {
456
457
  description: "Generate and deliver a weekly evaluation trend digest via Slack",
457
- filesRead: ["config/schedules.yaml", "config/sinks.yaml"],
458
+ filesRead: ["config/schedules.ts", "config/sinks.ts"],
458
459
  steps: [
459
460
  {
460
461
  cacheStatus: "miss",
461
462
  name: "Load digest config",
462
- reason: "Read schedules.yaml for lookback window and delivery targets",
463
+ reason: "Read schedules.ts for lookback window and delivery targets",
463
464
  willRun: true,
464
465
  },
465
466
  {
@@ -670,7 +671,7 @@ async function buildPipelineExplainPlan(actionCommand, rootDir) {
670
671
  graderReplications: raw.graderReplications,
671
672
  header: raw.header ?? [],
672
673
  headers: raw.headers ?? [],
673
- mode: raw.mode ?? "full",
674
+ mode: raw.mode ?? LiteracyVariant.FULL,
674
675
  output: raw.output,
675
676
  promptfooUrl: raw.promptfooUrl,
676
677
  publish: raw.publish,
@@ -714,6 +715,7 @@ async function buildPipelineExplainPlan(actionCommand, rootDir) {
714
715
  gapAnalysisEnabled: resolved.gapAnalysisEnabled,
715
716
  graderReplications: resolved.graderReplications,
716
717
  mode: resolved.mode,
718
+ variant: resolved.variant,
717
719
  noCache: resolved.noCache,
718
720
  publishEnabled: resolved.publishEnabled,
719
721
  readinessEnabled: resolved.readinessEnabled,
@@ -41,7 +41,7 @@ async function executeFetchDocs(opts) {
41
41
  // Build a minimal ResolvedConfig for the composition root
42
42
  const ctx = createAppContext({
43
43
  rootDir: ROOT,
44
- mode: "baseline",
44
+ mode: "literacy",
45
45
  noAutoScope: false,
46
46
  skipFetch: false,
47
47
  skipEval: true,
@@ -83,7 +83,8 @@ async function executeFetchDocs(opts) {
83
83
  }
84
84
  // Canonical contexts — same code path as the pipeline
85
85
  const tasks = await ctx.taskSource.loadTasks();
86
- const tasksWithDocs = tasks.filter((t) => t.canonicalDocs.length > 0);
86
+ // Bridge: narrow to literacy tasks with docs (only literacy tasks have context.docs)
87
+ const tasksWithDocs = tasks.filter((t) => t.mode === "literacy" && (t.context?.docs?.length ?? 0) > 0);
87
88
  if (tasksWithDocs.length > 0) {
88
89
  console.log("\nGenerating canonical (gold-retrieval) contexts...\n");
89
90
  const result = await fetcher.fetch(tasksWithDocs, resolvedSource);
@@ -19,7 +19,7 @@ export function createGenerateConfigsCommand() {
19
19
  try {
20
20
  const ctx = createAppContext({
21
21
  rootDir: ROOT,
22
- mode: "baseline",
22
+ mode: "literacy",
23
23
  noAutoScope: false,
24
24
  skipFetch: true,
25
25
  skipEval: true,
@@ -9,6 +9,10 @@
9
9
  * Uses @inquirer/prompts for a clean, modern terminal UI.
10
10
  */
11
11
  import { Command } from "commander";
12
+ import { LiteracyVariant } from "../pipeline/normalize-mode.js";
13
+ // CLI command name for the baseline snapshot management subcommand.
14
+ // Defined as a constant to avoid scattering the literal string across routing code.
15
+ const BASELINE_CMD = "baseline";
12
16
  export function createInteractiveCommand() {
13
17
  return new Command("interactive")
14
18
  .description("Guided wizard for common evaluation workflows")
@@ -65,7 +69,7 @@ async function runInteractiveWizard() {
65
69
  {
66
70
  description: "Save, compare, or list historical score snapshots",
67
71
  name: "Manage baselines",
68
- value: "baseline",
72
+ value: BASELINE_CMD,
69
73
  },
70
74
  {
71
75
  description: "Weekly evaluation trends and area summaries",
@@ -93,7 +97,7 @@ async function runInteractiveWizard() {
93
97
  });
94
98
  return { args: dryRun ? ["--dry-run"] : [], command: "weekly-digest" };
95
99
  }
96
- if (workflow === "baseline") {
100
+ if (workflow === BASELINE_CMD) {
97
101
  const subcommand = await select({
98
102
  choices: [
99
103
  { name: "Save current scores", value: "save" },
@@ -102,7 +106,7 @@ async function runInteractiveWizard() {
102
106
  ],
103
107
  message: "Baseline operation:",
104
108
  });
105
- return { args: [subcommand], command: "baseline" };
109
+ return { args: [subcommand], command: BASELINE_CMD };
106
110
  }
107
111
  if (workflow === "grader") {
108
112
  const subcommand = await select({
@@ -140,22 +144,22 @@ async function runInteractiveWizard() {
140
144
  {
141
145
  description: "Evaluate with pre-fetched documentation context",
142
146
  name: "Baseline (with docs vs without docs)",
143
- value: "baseline",
147
+ value: LiteracyVariant.STANDARD,
144
148
  },
145
149
  {
146
150
  description: "Baseline + record HTTP request patterns",
147
151
  name: "Observed (instrumented)",
148
- value: "observed",
152
+ value: LiteracyVariant.OBSERVED,
149
153
  },
150
154
  {
151
155
  description: "Agent searches for docs itself via web tools",
152
156
  name: "Agentic (agent-driven retrieval)",
153
- value: "agentic",
157
+ value: LiteracyVariant.AGENTIC,
154
158
  },
155
159
  ],
156
160
  message: "Evaluation mode:",
157
161
  });
158
- if (mode !== "baseline") {
162
+ if (mode !== LiteracyVariant.STANDARD) {
159
163
  args.push("--mode", mode);
160
164
  }
161
165
  // Step 3: Area scoping
@@ -31,6 +31,8 @@ export interface ResolvedOptions {
31
31
  headerArgs: string[];
32
32
  impactSummary?: ImpactSummary;
33
33
  mode: EvalMode;
34
+ /** Literacy variant — set when the user passes a legacy mode name */
35
+ variant?: string;
34
36
  noAutoScope: boolean;
35
37
  noCache: boolean;
36
38
  noRemoteCache: boolean;
@@ -14,6 +14,7 @@ import { existsSync, readFileSync, writeFileSync } from "fs";
14
14
  import { dirname, resolve } from "path";
15
15
  import { fileURLToPath } from "url";
16
16
  import { classifyUrls } from "../pipeline/classify-url.js";
17
+ import { normalizeMode } from "../pipeline/normalize-mode.js";
17
18
  import { assessImpact, buildReverseMapping, } from "../pipeline/reverse-mapping.js";
18
19
  import { buildAppContext } from "../orchestration/build-app-context.js";
19
20
  import { buildStepSequence } from "../orchestration/build-step-sequence.js";
@@ -23,9 +24,8 @@ import { parseRepoConfig, } from "../adapters/task-sources/repo-schemas.js";
23
24
  const __dirname = dirname(fileURLToPath(import.meta.url));
24
25
  const ROOT = resolve(__dirname, "..", "..");
25
26
  // ---------------------------------------------------------------------------
26
- // Valid modes & search modes
27
+ // Valid search modes
27
28
  // ---------------------------------------------------------------------------
28
- const VALID_MODES = ["baseline", "observed", "agentic", "full"];
29
29
  const VALID_SEARCH_MODES = ["open", "origin-only", "off"];
30
30
  /**
31
31
  * Pure option resolution — computes ResolvedOptions from CLI flags without
@@ -36,10 +36,19 @@ const VALID_SEARCH_MODES = ["open", "origin-only", "off"];
36
36
  export function computeResolvedOptions(opts) {
37
37
  // Resolve paths relative to the caller's cwd, not the eval package root
38
38
  const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
39
- // Validate mode
40
- const mode = opts.mode;
41
- if (!VALID_MODES.includes(mode)) {
42
- console.error(`❌ Invalid mode "${opts.mode}". Must be one of: ${VALID_MODES.join(", ")}`);
39
+ // Validate + normalize mode via the single boundary function.
40
+ // normalizeMode() maps legacy variant names (baseline, agentic, etc.)
41
+ // to canonical mode "literacy" + variant, and throws on invalid input.
42
+ let mode;
43
+ let variant;
44
+ try {
45
+ const normalized = normalizeMode(opts.mode);
46
+ mode = normalized.mode;
47
+ // Explicit --variant flag takes precedence over what normalizeMode inferred
48
+ variant = opts.variant ?? normalized.variant;
49
+ }
50
+ catch (err) {
51
+ console.error(`❌ ${err instanceof Error ? err.message : String(err)}`);
43
52
  process.exit(1);
44
53
  }
45
54
  // Debug options — any sub-flag (--debug-n, --debug-pattern, --debug-sample)
@@ -220,6 +229,7 @@ export function computeResolvedOptions(opts) {
220
229
  headerArgs,
221
230
  impactSummary,
222
231
  mode,
232
+ variant,
223
233
  noAutoScope: opts.autoScope === false,
224
234
  noCache: !opts.cache,
225
235
  noRemoteCache: opts.remoteCache === false,
@@ -35,6 +35,7 @@ export interface PipelineCliOptions {
35
35
  header: string[];
36
36
  headers: string[];
37
37
  mode: string;
38
+ variant?: string;
38
39
  output?: string;
39
40
  promptfooUrl?: string;
40
41
  publish?: boolean;
@@ -8,11 +8,13 @@
8
8
  * @see docs/CLI.md for the full flag reference.
9
9
  */
10
10
  import { Command } from "commander";
11
+ import { LiteracyVariant } from "../pipeline/normalize-mode.js";
11
12
  import { addAgenticOptions, addDebugOptions, addSanitySourceOptions, } from "./shared/options.js";
12
13
  export function createPipelineCommand() {
13
14
  const cmd = new Command("pipeline")
14
15
  .description("Run the full evaluation pipeline")
15
- .option("-m, --mode <mode>", "Evaluation mode: full (default floor + ceiling + actual), baseline (floor + ceiling only), agentic (actual only), observed", "full")
16
+ .option("-m, --mode <mode>", "Evaluation mode: literacy (default), mcp-server, agent-harness, knowledge-probe, custom. Legacy aliases (baseline, agentic, observed, full) are accepted and normalized to literacy + variant.", LiteracyVariant.FULL)
17
+ .option("--variant <variant>", "Literacy variant: full (default — standard + agentic), baseline (standard only), agentic (agentic only), observed. Only applies to --mode literacy.")
16
18
  .option("-s, --source <name>", "Documentation source name (from sources.yaml)")
17
19
  .option("-n, --dry-run", "Validate configuration only, no execution", false)
18
20
  .option("--skip-fetch", "Reuse cached documentation contexts", false)
@@ -44,7 +46,7 @@ export function createPipelineCommand() {
44
46
  .option("--publish-tag <tag>", "Label for published report")
45
47
  .option("--report-dataset <name>", "Sanity dataset for report store")
46
48
  .option("--report-project <id>", "Sanity project ID for report store")
47
- .option("--config <path>", "Load pipeline config from a JSON/YAML file (overrides most CLI flags)")
49
+ .option("--config <path>", "Load pipeline config from a TS/JS/YAML/JSON file (overrides most CLI flags)")
48
50
  .option("-o, --output <path>", "Write PR comment markdown to file")
49
51
  .option("--promptfoo-url <url>", "Promptfoo share URL for report")
50
52
  .option("--task-source <type>", "Task definition source: content-lake (default — Sanity Content Lake), repo (repo tasks only, no Content Lake merge), yaml (tasks/*.yaml files, legacy)", "content-lake")
@@ -20,7 +20,7 @@ export function createPrCommentCommand() {
20
20
  try {
21
21
  const ctx = createAppContext({
22
22
  rootDir: ROOT,
23
- mode: "baseline",
23
+ mode: "literacy",
24
24
  noAutoScope: false,
25
25
  skipFetch: true,
26
26
  skipEval: true,
@@ -52,7 +52,7 @@ export function createPublishCommand() {
52
52
  */
53
53
  function buildProvenanceFromSummary(summary) {
54
54
  const areas = summary.scores.map((s) => s.feature);
55
- const mode = (process.env.EVAL_MODE ?? "baseline");
55
+ const mode = (process.env.EVAL_MODE ?? "literacy");
56
56
  const source = {
57
57
  baseUrl: summary.source?.baseUrl ?? "https://www.sanity.io/docs",
58
58
  dataset: summary.source?.dataset ?? process.env.SANITY_DATASET ?? "next",
@@ -83,7 +83,7 @@ async function runPublishCommand(summaryPath, opts) {
83
83
  compareEnabled: false,
84
84
  discoveryReportEnabled: false,
85
85
  gapAnalysisEnabled: false,
86
- mode: "baseline",
86
+ mode: "literacy",
87
87
  noAutoScope: false,
88
88
  noCache: true,
89
89
  noRemoteCache: true,
@@ -10,14 +10,14 @@ import { Command } from "commander";
10
10
  import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
11
11
  import { dirname, join, resolve } from "path";
12
12
  import { fileURLToPath } from "url";
13
- import { load } from "js-yaml";
13
+ import { ConfigNotFoundError, loadConfigFile, } from "../pipeline/compiler/config-loader.js";
14
14
  import { formatReadinessMarkdown, generateReadinessReport, } from "../pipeline/readiness-report.js";
15
15
  import { ThresholdConfigSchema, } from "../pipeline/schemas.js";
16
16
  const __dirname = dirname(fileURLToPath(import.meta.url));
17
17
  const ROOT = resolve(__dirname, "..", "..");
18
18
  const SCORE_SUMMARY_PATH = join(ROOT, "results", "latest", "score-summary.json");
19
19
  const GAP_ANALYSIS_PATH = join(ROOT, "results", "latest", "gap-analysis.json");
20
- const THRESHOLDS_PATH = join(ROOT, "config", "thresholds.yaml");
20
+ // thresholds loaded via loadConfigFile below
21
21
  const BASELINES_DIR = join(ROOT, "results", "baselines");
22
22
  export function createReadinessReportCommand() {
23
23
  return new Command("readiness-report")
@@ -33,12 +33,19 @@ export function createReadinessReportCommand() {
33
33
  }
34
34
  const scoreSummary = JSON.parse(readFileSync(SCORE_SUMMARY_PATH, "utf-8"));
35
35
  // Load threshold config
36
- if (!existsSync(THRESHOLDS_PATH)) {
37
- console.error(`❌ Threshold config not found at ${THRESHOLDS_PATH}.`);
36
+ let parsedThresholds;
37
+ try {
38
+ parsedThresholds = loadConfigFile("thresholds", ROOT).data;
39
+ }
40
+ catch (err) {
41
+ if (err instanceof ConfigNotFoundError) {
42
+ console.error("❌ Threshold config not found in config/.");
43
+ }
44
+ else {
45
+ console.error(`❌ Failed to load threshold config: ${err instanceof Error ? err.message : err}`);
46
+ }
38
47
  process.exit(1);
39
48
  }
40
- const rawThresholds = readFileSync(THRESHOLDS_PATH, "utf-8");
41
- const parsedThresholds = load(rawThresholds);
42
49
  const thresholdResult = ThresholdConfigSchema.safeParse(parsedThresholds);
43
50
  if (!thresholdResult.success) {
44
51
  const messages = thresholdResult.error.issues
@@ -15,7 +15,7 @@
15
15
  * @see packages/core/src/ports/context.ts — AppContext interface
16
16
  * @see docs/exec-plans/ports-and-adapters/phase-7-composition-root.md
17
17
  */
18
- import type { AppContext, ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
18
+ import { type AppContext, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
19
19
  /**
20
20
  * Create a fully wired AppContext from resolved configuration.
21
21
  *
@@ -15,12 +15,13 @@
15
15
  * @see packages/core/src/ports/context.ts — AppContext interface
16
16
  * @see docs/exec-plans/ports-and-adapters/phase-7-composition-root.md
17
17
  */
18
+ import { InMemoryPluginRegistry, } from "./_vendor/ailf-core/index.js";
18
19
  import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
19
20
  import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
20
- import { SanityDocFetcher } from "./adapters/doc-fetchers/index.js";
21
21
  import { PromptfooEvalAdapter } from "./adapters/eval-runners/promptfoo-eval-adapter.js";
22
22
  import { ConsoleLogger, JsonLogger, QuietLogger, } from "./adapters/loggers/index.js";
23
23
  import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource, YamlTaskSource, } from "./adapters/task-sources/index.js";
24
+ import { createSanityLiteracyPreset } from "./pipeline/compiler/presets/index.js";
24
25
  import { getSanityClient } from "./sanity/client.js";
25
26
  import { ReportStore } from "./report-store.js";
26
27
  import { loadSinks } from "./sinks/index.js";
@@ -38,13 +39,18 @@ export function createAppContext(config) {
38
39
  const cache = config.noCache ? undefined : createCache(config);
39
40
  // Task source — selected by config.taskSourceType
40
41
  const taskSource = createTaskSource(config);
41
- // Doc fetcher — Sanity Content Lake
42
- const docFetcher = new SanityDocFetcher(config.rootDir);
42
+ // Plugin registry — mode handlers, assertions, rubric templates, doc fetcher.
43
+ // The Sanity preset is registered here with config.rootDir so its doc fetcher
44
+ // factory resolves paths relative to the eval package root (not cwd).
45
+ const registry = createRegistry(config.rootDir);
46
+ // Doc fetcher — provided by the registered preset's factory
47
+ const docFetcherFactory = registry.getDocFetcherFactory();
48
+ const docFetcher = docFetcherFactory ? docFetcherFactory() : undefined;
43
49
  // Eval runner — Promptfoo subprocess
44
50
  const evalRunner = new PromptfooEvalAdapter(config.rootDir);
45
51
  // Report store — Sanity Content Lake (for publish + auto-compare)
46
52
  const reportStore = createReportStore(config);
47
- // Sinks — loaded from config/sinks.yaml
53
+ // Sinks — loaded from config/sinks
48
54
  const sinks = loadSinks();
49
55
  return {
50
56
  cache,
@@ -52,6 +58,7 @@ export function createAppContext(config) {
52
58
  docFetcher,
53
59
  evalRunner,
54
60
  logger,
61
+ registry,
55
62
  reportStore,
56
63
  sinks,
57
64
  taskSource,
@@ -113,6 +120,62 @@ function createTaskSource(config) {
113
120
  }
114
121
  return primary;
115
122
  }
123
+ // ---------------------------------------------------------------------------
124
+ // Built-in mode registrations for non-literacy modes
125
+ // ---------------------------------------------------------------------------
126
+ const BUILT_IN_MODES = [
127
+ {
128
+ id: "knowledge-probe",
129
+ label: "Knowledge Probe",
130
+ validProviderPatterns: ["^openai:", "^anthropic:", "^file://"],
131
+ rubricTemplateIds: [],
132
+ handlerModule: "./mode-handlers/knowledge-probe-handler.js",
133
+ },
134
+ {
135
+ id: "mcp-server",
136
+ label: "MCP Server Testing",
137
+ validProviderPatterns: ["^mcp:", "^file://"],
138
+ rubricTemplateIds: [
139
+ "mcp-input-validation",
140
+ "mcp-output-correctness",
141
+ "mcp-error-handling",
142
+ ],
143
+ handlerModule: "./mode-handlers/mcp-server-handler.js",
144
+ },
145
+ {
146
+ id: "agent-harness",
147
+ label: "Agent Harness",
148
+ validProviderPatterns: ["^openai:", "^anthropic:", "^file://"],
149
+ rubricTemplateIds: [],
150
+ handlerModule: "./mode-handlers/agent-harness-handler.js",
151
+ },
152
+ ];
153
+ /**
154
+ * Build and populate the plugin registry.
155
+ *
156
+ * Preset registration flow:
157
+ * 1. A preset is a PresetDefinition — a bundle of modes, assertions, rubric
158
+ * templates, prompt templates, scoring profiles, a doc fetcher factory,
159
+ * source definitions, and feature definitions.
160
+ * 2. registerPreset() iterates the preset's fields and delegates each one to
161
+ * the appropriate register method (registerMode, registerRubricTemplate, …).
162
+ * 3. After registration the rest of createAppContext() can pull capabilities
163
+ * from the registry (e.g. getDocFetcherFactory()) without knowing which
164
+ * preset provided them.
165
+ *
166
+ * To add a new preset: create a PresetDefinition, then call
167
+ * registry.registerPreset() here before the built-in mode registrations.
168
+ */
169
+ function createRegistry(rootDir) {
170
+ const registry = new InMemoryPluginRegistry();
171
+ // Register the sanity-literacy preset — the Sanity-specific evaluation bundle.
172
+ registry.registerPreset(createSanityLiteracyPreset({ rootDir }));
173
+ // Register other built-in modes (not part of any preset yet)
174
+ for (const mode of BUILT_IN_MODES) {
175
+ registry.registerMode(mode);
176
+ }
177
+ return registry;
178
+ }
116
179
  function createReportStore(config) {
117
180
  return new ReportStore({
118
181
  dataset: process.env.AILF_REPORT_DATASET ??
@@ -20,6 +20,7 @@ export function mapToResolvedConfig(opts, rootDir) {
20
20
  return {
21
21
  rootDir,
22
22
  mode: opts.mode,
23
+ variant: opts.variant,
23
24
  noAutoScope: opts.noAutoScope ?? false,
24
25
  debug: opts.debug,
25
26
  areas: opts.areaOption
@@ -5,7 +5,7 @@
5
5
  * PipelineStep objects determined by config flags like skipFetch,
6
6
  * skipEval, compareEnabled, etc.
7
7
  */
8
- import { FULL_MODE_SUBMODES } from "../_vendor/ailf-shared/index.js";
8
+ import { LiteracyVariant } from "../pipeline/normalize-mode.js";
9
9
  import { CallbackStep } from "./steps/callback-step.js";
10
10
  import { CalculateScoresStep } from "./steps/calculate-scores-step.js";
11
11
  import { CompareStep } from "./steps/compare-step.js";
@@ -40,11 +40,29 @@ export function buildStepSequence(ctx, pipelineStart = Date.now()) {
40
40
  // Step 2: Generate Promptfoo configs
41
41
  steps.push(new GenerateConfigsStep());
42
42
  // Step 3: Run evaluation (steps handle --skip-eval internally)
43
- const modes = config.mode === "full"
44
- ? [...FULL_MODE_SUBMODES]
45
- : [config.mode];
46
- for (const mode of modes) {
47
- steps.push(new RunEvalStep(mode));
43
+ //
44
+ // For literacy mode, the variant determines how many eval steps run:
45
+ // "full" → baseline + agentic (two steps)
46
+ // "baseline" / "agentic" / "observed" → one step
47
+ // undefined → defaults to baseline
48
+ //
49
+ // For all other modes, one eval step per mode.
50
+ if (config.mode === "literacy") {
51
+ const variant = config.variant ?? LiteracyVariant.STANDARD;
52
+ if (variant === LiteracyVariant.FULL) {
53
+ for (const submode of [
54
+ LiteracyVariant.STANDARD,
55
+ LiteracyVariant.AGENTIC,
56
+ ]) {
57
+ steps.push(new RunEvalStep(submode));
58
+ }
59
+ }
60
+ else {
61
+ steps.push(new RunEvalStep(variant));
62
+ }
63
+ }
64
+ else {
65
+ steps.push(new RunEvalStep(config.mode));
48
66
  }
49
67
  // Step 3c: Grader consistency (optional, conditional)
50
68
  if (config.graderReplications) {