@sanity/ailf 0.5.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377) hide show
  1. package/README.md +0 -1
  2. package/config/features.ts +23 -0
  3. package/config/models.ts +95 -0
  4. package/config/prompts.ts +16 -0
  5. package/config/rubrics.ts +225 -0
  6. package/config/schedules.ts +47 -0
  7. package/config/sinks.ts +37 -0
  8. package/config/sources.ts +21 -0
  9. package/config/thresholds.ts +61 -0
  10. package/dist/_vendor/ailf-core/config-helpers.d.ts +171 -0
  11. package/dist/_vendor/ailf-core/config-helpers.js +170 -0
  12. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  13. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  14. package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
  15. package/dist/_vendor/ailf-core/examples/index.js +25 -0
  16. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  17. package/dist/_vendor/ailf-core/index.js +5 -0
  18. package/dist/_vendor/ailf-core/ports/context.d.ts +17 -2
  19. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  20. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  21. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  22. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  23. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  24. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  25. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +8 -2
  26. package/dist/_vendor/ailf-core/schemas/eval-config.js +17 -2
  27. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +9 -3
  28. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +8 -1
  29. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -31
  30. package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -9
  31. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  32. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  33. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  34. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  35. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  36. package/dist/_vendor/ailf-core/services/index.js +2 -1
  37. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  38. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  39. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  40. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  41. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  42. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  43. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  44. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +332 -0
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  47. package/dist/_vendor/ailf-core/types/index.d.ts +45 -83
  48. package/dist/_vendor/ailf-core/types/index.js +8 -1
  49. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +257 -0
  50. package/dist/_vendor/ailf-core/types/plugin-registry.js +185 -0
  51. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  52. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  53. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  54. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  55. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  56. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  57. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  58. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  59. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  60. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  61. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  62. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  63. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  64. package/dist/_vendor/ailf-shared/index.js +0 -1
  65. package/dist/adapters/api-client/build-request.js +14 -13
  66. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  67. package/dist/adapters/config-sources/file-config-adapter.js +39 -12
  68. package/dist/adapters/config-sources/index.d.ts +2 -0
  69. package/dist/adapters/config-sources/index.js +1 -0
  70. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  71. package/dist/adapters/config-sources/ts-config-loader.js +141 -0
  72. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  73. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  74. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  75. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  76. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  77. package/dist/adapters/task-sources/content-lake-task-source.js +35 -39
  78. package/dist/adapters/task-sources/index.d.ts +3 -2
  79. package/dist/adapters/task-sources/index.js +3 -2
  80. package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
  81. package/dist/adapters/task-sources/repo-schemas.js +227 -19
  82. package/dist/adapters/task-sources/repo-task-source.d.ts +16 -12
  83. package/dist/adapters/task-sources/repo-task-source.js +92 -80
  84. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  85. package/dist/adapters/task-sources/repo-validation.js +126 -5
  86. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  87. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  88. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  89. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  90. package/dist/cli.js +0 -2
  91. package/dist/commands/baseline.js +4 -1
  92. package/dist/commands/calculate-scores.js +1 -1
  93. package/dist/commands/coverage-audit.js +9 -1
  94. package/dist/commands/explain-handler.js +25 -23
  95. package/dist/commands/fetch-docs.js +3 -2
  96. package/dist/commands/generate-configs.js +1 -1
  97. package/dist/commands/init.d.ts +6 -4
  98. package/dist/commands/init.js +302 -23
  99. package/dist/commands/interactive.js +11 -7
  100. package/dist/commands/pipeline-action.d.ts +2 -0
  101. package/dist/commands/pipeline-action.js +16 -6
  102. package/dist/commands/pipeline.d.ts +1 -0
  103. package/dist/commands/pipeline.js +4 -2
  104. package/dist/commands/pr-comment.js +1 -1
  105. package/dist/commands/publish.js +2 -2
  106. package/dist/commands/readiness-report.js +13 -6
  107. package/dist/commands/validate-tasks.d.ts +2 -2
  108. package/dist/commands/validate-tasks.js +26 -15
  109. package/dist/composition-root.d.ts +13 -1
  110. package/dist/composition-root.js +99 -4
  111. package/dist/index.d.ts +41 -0
  112. package/dist/index.js +48 -0
  113. package/dist/orchestration/build-app-context.js +1 -0
  114. package/dist/orchestration/build-step-sequence.js +28 -8
  115. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  116. package/dist/orchestration/steps/fetch-docs-step.js +8 -7
  117. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  118. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  119. package/dist/orchestration/steps/generate-configs-step.js +261 -51
  120. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  121. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  122. package/dist/orchestration/steps/readiness-step.js +5 -6
  123. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  124. package/dist/orchestration/steps/run-eval-step.js +8 -7
  125. package/dist/pipeline/cache.d.ts +1 -1
  126. package/dist/pipeline/cache.js +36 -8
  127. package/dist/pipeline/calculate-scores.d.ts +2 -4
  128. package/dist/pipeline/calculate-scores.js +43 -113
  129. package/dist/pipeline/checks.js +2 -2
  130. package/dist/pipeline/compare.js +8 -8
  131. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  132. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  133. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  134. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  135. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  136. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  137. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  138. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  139. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  140. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +392 -0
  141. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  142. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  143. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  144. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  145. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  146. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +404 -0
  147. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  148. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  149. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  150. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  151. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  152. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  153. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  154. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  155. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  156. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  157. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  158. package/dist/pipeline/compiler/config-loader.js +111 -0
  159. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  160. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  161. package/dist/pipeline/compiler/hash.d.ts +11 -0
  162. package/dist/pipeline/compiler/hash.js +18 -0
  163. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  164. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  165. package/dist/pipeline/compiler/index.d.ts +29 -0
  166. package/dist/pipeline/compiler/index.js +45 -0
  167. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  168. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  169. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  170. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  171. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  172. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  173. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  174. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  175. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
  176. package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
  177. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  178. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  179. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  180. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  181. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  182. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  183. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  184. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  185. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  186. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  187. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  188. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  189. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  190. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  191. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  192. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  193. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  194. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  195. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  196. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  197. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  198. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  199. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  200. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  201. package/dist/pipeline/compiler/mode-handlers/index.d.ts +15 -0
  202. package/dist/pipeline/compiler/mode-handlers/index.js +19 -0
  203. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  204. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  205. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  206. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  207. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  208. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  209. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  210. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  211. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  212. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  213. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  214. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  215. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  216. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  217. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  218. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  219. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  220. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  221. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  222. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  223. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  224. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  225. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  226. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  227. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  228. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  229. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  230. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  231. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  232. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  233. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  234. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  235. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  236. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
  237. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  238. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  239. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  240. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  241. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  242. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
  243. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
  245. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  246. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  247. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
  248. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
  249. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
  250. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  251. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  252. package/dist/pipeline/compiler/preset-loader.js +99 -0
  253. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  254. package/dist/pipeline/compiler/presets/index.js +8 -0
  255. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +42 -0
  256. package/dist/pipeline/compiler/presets/sanity-literacy.js +208 -0
  257. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  258. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  259. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  260. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  261. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  262. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  263. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  264. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  265. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  266. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  267. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  268. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  269. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  270. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  271. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  272. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  273. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  274. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  275. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  276. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  277. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  278. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  279. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  280. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  281. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  282. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  283. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  284. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  285. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  286. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  287. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  288. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  289. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  290. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  291. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  292. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  293. package/dist/pipeline/coverage-audit.d.ts +15 -5
  294. package/dist/pipeline/coverage-audit.js +41 -22
  295. package/dist/pipeline/eval-constants.d.ts +16 -6
  296. package/dist/pipeline/eval-constants.js +25 -4
  297. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  298. package/dist/pipeline/eval-fingerprint.js +8 -9
  299. package/dist/pipeline/expand-tasks.d.ts +19 -10
  300. package/dist/pipeline/expand-tasks.js +34 -28
  301. package/dist/pipeline/gap-analysis.d.ts +1 -1
  302. package/dist/pipeline/gap-analysis.js +2 -2
  303. package/dist/pipeline/generate-configs.d.ts +22 -4
  304. package/dist/pipeline/generate-configs.js +53 -24
  305. package/dist/pipeline/grader-api.d.ts +3 -3
  306. package/dist/pipeline/grader-api.js +5 -12
  307. package/dist/pipeline/grader-compare-runner.js +20 -27
  308. package/dist/pipeline/grader-comparison.d.ts +4 -8
  309. package/dist/pipeline/grader-comparison.js +11 -17
  310. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  311. package/dist/pipeline/grader-consistency-runner.js +16 -20
  312. package/dist/pipeline/grader-consistency.d.ts +6 -10
  313. package/dist/pipeline/grader-consistency.js +13 -32
  314. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  315. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  316. package/dist/pipeline/grader-sensitivity.js +10 -10
  317. package/dist/pipeline/grader-validate-runner.js +7 -5
  318. package/dist/pipeline/grader-validation.d.ts +2 -6
  319. package/dist/pipeline/grader-validation.js +14 -22
  320. package/dist/pipeline/map-request-to-config.js +7 -1
  321. package/dist/pipeline/mirror-repo-tasks.d.ts +13 -13
  322. package/dist/pipeline/mirror-repo-tasks.js +22 -21
  323. package/dist/pipeline/normalize-mode.d.ts +49 -0
  324. package/dist/pipeline/normalize-mode.js +64 -0
  325. package/dist/pipeline/plan.d.ts +5 -2
  326. package/dist/pipeline/plan.js +134 -78
  327. package/dist/pipeline/pr-comment.js +2 -0
  328. package/dist/pipeline/profile-resolution.d.ts +22 -14
  329. package/dist/pipeline/profile-resolution.js +41 -19
  330. package/dist/pipeline/provenance.d.ts +2 -2
  331. package/dist/pipeline/provenance.js +12 -17
  332. package/dist/pipeline/release-report.js +4 -4
  333. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  334. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  335. package/dist/pipeline/rubric-loader.d.ts +20 -0
  336. package/dist/pipeline/rubric-loader.js +37 -0
  337. package/dist/pipeline/validate.d.ts +4 -4
  338. package/dist/pipeline/validate.js +64 -53
  339. package/dist/schedules/loader.js +18 -8
  340. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  341. package/dist/scripts/migrate-task-mode.js +85 -0
  342. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  343. package/dist/scripts/validate-task-sources.d.ts +1 -1
  344. package/dist/scripts/validate-task-sources.js +15 -15
  345. package/dist/sinks/loader.js +5 -7
  346. package/dist/sources.d.ts +7 -7
  347. package/dist/sources.js +22 -24
  348. package/dist/webhook/dispatch.js +2 -1
  349. package/package.json +15 -4
  350. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  351. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  352. package/tasks/literacy/frameworks.task.ts +128 -0
  353. package/tasks/literacy/functions.task.ts +69 -0
  354. package/tasks/literacy/groq.task.ts +258 -0
  355. package/tasks/literacy/nextjs-live.task.ts +75 -0
  356. package/tasks/literacy/studio-setup.task.ts +131 -0
  357. package/tasks/literacy/visual-editing.task.ts +146 -0
  358. package/config/features.yaml +0 -116
  359. package/config/models.yaml +0 -116
  360. package/config/prompts.yaml +0 -75
  361. package/config/rubrics.yaml +0 -81
  362. package/config/schedules.yaml +0 -43
  363. package/config/sinks.yaml +0 -54
  364. package/config/sources.yaml +0 -51
  365. package/config/thresholds.yaml +0 -49
  366. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  367. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  368. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  369. package/dist/_vendor/ailf-tasks/index.js +0 -16
  370. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  371. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  372. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  373. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  374. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  375. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  376. package/dist/agent-observer/test-imports.d.ts +0 -7
  377. package/dist/agent-observer/test-imports.js +0 -185
@@ -9,13 +9,15 @@
9
9
  *
10
10
  * @see docs/exec-plans/execution-preview.md
11
11
  */
12
- import { existsSync, readFileSync, readdirSync, statSync } from "fs";
12
+ import { existsSync, readdirSync, statSync } from "fs";
13
13
  import { resolve } from "path";
14
- import { load } from "js-yaml";
15
14
  import { lookupPricing } from "../agent-observer/pricing.js";
16
15
  import { RepoTaskSource } from "../adapters/task-sources/repo-task-source.js";
16
+ import { loadAllTsTaskFiles } from "../adapters/task-sources/task-file-loader.js";
17
17
  import { lookupCache } from "./cache.js";
18
- import { loadAndExpandTasks } from "./expand-tasks.js";
18
+ import { compileLiteracyTasks } from "./compiler/literacy-bridge.js";
19
+ import { tryLoadConfigFile } from "./compiler/config-loader.js";
20
+ import { LiteracyVariant } from "./normalize-mode.js";
19
21
  import { validateConfiguration } from "./validate.js";
20
22
  /**
21
23
  * Known promptfoo provider prefixes — stripped to get the raw model name.
@@ -39,38 +41,37 @@ function extractModelName(id) {
39
41
  return parts.length > 1 ? parts.slice(1).join(":") : id;
40
42
  }
41
43
  function loadModelsFile(rootDir) {
42
- const modelsPath = resolve(rootDir, "config", "models.yaml");
43
- if (!existsSync(modelsPath))
44
- return null;
45
- try {
46
- const raw = readFileSync(modelsPath, "utf-8");
47
- return load(raw);
48
- }
49
- catch {
50
- return null;
51
- }
44
+ const result = tryLoadConfigFile("models", rootDir);
45
+ return result?.data ?? null;
52
46
  }
53
47
  /**
54
- * Map eval mode to the model "modes" array values from models.yaml.
55
- * Baseline mode maps to "baseline"; agentic maps to both naive and optimized.
48
+ * Map eval mode + variant to the model "modes" array values from models config.
49
+ *
50
+ * Literacy mode uses the variant to determine which model sub-modes match.
51
+ * Non-literacy modes accept all models by default (filtering is done
52
+ * elsewhere for those modes).
56
53
  */
57
- function modeMatchesModelModes(mode, modelModes) {
54
+ function modeMatchesModelModes(mode, modelModes, variant) {
58
55
  if (!modelModes || modelModes.length === 0)
59
56
  return true;
60
- switch (mode) {
61
- case "agentic":
62
- return (modelModes.includes("agentic-naive") ||
63
- modelModes.includes("agentic-optimized"));
64
- case "baseline":
65
- return modelModes.includes("baseline");
66
- case "full":
67
- // Full mode uses all models — a model matches if it's in any sub-mode
68
- return (modelModes.includes("baseline") ||
69
- modelModes.includes("agentic-naive") ||
70
- modelModes.includes("agentic-optimized"));
71
- case "observed":
72
- return modelModes.includes("observed");
57
+ if (mode === "literacy") {
58
+ switch (variant) {
59
+ case LiteracyVariant.AGENTIC:
60
+ return (modelModes.includes("agentic-naive") ||
61
+ modelModes.includes("agentic-optimized"));
62
+ case LiteracyVariant.OBSERVED:
63
+ return modelModes.includes(LiteracyVariant.OBSERVED);
64
+ case LiteracyVariant.FULL:
65
+ return (modelModes.includes(LiteracyVariant.STANDARD) ||
66
+ modelModes.includes("agentic-naive") ||
67
+ modelModes.includes("agentic-optimized"));
68
+ case LiteracyVariant.STANDARD:
69
+ default:
70
+ return modelModes.includes(LiteracyVariant.STANDARD);
71
+ }
73
72
  }
73
+ // Non-literacy modes accept all models by default
74
+ return true;
74
75
  }
75
76
  // ---------------------------------------------------------------------------
76
77
  // Cost estimation
@@ -131,46 +132,103 @@ export async function buildPipelinePlan(opts, rootDir) {
131
132
  let totalTests = 0;
132
133
  let tasks = [];
133
134
  let repoTaskCount;
135
+ // -----------------------------------------------------------------------
136
+ // Load and compile tasks — unified path for all modes
137
+ // -----------------------------------------------------------------------
134
138
  try {
135
- const { entries } = loadAndExpandTasks(rootDir, filter);
136
- totalTests = entries.length;
137
- tasks = entries.map((entry) => {
138
- const desc = typeof entry.description === "string" ? entry.description : "(unknown)";
139
- const isBaseline = desc.includes("[Baseline]") ||
140
- desc.endsWith("(baseline)") ||
141
- (Array.isArray(entry.prompts) && entry.prompts.includes("without-docs"));
142
- return {
143
- description: desc,
144
- variant: isBaseline ? "baseline" : "gold",
145
- };
146
- });
139
+ const modelsForCompile = loadModelsFile(rootDir);
140
+ const graderProvider = modelsForCompile?.grader?.id ?? "openai:chat:gpt-4o";
141
+ const modelEntries = (modelsForCompile?.models ?? []).map((m) => ({ id: m.id, label: m.label }));
142
+ // Load *.task.ts files from tasks/<mode>/
143
+ const modeTasksDir = resolve(rootDir, "tasks", opts.mode);
144
+ if (existsSync(modeTasksDir)) {
145
+ const rawTasks = await loadAllTsTaskFiles(modeTasksDir);
146
+ if (rawTasks.length > 0) {
147
+ // Dynamic import of the handler module
148
+ const handlerModulePath = `./compiler/mode-handlers/${opts.mode}/index.js`;
149
+ const mod = await import(handlerModulePath);
150
+ const handler = mod.handler;
151
+ for (const rawFile of rawTasks) {
152
+ for (const taskDef of rawFile.tasks) {
153
+ const task = taskDef;
154
+ // Apply area/task/tag filter
155
+ if (filter) {
156
+ if (filter.areas?.length &&
157
+ !filter.areas
158
+ .map((a) => a.toLowerCase())
159
+ .includes((task.area ?? "").toLowerCase()))
160
+ continue;
161
+ if (filter.taskIds?.length && !filter.taskIds.includes(task.id))
162
+ continue;
163
+ if (filter.tags?.length &&
164
+ (!task.tags || !task.tags.some((t) => filter.tags.includes(t))))
165
+ continue;
166
+ }
167
+ const result = handler.compileTask(task, {
168
+ rootDir,
169
+ graderProvider,
170
+ models: modelEntries,
171
+ // For literacy mode, pass the variant as evalMode
172
+ ...(opts.mode === "literacy"
173
+ ? {
174
+ evalMode: opts.variant === LiteracyVariant.AGENTIC
175
+ ? LiteracyVariant.AGENTIC
176
+ : LiteracyVariant.STANDARD,
177
+ }
178
+ : {}),
179
+ });
180
+ totalTests += result.tests.length;
181
+ for (const test of result.tests) {
182
+ const desc = typeof test.description === "string"
183
+ ? test.description
184
+ : (taskDef.id ?? "unknown");
185
+ const isBaseline = desc.includes("[Baseline]") || desc.endsWith("(baseline)");
186
+ tasks.push({
187
+ description: desc,
188
+ variant: isBaseline
189
+ ? LiteracyVariant.STANDARD
190
+ : "gold",
191
+ });
192
+ }
193
+ }
194
+ }
195
+ }
196
+ }
147
197
  }
148
- catch {
149
- errors.push("Failed to expand tasks check task YAML files");
198
+ catch (err) {
199
+ const detail = err instanceof Error ? err.message : String(err);
200
+ errors.push(`Failed to compile tasks: ${detail}`);
150
201
  }
151
202
  // Scan repo tasks path for additional task count (preview only)
152
203
  if (opts.repoTasksPath) {
153
204
  try {
154
205
  const repoSource = new RepoTaskSource(opts.repoTasksPath);
155
- const repoTasks = await repoSource.loadTasks(filter);
206
+ // Type-narrow to literacy tasks — compileLiteracyTasks accepts LiteracyTaskDefinition[]
207
+ const repoTasks = (await repoSource.loadTasks(filter)).filter((t) => t.mode === "literacy");
156
208
  repoTaskCount = repoTasks.length;
157
209
  if (repoTaskCount > 0) {
158
- // Expand repo tasks to estimate test entries
159
- const { expandTaskDefinitions } = await import("./expand-tasks.js");
160
- const { entries: repoEntries } = expandTaskDefinitions(repoTasks, rootDir, opts.mode === "agentic" ? "agentic" : "baseline");
161
- totalTests += repoEntries.length;
162
- for (const entry of repoEntries) {
163
- const desc = typeof entry.description === "string"
164
- ? entry.description
165
- : "(unknown)";
166
- const isBaseline = desc.includes("[Baseline]") ||
167
- desc.endsWith("(baseline)") ||
168
- (Array.isArray(entry.prompts) &&
169
- entry.prompts.includes("without-docs"));
170
- tasks.push({
171
- description: desc,
172
- variant: isBaseline ? "baseline" : "gold",
173
- });
210
+ const modelsForCompile = loadModelsFile(rootDir);
211
+ const graderProvider = modelsForCompile?.grader?.id ?? "openai:chat:gpt-4o";
212
+ const compileResult = compileLiteracyTasks(repoTasks, {
213
+ rootDir,
214
+ evalMode: opts.variant === LiteracyVariant.AGENTIC
215
+ ? LiteracyVariant.AGENTIC
216
+ : LiteracyVariant.STANDARD,
217
+ graderProvider,
218
+ models: (modelsForCompile?.models ?? []).map((m) => ({ id: m.id, label: m.label })),
219
+ });
220
+ totalTests += compileResult.totalTests;
221
+ for (const { taskId, result } of compileResult.tasks) {
222
+ for (const test of result.tests) {
223
+ const desc = typeof test.description === "string" ? test.description : taskId;
224
+ const isBaseline = desc.includes("[Baseline]") || desc.endsWith("(baseline)");
225
+ tasks.push({
226
+ description: desc,
227
+ variant: isBaseline
228
+ ? LiteracyVariant.STANDARD
229
+ : "gold",
230
+ });
231
+ }
174
232
  }
175
233
  }
176
234
  }
@@ -186,11 +244,11 @@ export async function buildPipelinePlan(opts, rootDir) {
186
244
  const models = [];
187
245
  let graderModelName = "";
188
246
  if (modelsFile) {
189
- const activeModels = modelsFile.models.filter((m) => modeMatchesModelModes(opts.mode, m.modes));
247
+ const activeModels = modelsFile.models.filter((m) => modeMatchesModelModes(opts.mode, m.modes, opts.variant));
190
248
  // For agentic mode, each model appears twice (naive + optimized)
191
249
  for (const m of activeModels) {
192
250
  const modelName = extractModelName(m.id);
193
- if (opts.mode === "agentic") {
251
+ if (opts.variant === LiteracyVariant.AGENTIC) {
194
252
  if (m.modes?.includes("agentic-naive")) {
195
253
  models.push({
196
254
  id: m.id,
@@ -518,16 +576,16 @@ function collectFilesCreated(opts) {
518
576
  // ---------------------------------------------------------------------------
519
577
  function collectFilesRead(rootDir, _mode) {
520
578
  const files = [
521
- "config/models.yaml",
522
- "config/rubrics.yaml",
523
- "config/prompts.yaml",
524
- "config/sources.yaml",
579
+ "config/models.ts",
580
+ "config/rubrics.ts",
581
+ "config/prompts.ts",
582
+ "config/sources.ts",
525
583
  ];
526
584
  // Task files
527
585
  const tasksDir = resolve(rootDir, "tasks");
528
586
  if (existsSync(tasksDir)) {
529
587
  const taskFiles = readdirSync(tasksDir)
530
- .filter((f) => (f.endsWith(".yaml") || f.endsWith(".yml")) && !f.startsWith("."))
588
+ .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f) && !f.startsWith("."))
531
589
  .sort();
532
590
  for (const f of taskFiles)
533
591
  files.push(`tasks/${f}`);
@@ -551,11 +609,11 @@ function collectFilesRead(rootDir, _mode) {
551
609
  files.push(`canonical/reference-solutions/${f}`);
552
610
  }
553
611
  // Thresholds (if readiness is involved)
554
- if (existsSync(resolve(rootDir, "config", "thresholds.yaml"))) {
555
- files.push("config/thresholds.yaml");
612
+ if (existsSync(resolve(rootDir, "config", "thresholds.ts"))) {
613
+ files.push("config/thresholds.ts");
556
614
  }
557
- if (existsSync(resolve(rootDir, "config", "features.yaml"))) {
558
- files.push("config/features.yaml");
615
+ if (existsSync(resolve(rootDir, "config", "features.ts"))) {
616
+ files.push("config/features.ts");
559
617
  }
560
618
  return [...new Set(files)].sort();
561
619
  }
@@ -616,16 +674,14 @@ function estimateCost(testCount, models, graderModelName, rubricAssertionsPerTas
616
674
  // Used by the plan builder without importing the full type to avoid circular deps.
617
675
  // ---------------------------------------------------------------------------
618
676
  function estimateRubricAssertionsPerTask(rootDir) {
619
- // Load rubrics.yaml and count the default template set.
677
+ // Load rubrics config and count the default template set.
620
678
  // In practice, most tasks have 2-4 rubric assertions.
621
- const rubricsPath = resolve(rootDir, "config", "rubrics.yaml");
622
- if (!existsSync(rubricsPath))
679
+ const result = tryLoadConfigFile("rubrics", rootDir);
680
+ if (!result)
623
681
  return 2; // conservative default
624
682
  try {
625
- const raw = readFileSync(rubricsPath, "utf-8");
626
- const data = load(raw);
627
- const templateCount = data?.templates
628
- ? Object.keys(data.templates).length
683
+ const templateCount = result.data?.templates
684
+ ? Object.keys(result.data.templates).length
629
685
  : 2;
630
686
  // Most tasks use 2-3 of the available templates
631
687
  return Math.min(templateCount, 3);
@@ -320,6 +320,8 @@ function generateComment(summary, options = {}) {
320
320
  ? "📉"
321
321
  : "➡️";
322
322
  const d = (n) => n > 0 ? `+${Math.round(n)}` : String(Math.round(n));
323
+ // TODO(multi-mode): These dimension keys are literacy-specific.
324
+ // For other modes, iterate Object.entries(a.dimensions) dynamically.
323
325
  lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${icon} ${d(a.delta)} | ${d(a.dimensions.taskCompletion.delta)} | ${d(a.dimensions.codeCorrectness.delta)} | ${d(a.dimensions.docCoverage.delta)} |`);
324
326
  }
325
327
  }
@@ -1,13 +1,16 @@
1
1
  /**
2
2
  * pipeline/profile-resolution.ts
3
3
  *
4
- * Resolves the correct weight profile for a given (mode, variant) pair.
5
- * The scoring engine calls this to determine which dimensions and weights
6
- * apply to each test entry's composite score.
4
+ * Resolves the correct weight profile for a given (mode, perspective, variant)
5
+ * tuple. The scoring engine calls this to determine which dimensions and
6
+ * weights apply to each test entry's composite score.
7
7
  *
8
8
  * Resolution order:
9
- * 1. Explicit binding: mode-profiles.<mode>.<variant> → profile name
10
- * 2. Fallback: the "default" profile
9
+ * 1. Nested binding (variant provided):
10
+ * mode-profiles.<mode>.<variant>.<perspective> → profile name
11
+ * 2. Flat binding (no variant):
12
+ * mode-profiles.<mode>.<perspective> → profile name
13
+ * 3. Fallback: the "default" profile
11
14
  *
12
15
  * Supports both the new `profiles` format and the legacy flat `weights`
13
16
  * format (treated as a single profile named "default").
@@ -23,17 +26,22 @@ import type { RubricConfig, WeightProfile } from "../_vendor/ailf-core/index.d.t
23
26
  */
24
27
  export declare function resolveProfiles(config: RubricConfig): Record<string, WeightProfile>;
25
28
  /**
26
- * Resolve the weight profile for a specific (mode, variant) pair.
29
+ * Resolve the weight profile for a specific (mode, perspective, variant) tuple.
27
30
  *
28
- * @param mode - Evaluation mode (e.g., "baseline", "agentic", "agent-task")
29
- * @param variant - Entry variant: "gold" (with docs) or "baseline" (without docs)
30
- * @param config - Parsed rubrics.yaml config
31
+ * @param mode - Canonical mode (e.g., "literacy", "mcp-server")
32
+ * @param perspective - Entry perspective: "gold" (with docs) or "baseline" (without docs)
33
+ * @param config - Parsed rubrics config
34
+ * @param variant - Optional variant within the mode (e.g., "baseline", "agentic" for literacy)
31
35
  * @returns The resolved weight profile (dimension → weight map)
32
36
  *
33
37
  * @example
34
- * resolveProfile("baseline", "gold", config) // → default profile
35
- * resolveProfile("baseline", "baseline", config) // → output-only profile
36
- * resolveProfile("agentic", "gold", config) // → default profile
37
- * resolveProfile("unknown-mode", "gold", config) // → default (fallback)
38
+ * // Nested: literacy mode with variant sub-keys
39
+ * resolveProfile("literacy", "gold", config, "baseline") // → default profile
40
+ * resolveProfile("literacy", "baseline", config, "baseline") // → output-only profile
41
+ * resolveProfile("literacy", "gold", config, "agentic") // → default profile
42
+ *
43
+ * // Flat: non-literacy modes
44
+ * resolveProfile("mcp-server", "gold", config) // → mcp-behavior profile
45
+ * resolveProfile("unknown-mode", "gold", config) // → default (fallback)
38
46
  */
39
- export declare function resolveProfile(mode: string, variant: string, config: RubricConfig): WeightProfile;
47
+ export declare function resolveProfile(mode: string, perspective: string, config: RubricConfig, variant?: string): WeightProfile;
@@ -1,13 +1,16 @@
1
1
  /**
2
2
  * pipeline/profile-resolution.ts
3
3
  *
4
- * Resolves the correct weight profile for a given (mode, variant) pair.
5
- * The scoring engine calls this to determine which dimensions and weights
6
- * apply to each test entry's composite score.
4
+ * Resolves the correct weight profile for a given (mode, perspective, variant)
5
+ * tuple. The scoring engine calls this to determine which dimensions and
6
+ * weights apply to each test entry's composite score.
7
7
  *
8
8
  * Resolution order:
9
- * 1. Explicit binding: mode-profiles.<mode>.<variant> → profile name
10
- * 2. Fallback: the "default" profile
9
+ * 1. Nested binding (variant provided):
10
+ * mode-profiles.<mode>.<variant>.<perspective> → profile name
11
+ * 2. Flat binding (no variant):
12
+ * mode-profiles.<mode>.<perspective> → profile name
13
+ * 3. Fallback: the "default" profile
11
14
  *
12
15
  * Supports both the new `profiles` format and the legacy flat `weights`
13
16
  * format (treated as a single profile named "default").
@@ -29,31 +32,50 @@ export function resolveProfiles(config) {
29
32
  return { default: config.weights };
30
33
  }
31
34
  // Schema validation should prevent this, but be defensive
32
- throw new Error("rubrics.yaml has neither 'profiles' nor 'weights' — cannot resolve scoring profiles");
35
+ throw new Error("rubrics config has neither 'profiles' nor 'weights' — cannot resolve scoring profiles");
33
36
  }
34
37
  /**
35
- * Resolve the weight profile for a specific (mode, variant) pair.
38
+ * Resolve the weight profile for a specific (mode, perspective, variant) tuple.
36
39
  *
37
- * @param mode - Evaluation mode (e.g., "baseline", "agentic", "agent-task")
38
- * @param variant - Entry variant: "gold" (with docs) or "baseline" (without docs)
39
- * @param config - Parsed rubrics.yaml config
40
+ * @param mode - Canonical mode (e.g., "literacy", "mcp-server")
41
+ * @param perspective - Entry perspective: "gold" (with docs) or "baseline" (without docs)
42
+ * @param config - Parsed rubrics config
43
+ * @param variant - Optional variant within the mode (e.g., "baseline", "agentic" for literacy)
40
44
  * @returns The resolved weight profile (dimension → weight map)
41
45
  *
42
46
  * @example
43
- * resolveProfile("baseline", "gold", config) // → default profile
44
- * resolveProfile("baseline", "baseline", config) // → output-only profile
45
- * resolveProfile("agentic", "gold", config) // → default profile
46
- * resolveProfile("unknown-mode", "gold", config) // → default (fallback)
47
+ * // Nested: literacy mode with variant sub-keys
48
+ * resolveProfile("literacy", "gold", config, "baseline") // → default profile
49
+ * resolveProfile("literacy", "baseline", config, "baseline") // → output-only profile
50
+ * resolveProfile("literacy", "gold", config, "agentic") // → default profile
51
+ *
52
+ * // Flat: non-literacy modes
53
+ * resolveProfile("mcp-server", "gold", config) // → mcp-behavior profile
54
+ * resolveProfile("unknown-mode", "gold", config) // → default (fallback)
47
55
  */
48
- export function resolveProfile(mode, variant, config) {
56
+ export function resolveProfile(mode, perspective, config, variant) {
49
57
  const profiles = resolveProfiles(config);
50
58
  const modeProfiles = config["mode-profiles"];
51
- // Look up explicit binding: mode-profiles.<mode>.<variant> → profile name
52
- const profileName = modeProfiles?.[mode]?.[variant];
59
+ const modeEntry = modeProfiles?.[mode];
60
+ let profileName;
61
+ if (modeEntry && variant) {
62
+ // Nested lookup: mode-profiles.<mode>.<variant>.<perspective>
63
+ const variantEntry = modeEntry[variant];
64
+ if (typeof variantEntry === "object" && variantEntry !== null) {
65
+ profileName = variantEntry[perspective];
66
+ }
67
+ }
68
+ if (!profileName && modeEntry) {
69
+ // Flat lookup: mode-profiles.<mode>.<perspective>
70
+ const directEntry = modeEntry[perspective];
71
+ if (typeof directEntry === "string") {
72
+ profileName = directEntry;
73
+ }
74
+ }
53
75
  if (profileName) {
54
76
  const profile = profiles[profileName];
55
77
  if (!profile) {
56
- throw new Error(`mode-profiles.${mode}.${variant} references profile "${profileName}" ` +
78
+ throw new Error(`mode-profiles.${mode}.${variant ? variant + "." : ""}${perspective} references profile "${profileName}" ` +
57
79
  `which does not exist. Available profiles: ${Object.keys(profiles).join(", ")}`);
58
80
  }
59
81
  return profile;
@@ -61,7 +83,7 @@ export function resolveProfile(mode, variant, config) {
61
83
  // Fall back to "default" profile
62
84
  const defaultProfile = profiles["default"];
63
85
  if (!defaultProfile) {
64
- throw new Error(`No scoring profile found for mode="${mode}" variant="${variant}" ` +
86
+ throw new Error(`No scoring profile found for mode="${mode}" perspective="${perspective}" ` +
65
87
  `and no "default" profile exists. ` +
66
88
  `Available profiles: ${Object.keys(profiles).join(", ")}`);
67
89
  }
@@ -42,7 +42,7 @@ export interface ProvenanceInput {
42
42
  promptfooUrl?: string;
43
43
  /** Per-mode Promptfoo share URLs */
44
44
  promptfooUrls?: PromptfooUrlEntry[];
45
- /** Path to the package root (for reading models.yaml) */
45
+ /** Path to the package root (for reading config/models) */
46
46
  rootDir: string;
47
47
  /** Report ID that triggered this re-run (becomes lineage.rerunOf) */
48
48
  sourceReportId?: string;
@@ -58,7 +58,7 @@ export interface ProvenanceInput {
58
58
  *
59
59
  * Assembles provenance from:
60
60
  * - Pipeline options (mode, source, areas, tasks)
61
- * - config/models.yaml (model list, grader)
61
+ * - config/models.ts (model list, grader)
62
62
  * - Environment variables (CI metadata, trigger detection)
63
63
  * - Optional metadata (context hash, Promptfoo URL)
64
64
  */
@@ -11,16 +11,14 @@
11
11
  * @see docs/design-docs/report-store/domain-model.md
12
12
  * @see docs/design-docs/report-store/architecture.md — Provenance collection
13
13
  */
14
- import { readFileSync } from "fs";
15
- import { resolve } from "path";
16
- import { load } from "js-yaml";
17
14
  import { ConsoleLogger } from "../adapters/loggers/index.js";
15
+ import { tryLoadConfigFile } from "./compiler/config-loader.js";
18
16
  /**
19
17
  * Build a ReportProvenance object from pipeline context.
20
18
  *
21
19
  * Assembles provenance from:
22
20
  * - Pipeline options (mode, source, areas, tasks)
23
- * - config/models.yaml (model list, grader)
21
+ * - config/models.ts (model list, grader)
24
22
  * - Environment variables (CI metadata, trigger detection)
25
23
  * - Optional metadata (context hash, Promptfoo URL)
26
24
  */
@@ -168,20 +166,17 @@ function detectTrigger() {
168
166
  // Model config loading
169
167
  // ---------------------------------------------------------------------------
170
168
  /**
171
- * Load config/models.yaml to extract model list and grader info.
169
+ * Load config/models to extract model list and grader info.
172
170
  * Falls back to a minimal config if the file can't be read.
173
171
  */
174
172
  function loadModelsConfig(rootDir, log) {
175
- try {
176
- const content = readFileSync(resolve(rootDir, "config", "models.yaml"), "utf-8");
177
- return load(content);
178
- }
179
- catch {
180
- log.warn("Could not read config/models.yaml for provenance");
181
- return {
182
- defaults: {},
183
- grader: { id: "unknown" },
184
- models: [],
185
- };
186
- }
173
+ const result = tryLoadConfigFile("models", rootDir);
174
+ if (result)
175
+ return result.data;
176
+ log.warn("Could not read config/models for provenance");
177
+ return {
178
+ defaults: {},
179
+ grader: { id: "unknown" },
180
+ models: [],
181
+ };
187
182
  }
@@ -133,8 +133,8 @@ export function formatReleaseImpactConsole(report) {
133
133
  const docs = task.attributedDocs.length > 0
134
134
  ? task.attributedDocs.join(", ")
135
135
  : "(unattributed)";
136
- const deltaStr = task.delta >= 0 ? `+${task.delta.toFixed(1)}` : task.delta.toFixed(1);
137
- lines.push(` ${docs.padEnd(32)} | ${area.area.padEnd(16)} | ${task.taskId.padEnd(23)} | ${deltaStr}`);
136
+ const taskDeltaStr = task.delta >= 0 ? `+${task.delta.toFixed(1)}` : task.delta.toFixed(1);
137
+ lines.push(` ${docs.padEnd(32)} | ${area.area.padEnd(16)} | ${task.taskId.padEnd(23)} | ${taskDeltaStr}`);
138
138
  }
139
139
  }
140
140
  lines.push("");
@@ -194,9 +194,9 @@ export function formatReleaseImpactMarkdown(report) {
194
194
  const docs = task.attributedDocs.length > 0
195
195
  ? task.attributedDocs.map((d) => `\`${d}\``).join(", ")
196
196
  : "—";
197
- const deltaStr = task.delta >= 0 ? `+${task.delta.toFixed(1)}` : task.delta.toFixed(1);
197
+ const taskDeltaStr = task.delta >= 0 ? `+${task.delta.toFixed(1)}` : task.delta.toFixed(1);
198
198
  const regressIcon = area.regressed ? " ⚠️" : "";
199
- lines.push(`| ${docs} | ${area.area} | ${task.taskId} | ${deltaStr}${regressIcon} |`);
199
+ lines.push(`| ${docs} | ${area.area} | ${task.taskId} | ${taskDeltaStr}${regressIcon} |`);
200
200
  }
201
201
  }
202
202
  lines.push("");
@@ -5,7 +5,7 @@
5
5
  * definitions (`.ailf/tasks/*.yaml` → `execution.threshold`).
6
6
  *
7
7
  * This is distinct from the readiness-gate threshold system in
8
- * `config/thresholds.yaml`. Repo thresholds are per-task, defined by
8
+ * `config/thresholds`. Repo thresholds are per-task, defined by
9
9
  * the product team, and drive PR check pass/fail status. Framework
10
10
  * thresholds are per-area, defined by the AILF team, and drive
11
11
  * readiness reports.
@@ -5,7 +5,7 @@
5
5
  * definitions (`.ailf/tasks/*.yaml` → `execution.threshold`).
6
6
  *
7
7
  * This is distinct from the readiness-gate threshold system in
8
- * `config/thresholds.yaml`. Repo thresholds are per-task, defined by
8
+ * `config/thresholds`. Repo thresholds are per-task, defined by
9
9
  * the product team, and drive PR check pass/fail status. Framework
10
10
  * thresholds are per-area, defined by the AILF team, and drive
11
11
  * readiness reports.
@@ -0,0 +1,20 @@
1
+ /**
2
+ * pipeline/rubric-loader.ts — Load and validate rubric config.
3
+ *
4
+ * Extracted from the legacy expand-tasks.ts so that callers (e.g.,
5
+ * calculate-scores.ts) can load rubric templates without pulling in
6
+ * the deprecated task expansion machinery.
7
+ *
8
+ * @see packages/eval/config/rubrics.ts — the rubric configuration
9
+ * @see packages/core/src/schemas/pipeline.ts — RubricConfigSchema
10
+ */
11
+ import { type RubricConfig } from "../_vendor/ailf-core/index.d.ts";
12
+ /**
13
+ * Load and validate config/rubrics from the given root directory.
14
+ * Caches the result for subsequent calls with the same rootDir.
15
+ */
16
+ export declare function loadRubricTemplates(rootDir: string): RubricConfig;
17
+ /**
18
+ * Reset the rubric config cache. Useful in tests.
19
+ */
20
+ export declare function resetRubricCache(): void;
@@ -0,0 +1,37 @@
1
+ /**
2
+ * pipeline/rubric-loader.ts — Load and validate rubric config.
3
+ *
4
+ * Extracted from the legacy expand-tasks.ts so that callers (e.g.,
5
+ * calculate-scores.ts) can load rubric templates without pulling in
6
+ * the deprecated task expansion machinery.
7
+ *
8
+ * @see packages/eval/config/rubrics.ts — the rubric configuration
9
+ * @see packages/core/src/schemas/pipeline.ts — RubricConfigSchema
10
+ */
11
+ import { RubricConfigSchema } from "../_vendor/ailf-core/index.js";
12
+ import { loadConfigFile } from "./compiler/config-loader.js";
13
+ let cachedRubricConfig = null;
14
+ /**
15
+ * Load and validate config/rubrics from the given root directory.
16
+ * Caches the result for subsequent calls with the same rootDir.
17
+ */
18
+ export function loadRubricTemplates(rootDir) {
19
+ if (cachedRubricConfig)
20
+ return cachedRubricConfig;
21
+ const { data } = loadConfigFile("rubrics", rootDir);
22
+ const result = RubricConfigSchema.safeParse(data);
23
+ if (!result.success) {
24
+ const messages = result.error.issues
25
+ .map((i) => ` [${i.path.join(".")}]: ${i.message}`)
26
+ .join("\n");
27
+ throw new Error(`Invalid config/rubrics:\n${messages}`);
28
+ }
29
+ cachedRubricConfig = result.data;
30
+ return result.data;
31
+ }
32
+ /**
33
+ * Reset the rubric config cache. Useful in tests.
34
+ */
35
+ export function resetRubricCache() {
36
+ cachedRubricConfig = null;
37
+ }