@sanity/ailf 0.5.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377)
  1. package/README.md +0 -1
  2. package/config/features.ts +23 -0
  3. package/config/models.ts +95 -0
  4. package/config/prompts.ts +16 -0
  5. package/config/rubrics.ts +225 -0
  6. package/config/schedules.ts +47 -0
  7. package/config/sinks.ts +37 -0
  8. package/config/sources.ts +21 -0
  9. package/config/thresholds.ts +61 -0
  10. package/dist/_vendor/ailf-core/config-helpers.d.ts +171 -0
  11. package/dist/_vendor/ailf-core/config-helpers.js +170 -0
  12. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  13. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  14. package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
  15. package/dist/_vendor/ailf-core/examples/index.js +25 -0
  16. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  17. package/dist/_vendor/ailf-core/index.js +5 -0
  18. package/dist/_vendor/ailf-core/ports/context.d.ts +17 -2
  19. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  20. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  21. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  22. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  23. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  24. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  25. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +8 -2
  26. package/dist/_vendor/ailf-core/schemas/eval-config.js +17 -2
  27. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +9 -3
  28. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +8 -1
  29. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -31
  30. package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -9
  31. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  32. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  33. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  34. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  35. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  36. package/dist/_vendor/ailf-core/services/index.js +2 -1
  37. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  38. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  39. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  40. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  41. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  42. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  43. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  44. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +332 -0
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  47. package/dist/_vendor/ailf-core/types/index.d.ts +45 -83
  48. package/dist/_vendor/ailf-core/types/index.js +8 -1
  49. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +257 -0
  50. package/dist/_vendor/ailf-core/types/plugin-registry.js +185 -0
  51. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  52. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  53. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  54. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  55. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  56. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  57. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  58. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  59. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  60. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  61. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  62. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  63. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  64. package/dist/_vendor/ailf-shared/index.js +0 -1
  65. package/dist/adapters/api-client/build-request.js +14 -13
  66. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  67. package/dist/adapters/config-sources/file-config-adapter.js +39 -12
  68. package/dist/adapters/config-sources/index.d.ts +2 -0
  69. package/dist/adapters/config-sources/index.js +1 -0
  70. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  71. package/dist/adapters/config-sources/ts-config-loader.js +141 -0
  72. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  73. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  74. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  75. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  76. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  77. package/dist/adapters/task-sources/content-lake-task-source.js +35 -39
  78. package/dist/adapters/task-sources/index.d.ts +3 -2
  79. package/dist/adapters/task-sources/index.js +3 -2
  80. package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
  81. package/dist/adapters/task-sources/repo-schemas.js +227 -19
  82. package/dist/adapters/task-sources/repo-task-source.d.ts +16 -12
  83. package/dist/adapters/task-sources/repo-task-source.js +92 -80
  84. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  85. package/dist/adapters/task-sources/repo-validation.js +126 -5
  86. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  87. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  88. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  89. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  90. package/dist/cli.js +0 -2
  91. package/dist/commands/baseline.js +4 -1
  92. package/dist/commands/calculate-scores.js +1 -1
  93. package/dist/commands/coverage-audit.js +9 -1
  94. package/dist/commands/explain-handler.js +25 -23
  95. package/dist/commands/fetch-docs.js +3 -2
  96. package/dist/commands/generate-configs.js +1 -1
  97. package/dist/commands/init.d.ts +6 -4
  98. package/dist/commands/init.js +302 -23
  99. package/dist/commands/interactive.js +11 -7
  100. package/dist/commands/pipeline-action.d.ts +2 -0
  101. package/dist/commands/pipeline-action.js +16 -6
  102. package/dist/commands/pipeline.d.ts +1 -0
  103. package/dist/commands/pipeline.js +4 -2
  104. package/dist/commands/pr-comment.js +1 -1
  105. package/dist/commands/publish.js +2 -2
  106. package/dist/commands/readiness-report.js +13 -6
  107. package/dist/commands/validate-tasks.d.ts +2 -2
  108. package/dist/commands/validate-tasks.js +26 -15
  109. package/dist/composition-root.d.ts +13 -1
  110. package/dist/composition-root.js +99 -4
  111. package/dist/index.d.ts +41 -0
  112. package/dist/index.js +48 -0
  113. package/dist/orchestration/build-app-context.js +1 -0
  114. package/dist/orchestration/build-step-sequence.js +28 -8
  115. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  116. package/dist/orchestration/steps/fetch-docs-step.js +8 -7
  117. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  118. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  119. package/dist/orchestration/steps/generate-configs-step.js +261 -51
  120. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  121. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  122. package/dist/orchestration/steps/readiness-step.js +5 -6
  123. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  124. package/dist/orchestration/steps/run-eval-step.js +8 -7
  125. package/dist/pipeline/cache.d.ts +1 -1
  126. package/dist/pipeline/cache.js +36 -8
  127. package/dist/pipeline/calculate-scores.d.ts +2 -4
  128. package/dist/pipeline/calculate-scores.js +43 -113
  129. package/dist/pipeline/checks.js +2 -2
  130. package/dist/pipeline/compare.js +8 -8
  131. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  132. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  133. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  134. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  135. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  136. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  137. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  138. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  139. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  140. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +392 -0
  141. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  142. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  143. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  144. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  145. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  146. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +404 -0
  147. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  148. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  149. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  150. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  151. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  152. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  153. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  154. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  155. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  156. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  157. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  158. package/dist/pipeline/compiler/config-loader.js +111 -0
  159. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  160. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  161. package/dist/pipeline/compiler/hash.d.ts +11 -0
  162. package/dist/pipeline/compiler/hash.js +18 -0
  163. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  164. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  165. package/dist/pipeline/compiler/index.d.ts +29 -0
  166. package/dist/pipeline/compiler/index.js +45 -0
  167. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  168. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  169. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  170. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  171. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  172. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  173. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  174. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  175. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
  176. package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
  177. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  178. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  179. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  180. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  181. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  182. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  183. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  184. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  185. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  186. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  187. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  188. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  189. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  190. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  191. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  192. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  193. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  194. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  195. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  196. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  197. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  198. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  199. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  200. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  201. package/dist/pipeline/compiler/mode-handlers/index.d.ts +15 -0
  202. package/dist/pipeline/compiler/mode-handlers/index.js +19 -0
  203. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  204. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  205. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  206. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  207. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  208. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  209. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  210. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  211. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  212. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  213. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  214. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  215. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  216. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  217. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  218. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  219. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  220. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  221. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  222. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  223. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  224. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  225. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  226. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  227. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  228. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  229. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  230. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  231. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  232. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  233. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  234. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  235. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  236. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
  237. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  238. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  239. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  240. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  241. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  242. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
  243. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
  245. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  246. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  247. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
  248. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
  249. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
  250. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  251. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  252. package/dist/pipeline/compiler/preset-loader.js +99 -0
  253. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  254. package/dist/pipeline/compiler/presets/index.js +8 -0
  255. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +42 -0
  256. package/dist/pipeline/compiler/presets/sanity-literacy.js +208 -0
  257. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  258. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  259. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  260. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  261. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  262. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  263. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  264. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  265. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  266. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  267. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  268. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  269. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  270. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  271. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  272. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  273. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  274. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  275. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  276. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  277. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  278. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  279. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  280. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  281. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  282. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  283. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  284. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  285. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  286. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  287. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  288. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  289. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  290. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  291. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  292. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  293. package/dist/pipeline/coverage-audit.d.ts +15 -5
  294. package/dist/pipeline/coverage-audit.js +41 -22
  295. package/dist/pipeline/eval-constants.d.ts +16 -6
  296. package/dist/pipeline/eval-constants.js +25 -4
  297. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  298. package/dist/pipeline/eval-fingerprint.js +8 -9
  299. package/dist/pipeline/expand-tasks.d.ts +19 -10
  300. package/dist/pipeline/expand-tasks.js +34 -28
  301. package/dist/pipeline/gap-analysis.d.ts +1 -1
  302. package/dist/pipeline/gap-analysis.js +2 -2
  303. package/dist/pipeline/generate-configs.d.ts +22 -4
  304. package/dist/pipeline/generate-configs.js +53 -24
  305. package/dist/pipeline/grader-api.d.ts +3 -3
  306. package/dist/pipeline/grader-api.js +5 -12
  307. package/dist/pipeline/grader-compare-runner.js +20 -27
  308. package/dist/pipeline/grader-comparison.d.ts +4 -8
  309. package/dist/pipeline/grader-comparison.js +11 -17
  310. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  311. package/dist/pipeline/grader-consistency-runner.js +16 -20
  312. package/dist/pipeline/grader-consistency.d.ts +6 -10
  313. package/dist/pipeline/grader-consistency.js +13 -32
  314. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  315. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  316. package/dist/pipeline/grader-sensitivity.js +10 -10
  317. package/dist/pipeline/grader-validate-runner.js +7 -5
  318. package/dist/pipeline/grader-validation.d.ts +2 -6
  319. package/dist/pipeline/grader-validation.js +14 -22
  320. package/dist/pipeline/map-request-to-config.js +7 -1
  321. package/dist/pipeline/mirror-repo-tasks.d.ts +13 -13
  322. package/dist/pipeline/mirror-repo-tasks.js +22 -21
  323. package/dist/pipeline/normalize-mode.d.ts +49 -0
  324. package/dist/pipeline/normalize-mode.js +64 -0
  325. package/dist/pipeline/plan.d.ts +5 -2
  326. package/dist/pipeline/plan.js +134 -78
  327. package/dist/pipeline/pr-comment.js +2 -0
  328. package/dist/pipeline/profile-resolution.d.ts +22 -14
  329. package/dist/pipeline/profile-resolution.js +41 -19
  330. package/dist/pipeline/provenance.d.ts +2 -2
  331. package/dist/pipeline/provenance.js +12 -17
  332. package/dist/pipeline/release-report.js +4 -4
  333. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  334. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  335. package/dist/pipeline/rubric-loader.d.ts +20 -0
  336. package/dist/pipeline/rubric-loader.js +37 -0
  337. package/dist/pipeline/validate.d.ts +4 -4
  338. package/dist/pipeline/validate.js +64 -53
  339. package/dist/schedules/loader.js +18 -8
  340. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  341. package/dist/scripts/migrate-task-mode.js +85 -0
  342. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  343. package/dist/scripts/validate-task-sources.d.ts +1 -1
  344. package/dist/scripts/validate-task-sources.js +15 -15
  345. package/dist/sinks/loader.js +5 -7
  346. package/dist/sources.d.ts +7 -7
  347. package/dist/sources.js +22 -24
  348. package/dist/webhook/dispatch.js +2 -1
  349. package/package.json +15 -4
  350. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  351. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  352. package/tasks/literacy/frameworks.task.ts +128 -0
  353. package/tasks/literacy/functions.task.ts +69 -0
  354. package/tasks/literacy/groq.task.ts +258 -0
  355. package/tasks/literacy/nextjs-live.task.ts +75 -0
  356. package/tasks/literacy/studio-setup.task.ts +131 -0
  357. package/tasks/literacy/visual-editing.task.ts +146 -0
  358. package/config/features.yaml +0 -116
  359. package/config/models.yaml +0 -116
  360. package/config/prompts.yaml +0 -75
  361. package/config/rubrics.yaml +0 -81
  362. package/config/schedules.yaml +0 -43
  363. package/config/sinks.yaml +0 -54
  364. package/config/sources.yaml +0 -51
  365. package/config/thresholds.yaml +0 -49
  366. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  367. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  368. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  369. package/dist/_vendor/ailf-tasks/index.js +0 -16
  370. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  371. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  372. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  373. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  374. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  375. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  376. package/dist/agent-observer/test-imports.d.ts +0 -7
  377. package/dist/agent-observer/test-imports.js +0 -185
@@ -47,7 +47,7 @@ export type WeightProfile = z.infer<typeof WeightProfileSchema>;
47
47
  */
48
48
  export declare const RubricConfigSchema: z.ZodObject<{
49
49
  footer: z.ZodString;
50
- "mode-profiles": z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodString>>>;
50
+ "mode-profiles": z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnion<readonly [z.ZodString, z.ZodRecord<z.ZodString, z.ZodString>]>>>>;
51
51
  profiles: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodNumber>>>;
52
52
  templates: z.ZodRecord<z.ZodString, z.ZodObject<{
53
53
  criteria_label: z.ZodOptional<z.ZodNullable<z.ZodString>>;
@@ -67,19 +67,18 @@ export declare const FeatureSchema: z.ZodObject<{
67
67
  id: z.ZodString;
68
68
  name: z.ZodString;
69
69
  priority: z.ZodEnum<{
70
+ critical: "critical";
70
71
  high: "high";
71
- low: "low";
72
72
  medium: "medium";
73
- critical: "critical";
73
+ low: "low";
74
74
  }>;
75
75
  sections: z.ZodArray<z.ZodString>;
76
76
  status: z.ZodEnum<{
77
77
  covered: "covered";
78
- "out-of-scope": "out-of-scope";
79
- planned: "planned";
80
78
  uncovered: "uncovered";
79
+ planned: "planned";
80
+ "out-of-scope": "out-of-scope";
81
81
  }>;
82
- taskCount: z.ZodOptional<z.ZodNumber>;
83
82
  }, z.core.$strip>;
84
83
  /** Inferred TypeScript type for a product feature. */
85
84
  export type Feature = z.infer<typeof FeatureSchema>;
@@ -92,19 +91,18 @@ export declare const FeatureRegistrySchema: z.ZodObject<{
92
91
  id: z.ZodString;
93
92
  name: z.ZodString;
94
93
  priority: z.ZodEnum<{
94
+ critical: "critical";
95
95
  high: "high";
96
- low: "low";
97
96
  medium: "medium";
98
- critical: "critical";
97
+ low: "low";
99
98
  }>;
100
99
  sections: z.ZodArray<z.ZodString>;
101
100
  status: z.ZodEnum<{
102
101
  covered: "covered";
103
- "out-of-scope": "out-of-scope";
104
- planned: "planned";
105
102
  uncovered: "uncovered";
103
+ planned: "planned";
104
+ "out-of-scope": "out-of-scope";
106
105
  }>;
107
- taskCount: z.ZodOptional<z.ZodNumber>;
108
106
  }, z.core.$strip>>;
109
107
  }, z.core.$strip>;
110
108
  /** Inferred TypeScript type for the feature registry. */
@@ -440,14 +438,11 @@ export declare const TaskFileSchema: z.ZodArray<z.ZodUnion<readonly [z.ZodObject
440
438
  export type TaskFile = z.infer<typeof TaskFileSchema>;
441
439
  /**
442
440
  * Schema for per-dimension threshold values.
441
+ * Uses a dynamic record to support all evaluation modes, not just literacy.
443
442
  * Keys use kebab-case to match YAML convention; the threshold engine
444
443
  * normalizes to camelCase for comparison against FeatureScore fields.
445
444
  */
446
- export declare const ThresholdDimensionsSchema: z.ZodObject<{
447
- "code-correctness": z.ZodOptional<z.ZodNumber>;
448
- "doc-coverage": z.ZodOptional<z.ZodNumber>;
449
- "task-completion": z.ZodOptional<z.ZodNumber>;
450
- }, z.core.$strip>;
445
+ export declare const ThresholdDimensionsSchema: z.ZodRecord<z.ZodString, z.ZodNumber>;
451
446
  /** Inferred TypeScript type for threshold dimension overrides. */
452
447
  export type ThresholdDimensions = z.infer<typeof ThresholdDimensionsSchema>;
453
448
  /**
@@ -457,11 +452,7 @@ export type ThresholdDimensions = z.infer<typeof ThresholdDimensionsSchema>;
457
452
  export declare const ThresholdDefaultsSchema: z.ZodObject<{
458
453
  ceiling: z.ZodOptional<z.ZodNumber>;
459
454
  composite: z.ZodNumber;
460
- dimensions: z.ZodOptional<z.ZodObject<{
461
- "code-correctness": z.ZodOptional<z.ZodNumber>;
462
- "doc-coverage": z.ZodOptional<z.ZodNumber>;
463
- "task-completion": z.ZodOptional<z.ZodNumber>;
464
- }, z.core.$strip>>;
455
+ dimensions: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
465
456
  "doc-lift": z.ZodOptional<z.ZodNumber>;
466
457
  }, z.core.$strip>;
467
458
  /** Inferred TypeScript type for threshold defaults. */
@@ -501,21 +492,13 @@ export declare const ThresholdConfigSchema: z.ZodObject<{
501
492
  areas: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodObject<{
502
493
  ceiling: z.ZodOptional<z.ZodOptional<z.ZodNumber>>;
503
494
  composite: z.ZodOptional<z.ZodNumber>;
504
- dimensions: z.ZodOptional<z.ZodOptional<z.ZodObject<{
505
- "code-correctness": z.ZodOptional<z.ZodNumber>;
506
- "doc-coverage": z.ZodOptional<z.ZodNumber>;
507
- "task-completion": z.ZodOptional<z.ZodNumber>;
508
- }, z.core.$strip>>>;
495
+ dimensions: z.ZodOptional<z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>>;
509
496
  "doc-lift": z.ZodOptional<z.ZodOptional<z.ZodNumber>>;
510
497
  }, z.core.$strip>>>;
511
498
  defaults: z.ZodObject<{
512
499
  ceiling: z.ZodOptional<z.ZodNumber>;
513
500
  composite: z.ZodNumber;
514
- dimensions: z.ZodOptional<z.ZodObject<{
515
- "code-correctness": z.ZodOptional<z.ZodNumber>;
516
- "doc-coverage": z.ZodOptional<z.ZodNumber>;
517
- "task-completion": z.ZodOptional<z.ZodNumber>;
518
- }, z.core.$strip>>;
501
+ dimensions: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
519
502
  "doc-lift": z.ZodOptional<z.ZodNumber>;
520
503
  }, z.core.$strip>;
521
504
  regression: z.ZodOptional<z.ZodObject<{
@@ -43,10 +43,22 @@ const WeightProfileSchema = z
43
43
  return Math.abs(sum - 1.0) < 0.001;
44
44
  }, { message: "profile weights must sum to 1.0" });
45
45
  /**
46
- * Mode-to-profile bindings — maps (mode, variant) pairs to profile names.
47
- * Example: { baseline: { gold: "default", baseline: "output-only" } }
46
+ * Mode-to-profile bindings — maps (mode, perspective) pairs to profile names.
47
+ *
48
+ * Flat form (most modes):
49
+ * { "mcp-server": { gold: "mcp-behavior" } }
50
+ *
51
+ * Nested form (literacy mode with variant sub-keys):
52
+ * { literacy: { baseline: { gold: "default", baseline: "output-only" }, agentic: { gold: "default" } } }
53
+ *
54
+ * The nested form adds a variant level between mode and perspective,
55
+ * allowing a single canonical mode to host multiple scoring variants.
48
56
  */
49
- const ModeProfilesSchema = z.record(z.string(), z.record(z.string(), z.string()));
57
+ const ModeProfileEntrySchema = z.union([
58
+ z.string(),
59
+ z.record(z.string(), z.string()),
60
+ ]);
61
+ const ModeProfilesSchema = z.record(z.string(), z.record(z.string(), ModeProfileEntrySchema));
50
62
  /**
51
63
  * Schema for the full config/rubrics.yaml config file.
52
64
  *
@@ -96,7 +108,6 @@ export const FeatureSchema = z.object({
96
108
  priority: z.enum(["critical", "high", "medium", "low"]),
97
109
  sections: z.array(z.string().min(1)).min(1),
98
110
  status: z.enum(["covered", "uncovered", "planned", "out-of-scope"]),
99
- taskCount: z.number().int().min(0).optional(),
100
111
  });
101
112
  /**
102
113
  * Schema for the full config/features.yaml config file.
@@ -277,14 +288,11 @@ export const TaskFileSchema = z
277
288
  // ---------------------------------------------------------------------------
278
289
  /**
279
290
  * Schema for per-dimension threshold values.
291
+ * Uses a dynamic record to support all evaluation modes, not just literacy.
280
292
  * Keys use kebab-case to match YAML convention; the threshold engine
281
293
  * normalizes to camelCase for comparison against FeatureScore fields.
282
294
  */
283
- export const ThresholdDimensionsSchema = z.object({
284
- "code-correctness": z.number().min(0).max(100).optional(),
285
- "doc-coverage": z.number().min(0).max(100).optional(),
286
- "task-completion": z.number().min(0).max(100).optional(),
287
- });
295
+ export const ThresholdDimensionsSchema = z.record(z.string(), z.number().min(0).max(100));
288
296
  /**
289
297
  * Schema for threshold defaults (and per-area overrides).
290
298
  * All fields are optional in per-area overrides; defaults must have composite.
@@ -18,10 +18,15 @@ export declare const ScheduleEntrySchema: z.ZodObject<{
18
18
  cron: z.ZodString;
19
19
  enabled: z.ZodDefault<z.ZodBoolean>;
20
20
  mode: z.ZodDefault<z.ZodEnum<{
21
- agentic: "agentic";
21
+ custom: "custom";
22
+ literacy: "literacy";
23
+ "mcp-server": "mcp-server";
24
+ "agent-harness": "agent-harness";
25
+ "knowledge-probe": "knowledge-probe";
22
26
  baseline: "baseline";
23
- full: "full";
27
+ agentic: "agentic";
24
28
  observed: "observed";
29
+ full: "full";
25
30
  }>>;
26
31
  name: z.ZodString;
27
32
  publish: z.ZodDefault<z.ZodBoolean>;
@@ -53,10 +58,15 @@ export declare const SchedulesFileSchema: z.ZodObject<{
53
58
  cron: z.ZodString;
54
59
  enabled: z.ZodDefault<z.ZodBoolean>;
55
60
  mode: z.ZodDefault<z.ZodEnum<{
56
- agentic: "agentic";
61
+ custom: "custom";
62
+ literacy: "literacy";
63
+ "mcp-server": "mcp-server";
64
+ "agent-harness": "agent-harness";
65
+ "knowledge-probe": "knowledge-probe";
57
66
  baseline: "baseline";
58
- full: "full";
67
+ agentic: "agentic";
59
68
  observed: "observed";
69
+ full: "full";
60
70
  }>>;
61
71
  name: z.ZodString;
62
72
  publish: z.ZodDefault<z.ZodBoolean>;
@@ -11,6 +11,7 @@
11
11
  * @see docs/design-docs/report-store/implementation.md — Phase 5
12
12
  */
13
13
  import { z } from "zod";
14
+ import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
14
15
  // ---------------------------------------------------------------------------
15
16
  // Cron expression validation
16
17
  // ---------------------------------------------------------------------------
@@ -34,8 +35,11 @@ export const ScheduleEntrySchema = z.object({
34
35
  cron: CronSchema,
35
36
  /** Whether this schedule is active */
36
37
  enabled: z.boolean().default(true),
37
- /** Evaluation mode */
38
- mode: z.enum(["agentic", "baseline", "full", "observed"]).default("baseline"),
38
+ /**
39
+ * Evaluation mode accepts both canonical and legacy names.
40
+ * Legacy names must pass through normalizeMode() before entering typed pipeline code.
41
+ */
42
+ mode: z.enum(RAW_EVAL_MODES).default("baseline"),
39
43
  /** Human-readable schedule name (used as report tag) */
40
44
  name: z
41
45
  .string()
@@ -17,10 +17,10 @@
17
17
  import { z } from "zod";
18
18
  /** All supported sink types as a Zod union. */
19
19
  export declare const SinkTypeSchema: z.ZodEnum<{
20
- webhook: "webhook";
21
20
  bigquery: "bigquery";
22
21
  "github-comment": "github-comment";
23
22
  slack: "slack";
23
+ webhook: "webhook";
24
24
  }>;
25
25
  /** Supported sink type string literal union. */
26
26
  export type SinkType = z.infer<typeof SinkTypeSchema>;
@@ -25,12 +25,21 @@ export function formatComparisonMarkdown(report) {
25
25
  lines.push("");
26
26
  lines.push(`**Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)}** (${overallIcon} ${deltaStr(overall)})`);
27
27
  lines.push("");
28
- // Per-area table
29
- lines.push("| Feature | Baseline | Current | Delta | Task | Code | Docs |");
30
- lines.push("|---------|----------|---------|-------|------|------|------|");
28
+ // Derive dimension columns from the first area's keys (all areas share the
29
+ // same scoring profile, so the key set is uniform).
30
+ const dimKeys = report.areas.length > 0
31
+ ? Object.keys(report.areas[0].dimensions)
32
+ : Object.keys(report.deltas.perDimension);
33
+ // Per-area table — columns are dynamic
34
+ const dimHeaders = dimKeys.map(kebabToTitleCase);
35
+ const headerRow = ["Feature", "Baseline", "Current", "Delta", ...dimHeaders];
36
+ const separatorRow = headerRow.map(() => "------");
37
+ lines.push(`| ${headerRow.join(" | ")} |`);
38
+ lines.push(`|${separatorRow.join("|")}|`);
31
39
  for (const a of report.areas) {
32
40
  const icon = changeIcon(a.change);
33
- lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${icon} ${deltaStr(a.delta)} | ${deltaStr(a.dimensions.taskCompletion.delta)} | ${deltaStr(a.dimensions.codeCorrectness.delta)} | ${deltaStr(a.dimensions.docCoverage.delta)} |`);
41
+ const dimCells = dimKeys.map((k) => deltaStr(a.dimensions[k]?.delta ?? 0));
42
+ lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${icon} ${deltaStr(a.delta)} | ${dimCells.join(" | ")} |`);
34
43
  }
35
44
  lines.push("");
36
45
  // Summary
@@ -55,9 +64,9 @@ export function formatComparisonMarkdown(report) {
55
64
  const dim = report.deltas.perDimension;
56
65
  lines.push("| Dimension | Delta |");
57
66
  lines.push("|-----------|-------|");
58
- lines.push(`| Task Completion | ${deltaStr(dim.taskCompletion)} |`);
59
- lines.push(`| Code Correctness | ${deltaStr(dim.codeCorrectness)} |`);
60
- lines.push(`| Doc Coverage | ${deltaStr(dim.docCoverage)} |`);
67
+ for (const k of Object.keys(dim)) {
68
+ lines.push(`| ${kebabToTitleCase(k)} | ${deltaStr(dim[k])} |`);
69
+ }
61
70
  lines.push(`| Doc Lift | ${deltaStr(report.deltas.docLift)} |`);
62
71
  if (report.deltas.cost !== undefined) {
63
72
  const costStr = report.deltas.cost > 0
@@ -91,29 +100,51 @@ export function formatComparisonTable(report) {
91
100
  : "unchanged");
92
101
  lines.push(` Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)} (${overallIcon} ${deltaStr(overall)})`);
93
102
  lines.push("");
94
- // Per-dimension averages
103
+ // Per-dimension averages — derived dynamically from the report
95
104
  const dim = report.deltas.perDimension;
105
+ const dimKeys = report.areas.length > 0
106
+ ? Object.keys(report.areas[0].dimensions)
107
+ : Object.keys(dim);
96
108
  lines.push(" Dimension averages:");
97
- lines.push(` Task Completion: ${deltaStr(dim.taskCompletion)}`);
98
- lines.push(` Code Correctness: ${deltaStr(dim.codeCorrectness)}`);
99
- lines.push(` Doc Coverage: ${deltaStr(dim.docCoverage)}`);
100
- lines.push(` Doc Lift: ${deltaStr(report.deltas.docLift)}`);
109
+ // Pad labels to the longest dimension label for alignment
110
+ const dimLabels = dimKeys.map(kebabToTitleCase);
111
+ // +1 for the colon appended to each label
112
+ const maxLabelLen = Math.max(...dimLabels.map((l) => l.length + 1), "Doc Lift:".length);
113
+ for (let i = 0; i < dimKeys.length; i++) {
114
+ lines.push(` ${(dimLabels[i] + ":").padEnd(maxLabelLen)} ${deltaStr(dim[dimKeys[i]] ?? 0)}`);
115
+ }
116
+ lines.push(` ${"Doc Lift:".padEnd(maxLabelLen)} ${deltaStr(report.deltas.docLift)}`);
101
117
  if (report.deltas.cost !== undefined) {
102
- lines.push(` Cost: ${report.deltas.cost > 0 ? "+" : ""}$${report.deltas.cost.toFixed(4)}`);
118
+ lines.push(` ${"Cost:".padEnd(maxLabelLen)} ${report.deltas.cost > 0 ? "+" : ""}$${report.deltas.cost.toFixed(4)}`);
103
119
  }
104
120
  lines.push("");
105
- // Per-area table
121
+ // Per-area table — columns are dynamic
106
122
  lines.push("-".repeat(80));
107
123
  lines.push("PER-AREA BREAKDOWN");
108
124
  lines.push("-".repeat(80));
109
125
  lines.push("");
110
- const h = "| Feature Area | Baseline | Experiment | Delta | Task | Code | Docs |";
111
- const sep = "|---------------------|----------|------------|-------|------|------|------|";
112
- lines.push(h);
113
- lines.push(sep);
126
+ const dimHeaders = dimKeys.map(kebabToTitleCase);
127
+ const colWidths = dimHeaders.map((h) => Math.max(h.length, 4));
128
+ const hCols = [
129
+ "Feature Area".padEnd(19),
130
+ "Baseline".padStart(8),
131
+ "Experiment".padStart(10),
132
+ "Delta".padStart(5),
133
+ ...dimHeaders.map((h, i) => h.padStart(colWidths[i])),
134
+ ];
135
+ const sepCols = [
136
+ "-".repeat(21),
137
+ "-".repeat(10),
138
+ "-".repeat(12),
139
+ "-".repeat(7),
140
+ ...colWidths.map((w) => "-".repeat(w + 2)),
141
+ ];
142
+ lines.push(`| ${hCols.join(" | ")} |`);
143
+ lines.push(`|${sepCols.join("|")}|`);
114
144
  for (const a of report.areas) {
115
145
  const icon = changeIcon(a.change);
116
- lines.push(`| ${icon} ${a.area.padEnd(17)} | ${String(a.baseline).padStart(8)} | ${String(a.experiment).padStart(10)} | ${deltaStr(a.delta).padStart(5)} | ${deltaStr(a.dimensions.taskCompletion.delta).padStart(4)} | ${deltaStr(a.dimensions.codeCorrectness.delta).padStart(4)} | ${deltaStr(a.dimensions.docCoverage.delta).padStart(4)} |`);
146
+ const dimCells = dimKeys.map((k, i) => deltaStr(a.dimensions[k]?.delta ?? 0).padStart(colWidths[i]));
147
+ lines.push(`| ${icon} ${a.area.padEnd(17)} | ${String(a.baseline).padStart(8)} | ${String(a.experiment).padStart(10)} | ${deltaStr(a.delta).padStart(5)} | ${dimCells.join(" | ")} |`);
117
148
  }
118
149
  lines.push("");
119
150
  // Classification summary
@@ -187,3 +218,10 @@ function deltaStr(d) {
187
218
  return `${Math.round(d)}`;
188
219
  return "0";
189
220
  }
221
+ /** Convert kebab-case dimension name to title case (e.g. 'task-completion' → 'Task Completion') */
222
+ function kebabToTitleCase(name) {
223
+ return name
224
+ .split("-")
225
+ .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
226
+ .join(" ");
227
+ }
@@ -7,6 +7,7 @@
7
7
  * Extracted from packages/eval/src/lib/ during the Ports & Adapters
8
8
  * migration (Phase 4e).
9
9
  */
10
- export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
10
+ export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
11
11
  export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
12
+ export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, type AggregationStrategy, type AreaScore, type AssertionScore, type DimensionScore, type EnsembleGradingConfig, type GraderTransitionConfig, type TaskScore, type TaskScoreOptions, } from "./scoring-engine.js";
12
13
  export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "./config-helpers.js";
@@ -7,6 +7,7 @@
7
7
  * Extracted from packages/eval/src/lib/ during the Ports & Adapters
8
8
  * migration (Phase 4e).
9
9
  */
10
- export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
10
+ export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
11
11
  export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
12
+ export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, } from "./scoring-engine.js";
12
13
  export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "./config-helpers.js";
@@ -0,0 +1,153 @@
1
+ /**
2
+ * 4-tier scoring engine — unified scoring across all evaluation modes.
3
+ *
4
+ * Tier 1: Assertion-level (atomic pass/fail + optional numeric score)
5
+ * Tier 2: Dimension-level (aggregated per scoring dimension)
6
+ * Tier 3: Task-level (weighted composite of dimensions)
7
+ * Tier 4: Suite/Area-level (aggregated across tasks)
8
+ *
9
+ * This engine is mode-agnostic — it works for literacy, MCP server,
10
+ * agent harness, knowledge probe, and custom modes.
11
+ *
12
+ * @see docs/design-docs/architecture-overhaul/scoring-rubrics-assertions.md
13
+ */
14
+ /** The result of a single assertion evaluation */
15
+ export interface AssertionScore {
16
+ /** Whether the assertion passed */
17
+ pass: boolean;
18
+ /** Numeric score in [0, 1], null if not applicable */
19
+ score: number | null;
20
+ /** Human-readable explanation */
21
+ reason: string;
22
+ /** Assertion type that produced this result */
23
+ assertionType: string;
24
+ /** Dimension this assertion contributes to */
25
+ dimension: string;
26
+ /** Wall-clock grading time in ms */
27
+ latencyMs: number;
28
+ /** Weight of this assertion (1.0 if unspecified) */
29
+ weight: number;
30
+ }
31
+ /** Aggregation strategy for dimension scoring */
32
+ export type AggregationStrategy = "max" | "mean" | "min" | "weighted-mean";
33
+ /** Aggregated score for a scoring dimension */
34
+ export interface DimensionScore {
35
+ /** Dimension identifier (e.g., "code-correctness") */
36
+ dimensionId: string;
37
+ /** Human-readable label */
38
+ label: string;
39
+ /** Aggregated score in [0, 1] */
40
+ score: number;
41
+ /** How many assertions contributed */
42
+ assertionCount: number;
43
+ /** How many assertions passed */
44
+ passCount: number;
45
+ /** Aggregation method used */
46
+ aggregation: AggregationStrategy;
47
+ /** Individual assertion results */
48
+ assertions: AssertionScore[];
49
+ }
50
+ /**
51
+ * Aggregate assertion scores into dimension scores.
52
+ *
53
+ * Groups assertions by dimension, then applies the configured aggregation
54
+ * strategy (default: weighted-mean).
55
+ */
56
+ export declare function aggregateDimensions(assertions: AssertionScore[], options?: {
57
+ defaultAggregation?: AggregationStrategy;
58
+ dimensionLabels?: Record<string, string>;
59
+ }): DimensionScore[];
60
+ /** Weighted composite score for a task */
61
+ export interface TaskScore {
62
+ /** Task identifier */
63
+ taskId: string;
64
+ /** Feature area (e.g., "groq", "studio"). When absent, aggregateAreas() falls back to taskId prefix. */
65
+ area?: string;
66
+ /** Weighted composite score in [0, 1] */
67
+ score: number;
68
+ /** Per-dimension breakdown */
69
+ dimensions: DimensionScore[];
70
+ /** Weight configuration used */
71
+ weights: Record<string, number>;
72
+ /** Source of weights (default profile, task override, etc.) */
73
+ weightSource: string;
74
+ /** Whether the task met its quality threshold */
75
+ passesThreshold: boolean;
76
+ /** The threshold compared against */
77
+ threshold: number;
78
+ /** Warnings about potential misconfiguration (e.g., no dimensions matched weights) */
79
+ warnings?: string[];
80
+ }
81
+ /** Options for computing a task score */
82
+ export interface TaskScoreOptions {
83
+ /** Task identifier */
84
+ taskId: string;
85
+ /** Feature area (e.g., "groq", "studio"). Falls back to taskId prefix if omitted. */
86
+ area?: string;
87
+ /** Dimension weights (must sum to ~1.0) */
88
+ weights: Record<string, number>;
89
+ /** Where the weights came from (for traceability) */
90
+ weightSource?: string;
91
+ /** Quality threshold (0-1) for pass/fail gate */
92
+ threshold?: number;
93
+ }
94
+ /**
95
+ * Compute a weighted task score from dimension scores.
96
+ */
97
+ export declare function computeTaskScore(dimensions: DimensionScore[], options: TaskScoreOptions): TaskScore;
98
+ /** Aggregated score across tasks in a feature area */
99
+ export interface AreaScore {
100
+ /** Area identifier (e.g., "groq", "studio") */
101
+ areaId: string;
102
+ /** Mean task score */
103
+ score: number;
104
+ /** Number of tasks evaluated */
105
+ taskCount: number;
106
+ /** Number of tasks passing threshold */
107
+ passingTaskCount: number;
108
+ /** Per-task breakdown */
109
+ tasks: TaskScore[];
110
+ /** Trend vs previous evaluation */
111
+ delta: number | null;
112
+ }
113
+ /**
114
+ * Aggregate task scores into area scores.
115
+ */
116
+ export declare function aggregateAreas(tasks: TaskScore[], previousScores?: Record<string, number>): AreaScore[];
117
+ /**
118
+ * Normalize an assertion score to [0, 1] range.
119
+ *
120
+ * Different assertion types produce scores in different ranges:
121
+ * - Boolean (contains, equals, regex): 0 or 1
122
+ * - LLM rubric: 0-100 (needs /100)
123
+ * - similar: 0-1 (already normalized)
124
+ * - javascript/python: user-defined (assumed 0-1)
125
+ */
126
+ export declare function normalizeScore(rawScore: number, assertionType: string): number;
127
+ /** Grader transition configuration for gradual migration */
128
+ export interface GraderTransitionConfig {
129
+ /** Current (old) grader model */
130
+ old: string;
131
+ /** New grader model to transition to */
132
+ new_: string;
133
+ /** ISO date after which old grader is retired */
134
+ expiration: string;
135
+ /** Whether to run both graders in parallel */
136
+ parallel: boolean;
137
+ }
138
+ /** Ensemble grading configuration */
139
+ export interface EnsembleGradingConfig {
140
+ /** Whether ensemble grading is enabled */
141
+ enabled: boolean;
142
+ /** Grader models to use */
143
+ models: string[];
144
+ /** Aggregation strategy for ensemble scores */
145
+ aggregation: "max" | "mean" | "median";
146
+ }
147
+ /**
148
+ * Compute ensemble score from multiple grader outputs.
149
+ */
150
+ export declare function computeEnsembleScore(scores: number[], aggregation?: "max" | "mean" | "median"): {
151
+ score: number;
152
+ agreement: number;
153
+ };