@sanity/ailf 0.4.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290)
  1. package/config/features.ts +23 -0
  2. package/config/models.ts +83 -0
  3. package/config/prompts.ts +16 -0
  4. package/config/rubrics.ts +225 -0
  5. package/config/schedules.ts +47 -0
  6. package/config/sinks.ts +37 -0
  7. package/config/sources.ts +21 -0
  8. package/config/thresholds.ts +61 -0
  9. package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
  10. package/dist/_vendor/ailf-core/config-helpers.js +150 -0
  11. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  12. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  13. package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
  14. package/dist/_vendor/ailf-core/examples/index.js +10 -10
  15. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  16. package/dist/_vendor/ailf-core/index.js +5 -0
  17. package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
  18. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  19. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  20. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  21. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  22. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  23. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  24. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
  25. package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
  26. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
  27. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
  28. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +32 -31
  29. package/dist/_vendor/ailf-core/schemas/pipeline.js +52 -12
  30. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  31. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  32. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  33. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  34. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  35. package/dist/_vendor/ailf-core/services/index.js +2 -1
  36. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  37. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  38. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  39. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  40. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  41. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  42. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  43. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  44. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  46. package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
  47. package/dist/_vendor/ailf-core/types/index.js +8 -1
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
  50. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  51. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  52. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  53. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  54. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  55. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  56. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  57. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  58. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  59. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  60. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  61. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  62. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  63. package/dist/_vendor/ailf-shared/index.js +0 -1
  64. package/dist/adapters/api-client/build-request.js +14 -13
  65. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  66. package/dist/adapters/config-sources/file-config-adapter.js +38 -12
  67. package/dist/adapters/config-sources/index.d.ts +2 -0
  68. package/dist/adapters/config-sources/index.js +1 -0
  69. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  70. package/dist/adapters/config-sources/ts-config-loader.js +133 -0
  71. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  72. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  73. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  74. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  75. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  76. package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
  77. package/dist/adapters/task-sources/index.d.ts +1 -0
  78. package/dist/adapters/task-sources/index.js +1 -0
  79. package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
  80. package/dist/adapters/task-sources/repo-task-source.js +69 -16
  81. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  82. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  83. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  84. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  85. package/dist/cli.js +0 -2
  86. package/dist/commands/baseline.js +4 -1
  87. package/dist/commands/calculate-scores.js +1 -1
  88. package/dist/commands/coverage-audit.js +7 -1
  89. package/dist/commands/explain-handler.js +25 -23
  90. package/dist/commands/fetch-docs.js +3 -2
  91. package/dist/commands/generate-configs.js +1 -1
  92. package/dist/commands/interactive.js +11 -7
  93. package/dist/commands/pipeline-action.d.ts +2 -0
  94. package/dist/commands/pipeline-action.js +16 -6
  95. package/dist/commands/pipeline.d.ts +1 -0
  96. package/dist/commands/pipeline.js +4 -2
  97. package/dist/commands/pr-comment.js +1 -1
  98. package/dist/commands/publish.js +2 -2
  99. package/dist/commands/readiness-report.js +13 -6
  100. package/dist/composition-root.d.ts +1 -1
  101. package/dist/composition-root.js +67 -4
  102. package/dist/orchestration/build-app-context.js +1 -0
  103. package/dist/orchestration/build-step-sequence.js +24 -6
  104. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  105. package/dist/orchestration/steps/fetch-docs-step.js +6 -4
  106. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  107. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  108. package/dist/orchestration/steps/generate-configs-step.js +245 -51
  109. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  110. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  111. package/dist/orchestration/steps/readiness-step.js +5 -6
  112. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  113. package/dist/orchestration/steps/run-eval-step.js +8 -7
  114. package/dist/pipeline/cache.d.ts +1 -1
  115. package/dist/pipeline/cache.js +36 -8
  116. package/dist/pipeline/calculate-scores.d.ts +5 -7
  117. package/dist/pipeline/calculate-scores.js +74 -153
  118. package/dist/pipeline/checks.js +2 -2
  119. package/dist/pipeline/compare.js +8 -8
  120. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  121. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  122. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  123. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  124. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  125. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  126. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  127. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  128. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  129. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
  130. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  131. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  132. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  133. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  134. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  135. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
  136. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  137. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  138. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  139. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  140. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  141. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  142. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  143. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  144. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  145. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  146. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  147. package/dist/pipeline/compiler/config-loader.js +111 -0
  148. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  149. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  150. package/dist/pipeline/compiler/hash.d.ts +11 -0
  151. package/dist/pipeline/compiler/hash.js +18 -0
  152. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  153. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  154. package/dist/pipeline/compiler/index.d.ts +29 -0
  155. package/dist/pipeline/compiler/index.js +45 -0
  156. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  157. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  158. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  159. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  160. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  161. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  162. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  163. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  164. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  165. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  166. package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
  167. package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
  168. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  169. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  170. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  171. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  172. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  173. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
  174. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
  175. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
  176. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  177. package/dist/pipeline/compiler/presets/index.js +8 -0
  178. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
  179. package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
  180. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  181. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  182. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  183. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  184. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  185. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  186. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  187. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  188. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  189. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  190. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  191. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  192. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  193. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  194. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  195. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  196. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  197. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  198. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  199. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  200. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  201. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  202. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  203. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  204. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  205. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  206. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  207. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  208. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  209. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  210. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  211. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  212. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  213. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  214. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  215. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  216. package/dist/pipeline/coverage-audit.d.ts +15 -5
  217. package/dist/pipeline/coverage-audit.js +41 -22
  218. package/dist/pipeline/eval-constants.d.ts +16 -6
  219. package/dist/pipeline/eval-constants.js +25 -4
  220. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  221. package/dist/pipeline/eval-fingerprint.js +8 -9
  222. package/dist/pipeline/expand-tasks.d.ts +23 -14
  223. package/dist/pipeline/expand-tasks.js +37 -31
  224. package/dist/pipeline/gap-analysis.d.ts +1 -1
  225. package/dist/pipeline/gap-analysis.js +2 -2
  226. package/dist/pipeline/generate-configs.d.ts +22 -4
  227. package/dist/pipeline/generate-configs.js +53 -24
  228. package/dist/pipeline/grader-api.d.ts +3 -3
  229. package/dist/pipeline/grader-api.js +5 -12
  230. package/dist/pipeline/grader-compare-runner.js +20 -27
  231. package/dist/pipeline/grader-comparison.d.ts +4 -8
  232. package/dist/pipeline/grader-comparison.js +11 -17
  233. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  234. package/dist/pipeline/grader-consistency-runner.js +18 -21
  235. package/dist/pipeline/grader-consistency.d.ts +6 -10
  236. package/dist/pipeline/grader-consistency.js +13 -32
  237. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  238. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  239. package/dist/pipeline/grader-sensitivity.js +10 -10
  240. package/dist/pipeline/grader-validate-runner.js +7 -5
  241. package/dist/pipeline/grader-validation.d.ts +2 -6
  242. package/dist/pipeline/grader-validation.js +14 -22
  243. package/dist/pipeline/map-request-to-config.js +6 -1
  244. package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
  245. package/dist/pipeline/mirror-repo-tasks.js +16 -15
  246. package/dist/pipeline/normalize-mode.d.ts +49 -0
  247. package/dist/pipeline/normalize-mode.js +64 -0
  248. package/dist/pipeline/plan.d.ts +5 -2
  249. package/dist/pipeline/plan.js +134 -78
  250. package/dist/pipeline/pr-comment.js +2 -0
  251. package/dist/pipeline/profile-resolution.d.ts +47 -0
  252. package/dist/pipeline/profile-resolution.js +91 -0
  253. package/dist/pipeline/provenance.d.ts +2 -2
  254. package/dist/pipeline/provenance.js +12 -17
  255. package/dist/pipeline/release-report.js +4 -4
  256. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  257. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  258. package/dist/pipeline/rubric-loader.d.ts +20 -0
  259. package/dist/pipeline/rubric-loader.js +37 -0
  260. package/dist/pipeline/validate.d.ts +4 -4
  261. package/dist/pipeline/validate.js +64 -53
  262. package/dist/schedules/loader.js +18 -8
  263. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  264. package/dist/scripts/migrate-task-mode.js +85 -0
  265. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  266. package/dist/scripts/validate-task-sources.d.ts +1 -1
  267. package/dist/scripts/validate-task-sources.js +15 -15
  268. package/dist/sinks/loader.js +5 -7
  269. package/dist/sources.d.ts +7 -7
  270. package/dist/sources.js +22 -24
  271. package/dist/webhook/dispatch.js +2 -1
  272. package/package.json +6 -3
  273. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  274. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  275. package/tasks/literacy/frameworks.task.ts +128 -0
  276. package/tasks/literacy/functions.task.ts +69 -0
  277. package/tasks/literacy/groq.task.ts +258 -0
  278. package/tasks/literacy/nextjs-live.task.ts +75 -0
  279. package/tasks/literacy/studio-setup.task.ts +131 -0
  280. package/tasks/literacy/visual-editing.task.ts +146 -0
  281. package/config/features.yaml +0 -116
  282. package/config/models.yaml +0 -116
  283. package/config/prompts.yaml +0 -75
  284. package/config/rubrics.yaml +0 -62
  285. package/config/schedules.yaml +0 -43
  286. package/config/sinks.yaml +0 -54
  287. package/config/sources.yaml +0 -51
  288. package/config/thresholds.yaml +0 -49
  289. package/dist/agent-observer/test-imports.d.ts +0 -7
  290. package/dist/agent-observer/test-imports.js +0 -185
@@ -10,6 +10,7 @@
10
10
  * (studio-eval-config) so Content Lake documents validate identically.
11
11
  */
12
12
  import { z } from "zod";
13
+ import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
13
14
  export const EvalConfigSchema = z
14
15
  .object({
15
16
  /** Allowed origins for agentic mode */
@@ -46,8 +47,12 @@ export const EvalConfigSchema = z
46
47
  graderReplications: z.number().int().positive().optional(),
47
48
  /** Custom headers for doc fetching */
48
49
  headers: z.record(z.string(), z.string()).optional(),
49
- /** Evaluation mode */
50
- mode: z.enum(["baseline", "agentic", "observed", "full"]).optional(),
50
+ /**
51
+ * Evaluation mode accepts both canonical and legacy names.
52
+ * Legacy names ("baseline", "agentic", "observed", "full") must pass
53
+ * through normalizeMode() before entering typed pipeline code.
54
+ */
55
+ mode: z.enum(RAW_EVAL_MODES).optional(),
51
56
  /** Disable release-aware auto-scoping */
52
57
  noAutoScope: z.boolean().optional(),
53
58
  /** Disable local cache */
@@ -49,10 +49,15 @@ export declare const PipelineRequestSchema: z.ZodObject<{
49
49
  inlineTasks: z.ZodOptional<z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
50
50
  jobId: z.ZodOptional<z.ZodString>;
51
51
  mode: z.ZodOptional<z.ZodEnum<{
52
- agentic: "agentic";
52
+ custom: "custom";
53
+ literacy: "literacy";
54
+ "mcp-server": "mcp-server";
55
+ "agent-harness": "agent-harness";
56
+ "knowledge-probe": "knowledge-probe";
53
57
  baseline: "baseline";
54
- full: "full";
58
+ agentic: "agentic";
55
59
  observed: "observed";
60
+ full: "full";
56
61
  }>>;
57
62
  noAutoScope: z.ZodOptional<z.ZodBoolean>;
58
63
  noCache: z.ZodOptional<z.ZodBoolean>;
@@ -70,9 +75,9 @@ export declare const PipelineRequestSchema: z.ZodObject<{
70
75
  source: z.ZodOptional<z.ZodString>;
71
76
  sourceReportId: z.ZodOptional<z.ZodString>;
72
77
  taskMode: z.ZodOptional<z.ZodEnum<{
78
+ inline: "inline";
73
79
  "content-lake": "content-lake";
74
80
  yaml: "yaml";
75
- inline: "inline";
76
81
  }>>;
77
82
  tasks: z.ZodOptional<z.ZodArray<z.ZodString>>;
78
83
  urls: z.ZodOptional<z.ZodArray<z.ZodString>>;
@@ -13,6 +13,7 @@
13
13
  * @see packages/eval/src/pipeline/map-request-to-config.ts — maps to ResolvedConfig
14
14
  */
15
15
  import { z } from "zod";
16
+ import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
16
17
  // ---------------------------------------------------------------------------
17
18
  // Debug options — boolean shorthand or structured object
18
19
  // ---------------------------------------------------------------------------
@@ -69,7 +70,11 @@ export const PipelineRequestSchema = z.object({
69
70
  headers: z.record(z.string(), z.string()).optional(),
70
71
  inlineTasks: z.array(z.record(z.string(), z.unknown())).optional(),
71
72
  jobId: z.string().optional(),
72
- mode: z.enum(["baseline", "agentic", "observed", "full"]).optional(),
73
+ /**
74
+ * Evaluation mode — accepts both canonical and legacy names.
75
+ * Legacy names must pass through normalizeMode() before entering typed pipeline code.
76
+ */
77
+ mode: z.enum(RAW_EVAL_MODES).optional(),
73
78
  noAutoScope: z.boolean().optional(),
74
79
  noCache: z.boolean().optional(),
75
80
  noRemoteCache: z.boolean().optional(),
@@ -25,21 +25,37 @@ export declare const RubricTemplateSchema: z.ZodObject<{
25
25
  }, z.core.$strip>;
26
26
  /** Inferred TypeScript type for a rubric template. */
27
27
  export type RubricTemplate = z.infer<typeof RubricTemplateSchema>;
28
+ /**
29
+ * A named weight profile — maps dimension names to weights (must sum to 1.0).
30
+ * Each profile is a self-contained scoring formula used for a specific
31
+ * (mode, variant) pair.
32
+ */
33
+ declare const WeightProfileSchema: z.ZodRecord<z.ZodString, z.ZodNumber>;
34
+ /** Inferred type for a single weight profile. */
35
+ export type WeightProfile = z.infer<typeof WeightProfileSchema>;
28
36
  /**
29
37
  * Schema for the full config/rubrics.yaml config file.
30
38
  *
31
- * Each dimension is scored on a uniform 0–100 scale. The `weights` section
32
- * defines how dimensions are combined into a composite score (must sum to 1.0).
39
+ * Each dimension is scored on a uniform 0–100 scale. Named scoring profiles
40
+ * define how dimensions are combined into composite scores. Mode-profile
41
+ * bindings declare which profile to use for each (mode, variant) pair.
42
+ *
43
+ * Supports both the new `profiles` format and the legacy flat `weights`
44
+ * format for backward compatibility.
45
+ *
46
+ * @see docs/design-docs/named-scoring-profiles.md
33
47
  */
34
48
  export declare const RubricConfigSchema: z.ZodObject<{
35
49
  footer: z.ZodString;
50
+ "mode-profiles": z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnion<readonly [z.ZodString, z.ZodRecord<z.ZodString, z.ZodString>]>>>>;
51
+ profiles: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodNumber>>>;
36
52
  templates: z.ZodRecord<z.ZodString, z.ZodObject<{
37
53
  criteria_label: z.ZodOptional<z.ZodNullable<z.ZodString>>;
38
54
  dimension: z.ZodOptional<z.ZodString>;
39
55
  header: z.ZodString;
40
56
  scale: z.ZodArray<z.ZodString>;
41
57
  }, z.core.$strip>>;
42
- weights: z.ZodRecord<z.ZodString, z.ZodNumber>;
58
+ weights: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
43
59
  }, z.core.$strip>;
44
60
  /** Inferred TypeScript type for the rubrics config. */
45
61
  export type RubricConfig = z.infer<typeof RubricConfigSchema>;
@@ -51,17 +67,17 @@ export declare const FeatureSchema: z.ZodObject<{
51
67
  id: z.ZodString;
52
68
  name: z.ZodString;
53
69
  priority: z.ZodEnum<{
70
+ critical: "critical";
54
71
  high: "high";
55
- low: "low";
56
72
  medium: "medium";
57
- critical: "critical";
73
+ low: "low";
58
74
  }>;
59
75
  sections: z.ZodArray<z.ZodString>;
60
76
  status: z.ZodEnum<{
61
77
  covered: "covered";
62
- "out-of-scope": "out-of-scope";
63
- planned: "planned";
64
78
  uncovered: "uncovered";
79
+ planned: "planned";
80
+ "out-of-scope": "out-of-scope";
65
81
  }>;
66
82
  taskCount: z.ZodOptional<z.ZodNumber>;
67
83
  }, z.core.$strip>;
@@ -76,17 +92,17 @@ export declare const FeatureRegistrySchema: z.ZodObject<{
76
92
  id: z.ZodString;
77
93
  name: z.ZodString;
78
94
  priority: z.ZodEnum<{
95
+ critical: "critical";
79
96
  high: "high";
80
- low: "low";
81
97
  medium: "medium";
82
- critical: "critical";
98
+ low: "low";
83
99
  }>;
84
100
  sections: z.ZodArray<z.ZodString>;
85
101
  status: z.ZodEnum<{
86
102
  covered: "covered";
87
- "out-of-scope": "out-of-scope";
88
- planned: "planned";
89
103
  uncovered: "uncovered";
104
+ planned: "planned";
105
+ "out-of-scope": "out-of-scope";
90
106
  }>;
91
107
  taskCount: z.ZodOptional<z.ZodNumber>;
92
108
  }, z.core.$strip>>;
@@ -424,14 +440,11 @@ export declare const TaskFileSchema: z.ZodArray<z.ZodUnion<readonly [z.ZodObject
424
440
  export type TaskFile = z.infer<typeof TaskFileSchema>;
425
441
  /**
426
442
  * Schema for per-dimension threshold values.
443
+ * Uses a dynamic record to support all evaluation modes, not just literacy.
427
444
  * Keys use kebab-case to match YAML convention; the threshold engine
428
445
  * normalizes to camelCase for comparison against FeatureScore fields.
429
446
  */
430
- export declare const ThresholdDimensionsSchema: z.ZodObject<{
431
- "code-correctness": z.ZodOptional<z.ZodNumber>;
432
- "doc-coverage": z.ZodOptional<z.ZodNumber>;
433
- "task-completion": z.ZodOptional<z.ZodNumber>;
434
- }, z.core.$strip>;
447
+ export declare const ThresholdDimensionsSchema: z.ZodRecord<z.ZodString, z.ZodNumber>;
435
448
  /** Inferred TypeScript type for threshold dimension overrides. */
436
449
  export type ThresholdDimensions = z.infer<typeof ThresholdDimensionsSchema>;
437
450
  /**
@@ -441,11 +454,7 @@ export type ThresholdDimensions = z.infer<typeof ThresholdDimensionsSchema>;
441
454
  export declare const ThresholdDefaultsSchema: z.ZodObject<{
442
455
  ceiling: z.ZodOptional<z.ZodNumber>;
443
456
  composite: z.ZodNumber;
444
- dimensions: z.ZodOptional<z.ZodObject<{
445
- "code-correctness": z.ZodOptional<z.ZodNumber>;
446
- "doc-coverage": z.ZodOptional<z.ZodNumber>;
447
- "task-completion": z.ZodOptional<z.ZodNumber>;
448
- }, z.core.$strip>>;
457
+ dimensions: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
449
458
  "doc-lift": z.ZodOptional<z.ZodNumber>;
450
459
  }, z.core.$strip>;
451
460
  /** Inferred TypeScript type for threshold defaults. */
@@ -485,21 +494,13 @@ export declare const ThresholdConfigSchema: z.ZodObject<{
485
494
  areas: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodObject<{
486
495
  ceiling: z.ZodOptional<z.ZodOptional<z.ZodNumber>>;
487
496
  composite: z.ZodOptional<z.ZodNumber>;
488
- dimensions: z.ZodOptional<z.ZodOptional<z.ZodObject<{
489
- "code-correctness": z.ZodOptional<z.ZodNumber>;
490
- "doc-coverage": z.ZodOptional<z.ZodNumber>;
491
- "task-completion": z.ZodOptional<z.ZodNumber>;
492
- }, z.core.$strip>>>;
497
+ dimensions: z.ZodOptional<z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>>;
493
498
  "doc-lift": z.ZodOptional<z.ZodOptional<z.ZodNumber>>;
494
499
  }, z.core.$strip>>>;
495
500
  defaults: z.ZodObject<{
496
501
  ceiling: z.ZodOptional<z.ZodNumber>;
497
502
  composite: z.ZodNumber;
498
- dimensions: z.ZodOptional<z.ZodObject<{
499
- "code-correctness": z.ZodOptional<z.ZodNumber>;
500
- "doc-coverage": z.ZodOptional<z.ZodNumber>;
501
- "task-completion": z.ZodOptional<z.ZodNumber>;
502
- }, z.core.$strip>>;
503
+ dimensions: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
503
504
  "doc-lift": z.ZodOptional<z.ZodNumber>;
504
505
  }, z.core.$strip>;
505
506
  regression: z.ZodOptional<z.ZodObject<{
@@ -31,23 +31,66 @@ export const RubricTemplateSchema = z.object({
31
31
  .array(z.string().min(1))
32
32
  .min(1, "scale must have at least one entry"),
33
33
  });
34
+ /**
35
+ * A named weight profile — maps dimension names to weights (must sum to 1.0).
36
+ * Each profile is a self-contained scoring formula used for a specific
37
+ * (mode, variant) pair.
38
+ */
39
+ const WeightProfileSchema = z
40
+ .record(z.string(), z.number().min(0).max(1))
41
+ .refine((w) => {
42
+ const sum = Object.values(w).reduce((s, v) => s + v, 0);
43
+ return Math.abs(sum - 1.0) < 0.001;
44
+ }, { message: "profile weights must sum to 1.0" });
45
+ /**
46
+ * Mode-to-profile bindings — maps (mode, perspective) pairs to profile names.
47
+ *
48
+ * Flat form (most modes):
49
+ * { "mcp-server": { gold: "mcp-behavior" } }
50
+ *
51
+ * Nested form (literacy mode with variant sub-keys):
52
+ * { literacy: { baseline: { gold: "default", baseline: "output-only" }, agentic: { gold: "default" } } }
53
+ *
54
+ * The nested form adds a variant level between mode and perspective,
55
+ * allowing a single canonical mode to host multiple scoring variants.
56
+ */
57
+ const ModeProfileEntrySchema = z.union([
58
+ z.string(),
59
+ z.record(z.string(), z.string()),
60
+ ]);
61
+ const ModeProfilesSchema = z.record(z.string(), z.record(z.string(), ModeProfileEntrySchema));
34
62
  /**
35
63
  * Schema for the full config/rubrics.yaml config file.
36
64
  *
37
- * Each dimension is scored on a uniform 0–100 scale. The `weights` section
38
- * defines how dimensions are combined into a composite score (must sum to 1.0).
65
+ * Each dimension is scored on a uniform 0–100 scale. Named scoring profiles
66
+ * define how dimensions are combined into composite scores. Mode-profile
67
+ * bindings declare which profile to use for each (mode, variant) pair.
68
+ *
69
+ * Supports both the new `profiles` format and the legacy flat `weights`
70
+ * format for backward compatibility.
71
+ *
72
+ * @see docs/design-docs/named-scoring-profiles.md
39
73
  */
40
- export const RubricConfigSchema = z.object({
74
+ export const RubricConfigSchema = z
75
+ .object({
41
76
  footer: z.string().min(1, "footer must be a non-empty string"),
77
+ "mode-profiles": ModeProfilesSchema.optional(),
78
+ profiles: z
79
+ .record(z.string(), WeightProfileSchema)
80
+ .refine((p) => "default" in p, {
81
+ message: "profiles must include a 'default' profile",
82
+ })
83
+ .optional(),
42
84
  templates: z
43
85
  .record(z.string(), RubricTemplateSchema)
44
86
  .refine((t) => Object.keys(t).length > 0, {
45
87
  message: "templates must have at least one entry",
46
88
  }),
47
- weights: z.record(z.string(), z.number().min(0).max(1)).refine((w) => {
48
- const sum = Object.values(w).reduce((s, v) => s + v, 0);
49
- return Math.abs(sum - 1.0) < 0.001;
50
- }, { message: "weights must sum to 1.0" }),
89
+ // Legacy: flat weight map. Treated as a single profile named "default".
90
+ weights: WeightProfileSchema.optional(),
91
+ })
92
+ .refine((c) => c.profiles !== undefined || c.weights !== undefined, {
93
+ message: "rubrics.yaml must have either 'profiles' or 'weights'",
51
94
  });
52
95
  // ---------------------------------------------------------------------------
53
96
  // Feature registry schema — validates config/features.yaml (Phase 3c)
@@ -246,14 +289,11 @@ export const TaskFileSchema = z
246
289
  // ---------------------------------------------------------------------------
247
290
  /**
248
291
  * Schema for per-dimension threshold values.
292
+ * Uses a dynamic record to support all evaluation modes, not just literacy.
249
293
  * Keys use kebab-case to match YAML convention; the threshold engine
250
294
  * normalizes to camelCase for comparison against FeatureScore fields.
251
295
  */
252
- export const ThresholdDimensionsSchema = z.object({
253
- "code-correctness": z.number().min(0).max(100).optional(),
254
- "doc-coverage": z.number().min(0).max(100).optional(),
255
- "task-completion": z.number().min(0).max(100).optional(),
256
- });
296
+ export const ThresholdDimensionsSchema = z.record(z.string(), z.number().min(0).max(100));
257
297
  /**
258
298
  * Schema for threshold defaults (and per-area overrides).
259
299
  * All fields are optional in per-area overrides; defaults must have composite.
@@ -18,10 +18,15 @@ export declare const ScheduleEntrySchema: z.ZodObject<{
18
18
  cron: z.ZodString;
19
19
  enabled: z.ZodDefault<z.ZodBoolean>;
20
20
  mode: z.ZodDefault<z.ZodEnum<{
21
- agentic: "agentic";
21
+ custom: "custom";
22
+ literacy: "literacy";
23
+ "mcp-server": "mcp-server";
24
+ "agent-harness": "agent-harness";
25
+ "knowledge-probe": "knowledge-probe";
22
26
  baseline: "baseline";
23
- full: "full";
27
+ agentic: "agentic";
24
28
  observed: "observed";
29
+ full: "full";
25
30
  }>>;
26
31
  name: z.ZodString;
27
32
  publish: z.ZodDefault<z.ZodBoolean>;
@@ -53,10 +58,15 @@ export declare const SchedulesFileSchema: z.ZodObject<{
53
58
  cron: z.ZodString;
54
59
  enabled: z.ZodDefault<z.ZodBoolean>;
55
60
  mode: z.ZodDefault<z.ZodEnum<{
56
- agentic: "agentic";
61
+ custom: "custom";
62
+ literacy: "literacy";
63
+ "mcp-server": "mcp-server";
64
+ "agent-harness": "agent-harness";
65
+ "knowledge-probe": "knowledge-probe";
57
66
  baseline: "baseline";
58
- full: "full";
67
+ agentic: "agentic";
59
68
  observed: "observed";
69
+ full: "full";
60
70
  }>>;
61
71
  name: z.ZodString;
62
72
  publish: z.ZodDefault<z.ZodBoolean>;
@@ -11,6 +11,7 @@
11
11
  * @see docs/design-docs/report-store/implementation.md — Phase 5
12
12
  */
13
13
  import { z } from "zod";
14
+ import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
14
15
  // ---------------------------------------------------------------------------
15
16
  // Cron expression validation
16
17
  // ---------------------------------------------------------------------------
@@ -34,8 +35,11 @@ export const ScheduleEntrySchema = z.object({
34
35
  cron: CronSchema,
35
36
  /** Whether this schedule is active */
36
37
  enabled: z.boolean().default(true),
37
- /** Evaluation mode */
38
- mode: z.enum(["agentic", "baseline", "full", "observed"]).default("baseline"),
38
+ /**
39
+ * Evaluation mode accepts both canonical and legacy names.
40
+ * Legacy names must pass through normalizeMode() before entering typed pipeline code.
41
+ */
42
+ mode: z.enum(RAW_EVAL_MODES).default("baseline"),
39
43
  /** Human-readable schedule name (used as report tag) */
40
44
  name: z
41
45
  .string()
@@ -17,10 +17,10 @@
17
17
  import { z } from "zod";
18
18
  /** All supported sink types as a Zod union. */
19
19
  export declare const SinkTypeSchema: z.ZodEnum<{
20
- webhook: "webhook";
21
20
  bigquery: "bigquery";
22
21
  "github-comment": "github-comment";
23
22
  slack: "slack";
23
+ webhook: "webhook";
24
24
  }>;
25
25
  /** Supported sink type string literal union. */
26
26
  export type SinkType = z.infer<typeof SinkTypeSchema>;
@@ -25,12 +25,21 @@ export function formatComparisonMarkdown(report) {
25
25
  lines.push("");
26
26
  lines.push(`**Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)}** (${overallIcon} ${deltaStr(overall)})`);
27
27
  lines.push("");
28
- // Per-area table
29
- lines.push("| Feature | Baseline | Current | Delta | Task | Code | Docs |");
30
- lines.push("|---------|----------|---------|-------|------|------|------|");
28
+ // Derive dimension columns from the first area's keys (all areas share the
29
+ // same scoring profile, so the key set is uniform).
30
+ const dimKeys = report.areas.length > 0
31
+ ? Object.keys(report.areas[0].dimensions)
32
+ : Object.keys(report.deltas.perDimension);
33
+ // Per-area table — columns are dynamic
34
+ const dimHeaders = dimKeys.map(kebabToTitleCase);
35
+ const headerRow = ["Feature", "Baseline", "Current", "Delta", ...dimHeaders];
36
+ const separatorRow = headerRow.map(() => "------");
37
+ lines.push(`| ${headerRow.join(" | ")} |`);
38
+ lines.push(`|${separatorRow.join("|")}|`);
31
39
  for (const a of report.areas) {
32
40
  const icon = changeIcon(a.change);
33
- lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${icon} ${deltaStr(a.delta)} | ${deltaStr(a.dimensions.taskCompletion.delta)} | ${deltaStr(a.dimensions.codeCorrectness.delta)} | ${deltaStr(a.dimensions.docCoverage.delta)} |`);
41
+ const dimCells = dimKeys.map((k) => deltaStr(a.dimensions[k]?.delta ?? 0));
42
+ lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${icon} ${deltaStr(a.delta)} | ${dimCells.join(" | ")} |`);
34
43
  }
35
44
  lines.push("");
36
45
  // Summary
@@ -55,9 +64,9 @@ export function formatComparisonMarkdown(report) {
55
64
  const dim = report.deltas.perDimension;
56
65
  lines.push("| Dimension | Delta |");
57
66
  lines.push("|-----------|-------|");
58
- lines.push(`| Task Completion | ${deltaStr(dim.taskCompletion)} |`);
59
- lines.push(`| Code Correctness | ${deltaStr(dim.codeCorrectness)} |`);
60
- lines.push(`| Doc Coverage | ${deltaStr(dim.docCoverage)} |`);
67
+ for (const k of Object.keys(dim)) {
68
+ lines.push(`| ${kebabToTitleCase(k)} | ${deltaStr(dim[k])} |`);
69
+ }
61
70
  lines.push(`| Doc Lift | ${deltaStr(report.deltas.docLift)} |`);
62
71
  if (report.deltas.cost !== undefined) {
63
72
  const costStr = report.deltas.cost > 0
@@ -91,29 +100,51 @@ export function formatComparisonTable(report) {
91
100
  : "unchanged");
92
101
  lines.push(` Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)} (${overallIcon} ${deltaStr(overall)})`);
93
102
  lines.push("");
94
- // Per-dimension averages
103
+ // Per-dimension averages — derived dynamically from the report
95
104
  const dim = report.deltas.perDimension;
105
+ const dimKeys = report.areas.length > 0
106
+ ? Object.keys(report.areas[0].dimensions)
107
+ : Object.keys(dim);
96
108
  lines.push(" Dimension averages:");
97
- lines.push(` Task Completion: ${deltaStr(dim.taskCompletion)}`);
98
- lines.push(` Code Correctness: ${deltaStr(dim.codeCorrectness)}`);
99
- lines.push(` Doc Coverage: ${deltaStr(dim.docCoverage)}`);
100
- lines.push(` Doc Lift: ${deltaStr(report.deltas.docLift)}`);
109
+ // Pad labels to the longest dimension label for alignment
110
+ const dimLabels = dimKeys.map(kebabToTitleCase);
111
+ // +1 for the colon appended to each label
112
+ const maxLabelLen = Math.max(...dimLabels.map((l) => l.length + 1), "Doc Lift:".length);
113
+ for (let i = 0; i < dimKeys.length; i++) {
114
+ lines.push(` ${(dimLabels[i] + ":").padEnd(maxLabelLen)} ${deltaStr(dim[dimKeys[i]] ?? 0)}`);
115
+ }
116
+ lines.push(` ${"Doc Lift:".padEnd(maxLabelLen)} ${deltaStr(report.deltas.docLift)}`);
101
117
  if (report.deltas.cost !== undefined) {
102
- lines.push(` Cost: ${report.deltas.cost > 0 ? "+" : ""}$${report.deltas.cost.toFixed(4)}`);
118
+ lines.push(` ${"Cost:".padEnd(maxLabelLen)} ${report.deltas.cost > 0 ? "+" : ""}$${report.deltas.cost.toFixed(4)}`);
103
119
  }
104
120
  lines.push("");
105
- // Per-area table
121
+ // Per-area table — columns are dynamic
106
122
  lines.push("-".repeat(80));
107
123
  lines.push("PER-AREA BREAKDOWN");
108
124
  lines.push("-".repeat(80));
109
125
  lines.push("");
110
- const h = "| Feature Area | Baseline | Experiment | Delta | Task | Code | Docs |";
111
- const sep = "|---------------------|----------|------------|-------|------|------|------|";
112
- lines.push(h);
113
- lines.push(sep);
126
+ const dimHeaders = dimKeys.map(kebabToTitleCase);
127
+ const colWidths = dimHeaders.map((h) => Math.max(h.length, 4));
128
+ const hCols = [
129
+ "Feature Area".padEnd(19),
130
+ "Baseline".padStart(8),
131
+ "Experiment".padStart(10),
132
+ "Delta".padStart(5),
133
+ ...dimHeaders.map((h, i) => h.padStart(colWidths[i])),
134
+ ];
135
+ const sepCols = [
136
+ "-".repeat(21),
137
+ "-".repeat(10),
138
+ "-".repeat(12),
139
+ "-".repeat(7),
140
+ ...colWidths.map((w) => "-".repeat(w + 2)),
141
+ ];
142
+ lines.push(`| ${hCols.join(" | ")} |`);
143
+ lines.push(`|${sepCols.join("|")}|`);
114
144
  for (const a of report.areas) {
115
145
  const icon = changeIcon(a.change);
116
- lines.push(`| ${icon} ${a.area.padEnd(17)} | ${String(a.baseline).padStart(8)} | ${String(a.experiment).padStart(10)} | ${deltaStr(a.delta).padStart(5)} | ${deltaStr(a.dimensions.taskCompletion.delta).padStart(4)} | ${deltaStr(a.dimensions.codeCorrectness.delta).padStart(4)} | ${deltaStr(a.dimensions.docCoverage.delta).padStart(4)} |`);
146
+ const dimCells = dimKeys.map((k, i) => deltaStr(a.dimensions[k]?.delta ?? 0).padStart(colWidths[i]));
147
+ lines.push(`| ${icon} ${a.area.padEnd(17)} | ${String(a.baseline).padStart(8)} | ${String(a.experiment).padStart(10)} | ${deltaStr(a.delta).padStart(5)} | ${dimCells.join(" | ")} |`);
117
148
  }
118
149
  lines.push("");
119
150
  // Classification summary
@@ -187,3 +218,10 @@ function deltaStr(d) {
187
218
  return `${Math.round(d)}`;
188
219
  return "0";
189
220
  }
221
+ /** Convert kebab-case dimension name to title case (e.g. 'task-completion' → 'Task Completion') */
222
+ function kebabToTitleCase(name) {
223
+ return name
224
+ .split("-")
225
+ .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
226
+ .join(" ");
227
+ }
@@ -7,6 +7,7 @@
7
7
  * Extracted from packages/eval/src/lib/ during the Ports & Adapters
8
8
  * migration (Phase 4e).
9
9
  */
10
- export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
10
+ export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
11
11
  export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
12
+ export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, type AggregationStrategy, type AreaScore, type AssertionScore, type DimensionScore, type EnsembleGradingConfig, type GraderTransitionConfig, type TaskScore, type TaskScoreOptions, } from "./scoring-engine.js";
12
13
  export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "./config-helpers.js";
@@ -7,6 +7,7 @@
7
7
  * Extracted from packages/eval/src/lib/ during the Ports & Adapters
8
8
  * migration (Phase 4e).
9
9
  */
10
- export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
10
+ export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
11
11
  export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
12
+ export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, } from "./scoring-engine.js";
12
13
  export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "./config-helpers.js";