@sanity/ailf 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288)
  1. package/config/features.ts +23 -0
  2. package/config/models.ts +83 -0
  3. package/config/prompts.ts +16 -0
  4. package/config/rubrics.ts +225 -0
  5. package/config/schedules.ts +47 -0
  6. package/config/sinks.ts +37 -0
  7. package/config/sources.ts +21 -0
  8. package/config/thresholds.ts +61 -0
  9. package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
  10. package/dist/_vendor/ailf-core/config-helpers.js +150 -0
  11. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  12. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  13. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  14. package/dist/_vendor/ailf-core/index.js +5 -0
  15. package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
  16. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  17. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  18. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  19. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  20. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  21. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  22. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
  23. package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
  24. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
  25. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
  26. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -29
  27. package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -8
  28. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  29. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  30. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  31. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  32. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  33. package/dist/_vendor/ailf-core/services/index.js +2 -1
  34. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  35. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  36. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  37. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  38. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  39. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  40. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  41. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  42. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
  43. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  44. package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
  45. package/dist/_vendor/ailf-core/types/index.js +8 -1
  46. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
  47. package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
  48. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  49. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  50. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  51. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  52. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  53. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  54. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  55. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  56. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  57. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  58. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  59. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  60. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  61. package/dist/_vendor/ailf-shared/index.js +0 -1
  62. package/dist/adapters/api-client/build-request.js +14 -13
  63. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  64. package/dist/adapters/config-sources/file-config-adapter.js +38 -12
  65. package/dist/adapters/config-sources/index.d.ts +2 -0
  66. package/dist/adapters/config-sources/index.js +1 -0
  67. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  68. package/dist/adapters/config-sources/ts-config-loader.js +133 -0
  69. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  70. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  71. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  72. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  73. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  74. package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
  75. package/dist/adapters/task-sources/index.d.ts +1 -0
  76. package/dist/adapters/task-sources/index.js +1 -0
  77. package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
  78. package/dist/adapters/task-sources/repo-task-source.js +69 -16
  79. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  80. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  81. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  82. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  83. package/dist/cli.js +0 -2
  84. package/dist/commands/baseline.js +4 -1
  85. package/dist/commands/calculate-scores.js +1 -1
  86. package/dist/commands/coverage-audit.js +7 -1
  87. package/dist/commands/explain-handler.js +25 -23
  88. package/dist/commands/fetch-docs.js +3 -2
  89. package/dist/commands/generate-configs.js +1 -1
  90. package/dist/commands/interactive.js +11 -7
  91. package/dist/commands/pipeline-action.d.ts +2 -0
  92. package/dist/commands/pipeline-action.js +16 -6
  93. package/dist/commands/pipeline.d.ts +1 -0
  94. package/dist/commands/pipeline.js +4 -2
  95. package/dist/commands/pr-comment.js +1 -1
  96. package/dist/commands/publish.js +2 -2
  97. package/dist/commands/readiness-report.js +13 -6
  98. package/dist/composition-root.d.ts +1 -1
  99. package/dist/composition-root.js +67 -4
  100. package/dist/orchestration/build-app-context.js +1 -0
  101. package/dist/orchestration/build-step-sequence.js +24 -6
  102. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  103. package/dist/orchestration/steps/fetch-docs-step.js +6 -4
  104. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  105. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  106. package/dist/orchestration/steps/generate-configs-step.js +245 -51
  107. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  108. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  109. package/dist/orchestration/steps/readiness-step.js +5 -6
  110. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  111. package/dist/orchestration/steps/run-eval-step.js +8 -7
  112. package/dist/pipeline/cache.d.ts +1 -1
  113. package/dist/pipeline/cache.js +36 -8
  114. package/dist/pipeline/calculate-scores.d.ts +2 -4
  115. package/dist/pipeline/calculate-scores.js +43 -113
  116. package/dist/pipeline/checks.js +2 -2
  117. package/dist/pipeline/compare.js +8 -8
  118. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  119. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  120. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  121. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  122. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  123. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  124. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  125. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  126. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  127. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
  128. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  129. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  130. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  131. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  132. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  133. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
  134. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  135. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  136. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  137. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  138. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  139. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  140. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  141. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  142. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  143. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  144. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  145. package/dist/pipeline/compiler/config-loader.js +111 -0
  146. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  147. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  148. package/dist/pipeline/compiler/hash.d.ts +11 -0
  149. package/dist/pipeline/compiler/hash.js +18 -0
  150. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  151. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  152. package/dist/pipeline/compiler/index.d.ts +29 -0
  153. package/dist/pipeline/compiler/index.js +45 -0
  154. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  155. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  156. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  157. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  158. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  159. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  160. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  161. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  162. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  163. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  164. package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
  165. package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
  166. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  167. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  168. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  169. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  170. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  171. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
  172. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
  173. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
  174. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  175. package/dist/pipeline/compiler/presets/index.js +8 -0
  176. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
  177. package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
  178. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  179. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  180. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  181. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  182. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  183. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  184. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  185. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  186. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  187. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  188. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  189. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  190. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  191. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  192. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  193. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  194. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  195. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  196. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  197. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  198. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  199. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  200. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  201. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  202. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  203. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  204. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  205. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  206. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  207. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  208. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  209. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  210. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  211. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  212. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  213. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  214. package/dist/pipeline/coverage-audit.d.ts +15 -5
  215. package/dist/pipeline/coverage-audit.js +41 -22
  216. package/dist/pipeline/eval-constants.d.ts +16 -6
  217. package/dist/pipeline/eval-constants.js +25 -4
  218. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  219. package/dist/pipeline/eval-fingerprint.js +8 -9
  220. package/dist/pipeline/expand-tasks.d.ts +19 -10
  221. package/dist/pipeline/expand-tasks.js +34 -28
  222. package/dist/pipeline/gap-analysis.d.ts +1 -1
  223. package/dist/pipeline/gap-analysis.js +2 -2
  224. package/dist/pipeline/generate-configs.d.ts +22 -4
  225. package/dist/pipeline/generate-configs.js +53 -24
  226. package/dist/pipeline/grader-api.d.ts +3 -3
  227. package/dist/pipeline/grader-api.js +5 -12
  228. package/dist/pipeline/grader-compare-runner.js +20 -27
  229. package/dist/pipeline/grader-comparison.d.ts +4 -8
  230. package/dist/pipeline/grader-comparison.js +11 -17
  231. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  232. package/dist/pipeline/grader-consistency-runner.js +16 -20
  233. package/dist/pipeline/grader-consistency.d.ts +6 -10
  234. package/dist/pipeline/grader-consistency.js +13 -32
  235. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  236. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  237. package/dist/pipeline/grader-sensitivity.js +10 -10
  238. package/dist/pipeline/grader-validate-runner.js +7 -5
  239. package/dist/pipeline/grader-validation.d.ts +2 -6
  240. package/dist/pipeline/grader-validation.js +14 -22
  241. package/dist/pipeline/map-request-to-config.js +6 -1
  242. package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
  243. package/dist/pipeline/mirror-repo-tasks.js +16 -15
  244. package/dist/pipeline/normalize-mode.d.ts +49 -0
  245. package/dist/pipeline/normalize-mode.js +64 -0
  246. package/dist/pipeline/plan.d.ts +5 -2
  247. package/dist/pipeline/plan.js +134 -78
  248. package/dist/pipeline/pr-comment.js +2 -0
  249. package/dist/pipeline/profile-resolution.d.ts +22 -14
  250. package/dist/pipeline/profile-resolution.js +41 -19
  251. package/dist/pipeline/provenance.d.ts +2 -2
  252. package/dist/pipeline/provenance.js +12 -17
  253. package/dist/pipeline/release-report.js +4 -4
  254. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  255. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  256. package/dist/pipeline/rubric-loader.d.ts +20 -0
  257. package/dist/pipeline/rubric-loader.js +37 -0
  258. package/dist/pipeline/validate.d.ts +4 -4
  259. package/dist/pipeline/validate.js +64 -53
  260. package/dist/schedules/loader.js +18 -8
  261. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  262. package/dist/scripts/migrate-task-mode.js +85 -0
  263. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  264. package/dist/scripts/validate-task-sources.d.ts +1 -1
  265. package/dist/scripts/validate-task-sources.js +15 -15
  266. package/dist/sinks/loader.js +5 -7
  267. package/dist/sources.d.ts +7 -7
  268. package/dist/sources.js +22 -24
  269. package/dist/webhook/dispatch.js +2 -1
  270. package/package.json +6 -3
  271. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  272. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  273. package/tasks/literacy/frameworks.task.ts +128 -0
  274. package/tasks/literacy/functions.task.ts +69 -0
  275. package/tasks/literacy/groq.task.ts +258 -0
  276. package/tasks/literacy/nextjs-live.task.ts +75 -0
  277. package/tasks/literacy/studio-setup.task.ts +131 -0
  278. package/tasks/literacy/visual-editing.task.ts +146 -0
  279. package/config/features.yaml +0 -116
  280. package/config/models.yaml +0 -116
  281. package/config/prompts.yaml +0 -75
  282. package/config/rubrics.yaml +0 -81
  283. package/config/schedules.yaml +0 -43
  284. package/config/sinks.yaml +0 -54
  285. package/config/sources.yaml +0 -51
  286. package/config/thresholds.yaml +0 -49
  287. package/dist/agent-observer/test-imports.d.ts +0 -7
  288. package/dist/agent-observer/test-imports.js +0 -185
@@ -1,6 +1,21 @@
1
1
  /**
2
2
  * pipeline/generate-configs.ts
3
3
  *
4
+ * @deprecated This is the LEGACY compilation path. New code should use the
5
+ * config compiler pipeline instead:
6
+ *
7
+ * import { compileLiteracyTasks } from "./compiler/literacy-bridge.js"
8
+ * import { buildTaskGraph, compileToPromptfoo } from "./compiler/index.js"
9
+ *
10
+ * This file is retained behind the `--legacy-compiler` CLI flag as an
11
+ * emergency fallback during the migration period. It will be removed once
12
+ * the new compiler has been validated in production.
13
+ *
14
+ * @see packages/eval/src/pipeline/compiler/ — the new compiler pipeline
15
+ * @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
16
+ *
17
+ * ---
18
+ *
4
19
  * Reads config/models.yaml (the central model registry) and generates all
5
20
  * promptfoo config files with the correct provider entries.
6
21
  *
@@ -19,12 +34,15 @@
19
34
  * @see docs/exec-plans/eliminate-lib-layer.md
20
35
  */
21
36
  import { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "../_vendor/ailf-core/index.js";
22
- import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
37
+ import { existsSync, readdirSync, writeFileSync } from "fs";
23
38
  import { resolve } from "path";
24
- import { dump, load } from "js-yaml";
39
+ import { dump } from "js-yaml";
25
40
  import { ConsoleLogger } from "../adapters/loggers/index.js";
41
+ import { loadConfigFile } from "./compiler/config-loader.js";
42
+ import { LITERACY_PROMPT_TEMPLATES } from "./compiler/mode-handlers/literacy-handler.js";
26
43
  import { expandTaskDefinitions, loadAndExpandTasks } from "./expand-tasks.js";
27
44
  import { validateModelsYaml } from "./validate.js";
45
+ import { LiteracyVariant } from "./normalize-mode.js";
28
46
  import { loadSource } from "../sources.js";
29
47
  // Re-export pure functions from core for backward compatibility.
30
48
  // Tests and other modules that previously imported from lib/generate-configs
@@ -44,32 +62,43 @@ export function discoverTaskFiles(rootDir) {
44
62
  .sort()
45
63
  .map((f) => `file://tasks/${f}`);
46
64
  }
47
- /** Load prompt templates from config/prompts.yaml. Throws if missing or malformed. */
65
+ /**
66
+ * Load prompt templates. Uses handler-owned literacy templates as defaults,
67
+ * with config/prompts.ts as an override layer for user customization.
68
+ */
48
69
  export function loadPrompts(rootDir) {
49
- const promptsPath = resolve(rootDir, "config", "prompts.yaml");
50
- if (!existsSync(promptsPath)) {
51
- throw new Error(`config/prompts.yaml not found at ${promptsPath}. This file is required — it defines the prompt templates for all evaluation modes.`);
52
- }
53
- const raw = readFileSync(promptsPath, "utf-8");
54
- const data = load(raw);
55
70
  const toPrompt = (entry) => ({
56
71
  id: entry.id,
57
72
  label: entry.label,
58
73
  raw: entry.template,
59
74
  });
60
- if (!data["with-docs"] || !data["without-docs"] || !data["agentic"]) {
61
- const missing = ["with-docs", "without-docs", "agentic"].filter((k) => !data[k]);
62
- throw new Error(`config/prompts.yaml is missing required keys: ${missing.join(", ")}. Each prompt must have id, label, and template fields.`);
75
+ // Load user overrides from config/prompts (may be empty after Wave 4)
76
+ let overrides = {};
77
+ try {
78
+ const loaded = loadConfigFile("prompts", rootDir).data;
79
+ // config/prompts.ts may export a Record (legacy) or an empty array (post-Wave 4)
80
+ if (loaded && !Array.isArray(loaded)) {
81
+ overrides = loaded;
82
+ }
83
+ }
84
+ catch {
85
+ // No config/prompts file — use handler defaults only
63
86
  }
87
+ // Handler-owned templates are the canonical source; overrides take precedence
64
88
  return {
65
- agentic: toPrompt(data["agentic"]),
66
- withDocs: toPrompt(data["with-docs"]),
67
- withoutDocs: toPrompt(data["without-docs"]),
89
+ agentic: overrides[LiteracyVariant.AGENTIC]
90
+ ? toPrompt(overrides[LiteracyVariant.AGENTIC])
91
+ : toPrompt(LITERACY_PROMPT_TEMPLATES[LiteracyVariant.AGENTIC]),
92
+ withDocs: overrides["with-docs"]
93
+ ? toPrompt(overrides["with-docs"])
94
+ : toPrompt(LITERACY_PROMPT_TEMPLATES["with-docs"]),
95
+ withoutDocs: overrides["without-docs"]
96
+ ? toPrompt(overrides["without-docs"])
97
+ : toPrompt(LITERACY_PROMPT_TEMPLATES["without-docs"]),
68
98
  };
69
99
  }
70
100
  function loadModels(rootDir) {
71
- const raw = readFileSync(resolve(rootDir, "config", "models.yaml"), "utf-8");
72
- return load(raw);
101
+ return loadConfigFile("models", rootDir).data;
73
102
  }
74
103
  // ---------------------------------------------------------------------------
75
104
  // Shared components
@@ -191,7 +220,7 @@ function generateAgenticConfig(models, tests, prompts, source, searchMode, allow
191
220
  };
192
221
  }
193
222
  function generateBaselineConfig(models, tests, prompts) {
194
- const baselineModels = models.models.filter((m) => modelMatchesMode(m, "baseline"));
223
+ const baselineModels = models.models.filter((m) => modelMatchesMode(m, LiteracyVariant.STANDARD));
195
224
  const providers = baselineModels.map((model) => ({
196
225
  config: mergeConfig(models.defaults, model.config),
197
226
  id: model.id,
@@ -217,7 +246,7 @@ function generateBaselineConfig(models, tests, prompts) {
217
246
  };
218
247
  }
219
248
  function generateObservedConfig(models, tests, prompts) {
220
- const observedModels = models.models.filter((m) => modelMatchesMode(m, "observed"));
249
+ const observedModels = models.models.filter((m) => modelMatchesMode(m, LiteracyVariant.OBSERVED));
221
250
  const providers = observedModels.map((model) => {
222
251
  const modelName = extractModelName(model.id);
223
252
  return {
@@ -293,7 +322,7 @@ export function generateConfigs(options) {
293
322
  const filter = options.filter?.areas || options.filter?.taskIds
294
323
  ? options.filter
295
324
  : undefined;
296
- // Expand tasks — use TaskDefinition[] from TaskSource when provided,
325
+ // Expand tasks — use GeneralizedTaskDefinition[] from TaskSource when provided,
297
326
  // otherwise fall back to loading from tasks/*.yaml files.
298
327
  let entries;
299
328
  let agenticEntries;
@@ -303,16 +332,16 @@ export function generateConfigs(options) {
303
332
  taskCount: options.tasks.length,
304
333
  taskIds: options.tasks.map((t) => t.id),
305
334
  });
306
- const baselineResult = expandTaskDefinitions(options.tasks, rootDir, "baseline");
335
+ const baselineResult = expandTaskDefinitions(options.tasks, rootDir, LiteracyVariant.STANDARD);
307
336
  entries = baselineResult.entries;
308
337
  log.info(` Expanded ${baselineResult.stats.totalTasks} task(s) → ${baselineResult.stats.expandedTotal} test entries (from TaskSource)`);
309
- const agenticResult = expandTaskDefinitions(options.tasks, rootDir, "agentic");
338
+ const agenticResult = expandTaskDefinitions(options.tasks, rootDir, LiteracyVariant.AGENTIC);
310
339
  agenticEntries = agenticResult.entries;
311
340
  log.info(` Agentic: ${agenticResult.stats.expandedTotal} entries (gold only, no baseline)`);
312
341
  }
313
342
  else {
314
343
  // Legacy path — read from tasks/*.yaml files
315
- const { entries: baselineEntries, stats } = loadAndExpandTasks(rootDir, filter, "baseline", log);
344
+ const { entries: baselineEntries, stats } = loadAndExpandTasks(rootDir, filter, LiteracyVariant.STANDARD, log);
316
345
  entries = baselineEntries;
317
346
  log.info(` Expanded ${stats.singleDefinitions} task(s) → ${stats.expandedTotal} test entries`);
318
347
  if (stats.legacyEntries > 0) {
@@ -328,7 +357,7 @@ export function generateConfigs(options) {
328
357
  }
329
358
  log.info(` Scoped to: ${parts.join("; ")}`);
330
359
  }
331
- const { entries: agenticFromYaml, stats: agenticStats } = loadAndExpandTasks(rootDir, filter, "agentic", log);
360
+ const { entries: agenticFromYaml, stats: agenticStats } = loadAndExpandTasks(rootDir, filter, LiteracyVariant.AGENTIC, log);
332
361
  agenticEntries = agenticFromYaml;
333
362
  log.info(` Agentic: ${agenticStats.expandedTotal} entries (gold only, no baseline)`);
334
363
  }
@@ -7,7 +7,7 @@
7
7
  * grader model prefix. Reads the appropriate API key from environment.
8
8
  *
9
9
  * Also exports `loadGraderModel()` to resolve the grader from
10
- * `config/models.yaml`.
10
+ * `config/models`.
11
11
  *
12
12
  * Migrated from lib/grader-api.ts — no module-level side effects, no
13
13
  * process.exit(), accepts rootDir as parameter for file-based operations.
@@ -26,11 +26,11 @@ interface ProviderConfig {
26
26
  */
27
27
  export declare function gradeOnce(graderModel: string, responseText: string, rubricText: string, logger?: Logger): Promise<null | number>;
28
28
  /**
29
- * Load the grader model from `config/models.yaml`.
29
+ * Load the grader model from `config/models`.
30
30
  * Returns both the model ID and human-readable label.
31
31
  * Falls back to `openai:gpt-5` if not configured.
32
32
  *
33
- * @throws Error if config/models.yaml is not found
33
+ * @throws Error if config/models is not found
34
34
  */
35
35
  export declare function loadGraderModel(rootDir: string): {
36
36
  id: string;
@@ -7,15 +7,13 @@
7
7
  * grader model prefix. Reads the appropriate API key from environment.
8
8
  *
9
9
  * Also exports `loadGraderModel()` to resolve the grader from
10
- * `config/models.yaml`.
10
+ * `config/models`.
11
11
  *
12
12
  * Migrated from lib/grader-api.ts — no module-level side effects, no
13
13
  * process.exit(), accepts rootDir as parameter for file-based operations.
14
14
  */
15
- import { existsSync, readFileSync } from "fs";
16
- import { join } from "path";
17
- import { load } from "js-yaml";
18
15
  import { ConsoleLogger } from "../adapters/loggers/index.js";
16
+ import { loadConfigFile } from "./compiler/config-loader.js";
19
17
  // ---------------------------------------------------------------------------
20
18
  // Public API
21
19
  // ---------------------------------------------------------------------------
@@ -63,19 +61,14 @@ ${rubricText}
63
61
  }
64
62
  }
65
63
  /**
66
- * Load the grader model from `config/models.yaml`.
64
+ * Load the grader model from `config/models`.
67
65
  * Returns both the model ID and human-readable label.
68
66
  * Falls back to `openai:gpt-5` if not configured.
69
67
  *
70
- * @throws Error if config/models.yaml is not found
68
+ * @throws Error if config/models is not found
71
69
  */
72
70
  export function loadGraderModel(rootDir) {
73
- const modelsPath = join(rootDir, "config", "models.yaml");
74
- if (!existsSync(modelsPath)) {
75
- throw new Error(`config/models.yaml not found at ${modelsPath}`);
76
- }
77
- const raw = readFileSync(modelsPath, "utf-8");
78
- const data = load(raw);
71
+ const data = loadConfigFile("models", rootDir).data;
79
72
  return {
80
73
  id: data?.grader?.id ?? "openai:gpt-5",
81
74
  label: data?.grader?.label ?? "GPT-5 (grader)",
@@ -14,8 +14,8 @@
14
14
  */
15
15
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
16
16
  import { join } from "path";
17
- import { load } from "js-yaml";
18
17
  import { ConsoleLogger } from "../adapters/loggers/index.js";
18
+ import { loadConfigFile } from "./compiler/config-loader.js";
19
19
  import { compareGraders, } from "./grader-comparison.js";
20
20
  import { classifyCorrelation } from "./grader-validation.js";
21
21
  import { gradeOnce } from "./grader-api.js";
@@ -23,26 +23,20 @@ import { gradeOnce } from "./grader-api.js";
23
23
  // Internal helpers
24
24
  // ---------------------------------------------------------------------------
25
25
  function classifyDimension(component) {
26
+ // Prefer structured metadata — pass through any dimension name directly,
27
+ // enabling non-literacy profiles (MCP, agent, knowledge-probe)
26
28
  const metadata = component.assertion?.metadata;
27
29
  if (metadata?.dimension) {
28
- switch (metadata.dimension) {
29
- case "code-correctness":
30
- return "codeCorrectness";
31
- case "doc-coverage":
32
- return "docCoverage";
33
- case "task-completion":
34
- return "taskCompletion";
35
- default:
36
- return null;
37
- }
30
+ return metadata.dimension;
38
31
  }
32
+ // Fallback: heuristic name matching (returns kebab-case)
39
33
  const value = (component.assertion?.value ?? "").toLowerCase();
40
34
  if (value.includes("task completion"))
41
- return "taskCompletion";
35
+ return "task-completion";
42
36
  if (value.includes("code correctness"))
43
- return "codeCorrectness";
37
+ return "code-correctness";
44
38
  if (value.includes("documentation coverage") || value.includes("hallucinate"))
45
- return "docCoverage";
39
+ return "doc-coverage";
46
40
  return null;
47
41
  }
48
42
  function detectFeatureArea(description) {
@@ -101,15 +95,10 @@ function extractJudgments(file) {
101
95
  }
102
96
  /**
103
97
  * Load config: resolve baseline grader and candidate graders.
104
- * Candidate overrides take precedence over config/models.yaml.
98
+ * Candidate overrides take precedence over config/models.
105
99
  */
106
100
  function loadConfig(rootDir, candidateOverrides) {
107
- const modelsPath = join(rootDir, "config", "models.yaml");
108
- if (!existsSync(modelsPath)) {
109
- throw new Error(`config/models.yaml not found at ${modelsPath}`);
110
- }
111
- const raw = readFileSync(modelsPath, "utf-8");
112
- const data = load(raw);
101
+ const data = loadConfigFile("models", rootDir).data;
113
102
  const baseline = {
114
103
  id: data?.grader?.id ?? "openai:gpt-5",
115
104
  label: data?.grader?.label ?? "GPT-5 (grader)",
@@ -158,11 +147,15 @@ export function formatComparisonReport(result) {
158
147
  const sep = "|------------------|-------------|--------|---------|-------|";
159
148
  lines.push(h);
160
149
  lines.push(sep);
161
- const dims = [
162
- { data: pair.perDimension.taskCompletion, name: "Task Completion" },
163
- { data: pair.perDimension.codeCorrectness, name: "Code Correctness" },
164
- { data: pair.perDimension.docCoverage, name: "Doc Coverage" },
165
- ];
150
+ // Derive display rows dynamically from whatever dimensions are present
151
+ const dims = Object.entries(pair.perDimension).map(([key, data]) => ({
152
+ data,
153
  // kebab-case → Title Case (e.g. 'task-completion' → 'Task Completion')
154
+ name: key
155
+ .split("-")
156
+ .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
157
+ .join(" "),
158
+ }));
166
159
  for (const { data, name } of dims) {
167
160
  const biasStr = data.bias > 0 ? `+${data.bias}` : `${data.bias}`;
168
161
  lines.push(`| ${name.padEnd(16)} | r=${String(data.correlation).padStart(9)} | ${biasStr.padStart(6)} | ${String(data.meanAbsDiff).padStart(7)} | ${String(data.count).padStart(5)} |`);
@@ -208,7 +201,7 @@ export async function runGraderCompare(options) {
208
201
  const { baseline, candidates } = loadConfig(rootDir, options.candidates);
209
202
  if (candidates.length === 0) {
210
203
  throw new Error("No candidate graders configured. " +
211
- "Add grader-candidates to config/models.yaml or pass --candidate.");
204
+ "Add grader-candidates to config/models or pass --candidate.");
212
205
  }
213
206
  // Load eval results
214
207
  if (!existsSync(resultsPath)) {
@@ -51,12 +51,8 @@ export interface GraderPairComparison {
51
51
  graderB: string;
52
52
  /** Mean absolute difference between scores */
53
53
  meanAbsDiff: number;
54
- /** Per-dimension comparisons */
55
- perDimension: {
56
- taskCompletion: DimensionPairComparison;
57
- codeCorrectness: DimensionPairComparison;
58
- docCoverage: DimensionPairComparison;
59
- };
54
+ /** Per-dimension comparisons (keyed by kebab-case dimension name) */
55
+ perDimension: Record<string, DimensionPairComparison>;
60
56
  }
61
57
  /** Recommendation for a candidate grader */
62
58
  export interface GraderRecommendation {
@@ -71,8 +67,8 @@ export interface GraderRecommendation {
71
67
  export interface GraderScore {
72
68
  /** Feature area (e.g., "groq") */
73
69
  area: string;
74
- /** Which scoring dimension */
75
- dimension: "codeCorrectness" | "docCoverage" | "taskCompletion";
70
+ /** Which scoring dimension (kebab-case, e.g. 'task-completion') */
71
+ dimension: string;
76
72
  /** Score assigned by this grader (0–100) */
77
73
  score: number;
78
74
  /** Task ID (e.g., "groq-blog-queries") */
@@ -68,16 +68,9 @@ function comparePair(a, b) {
68
68
  // Find paired observations (present in both graders)
69
69
  const pairedA = [];
70
70
  const pairedB = [];
71
- const dimPairsA = {
72
- codeCorrectness: [],
73
- docCoverage: [],
74
- taskCompletion: [],
75
- };
76
- const dimPairsB = {
77
- codeCorrectness: [],
78
- docCoverage: [],
79
- taskCompletion: [],
80
- };
71
+ // Group by dimension dynamically — works with any dimension names
72
+ const dimPairsA = {};
73
+ const dimPairsB = {};
81
74
  for (const sA of a.scores) {
82
75
  const key = `${sA.taskId}::${sA.dimension}`;
83
76
  const scoreB = bScoreMap.get(key);
@@ -85,8 +78,13 @@ function comparePair(a, b) {
85
78
  continue;
86
79
  pairedA.push(sA.score);
87
80
  pairedB.push(scoreB);
88
- dimPairsA[sA.dimension].push(sA.score);
89
- dimPairsB[sA.dimension].push(scoreB);
81
+ (dimPairsA[sA.dimension] ??= []).push(sA.score);
82
+ (dimPairsB[sA.dimension] ??= []).push(scoreB);
83
+ }
84
+ // Build perDimension from all dimensions observed in paired data
85
+ const perDimension = {};
86
+ for (const dim of Object.keys(dimPairsA)) {
87
+ perDimension[dim] = computeDimensionPair(dimPairsA[dim], dimPairsB[dim]);
90
88
  }
91
89
  return {
92
90
  bias: computeBias(pairedA, pairedB),
@@ -94,11 +92,7 @@ function comparePair(a, b) {
94
92
  graderA: a.modelId,
95
93
  graderB: b.modelId,
96
94
  meanAbsDiff: computeMeanAbsDiff(pairedA, pairedB),
97
- perDimension: {
98
- codeCorrectness: computeDimensionPair(dimPairsA.codeCorrectness, dimPairsB.codeCorrectness),
99
- docCoverage: computeDimensionPair(dimPairsA.docCoverage, dimPairsB.docCoverage),
100
- taskCompletion: computeDimensionPair(dimPairsA.taskCompletion, dimPairsB.taskCompletion),
101
- },
95
+ perDimension,
102
96
  };
103
97
  }
104
98
  /** Mean signed difference (B - A). Positive = B scores higher. */
@@ -17,7 +17,6 @@
17
17
  import { type Logger } from "../_vendor/ailf-core/index.d.ts";
18
18
  import type { RawPromptfooFile } from "./calculate-scores.js";
19
19
  import { type GraderConsistency } from "./grader-consistency.js";
20
- import type { DimensionName } from "./types.js";
21
20
  /** Options for the grader consistency runner. */
22
21
  export interface GraderConsistencyRunnerOptions {
23
22
  /** Logger for structured output. Falls back to ConsoleLogger if omitted. */
@@ -34,8 +33,8 @@ interface GradingJudgment {
34
33
  area: string;
35
34
  /** Task description */
36
35
  description: string;
37
- /** Scoring dimension */
38
- dimension: DimensionName;
36
+ /** Scoring dimension (kebab-case, e.g. 'task-completion') */
37
+ dimension: string;
39
38
  /** The original score from the eval run */
40
39
  originalScore: number;
41
40
  /** Provider (model under test) */
@@ -23,28 +23,20 @@ import { analyzeConsistency, } from "./grader-consistency.js";
23
23
  // Rubric dimension classification (similar to calculate-scores)
24
24
  // ---------------------------------------------------------------------------
25
25
  function classifyDimension(component) {
26
- // Prefer structured metadata
26
+ // Prefer structured metadata — pass through any dimension name directly,
27
+ // enabling non-literacy profiles (MCP, agent, knowledge-probe)
27
28
  const metadata = component.assertion?.metadata;
28
29
  if (metadata?.dimension) {
29
- switch (metadata.dimension) {
30
- case "code-correctness":
31
- return "codeCorrectness";
32
- case "doc-coverage":
33
- return "docCoverage";
34
- case "task-completion":
35
- return "taskCompletion";
36
- default:
37
- return null;
38
- }
30
+ return metadata.dimension;
39
31
  }
40
- // Fallback: heuristic name matching
32
+ // Fallback: heuristic name matching (returns kebab-case)
41
33
  const value = (component.assertion?.value ?? "").toLowerCase();
42
34
  if (value.includes("task completion"))
43
- return "taskCompletion";
35
+ return "task-completion";
44
36
  if (value.includes("code correctness"))
45
- return "codeCorrectness";
37
+ return "code-correctness";
46
38
  if (value.includes("documentation coverage") || value.includes("hallucinate"))
47
- return "docCoverage";
39
+ return "doc-coverage";
48
40
  return null;
49
41
  }
50
42
  // ---------------------------------------------------------------------------
@@ -140,11 +132,15 @@ export function formatConsistencyReport(result, graderModel) {
140
132
  const sep = "|------------------|-------|-------|-----------|-----------| ";
141
133
  lines.push(h);
142
134
  lines.push(sep);
143
- const dims = [
144
- { data: result.perDimension.taskCompletion, name: "Task Completion" },
145
- { data: result.perDimension.codeCorrectness, name: "Code Correctness" },
146
- { data: result.perDimension.docCoverage, name: "Doc Coverage" },
147
- ];
135
+ // Derive display rows dynamically from whatever dimensions are present
136
+ const dims = Object.entries(result.perDimension).map(([key, data]) => ({
137
+ data,
138
  // kebab-case → Title Case (e.g. 'task-completion' → 'Task Completion')
139
+ name: key
140
+ .split("-")
141
+ .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
142
+ .join(" "),
143
+ }));
148
144
  for (const { data, name } of dims) {
149
145
  lines.push(`| ${name.padEnd(16)} | ${String(data.avgStdDev).padStart(5)} | ${String(data.maxStdDev).padStart(5)} | ${String(data.avgRange).padStart(9)} | ${String(data.judgmentCount).padStart(9)} |`);
150
146
  }
@@ -35,12 +35,8 @@ export interface GraderConsistency {
35
35
  judgments: JudgmentConsistency[];
36
36
  /** Maximum standard deviation observed (worst-case noise) */
37
37
  maxStdDev: number;
38
- /** Per-dimension consistency */
39
- perDimension: {
40
- taskCompletion: DimensionConsistency;
41
- codeCorrectness: DimensionConsistency;
42
- docCoverage: DimensionConsistency;
43
- };
38
+ /** Per-dimension consistency (keyed by kebab-case dimension name) */
39
+ perDimension: Record<string, DimensionConsistency>;
44
40
  /** Recommended noise threshold for comparisons (2× max dimension avgStdDev) */
45
41
  recommendedThreshold: number;
46
42
  /** Number of replications per judgment */
@@ -52,8 +48,8 @@ export interface GraderConsistency {
52
48
  export interface JudgmentConsistency {
53
49
  /** Feature area */
54
50
  area: string;
55
- /** Scoring dimension */
56
- dimension: "codeCorrectness" | "docCoverage" | "taskCompletion";
51
+ /** Scoring dimension (kebab-case, e.g. 'task-completion') */
52
+ dimension: string;
57
53
  /** Max score observed */
58
54
  max: number;
59
55
  /** Mean score across replications */
@@ -75,8 +71,8 @@ export interface JudgmentConsistency {
75
71
  export interface ReplicatedGrading {
76
72
  /** Feature area (derived from task description) */
77
73
  area: string;
78
- /** Which scoring dimension this rubric measures */
79
- dimension: "codeCorrectness" | "docCoverage" | "taskCompletion";
74
+ /** Which scoring dimension this rubric measures (kebab-case, e.g. 'task-completion') */
75
+ dimension: string;
80
76
  /** Provider (model under test) that produced the original response */
81
77
  providerId?: string;
82
78
  /** The scores from each replication (length = N replications) */
@@ -31,26 +31,7 @@ export function analyzeConsistency(gradings) {
31
31
  generatedAt: new Date().toISOString(),
32
32
  judgments: [],
33
33
  maxStdDev: 0,
34
- perDimension: {
35
- codeCorrectness: {
36
- avgRange: 0,
37
- avgStdDev: 0,
38
- judgmentCount: 0,
39
- maxStdDev: 0,
40
- },
41
- docCoverage: {
42
- avgRange: 0,
43
- avgStdDev: 0,
44
- judgmentCount: 0,
45
- maxStdDev: 0,
46
- },
47
- taskCompletion: {
48
- avgRange: 0,
49
- avgStdDev: 0,
50
- judgmentCount: 0,
51
- maxStdDev: 0,
52
- },
53
- },
34
+ perDimension: {},
54
35
  recommendedThreshold: 0,
55
36
  replications: 0,
56
37
  totalJudgments: 0,
@@ -58,17 +39,16 @@ export function analyzeConsistency(gradings) {
58
39
  }
59
40
  // Analyze each judgment
60
41
  const judgments = gradings.map(analyzeJudgment);
61
- // Group by dimension
62
- const byDimension = {
63
- codeCorrectness: judgments.filter((j) => j.dimension === "codeCorrectness"),
64
- docCoverage: judgments.filter((j) => j.dimension === "docCoverage"),
65
- taskCompletion: judgments.filter((j) => j.dimension === "taskCompletion"),
66
- };
67
- const perDimension = {
68
- codeCorrectness: aggregateDimension(byDimension.codeCorrectness),
69
- docCoverage: aggregateDimension(byDimension.docCoverage),
70
- taskCompletion: aggregateDimension(byDimension.taskCompletion),
71
- };
42
+ // Group by dimension dynamically — works with any dimension names
43
+ const byDimension = {};
44
+ for (const j of judgments) {
45
+ ;
46
+ (byDimension[j.dimension] ??= []).push(j);
47
+ }
48
+ const perDimension = {};
49
+ for (const [dim, dimJudgments] of Object.entries(byDimension)) {
50
+ perDimension[dim] = aggregateDimension(dimJudgments);
51
+ }
72
52
  // Overall stats
73
53
  const allStdDevs = judgments.map((j) => j.stdDev);
74
54
  const allRanges = judgments.map((j) => j.range);
@@ -76,7 +56,8 @@ export function analyzeConsistency(gradings) {
76
56
  // Recommended threshold: 2× the worst (highest) per-dimension avgStdDev.
77
57
  // This means a comparison delta must exceed 2σ of the noisiest dimension
78
58
  // to be classified as a real change rather than grader variance.
79
- const maxDimensionAvgStdDev = Math.max(perDimension.taskCompletion.avgStdDev, perDimension.codeCorrectness.avgStdDev, perDimension.docCoverage.avgStdDev);
59
+ const dimAvgStdDevs = Object.values(perDimension).map((d) => d.avgStdDev);
60
+ const maxDimensionAvgStdDev = dimAvgStdDevs.length > 0 ? Math.max(...dimAvgStdDevs) : 0;
80
61
  const recommendedThreshold = Math.ceil(maxDimensionAvgStdDev * 2);
81
62
  // Sort judgments by stdDev descending (noisiest first)
82
63
  const sortedJudgments = [...judgments].sort((a, b) => b.stdDev - a.stdDev);
@@ -119,11 +119,13 @@ export function formatSensitivityReport(result) {
119
119
  const sep = "|------------------|-------------|---------|-------|-------|";
120
120
  lines.push(h);
121
121
  lines.push(sep);
122
- const dims = [
123
- { data: result.perDimension.taskCompletion, name: "Task Completion" },
124
- { data: result.perDimension.codeCorrectness, name: "Code Correctness" },
125
- { data: result.perDimension.docCoverage, name: "Doc Coverage" },
126
- ];
122
+ const dims = Object.entries(result.perDimension).map(([key, data]) => ({
123
+ data,
124
+ name: key
125
+ .split(/[-_]/)
126
+ .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
127
+ .join(" "),
128
+ }));
127
129
  for (const { data, name } of dims) {
128
130
  lines.push(`| ${name.padEnd(16)} | ${String(data.concordanceRate + "%").padStart(11)} | ${String(data.avgSeparation).padStart(7)} | ${String(data.tiedRate + "%").padStart(5)} | ${String(data.pairCount).padStart(5)} |`);
129
131
  }
@@ -58,12 +58,8 @@ export interface GraderSensitivityResult {
58
58
  generatedAt: string;
59
59
  /** Grader model used */
60
60
  graderModel: string;
61
- /** Per-dimension sensitivity metrics */
62
- perDimension: {
63
- taskCompletion: DimensionSensitivity;
64
- codeCorrectness: DimensionSensitivity;
65
- docCoverage: DimensionSensitivity;
66
- };
61
+ /** Per-dimension sensitivity metrics (keyed by dimension name) */
62
+ perDimension: Record<string, DimensionSensitivity>;
67
63
  /** Total paired comparisons analyzed */
68
64
  totalPairs: number;
69
65
  }
@@ -30,11 +30,15 @@ export function analyzeSensitivity(pairs, graderModel) {
30
30
  // Overall concordance and separation
31
31
  const { avgSeparation, concordanceRate, tiedRate: _tiedRate, } = computeMetrics(pairs);
32
32
  // Per-dimension (based on the grading dimension, not the target dimension)
33
- const perDimension = {
34
- codeCorrectness: computeMetrics(pairs.filter((p) => p.dimension === "codeCorrectness")),
35
- docCoverage: computeMetrics(pairs.filter((p) => p.dimension === "docCoverage")),
36
- taskCompletion: computeMetrics(pairs.filter((p) => p.dimension === "taskCompletion")),
37
- };
33
+ const dimGroups = {};
34
+ for (const p of pairs) {
35
+ ;
36
+ (dimGroups[p.dimension] ??= []).push(p);
37
+ }
38
+ const perDimension = {};
39
+ for (const [dim, dimPairs] of Object.entries(dimGroups)) {
40
+ perDimension[dim] = computeMetrics(dimPairs);
41
+ }
38
42
  // Cross-dimension: on-target (dimension matches targetDimension) vs off-target
39
43
  const onTargetPairs = pairs.filter((p) => p.dimension === p.targetDimension);
40
44
  const offTargetPairs = pairs.filter((p) => p.dimension !== p.targetDimension);
@@ -130,11 +134,7 @@ function emptyResult(graderModel) {
130
134
  failedPairs: [],
131
135
  generatedAt: new Date().toISOString(),
132
136
  graderModel,
133
- perDimension: {
134
- codeCorrectness: emptyDim,
135
- docCoverage: emptyDim,
136
- taskCompletion: emptyDim,
137
- },
137
+ perDimension: {},
138
138
  totalPairs: 0,
139
139
  };
140
140
  }