@sanity/ailf 0.4.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290)
  1. package/config/features.ts +23 -0
  2. package/config/models.ts +83 -0
  3. package/config/prompts.ts +16 -0
  4. package/config/rubrics.ts +225 -0
  5. package/config/schedules.ts +47 -0
  6. package/config/sinks.ts +37 -0
  7. package/config/sources.ts +21 -0
  8. package/config/thresholds.ts +61 -0
  9. package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
  10. package/dist/_vendor/ailf-core/config-helpers.js +150 -0
  11. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  12. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  13. package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
  14. package/dist/_vendor/ailf-core/examples/index.js +10 -10
  15. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  16. package/dist/_vendor/ailf-core/index.js +5 -0
  17. package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
  18. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  19. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  20. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  21. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  22. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  23. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  24. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
  25. package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
  26. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
  27. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
  28. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +32 -31
  29. package/dist/_vendor/ailf-core/schemas/pipeline.js +52 -12
  30. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  31. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  32. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  33. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  34. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  35. package/dist/_vendor/ailf-core/services/index.js +2 -1
  36. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  37. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  38. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  39. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  40. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  41. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  42. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  43. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  44. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  46. package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
  47. package/dist/_vendor/ailf-core/types/index.js +8 -1
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
  50. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  51. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  52. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  53. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  54. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  55. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  56. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  57. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  58. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  59. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  60. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  61. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  62. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  63. package/dist/_vendor/ailf-shared/index.js +0 -1
  64. package/dist/adapters/api-client/build-request.js +14 -13
  65. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  66. package/dist/adapters/config-sources/file-config-adapter.js +38 -12
  67. package/dist/adapters/config-sources/index.d.ts +2 -0
  68. package/dist/adapters/config-sources/index.js +1 -0
  69. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  70. package/dist/adapters/config-sources/ts-config-loader.js +133 -0
  71. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  72. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  73. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  74. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  75. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  76. package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
  77. package/dist/adapters/task-sources/index.d.ts +1 -0
  78. package/dist/adapters/task-sources/index.js +1 -0
  79. package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
  80. package/dist/adapters/task-sources/repo-task-source.js +69 -16
  81. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  82. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  83. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  84. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  85. package/dist/cli.js +0 -2
  86. package/dist/commands/baseline.js +4 -1
  87. package/dist/commands/calculate-scores.js +1 -1
  88. package/dist/commands/coverage-audit.js +7 -1
  89. package/dist/commands/explain-handler.js +25 -23
  90. package/dist/commands/fetch-docs.js +3 -2
  91. package/dist/commands/generate-configs.js +1 -1
  92. package/dist/commands/interactive.js +11 -7
  93. package/dist/commands/pipeline-action.d.ts +2 -0
  94. package/dist/commands/pipeline-action.js +16 -6
  95. package/dist/commands/pipeline.d.ts +1 -0
  96. package/dist/commands/pipeline.js +4 -2
  97. package/dist/commands/pr-comment.js +1 -1
  98. package/dist/commands/publish.js +2 -2
  99. package/dist/commands/readiness-report.js +13 -6
  100. package/dist/composition-root.d.ts +1 -1
  101. package/dist/composition-root.js +67 -4
  102. package/dist/orchestration/build-app-context.js +1 -0
  103. package/dist/orchestration/build-step-sequence.js +24 -6
  104. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  105. package/dist/orchestration/steps/fetch-docs-step.js +6 -4
  106. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  107. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  108. package/dist/orchestration/steps/generate-configs-step.js +245 -51
  109. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  110. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  111. package/dist/orchestration/steps/readiness-step.js +5 -6
  112. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  113. package/dist/orchestration/steps/run-eval-step.js +8 -7
  114. package/dist/pipeline/cache.d.ts +1 -1
  115. package/dist/pipeline/cache.js +36 -8
  116. package/dist/pipeline/calculate-scores.d.ts +5 -7
  117. package/dist/pipeline/calculate-scores.js +74 -153
  118. package/dist/pipeline/checks.js +2 -2
  119. package/dist/pipeline/compare.js +8 -8
  120. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  121. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  122. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  123. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  124. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  125. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  126. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  127. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  128. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  129. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
  130. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  131. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  132. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  133. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  134. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  135. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
  136. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  137. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  138. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  139. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  140. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  141. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  142. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  143. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  144. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  145. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  146. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  147. package/dist/pipeline/compiler/config-loader.js +111 -0
  148. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  149. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  150. package/dist/pipeline/compiler/hash.d.ts +11 -0
  151. package/dist/pipeline/compiler/hash.js +18 -0
  152. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  153. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  154. package/dist/pipeline/compiler/index.d.ts +29 -0
  155. package/dist/pipeline/compiler/index.js +45 -0
  156. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  157. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  158. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  159. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  160. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  161. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  162. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  163. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  164. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  165. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  166. package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
  167. package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
  168. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  169. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  170. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  171. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  172. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  173. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
  174. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
  175. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
  176. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  177. package/dist/pipeline/compiler/presets/index.js +8 -0
  178. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
  179. package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
  180. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  181. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  182. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  183. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  184. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  185. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  186. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  187. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  188. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  189. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  190. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  191. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  192. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  193. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  194. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  195. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  196. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  197. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  198. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  199. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  200. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  201. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  202. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  203. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  204. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  205. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  206. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  207. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  208. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  209. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  210. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  211. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  212. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  213. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  214. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  215. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  216. package/dist/pipeline/coverage-audit.d.ts +15 -5
  217. package/dist/pipeline/coverage-audit.js +41 -22
  218. package/dist/pipeline/eval-constants.d.ts +16 -6
  219. package/dist/pipeline/eval-constants.js +25 -4
  220. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  221. package/dist/pipeline/eval-fingerprint.js +8 -9
  222. package/dist/pipeline/expand-tasks.d.ts +23 -14
  223. package/dist/pipeline/expand-tasks.js +37 -31
  224. package/dist/pipeline/gap-analysis.d.ts +1 -1
  225. package/dist/pipeline/gap-analysis.js +2 -2
  226. package/dist/pipeline/generate-configs.d.ts +22 -4
  227. package/dist/pipeline/generate-configs.js +53 -24
  228. package/dist/pipeline/grader-api.d.ts +3 -3
  229. package/dist/pipeline/grader-api.js +5 -12
  230. package/dist/pipeline/grader-compare-runner.js +20 -27
  231. package/dist/pipeline/grader-comparison.d.ts +4 -8
  232. package/dist/pipeline/grader-comparison.js +11 -17
  233. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  234. package/dist/pipeline/grader-consistency-runner.js +18 -21
  235. package/dist/pipeline/grader-consistency.d.ts +6 -10
  236. package/dist/pipeline/grader-consistency.js +13 -32
  237. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  238. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  239. package/dist/pipeline/grader-sensitivity.js +10 -10
  240. package/dist/pipeline/grader-validate-runner.js +7 -5
  241. package/dist/pipeline/grader-validation.d.ts +2 -6
  242. package/dist/pipeline/grader-validation.js +14 -22
  243. package/dist/pipeline/map-request-to-config.js +6 -1
  244. package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
  245. package/dist/pipeline/mirror-repo-tasks.js +16 -15
  246. package/dist/pipeline/normalize-mode.d.ts +49 -0
  247. package/dist/pipeline/normalize-mode.js +64 -0
  248. package/dist/pipeline/plan.d.ts +5 -2
  249. package/dist/pipeline/plan.js +134 -78
  250. package/dist/pipeline/pr-comment.js +2 -0
  251. package/dist/pipeline/profile-resolution.d.ts +47 -0
  252. package/dist/pipeline/profile-resolution.js +91 -0
  253. package/dist/pipeline/provenance.d.ts +2 -2
  254. package/dist/pipeline/provenance.js +12 -17
  255. package/dist/pipeline/release-report.js +4 -4
  256. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  257. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  258. package/dist/pipeline/rubric-loader.d.ts +20 -0
  259. package/dist/pipeline/rubric-loader.js +37 -0
  260. package/dist/pipeline/validate.d.ts +4 -4
  261. package/dist/pipeline/validate.js +64 -53
  262. package/dist/schedules/loader.js +18 -8
  263. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  264. package/dist/scripts/migrate-task-mode.js +85 -0
  265. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  266. package/dist/scripts/validate-task-sources.d.ts +1 -1
  267. package/dist/scripts/validate-task-sources.js +15 -15
  268. package/dist/sinks/loader.js +5 -7
  269. package/dist/sources.d.ts +7 -7
  270. package/dist/sources.js +22 -24
  271. package/dist/webhook/dispatch.js +2 -1
  272. package/package.json +6 -3
  273. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  274. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  275. package/tasks/literacy/frameworks.task.ts +128 -0
  276. package/tasks/literacy/functions.task.ts +69 -0
  277. package/tasks/literacy/groq.task.ts +258 -0
  278. package/tasks/literacy/nextjs-live.task.ts +75 -0
  279. package/tasks/literacy/studio-setup.task.ts +131 -0
  280. package/tasks/literacy/visual-editing.task.ts +146 -0
  281. package/config/features.yaml +0 -116
  282. package/config/models.yaml +0 -116
  283. package/config/prompts.yaml +0 -75
  284. package/config/rubrics.yaml +0 -62
  285. package/config/schedules.yaml +0 -43
  286. package/config/sinks.yaml +0 -54
  287. package/config/sources.yaml +0 -51
  288. package/config/thresholds.yaml +0 -49
  289. package/dist/agent-observer/test-imports.d.ts +0 -7
  290. package/dist/agent-observer/test-imports.js +0 -185
@@ -10,7 +10,7 @@
10
10
  *
11
11
  * Cache invalidation triggers:
12
12
  * - Content change: any input file's content changes → hash changes → miss
13
- * - Config change: config/models.yaml, config/sources.yaml, tasks/*.yaml changes → miss
13
+ * - Config change: config/models, config/sources, or any task file (tasks/*.yaml, *.yml, *.task.ts, *.task.js) changes → miss
14
14
  * - Manual bypass: --no-cache flag skips all cache lookups
15
15
  * - Cache clear: delete results/cache/ to start fresh
16
16
  */
@@ -10,7 +10,7 @@
10
10
  *
11
11
  * Cache invalidation triggers:
12
12
  * - Content change: any input file's content changes → hash changes → miss
13
- * - Config change: config/models.yaml, config/sources.yaml, tasks/*.yaml changes → miss
13
+ * - Config change: config/models, config/sources, or any task file (tasks/*.yaml, *.yml, *.task.ts, *.task.js) changes → miss
14
14
  * - Manual bypass: --no-cache flag skips all cache lookups
15
15
  * - Cache clear: delete results/cache/ to start fresh
16
16
  */
@@ -18,6 +18,19 @@ import { createHash } from "crypto";
18
18
  import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync, } from "fs";
19
19
  import { join, resolve } from "path";
20
20
  // ---------------------------------------------------------------------------
21
+ // Helpers
22
+ // ---------------------------------------------------------------------------
23
+ /** Resolve first existing config file (matches loadConfigFile priority chain) */
24
+ function resolveConfig(rootDir, name) {
25
+ const r = (f) => resolve(rootDir, f);
26
+ for (const ext of [".ts", ".js", ".yaml", ".yml", ".json"]) {
27
+ const p = r(`config/${name}${ext}`);
28
+ if (existsSync(p))
29
+ return p;
30
+ }
31
+ return undefined;
32
+ }
33
+ // ---------------------------------------------------------------------------
21
34
  // Constants
22
35
  // ---------------------------------------------------------------------------
23
36
  const CACHE_DIR_NAME = "cache";
@@ -79,7 +92,10 @@ export function getStepInputPaths(rootDir, step) {
79
92
  const isBaseline = step === "eval-baseline" || step === "eval";
80
93
  const isAgentic = step === "eval-agentic" || step === "eval";
81
94
  const isObserved = step === "eval-observed" || step === "eval";
82
- const paths = [r("config/models.yaml")];
95
+ const paths = [];
96
+ const modelsPath = resolveConfig(rootDir, "models");
97
+ if (modelsPath)
98
+ paths.push(modelsPath);
83
99
  // Config files — only the relevant ones for this mode
84
100
  if (isBaseline) {
85
101
  paths.push(r("promptfooconfig.yaml"));
@@ -130,25 +146,37 @@ export function getStepInputPaths(rootDir, step) {
130
146
  return paths;
131
147
  }
132
148
  case "fetch-docs": {
133
- // Inputs: config/sources.yaml, config/models.yaml, task files (which contain inline mappings)
134
- const paths = [r("config/sources.yaml"), r("config/models.yaml")];
149
+ // Inputs: config sources + models, task files
150
+ const paths = [];
151
+ const sourcesPath = resolveConfig(rootDir, "sources");
152
+ const modelsPath2 = resolveConfig(rootDir, "models");
153
+ if (sourcesPath)
154
+ paths.push(sourcesPath);
155
+ if (modelsPath2)
156
+ paths.push(modelsPath2);
135
157
  // Include all task files (they define feature areas)
136
158
  const tasksDir = r("tasks");
137
159
  if (existsSync(tasksDir)) {
138
160
  const taskFiles = readdirSync(tasksDir)
139
- .filter((f) => f.endsWith(".yaml") || f.endsWith(".yml"))
161
+ .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
140
162
  .map((f) => join(tasksDir, f));
141
163
  paths.push(...taskFiles);
142
164
  }
143
165
  return paths;
144
166
  }
145
167
  case "generate-configs": {
146
- // Inputs: config/models.yaml, config/sources.yaml, all task files
147
- const paths = [r("config/models.yaml"), r("config/sources.yaml")];
168
+ // Inputs: config models + sources, all task files
169
+ const paths = [];
170
+ const modelsPath3 = resolveConfig(rootDir, "models");
171
+ const sourcesPath2 = resolveConfig(rootDir, "sources");
172
+ if (modelsPath3)
173
+ paths.push(modelsPath3);
174
+ if (sourcesPath2)
175
+ paths.push(sourcesPath2);
148
176
  const tasksDir = r("tasks");
149
177
  if (existsSync(tasksDir)) {
150
178
  const taskFiles = readdirSync(tasksDir)
151
- .filter((f) => f.endsWith(".yaml") || f.endsWith(".yml"))
179
+ .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
152
180
  .map((f) => join(tasksDir, f));
153
181
  paths.push(...taskFiles);
154
182
  }
@@ -1,9 +1,7 @@
1
- import type { Logger, TestSummary } from "../_vendor/ailf-core/index.d.ts";
1
+ import { type ActualScoreEntry, type ComponentResult, type Logger, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
2
2
  import { type ResolvedSourceConfig } from "../sources.js";
3
- import { type ActualScoreEntry, type ComponentResult } from "../_vendor/ailf-core/index.d.ts";
4
3
  import type { GraderJudgment, PerModelEntry } from "./types.js";
5
- export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.d.ts";
6
- export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
4
+ export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
7
5
  export interface PromptfooResultsWrapper {
8
6
  results: RawTestResult[];
9
7
  stats: {
@@ -64,7 +62,7 @@ export interface RawTestResult {
64
62
  * @returns Record keyed by model ID, or null if only one model was used
65
63
  * (per-model breakdown is redundant when there's only one model).
66
64
  */
67
- export declare function calculateScoresPerModel(resultsPath: string, weights: Record<string, number>): null | PerModelEntry[];
65
+ export declare function calculateScoresPerModel(resultsPath: string, goldProfile: Record<string, number>, baselineProfile: Record<string, number>): null | PerModelEntry[];
68
66
  /**
69
67
  * Extract grader judgments (reason text + scores) from evaluation results.
70
68
  *
@@ -82,7 +80,7 @@ export declare function extractGraderJudgments(resultsPath: string): GraderJudgm
82
80
  *
83
81
  * Returns a record keyed by feature area with the composite actual score.
84
82
  */
85
- export declare function scoreAgenticResults(resultsPath: string, weights: Record<string, number>): Record<string, ActualScoreEntry>;
83
+ export declare function scoreAgenticResults(resultsPath: string, profile: Record<string, number>): Record<string, ActualScoreEntry>;
86
84
  /**
87
85
  * Score agentic results broken down by model.
88
86
  *
@@ -90,7 +88,7 @@ export declare function scoreAgenticResults(resultsPath: string, weights: Record
90
88
  * producing a map of model → feature → ActualScoreEntry.
91
89
  * Used to enrich the per-model breakdown with actual scores in full mode.
92
90
  */
93
- export declare function scoreAgenticResultsPerModel(resultsPath: string, weights: Record<string, number>): Record<string, Record<string, ActualScoreEntry>>;
91
+ export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>): Record<string, Record<string, ActualScoreEntry>>;
94
92
  /** Options for the calculate-scores main() function. */
95
93
  export interface CalculateScoresOptions {
96
94
  /** Allowed origins for source isolation reporting */
@@ -8,8 +8,11 @@
8
8
  * Code Correctness (0–100) — Is the code idiomatic and correct?
9
9
  * Doc Coverage (0–100) — Did docs provide the needed info?
10
10
  *
11
- * Dimensions are combined into a weighted composite (0–100) using weights
12
- * from config/rubrics.yaml (default: Task×0.50 + Code×0.25 + Docs×0.25).
11
+ * Dimensions are combined into a weighted composite (0–100) using named
12
+ * scoring profiles from config/rubrics. Gold (with-docs) entries use
13
+ * the "default" profile; baseline (without-docs) entries use "output-only"
14
+ * which excludes doc-coverage (undefined without docs).
15
+ * See docs/design-docs/named-scoring-profiles.md.
13
16
  *
14
17
  * Additionally compares with-docs vs without-docs scores to calculate
15
18
  * the "Doc Lift" — how much documentation helps vs parametric knowledge.
@@ -26,14 +29,17 @@
26
29
  */
27
30
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
28
31
  import { join } from "path";
32
+ import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
29
33
  import { calculateCost } from "../agent-observer/pricing.js";
30
34
  import { ConsoleLogger } from "../adapters/loggers/index.js";
35
+ import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
31
36
  import { checkResultsExist } from "./checks.js";
32
- import { loadRubricTemplates } from "./expand-tasks.js";
37
+ import { loadRubricTemplates } from "./rubric-loader.js";
38
+ import { resolveProfile } from "./profile-resolution.js";
33
39
  import { loadSource } from "../sources.js";
34
- import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
35
- import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
36
- // Re-export pure functions from core for backward compatibility.
40
+ import { LiteracyVariant } from "./normalize-mode.js";
41
+ import { scoreTestGroup } from "./compiler/scoring-bridge.js";
42
+ // Re-export from core for backward compatibility.
37
43
  // Existing imports from this file continue to work unchanged.
38
44
  export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
39
45
  /**
@@ -46,7 +52,7 @@ export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, par
46
52
  * @returns Record keyed by model ID, or null if only one model was used
47
53
  * (per-model breakdown is redundant when there's only one model).
48
54
  */
49
- export function calculateScoresPerModel(resultsPath, weights) {
55
+ export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfile) {
50
56
  const results = readAndNormalizeResults(resultsPath);
51
57
  // Group results by provider
52
58
  const byModel = {};
@@ -66,7 +72,7 @@ export function calculateScoresPerModel(resultsPath, weights) {
66
72
  }
67
73
  const perModel = [];
68
74
  for (const [modelId, { label, results: modelResults }] of Object.entries(byModel)) {
69
- const scores = scoreResults(modelResults, weights, modelId);
75
+ const scores = scoreResults(modelResults, goldProfile, baselineProfile, modelId);
70
76
  const totalTests = scores.reduce((s, sc) => s + sc.testCount, 0);
71
77
  const totalCost = scores.reduce((s, sc) => s + sc.totalCost, 0);
72
78
  const avgScore = scores.length > 0
@@ -133,14 +139,8 @@ export function extractGraderJudgments(resultsPath) {
133
139
  // Not JSON — use raw reason string
134
140
  }
135
141
  }
136
- // Map internal dimension names to hyphenated form
137
- const dimensionMap = {
138
- codeCorrectness: "code-correctness",
139
- docCoverage: "doc-coverage",
140
- taskCompletion: "task-completion",
141
- };
142
142
  judgments.push({
143
- dimension: dimensionMap[kind] ?? kind,
143
+ dimension: kind,
144
144
  modelId,
145
145
  reason,
146
146
  score,
@@ -277,7 +277,7 @@ function aggregateUrlReferences(resultsPath) {
277
277
  * verification report.
278
278
  */
279
279
  function buildSourceVerification(root, source, verificationCtx) {
280
- const mode = verificationCtx?.mode ?? "baseline";
280
+ const mode = verificationCtx?.mode ?? LiteracyVariant.STANDARD;
281
281
  const sourceUrl = source?.baseUrl ?? "default";
282
282
  const searchMode = verificationCtx?.searchMode;
283
283
  const allowedOrigins = verificationCtx?.allowedOrigins;
@@ -318,9 +318,9 @@ function buildSourceVerification(root, source, verificationCtx) {
318
318
  * Calculate overall scores (all models combined).
319
319
  * This is the original scoring path — backward compatible.
320
320
  */
321
- function calculateScores(resultsPath, weights) {
321
+ function calculateScores(resultsPath, goldProfile, baselineProfile) {
322
322
  const results = readAndNormalizeResults(resultsPath);
323
- return scoreResults(results, weights);
323
+ return scoreResults(results, goldProfile, baselineProfile);
324
324
  }
325
325
  /**
326
326
  * Extracts agent behavior summary from a test result's metadata.
@@ -495,13 +495,11 @@ function readAndNormalizeResults(resultsPath, log) {
495
495
  * used by both the overall scoring and per-model scoring paths.
496
496
  *
497
497
  * @param results Pre-filtered (valid) test results
498
- * @param weights Dimension weights from rubrics.yaml
499
- * @param modelId Optional model identifier to tag each FeatureScore
498
+ * @param goldProfile Weight profile for gold (with-docs) entries
499
+ * @param baselineProfile Weight profile for baseline (without-docs) entries
500
+ * @param modelId Optional model identifier to tag each FeatureScore
500
501
  */
501
- function scoreResults(results, weights, modelId) {
502
- const wTask = weights["task-completion"] ?? 0.5;
503
- const wCode = weights["code-correctness"] ?? 0.25;
504
- const wDoc = weights["doc-coverage"] ?? 0.25;
502
+ function scoreResults(results, goldProfile, baselineProfile, modelId) {
505
503
  // Group by feature + docs/no-docs
506
504
  const byFeature = {};
507
505
  for (const result of results) {
@@ -519,65 +517,28 @@ function scoreResults(results, weights, modelId) {
519
517
  }
520
518
  const scores = [];
521
519
  for (const [feature, data] of Object.entries(byFeature)) {
522
- // --- With docs ---
523
- let totalTask = 0;
524
- let totalCode = 0;
525
- let totalDoc = 0;
526
- let featureCost = 0;
527
- const countWithDocs = data.withDocs.length || 1;
528
- for (const test of data.withDocs) {
529
- featureCost += test.cost;
530
- for (const comp of test.gradingResult.componentResults) {
531
- if (comp.assertion?.type !== "llm-rubric") {
532
- continue;
533
- }
534
- const score = parseRubricScore(comp);
535
- const kind = classifyRubric(comp);
536
- if (kind === "taskCompletion") {
537
- totalTask += score;
538
- }
539
- else if (kind === "codeCorrectness") {
540
- totalCode += score;
541
- }
542
- else if (kind === "docCoverage") {
543
- totalDoc += score;
544
- }
545
- }
546
- }
547
- // Per-dimension averages (each 0–100)
548
- const avgTask = totalTask / countWithDocs;
549
- const avgCode = totalCode / countWithDocs;
550
- const avgDoc = totalDoc / countWithDocs;
551
- // Weighted composite (0–100)
552
- const withDocsTotal = avgTask * wTask + avgCode * wCode + avgDoc * wDoc;
553
- // --- Without docs (baseline) ---
554
- let baselineTotal = 0;
555
- let baselineCount = 0;
556
- for (const test of data.withoutDocs) {
557
- featureCost += test.cost;
558
- for (const comp of test.gradingResult.componentResults) {
559
- if (comp.assertion?.type !== "llm-rubric") {
560
- continue;
561
- }
562
- baselineTotal += parseRubricScore(comp);
563
- baselineCount++;
564
- }
565
- }
566
- const withoutDocsScore = baselineCount > 0 ? baselineTotal / baselineCount : 0;
567
- const ceilingScore = Math.round(withDocsTotal);
568
- const floorScore = Math.round(withoutDocsScore);
520
+ // --- With docs (gold / ceiling) — scored via 4-tier engine ---
521
+ const gold = scoreTestGroup(data.withDocs, goldProfile, feature);
522
+ // --- Without docs (baseline / floor) ---
523
+ // Uses the baseline profile (e.g. "output-only") which may exclude
524
+ // dimensions like doc-coverage that are undefined without docs.
525
+ // See docs/design-docs/named-scoring-profiles.md.
526
+ const baseline = scoreTestGroup(data.withoutDocs, baselineProfile, feature);
527
+ const featureCost = gold.totalCost + baseline.totalCost;
528
+ const ceilingScore = gold.composite;
529
+ const floorScore = baseline.composite;
569
530
  const docLift = ceilingScore - floorScore;
570
531
  const featureScore = {
571
532
  ceilingScore,
572
- codeCorrectness: Math.round(avgCode),
573
- docCoverage: Math.round(avgDoc),
533
+ codeCorrectness: gold.dimensions.codeCorrectness ?? 0,
534
+ docCoverage: gold.dimensions.docCoverage ?? 0,
574
535
  docLift,
575
536
  docQualityGap: 100 - ceilingScore,
576
537
  feature,
577
538
  floorScore,
578
539
  ...(modelId && { modelId }),
579
540
  negativeDocLift: docLift < 0,
580
- taskCompletion: Math.round(avgTask),
541
+ taskCompletion: gold.dimensions.taskCompletion ?? 0,
581
542
  testCount: data.withDocs.length,
582
543
  totalCost: featureCost,
583
544
  totalScore: ceilingScore,
@@ -597,11 +558,8 @@ function scoreResults(results, weights, modelId) {
597
558
  * Returns a record keyed by feature area with the composite actual score.
598
559
  */
599
560
  // ActualScoreEntry — imported from @sanity/ailf-core via pipeline/types.js
600
- export function scoreAgenticResults(resultsPath, weights) {
561
+ export function scoreAgenticResults(resultsPath, profile) {
601
562
  const results = readAndNormalizeResults(resultsPath);
602
- const wTask = weights["task-completion"] ?? 0.5;
603
- const wCode = weights["code-correctness"] ?? 0.25;
604
- const wDoc = weights["doc-coverage"] ?? 0.25;
605
563
  // Group by feature area
606
564
  const byFeature = {};
607
565
  for (const result of results) {
@@ -613,37 +571,14 @@ export function scoreAgenticResults(resultsPath, weights) {
613
571
  }
614
572
  const entries = {};
615
573
  for (const [feature, featureResults] of Object.entries(byFeature)) {
616
- let totalTask = 0;
617
- let totalCode = 0;
618
- let totalDoc = 0;
619
- let featureCost = 0;
620
- const count = featureResults.length || 1;
621
- for (const test of featureResults) {
622
- featureCost += test.cost;
623
- for (const comp of test.gradingResult.componentResults) {
624
- if (comp.assertion?.type !== "llm-rubric")
625
- continue;
626
- const score = parseRubricScore(comp);
627
- const kind = classifyRubric(comp);
628
- if (kind === "taskCompletion")
629
- totalTask += score;
630
- else if (kind === "codeCorrectness")
631
- totalCode += score;
632
- else if (kind === "docCoverage")
633
- totalDoc += score;
634
- }
635
- }
636
- const avgTask = totalTask / count;
637
- const avgCode = totalCode / count;
638
- const avgDoc = totalDoc / count;
639
- const actualScore = Math.round(avgTask * wTask + avgCode * wCode + avgDoc * wDoc);
574
+ const scored = scoreTestGroup(featureResults, profile, feature);
640
575
  entries[feature] = {
641
- actualScore,
642
- codeCorrectness: Math.round(avgCode),
643
- docCoverage: Math.round(avgDoc),
644
- taskCompletion: Math.round(avgTask),
576
+ actualScore: scored.composite,
577
+ codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
578
+ docCoverage: scored.dimensions.docCoverage ?? 0,
579
+ taskCompletion: scored.dimensions.taskCompletion ?? 0,
645
580
  testCount: featureResults.length,
646
- totalCost: featureCost,
581
+ totalCost: scored.totalCost,
647
582
  };
648
583
  }
649
584
  return entries;
@@ -655,11 +590,8 @@ export function scoreAgenticResults(resultsPath, weights) {
655
590
  * producing a map of model → feature → ActualScoreEntry.
656
591
  * Used to enrich the per-model breakdown with actual scores in full mode.
657
592
  */
658
- export function scoreAgenticResultsPerModel(resultsPath, weights) {
593
+ export function scoreAgenticResultsPerModel(resultsPath, profile) {
659
594
  const results = readAndNormalizeResults(resultsPath);
660
- const wTask = weights["task-completion"] ?? 0.5;
661
- const wCode = weights["code-correctness"] ?? 0.25;
662
- const wDoc = weights["doc-coverage"] ?? 0.25;
663
595
  // Group by model, then feature
664
596
  const byModel = {};
665
597
  for (const result of results) {
@@ -675,37 +607,14 @@ export function scoreAgenticResultsPerModel(resultsPath, weights) {
675
607
  for (const [modelId, features] of Object.entries(byModel)) {
676
608
  perModel[modelId] = {};
677
609
  for (const [feature, featureResults] of Object.entries(features)) {
678
- let totalTask = 0;
679
- let totalCode = 0;
680
- let totalDoc = 0;
681
- let featureCost = 0;
682
- const count = featureResults.length || 1;
683
- for (const test of featureResults) {
684
- featureCost += test.cost;
685
- for (const comp of test.gradingResult.componentResults) {
686
- if (comp.assertion?.type !== "llm-rubric")
687
- continue;
688
- const score = parseRubricScore(comp);
689
- const kind = classifyRubric(comp);
690
- if (kind === "taskCompletion")
691
- totalTask += score;
692
- else if (kind === "codeCorrectness")
693
- totalCode += score;
694
- else if (kind === "docCoverage")
695
- totalDoc += score;
696
- }
697
- }
698
- const avgTask = totalTask / count;
699
- const avgCode = totalCode / count;
700
- const avgDoc = totalDoc / count;
701
- const actualScore = Math.round(avgTask * wTask + avgCode * wCode + avgDoc * wDoc);
610
+ const scored = scoreTestGroup(featureResults, profile, feature);
702
611
  perModel[modelId][feature] = {
703
- actualScore,
704
- codeCorrectness: Math.round(avgCode),
705
- docCoverage: Math.round(avgDoc),
706
- taskCompletion: Math.round(avgTask),
612
+ actualScore: scored.composite,
613
+ codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
614
+ docCoverage: scored.dimensions.docCoverage ?? 0,
615
+ taskCompletion: scored.dimensions.taskCompletion ?? 0,
707
616
  testCount: featureResults.length,
708
- totalCost: featureCost,
617
+ totalCost: scored.totalCost,
709
618
  };
710
619
  }
711
620
  }
@@ -743,7 +652,7 @@ export function calculateAndWriteScores(options) {
743
652
  }
744
653
  }
745
654
  // Determine mode — controls which result files are read
746
- const mode = options.mode ?? "baseline";
655
+ const mode = options.mode ?? LiteracyVariant.STANDARD;
747
656
  const baselineResultsPath = options.resultsPath ?? join(ROOT, "results", "latest", "eval-results.json");
748
657
  // Agentic results path (only used in full mode)
749
658
  const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
@@ -760,10 +669,18 @@ export function calculateAndWriteScores(options) {
760
669
  if (source) {
761
670
  log.info(`Source: ${sourceName} (${source.baseUrl})`);
762
671
  }
763
- // Load dimension weights from rubrics.yaml
672
+ // Load rubric config and resolve scoring profiles per variant.
673
+ // Gold (with-docs) entries use the "default" profile (3 dimensions).
674
+ // Baseline (without-docs) entries use "output-only" (2 dimensions,
675
+ // doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
764
676
  const rubricConfig = loadRubricTemplates(ROOT);
765
- log.debug("Loaded rubric weights", { weights: rubricConfig.weights });
766
- const baselineScores = calculateScores(baselineResultsPath, rubricConfig.weights);
677
+ const goldProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.STANDARD);
678
+ const baselineProfileWeights = resolveProfile("literacy", LiteracyVariant.STANDARD, rubricConfig, LiteracyVariant.STANDARD);
679
+ log.debug("Loaded scoring profiles", {
680
+ gold: goldProfile,
681
+ baseline: baselineProfileWeights,
682
+ });
683
+ const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights);
767
684
  log.debug("Baseline scores calculated", {
768
685
  featureCount: baselineScores.length,
769
686
  features: baselineScores.map((s) => ({
@@ -773,7 +690,7 @@ export function calculateAndWriteScores(options) {
773
690
  docLift: s.docLift,
774
691
  })),
775
692
  });
776
- const perModel = calculateScoresPerModel(baselineResultsPath, rubricConfig.weights);
693
+ const perModel = calculateScoresPerModel(baselineResultsPath, goldProfile, baselineProfileWeights);
777
694
  const urlRefs = aggregateUrlReferences(baselineResultsPath);
778
695
  const sourceVerification = buildSourceVerification(ROOT, source, {
779
696
  allowedOrigins: options.allowedOrigins,
@@ -786,9 +703,10 @@ export function calculateAndWriteScores(options) {
786
703
  let agentBehavior = null;
787
704
  let sourceIsolation = null;
788
705
  let evaluationMode;
789
- if (mode === "full" && existsSync(agenticResultsPath)) {
706
+ if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
790
707
  log.info(`\nReading agentic results from: ${agenticResultsPath}`);
791
- const agenticScores = scoreAgenticResults(agenticResultsPath, rubricConfig.weights);
708
+ const agenticProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC);
709
+ const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile);
792
710
  log.debug("Agentic scores calculated", {
793
711
  featureCount: Object.keys(agenticScores).length,
794
712
  features: Object.entries(agenticScores).map(([f, s]) => ({
@@ -798,10 +716,10 @@ export function calculateAndWriteScores(options) {
798
716
  })),
799
717
  });
800
718
  scores = mergeScores(baselineScores, agenticScores);
801
- evaluationMode = "full";
719
+ evaluationMode = LiteracyVariant.FULL;
802
720
  // Merge agentic actual scores into the per-model breakdown
803
721
  if (perModel) {
804
- const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, rubricConfig.weights);
722
+ const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile);
805
723
  for (const entry of perModel) {
806
724
  const modelAgentic = agenticPerModel[entry.modelId];
807
725
  if (modelAgentic) {
@@ -821,17 +739,20 @@ export function calculateAndWriteScores(options) {
821
739
  graderCost.completionTokens += agenticGraderCost.completionTokens;
822
740
  }
823
741
  }
824
- else if (mode === "agentic") {
742
+ else if (mode === LiteracyVariant.AGENTIC) {
825
743
  scores = baselineScores;
826
744
  agentBehavior = aggregateAgentBehavior(baselineResultsPath);
827
745
  sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options?.allowedOrigins);
828
- evaluationMode = "agentic";
746
+ evaluationMode = LiteracyVariant.AGENTIC;
829
747
  }
830
748
  else {
831
749
  scores = baselineScores;
832
750
  agentBehavior = aggregateAgentBehavior(baselineResultsPath);
833
751
  sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options?.allowedOrigins);
834
- evaluationMode = mode === "observed" ? "observed" : "baseline";
752
+ evaluationMode =
753
+ mode === LiteracyVariant.OBSERVED
754
+ ? LiteracyVariant.OBSERVED
755
+ : LiteracyVariant.STANDARD;
835
756
  }
836
757
  const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log);
837
758
  // Persist
@@ -842,7 +763,7 @@ export function calculateAndWriteScores(options) {
842
763
  // Extract and persist grader judgments (Phase 3a: failure mode extraction)
843
764
  const judgments = extractGraderJudgments(baselineResultsPath);
844
765
  // In full mode, also extract judgments from agentic results
845
- if (mode === "full" && existsSync(agenticResultsPath)) {
766
+ if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
846
767
  const agenticJudgments = extractGraderJudgments(agenticResultsPath);
847
768
  judgments.push(...agenticJudgments);
848
769
  }
@@ -117,7 +117,7 @@ export function checkGeneratedConfigsExist(rootDir) {
117
117
  const baselinePath = resolve(rootDir, "promptfooconfig.yaml");
118
118
  if (!existsSync(baselinePath)) {
119
119
  issues.push({
120
- message: "Baseline config 'promptfooconfig.yaml' not found. Run 'pnpm generate-configs'.",
120
+ message: "Baseline config 'promptfooconfig.yaml' not found. Run the pipeline to generate it.",
121
121
  path: baselinePath,
122
122
  severity: "error",
123
123
  source: "checkGeneratedConfigsExist",
@@ -131,7 +131,7 @@ export function checkGeneratedConfigsExist(rootDir) {
131
131
  const configPath = resolve(rootDir, name);
132
132
  if (!existsSync(configPath)) {
133
133
  issues.push({
134
- message: `Optional config \`${name}\` not found. Run \`pnpm generate-configs\` to create it.`,
134
+ message: `Optional config \`${name}\` not found. Run the pipeline to generate it.`,
135
135
  path: configPath,
136
136
  severity: "warning",
137
137
  source: "checkGeneratedConfigsExist",
@@ -79,14 +79,14 @@ export function compare(baseline, experiment, options) {
79
79
  // Per-dimension average deltas (only for areas present in both summaries)
80
80
  const commonAreas = areas.filter((a) => baselineAreas.has(a.area) && experimentAreas.has(a.area));
81
81
  const commonCount = commonAreas.length || 1;
82
- const perDimension = {
83
- codeCorrectness: commonAreas.reduce((s, a) => s + a.dimensions.codeCorrectness.delta, 0) /
84
- commonCount,
85
- docCoverage: commonAreas.reduce((s, a) => s + a.dimensions.docCoverage.delta, 0) /
86
- commonCount,
87
- taskCompletion: commonAreas.reduce((s, a) => s + a.dimensions.taskCompletion.delta, 0) /
88
- commonCount,
89
- };
82
+ // Collect all dimension keys from area deltas and average each
83
+ const allDimKeys = new Set(commonAreas.flatMap((a) => Object.keys(a.dimensions)));
84
+ const perDimension = {};
85
+ for (const dim of allDimKeys) {
86
+ perDimension[dim] =
87
+ commonAreas.reduce((s, a) => s + (a.dimensions[dim]?.delta ?? 0), 0) /
88
+ commonCount;
89
+ }
90
90
  // Doc Lift average delta (common areas only)
91
91
  const docLift = commonAreas.reduce((s, a) => s + a.docLiftDelta, 0) / commonCount;
92
92
  // Cost delta (if both summaries have cost data)
@@ -0,0 +1,10 @@
1
+ /**
2
+ * agent-harness-handler.test.ts — Tests for agent harness mode compilation.
3
+ *
4
+ * Tests validation, provider assembly, tool permission resolution,
5
+ * assertion mapping, sandbox config, lifecycle extensions, and
6
+ * end-to-end compilation of example tasks.
7
+ *
8
+ * Run: npx tsx --test src/pipeline/compiler/__tests__/agent-harness-handler.test.ts
9
+ */
10
+ export {};