@sanity/ailf 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288) hide show
  1. package/config/features.ts +23 -0
  2. package/config/models.ts +83 -0
  3. package/config/prompts.ts +16 -0
  4. package/config/rubrics.ts +225 -0
  5. package/config/schedules.ts +47 -0
  6. package/config/sinks.ts +37 -0
  7. package/config/sources.ts +21 -0
  8. package/config/thresholds.ts +61 -0
  9. package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
  10. package/dist/_vendor/ailf-core/config-helpers.js +150 -0
  11. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  12. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  13. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  14. package/dist/_vendor/ailf-core/index.js +5 -0
  15. package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
  16. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  17. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  18. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  19. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  20. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  21. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  22. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
  23. package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
  24. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
  25. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
  26. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -29
  27. package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -8
  28. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  29. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  30. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  31. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  32. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  33. package/dist/_vendor/ailf-core/services/index.js +2 -1
  34. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  35. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  36. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  37. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  38. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  39. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  40. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  41. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  42. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
  43. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  44. package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
  45. package/dist/_vendor/ailf-core/types/index.js +8 -1
  46. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
  47. package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
  48. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  49. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  50. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  51. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  52. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  53. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  54. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  55. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  56. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  57. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  58. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  59. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  60. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  61. package/dist/_vendor/ailf-shared/index.js +0 -1
  62. package/dist/adapters/api-client/build-request.js +14 -13
  63. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  64. package/dist/adapters/config-sources/file-config-adapter.js +38 -12
  65. package/dist/adapters/config-sources/index.d.ts +2 -0
  66. package/dist/adapters/config-sources/index.js +1 -0
  67. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  68. package/dist/adapters/config-sources/ts-config-loader.js +133 -0
  69. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  70. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  71. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  72. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  73. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  74. package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
  75. package/dist/adapters/task-sources/index.d.ts +1 -0
  76. package/dist/adapters/task-sources/index.js +1 -0
  77. package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
  78. package/dist/adapters/task-sources/repo-task-source.js +69 -16
  79. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  80. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  81. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  82. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  83. package/dist/cli.js +0 -2
  84. package/dist/commands/baseline.js +4 -1
  85. package/dist/commands/calculate-scores.js +1 -1
  86. package/dist/commands/coverage-audit.js +7 -1
  87. package/dist/commands/explain-handler.js +25 -23
  88. package/dist/commands/fetch-docs.js +3 -2
  89. package/dist/commands/generate-configs.js +1 -1
  90. package/dist/commands/interactive.js +11 -7
  91. package/dist/commands/pipeline-action.d.ts +2 -0
  92. package/dist/commands/pipeline-action.js +16 -6
  93. package/dist/commands/pipeline.d.ts +1 -0
  94. package/dist/commands/pipeline.js +4 -2
  95. package/dist/commands/pr-comment.js +1 -1
  96. package/dist/commands/publish.js +2 -2
  97. package/dist/commands/readiness-report.js +13 -6
  98. package/dist/composition-root.d.ts +1 -1
  99. package/dist/composition-root.js +67 -4
  100. package/dist/orchestration/build-app-context.js +1 -0
  101. package/dist/orchestration/build-step-sequence.js +24 -6
  102. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  103. package/dist/orchestration/steps/fetch-docs-step.js +6 -4
  104. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  105. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  106. package/dist/orchestration/steps/generate-configs-step.js +245 -51
  107. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  108. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  109. package/dist/orchestration/steps/readiness-step.js +5 -6
  110. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  111. package/dist/orchestration/steps/run-eval-step.js +8 -7
  112. package/dist/pipeline/cache.d.ts +1 -1
  113. package/dist/pipeline/cache.js +36 -8
  114. package/dist/pipeline/calculate-scores.d.ts +2 -4
  115. package/dist/pipeline/calculate-scores.js +43 -113
  116. package/dist/pipeline/checks.js +2 -2
  117. package/dist/pipeline/compare.js +8 -8
  118. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  119. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  120. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  121. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  122. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  123. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  124. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  125. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  126. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  127. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
  128. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  129. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  130. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  131. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  132. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  133. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
  134. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  135. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  136. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  137. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  138. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  139. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  140. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  141. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  142. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  143. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  144. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  145. package/dist/pipeline/compiler/config-loader.js +111 -0
  146. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  147. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  148. package/dist/pipeline/compiler/hash.d.ts +11 -0
  149. package/dist/pipeline/compiler/hash.js +18 -0
  150. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  151. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  152. package/dist/pipeline/compiler/index.d.ts +29 -0
  153. package/dist/pipeline/compiler/index.js +45 -0
  154. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  155. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  156. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  157. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  158. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  159. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  160. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  161. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  162. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  163. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  164. package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
  165. package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
  166. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  167. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  168. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  169. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  170. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  171. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
  172. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
  173. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
  174. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  175. package/dist/pipeline/compiler/presets/index.js +8 -0
  176. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
  177. package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
  178. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  179. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  180. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  181. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  182. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  183. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  184. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  185. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  186. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  187. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  188. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  189. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  190. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  191. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  192. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  193. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  194. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  195. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  196. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  197. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  198. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  199. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  200. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  201. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  202. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  203. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  204. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  205. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  206. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  207. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  208. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  209. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  210. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  211. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  212. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  213. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  214. package/dist/pipeline/coverage-audit.d.ts +15 -5
  215. package/dist/pipeline/coverage-audit.js +41 -22
  216. package/dist/pipeline/eval-constants.d.ts +16 -6
  217. package/dist/pipeline/eval-constants.js +25 -4
  218. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  219. package/dist/pipeline/eval-fingerprint.js +8 -9
  220. package/dist/pipeline/expand-tasks.d.ts +19 -10
  221. package/dist/pipeline/expand-tasks.js +34 -28
  222. package/dist/pipeline/gap-analysis.d.ts +1 -1
  223. package/dist/pipeline/gap-analysis.js +2 -2
  224. package/dist/pipeline/generate-configs.d.ts +22 -4
  225. package/dist/pipeline/generate-configs.js +53 -24
  226. package/dist/pipeline/grader-api.d.ts +3 -3
  227. package/dist/pipeline/grader-api.js +5 -12
  228. package/dist/pipeline/grader-compare-runner.js +20 -27
  229. package/dist/pipeline/grader-comparison.d.ts +4 -8
  230. package/dist/pipeline/grader-comparison.js +11 -17
  231. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  232. package/dist/pipeline/grader-consistency-runner.js +16 -20
  233. package/dist/pipeline/grader-consistency.d.ts +6 -10
  234. package/dist/pipeline/grader-consistency.js +13 -32
  235. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  236. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  237. package/dist/pipeline/grader-sensitivity.js +10 -10
  238. package/dist/pipeline/grader-validate-runner.js +7 -5
  239. package/dist/pipeline/grader-validation.d.ts +2 -6
  240. package/dist/pipeline/grader-validation.js +14 -22
  241. package/dist/pipeline/map-request-to-config.js +6 -1
  242. package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
  243. package/dist/pipeline/mirror-repo-tasks.js +16 -15
  244. package/dist/pipeline/normalize-mode.d.ts +49 -0
  245. package/dist/pipeline/normalize-mode.js +64 -0
  246. package/dist/pipeline/plan.d.ts +5 -2
  247. package/dist/pipeline/plan.js +134 -78
  248. package/dist/pipeline/pr-comment.js +2 -0
  249. package/dist/pipeline/profile-resolution.d.ts +22 -14
  250. package/dist/pipeline/profile-resolution.js +41 -19
  251. package/dist/pipeline/provenance.d.ts +2 -2
  252. package/dist/pipeline/provenance.js +12 -17
  253. package/dist/pipeline/release-report.js +4 -4
  254. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  255. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  256. package/dist/pipeline/rubric-loader.d.ts +20 -0
  257. package/dist/pipeline/rubric-loader.js +37 -0
  258. package/dist/pipeline/validate.d.ts +4 -4
  259. package/dist/pipeline/validate.js +64 -53
  260. package/dist/schedules/loader.js +18 -8
  261. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  262. package/dist/scripts/migrate-task-mode.js +85 -0
  263. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  264. package/dist/scripts/validate-task-sources.d.ts +1 -1
  265. package/dist/scripts/validate-task-sources.js +15 -15
  266. package/dist/sinks/loader.js +5 -7
  267. package/dist/sources.d.ts +7 -7
  268. package/dist/sources.js +22 -24
  269. package/dist/webhook/dispatch.js +2 -1
  270. package/package.json +6 -3
  271. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  272. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  273. package/tasks/literacy/frameworks.task.ts +128 -0
  274. package/tasks/literacy/functions.task.ts +69 -0
  275. package/tasks/literacy/groq.task.ts +258 -0
  276. package/tasks/literacy/nextjs-live.task.ts +75 -0
  277. package/tasks/literacy/studio-setup.task.ts +131 -0
  278. package/tasks/literacy/visual-editing.task.ts +146 -0
  279. package/config/features.yaml +0 -116
  280. package/config/models.yaml +0 -116
  281. package/config/prompts.yaml +0 -75
  282. package/config/rubrics.yaml +0 -81
  283. package/config/schedules.yaml +0 -43
  284. package/config/sinks.yaml +0 -54
  285. package/config/sources.yaml +0 -51
  286. package/config/thresholds.yaml +0 -49
  287. package/dist/agent-observer/test-imports.d.ts +0 -7
  288. package/dist/agent-observer/test-imports.js +0 -185
@@ -10,7 +10,7 @@
10
10
  *
11
11
  * Cache invalidation triggers:
12
12
  * - Content change: any input file's content changes → hash changes → miss
13
- * - Config change: config/models.yaml, config/sources.yaml, tasks/*.yaml changes → miss
13
+ * - Config change: config/models, config/sources, tasks/*.yaml changes → miss
14
14
  * - Manual bypass: --no-cache flag skips all cache lookups
15
15
  * - Cache clear: delete results/cache/ to start fresh
16
16
  */
@@ -10,7 +10,7 @@
10
10
  *
11
11
  * Cache invalidation triggers:
12
12
  * - Content change: any input file's content changes → hash changes → miss
13
- * - Config change: config/models.yaml, config/sources.yaml, tasks/*.yaml changes → miss
13
+ * - Config change: config/models, config/sources, tasks/*.yaml changes → miss
14
14
  * - Manual bypass: --no-cache flag skips all cache lookups
15
15
  * - Cache clear: delete results/cache/ to start fresh
16
16
  */
@@ -18,6 +18,19 @@ import { createHash } from "crypto";
18
18
  import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync, } from "fs";
19
19
  import { join, resolve } from "path";
20
20
  // ---------------------------------------------------------------------------
21
+ // Helpers
22
+ // ---------------------------------------------------------------------------
23
+ /** Resolve first existing config file (matches loadConfigFile priority chain) */
24
+ function resolveConfig(rootDir, name) {
25
+ const r = (f) => resolve(rootDir, f);
26
+ for (const ext of [".ts", ".js", ".yaml", ".yml", ".json"]) {
27
+ const p = r(`config/${name}${ext}`);
28
+ if (existsSync(p))
29
+ return p;
30
+ }
31
+ return undefined;
32
+ }
33
+ // ---------------------------------------------------------------------------
21
34
  // Constants
22
35
  // ---------------------------------------------------------------------------
23
36
  const CACHE_DIR_NAME = "cache";
@@ -79,7 +92,10 @@ export function getStepInputPaths(rootDir, step) {
79
92
  const isBaseline = step === "eval-baseline" || step === "eval";
80
93
  const isAgentic = step === "eval-agentic" || step === "eval";
81
94
  const isObserved = step === "eval-observed" || step === "eval";
82
- const paths = [r("config/models.yaml")];
95
+ const paths = [];
96
+ const modelsPath = resolveConfig(rootDir, "models");
97
+ if (modelsPath)
98
+ paths.push(modelsPath);
83
99
  // Config files — only the relevant ones for this mode
84
100
  if (isBaseline) {
85
101
  paths.push(r("promptfooconfig.yaml"));
@@ -130,25 +146,37 @@ export function getStepInputPaths(rootDir, step) {
130
146
  return paths;
131
147
  }
132
148
  case "fetch-docs": {
133
- // Inputs: config/sources.yaml, config/models.yaml, task files (which contain inline mappings)
134
- const paths = [r("config/sources.yaml"), r("config/models.yaml")];
149
+ // Inputs: config sources + models, task files
150
+ const paths = [];
151
+ const sourcesPath = resolveConfig(rootDir, "sources");
152
+ const modelsPath2 = resolveConfig(rootDir, "models");
153
+ if (sourcesPath)
154
+ paths.push(sourcesPath);
155
+ if (modelsPath2)
156
+ paths.push(modelsPath2);
135
157
  // Include all task files (they define feature areas)
136
158
  const tasksDir = r("tasks");
137
159
  if (existsSync(tasksDir)) {
138
160
  const taskFiles = readdirSync(tasksDir)
139
- .filter((f) => f.endsWith(".yaml") || f.endsWith(".yml"))
161
+ .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
140
162
  .map((f) => join(tasksDir, f));
141
163
  paths.push(...taskFiles);
142
164
  }
143
165
  return paths;
144
166
  }
145
167
  case "generate-configs": {
146
- // Inputs: config/models.yaml, config/sources.yaml, all task files
147
- const paths = [r("config/models.yaml"), r("config/sources.yaml")];
168
+ // Inputs: config models + sources, all task files
169
+ const paths = [];
170
+ const modelsPath3 = resolveConfig(rootDir, "models");
171
+ const sourcesPath2 = resolveConfig(rootDir, "sources");
172
+ if (modelsPath3)
173
+ paths.push(modelsPath3);
174
+ if (sourcesPath2)
175
+ paths.push(sourcesPath2);
148
176
  const tasksDir = r("tasks");
149
177
  if (existsSync(tasksDir)) {
150
178
  const taskFiles = readdirSync(tasksDir)
151
- .filter((f) => f.endsWith(".yaml") || f.endsWith(".yml"))
179
+ .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
152
180
  .map((f) => join(tasksDir, f));
153
181
  paths.push(...taskFiles);
154
182
  }
@@ -1,9 +1,7 @@
1
- import type { Logger, TestSummary } from "../_vendor/ailf-core/index.d.ts";
1
+ import { type ActualScoreEntry, type ComponentResult, type Logger, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
2
2
  import { type ResolvedSourceConfig } from "../sources.js";
3
- import { type ActualScoreEntry, type ComponentResult } from "../_vendor/ailf-core/index.d.ts";
4
3
  import type { GraderJudgment, PerModelEntry } from "./types.js";
5
- export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.d.ts";
6
- export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
4
+ export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
7
5
  export interface PromptfooResultsWrapper {
8
6
  results: RawTestResult[];
9
7
  stats: {
@@ -9,7 +9,7 @@
9
9
  * Doc Coverage (0–100) — Did docs provide the needed info?
10
10
  *
11
11
  * Dimensions are combined into a weighted composite (0–100) using named
12
- * scoring profiles from config/rubrics.yaml. Gold (with-docs) entries use
12
+ * scoring profiles from config/rubrics. Gold (with-docs) entries use
13
13
  * the "default" profile; baseline (without-docs) entries use "output-only"
14
14
  * which excludes doc-coverage (undefined without docs).
15
15
  * See docs/design-docs/named-scoring-profiles.md.
@@ -29,15 +29,17 @@
29
29
  */
30
30
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
31
31
  import { join } from "path";
32
+ import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
32
33
  import { calculateCost } from "../agent-observer/pricing.js";
33
34
  import { ConsoleLogger } from "../adapters/loggers/index.js";
35
+ import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
34
36
  import { checkResultsExist } from "./checks.js";
35
- import { loadRubricTemplates } from "./expand-tasks.js";
37
+ import { loadRubricTemplates } from "./rubric-loader.js";
36
38
  import { resolveProfile } from "./profile-resolution.js";
37
39
  import { loadSource } from "../sources.js";
38
- import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
39
- import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
40
- // Re-export pure functions from core for backward compatibility.
40
+ import { LiteracyVariant } from "./normalize-mode.js";
41
+ import { scoreTestGroup } from "./compiler/scoring-bridge.js";
42
+ // Re-export from core for backward compatibility.
41
43
  // Existing imports from this file continue to work unchanged.
42
44
  export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
43
45
  /**
@@ -137,14 +139,8 @@ export function extractGraderJudgments(resultsPath) {
137
139
  // Not JSON — use raw reason string
138
140
  }
139
141
  }
140
- // Map internal dimension names to hyphenated form
141
- const dimensionMap = {
142
- codeCorrectness: "code-correctness",
143
- docCoverage: "doc-coverage",
144
- taskCompletion: "task-completion",
145
- };
146
142
  judgments.push({
147
- dimension: dimensionMap[kind] ?? kind,
143
+ dimension: kind,
148
144
  modelId,
149
145
  reason,
150
146
  score,
@@ -281,7 +277,7 @@ function aggregateUrlReferences(resultsPath) {
281
277
  * verification report.
282
278
  */
283
279
  function buildSourceVerification(root, source, verificationCtx) {
284
- const mode = verificationCtx?.mode ?? "baseline";
280
+ const mode = verificationCtx?.mode ?? LiteracyVariant.STANDARD;
285
281
  const sourceUrl = source?.baseUrl ?? "default";
286
282
  const searchMode = verificationCtx?.searchMode;
287
283
  const allowedOrigins = verificationCtx?.allowedOrigins;
@@ -493,62 +489,6 @@ function readAndNormalizeResults(resultsPath, log) {
493
489
  }
494
490
  return valid;
495
491
  }
496
- /**
497
- * Accumulate raw dimension scores across an array of test results.
498
- * Dimension-agnostic: any dimension returned by classifyRubric() is tracked.
499
- */
500
- function accumulateDimensions(tests) {
501
- const dimensions = {};
502
- let totalCost = 0;
503
- for (const test of tests) {
504
- totalCost += test.cost;
505
- for (const comp of test.gradingResult.componentResults) {
506
- if (comp.assertion?.type !== "llm-rubric")
507
- continue;
508
- const score = parseRubricScore(comp);
509
- const kind = classifyRubric(comp);
510
- if (kind) {
511
- dimensions[kind] = (dimensions[kind] ?? 0) + score;
512
- }
513
- }
514
- }
515
- return { dimensions, totalCost };
516
- }
517
- /**
518
- * Average accumulated dimension scores by a count.
519
- * Returns a dimension → average score map.
520
- */
521
- function averageDimensions(accumulated, count) {
522
- const avg = {};
523
- for (const [dim, total] of Object.entries(accumulated.dimensions)) {
524
- avg[dim] = total / count;
525
- }
526
- return avg;
527
- }
528
- /**
529
- * Compute a weighted composite score from dimension averages and a profile.
530
- * Only dimensions present in the profile contribute to the composite.
531
- * Dimensions not in the profile are ignored (e.g., doc-coverage on baseline).
532
- *
533
- * The profile maps camelCase dimension names (as returned by classifyRubric)
534
- * to kebab-case keys (as used in rubrics.yaml). This function handles the
535
- * mapping internally.
536
- */
537
- function weightedComposite(dimensionAverages, profile) {
538
- // Map profile keys (kebab-case: "task-completion") to classifyRubric
539
- // output (camelCase: "taskCompletion")
540
- const kebabToCamel = {
541
- "code-correctness": "codeCorrectness",
542
- "doc-coverage": "docCoverage",
543
- "task-completion": "taskCompletion",
544
- };
545
- let total = 0;
546
- for (const [profileKey, weight] of Object.entries(profile)) {
547
- const dimKey = kebabToCamel[profileKey] ?? profileKey;
548
- total += (dimensionAverages[dimKey] ?? 0) * weight;
549
- }
550
- return total;
551
- }
552
492
  /**
553
493
  * Core scoring logic: takes a pre-filtered array of TestResult and produces
554
494
  * FeatureScore[] grouped by feature area. This is the shared implementation
@@ -577,35 +517,28 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
577
517
  }
578
518
  const scores = [];
579
519
  for (const [feature, data] of Object.entries(byFeature)) {
580
- // --- With docs (gold / ceiling) ---
581
- const goldDims = accumulateDimensions(data.withDocs);
582
- let featureCost = goldDims.totalCost;
583
- const countWithDocs = data.withDocs.length || 1;
584
- const avgGold = averageDimensions(goldDims, countWithDocs);
585
- const withDocsTotal = weightedComposite(avgGold, goldProfile);
520
+ // --- With docs (gold / ceiling) — scored via 4-tier engine ---
521
+ const gold = scoreTestGroup(data.withDocs, goldProfile, feature);
586
522
  // --- Without docs (baseline / floor) ---
587
523
  // Uses the baseline profile (e.g. "output-only") which may exclude
588
524
  // dimensions like doc-coverage that are undefined without docs.
589
525
  // See docs/design-docs/named-scoring-profiles.md.
590
- const baselineDims = accumulateDimensions(data.withoutDocs);
591
- featureCost += baselineDims.totalCost;
592
- const countWithoutDocs = data.withoutDocs.length || 1;
593
- const avgBaseline = averageDimensions(baselineDims, countWithoutDocs);
594
- const withoutDocsScore = weightedComposite(avgBaseline, baselineProfile);
595
- const ceilingScore = Math.round(withDocsTotal);
596
- const floorScore = Math.round(withoutDocsScore);
526
+ const baseline = scoreTestGroup(data.withoutDocs, baselineProfile, feature);
527
+ const featureCost = gold.totalCost + baseline.totalCost;
528
+ const ceilingScore = gold.composite;
529
+ const floorScore = baseline.composite;
597
530
  const docLift = ceilingScore - floorScore;
598
531
  const featureScore = {
599
532
  ceilingScore,
600
- codeCorrectness: Math.round(avgGold.codeCorrectness ?? 0),
601
- docCoverage: Math.round(avgGold.docCoverage ?? 0),
533
+ codeCorrectness: gold.dimensions.codeCorrectness ?? 0,
534
+ docCoverage: gold.dimensions.docCoverage ?? 0,
602
535
  docLift,
603
536
  docQualityGap: 100 - ceilingScore,
604
537
  feature,
605
538
  floorScore,
606
539
  ...(modelId && { modelId }),
607
540
  negativeDocLift: docLift < 0,
608
- taskCompletion: Math.round(avgGold.taskCompletion ?? 0),
541
+ taskCompletion: gold.dimensions.taskCompletion ?? 0,
609
542
  testCount: data.withDocs.length,
610
543
  totalCost: featureCost,
611
544
  totalScore: ceilingScore,
@@ -638,17 +571,14 @@ export function scoreAgenticResults(resultsPath, profile) {
638
571
  }
639
572
  const entries = {};
640
573
  for (const [feature, featureResults] of Object.entries(byFeature)) {
641
- const count = featureResults.length || 1;
642
- const accumulated = accumulateDimensions(featureResults);
643
- const avg = averageDimensions(accumulated, count);
644
- const actualScore = Math.round(weightedComposite(avg, profile));
574
+ const scored = scoreTestGroup(featureResults, profile, feature);
645
575
  entries[feature] = {
646
- actualScore,
647
- codeCorrectness: Math.round(avg.codeCorrectness ?? 0),
648
- docCoverage: Math.round(avg.docCoverage ?? 0),
649
- taskCompletion: Math.round(avg.taskCompletion ?? 0),
576
+ actualScore: scored.composite,
577
+ codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
578
+ docCoverage: scored.dimensions.docCoverage ?? 0,
579
+ taskCompletion: scored.dimensions.taskCompletion ?? 0,
650
580
  testCount: featureResults.length,
651
- totalCost: accumulated.totalCost,
581
+ totalCost: scored.totalCost,
652
582
  };
653
583
  }
654
584
  return entries;
@@ -677,17 +607,14 @@ export function scoreAgenticResultsPerModel(resultsPath, profile) {
677
607
  for (const [modelId, features] of Object.entries(byModel)) {
678
608
  perModel[modelId] = {};
679
609
  for (const [feature, featureResults] of Object.entries(features)) {
680
- const count = featureResults.length || 1;
681
- const accumulated = accumulateDimensions(featureResults);
682
- const avg = averageDimensions(accumulated, count);
683
- const actualScore = Math.round(weightedComposite(avg, profile));
610
+ const scored = scoreTestGroup(featureResults, profile, feature);
684
611
  perModel[modelId][feature] = {
685
- actualScore,
686
- codeCorrectness: Math.round(avg.codeCorrectness ?? 0),
687
- docCoverage: Math.round(avg.docCoverage ?? 0),
688
- taskCompletion: Math.round(avg.taskCompletion ?? 0),
612
+ actualScore: scored.composite,
613
+ codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
614
+ docCoverage: scored.dimensions.docCoverage ?? 0,
615
+ taskCompletion: scored.dimensions.taskCompletion ?? 0,
689
616
  testCount: featureResults.length,
690
- totalCost: accumulated.totalCost,
617
+ totalCost: scored.totalCost,
691
618
  };
692
619
  }
693
620
  }
@@ -725,7 +652,7 @@ export function calculateAndWriteScores(options) {
725
652
  }
726
653
  }
727
654
  // Determine mode — controls which result files are read
728
- const mode = options.mode ?? "baseline";
655
+ const mode = options.mode ?? LiteracyVariant.STANDARD;
729
656
  const baselineResultsPath = options.resultsPath ?? join(ROOT, "results", "latest", "eval-results.json");
730
657
  // Agentic results path (only used in full mode)
731
658
  const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
@@ -747,8 +674,8 @@ export function calculateAndWriteScores(options) {
747
674
  // Baseline (without-docs) entries use "output-only" (2 dimensions,
748
675
  // doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
749
676
  const rubricConfig = loadRubricTemplates(ROOT);
750
- const goldProfile = resolveProfile("baseline", "gold", rubricConfig);
751
- const baselineProfileWeights = resolveProfile("baseline", "baseline", rubricConfig);
677
+ const goldProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.STANDARD);
678
+ const baselineProfileWeights = resolveProfile("literacy", LiteracyVariant.STANDARD, rubricConfig, LiteracyVariant.STANDARD);
752
679
  log.debug("Loaded scoring profiles", {
753
680
  gold: goldProfile,
754
681
  baseline: baselineProfileWeights,
@@ -776,9 +703,9 @@ export function calculateAndWriteScores(options) {
776
703
  let agentBehavior = null;
777
704
  let sourceIsolation = null;
778
705
  let evaluationMode;
779
- if (mode === "full" && existsSync(agenticResultsPath)) {
706
+ if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
780
707
  log.info(`\nReading agentic results from: ${agenticResultsPath}`);
781
- const agenticProfile = resolveProfile("agentic", "gold", rubricConfig);
708
+ const agenticProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC);
782
709
  const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile);
783
710
  log.debug("Agentic scores calculated", {
784
711
  featureCount: Object.keys(agenticScores).length,
@@ -789,7 +716,7 @@ export function calculateAndWriteScores(options) {
789
716
  })),
790
717
  });
791
718
  scores = mergeScores(baselineScores, agenticScores);
792
- evaluationMode = "full";
719
+ evaluationMode = LiteracyVariant.FULL;
793
720
  // Merge agentic actual scores into the per-model breakdown
794
721
  if (perModel) {
795
722
  const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile);
@@ -812,17 +739,20 @@ export function calculateAndWriteScores(options) {
812
739
  graderCost.completionTokens += agenticGraderCost.completionTokens;
813
740
  }
814
741
  }
815
- else if (mode === "agentic") {
742
+ else if (mode === LiteracyVariant.AGENTIC) {
816
743
  scores = baselineScores;
817
744
  agentBehavior = aggregateAgentBehavior(baselineResultsPath);
818
745
  sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options?.allowedOrigins);
819
- evaluationMode = "agentic";
746
+ evaluationMode = LiteracyVariant.AGENTIC;
820
747
  }
821
748
  else {
822
749
  scores = baselineScores;
823
750
  agentBehavior = aggregateAgentBehavior(baselineResultsPath);
824
751
  sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options?.allowedOrigins);
825
- evaluationMode = mode === "observed" ? "observed" : "baseline";
752
+ evaluationMode =
753
+ mode === LiteracyVariant.OBSERVED
754
+ ? LiteracyVariant.OBSERVED
755
+ : LiteracyVariant.STANDARD;
826
756
  }
827
757
  const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log);
828
758
  // Persist
@@ -833,7 +763,7 @@ export function calculateAndWriteScores(options) {
833
763
  // Extract and persist grader judgments (Phase 3a: failure mode extraction)
834
764
  const judgments = extractGraderJudgments(baselineResultsPath);
835
765
  // In full mode, also extract judgments from agentic results
836
- if (mode === "full" && existsSync(agenticResultsPath)) {
766
+ if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
837
767
  const agenticJudgments = extractGraderJudgments(agenticResultsPath);
838
768
  judgments.push(...agenticJudgments);
839
769
  }
@@ -117,7 +117,7 @@ export function checkGeneratedConfigsExist(rootDir) {
117
117
  const baselinePath = resolve(rootDir, "promptfooconfig.yaml");
118
118
  if (!existsSync(baselinePath)) {
119
119
  issues.push({
120
- message: "Baseline config 'promptfooconfig.yaml' not found. Run 'pnpm generate-configs'.",
120
+ message: "Baseline config 'promptfooconfig.yaml' not found. Run the pipeline to generate it.",
121
121
  path: baselinePath,
122
122
  severity: "error",
123
123
  source: "checkGeneratedConfigsExist",
@@ -131,7 +131,7 @@ export function checkGeneratedConfigsExist(rootDir) {
131
131
  const configPath = resolve(rootDir, name);
132
132
  if (!existsSync(configPath)) {
133
133
  issues.push({
134
- message: `Optional config \`${name}\` not found. Run \`pnpm generate-configs\` to create it.`,
134
+ message: `Optional config \`${name}\` not found. Run the pipeline to generate it.`,
135
135
  path: configPath,
136
136
  severity: "warning",
137
137
  source: "checkGeneratedConfigsExist",
@@ -79,14 +79,14 @@ export function compare(baseline, experiment, options) {
79
79
  // Per-dimension average deltas (only for areas present in both summaries)
80
80
  const commonAreas = areas.filter((a) => baselineAreas.has(a.area) && experimentAreas.has(a.area));
81
81
  const commonCount = commonAreas.length || 1;
82
- const perDimension = {
83
- codeCorrectness: commonAreas.reduce((s, a) => s + a.dimensions.codeCorrectness.delta, 0) /
84
- commonCount,
85
- docCoverage: commonAreas.reduce((s, a) => s + a.dimensions.docCoverage.delta, 0) /
86
- commonCount,
87
- taskCompletion: commonAreas.reduce((s, a) => s + a.dimensions.taskCompletion.delta, 0) /
88
- commonCount,
89
- };
82
+ // Collect all dimension keys from area deltas and average each
83
+ const allDimKeys = new Set(commonAreas.flatMap((a) => Object.keys(a.dimensions)));
84
+ const perDimension = {};
85
+ for (const dim of allDimKeys) {
86
+ perDimension[dim] =
87
+ commonAreas.reduce((s, a) => s + (a.dimensions[dim]?.delta ?? 0), 0) /
88
+ commonCount;
89
+ }
90
90
  // Doc Lift average delta (common areas only)
91
91
  const docLift = commonAreas.reduce((s, a) => s + a.docLiftDelta, 0) / commonCount;
92
92
  // Cost delta (if both summaries have cost data)
@@ -0,0 +1,10 @@
1
+ /**
2
+ * agent-harness-handler.test.ts — Tests for agent harness mode compilation.
3
+ *
4
+ * Tests validation, provider assembly, tool permission resolution,
5
+ * assertion mapping, sandbox config, lifecycle extensions, and
6
+ * end-to-end compilation of example tasks.
7
+ *
8
+ * Run: npx tsx --test src/pipeline/compiler/__tests__/agent-harness-handler.test.ts
9
+ */
10
+ export {};