@sanity/ailf 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288)
  1. package/config/features.ts +23 -0
  2. package/config/models.ts +83 -0
  3. package/config/prompts.ts +16 -0
  4. package/config/rubrics.ts +225 -0
  5. package/config/schedules.ts +47 -0
  6. package/config/sinks.ts +37 -0
  7. package/config/sources.ts +21 -0
  8. package/config/thresholds.ts +61 -0
  9. package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
  10. package/dist/_vendor/ailf-core/config-helpers.js +150 -0
  11. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  12. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  13. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  14. package/dist/_vendor/ailf-core/index.js +5 -0
  15. package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
  16. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  17. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  18. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  19. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  20. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  21. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  22. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
  23. package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
  24. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
  25. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
  26. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -29
  27. package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -8
  28. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  29. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  30. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  31. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  32. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  33. package/dist/_vendor/ailf-core/services/index.js +2 -1
  34. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  35. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  36. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  37. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  38. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  39. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  40. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  41. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  42. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
  43. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  44. package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
  45. package/dist/_vendor/ailf-core/types/index.js +8 -1
  46. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
  47. package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
  48. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  49. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  50. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  51. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  52. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  53. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  54. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  55. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  56. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  57. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  58. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  59. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  60. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  61. package/dist/_vendor/ailf-shared/index.js +0 -1
  62. package/dist/adapters/api-client/build-request.js +14 -13
  63. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  64. package/dist/adapters/config-sources/file-config-adapter.js +38 -12
  65. package/dist/adapters/config-sources/index.d.ts +2 -0
  66. package/dist/adapters/config-sources/index.js +1 -0
  67. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  68. package/dist/adapters/config-sources/ts-config-loader.js +133 -0
  69. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  70. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  71. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  72. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  73. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  74. package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
  75. package/dist/adapters/task-sources/index.d.ts +1 -0
  76. package/dist/adapters/task-sources/index.js +1 -0
  77. package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
  78. package/dist/adapters/task-sources/repo-task-source.js +69 -16
  79. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  80. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  81. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  82. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  83. package/dist/cli.js +0 -2
  84. package/dist/commands/baseline.js +4 -1
  85. package/dist/commands/calculate-scores.js +1 -1
  86. package/dist/commands/coverage-audit.js +7 -1
  87. package/dist/commands/explain-handler.js +25 -23
  88. package/dist/commands/fetch-docs.js +3 -2
  89. package/dist/commands/generate-configs.js +1 -1
  90. package/dist/commands/interactive.js +11 -7
  91. package/dist/commands/pipeline-action.d.ts +2 -0
  92. package/dist/commands/pipeline-action.js +16 -6
  93. package/dist/commands/pipeline.d.ts +1 -0
  94. package/dist/commands/pipeline.js +4 -2
  95. package/dist/commands/pr-comment.js +1 -1
  96. package/dist/commands/publish.js +2 -2
  97. package/dist/commands/readiness-report.js +13 -6
  98. package/dist/composition-root.d.ts +1 -1
  99. package/dist/composition-root.js +67 -4
  100. package/dist/orchestration/build-app-context.js +1 -0
  101. package/dist/orchestration/build-step-sequence.js +24 -6
  102. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  103. package/dist/orchestration/steps/fetch-docs-step.js +6 -4
  104. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  105. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  106. package/dist/orchestration/steps/generate-configs-step.js +245 -51
  107. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  108. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  109. package/dist/orchestration/steps/readiness-step.js +5 -6
  110. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  111. package/dist/orchestration/steps/run-eval-step.js +8 -7
  112. package/dist/pipeline/cache.d.ts +1 -1
  113. package/dist/pipeline/cache.js +36 -8
  114. package/dist/pipeline/calculate-scores.d.ts +2 -4
  115. package/dist/pipeline/calculate-scores.js +43 -113
  116. package/dist/pipeline/checks.js +2 -2
  117. package/dist/pipeline/compare.js +8 -8
  118. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  119. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  120. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  121. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  122. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  123. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  124. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  125. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  126. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  127. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
  128. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  129. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  130. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  131. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  132. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  133. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
  134. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  135. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  136. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  137. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  138. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  139. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  140. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  141. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  142. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  143. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  144. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  145. package/dist/pipeline/compiler/config-loader.js +111 -0
  146. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  147. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  148. package/dist/pipeline/compiler/hash.d.ts +11 -0
  149. package/dist/pipeline/compiler/hash.js +18 -0
  150. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  151. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  152. package/dist/pipeline/compiler/index.d.ts +29 -0
  153. package/dist/pipeline/compiler/index.js +45 -0
  154. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  155. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  156. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  157. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  158. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  159. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  160. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  161. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  162. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  163. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  164. package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
  165. package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
  166. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  167. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  168. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  169. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  170. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  171. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
  172. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
  173. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
  174. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  175. package/dist/pipeline/compiler/presets/index.js +8 -0
  176. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
  177. package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
  178. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  179. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  180. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  181. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  182. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  183. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  184. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  185. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  186. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  187. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  188. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  189. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  190. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  191. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  192. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  193. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  194. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  195. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  196. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  197. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  198. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  199. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  200. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  201. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  202. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  203. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  204. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  205. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  206. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  207. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  208. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  209. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  210. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  211. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  212. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  213. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  214. package/dist/pipeline/coverage-audit.d.ts +15 -5
  215. package/dist/pipeline/coverage-audit.js +41 -22
  216. package/dist/pipeline/eval-constants.d.ts +16 -6
  217. package/dist/pipeline/eval-constants.js +25 -4
  218. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  219. package/dist/pipeline/eval-fingerprint.js +8 -9
  220. package/dist/pipeline/expand-tasks.d.ts +19 -10
  221. package/dist/pipeline/expand-tasks.js +34 -28
  222. package/dist/pipeline/gap-analysis.d.ts +1 -1
  223. package/dist/pipeline/gap-analysis.js +2 -2
  224. package/dist/pipeline/generate-configs.d.ts +22 -4
  225. package/dist/pipeline/generate-configs.js +53 -24
  226. package/dist/pipeline/grader-api.d.ts +3 -3
  227. package/dist/pipeline/grader-api.js +5 -12
  228. package/dist/pipeline/grader-compare-runner.js +20 -27
  229. package/dist/pipeline/grader-comparison.d.ts +4 -8
  230. package/dist/pipeline/grader-comparison.js +11 -17
  231. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  232. package/dist/pipeline/grader-consistency-runner.js +16 -20
  233. package/dist/pipeline/grader-consistency.d.ts +6 -10
  234. package/dist/pipeline/grader-consistency.js +13 -32
  235. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  236. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  237. package/dist/pipeline/grader-sensitivity.js +10 -10
  238. package/dist/pipeline/grader-validate-runner.js +7 -5
  239. package/dist/pipeline/grader-validation.d.ts +2 -6
  240. package/dist/pipeline/grader-validation.js +14 -22
  241. package/dist/pipeline/map-request-to-config.js +6 -1
  242. package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
  243. package/dist/pipeline/mirror-repo-tasks.js +16 -15
  244. package/dist/pipeline/normalize-mode.d.ts +49 -0
  245. package/dist/pipeline/normalize-mode.js +64 -0
  246. package/dist/pipeline/plan.d.ts +5 -2
  247. package/dist/pipeline/plan.js +134 -78
  248. package/dist/pipeline/pr-comment.js +2 -0
  249. package/dist/pipeline/profile-resolution.d.ts +22 -14
  250. package/dist/pipeline/profile-resolution.js +41 -19
  251. package/dist/pipeline/provenance.d.ts +2 -2
  252. package/dist/pipeline/provenance.js +12 -17
  253. package/dist/pipeline/release-report.js +4 -4
  254. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  255. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  256. package/dist/pipeline/rubric-loader.d.ts +20 -0
  257. package/dist/pipeline/rubric-loader.js +37 -0
  258. package/dist/pipeline/validate.d.ts +4 -4
  259. package/dist/pipeline/validate.js +64 -53
  260. package/dist/schedules/loader.js +18 -8
  261. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  262. package/dist/scripts/migrate-task-mode.js +85 -0
  263. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  264. package/dist/scripts/validate-task-sources.d.ts +1 -1
  265. package/dist/scripts/validate-task-sources.js +15 -15
  266. package/dist/sinks/loader.js +5 -7
  267. package/dist/sources.d.ts +7 -7
  268. package/dist/sources.js +22 -24
  269. package/dist/webhook/dispatch.js +2 -1
  270. package/package.json +6 -3
  271. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  272. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  273. package/tasks/literacy/frameworks.task.ts +128 -0
  274. package/tasks/literacy/functions.task.ts +69 -0
  275. package/tasks/literacy/groq.task.ts +258 -0
  276. package/tasks/literacy/nextjs-live.task.ts +75 -0
  277. package/tasks/literacy/studio-setup.task.ts +131 -0
  278. package/tasks/literacy/visual-editing.task.ts +146 -0
  279. package/config/features.yaml +0 -116
  280. package/config/models.yaml +0 -116
  281. package/config/prompts.yaml +0 -75
  282. package/config/rubrics.yaml +0 -81
  283. package/config/schedules.yaml +0 -43
  284. package/config/sinks.yaml +0 -54
  285. package/config/sources.yaml +0 -51
  286. package/config/thresholds.yaml +0 -49
  287. package/dist/agent-observer/test-imports.d.ts +0 -7
  288. package/dist/agent-observer/test-imports.js +0 -185
@@ -0,0 +1,49 @@
1
+ /**
2
+ * scoring-bridge.ts — Bridge between Promptfoo raw results and the
3
+ * 4-tier scoring engine.
4
+ *
5
+ * Converts Promptfoo `ComponentResult[]` (from test results) into the
6
+ * scoring engine's `AssertionScore[]` format, then delegates aggregation
7
+ * to `aggregateDimensions` and `computeTaskScore` from core.
8
+ *
9
+ * This bridge replaces the three legacy scoring primitives in
10
+ * `calculate-scores.ts`:
11
+ * - `accumulateDimensions` → `componentToAssertionScore` + `aggregateDimensions`
12
+ * - `averageDimensions` → (handled internally by `aggregateDimensions`)
13
+ * - `weightedComposite` → `computeTaskScore`
14
+ *
15
+ * The bridge preserves the existing 0–100 output scale. The 4-tier
16
+ * engine works in [0, 1]; this module handles the conversion at
17
+ * boundaries.
18
+ *
19
+ * @see packages/core/src/services/scoring-engine.ts — the 4-tier engine
20
+ * @see packages/eval/src/pipeline/calculate-scores.ts — the consumer
21
+ * @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
22
+ */
23
+ import { type DimensionScore } from "../../_vendor/ailf-core/index.d.ts";
24
+ import type { TestResult } from "../../_vendor/ailf-core/index.d.ts";
25
+ /** Result of scoring a group of tests via the 4-tier engine */
26
+ export interface BridgedScoreResult {
27
+ /** Per-dimension breakdown (0–100 scale), keyed by camelCase dimension name (e.g., `taskCompletion`) */
28
+ dimensions: Record<string, number>;
29
+ /** Weighted composite score (0–100 scale) */
30
+ composite: number;
31
+ /** Total cost across all tests */
32
+ totalCost: number;
33
+ /** Raw DimensionScore objects from the engine (0–1 scale) */
34
+ rawDimensions: DimensionScore[];
35
+ }
36
+ /**
37
+ * Score a group of test results using the 4-tier scoring engine.
38
+ *
39
+ * This replaces the legacy `accumulateDimensions → averageDimensions →
40
+ * weightedComposite` chain with the new engine's `aggregateDimensions →
41
+ * computeTaskScore` chain.
42
+ *
43
+ * @param tests Pre-filtered test results (e.g., all gold or all baseline)
44
+ * @param profile Weight profile mapping kebab-case dimension names to weights
45
+ * (e.g., `{ "task-completion": 0.4, "code-correctness": 0.35, "doc-coverage": 0.25 }`)
46
+ * @param taskId Optional task identifier for traceability in TaskScore output
47
+ * @returns Dimensions (0–100) and composite (0–100), matching legacy output format
48
+ */
49
+ export declare function scoreTestGroup(tests: TestResult[], profile: Record<string, number>, taskId?: string): BridgedScoreResult;
@@ -0,0 +1,114 @@
1
+ /**
2
+ * scoring-bridge.ts — Bridge between Promptfoo raw results and the
3
+ * 4-tier scoring engine.
4
+ *
5
+ * Converts Promptfoo `ComponentResult[]` (from test results) into the
6
+ * scoring engine's `AssertionScore[]` format, then delegates aggregation
7
+ * to `aggregateDimensions` and `computeTaskScore` from core.
8
+ *
9
+ * This bridge replaces the three legacy scoring primitives in
10
+ * `calculate-scores.ts`:
11
+ * - `accumulateDimensions` → `componentToAssertionScore` + `aggregateDimensions`
12
+ * - `averageDimensions` → (handled internally by `aggregateDimensions`)
13
+ * - `weightedComposite` → `computeTaskScore`
14
+ *
15
+ * The bridge preserves the existing 0–100 output scale. The 4-tier
16
+ * engine works in [0, 1]; this module handles the conversion at
17
+ * boundaries.
18
+ *
19
+ * @see packages/core/src/services/scoring-engine.ts — the 4-tier engine
20
+ * @see packages/eval/src/pipeline/calculate-scores.ts — the consumer
21
+ * @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
22
+ */
23
+ import { aggregateDimensions, computeTaskScore, normalizeScore, } from "../../_vendor/ailf-core/index.js";
24
+ import { classifyRubric, parseRubricScore } from "../../_vendor/ailf-core/index.js";
25
+ // ---------------------------------------------------------------------------
26
+ // Public API
27
+ // ---------------------------------------------------------------------------
28
+ /**
29
+ * Score a group of test results using the 4-tier scoring engine.
30
+ *
31
+ * This replaces the legacy `accumulateDimensions → averageDimensions →
32
+ * weightedComposite` chain with the new engine's `aggregateDimensions →
33
+ * computeTaskScore` chain.
34
+ *
35
+ * @param tests Pre-filtered test results (e.g., all gold or all baseline)
36
+ * @param profile Weight profile mapping kebab-case dimension names to weights
37
+ * (e.g., `{ "task-completion": 0.4, "code-correctness": 0.35, "doc-coverage": 0.25 }`)
38
+ * @param taskId Optional task identifier for traceability in TaskScore output
39
+ * @returns Dimensions (0–100) and composite (0–100), matching legacy output format
40
+ */
41
+ export function scoreTestGroup(tests, profile, taskId) {
42
+ let totalCost = 0;
43
+ // Step 1: Convert all ComponentResults into AssertionScore[] (0–1 scale)
44
+ const assertionScores = [];
45
+ for (const test of tests) {
46
+ totalCost += test.cost;
47
+ for (const comp of test.gradingResult.componentResults) {
48
+ if (comp.assertion?.type !== "llm-rubric")
49
+ continue;
50
+ const converted = componentToAssertionScore(comp);
51
+ if (converted)
52
+ assertionScores.push(converted);
53
+ }
54
+ }
55
+ // Step 2: Aggregate into DimensionScores (0–1 scale)
56
+ const dimensionLabels = {
57
+ "code-correctness": "Code Correctness",
58
+ "doc-coverage": "Doc Coverage",
59
+ "task-completion": "Task Completion",
60
+ };
61
+ const rawDimensions = aggregateDimensions(assertionScores, {
62
+ defaultAggregation: "mean",
63
+ dimensionLabels,
64
+ });
65
+ // Step 3: Compute weighted composite via TaskScore (0–1 scale)
66
+ const taskScoreResult = computeTaskScore(rawDimensions, {
67
+ taskId: taskId ?? "aggregate",
68
+ weights: profile,
69
+ weightSource: "scoring-bridge",
70
+ });
71
+ // Step 4: Convert back to 0–100 scale for legacy compatibility
72
+ const dimensions = {};
73
+ for (const dim of rawDimensions) {
74
+ // Map kebab-case dimension IDs to camelCase for legacy compatibility
75
+ const camelKey = kebabToCamel(dim.dimensionId);
76
+ dimensions[camelKey] = Math.round(dim.score * 100);
77
+ }
78
+ return {
79
+ composite: Math.round(taskScoreResult.score * 100),
80
+ dimensions,
81
+ rawDimensions,
82
+ totalCost,
83
+ };
84
+ }
85
+ // ---------------------------------------------------------------------------
86
+ // Conversion helpers
87
+ // ---------------------------------------------------------------------------
/**
 * Translate one Promptfoo ComponentResult into an AssertionScore for the
 * scoring engine.
 *
 * Yields null when `classifyRubric` does not resolve a dimension for the
 * component.
 */
function componentToAssertionScore(comp) {
    const dimension = classifyRubric(comp);
    if (!dimension) {
        return null;
    }
    // The grader's raw score is parsed first, then normalized as an
    // "llm-rubric" score before it is handed to the engine.
    const score = normalizeScore(parseRubricScore(comp), "llm-rubric");
    const assertionType = comp.assertion?.type ?? "llm-rubric";
    const reason = comp.reason ?? "";
    return {
        assertionType,
        dimension,
        latencyMs: 0,
        pass: comp.pass,
        reason,
        score,
        weight: 1.0,
    };
}
/**
 * Turn a kebab-case dimension key into camelCase
 * (e.g., "task-completion" → "taskCompletion").
 *
 * Only a hyphen immediately followed by a lowercase ASCII letter is
 * rewritten; any other hyphen is left in place.
 */
function kebabToCamel(kebab) {
    const hyphenThenLower = /-([a-z])/g;
    const upperCaseLetter = (_match, letter) => letter.toUpperCase();
    return kebab.replace(hyphenThenLower, upperCaseLetter);
}
@@ -0,0 +1,54 @@
1
+ /**
2
+ * TaskGraphBuilder — converts task definitions into a TaskGraph IR.
3
+ *
4
+ * The builder is the first stage of the compilation pipeline:
5
+ * GeneralizedTaskDefinitions → TaskGraphBuilder → TaskGraph → PromptfooCompiler → YAML
6
+ *
7
+ * Responsibilities:
8
+ * - Accept tasks from any source (TS, YAML, Content Lake)
9
+ * - Apply area/tag/mode filtering
10
+ * - Resolve inter-task dependencies into edges
11
+ * - Validate the graph is a DAG (reject cycles)
12
+ * - Assign execution priority via topological sort
13
+ *
14
+ * This module exists alongside `generate-configs.ts` — it does NOT replace
15
+ * the existing codegen path. Phase 7 will swap callers over to the compiler.
16
+ *
17
+ * @see packages/core/src/types/task-graph.ts — TaskGraph types
18
+ * @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
19
+ */
20
+ import type { DependencyEdge, FilterOptions, GeneralizedTaskDefinition, TaskGraph, TaskNode } from "../../_vendor/ailf-core/index.d.ts";
21
+ /** Options for building a task graph */
22
+ export interface TaskGraphBuildOptions {
23
+ /** Task definitions from any source */
24
+ tasks: GeneralizedTaskDefinition[];
25
+ /** Optional filter to narrow task set */
26
+ filter?: FilterOptions;
27
+ /** Compilation target backend */
28
+ compilationTarget?: "custom" | "promptfoo";
29
+ }
30
+ /** Result of building a task graph */
31
+ export interface TaskGraphBuildResult {
32
+ /** The built graph (null if no tasks survived filtering) */
33
+ graph: TaskGraph | null;
34
+ /** Warnings emitted during build (non-fatal) */
35
+ warnings: string[];
36
+ /** Tasks that were filtered out */
37
+ filteredOut: string[];
38
+ }
39
+ /**
40
+ * Build a TaskGraph from task definitions.
41
+ *
42
+ * 1. Filters tasks by area, tags, task IDs, and status
43
+ * 2. Creates TaskNodes with resolved variables
44
+ * 3. Discovers dependency edges from task metadata
45
+ * 4. Validates the graph is acyclic
46
+ * 5. Assigns topological priority
47
+ */
48
+ export declare function buildTaskGraph(options: TaskGraphBuildOptions): TaskGraphBuildResult;
49
+ /**
50
+ * Detect cycles in the task graph using Kahn's algorithm.
51
+ *
52
+ * @returns null if acyclic, or the cycle path as a string array
53
+ */
54
+ export declare function detectCycle(nodes: Map<string, TaskNode>, edges: DependencyEdge[]): string[] | null;
@@ -0,0 +1,291 @@
1
+ /**
2
+ * TaskGraphBuilder — converts task definitions into a TaskGraph IR.
3
+ *
4
+ * The builder is the first stage of the compilation pipeline:
5
+ * GeneralizedTaskDefinitions → TaskGraphBuilder → TaskGraph → PromptfooCompiler → YAML
6
+ *
7
+ * Responsibilities:
8
+ * - Accept tasks from any source (TS, YAML, Content Lake)
9
+ * - Apply area/tag/task-ID/status filtering
10
+ * - Resolve inter-task dependencies into edges
11
+ * - Validate the graph is a DAG (reject cycles)
12
+ * - Assign execution priority via topological sort
13
+ *
14
+ * This module exists alongside `generate-configs.ts` — it does NOT replace
15
+ * the existing codegen path. Phase 7 will swap callers over to the compiler.
16
+ *
17
+ * @see packages/core/src/types/task-graph.ts — TaskGraph types
18
+ * @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
19
+ */
20
+ // ---------------------------------------------------------------------------
21
+ // Public API
22
+ // ---------------------------------------------------------------------------
23
/**
 * Build a TaskGraph from task definitions.
 *
 * 1. Filters tasks by area, tags, task IDs, and status
 * 2. Creates TaskNodes with resolved variables
 * 3. Discovers dependency edges from task metadata
 * 4. Validates the graph is acyclic
 * 5. Assigns topological priority
 *
 * When several surviving definitions share a task ID, the later one wins: a
 * warning is recorded and only the winning definition contributes a node and
 * dependency edges.
 *
 * @param options - Task definitions, an optional filter, and the compilation
 *   target (falls back to "promptfoo" when omitted).
 * @returns The graph (null when no task survives filtering), the non-fatal
 *   warnings, and the IDs of the tasks that were filtered out.
 * @throws Error when the dependency edges form a cycle.
 */
export function buildTaskGraph(options) {
    const warnings = [];
    const filteredOut = [];
    // Step 1: Filter tasks
    const filtered = filterTasks(options.tasks, options.filter, filteredOut);
    if (filtered.length === 0) {
        return {
            graph: null,
            warnings: ["No tasks matched the filter criteria"],
            filteredOut,
        };
    }
    // Step 2: Create nodes. `winners` tracks the definition behind each node so
    // that an overridden duplicate cannot leak its dependencies into the graph.
    const nodes = new Map();
    const winners = new Map();
    for (const task of filtered) {
        const node = taskToNode(task);
        if (nodes.has(node.taskId)) {
            warnings.push(`Duplicate task ID "${node.taskId}" — later definition wins`);
        }
        nodes.set(node.taskId, node);
        winners.set(node.taskId, task);
    }
    // Step 3: Discover edges from dependency metadata (winning definitions only)
    const edges = discoverEdges([...winners.values()], nodes, warnings);
    // Step 4: Validate acyclicity
    const cycleError = detectCycle(nodes, edges);
    if (cycleError) {
        throw new Error(`Task graph contains a cycle: ${cycleError.join(" → ")}. ` +
            "Task graphs must be directed acyclic graphs (DAGs).");
    }
    // Step 5: Assign topological priority
    assignPriority(nodes, edges);
    // Step 6: Build fixture map (empty for now — Phase 2d fills this)
    const fixtures = new Map();
    const graph = {
        compilationTarget: options.compilationTarget ?? "promptfoo",
        edges,
        fixtures,
        nodes,
    };
    return { graph, warnings, filteredOut };
}
73
+ // ---------------------------------------------------------------------------
74
+ // Filtering
75
+ // ---------------------------------------------------------------------------
76
/**
 * Select the tasks that take part in the graph.
 *
 * Status rules always apply: archived tasks are dropped; paused tasks are
 * dropped unless targeted by ID; draft tasks are dropped unless targeted by ID
 * or `filter.includeDrafts` is set. When a filter is given, its non-empty
 * `areas` (case-insensitive), `taskIds`, and `tags` lists must each match.
 *
 * @param tasks - Candidate task definitions.
 * @param filter - Optional filter options.
 * @param filteredOut - Output array; receives the ID of every dropped task.
 * @returns The tasks that passed every check, in their original order.
 */
function filterTasks(tasks, filter, filteredOut) {
    // Decides whether one task is excluded; checks run in a fixed order.
    const isExcluded = (task) => {
        const status = task.status ?? "active";
        const targetedById = filter?.taskIds && filter.taskIds.includes(task.id);
        switch (status) {
            case "archived":
                return true;
            case "paused":
                if (!targetedById)
                    return true;
                break;
            case "draft":
                if (!targetedById && !filter?.includeDrafts)
                    return true;
                break;
        }
        // The remaining checks need an explicit filter.
        if (!filter)
            return false;
        // Area — GeneralizedTaskDefinition uses `area` (not `featureArea`)
        if (filter.areas && filter.areas.length > 0) {
            const allowedAreas = filter.areas.map((a) => a.toLowerCase());
            const taskArea = task.area ?? "";
            if (!allowedAreas.includes(taskArea.toLowerCase()))
                return true;
        }
        // Task ID
        if (filter.taskIds && filter.taskIds.length > 0 && !filter.taskIds.includes(task.id)) {
            return true;
        }
        // Tags — at least one of the task's tags must be requested
        if (filter.tags && filter.tags.length > 0) {
            const matchesTag = Boolean(task.tags) && task.tags.some((t) => filter.tags.includes(t));
            if (!matchesTag)
                return true;
        }
        return false;
    };
    return tasks.filter((task) => {
        if (isExcluded(task)) {
            filteredOut.push(task.id);
            return false;
        }
        return true;
    });
}
121
+ // ---------------------------------------------------------------------------
122
+ // Node creation
123
+ // ---------------------------------------------------------------------------
124
/**
 * Convert one task definition into a graph node.
 *
 * The prompt comes from `prompt.text`, falling back to `prompt.template` and
 * then to an empty string. A non-empty prompt is exposed as the `task`
 * variable; entries of `prompt.vars` are merged afterwards, so a `task` key in
 * `prompt.vars` takes precedence.
 *
 * @param task - Task definition (GeneralizedTaskDefinition shape).
 * @returns A node with no dependencies and priority 0.
 */
function taskToNode(task) {
    // GeneralizedTaskDefinition keeps the prompt under `prompt` (text/template)
    // and its variables under `prompt.vars` — formerly taskPrompt / extraVars.
    const prompt = task.prompt;
    const resolvedPrompt = prompt?.text ?? prompt?.template ?? "";
    const extraVars = prompt?.vars ?? {};
    const values = resolvedPrompt
        ? { task: resolvedPrompt, ...extraVars }
        : { ...extraVars };
    const resolvedVariables = {
        declarations: [],
        provenance: {},
        values,
    };
    return {
        dependsOn: [],
        mode: task.mode,
        priority: 0,
        resolvedPrompt,
        resolvedVariables,
        taskId: task.id,
    };
}
146
+ // ---------------------------------------------------------------------------
147
+ // Edge discovery
148
+ // ---------------------------------------------------------------------------
149
/**
 * Discover dependency edges from task metadata.
 *
 * Looks for explicit `dependsOn` arrays in prompt.vars (the generalized
 * equivalent of the old extraVars convention).
 * Future phases will add implicit deps from fixture sharing, data flow, etc.
 *
 * Each (dependency, task) pair yields at most one edge, even when the
 * dependency is listed more than once — matching the de-duplication already
 * applied to the node's `dependsOn` list.
 *
 * @param tasks - Task definitions to inspect.
 * @param nodes - Graph nodes keyed by task ID; a matching node's `dependsOn`
 *   array is extended in place.
 * @param warnings - Output array; receives a message for every dependency that
 *   points at a task missing from `nodes`.
 * @returns "ordering" edges pointing from each dependency to the dependent task.
 */
function discoverEdges(tasks, nodes, warnings) {
    const edges = [];
    // Dependencies already turned into an edge, grouped by dependent task ID.
    const linkedByTask = new Map();
    for (const task of tasks) {
        // Check for explicit dependencies in prompt.vars (was extraVars.dependsOn)
        const deps = task.prompt?.vars?.dependsOn;
        if (!Array.isArray(deps))
            continue;
        let linked = linkedByTask.get(task.id);
        if (!linked) {
            linked = new Set();
            linkedByTask.set(task.id, linked);
        }
        const node = nodes.get(task.id);
        for (const dep of deps) {
            // Non-string entries cannot name a task and are skipped.
            if (typeof dep !== "string")
                continue;
            if (!nodes.has(dep)) {
                warnings.push(`Task "${task.id}" depends on "${dep}" which is not in the graph — ` +
                    "dependency ignored (task may have been filtered out)");
                continue;
            }
            // A repeated dependency must not produce a second identical edge.
            if (linked.has(dep))
                continue;
            linked.add(dep);
            edges.push({ from: dep, to: task.id, type: "ordering" });
            if (node && !node.dependsOn.includes(dep)) {
                node.dependsOn.push(dep);
            }
        }
    }
    return edges;
}
180
+ // ---------------------------------------------------------------------------
181
+ // Cycle detection — Kahn's algorithm (topological sort)
182
+ // ---------------------------------------------------------------------------
183
/**
 * Detect cycles in the task graph using Kahn's algorithm.
 *
 * Edges whose `from` or `to` is not a key of `nodes` are ignored, so a
 * dangling edge can neither crash the check nor be reported as a cycle.
 *
 * @param nodes - Graph nodes keyed by task ID.
 * @param edges - Dependency edges; `from` must be processed before `to`.
 * @returns null if acyclic, or the cycle path as a string array
 */
export function detectCycle(nodes, edges) {
    // Build in-degree map and adjacency lists
    const inDegree = new Map();
    const adjacency = new Map();
    for (const id of nodes.keys()) {
        inDegree.set(id, 0);
        adjacency.set(id, []);
    }
    for (const edge of edges) {
        const successors = adjacency.get(edge.from);
        // Skip edges that reference a node outside the graph.
        if (!successors || !inDegree.has(edge.to))
            continue;
        successors.push(edge.to);
        inDegree.set(edge.to, inDegree.get(edge.to) + 1);
    }
    // Start with all zero-in-degree nodes
    const queue = [];
    for (const [id, deg] of inDegree) {
        if (deg === 0)
            queue.push(id);
    }
    // `queue` doubles as the processed list: a read cursor replaces shift(),
    // which would make the sort quadratic.
    for (let head = 0; head < queue.length; head++) {
        for (const neighbor of adjacency.get(queue[head])) {
            const newDeg = inDegree.get(neighbor) - 1;
            inDegree.set(neighbor, newDeg);
            if (newDeg === 0)
                queue.push(neighbor);
        }
    }
    if (queue.length === nodes.size)
        return null;
    // Unprocessed nodes (remaining in-degree > 0) lie on a cycle or downstream of one
    const cycleNodes = [...inDegree.entries()]
        .filter(([, deg]) => deg > 0)
        .map(([id]) => id);
    // Reconstruct a cycle path for the error message
    return reconstructCyclePath(cycleNodes, adjacency);
}
/**
 * Reconstruct a human-readable cycle path from cycle participants.
 *
 * Walks backwards along predecessor links restricted to `cycleNodes` until a
 * node repeats, then returns that loop in edge direction. Walking backwards is
 * what makes this reliable: every unprocessed node still has an unprocessed
 * predecessor, whereas a forward walk can dead-end on a node that is merely
 * downstream of the cycle.
 *
 * @param cycleNodes - IDs left unprocessed by the topological sort.
 * @param adjacency - Successor lists keyed by node ID.
 * @returns A path that starts and ends on the same ID (e.g. ["a", "b", "a"]);
 *   an empty array when `cycleNodes` is empty. If the input contains no loop,
 *   the chain that was walked is returned in edge direction.
 */
function reconstructCyclePath(cycleNodes, adjacency) {
    if (cycleNodes.length === 0)
        return [];
    const inCycle = new Set(cycleNodes);
    // First predecessor (within the candidate set) of every candidate node.
    const predecessor = new Map();
    for (const from of cycleNodes) {
        for (const to of adjacency.get(from) ?? []) {
            if (inCycle.has(to) && !predecessor.has(to))
                predecessor.set(to, from);
        }
    }
    // Walk backwards, remembering where each node sits on the trail.
    const trail = [];
    const position = new Map();
    let current = cycleNodes[0];
    while (current !== undefined && !position.has(current)) {
        position.set(current, trail.length);
        trail.push(current);
        current = predecessor.get(current);
    }
    if (current === undefined) {
        // No loop reachable (only possible for input not produced by detectCycle).
        return trail.reverse();
    }
    // trail[position..] holds the loop against edge direction; flip it and
    // close it on the repeated node.
    const loop = trail.slice(position.get(current)).reverse();
    return [current, ...loop];
}
250
+ // ---------------------------------------------------------------------------
251
+ // Topological priority assignment
252
+ // ---------------------------------------------------------------------------
253
/**
 * Assign execution priority via topological order.
 * Lower priority = earlier execution.
 *
 * Nodes are visited level by level: nodes without incoming edges get
 * priority 0, and a node becomes part of the next level once its last
 * outstanding predecessor has been visited. Priorities are written onto the
 * node objects in place.
 *
 * @param nodes - Graph nodes keyed by task ID.
 * @param edges - Dependency edges (`from` runs before `to`).
 */
function assignPriority(nodes, edges) {
    const pending = new Map(); // outstanding predecessor count per node
    const successors = new Map();
    for (const id of nodes.keys()) {
        pending.set(id, 0);
        successors.set(id, []);
    }
    for (const { from, to } of edges) {
        successors.get(from).push(to);
        pending.set(to, (pending.get(to) ?? 0) + 1);
    }
    let frontier = [];
    for (const [id, count] of pending) {
        if (count === 0)
            frontier.push(id);
    }
    // Every pass handles one level; all of its nodes share the same priority.
    for (let level = 0; frontier.length > 0; level++) {
        const nextFrontier = [];
        for (const id of frontier) {
            const node = nodes.get(id);
            if (node)
                node.priority = level;
            for (const succ of successors.get(id) ?? []) {
                const remaining = (pending.get(succ) ?? 1) - 1;
                pending.set(succ, remaining);
                if (remaining === 0)
                    nextFrontier.push(succ);
            }
        }
        frontier = nextFrontier;
    }
}
@@ -0,0 +1,90 @@
1
+ /**
2
+ * Cost tracking — model pricing, pre-run estimation, and post-run actuals.
3
+ *
4
+ * Uses a pricing table (YAML config or TS `definePricingTable()`) to compute
5
+ * USD cost from token usage. Supports budget controls with warn/stop thresholds.
6
+ *
7
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
8
+ */
9
+ import type { TraceTokenUsage } from "../../../_vendor/ailf-core/index.d.ts";
10
+ /** Per-model token pricing; every rate is expressed in USD per 1M tokens */
11
+ export interface ModelPricing {
12
+ /** Cost in USD per 1M input tokens */
13
+ input: number;
14
+ /** Cost in USD per 1M output tokens */
15
+ output: number;
16
+ /** Cost in USD per 1M cached input tokens (optional) — NOTE(review): how cached tokens are billed when this is omitted is not visible in this declaration */
17
+ cachedInput?: number;
18
+ }
19
+ /** Budget control thresholds (in USD). Each scope (`perRun`, `perTask`) is optional and, when present, carries both a `warn` and a `stop` amount — presumably `warn` only reports while `stop` halts execution; confirm against `checkBudget` */
20
+ export interface BudgetConfig {
21
+ perRun?: {
22
+ warn: number;
23
+ stop: number;
24
+ };
25
+ perTask?: {
26
+ warn: number;
27
+ stop: number;
28
+ };
29
+ }
30
+ /** Pre-execution cost estimate for a pipeline run (the return type of `estimateRunCost`) */
31
+ export interface CostEstimate {
32
+ /** Estimated total cost in USD */
33
+ totalUSD: number;
34
+ /** Per-model breakdown: each entry pairs a model ID with its estimated cost in USD */
35
+ perModel: {
36
+ modelId: string;
37
+ estimatedUSD: number;
38
+ }[];
39
+ /** Whether the estimate exceeds the budget warning threshold — NOTE(review): which budget scope is compared is not visible in this declaration */
40
+ exceedsWarning: boolean;
41
+ /** Whether the estimate exceeds the budget stop threshold */
42
+ exceedsStop: boolean;
43
+ }
44
+ /** Actual (post-run) cost computed from real token usage */
45
+ export interface ActualCost {
46
+ /** Actual total cost in USD */
47
+ totalUSD: number;
48
+ /** Per-model actual cost: model ID, its cost in USD, and the token usage recorded for it */
49
+ perModel: {
50
+ modelId: string;
51
+ actualUSD: number;
52
+ tokens: TraceTokenUsage;
53
+ }[];
54
+ }
55
+ /** Budget check result (the return type of `checkBudget`) */
56
+ export interface BudgetCheckResult {
57
+ /** Whether to proceed — presumably false once the stop threshold is exceeded; confirm against `checkBudget` */
58
+ proceed: boolean;
59
+ /** Warning message (if any) */
60
+ warning?: string;
61
+ /** Current spend in USD */
62
+ currentUSD: number;
63
+ /** Budget limit (in USD) that was checked; optional — NOTE(review): the conditions under which it is omitted are not visible in this declaration */
64
+ limitUSD?: number;
65
+ }
66
+ /**
67
+ * Compute the actual cost of a single token-usage record under one model's pricing.
68
+ *
69
+ * @param usage - Token counts from provider response
70
+ * @param pricing - Pricing of the model that produced the usage (rates are USD per 1M tokens)
71
+ * @returns Cost in USD
72
+ */
73
+ export declare function computeCost(usage: TraceTokenUsage, pricing: ModelPricing): number;
74
+ /**
75
+ * Look up pricing for a model ID; the result is `undefined` when no entry matches.
76
+ *
77
+ * Tries exact match first, then falls back to prefix matching
78
+ * (e.g., "openai:chat:gpt-4o-2024-11-20" matches "openai:chat:gpt-4o"). NOTE(review): how `customPricing` is combined with any built-in table is not visible in this declaration.
79
+ */
80
+ export declare function lookupPricing(modelId: string, customPricing?: Record<string, ModelPricing>): ModelPricing | undefined;
81
+ /**
82
+ * Estimate cost for a pipeline run before execution.
83
+ *
84
+ * Uses task count, estimated tokens per task complexity, and model pricing. `budget` is optional — presumably it drives the `exceedsWarning` / `exceedsStop` flags of the result; confirm against the implementation.
85
+ */
86
+ export declare function estimateRunCost(taskCount: number, modelIds: string[], budget?: BudgetConfig, customPricing?: Record<string, ModelPricing>): CostEstimate;
87
+ /**
88
+ * Check if current spend (in USD) exceeds budget thresholds; `level` names the budget scope to check ("perRun" or "perTask", matching the keys of `BudgetConfig`).
89
+ */
90
+ export declare function checkBudget(currentUSD: number, budget: BudgetConfig, level: "perRun" | "perTask"): BudgetCheckResult;