@sanity/ailf 0.4.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. package/config/features.ts +23 -0
  2. package/config/models.ts +83 -0
  3. package/config/prompts.ts +16 -0
  4. package/config/rubrics.ts +225 -0
  5. package/config/schedules.ts +47 -0
  6. package/config/sinks.ts +37 -0
  7. package/config/sources.ts +21 -0
  8. package/config/thresholds.ts +61 -0
  9. package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
  10. package/dist/_vendor/ailf-core/config-helpers.js +150 -0
  11. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  12. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  13. package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
  14. package/dist/_vendor/ailf-core/examples/index.js +10 -10
  15. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  16. package/dist/_vendor/ailf-core/index.js +5 -0
  17. package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
  18. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  19. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  20. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  21. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  22. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  23. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  24. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
  25. package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
  26. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
  27. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
  28. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +32 -31
  29. package/dist/_vendor/ailf-core/schemas/pipeline.js +52 -12
  30. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  31. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  32. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  33. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  34. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  35. package/dist/_vendor/ailf-core/services/index.js +2 -1
  36. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  37. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  38. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  39. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  40. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  41. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  42. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  43. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  44. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  46. package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
  47. package/dist/_vendor/ailf-core/types/index.js +8 -1
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
  50. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  51. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  52. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  53. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  54. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  55. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  56. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  57. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  58. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  59. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  60. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  61. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  62. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  63. package/dist/_vendor/ailf-shared/index.js +0 -1
  64. package/dist/adapters/api-client/build-request.js +14 -13
  65. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  66. package/dist/adapters/config-sources/file-config-adapter.js +38 -12
  67. package/dist/adapters/config-sources/index.d.ts +2 -0
  68. package/dist/adapters/config-sources/index.js +1 -0
  69. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  70. package/dist/adapters/config-sources/ts-config-loader.js +133 -0
  71. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  72. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  73. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  74. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  75. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  76. package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
  77. package/dist/adapters/task-sources/index.d.ts +1 -0
  78. package/dist/adapters/task-sources/index.js +1 -0
  79. package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
  80. package/dist/adapters/task-sources/repo-task-source.js +69 -16
  81. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  82. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  83. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  84. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  85. package/dist/cli.js +0 -2
  86. package/dist/commands/baseline.js +4 -1
  87. package/dist/commands/calculate-scores.js +1 -1
  88. package/dist/commands/coverage-audit.js +7 -1
  89. package/dist/commands/explain-handler.js +25 -23
  90. package/dist/commands/fetch-docs.js +3 -2
  91. package/dist/commands/generate-configs.js +1 -1
  92. package/dist/commands/interactive.js +11 -7
  93. package/dist/commands/pipeline-action.d.ts +2 -0
  94. package/dist/commands/pipeline-action.js +16 -6
  95. package/dist/commands/pipeline.d.ts +1 -0
  96. package/dist/commands/pipeline.js +4 -2
  97. package/dist/commands/pr-comment.js +1 -1
  98. package/dist/commands/publish.js +2 -2
  99. package/dist/commands/readiness-report.js +13 -6
  100. package/dist/composition-root.d.ts +1 -1
  101. package/dist/composition-root.js +67 -4
  102. package/dist/orchestration/build-app-context.js +1 -0
  103. package/dist/orchestration/build-step-sequence.js +24 -6
  104. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  105. package/dist/orchestration/steps/fetch-docs-step.js +6 -4
  106. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  107. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  108. package/dist/orchestration/steps/generate-configs-step.js +245 -51
  109. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  110. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  111. package/dist/orchestration/steps/readiness-step.js +5 -6
  112. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  113. package/dist/orchestration/steps/run-eval-step.js +8 -7
  114. package/dist/pipeline/cache.d.ts +1 -1
  115. package/dist/pipeline/cache.js +36 -8
  116. package/dist/pipeline/calculate-scores.d.ts +5 -7
  117. package/dist/pipeline/calculate-scores.js +74 -153
  118. package/dist/pipeline/checks.js +2 -2
  119. package/dist/pipeline/compare.js +8 -8
  120. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  121. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  122. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  123. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  124. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  125. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  126. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  127. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  128. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  129. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
  130. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  131. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  132. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  133. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  134. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  135. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
  136. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  137. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  138. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  139. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  140. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  141. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  142. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  143. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  144. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  145. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  146. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  147. package/dist/pipeline/compiler/config-loader.js +111 -0
  148. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  149. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  150. package/dist/pipeline/compiler/hash.d.ts +11 -0
  151. package/dist/pipeline/compiler/hash.js +18 -0
  152. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  153. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  154. package/dist/pipeline/compiler/index.d.ts +29 -0
  155. package/dist/pipeline/compiler/index.js +45 -0
  156. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  157. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  158. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  159. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  160. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  161. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  162. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  163. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  164. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  165. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  166. package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
  167. package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
  168. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  169. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  170. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  171. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  172. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  173. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
  174. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
  175. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
  176. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  177. package/dist/pipeline/compiler/presets/index.js +8 -0
  178. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
  179. package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
  180. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  181. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  182. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  183. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  184. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  185. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  186. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  187. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  188. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  189. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  190. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  191. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  192. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  193. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  194. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  195. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  196. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  197. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  198. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  199. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  200. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  201. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  202. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  203. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  204. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  205. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  206. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  207. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  208. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  209. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  210. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  211. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  212. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  213. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  214. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  215. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  216. package/dist/pipeline/coverage-audit.d.ts +15 -5
  217. package/dist/pipeline/coverage-audit.js +41 -22
  218. package/dist/pipeline/eval-constants.d.ts +16 -6
  219. package/dist/pipeline/eval-constants.js +25 -4
  220. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  221. package/dist/pipeline/eval-fingerprint.js +8 -9
  222. package/dist/pipeline/expand-tasks.d.ts +23 -14
  223. package/dist/pipeline/expand-tasks.js +37 -31
  224. package/dist/pipeline/gap-analysis.d.ts +1 -1
  225. package/dist/pipeline/gap-analysis.js +2 -2
  226. package/dist/pipeline/generate-configs.d.ts +22 -4
  227. package/dist/pipeline/generate-configs.js +53 -24
  228. package/dist/pipeline/grader-api.d.ts +3 -3
  229. package/dist/pipeline/grader-api.js +5 -12
  230. package/dist/pipeline/grader-compare-runner.js +20 -27
  231. package/dist/pipeline/grader-comparison.d.ts +4 -8
  232. package/dist/pipeline/grader-comparison.js +11 -17
  233. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  234. package/dist/pipeline/grader-consistency-runner.js +18 -21
  235. package/dist/pipeline/grader-consistency.d.ts +6 -10
  236. package/dist/pipeline/grader-consistency.js +13 -32
  237. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  238. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  239. package/dist/pipeline/grader-sensitivity.js +10 -10
  240. package/dist/pipeline/grader-validate-runner.js +7 -5
  241. package/dist/pipeline/grader-validation.d.ts +2 -6
  242. package/dist/pipeline/grader-validation.js +14 -22
  243. package/dist/pipeline/map-request-to-config.js +6 -1
  244. package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
  245. package/dist/pipeline/mirror-repo-tasks.js +16 -15
  246. package/dist/pipeline/normalize-mode.d.ts +49 -0
  247. package/dist/pipeline/normalize-mode.js +64 -0
  248. package/dist/pipeline/plan.d.ts +5 -2
  249. package/dist/pipeline/plan.js +134 -78
  250. package/dist/pipeline/pr-comment.js +2 -0
  251. package/dist/pipeline/profile-resolution.d.ts +47 -0
  252. package/dist/pipeline/profile-resolution.js +91 -0
  253. package/dist/pipeline/provenance.d.ts +2 -2
  254. package/dist/pipeline/provenance.js +12 -17
  255. package/dist/pipeline/release-report.js +4 -4
  256. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  257. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  258. package/dist/pipeline/rubric-loader.d.ts +20 -0
  259. package/dist/pipeline/rubric-loader.js +37 -0
  260. package/dist/pipeline/validate.d.ts +4 -4
  261. package/dist/pipeline/validate.js +64 -53
  262. package/dist/schedules/loader.js +18 -8
  263. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  264. package/dist/scripts/migrate-task-mode.js +85 -0
  265. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  266. package/dist/scripts/validate-task-sources.d.ts +1 -1
  267. package/dist/scripts/validate-task-sources.js +15 -15
  268. package/dist/sinks/loader.js +5 -7
  269. package/dist/sources.d.ts +7 -7
  270. package/dist/sources.js +22 -24
  271. package/dist/webhook/dispatch.js +2 -1
  272. package/package.json +6 -3
  273. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  274. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  275. package/tasks/literacy/frameworks.task.ts +128 -0
  276. package/tasks/literacy/functions.task.ts +69 -0
  277. package/tasks/literacy/groq.task.ts +258 -0
  278. package/tasks/literacy/nextjs-live.task.ts +75 -0
  279. package/tasks/literacy/studio-setup.task.ts +131 -0
  280. package/tasks/literacy/visual-editing.task.ts +146 -0
  281. package/config/features.yaml +0 -116
  282. package/config/models.yaml +0 -116
  283. package/config/prompts.yaml +0 -75
  284. package/config/rubrics.yaml +0 -62
  285. package/config/schedules.yaml +0 -43
  286. package/config/sinks.yaml +0 -54
  287. package/config/sources.yaml +0 -51
  288. package/config/thresholds.yaml +0 -49
  289. package/dist/agent-observer/test-imports.d.ts +0 -7
  290. package/dist/agent-observer/test-imports.js +0 -185
@@ -0,0 +1,471 @@
1
+ /**
2
+ * scoring-and-presets.test.ts — Tests for 4-tier scoring engine,
3
+ * storage schema, and plugin registry / presets.
4
+ *
5
+ * Run: npx tsx --test src/pipeline/compiler/__tests__/scoring-and-presets.test.ts
6
+ */
7
+ import assert from "node:assert/strict";
8
+ import { dirname, resolve } from "node:path";
9
+ import { describe, it } from "node:test";
10
+ import { fileURLToPath } from "node:url";
11
+ const __dirname = dirname(fileURLToPath(import.meta.url));
12
+ import { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, } from "../../../_vendor/ailf-core/index.js";
13
+ import { CURRENT_SCHEMA_VERSION, InMemoryPluginRegistry, isSchemaVersioned, migrateDocument, } from "../../../_vendor/ailf-core/index.js";
14
+ import { createSanityLiteracyPreset, sanityLiteracyPreset, } from "../presets/sanity-literacy.js";
15
+ // ---------------------------------------------------------------------------
16
+ // Helpers
17
+ // ---------------------------------------------------------------------------
18
+ function makeAssertion(overrides) {
19
+ return {
20
+ pass: true,
21
+ score: 0.8,
22
+ reason: "Good",
23
+ assertionType: "llm-rubric",
24
+ dimension: "task-completion",
25
+ latencyMs: 100,
26
+ weight: 1.0,
27
+ ...overrides,
28
+ };
29
+ }
30
+ function makeDimension(overrides) {
31
+ return {
32
+ dimensionId: "task-completion",
33
+ label: "Task Completion",
34
+ score: 0.8,
35
+ assertionCount: 2,
36
+ passCount: 2,
37
+ aggregation: "weighted-mean",
38
+ assertions: [],
39
+ ...overrides,
40
+ };
41
+ }
42
+ // ---------------------------------------------------------------------------
43
+ // Tier 1 → Tier 2: Assertion → Dimension aggregation
44
+ // ---------------------------------------------------------------------------
45
+ describe("aggregateDimensions", () => {
46
+ it("groups assertions by dimension", () => {
47
+ const assertions = [
48
+ makeAssertion({ dimension: "code-correctness", score: 0.9 }),
49
+ makeAssertion({ dimension: "code-correctness", score: 0.7 }),
50
+ makeAssertion({ dimension: "task-completion", score: 0.8 }),
51
+ ];
52
+ const dims = aggregateDimensions(assertions);
53
+ assert.equal(dims.length, 2);
54
+ const cc = dims.find((d) => d.dimensionId === "code-correctness");
55
+ assert.ok(cc);
56
+ assert.equal(cc.assertionCount, 2);
57
+ });
58
+ it("uses weighted-mean by default", () => {
59
+ const assertions = [
60
+ makeAssertion({ score: 0.6, weight: 1.0 }),
61
+ makeAssertion({ score: 0.8, weight: 3.0 }),
62
+ ];
63
+ const dims = aggregateDimensions(assertions);
64
+ // Weighted mean: (0.6*1 + 0.8*3) / (1+3) = 3.0/4 = 0.75
65
+ assert.ok(Math.abs(dims[0].score - 0.75) < 0.01);
66
+ });
67
+ it("falls back to pass rate when no numeric scores", () => {
68
+ const assertions = [
69
+ makeAssertion({ score: null, pass: true }),
70
+ makeAssertion({ score: null, pass: false }),
71
+ ];
72
+ const dims = aggregateDimensions(assertions);
73
+ assert.equal(dims[0].score, 0.5);
74
+ });
75
+ it("applies custom dimension labels", () => {
76
+ const assertions = [makeAssertion({ dimension: "tc" })];
77
+ const dims = aggregateDimensions(assertions, {
78
+ dimensionLabels: { tc: "Task Completion" },
79
+ });
80
+ assert.equal(dims[0].label, "Task Completion");
81
+ });
82
+ });
83
+ // ---------------------------------------------------------------------------
84
+ // Tier 2 → Tier 3: Dimension → Task scoring
85
+ // ---------------------------------------------------------------------------
86
+ describe("computeTaskScore", () => {
87
+ it("computes weighted score from dimensions", () => {
88
+ const dims = [
89
+ makeDimension({ dimensionId: "tc", score: 0.8 }),
90
+ makeDimension({ dimensionId: "cc", score: 0.6 }),
91
+ ];
92
+ const task = computeTaskScore(dims, {
93
+ taskId: "test-task",
94
+ weights: { tc: 0.6, cc: 0.4 },
95
+ });
96
+ // 0.8*0.6 + 0.6*0.4 = 0.48 + 0.24 = 0.72
97
+ assert.ok(Math.abs(task.score - 0.72) < 0.01);
98
+ });
99
+ it("normalizes weights that don't sum to 1", () => {
100
+ const dims = [
101
+ makeDimension({ dimensionId: "tc", score: 1.0 }),
102
+ makeDimension({ dimensionId: "cc", score: 0.0 }),
103
+ ];
104
+ const task = computeTaskScore(dims, {
105
+ taskId: "test-task",
106
+ weights: { tc: 2, cc: 2 },
107
+ });
108
+ // (1.0*2 + 0.0*2) / (2+2) = 2/4 = 0.5
109
+ assert.ok(Math.abs(task.score - 0.5) < 0.01);
110
+ });
111
+ it("checks against threshold", () => {
112
+ const dims = [makeDimension({ dimensionId: "tc", score: 0.6 })];
113
+ const passing = computeTaskScore(dims, {
114
+ taskId: "t1",
115
+ weights: { tc: 1.0 },
116
+ threshold: 0.5,
117
+ });
118
+ assert.equal(passing.passesThreshold, true);
119
+ const failing = computeTaskScore(dims, {
120
+ taskId: "t2",
121
+ weights: { tc: 1.0 },
122
+ threshold: 0.7,
123
+ });
124
+ assert.equal(failing.passesThreshold, false);
125
+ });
126
+ it("records weight source", () => {
127
+ const task = computeTaskScore([makeDimension()], {
128
+ taskId: "t1",
129
+ weights: { "task-completion": 1.0 },
130
+ weightSource: "rubrics.yaml:default",
131
+ });
132
+ assert.equal(task.weightSource, "rubrics.yaml:default");
133
+ });
134
+ });
135
+ // ---------------------------------------------------------------------------
136
+ // Tier 3 → Tier 4: Task → Area aggregation
137
+ // ---------------------------------------------------------------------------
138
+ describe("aggregateAreas", () => {
139
+ it("groups tasks by area prefix", () => {
140
+ const tasks = [
141
+ computeTaskScore([makeDimension({ score: 0.8 })], {
142
+ taskId: "groq-basic",
143
+ weights: { "task-completion": 1.0 },
144
+ }),
145
+ computeTaskScore([makeDimension({ score: 0.6 })], {
146
+ taskId: "groq-advanced",
147
+ weights: { "task-completion": 1.0 },
148
+ }),
149
+ computeTaskScore([makeDimension({ score: 0.9 })], {
150
+ taskId: "studio-schema",
151
+ weights: { "task-completion": 1.0 },
152
+ }),
153
+ ];
154
+ const areas = aggregateAreas(tasks);
155
+ assert.equal(areas.length, 2);
156
+ const groq = areas.find((a) => a.areaId === "groq");
157
+ assert.ok(groq);
158
+ assert.equal(groq.taskCount, 2);
159
+ assert.ok(Math.abs(groq.score - 0.7) < 0.01); // (0.8+0.6)/2
160
+ const studio = areas.find((a) => a.areaId === "studio");
161
+ assert.ok(studio);
162
+ assert.equal(studio.taskCount, 1);
163
+ });
164
+ it("computes delta from previous scores", () => {
165
+ const tasks = [
166
+ computeTaskScore([makeDimension({ score: 0.8 })], {
167
+ taskId: "groq-basic",
168
+ weights: { "task-completion": 1.0 },
169
+ }),
170
+ ];
171
+ const areas = aggregateAreas(tasks, { groq: 0.6 });
172
+ assert.ok(areas[0].delta !== null);
173
+ assert.ok(Math.abs(areas[0].delta - 0.2) < 0.01);
174
+ });
175
+ });
176
+ // ---------------------------------------------------------------------------
177
+ // Score normalization
178
+ // ---------------------------------------------------------------------------
179
+ describe("normalizeScore", () => {
180
+ it("normalizes LLM rubric scores (0-100 → 0-1)", () => {
181
+ assert.ok(Math.abs(normalizeScore(75, "llm-rubric") - 0.75) < 0.01);
182
+ });
183
+ it("passes through already-normalized scores", () => {
184
+ assert.ok(Math.abs(normalizeScore(0.75, "llm-rubric") - 0.75) < 0.01);
185
+ });
186
+ it("normalizes boolean assertions to 0 or 1", () => {
187
+ assert.equal(normalizeScore(1, "contains"), 1);
188
+ assert.equal(normalizeScore(0, "contains"), 0);
189
+ });
190
+ it("clamps similarity scores to [0, 1]", () => {
191
+ assert.equal(normalizeScore(1.5, "similar"), 1);
192
+ assert.equal(normalizeScore(-0.1, "similar"), 0);
193
+ });
194
+ });
195
+ // ---------------------------------------------------------------------------
196
+ // Ensemble grading
197
+ // ---------------------------------------------------------------------------
198
+ describe("computeEnsembleScore", () => {
199
+ it("computes mean ensemble score", () => {
200
+ const { score, agreement } = computeEnsembleScore([0.8, 0.6, 0.7], "mean");
201
+ assert.ok(Math.abs(score - 0.7) < 0.01);
202
+ assert.ok(agreement > 0);
203
+ });
204
+ it("computes median ensemble score", () => {
205
+ const { score } = computeEnsembleScore([0.9, 0.5, 0.7], "median");
206
+ assert.ok(Math.abs(score - 0.7) < 0.01);
207
+ });
208
+ it("computes max ensemble score", () => {
209
+ const { score } = computeEnsembleScore([0.9, 0.5, 0.7], "max");
210
+ assert.ok(Math.abs(score - 0.9) < 0.01);
211
+ });
212
+ it("agreement is 1 for identical scores", () => {
213
+ const { agreement } = computeEnsembleScore([0.8, 0.8, 0.8]);
214
+ assert.ok(Math.abs(agreement - 1.0) < 0.01);
215
+ });
216
+ it("agreement decreases with divergent scores", () => {
217
+ const { agreement } = computeEnsembleScore([0.0, 1.0]);
218
+ assert.ok(agreement < 0.6);
219
+ });
220
+ });
221
+ // ---------------------------------------------------------------------------
222
+ // Storage schema
223
+ // ---------------------------------------------------------------------------
224
+ describe("storage schema", () => {
225
+ it("CURRENT_SCHEMA_VERSION is 1", () => {
226
+ assert.equal(CURRENT_SCHEMA_VERSION, 1);
227
+ });
228
+ it("isSchemaVersioned detects versioned docs", () => {
229
+ assert.equal(isSchemaVersioned({ schemaVersion: 1 }), true);
230
+ assert.equal(isSchemaVersioned({}), false);
231
+ assert.equal(isSchemaVersioned(null), false);
232
+ });
233
+ it("migrateDocument is no-op for current version", () => {
234
+ const doc = { schemaVersion: 1, _type: "ailf.run" };
235
+ const migrated = migrateDocument(doc);
236
+ assert.equal(migrated.schemaVersion, 1);
237
+ });
238
+ });
239
+ // ---------------------------------------------------------------------------
240
+ // Plugin registry
241
+ // ---------------------------------------------------------------------------
242
+ describe("InMemoryPluginRegistry", () => {
243
+ it("registers and retrieves modes", () => {
244
+ const registry = new InMemoryPluginRegistry();
245
+ registry.registerMode({
246
+ id: "custom",
247
+ label: "Custom Mode",
248
+ validProviderPatterns: [".*"],
249
+ rubricTemplateIds: [],
250
+ handlerModule: "./custom.js",
251
+ });
252
+ assert.equal(registry.getModes().length, 1);
253
+ assert.equal(registry.getMode("custom")?.label, "Custom Mode");
254
+ });
255
+ it("registers and retrieves assertions", () => {
256
+ const registry = new InMemoryPluginRegistry();
257
+ registry.registerAssertion({
258
+ type: "api-match",
259
+ label: "API Match",
260
+ compatibleModes: ["custom"],
261
+ handlerModule: "./api-match.js",
262
+ });
263
+ assert.equal(registry.getAssertions().length, 1);
264
+ });
265
+ it("registers a complete preset", () => {
266
+ const registry = new InMemoryPluginRegistry();
267
+ registry.registerPreset(sanityLiteracyPreset);
268
+ // Preset should register its modes, assertions, rubric templates
269
+ assert.ok(registry.getMode("literacy"));
270
+ assert.ok(registry.getAssertions().length > 0);
271
+ assert.ok(registry.getRubricTemplates().length > 0);
272
+ assert.ok(registry.getPresets().length === 1);
273
+ });
274
+ });
275
+ // ---------------------------------------------------------------------------
276
+ // sanity-literacy preset
277
+ // ---------------------------------------------------------------------------
278
describe("sanityLiteracyPreset", () => {
    /**
     * Asserts that every entry of `expected` is contained in `actual`
     * (an array or a string), naming the missing entry on failure so a
     * broken preset is diagnosable from the test output alone.
     */
    const assertIncludesAll = (actual, expected, label) => {
        for (const item of expected) {
            assert.ok(actual.includes(item), `${label} should include "${item}"`);
        }
    };
    /**
     * Looks up a rubric template by id. Fails with a named assertion when
     * the template is absent instead of letting a later property access
     * throw a TypeError on `undefined`.
     */
    const getRubricTemplate = (id) => {
        const template = sanityLiteracyPreset.rubricTemplates?.find((t) => t.id === id);
        assert.ok(template, `rubric template "${id}" should exist`);
        return template;
    };
    it("has correct manifest", () => {
        assert.equal(sanityLiteracyPreset.name, "sanity-literacy");
        assert.equal(sanityLiteracyPreset.manifest.pluginApiVersion, 1);
    });
    it("registers literacy mode", () => {
        assert.equal(sanityLiteracyPreset.modes?.length, 1);
        assert.equal(sanityLiteracyPreset.modes[0].id, "literacy");
    });
    it("includes core assertion types", () => {
        const types = sanityLiteracyPreset.assertions.map((a) => a.type);
        assertIncludesAll(types, ["contains", "llm-rubric", "javascript"], "assertion types");
    });
    it("includes 3 rubric templates", () => {
        assert.equal(sanityLiteracyPreset.rubricTemplates?.length, 3);
        const ids = sanityLiteracyPreset.rubricTemplates.map((t) => t.id);
        assertIncludesAll(ids, ["task-completion", "code-correctness", "doc-coverage"], "rubric template ids");
    });
    it("rubric template scales match config/rubrics.ts authoritative source", () => {
        const tc = getRubricTemplate("task-completion");
        assert.deepEqual(tc.scale, [
            "0: Couldn't attempt — missing critical information",
            "20: Attempted but fundamentally wrong approach",
            "50: Partial implementation — major functional gaps",
            "80: Mostly complete — minor issues or missing edge cases",
            "100: Fully functional code — works as expected",
        ]);
        assert.equal(tc.criteriaLabel, "Must demonstrate:");
        const cc = getRubricTemplate("code-correctness");
        assert.deepEqual(cc.scale, [
            "0: Broken code, syntax errors, or deprecated APIs",
            "30: Works but uses anti-patterns or inefficient approaches",
            "50: Works but not idiomatic",
            "80: Follows most best practices",
            "100: Follows all best practices, idiomatic implementation",
        ]);
        assert.equal(cc.criteriaLabel, "Check for:");
        const dc = getRubricTemplate("doc-coverage");
        assert.deepEqual(dc.scale, [
            "0: Had to hallucinate/guess most implementation details",
            "30: Significant gaps — filled with assumptions",
            "50: Some gaps — inferred from partial information",
            "80: Minor gaps — almost everything was documented",
            "100: Complete coverage — all necessary info was in docs",
        ]);
    });
    it("includes sanity:// fixture resolver", () => {
        assert.ok(sanityLiteracyPreset.fixtureResolvers?.some((r) => r.scheme === "sanity://"));
    });
    it("includes 3 prompt templates", () => {
        const templates = sanityLiteracyPreset.promptTemplates;
        assert.ok(templates);
        for (const name of ["with-docs", "without-docs", "agentic"]) {
            assert.ok(templates[name], `prompt template "${name}" should exist`);
        }
        assert.equal(Object.keys(templates).length, 3);
    });
    it("prompt template content matches literacy handler", () => {
        const templates = sanityLiteracyPreset.promptTemplates;
        // Placeholders each template body is required to contain.
        const requiredPlaceholders = {
            "with-docs": ["{{docs}}", "{{task}}"],
            "without-docs": ["{{task}}"],
            "agentic": ["{{task}}"],
        };
        for (const [name, placeholders] of Object.entries(requiredPlaceholders)) {
            assertIncludesAll(templates[name].template, placeholders, `"${name}" template`);
        }
    });
    it("includes default and output-only scoring profiles", () => {
        const profiles = sanityLiteracyPreset.scoringProfiles;
        assert.ok(profiles);
        assert.deepEqual(profiles["default"], {
            "task-completion": 0.5,
            "code-correctness": 0.25,
            "doc-coverage": 0.25,
        });
        assert.deepEqual(profiles["output-only"], {
            "task-completion": 0.6,
            "code-correctness": 0.4,
        });
    });
    it("includes 3 source definitions", () => {
        const sources = sanityLiteracyPreset.sourceDefs;
        assert.ok(sources);
        assert.equal(sources.length, 3);
        const names = sources.map((s) => s.name);
        assertIncludesAll(names, ["production", "branch", "local"], "source names");
    });
    it("production source has correct baseUrl", () => {
        const prod = sanityLiteracyPreset.sourceDefs.find((s) => s.name === "production");
        assert.ok(prod);
        assert.equal(prod.baseUrl, "https://www.sanity.io/docs");
    });
    it("includes feature registry with all features", () => {
        const features = sanityLiteracyPreset.featureDefs;
        assert.ok(features);
        assert.equal(features.features.length, 14);
        const ids = features.features.map((f) => f.id);
        // Covered features
        assertIncludesAll(ids, [
            "groq",
            "visual-editing",
            "nextjs-live",
            "functions",
            "studio-setup",
            "frameworks",
        ], "feature ids");
        // Uncovered features
        assertIncludesAll(ids, [
            "portable-text",
            "image-assets",
            "mutations",
            "schemas",
            "authentication",
            "webhooks",
            "realtime",
            "ai-assist",
        ], "feature ids");
    });
    it("includes a docFetcher factory", () => {
        assert.equal(typeof sanityLiteracyPreset.docFetcher, "function");
        // The factory should return a SanityDocFetcher instance
        const fetcher = sanityLiteracyPreset.docFetcher();
        assert.ok(fetcher);
        assert.equal(typeof fetcher.fetch, "function");
    });
});
404
+ // ---------------------------------------------------------------------------
405
+ // createSanityLiteracyPreset factory
406
+ // ---------------------------------------------------------------------------
407
describe("createSanityLiteracyPreset", () => {
    it("returns a preset with all extension points populated", () => {
        const preset = createSanityLiteracyPreset({ rootDir: "/tmp/test" });
        assert.equal(preset.name, "sanity-literacy");
        // Every extension point the factory is expected to fill in; the
        // failure message names the first one that is missing.
        const extensionPoints = [
            "modes",
            "assertions",
            "rubricTemplates",
            "fixtureResolvers",
            "promptTemplates",
            "scoringProfiles",
            "docFetcher",
            "sourceDefs",
            "featureDefs",
        ];
        for (const key of extensionPoints) {
            assert.ok(preset[key], `preset.${key} should be populated`);
        }
    });
    it("registers all extension points into the registry", () => {
        const registry = new InMemoryPluginRegistry();
        const preset = createSanityLiteracyPreset({ rootDir: "/tmp/test" });
        registry.registerPreset(preset);
        assert.ok(registry.getMode("literacy"));
        assert.ok(registry.getAssertions().length > 0);
        // assert.equal (not assert.ok(a === b)) so a failure reports the
        // actual count next to the expected one.
        assert.equal(registry.getRubricTemplates().length, 3);
        assert.equal(Object.keys(registry.getPromptTemplates()).length, 3);
        assert.equal(Object.keys(registry.getScoringProfiles()).length, 2);
        assert.ok(registry.getDocFetcherFactory());
        assert.equal(registry.getSourceDefs().length, 3);
        const featureDefs = registry.getFeatureDefs();
        assert.ok(featureDefs);
        assert.equal(featureDefs.features.length, 14);
    });
});
436
+ // ---------------------------------------------------------------------------
437
+ // Preset is single source of truth for sources and features
438
+ // ---------------------------------------------------------------------------
439
describe("preset is single source of truth for Sanity config", () => {
    /**
     * Lazily imports the config loader and loads the named config file
     * relative to the directory four levels above this test file.
     */
    const loadConfigFromRoot = async (configName) => {
        const { tryLoadConfigFile } = await import("../../compiler/config-loader.js");
        const ROOT = resolve(__dirname, "..", "..", "..", "..");
        return tryLoadConfigFile(configName, ROOT);
    };
    it("config/sources.ts exports an empty array", async () => {
        const loaded = await loadConfigFromRoot("sources");
        assert.ok(loaded, "config/sources.ts should exist");
        const { data: sources } = loaded;
        assert.ok(Array.isArray(sources), "should export an array");
        assert.equal(sources.length, 0, "config/sources should be empty (preset provides sources)");
    });
    it("config/features.ts exports an empty features array", async () => {
        const loaded = await loadConfigFromRoot("features");
        assert.ok(loaded, "config/features.ts should exist");
        assert.ok(Array.isArray(loaded.data.features), "should have a features array");
        assert.equal(loaded.data.features.length, 0, "config/features should be empty (preset provides features)");
    });
    it("preset contains all 3 source entries", () => {
        const sourceDefs = sanityLiteracyPreset.sourceDefs;
        assert.equal(sourceDefs.length, 3);
        const sortedNames = sourceDefs.map((s) => s.name).sort();
        assert.deepEqual(sortedNames, ["branch", "local", "production"]);
    });
    it("preset contains all 14 feature entries", () => {
        const { features } = sanityLiteracyPreset.featureDefs;
        assert.equal(features.length, 14);
        // Number of features whose `status` equals the given value.
        const countWithStatus = (status) => features.filter((f) => f.status === status).length;
        const coveredCount = countWithStatus("covered");
        const uncoveredCount = countWithStatus("uncovered");
        assert.equal(coveredCount, 6, "should have 6 covered features");
        assert.equal(uncoveredCount, 8, "should have 8 uncovered features");
    });
});
@@ -0,0 +1,10 @@
1
+ /**
2
+ * scoring-bridge.test.ts — Tests for the 4-tier scoring engine bridge.
3
+ *
4
+ * Verifies that `scoreTestGroup` produces the same 0–100 output as the
5
+ * legacy `accumulateDimensions → averageDimensions → weightedComposite`
6
+ * chain when given identical inputs.
7
+ *
8
+ * Run: npx tsx --test src/pipeline/compiler/__tests__/scoring-bridge.test.ts
9
+ */
10
+ export {};
@@ -0,0 +1,184 @@
1
+ /**
2
+ * scoring-bridge.test.ts — Tests for the 4-tier scoring engine bridge.
3
+ *
4
+ * Verifies that `scoreTestGroup` produces the same 0–100 output as the
5
+ * legacy `accumulateDimensions → averageDimensions → weightedComposite`
6
+ * chain when given identical inputs.
7
+ *
8
+ * Run: npx tsx --test src/pipeline/compiler/__tests__/scoring-bridge.test.ts
9
+ */
10
+ import assert from "node:assert/strict";
11
+ import { describe, it } from "node:test";
12
+ import { scoreTestGroup } from "../scoring-bridge.js";
13
+ // ---------------------------------------------------------------------------
14
+ // Helpers
15
+ // ---------------------------------------------------------------------------
16
/**
 * Builds a mock test result for the scoring tests.
 *
 * For each dimension score present in `overrides.dimensions` (0–100 scale),
 * one `llm-rubric` component result is emitted whose `score` is the value
 * divided by 100 and whose `reason` is the JSON string `{"score": <value>}`.
 * Components are always emitted in the order task-completion,
 * code-correctness, doc-coverage, regardless of key order in the input.
 *
 * @param {object} [overrides]
 * @param {{taskCompletion?: number, codeCorrectness?: number, docCoverage?: number}} [overrides.dimensions]
 *   Dimension scores; a key left `undefined` produces no component.
 * @param {number} [overrides.cost] Defaults to 0.01.
 * @param {string} [overrides.description] Defaults to "test".
 * @param {object} [overrides.vars] Defaults to `{ task: "test", docs: "" }`.
 * @returns {object} A result with `cost`, `description`, `gradingResult`,
 *   `response` and `vars`.
 */
function makeTestResult(overrides) {
    // camelCase input key → kebab-case dimension id placed in assertion metadata.
    const dimensionIds = [
        ["taskCompletion", "task-completion"],
        ["codeCorrectness", "code-correctness"],
        ["docCoverage", "doc-coverage"],
    ];
    const dims = overrides?.dimensions ?? {};
    const componentResults = [];
    for (const [key, dimension] of dimensionIds) {
        const value = dims[key];
        // Only `undefined` means "absent"; a score of 0 still yields a component.
        if (value === undefined) {
            continue;
        }
        componentResults.push({
            assertion: {
                type: "llm-rubric",
                metadata: { dimension },
            },
            pass: true,
            reason: JSON.stringify({ score: value }),
            score: value / 100,
        });
    }
    return {
        cost: overrides?.cost ?? 0.01,
        description: overrides?.description ?? "test",
        gradingResult: {
            componentResults,
            pass: true,
        },
        response: { output: "mock output" },
        vars: overrides?.vars ?? { task: "test", docs: "" },
    };
}
63
/**
 * Three-dimension weight profile used as a fixture by the tests in this file.
 * Keys are kebab-case dimension ids; the weights sum to 1.
 */
const DEFAULT_PROFILE = {
    "code-correctness": 0.35,
    "doc-coverage": 0.25,
    "task-completion": 0.4,
};
/**
 * Two-dimension weight profile fixture with no "doc-coverage" entry;
 * the weights sum to 1.
 */
const OUTPUT_ONLY_PROFILE = {
    "code-correctness": 0.55,
    "task-completion": 0.45,
};
72
+ // ---------------------------------------------------------------------------
73
+ // Tests
74
+ // ---------------------------------------------------------------------------
75
describe("scoreTestGroup — basic scoring", () => {
    it("returns zeroes for empty test array", () => {
        const scored = scoreTestGroup([], DEFAULT_PROFILE);
        assert.equal(scored.composite, 0);
        assert.equal(scored.totalCost, 0);
        assert.deepEqual(scored.dimensions, {});
    });
    it("scores a single test with all dimensions", () => {
        const singleTest = makeTestResult({
            dimensions: {
                taskCompletion: 80,
                codeCorrectness: 70,
                docCoverage: 60,
            },
        });
        const scored = scoreTestGroup([singleTest], DEFAULT_PROFILE);
        // Weighted sum: 80*0.4 + 70*0.35 + 60*0.25 = 32 + 24.5 + 15 = 71.5, asserted as 72
        assert.equal(scored.dimensions.taskCompletion, 80);
        assert.equal(scored.dimensions.codeCorrectness, 70);
        assert.equal(scored.dimensions.docCoverage, 60);
        assert.equal(scored.composite, 72);
    });
    it("averages across multiple tests", () => {
        const first = makeTestResult({
            dimensions: { taskCompletion: 80, codeCorrectness: 60 },
        });
        const second = makeTestResult({
            dimensions: { taskCompletion: 60, codeCorrectness: 80 },
        });
        const scored = scoreTestGroup([first, second], OUTPUT_ONLY_PROFILE);
        // Both dimensions average to 70 across the two tests, so the
        // composite is 70*0.45 + 70*0.55 = 31.5 + 38.5 = 70.
        assert.equal(scored.dimensions.taskCompletion, 70);
        assert.equal(scored.dimensions.codeCorrectness, 70);
        assert.equal(scored.composite, 70);
    });
    it("accumulates cost across tests", () => {
        const scored = scoreTestGroup([
            makeTestResult({ cost: 0.05, dimensions: { taskCompletion: 80 } }),
            makeTestResult({ cost: 0.03, dimensions: { taskCompletion: 70 } }),
        ], DEFAULT_PROFILE);
        // Compared with a tolerance because the total is a floating-point sum.
        const drift = Math.abs(scored.totalCost - 0.08);
        assert.ok(drift < 0.001);
    });
});
124
describe("scoreTestGroup — profile handling", () => {
    it("uses output-only profile (excludes doc-coverage)", () => {
        const withAllDimensions = makeTestResult({
            dimensions: {
                taskCompletion: 80,
                codeCorrectness: 60,
                docCoverage: 100,
            },
        });
        const scored = scoreTestGroup([withAllDimensions], OUTPUT_ONLY_PROFILE);
        // docCoverage is still reported under `dimensions`, but the profile
        // has no weight for it, so the composite is
        // 80*0.45 + 60*0.55 = 36 + 33 = 69.
        assert.equal(scored.dimensions.docCoverage, 100);
        assert.equal(scored.composite, 69);
    });
    it("handles profile with only one dimension", () => {
        const twoDimensions = makeTestResult({
            dimensions: { taskCompletion: 90, codeCorrectness: 50 },
        });
        const taskOnlyProfile = { "task-completion": 1.0 };
        const scored = scoreTestGroup([twoDimensions], taskOnlyProfile);
        // Only the task-completion score contributes to the composite.
        assert.equal(scored.composite, 90);
    });
});
152
describe("scoreTestGroup — edge cases", () => {
    it("handles tests with no rubric components", () => {
        // Hand-built result whose only component is a `javascript` assertion.
        const javascriptOnlyComponent = { assertion: { type: "javascript" }, pass: true, score: 1 };
        const rubricless = {
            cost: 0.01,
            description: "no rubrics",
            gradingResult: {
                componentResults: [javascriptOnlyComponent],
                pass: true,
            },
            response: { output: "mock" },
            vars: { task: "test", docs: "" },
        };
        const scored = scoreTestGroup([rubricless], DEFAULT_PROFILE);
        // With no llm-rubric components the composite is expected to be 0,
        // while the test's cost is still counted.
        assert.equal(scored.composite, 0);
        assert.equal(scored.totalCost, 0.01);
    });
    it("provides raw DimensionScore objects for advanced consumers", () => {
        const sample = makeTestResult({
            dimensions: { taskCompletion: 80, codeCorrectness: 60 },
        });
        const scored = scoreTestGroup([sample], DEFAULT_PROFILE);
        assert.ok(scored.rawDimensions.length >= 2);
        const taskCompletionDim = scored.rawDimensions.find((d) => d.dimensionId === "task-completion");
        assert.ok(taskCompletionDim);
        // Raw scores are expected on a 0–1 scale.
        assert.ok(taskCompletionDim.score >= 0 && taskCompletionDim.score <= 1);
        assert.equal(taskCompletionDim.assertionCount, 1);
    });
});
@@ -0,0 +1,8 @@
1
+ /**
2
+ * task-graph-builder.test.ts — Unit tests for TaskGraphBuilder.
3
+ *
4
+ * Tests DAG construction, cycle detection, filtering, and priority assignment.
5
+ *
6
+ * Run: npx tsx --test src/pipeline/compiler/__tests__/task-graph-builder.test.ts
7
+ */
8
+ export {};