@sanity/ailf 0.4.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. package/config/features.ts +23 -0
  2. package/config/models.ts +83 -0
  3. package/config/prompts.ts +16 -0
  4. package/config/rubrics.ts +225 -0
  5. package/config/schedules.ts +47 -0
  6. package/config/sinks.ts +37 -0
  7. package/config/sources.ts +21 -0
  8. package/config/thresholds.ts +61 -0
  9. package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
  10. package/dist/_vendor/ailf-core/config-helpers.js +150 -0
  11. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  12. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  13. package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
  14. package/dist/_vendor/ailf-core/examples/index.js +10 -10
  15. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  16. package/dist/_vendor/ailf-core/index.js +5 -0
  17. package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
  18. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  19. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  20. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  21. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  22. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  23. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  24. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
  25. package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
  26. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
  27. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
  28. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +32 -31
  29. package/dist/_vendor/ailf-core/schemas/pipeline.js +52 -12
  30. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  31. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  32. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  33. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  34. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  35. package/dist/_vendor/ailf-core/services/index.js +2 -1
  36. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  37. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  38. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  39. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  40. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  41. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  42. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  43. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  44. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  46. package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
  47. package/dist/_vendor/ailf-core/types/index.js +8 -1
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
  50. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  51. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  52. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  53. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  54. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  55. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  56. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  57. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  58. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  59. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  60. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  61. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  62. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  63. package/dist/_vendor/ailf-shared/index.js +0 -1
  64. package/dist/adapters/api-client/build-request.js +14 -13
  65. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  66. package/dist/adapters/config-sources/file-config-adapter.js +38 -12
  67. package/dist/adapters/config-sources/index.d.ts +2 -0
  68. package/dist/adapters/config-sources/index.js +1 -0
  69. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  70. package/dist/adapters/config-sources/ts-config-loader.js +133 -0
  71. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  72. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  73. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  74. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  75. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  76. package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
  77. package/dist/adapters/task-sources/index.d.ts +1 -0
  78. package/dist/adapters/task-sources/index.js +1 -0
  79. package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
  80. package/dist/adapters/task-sources/repo-task-source.js +69 -16
  81. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  82. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  83. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  84. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  85. package/dist/cli.js +0 -2
  86. package/dist/commands/baseline.js +4 -1
  87. package/dist/commands/calculate-scores.js +1 -1
  88. package/dist/commands/coverage-audit.js +7 -1
  89. package/dist/commands/explain-handler.js +25 -23
  90. package/dist/commands/fetch-docs.js +3 -2
  91. package/dist/commands/generate-configs.js +1 -1
  92. package/dist/commands/interactive.js +11 -7
  93. package/dist/commands/pipeline-action.d.ts +2 -0
  94. package/dist/commands/pipeline-action.js +16 -6
  95. package/dist/commands/pipeline.d.ts +1 -0
  96. package/dist/commands/pipeline.js +4 -2
  97. package/dist/commands/pr-comment.js +1 -1
  98. package/dist/commands/publish.js +2 -2
  99. package/dist/commands/readiness-report.js +13 -6
  100. package/dist/composition-root.d.ts +1 -1
  101. package/dist/composition-root.js +67 -4
  102. package/dist/orchestration/build-app-context.js +1 -0
  103. package/dist/orchestration/build-step-sequence.js +24 -6
  104. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  105. package/dist/orchestration/steps/fetch-docs-step.js +6 -4
  106. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  107. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  108. package/dist/orchestration/steps/generate-configs-step.js +245 -51
  109. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  110. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  111. package/dist/orchestration/steps/readiness-step.js +5 -6
  112. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  113. package/dist/orchestration/steps/run-eval-step.js +8 -7
  114. package/dist/pipeline/cache.d.ts +1 -1
  115. package/dist/pipeline/cache.js +36 -8
  116. package/dist/pipeline/calculate-scores.d.ts +5 -7
  117. package/dist/pipeline/calculate-scores.js +74 -153
  118. package/dist/pipeline/checks.js +2 -2
  119. package/dist/pipeline/compare.js +8 -8
  120. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  121. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  122. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  123. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  124. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  125. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  126. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  127. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  128. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  129. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
  130. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  131. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  132. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  133. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  134. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  135. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
  136. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  137. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  138. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  139. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  140. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  141. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  142. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  143. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  144. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  145. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  146. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  147. package/dist/pipeline/compiler/config-loader.js +111 -0
  148. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  149. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  150. package/dist/pipeline/compiler/hash.d.ts +11 -0
  151. package/dist/pipeline/compiler/hash.js +18 -0
  152. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  153. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  154. package/dist/pipeline/compiler/index.d.ts +29 -0
  155. package/dist/pipeline/compiler/index.js +45 -0
  156. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  157. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  158. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  159. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  160. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  161. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  162. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  163. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  164. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  165. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  166. package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
  167. package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
  168. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  169. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  170. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  171. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  172. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  173. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
  174. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
  175. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
  176. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  177. package/dist/pipeline/compiler/presets/index.js +8 -0
  178. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
  179. package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
  180. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  181. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  182. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  183. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  184. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  185. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  186. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  187. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  188. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  189. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  190. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  191. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  192. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  193. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  194. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  195. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  196. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  197. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  198. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  199. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  200. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  201. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  202. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  203. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  204. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  205. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  206. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  207. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  208. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  209. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  210. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  211. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  212. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  213. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  214. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  215. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  216. package/dist/pipeline/coverage-audit.d.ts +15 -5
  217. package/dist/pipeline/coverage-audit.js +41 -22
  218. package/dist/pipeline/eval-constants.d.ts +16 -6
  219. package/dist/pipeline/eval-constants.js +25 -4
  220. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  221. package/dist/pipeline/eval-fingerprint.js +8 -9
  222. package/dist/pipeline/expand-tasks.d.ts +23 -14
  223. package/dist/pipeline/expand-tasks.js +37 -31
  224. package/dist/pipeline/gap-analysis.d.ts +1 -1
  225. package/dist/pipeline/gap-analysis.js +2 -2
  226. package/dist/pipeline/generate-configs.d.ts +22 -4
  227. package/dist/pipeline/generate-configs.js +53 -24
  228. package/dist/pipeline/grader-api.d.ts +3 -3
  229. package/dist/pipeline/grader-api.js +5 -12
  230. package/dist/pipeline/grader-compare-runner.js +20 -27
  231. package/dist/pipeline/grader-comparison.d.ts +4 -8
  232. package/dist/pipeline/grader-comparison.js +11 -17
  233. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  234. package/dist/pipeline/grader-consistency-runner.js +18 -21
  235. package/dist/pipeline/grader-consistency.d.ts +6 -10
  236. package/dist/pipeline/grader-consistency.js +13 -32
  237. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  238. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  239. package/dist/pipeline/grader-sensitivity.js +10 -10
  240. package/dist/pipeline/grader-validate-runner.js +7 -5
  241. package/dist/pipeline/grader-validation.d.ts +2 -6
  242. package/dist/pipeline/grader-validation.js +14 -22
  243. package/dist/pipeline/map-request-to-config.js +6 -1
  244. package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
  245. package/dist/pipeline/mirror-repo-tasks.js +16 -15
  246. package/dist/pipeline/normalize-mode.d.ts +49 -0
  247. package/dist/pipeline/normalize-mode.js +64 -0
  248. package/dist/pipeline/plan.d.ts +5 -2
  249. package/dist/pipeline/plan.js +134 -78
  250. package/dist/pipeline/pr-comment.js +2 -0
  251. package/dist/pipeline/profile-resolution.d.ts +47 -0
  252. package/dist/pipeline/profile-resolution.js +91 -0
  253. package/dist/pipeline/provenance.d.ts +2 -2
  254. package/dist/pipeline/provenance.js +12 -17
  255. package/dist/pipeline/release-report.js +4 -4
  256. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  257. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  258. package/dist/pipeline/rubric-loader.d.ts +20 -0
  259. package/dist/pipeline/rubric-loader.js +37 -0
  260. package/dist/pipeline/validate.d.ts +4 -4
  261. package/dist/pipeline/validate.js +64 -53
  262. package/dist/schedules/loader.js +18 -8
  263. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  264. package/dist/scripts/migrate-task-mode.js +85 -0
  265. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  266. package/dist/scripts/validate-task-sources.d.ts +1 -1
  267. package/dist/scripts/validate-task-sources.js +15 -15
  268. package/dist/sinks/loader.js +5 -7
  269. package/dist/sources.d.ts +7 -7
  270. package/dist/sources.js +22 -24
  271. package/dist/webhook/dispatch.js +2 -1
  272. package/package.json +6 -3
  273. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  274. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  275. package/tasks/literacy/frameworks.task.ts +128 -0
  276. package/tasks/literacy/functions.task.ts +69 -0
  277. package/tasks/literacy/groq.task.ts +258 -0
  278. package/tasks/literacy/nextjs-live.task.ts +75 -0
  279. package/tasks/literacy/studio-setup.task.ts +131 -0
  280. package/tasks/literacy/visual-editing.task.ts +146 -0
  281. package/config/features.yaml +0 -116
  282. package/config/models.yaml +0 -116
  283. package/config/prompts.yaml +0 -75
  284. package/config/rubrics.yaml +0 -62
  285. package/config/schedules.yaml +0 -43
  286. package/config/sinks.yaml +0 -54
  287. package/config/sources.yaml +0 -51
  288. package/config/thresholds.yaml +0 -49
  289. package/dist/agent-observer/test-imports.d.ts +0 -7
  290. package/dist/agent-observer/test-imports.js +0 -185
@@ -0,0 +1,153 @@
1
+ /**
2
+ * 4-tier scoring engine — unified scoring across all evaluation modes.
3
+ *
4
+ * Tier 1: Assertion-level (atomic pass/fail + optional numeric score)
5
+ * Tier 2: Dimension-level (aggregated per scoring dimension)
6
+ * Tier 3: Task-level (weighted composite of dimensions)
7
+ * Tier 4: Suite/Area-level (aggregated across tasks)
8
+ *
9
+ * This engine is mode-agnostic — it works for literacy, MCP server,
10
+ * agent harness, knowledge probe, and custom modes.
11
+ *
12
+ * @see docs/design-docs/architecture-overhaul/scoring-rubrics-assertions.md
13
+ */
14
+ /** The result of a single assertion evaluation (tier 1 input to aggregateDimensions) */
15
+ export interface AssertionScore {
16
+ /** Whether the assertion passed; counted in passCount and used for the pass-rate fallback when no assertion in the dimension has a numeric score */
17
+ pass: boolean;
18
+ /** Numeric score in [0, 1], null if not applicable (null scores are left out of the numeric aggregation) */
19
+ score: number | null;
20
+ /** Human-readable explanation */
21
+ reason: string;
22
+ /** Assertion type that produced this result */
23
+ assertionType: string;
24
+ /** Dimension this assertion contributes to; an empty value is grouped under "uncategorized" */
25
+ dimension: string;
26
+ /** Wall-clock grading time in ms */
27
+ latencyMs: number;
28
+ /** Weight of this assertion (1.0 if unspecified); only the "weighted-mean" strategy reads it */
29
+ weight: number;
30
+ }
31
+ /** Aggregation strategy for dimension scoring, applied to the assertions of a dimension that carry a numeric score */
32
+ export type AggregationStrategy = "max" | "mean" | "min" | "weighted-mean";
33
+ /** Aggregated score for a scoring dimension (tier 2), as produced by aggregateDimensions() */
34
+ export interface DimensionScore {
35
+ /** Dimension identifier (e.g., "code-correctness") */
36
+ dimensionId: string;
37
+ /** Human-readable label; aggregateDimensions() falls back to the dimension id when no label is configured */
38
+ label: string;
39
+ /** Aggregated score in [0, 1] */
40
+ score: number;
41
+ /** How many assertions were grouped into this dimension (including ones with a null score) */
42
+ assertionCount: number;
43
+ /** How many assertions passed */
44
+ passCount: number;
45
+ /** Aggregation method used */
46
+ aggregation: AggregationStrategy;
47
+ /** Individual assertion results */
48
+ assertions: AssertionScore[];
49
+ }
50
+ /**
51
+ * Aggregate assertion scores into dimension scores.
52
+ *
53
+ * Groups assertions by dimension, then applies the configured aggregation
54
+ * strategy (default: weighted-mean) to every group. The result is sorted by dimensionId.
55
+ */
56
+ export declare function aggregateDimensions(assertions: AssertionScore[], options?: {
57
+ defaultAggregation?: AggregationStrategy;
58
+ dimensionLabels?: Record<string, string>;
59
+ }): DimensionScore[];
60
+ /** Weighted composite score for a task (tier 3), as produced by computeTaskScore() */
61
+ export interface TaskScore {
62
+ /** Task identifier */
63
+ taskId: string;
64
+ /** Feature area (e.g., "groq", "studio"). When absent, aggregateAreas() falls back to taskId prefix. */
65
+ area?: string;
66
+ /** Weighted composite score in [0, 1] */
67
+ score: number;
68
+ /** Per-dimension breakdown */
69
+ dimensions: DimensionScore[];
70
+ /** Weight configuration used */
71
+ weights: Record<string, number>;
72
+ /** Source of weights (default profile, task override, etc.); computeTaskScore() writes "default" when the caller gives none */
73
+ weightSource: string;
74
+ /** Whether the task met its quality threshold (score >= threshold) */
75
+ passesThreshold: boolean;
76
+ /** The threshold compared against */
77
+ threshold: number;
78
+ /** Warnings about potential misconfiguration (e.g., no dimensions matched weights); left out when there are none */
79
+ warnings?: string[];
80
+ }
81
+ /** Options for computing a task score */
82
+ export interface TaskScoreOptions {
83
+ /** Task identifier */
84
+ taskId: string;
85
+ /** Feature area (e.g., "groq", "studio"). Falls back to taskId prefix if omitted. */
86
+ area?: string;
87
+ /** Dimension weights keyed by dimension id; when the weights of the supplied dimensions do not sum to ~1.0 the score is divided by their total */
88
+ weights: Record<string, number>;
89
+ /** Where the weights came from (for traceability); defaults to "default" */
90
+ weightSource?: string;
91
+ /** Quality threshold (0-1) for pass/fail gate; defaults to 0.5 */
92
+ threshold?: number;
93
+ }
94
+ /**
95
+ * Compute a weighted task score from dimension scores. A dimension whose id has no entry in `weights` gets weight 0; if no weight applies at all the score is 0 and a warning is attached.
96
+ */
97
+ export declare function computeTaskScore(dimensions: DimensionScore[], options: TaskScoreOptions): TaskScore;
98
+ /** Aggregated score across tasks in a feature area (tier 4), as produced by aggregateAreas() */
99
+ export interface AreaScore {
100
+ /** Area identifier (e.g., "groq", "studio") */
101
+ areaId: string;
102
+ /** Mean task score (unweighted average over the area's tasks) */
103
+ score: number;
104
+ /** Number of tasks evaluated */
105
+ taskCount: number;
106
+ /** Number of tasks passing threshold */
107
+ passingTaskCount: number;
108
+ /** Per-task breakdown */
109
+ tasks: TaskScore[];
110
+ /** Trend vs previous evaluation (current score minus previous score); null when no previous score is known for the area */
111
+ delta: number | null;
112
+ }
113
+ /**
114
+ * Aggregate task scores into area scores. Tasks are grouped by `area` (or, when absent, the taskId text before the first hyphen); `previousScores` is keyed by area id; the result is sorted by areaId.
115
+ */
116
+ export declare function aggregateAreas(tasks: TaskScore[], previousScores?: Record<string, number>): AreaScore[];
117
+ /**
118
+ * Normalize an assertion score to [0, 1] range.
119
+ *
120
+ * Different assertion types produce scores in different ranges:
121
+ * - Boolean (contains, contains-all, contains-any, equals, is-json, regex): any value above 0 becomes 1, otherwise 0
122
+ * - LLM rubric (llm-rubric, g-eval, model-graded-closedqa, model-graded-factuality): values above 1 are read as 0-100 and divided by 100; values up to 1 are taken as already normalized
123
+ * - similar: 0-1 (clamped to that range)
124
+ * - any other type, e.g. javascript/python: user-defined, clamped to [0, 1]
125
+ */
126
+ export declare function normalizeScore(rawScore: number, assertionType: string): number;
127
+ /** Grader transition configuration for gradual migration. NOTE(review): no function declared in this module accepts this type; confirm where it is consumed. */
128
+ export interface GraderTransitionConfig {
129
+ /** Current (old) grader model */
130
+ old: string;
131
+ /** New grader model to transition to */
132
+ new_: string;
133
+ /** ISO date after which old grader is retired */
134
+ expiration: string;
135
+ /** Whether to run both graders in parallel */
136
+ parallel: boolean;
137
+ }
138
+ /** Ensemble grading configuration */
139
+ export interface EnsembleGradingConfig {
140
+ /** Whether ensemble grading is enabled */
141
+ enabled: boolean;
142
+ /** Grader models to use */
143
+ models: string[];
144
+ /** Aggregation strategy for ensemble scores (the same three values computeEnsembleScore() accepts) */
145
+ aggregation: "max" | "mean" | "median";
146
+ }
147
+ /**
148
+ * Compute ensemble score from multiple grader outputs. `aggregation` defaults to "mean". `agreement` is max(0, 1 - population standard deviation of the scores); an empty list yields 0 for both fields and a single score yields agreement 1.
149
+ */
150
+ export declare function computeEnsembleScore(scores: number[], aggregation?: "max" | "mean" | "median"): {
151
+ score: number;
152
+ agreement: number;
153
+ };
@@ -0,0 +1,237 @@
1
+ /**
2
+ * 4-tier scoring engine — unified scoring across all evaluation modes.
3
+ *
4
+ * Tier 1: Assertion-level (atomic pass/fail + optional numeric score)
5
+ * Tier 2: Dimension-level (aggregated per scoring dimension)
6
+ * Tier 3: Task-level (weighted composite of dimensions)
7
+ * Tier 4: Suite/Area-level (aggregated across tasks)
8
+ *
9
+ * This engine is mode-agnostic — it works for literacy, MCP server,
10
+ * agent harness, knowledge probe, and custom modes.
11
+ *
12
+ * @see docs/design-docs/architecture-overhaul/scoring-rubrics-assertions.md
13
+ */
14
/**
 * Roll assertion-level results up into one score per dimension (tier 1 → tier 2).
 *
 * Assertions are bucketed by their `dimension` (an empty value lands in
 * "uncategorized"), and each bucket is reduced with the same strategy:
 * `options.defaultAggregation`, or "weighted-mean" when none is given.
 *
 * @param assertions - Assertion results to aggregate.
 * @param options - Optional `defaultAggregation` strategy and `dimensionLabels`
 *   (dimension id → display label; the id itself is used when no label exists).
 * @returns One entry per dimension, sorted by `dimensionId`.
 */
export function aggregateDimensions(assertions, options) {
    const strategy = options?.defaultAggregation ?? "weighted-mean";
    const labelFor = options?.dimensionLabels ?? {};
    // Bucket the assertions; Map keeps first-seen order until the final sort.
    const byDimension = new Map();
    for (const assertion of assertions) {
        const key = assertion.dimension || "uncategorized";
        if (!byDimension.has(key)) {
            byDimension.set(key, []);
        }
        byDimension.get(key).push(assertion);
    }
    const result = Array.from(byDimension, ([dimensionId, members]) => ({
        dimensionId,
        label: labelFor[dimensionId] ?? dimensionId,
        score: aggregateScores(members, strategy),
        assertionCount: members.length,
        passCount: members.reduce((count, member) => (member.pass ? count + 1 : count), 0),
        aggregation: strategy,
        assertions: members,
    }));
    result.sort((left, right) => left.dimensionId.localeCompare(right.dimensionId));
    return result;
}
50
/**
 * Compute a weighted task score from dimension scores (tier 2 → tier 3).
 *
 * Each dimension is weighted by `options.weights[dimensionId]`; a dimension
 * with no own entry in `weights` gets weight 0. When the applied weights do not
 * sum to 1 (tolerance 0.001) the weighted sum is divided by their total, so
 * weights only need to be proportional.
 *
 * Weights that are not finite numbers or are negative are ignored and reported
 * in `warnings`: they would otherwise turn the score into NaN or push it
 * outside [0, 1].
 *
 * @param dimensions - Dimension scores for the task.
 * @param options - Task id, weights and optional area, weight source and
 *   threshold (default 0.5).
 * @returns The task score; `area` and `warnings` are present only when set /
 *   non-empty. `passesThreshold` is `score >= threshold`.
 */
export function computeTaskScore(dimensions, options) {
    const { weights, taskId } = options;
    const threshold = options.threshold ?? 0.5;
    const warnings = [];
    let weightedSum = 0;
    let totalWeight = 0;
    for (const dim of dimensions) {
        // Own-property lookup so ids such as "constructor" never pick up
        // something inherited from Object.prototype.
        const weight = Object.prototype.hasOwnProperty.call(weights, dim.dimensionId)
            ? weights[dim.dimensionId] ?? 0
            : 0;
        if (!Number.isFinite(weight) || weight < 0) {
            warnings.push(`Task "${taskId}": ignoring invalid weight ${String(weight)} ` +
                `for dimension "${dim.dimensionId}".`);
            continue;
        }
        weightedSum += dim.score * weight;
        totalWeight += weight;
    }
    // Warn when no dimensions match any weight key — likely misconfiguration
    if (totalWeight === 0 && dimensions.length > 0) {
        const dimIds = dimensions.map((d) => d.dimensionId).join(", ");
        const weightKeys = Object.keys(weights).join(", ");
        warnings.push(`Task "${taskId}": no dimensions matched weight keys. ` +
            `Dimensions: [${dimIds}], weights: [${weightKeys}]. Score will be 0.`);
    }
    // Normalize if weights don't sum to 1; a zero total leaves the score at 0
    // instead of dividing by zero.
    let score = 0;
    if (totalWeight > 0) {
        score =
            Math.abs(totalWeight - 1.0) > 0.001
                ? weightedSum / totalWeight
                : weightedSum;
    }
    return {
        taskId,
        ...(options.area ? { area: options.area } : {}),
        score,
        dimensions,
        weights,
        weightSource: options.weightSource ?? "default",
        passesThreshold: score >= threshold,
        threshold,
        ...(warnings.length > 0 ? { warnings } : {}),
    };
}
94
/**
 * Roll task scores up into one score per feature area (tier 3 → tier 4).
 *
 * A task belongs to `task.area` when that is set; otherwise the area is
 * derived from the task id via extractArea(). The area score is the plain
 * average of its tasks' scores.
 *
 * @param tasks - Task scores to aggregate.
 * @param previousScores - Optional map of area id → score from an earlier run;
 *   used to fill `delta` (current minus previous, or null when unknown).
 * @returns One entry per area, sorted by `areaId`.
 */
export function aggregateAreas(tasks, previousScores) {
    const tasksByArea = new Map();
    for (const task of tasks) {
        const key = task.area ?? extractArea(task.taskId);
        const bucket = tasksByArea.get(key);
        if (bucket === undefined) {
            tasksByArea.set(key, [task]);
        }
        else {
            bucket.push(task);
        }
    }
    const result = [];
    tasksByArea.forEach((members, areaId) => {
        // Every bucket holds at least the task that created it, so the
        // division below never sees a zero length.
        let total = 0;
        for (const member of members) {
            total += member.score;
        }
        const score = total / members.length;
        const prior = previousScores?.[areaId] ?? null;
        result.push({
            areaId,
            score,
            taskCount: members.length,
            passingTaskCount: members.filter((member) => member.passesThreshold).length,
            tasks: members,
            delta: prior === null ? null : score - prior,
        });
    });
    return result.sort((left, right) => left.areaId.localeCompare(right.areaId));
}
127
+ // ---------------------------------------------------------------------------
128
+ // Score normalization
129
+ // ---------------------------------------------------------------------------
130
/**
 * Normalize an assertion score to the [0, 1] range.
 *
 * Different assertion types produce scores in different ranges:
 * - LLM-graded (g-eval, llm-rubric, model-graded-closedqa,
 *   model-graded-factuality): a value above 1 is read as a 0-100 score and
 *   divided by 100; a value of 1 or less is taken as already normalized.
 *   Note that a raw 1 is therefore read as "full marks", not 1/100.
 * - Boolean (contains, contains-all, contains-any, equals, is-json, regex):
 *   anything above 0 becomes 1, everything else 0.
 * - similar and every other (custom) type: assumed to be 0-1 already.
 *
 * Every branch is clamped to [0, 1], and a NaN input yields 0, so the result
 * always honors the documented range.
 *
 * @param rawScore - Score as reported by the assertion.
 * @param assertionType - Assertion type name that produced the score.
 * @returns The score mapped into [0, 1].
 */
export function normalizeScore(rawScore, assertionType) {
    let normalized;
    switch (assertionType) {
        case "g-eval":
        case "llm-rubric":
        case "model-graded-closedqa":
        case "model-graded-factuality":
            // LLM rubrics typically return 0-100
            normalized = rawScore > 1 ? rawScore / 100 : rawScore;
            break;
        case "contains":
        case "contains-all":
        case "contains-any":
        case "equals":
        case "is-json":
        case "regex":
            // Boolean assertions: 0 or 1
            return rawScore > 0 ? 1 : 0;
        case "similar":
        default:
            // Similarity and custom assertions are expected to be 0-1 already
            normalized = rawScore;
    }
    const clamped = Math.max(0, Math.min(1, normalized));
    // Math.min/Math.max propagate NaN; report "no credit" instead.
    return Number.isNaN(clamped) ? 0 : clamped;
}
163
/**
 * Compute an ensemble score from multiple grader outputs.
 *
 * @param scores - One score per grader.
 * @param aggregation - "mean" (default), "median" or "max". Any other value
 *   is treated as "mean" so the result always carries a numeric score.
 * @returns `score`: the aggregated value. `agreement`: max(0, 1 - population
 *   standard deviation of the scores). No scores → `{ score: 0, agreement: 0 }`;
 *   a single score is returned as-is with agreement 1.
 */
export function computeEnsembleScore(scores, aggregation = "mean") {
    if (scores.length === 0)
        return { score: 0, agreement: 0 };
    if (scores.length === 1)
        return { score: scores[0], agreement: 1 };
    // The mean feeds both the "mean" aggregation and the agreement figure.
    const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
    let score;
    switch (aggregation) {
        case "median": {
            // Sort a copy so the caller's array is left untouched.
            const sorted = [...scores].sort((a, b) => a - b);
            const mid = Math.floor(sorted.length / 2);
            score =
                sorted.length % 2 === 0
                    ? (sorted[mid - 1] + sorted[mid]) / 2
                    : sorted[mid];
            break;
        }
        case "max":
            // Folding avoids the argument-count limit of Math.max(...scores).
            score = scores.reduce((best, s) => Math.max(best, s), -Infinity);
            break;
        case "mean":
        default:
            score = mean;
            break;
    }
    // Agreement: 1 - standard deviation (population form, not rescaled)
    const variance = scores.reduce((sum, s) => sum + (s - mean) ** 2, 0) / scores.length;
    const agreement = Math.max(0, 1 - Math.sqrt(variance));
    return { score, agreement };
}
196
+ // ---------------------------------------------------------------------------
197
+ // Helpers
198
+ // ---------------------------------------------------------------------------
199
/**
 * Reduce the assertions of one dimension to a single score.
 *
 * Only assertions that carry a numeric score take part. When none does, the
 * result is the pass rate (passed / total), or 0 for an empty list.
 *
 * @param assertions - Assertion results belonging to one dimension.
 * @param strategy - "mean", "min", "max" or "weighted-mean". Any other value
 *   is treated as "weighted-mean" (the engine's documented default) so a
 *   number is always returned.
 * @returns The aggregated score.
 */
function aggregateScores(assertions, strategy) {
    // `!= null` drops both null and a missing (undefined) score.
    const scored = assertions.filter((a) => a.score != null);
    if (scored.length === 0) {
        // Fall back to pass rate
        return assertions.length > 0
            ? assertions.filter((a) => a.pass).length / assertions.length
            : 0;
    }
    const plainMean = () => scored.reduce((sum, a) => sum + a.score, 0) / scored.length;
    switch (strategy) {
        case "mean":
            return plainMean();
        case "min":
            // Folding avoids the argument-count limit of Math.min(...values).
            return scored.reduce((lowest, a) => Math.min(lowest, a.score), Infinity);
        case "max":
            return scored.reduce((highest, a) => Math.max(highest, a.score), -Infinity);
        case "weighted-mean":
        default: {
            // An unspecified weight counts as 1.0, as the AssertionScore docs state.
            const weightOf = (a) => a.weight ?? 1;
            const totalWeight = scored.reduce((sum, a) => sum + weightOf(a), 0);
            if (totalWeight === 0) {
                // All weights are zero: nothing to weight by, use the plain mean.
                return plainMean();
            }
            return (scored.reduce((sum, a) => sum + a.score * weightOf(a), 0) / totalWeight);
        }
    }
}
225
/**
 * Extract the area name from a task ID.
 *
 * Uses the text before the first hyphen (e.g., "groq-blog-queries" → "groq").
 * This works for single-word areas but fails for multi-word areas
 * (e.g., "content-lake-queries" → "content" instead of "content-lake").
 *
 * Returns "general" when that prefix is empty (empty id, or an id starting
 * with a hyphen) and also when `taskId` is not a string, so a task without an
 * id is grouped instead of throwing a TypeError.
 *
 * TODO: Use explicit area metadata from task definitions instead of parsing taskId.
 *
 * @param taskId - Task identifier to derive the area from.
 * @returns The area name, or "general" when none can be derived.
 */
function extractArea(taskId) {
    if (typeof taskId !== "string") {
        return "general";
    }
    const hyphenAt = taskId.indexOf("-");
    const prefix = hyphenAt === -1 ? taskId : taskId.slice(0, hyphenAt);
    return prefix || "general";
}
@@ -9,13 +9,26 @@
9
9
  */
10
10
  import type { FeatureScore } from "../types/index.js";
11
11
  import type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata } from "../types/scoring-input.js";
12
+ /**
13
+ * Extract dimension names from a scoring profile's weight map.
14
+ *
15
+ * Scoring profiles (defined in config/rubrics.ts) map dimension names
16
+ * to numeric weights. This function returns those dimension names so
17
+ * callers can work with dynamic dimensions instead of hardcoded ones.
+ *
+ * @param profile - Weight map keyed by dimension name.
+ * @returns The dimension names, i.e. the keys of `profile`; the weights
+ * themselves are not returned.
18
+ */
19
+ export declare function extractDimensions(profile: Record<string, number>): string[];
12
20
  /**
13
21
  * Classify a grading component into a scoring dimension.
14
22
  *
15
23
  * Prefers structured metadata (Approach 5) over heuristic string matching.
16
- * Returns null if the component doesn't map to a known dimension.
24
+ * Returns the dimension as a kebab-case string, or null if the component
25
+ * doesn't map to any dimension.
26
+ *
27
+ * Returns `string | null` so non-literacy scoring profiles (MCP, agent,
28
+ * knowledge-probe) can define arbitrary dimension names in metadata
29
+ * without requiring changes here.
+ *
+ * NOTE: a dimension supplied through assertion metadata is passed through
+ * verbatim, so kebab-case is only guaranteed for the names produced by the
+ * heuristic fallback ("task-completion", "code-correctness", "doc-coverage").
+ * Callers that compared against the old camelCase literals must be updated.
+ *
+ * @param component - The grading component to classify.
+ * @returns The dimension name, or `null` when none can be determined.
17
30
  */
18
- export declare function classifyRubric(component: ComponentResult): "codeCorrectness" | "docCoverage" | "taskCompletion" | null;
31
+ export declare function classifyRubric(component: ComponentResult): string | null;
19
32
  /**
20
33
  * Detect the feature area from a test description string.
21
34
  *
@@ -8,40 +8,50 @@
8
8
  * the Ports & Adapters migration (Phase 4e).
9
9
  */
10
10
  // ---------------------------------------------------------------------------
11
+ // Dimension extraction
12
+ // ---------------------------------------------------------------------------
13
/**
 * List the dimension names declared by a scoring profile.
 *
 * A scoring profile (see config/rubrics.ts) is a weight map whose keys are
 * dimension names and whose values are numeric weights. Only the keys are
 * returned, letting callers iterate over whatever dimensions a profile
 * defines rather than a hardcoded set.
 */
export function extractDimensions(profile) {
    const dimensionNames = Object.keys(profile);
    return dimensionNames;
}
23
+ // ---------------------------------------------------------------------------
11
24
  // Rubric classification
12
25
  // ---------------------------------------------------------------------------
13
26
  /**
14
27
  * Classify a grading component into a scoring dimension.
15
28
  *
16
29
  * Prefers structured metadata (Approach 5) over heuristic string matching.
17
- * Returns null if the component doesn't map to a known dimension.
30
+ * Returns the dimension as a kebab-case string, or null if the component
31
+ * doesn't map to any dimension.
32
+ *
33
+ * Returns `string | null` so non-literacy scoring profiles (MCP, agent,
34
+ * knowledge-probe) can define arbitrary dimension names in metadata
35
+ * without requiring changes here.
18
36
  */
19
37
  export function classifyRubric(component) {
20
- // Prefer structured metadata (Approach 5) over heuristic matching
38
+ // Prefer structured metadata any dimension name is valid, enabling
39
+ // non-literacy profiles to pass through names like 'input-validation'
21
40
  const metadata = component.assertion?.metadata;
22
41
  if (metadata?.dimension) {
23
- switch (metadata.dimension) {
24
- case "code-correctness":
25
- return "codeCorrectness";
26
- case "doc-coverage":
27
- return "docCoverage";
28
- case "task-completion":
29
- return "taskCompletion";
30
- default:
31
- return null;
32
- }
42
+ return metadata.dimension;
33
43
  }
34
44
  // Fallback: heuristic name matching (for backward compatibility)
35
45
  const value = (component.assertion?.value ?? "").toLowerCase();
36
46
  if (value.includes("task completion")) {
37
- return "taskCompletion";
47
+ return "task-completion";
38
48
  }
39
49
  if (value.includes("code correctness")) {
40
- return "codeCorrectness";
50
+ return "code-correctness";
41
51
  }
42
52
  if (value.includes("documentation coverage") ||
43
53
  value.includes("hallucinate")) {
44
- return "docCoverage";
54
+ return "doc-coverage";
45
55
  }
46
56
  return null;
47
57
  }
@@ -0,0 +1,137 @@
1
+ /**
2
+ * Branded ID types — nominal typing for entity identifiers.
3
+ *
4
+ * All entity IDs use branded types to prevent accidental misuse.
5
+ * A `TaskId` cannot be passed where a `RunId` is expected, even
6
+ * though both are strings at runtime.
7
+ *
8
+ * Constructor functions validate format and return `Result<T, E>` —
9
+ * parse-don't-validate at the boundary, then pass branded values
10
+ * through the pipeline.
11
+ *
12
+ * The `Brand` utility and `Result` type are defined here as the
13
+ * foundation. Existing branded types in the codebase (`ReportId`,
14
+ * `ISOTimestamp`) use inline branding — those will be migrated to
15
+ * use this utility in Phase 7.
16
+ *
17
+ * @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
18
+ * @see docs/design-docs/parse-dont-validate.md (design principle)
19
+ */
20
+ /**
+ * Unique symbol for nominal type branding.
+ *
+ * Ambient declaration that is not exported from this module; it is used
+ * only as the computed property key inside `Brand`.
+ */
21
+ declare const __brand: unique symbol;
22
+ /**
23
+ * Brand a base type `T` with a nominal tag `B`.
24
+ *
25
+ * At runtime, branded values are identical to their base type.
26
+ * At compile time, `Brand<string, "TaskId">` is incompatible with
27
+ * `Brand<string, "RunId">` — preventing accidental ID swaps.
+ *
+ * The brand is expressed as an intersection of `T` with an object type
+ * holding one readonly property, keyed by the `__brand` symbol, whose
+ * type is the tag `B`.
+ *
+ * @typeParam T - The underlying base type (e.g. `string`).
+ * @typeParam B - String literal tag that distinguishes this brand.
28
+ */
29
+ export type Brand<T, B extends string> = T & {
30
+ readonly [__brand]: B;
31
+ };
32
+ /**
+ * Unique identifier for an evaluation task.
+ *
+ * Like every alias in this group it is a `Brand<string, …>`; a parse
+ * function for it is declared in this file as `taskId()`.
+ */
33
+ export type TaskId = Brand<string, "TaskId">;
34
+ /** URL-safe slug for a task (derived from title). No parse function is declared for it in this file. */
35
+ export type TaskSlug = Brand<string, "TaskSlug">;
36
+ /** Unique identifier for an evaluation suite. Parse function: `suiteId()`. */
37
+ export type SuiteId = Brand<string, "SuiteId">;
38
+ /** Unique identifier for an evaluation run. Parse function: `runId()`. */
39
+ export type RunId = Brand<string, "RunId">;
40
+ /** Content-addressable fingerprint for a run's inputs. No parse function is declared for it in this file. */
41
+ export type RunFingerprint = Brand<string, "RunFingerprint">;
42
+ /** Unique identifier for a single task × provider result. Parse function: `resultId()`. */
43
+ export type ResultId = Brand<string, "ResultId">;
44
+ /** Unique identifier for a trace (observability record). Parse function: `traceId()`. */
45
+ export type TraceId = Brand<string, "TraceId">;
46
+ /**
47
+ * Unique identifier for a published report (UUID v7).
48
+ *
49
+ * Note: An existing `ReportId` branded type is defined in
50
+ * `packages/core/src/types/index.ts` using inline branding.
51
+ * This definition uses the `Brand` utility for consistency.
52
+ * Phase 7 will unify them.
+ *
+ * The alias is named `NewReportId`, but its brand tag is `"ReportId"`.
53
+ */
54
+ export type NewReportId = Brand<string, "ReportId">;
55
+ /** Unique identifier for a provider (LLM, MCP server, agent harness). Parse function: `providerId()`. */
56
+ export type ProviderId = Brand<string, "ProviderId">;
57
+ /** Unique identifier for a prompt template. No parse function is declared for it in this file. */
58
+ export type PromptId = Brand<string, "PromptId">;
59
+ /** Unique identifier for a rubric scoring template. No parse function is declared for it in this file. */
60
+ export type RubricId = Brand<string, "RubricId">;
61
+ /** Unique identifier for a fixture (test data). Parse function: `fixtureId()`. */
62
+ export type FixtureId = Brand<string, "FixtureId">;
63
+ /** Unique identifier for a build artifact. No parse function is declared for it in this file. */
64
+ export type ArtifactId = Brand<string, "ArtifactId">;
65
+ /**
66
+ * A success result containing a value.
+ *
+ * `ok` is the literal `true`, which serves as the discriminant of the
+ * `Result` union; `value` holds the successful payload. Both fields are
+ * readonly.
+ *
+ * @typeParam T - Type of the success value.
67
+ */
68
+ export interface Ok<T> {
69
+ readonly ok: true;
70
+ readonly value: T;
71
+ }
72
+ /**
73
+ * A failure result containing an error.
+ *
+ * `ok` is the literal `false`, which serves as the discriminant of the
+ * `Result` union; `error` holds the failure payload. Both fields are
+ * readonly.
+ *
+ * @typeParam E - Type of the error value.
74
+ */
75
+ export interface Err<E> {
76
+ readonly ok: false;
77
+ readonly error: E;
78
+ }
79
+ /**
+ * Discriminated union for parse results — parse-don't-validate pattern.
+ *
+ * Check the `ok` field to narrow: `true` exposes `value`, `false`
+ * exposes `error`.
+ *
+ * @typeParam T - Type carried by the success variant.
+ * @typeParam E - Type carried by the failure variant.
+ */
80
+ export type Result<T, E> = Ok<T> | Err<E>;
81
+ /**
+ * Construct a success result.
+ *
+ * @param value - The value to wrap.
+ * @returns An `Ok<T>` carrying `value`.
+ */
82
+ export declare function ok<T>(value: T): Ok<T>;
83
+ /**
+ * Construct a failure result.
+ *
+ * @param error - The error to wrap.
+ * @returns An `Err<E>` carrying `error`.
+ */
84
+ export declare function err<E>(error: E): Err<E>;
85
+ /**
+ * Error returned when an ID string fails format validation.
+ *
+ * This is the error type used in the `Result` returned by every ID parse
+ * function declared in this file. Unlike `Ok`/`Err`, its fields are not
+ * declared readonly.
+ */
86
+ export interface IdValidationError {
87
+ /** Error code identifying the specific validation failure */
88
+ code: string;
89
+ /** The raw input that failed validation */
90
+ raw: string;
91
+ /** Human-readable error message */
92
+ message: string;
93
+ }
94
+ /**
95
+ * Parse a raw string into a `TaskId`.
96
+ *
97
+ * Valid format: alphanumeric + hyphens, 1–128 characters.
98
+ * Examples: `"groq-projection-basics"`, `"mcp-server-tools-list"`
+ *
+ * @param raw - The candidate string to parse.
+ * @returns A `Result`: the `Ok` variant carries the branded `TaskId`,
+ * the `Err` variant carries an `IdValidationError`.
99
+ */
100
+ export declare function taskId(raw: string): Result<TaskId, IdValidationError>;
101
+ /**
102
+ * Parse a raw string into a `RunId`.
103
+ *
104
+ * Valid format: `run_` prefix followed by alphanumeric characters.
+ *
+ * @param raw - The candidate string to parse.
+ * @returns A `Result`: the `Ok` variant carries the branded `RunId`,
+ * the `Err` variant carries an `IdValidationError`.
105
+ */
106
+ export declare function runId(raw: string): Result<RunId, IdValidationError>;
107
+ /**
108
+ * Parse a raw string into a `SuiteId`.
109
+ *
110
+ * Valid format: `suite_` prefix followed by alphanumeric characters.
+ *
+ * @param raw - The candidate string to parse.
+ * @returns A `Result`: the `Ok` variant carries the branded `SuiteId`,
+ * the `Err` variant carries an `IdValidationError`.
111
+ */
112
+ export declare function suiteId(raw: string): Result<SuiteId, IdValidationError>;
113
+ /**
114
+ * Parse a raw string into a `ResultId`.
115
+ *
116
+ * Valid format: `res_` prefix followed by alphanumeric characters.
+ *
+ * @param raw - The candidate string to parse.
+ * @returns A `Result`: the `Ok` variant carries the branded `ResultId`,
+ * the `Err` variant carries an `IdValidationError`.
117
+ */
118
+ export declare function resultId(raw: string): Result<ResultId, IdValidationError>;
119
+ /**
120
+ * Parse a raw string into a `TraceId`.
121
+ *
122
+ * Valid format: `trace_` prefix followed by alphanumeric characters.
+ *
+ * @param raw - The candidate string to parse.
+ * @returns A `Result`: the `Ok` variant carries the branded `TraceId`,
+ * the `Err` variant carries an `IdValidationError`.
123
+ */
124
+ export declare function traceId(raw: string): Result<TraceId, IdValidationError>;
125
+ /**
126
+ * Parse a raw string into a `ProviderId`.
127
+ *
128
+ * Valid format: colon-separated segments (e.g., `"openai:chat:gpt-4o"`).
+ *
+ * @param raw - The candidate string to parse.
+ * @returns A `Result`: the `Ok` variant carries the branded `ProviderId`,
+ * the `Err` variant carries an `IdValidationError`.
129
+ */
130
+ export declare function providerId(raw: string): Result<ProviderId, IdValidationError>;
131
+ /**
132
+ * Parse a raw string into a `FixtureId`.
133
+ *
134
+ * Valid format: alphanumeric + hyphens, 1–128 characters.
+ *
+ * @param raw - The candidate string to parse.
+ * @returns A `Result`: the `Ok` variant carries the branded `FixtureId`,
+ * the `Err` variant carries an `IdValidationError`.
135
+ */
136
+ export declare function fixtureId(raw: string): Result<FixtureId, IdValidationError>;
137
+ export {};