@sanity/ailf 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288)
  1. package/config/features.ts +23 -0
  2. package/config/models.ts +83 -0
  3. package/config/prompts.ts +16 -0
  4. package/config/rubrics.ts +225 -0
  5. package/config/schedules.ts +47 -0
  6. package/config/sinks.ts +37 -0
  7. package/config/sources.ts +21 -0
  8. package/config/thresholds.ts +61 -0
  9. package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
  10. package/dist/_vendor/ailf-core/config-helpers.js +150 -0
  11. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  12. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  13. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  14. package/dist/_vendor/ailf-core/index.js +5 -0
  15. package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
  16. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  17. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  18. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  19. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  20. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  21. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  22. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
  23. package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
  24. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
  25. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
  26. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -29
  27. package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -8
  28. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  29. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  30. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  31. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  32. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  33. package/dist/_vendor/ailf-core/services/index.js +2 -1
  34. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  35. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  36. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  37. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  38. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  39. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  40. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  41. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  42. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
  43. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  44. package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
  45. package/dist/_vendor/ailf-core/types/index.js +8 -1
  46. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
  47. package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
  48. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  49. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  50. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  51. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  52. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  53. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  54. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  55. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  56. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  57. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  58. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  59. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  60. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  61. package/dist/_vendor/ailf-shared/index.js +0 -1
  62. package/dist/adapters/api-client/build-request.js +14 -13
  63. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  64. package/dist/adapters/config-sources/file-config-adapter.js +38 -12
  65. package/dist/adapters/config-sources/index.d.ts +2 -0
  66. package/dist/adapters/config-sources/index.js +1 -0
  67. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  68. package/dist/adapters/config-sources/ts-config-loader.js +133 -0
  69. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  70. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  71. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  72. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  73. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  74. package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
  75. package/dist/adapters/task-sources/index.d.ts +1 -0
  76. package/dist/adapters/task-sources/index.js +1 -0
  77. package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
  78. package/dist/adapters/task-sources/repo-task-source.js +69 -16
  79. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  80. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  81. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  82. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  83. package/dist/cli.js +0 -2
  84. package/dist/commands/baseline.js +4 -1
  85. package/dist/commands/calculate-scores.js +1 -1
  86. package/dist/commands/coverage-audit.js +7 -1
  87. package/dist/commands/explain-handler.js +25 -23
  88. package/dist/commands/fetch-docs.js +3 -2
  89. package/dist/commands/generate-configs.js +1 -1
  90. package/dist/commands/interactive.js +11 -7
  91. package/dist/commands/pipeline-action.d.ts +2 -0
  92. package/dist/commands/pipeline-action.js +16 -6
  93. package/dist/commands/pipeline.d.ts +1 -0
  94. package/dist/commands/pipeline.js +4 -2
  95. package/dist/commands/pr-comment.js +1 -1
  96. package/dist/commands/publish.js +2 -2
  97. package/dist/commands/readiness-report.js +13 -6
  98. package/dist/composition-root.d.ts +1 -1
  99. package/dist/composition-root.js +67 -4
  100. package/dist/orchestration/build-app-context.js +1 -0
  101. package/dist/orchestration/build-step-sequence.js +24 -6
  102. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  103. package/dist/orchestration/steps/fetch-docs-step.js +6 -4
  104. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  105. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  106. package/dist/orchestration/steps/generate-configs-step.js +245 -51
  107. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  108. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  109. package/dist/orchestration/steps/readiness-step.js +5 -6
  110. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  111. package/dist/orchestration/steps/run-eval-step.js +8 -7
  112. package/dist/pipeline/cache.d.ts +1 -1
  113. package/dist/pipeline/cache.js +36 -8
  114. package/dist/pipeline/calculate-scores.d.ts +2 -4
  115. package/dist/pipeline/calculate-scores.js +43 -113
  116. package/dist/pipeline/checks.js +2 -2
  117. package/dist/pipeline/compare.js +8 -8
  118. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  119. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  120. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  121. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  122. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  123. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  124. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  125. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  126. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  127. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
  128. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  129. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  130. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  131. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  132. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  133. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
  134. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  135. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  136. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  137. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  138. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  139. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  140. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  141. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  142. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  143. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  144. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  145. package/dist/pipeline/compiler/config-loader.js +111 -0
  146. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  147. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  148. package/dist/pipeline/compiler/hash.d.ts +11 -0
  149. package/dist/pipeline/compiler/hash.js +18 -0
  150. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  151. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  152. package/dist/pipeline/compiler/index.d.ts +29 -0
  153. package/dist/pipeline/compiler/index.js +45 -0
  154. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  155. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  156. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  157. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  158. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  159. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  160. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  161. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  162. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  163. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  164. package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
  165. package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
  166. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  167. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  168. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  169. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  170. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  171. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
  172. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
  173. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
  174. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  175. package/dist/pipeline/compiler/presets/index.js +8 -0
  176. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
  177. package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
  178. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  179. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  180. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  181. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  182. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  183. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  184. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  185. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  186. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  187. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  188. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  189. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  190. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  191. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  192. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  193. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  194. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  195. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  196. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  197. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  198. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  199. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  200. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  201. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  202. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  203. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  204. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  205. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  206. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  207. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  208. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  209. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  210. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  211. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  212. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  213. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  214. package/dist/pipeline/coverage-audit.d.ts +15 -5
  215. package/dist/pipeline/coverage-audit.js +41 -22
  216. package/dist/pipeline/eval-constants.d.ts +16 -6
  217. package/dist/pipeline/eval-constants.js +25 -4
  218. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  219. package/dist/pipeline/eval-fingerprint.js +8 -9
  220. package/dist/pipeline/expand-tasks.d.ts +19 -10
  221. package/dist/pipeline/expand-tasks.js +34 -28
  222. package/dist/pipeline/gap-analysis.d.ts +1 -1
  223. package/dist/pipeline/gap-analysis.js +2 -2
  224. package/dist/pipeline/generate-configs.d.ts +22 -4
  225. package/dist/pipeline/generate-configs.js +53 -24
  226. package/dist/pipeline/grader-api.d.ts +3 -3
  227. package/dist/pipeline/grader-api.js +5 -12
  228. package/dist/pipeline/grader-compare-runner.js +20 -27
  229. package/dist/pipeline/grader-comparison.d.ts +4 -8
  230. package/dist/pipeline/grader-comparison.js +11 -17
  231. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  232. package/dist/pipeline/grader-consistency-runner.js +16 -20
  233. package/dist/pipeline/grader-consistency.d.ts +6 -10
  234. package/dist/pipeline/grader-consistency.js +13 -32
  235. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  236. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  237. package/dist/pipeline/grader-sensitivity.js +10 -10
  238. package/dist/pipeline/grader-validate-runner.js +7 -5
  239. package/dist/pipeline/grader-validation.d.ts +2 -6
  240. package/dist/pipeline/grader-validation.js +14 -22
  241. package/dist/pipeline/map-request-to-config.js +6 -1
  242. package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
  243. package/dist/pipeline/mirror-repo-tasks.js +16 -15
  244. package/dist/pipeline/normalize-mode.d.ts +49 -0
  245. package/dist/pipeline/normalize-mode.js +64 -0
  246. package/dist/pipeline/plan.d.ts +5 -2
  247. package/dist/pipeline/plan.js +134 -78
  248. package/dist/pipeline/pr-comment.js +2 -0
  249. package/dist/pipeline/profile-resolution.d.ts +22 -14
  250. package/dist/pipeline/profile-resolution.js +41 -19
  251. package/dist/pipeline/provenance.d.ts +2 -2
  252. package/dist/pipeline/provenance.js +12 -17
  253. package/dist/pipeline/release-report.js +4 -4
  254. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  255. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  256. package/dist/pipeline/rubric-loader.d.ts +20 -0
  257. package/dist/pipeline/rubric-loader.js +37 -0
  258. package/dist/pipeline/validate.d.ts +4 -4
  259. package/dist/pipeline/validate.js +64 -53
  260. package/dist/schedules/loader.js +18 -8
  261. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  262. package/dist/scripts/migrate-task-mode.js +85 -0
  263. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  264. package/dist/scripts/validate-task-sources.d.ts +1 -1
  265. package/dist/scripts/validate-task-sources.js +15 -15
  266. package/dist/sinks/loader.js +5 -7
  267. package/dist/sources.d.ts +7 -7
  268. package/dist/sources.js +22 -24
  269. package/dist/webhook/dispatch.js +2 -1
  270. package/package.json +6 -3
  271. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  272. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  273. package/tasks/literacy/frameworks.task.ts +128 -0
  274. package/tasks/literacy/functions.task.ts +69 -0
  275. package/tasks/literacy/groq.task.ts +258 -0
  276. package/tasks/literacy/nextjs-live.task.ts +75 -0
  277. package/tasks/literacy/studio-setup.task.ts +131 -0
  278. package/tasks/literacy/visual-editing.task.ts +146 -0
  279. package/config/features.yaml +0 -116
  280. package/config/models.yaml +0 -116
  281. package/config/prompts.yaml +0 -75
  282. package/config/rubrics.yaml +0 -81
  283. package/config/schedules.yaml +0 -43
  284. package/config/sinks.yaml +0 -54
  285. package/config/sources.yaml +0 -51
  286. package/config/thresholds.yaml +0 -49
  287. package/dist/agent-observer/test-imports.d.ts +0 -7
  288. package/dist/agent-observer/test-imports.js +0 -185
@@ -99,11 +99,13 @@ export function formatValidationReport(result) {
99
99
  const sep = "|------------------|-------|-------------|-----------|--------|-------|";
100
100
  lines.push(h);
101
101
  lines.push(sep);
102
- const dims = [
103
- { data: result.perDimension.taskCompletion, name: "Task Completion" },
104
- { data: result.perDimension.codeCorrectness, name: "Code Correctness" },
105
- { data: result.perDimension.docCoverage, name: "Doc Coverage" },
106
- ];
102
+ const dims = Object.entries(result.perDimension).map(([key, data]) => ({
103
+ data,
104
+ name: key
105
+ .split(/[-_]/)
106
+ .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
107
+ .join(" "),
108
+ }));
107
109
  for (const { data, name } of dims) {
108
110
  const quality = classifyCorrelation(data.correlation);
109
111
  const biasStr = data.bias > 0 ? `+${data.bias}` : `${data.bias}`;
@@ -63,12 +63,8 @@ export interface GraderValidation {
63
63
  overallMae: number;
64
64
  /** Whether the grader passes the MAE threshold (default: MAE < 10) */
65
65
  passesThreshold: boolean;
66
- /** Per-dimension validity metrics */
67
- perDimension: {
68
- taskCompletion: DimensionValidity;
69
- codeCorrectness: DimensionValidity;
70
- docCoverage: DimensionValidity;
71
- };
66
+ /** Per-dimension validity metrics (keyed by dimension name) */
67
+ perDimension: Record<string, DimensionValidity>;
72
68
  /** Total number of (grader, human) score pairs analyzed */
73
69
  totalObservations: number;
74
70
  }
@@ -77,11 +77,7 @@ export function validateGrader(grades, graderModel, options) {
77
77
  overallCorrelation: 0,
78
78
  overallMae: 0,
79
79
  passesThreshold: true,
80
- perDimension: {
81
- codeCorrectness: { bias: 0, correlation: 0, count: 0, mae: 0 },
82
- docCoverage: { bias: 0, correlation: 0, count: 0, mae: 0 },
83
- taskCompletion: { bias: 0, correlation: 0, count: 0, mae: 0 },
84
- },
80
+ perDimension: {},
85
81
  totalObservations: 0,
86
82
  };
87
83
  }
@@ -90,28 +86,24 @@ export function validateGrader(grades, graderModel, options) {
90
86
  grader: g.graderScore,
91
87
  human: g.humanScore,
92
88
  }));
93
- // Group by dimension
94
- const byDimension = {
95
- codeCorrectness: grades
96
- .filter((g) => g.dimension === "codeCorrectness")
97
- .map((g) => ({ grader: g.graderScore, human: g.humanScore })),
98
- docCoverage: grades
99
- .filter((g) => g.dimension === "docCoverage")
100
- .map((g) => ({ grader: g.graderScore, human: g.humanScore })),
101
- taskCompletion: grades
102
- .filter((g) => g.dimension === "taskCompletion")
103
- .map((g) => ({ grader: g.graderScore, human: g.humanScore })),
104
- };
89
+ // Group by dimension dynamically
90
+ const byDimension = {};
91
+ for (const g of grades) {
92
+ ;
93
+ (byDimension[g.dimension] ??= []).push({
94
+ grader: g.graderScore,
95
+ human: g.humanScore,
96
+ });
97
+ }
105
98
  // Overall metrics
106
99
  const overallMae = computeMae(allPairs);
107
100
  const overallCorrelation = Math.round(pearsonCorrelation(allPairs.map((p) => p.grader), allPairs.map((p) => p.human)) * 100) / 100;
108
101
  const overallBias = computeBias(allPairs);
109
102
  // Per-dimension metrics
110
- const perDimension = {
111
- codeCorrectness: computeDimensionValidity(byDimension.codeCorrectness),
112
- docCoverage: computeDimensionValidity(byDimension.docCoverage),
113
- taskCompletion: computeDimensionValidity(byDimension.taskCompletion),
114
- };
103
+ const perDimension = {};
104
+ for (const [dim, dimPairs] of Object.entries(byDimension)) {
105
+ perDimension[dim] = computeDimensionValidity(dimPairs);
106
+ }
115
107
  // Find largest disagreements
116
108
  const disagreements = grades
117
109
  .map((g) => ({
@@ -1,3 +1,4 @@
1
+ import { normalizeMode } from "./normalize-mode.js";
1
2
  /**
2
3
  * Map a PipelineRequest to a ResolvedConfig.
3
4
  *
@@ -16,13 +17,17 @@
16
17
  * with `publish: false`.
17
18
  */
18
19
  export function mapRequestToConfig(request, rootDir) {
20
+ // Normalize mode so downstream pipeline code only sees canonical names.
21
+ // The API may receive legacy names ("baseline", "full") from older clients.
22
+ const { mode, variant } = normalizeMode(request.mode ?? "full");
19
23
  // API-triggered evaluations (identified by jobId) default to publish: true.
20
24
  // Without this, the job's reportId is always null and GET /v1/reports/:id
21
25
  // has nothing to return.
22
26
  const publishDefault = !!request.jobId;
23
27
  return {
24
28
  rootDir,
25
- mode: request.mode ?? "full",
29
+ mode,
30
+ variant,
26
31
  debug: mapDebug(request.debug),
27
32
  areas: request.areas,
28
33
  tasks: request.tasks,
@@ -13,12 +13,12 @@
13
13
  * @see docs/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
14
14
  */
15
15
  import type { SanityClient } from "@sanity/client";
16
- import { type Logger, type TaskDefinition } from "../_vendor/ailf-core/index.d.ts";
16
+ import { type LiteracyTaskDefinition, type Logger } from "../_vendor/ailf-core/index.d.ts";
17
17
  export interface MirrorOptions {
18
18
  /** Sanity client with write access */
19
19
  client: SanityClient;
20
20
  /** Tasks to mirror (already loaded from repo) */
21
- tasks: TaskDefinition[];
21
+ tasks: LiteracyTaskDefinition[];
22
22
  /** Git context for origin provenance */
23
23
  git: GitContext;
24
24
  /** If true, log what would be done without writing */
@@ -90,15 +90,15 @@ export declare function detectGitContext(repoTasksPath: string): Promise<GitCont
90
90
  */
91
91
  export declare function mirrorDocId(owner: string, repo: string, taskId: string): string;
92
92
  /**
93
- * Compute a content hash of a TaskDefinition for change detection.
93
+ * Compute a content hash of a LiteracyTaskDefinition for change detection.
94
94
  *
95
95
  * Includes all fields that affect the mirror document. Excludes
96
96
  * runtime metadata like referenceSolution (filesystem path) since
97
97
  * that's not mirrored.
98
98
  */
99
- export declare function computeTaskHash(task: TaskDefinition): string;
99
+ export declare function computeTaskHash(task: LiteracyTaskDefinition): string;
100
100
  /** @internal Exported for testing — not part of the public API. */
101
- export declare function buildMirrorDocument(task: TaskDefinition, opts: {
101
+ export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts: {
102
102
  contentHash: string;
103
103
  docId: string;
104
104
  /** Existing author from the current mirror document (write-once preservation) */
@@ -113,7 +113,7 @@ export declare function buildMirrorDocument(task: TaskDefinition, opts: {
113
113
  _id: string;
114
114
  _type: string;
115
115
  ownership: string;
116
- status: "active" | "draft" | "paused" | "archived";
116
+ status: import("@sanity/ailf-core").TaskStatus;
117
117
  assert: Record<string, unknown>[];
118
118
  canonicalDocs: ({
119
119
  _key: string;
@@ -46,7 +46,7 @@ export async function mirrorRepoTasks(options) {
46
46
  // Batch-resolve all canonical doc slugs (slug refs only — other ref types
47
47
  // are stored without a resolved article reference for now)
48
48
  const allSlugs = [
49
- ...new Set(tasks.flatMap((t) => t.canonicalDocs.filter(isSlugRef).map((d) => d.slug))),
49
+ ...new Set(tasks.flatMap((t) => (t.context?.docs ?? []).filter(isSlugRef).map((d) => d.slug))),
50
50
  ];
51
51
  const slugToDocId = await batchResolveDocSlugs(client, allSlugs);
52
52
  // Track unresolved slugs
@@ -56,7 +56,7 @@ export async function mirrorRepoTasks(options) {
56
56
  }
57
57
  }
58
58
  // Ensure all feature areas exist
59
- const areas = [...new Set(tasks.map((t) => t.featureArea))];
59
+ const areas = [...new Set(tasks.map((t) => t.area ?? ""))];
60
60
  const createdAreas = await ensureFeatureAreas(client, areas, dryRun, log);
61
61
  result.areasCreated = createdAreas;
62
62
  // Fetch existing mirror document state for change detection + ownership check
@@ -241,7 +241,7 @@ export function mirrorDocId(owner, repo, taskId) {
241
241
  // Content hashing
242
242
  // ---------------------------------------------------------------------------
243
243
  /**
244
- * Compute a content hash of a TaskDefinition for change detection.
244
+ * Compute a content hash of a LiteracyTaskDefinition for change detection.
245
245
  *
246
246
  * Includes all fields that affect the mirror document. Excludes
247
247
  * runtime metadata like referenceSolution (filesystem path) since
@@ -250,10 +250,10 @@ export function mirrorDocId(owner, repo, taskId) {
250
250
  export function computeTaskHash(task) {
251
251
  const payload = JSON.stringify({
252
252
  id: task.id,
253
- description: task.description,
254
- featureArea: task.featureArea,
255
- taskPrompt: task.taskPrompt,
256
- canonicalDocs: task.canonicalDocs,
253
+ title: task.title,
254
+ area: task.area,
255
+ prompt: task.prompt,
256
+ docs: task.context?.docs,
257
257
  docCoverage: task.docCoverage,
258
258
  assertions: task.assertions,
259
259
  baseline: task.baseline,
@@ -356,7 +356,7 @@ export function buildMirrorDocument(task, opts) {
356
356
  // Build canonical docs with resolved references and correct refType.
357
357
  // Each ref type gets the appropriate resolution fields set on the
358
358
  // mirror document so Studio can display them correctly.
359
- const canonicalDocs = task.canonicalDocs.map((ref, i) => {
359
+ const canonicalDocs = (task.context?.docs ?? []).map((ref, i) => {
360
360
  const base = { _key: `cd${i}`, reason: ref.reason ?? "" };
361
361
  if (isSlugRef(ref)) {
362
362
  const resolvedId = slugToDocId.get(ref.slug);
@@ -395,7 +395,7 @@ export function buildMirrorDocument(task, opts) {
395
395
  return base;
396
396
  });
397
397
  // Build assertions
398
- const assertArray = task.assertions.map((a, i) => {
398
+ const assertArray = (task.assertions ?? []).map((a, i) => {
399
399
  const entry = {
400
400
  _key: `a${i}`,
401
401
  type: a.type,
@@ -420,8 +420,9 @@ export function buildMirrorDocument(task, opts) {
420
420
  }
421
421
  return entry;
422
422
  });
423
- // Determine the source file path (best-effort from task's featureArea)
424
- const filePath = `.ailf/tasks/${task.featureArea}.yaml`;
423
+ // Determine the source file path (best-effort from task's area)
424
+ const area = task.area ?? "";
425
+ const filePath = `.ailf/tasks/${area}.yaml`;
425
426
  return {
426
427
  _id: docId,
427
428
  _type: "ailf.task",
@@ -429,10 +430,10 @@ export function buildMirrorDocument(task, opts) {
429
430
  status: task.status ?? "active",
430
431
  assert: assertArray,
431
432
  canonicalDocs,
432
- description: task.description,
433
- docCoverage: task.docCoverage,
433
+ description: task.title,
434
+ docCoverage: task.docCoverage ?? false,
434
435
  featureArea: {
435
- _ref: `ailf.featureArea.${task.featureArea}`,
436
+ _ref: `ailf.featureArea.${area}`,
436
437
  _type: "reference",
437
438
  },
438
439
  id: { _type: "slug", current: task.id },
@@ -451,7 +452,7 @@ export function buildMirrorDocument(task, opts) {
451
452
  author: existingAuthor ?? git.author,
452
453
  lastEditor: git.author,
453
454
  },
454
- taskPrompt: task.taskPrompt,
455
+ taskPrompt: task.prompt?.text ?? "",
455
456
  ...(task.baseline
456
457
  ? {
457
458
  baseline: {
@@ -0,0 +1,49 @@
1
+ /**
2
+ * CLI boundary normalization for evaluation mode names.
3
+ *
4
+ * Legacy CLI users pass variant names like "baseline" or "agentic" as the
5
+ * --mode flag. This module normalizes those to the canonical mode ("literacy")
6
+ * plus a variant field, so downstream pipeline code only ever sees canonical
7
+ * mode names.
8
+ */
9
+ import { type EvalMode } from "../_vendor/ailf-shared/index.d.ts";
10
+ /**
11
+ * Literacy variant name constants.
12
+ *
13
+ * Production code imports these instead of scattering legacy string literals.
14
+ * Defined here (alongside the normalizer) so all variant name definitions
15
+ * live in one file — the single source of truth for the legacy-to-canonical
16
+ * mapping.
17
+ */
18
+ export declare const LiteracyVariant: {
19
+ /** Standard with-docs / without-docs evaluation (legacy mode name: "baseline") */
20
+ readonly STANDARD: "baseline";
21
+ /** Agentic evaluation — model uses tools to find docs */
22
+ readonly AGENTIC: "agentic";
23
+ /** Observed mode — HTTP-instrumented behavior observation */
24
+ readonly OBSERVED: "observed";
25
+ /** Full mode — standard + agentic combined */
26
+ readonly FULL: "full";
27
+ };
28
+ /** Union of all literacy variant string values */
29
+ export type LiteracyVariantName = (typeof LiteracyVariant)[keyof typeof LiteracyVariant];
30
+ /**
31
+ * The two literacy evaluation sub-modes that control entry generation.
32
+ * "standard" (baseline) generates gold + floor entries; "agentic" generates
33
+ * gold entries only.
34
+ */
35
+ export type LiteracyEvalSubMode = typeof LiteracyVariant.STANDARD | typeof LiteracyVariant.AGENTIC;
36
+ export interface NormalizedMode {
37
+ mode: EvalMode;
38
+ variant?: string;
39
+ }
40
+ /**
41
+ * Normalize a raw CLI mode string to a canonical mode + optional variant.
42
+ *
43
+ * Legacy names ("baseline", "agentic", "observed", "full") are mapped to
44
+ * `{ mode: "literacy", variant: "<name>" }` and emit a deprecation warning
45
+ * on stderr. Canonical names pass through unchanged.
46
+ *
47
+ * @throws {Error} If the input is not a recognized mode or variant name.
48
+ */
49
+ export declare function normalizeMode(input: string): NormalizedMode;
@@ -0,0 +1,64 @@
1
+ /**
2
+ * CLI boundary normalization for evaluation mode names.
3
+ *
4
+ * Legacy CLI users pass variant names like "baseline" or "agentic" as the
5
+ * --mode flag. This module normalizes those to the canonical mode ("literacy")
6
+ * plus a variant field, so downstream pipeline code only ever sees canonical
7
+ * mode names.
8
+ */
9
+ import { CANONICAL_EVAL_MODES, LEGACY_EVAL_MODE_ALIASES, } from "../_vendor/ailf-shared/index.js";
10
+ // ---------------------------------------------------------------------------
11
+ // Constants (derived from shared package — single source of truth)
12
+ // ---------------------------------------------------------------------------
13
/** Canonical evaluation mode names, held in a Set for membership checks. */
const CANONICAL_MODES = new Set(CANONICAL_EVAL_MODES);
15
/**
 * Names of the literacy evaluation variants.
 *
 * Import these constants in production code rather than repeating the legacy
 * string literals. They are defined next to the normalizer so that every
 * variant name used by the legacy-to-canonical mapping lives in one file.
 *
 * The object is frozen: the type declaration marks every member `readonly`,
 * and freezing enforces the same guarantee at runtime so that no importer can
 * reassign a variant name for everyone else.
 */
export const LiteracyVariant = Object.freeze({
    /** With-docs / without-docs evaluation; its legacy mode name is "baseline". */
    STANDARD: "baseline",
    /** Agentic evaluation, where the model uses tools to find docs. */
    AGENTIC: "agentic",
    /** Observed evaluation: HTTP-instrumented behavior observation. */
    OBSERVED: "observed",
    /** Full evaluation: standard and agentic combined. */
    FULL: "full",
});
33
/**
 * Legacy CLI mode names. These are variants of the "literacy" mode rather than
 * modes of their own: each one is normalized to `mode: "literacy"` with the
 * legacy name kept as the variant.
 */
const LEGACY_LITERACY_VARIANTS = new Set(LEGACY_EVAL_MODE_ALIASES);
38
/** Every accepted input string, canonical names first; used in error messages. */
const ALL_ACCEPTED = [...CANONICAL_MODES, ...LEGACY_LITERACY_VARIANTS];
43
+ // ---------------------------------------------------------------------------
44
+ // Public API
45
+ // ---------------------------------------------------------------------------
46
/**
 * Turn a raw CLI mode string into a canonical mode plus an optional variant.
 *
 * A legacy name ("baseline", "agentic", "observed", "full") becomes
 * `{ mode: "literacy", variant: "<name>" }` and a deprecation warning is
 * emitted on stderr. A canonical name is returned unchanged.
 *
 * @param input - Mode string as received from the CLI.
 * @returns The canonical mode, with `variant` set only for legacy names.
 * @throws {Error} When `input` is neither a canonical mode nor a legacy name.
 */
export function normalizeMode(input) {
    // The legacy lookup runs first, so a legacy name always takes the
    // deprecation path below.
    const isLegacyAlias = LEGACY_LITERACY_VARIANTS.has(input);
    if (!isLegacyAlias) {
        if (!CANONICAL_MODES.has(input)) {
            throw new Error(`Unknown mode "${input}". Valid modes: ${ALL_ACCEPTED.join(", ")}`);
        }
        return { mode: input };
    }
    console.warn(`⚠ Deprecated: --mode ${input} is a legacy alias. Use --mode literacy --variant ${input} instead.`);
    return { mode: "literacy", variant: input };
}
@@ -10,6 +10,7 @@
10
10
  * @see docs/exec-plans/execution-preview.md
11
11
  */
12
12
  import type { DebugOptions, EvalMode } from "./types.js";
13
+ import { LiteracyVariant } from "./normalize-mode.js";
13
14
  /** Comparison plan for --compare flag. */
14
15
  export interface ComparisonPlan {
15
16
  /** Age of the baseline in human-readable form */
@@ -121,8 +122,8 @@ export interface StepPlan {
121
122
  export interface TaskPlan {
122
123
  /** Test description */
123
124
  description: string;
124
- /** Whether this is a gold (with docs) or baseline (without docs) variant */
125
- variant: "baseline" | "gold";
125
+ /** Whether this is a gold (with docs) or standard/baseline (without docs) variant */
126
+ variant: typeof LiteracyVariant.STANDARD | "gold";
126
127
  }
127
128
  /** Minimal options shape needed to build a pipeline execution plan. */
128
129
  export interface PlanOptions {
@@ -138,6 +139,8 @@ export interface PlanOptions {
138
139
  gapAnalysisEnabled: boolean;
139
140
  graderReplications?: number;
140
141
  mode: EvalMode;
142
+ /** Literacy variant when mode is "literacy" (baseline, agentic, observed, full) */
143
+ variant?: string;
141
144
  noCache: boolean;
142
145
  publishEnabled: boolean;
143
146
  readinessEnabled: boolean;