@sanity/ailf 0.4.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290)
  1. package/config/features.ts +23 -0
  2. package/config/models.ts +83 -0
  3. package/config/prompts.ts +16 -0
  4. package/config/rubrics.ts +225 -0
  5. package/config/schedules.ts +47 -0
  6. package/config/sinks.ts +37 -0
  7. package/config/sources.ts +21 -0
  8. package/config/thresholds.ts +61 -0
  9. package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
  10. package/dist/_vendor/ailf-core/config-helpers.js +150 -0
  11. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  12. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  13. package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
  14. package/dist/_vendor/ailf-core/examples/index.js +10 -10
  15. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  16. package/dist/_vendor/ailf-core/index.js +5 -0
  17. package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
  18. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  19. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  20. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  21. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  22. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  23. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  24. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
  25. package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
  26. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
  27. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
  28. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +32 -31
  29. package/dist/_vendor/ailf-core/schemas/pipeline.js +52 -12
  30. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  31. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  32. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  33. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  34. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  35. package/dist/_vendor/ailf-core/services/index.js +2 -1
  36. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  37. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  38. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  39. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  40. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  41. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  42. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  43. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  44. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  46. package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
  47. package/dist/_vendor/ailf-core/types/index.js +8 -1
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
  50. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  51. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  52. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  53. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  54. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  55. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  56. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  57. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  58. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  59. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  60. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  61. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  62. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  63. package/dist/_vendor/ailf-shared/index.js +0 -1
  64. package/dist/adapters/api-client/build-request.js +14 -13
  65. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  66. package/dist/adapters/config-sources/file-config-adapter.js +38 -12
  67. package/dist/adapters/config-sources/index.d.ts +2 -0
  68. package/dist/adapters/config-sources/index.js +1 -0
  69. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  70. package/dist/adapters/config-sources/ts-config-loader.js +133 -0
  71. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  72. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  73. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  74. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  75. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  76. package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
  77. package/dist/adapters/task-sources/index.d.ts +1 -0
  78. package/dist/adapters/task-sources/index.js +1 -0
  79. package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
  80. package/dist/adapters/task-sources/repo-task-source.js +69 -16
  81. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  82. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  83. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  84. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  85. package/dist/cli.js +0 -2
  86. package/dist/commands/baseline.js +4 -1
  87. package/dist/commands/calculate-scores.js +1 -1
  88. package/dist/commands/coverage-audit.js +7 -1
  89. package/dist/commands/explain-handler.js +25 -23
  90. package/dist/commands/fetch-docs.js +3 -2
  91. package/dist/commands/generate-configs.js +1 -1
  92. package/dist/commands/interactive.js +11 -7
  93. package/dist/commands/pipeline-action.d.ts +2 -0
  94. package/dist/commands/pipeline-action.js +16 -6
  95. package/dist/commands/pipeline.d.ts +1 -0
  96. package/dist/commands/pipeline.js +4 -2
  97. package/dist/commands/pr-comment.js +1 -1
  98. package/dist/commands/publish.js +2 -2
  99. package/dist/commands/readiness-report.js +13 -6
  100. package/dist/composition-root.d.ts +1 -1
  101. package/dist/composition-root.js +67 -4
  102. package/dist/orchestration/build-app-context.js +1 -0
  103. package/dist/orchestration/build-step-sequence.js +24 -6
  104. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  105. package/dist/orchestration/steps/fetch-docs-step.js +6 -4
  106. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  107. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  108. package/dist/orchestration/steps/generate-configs-step.js +245 -51
  109. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  110. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  111. package/dist/orchestration/steps/readiness-step.js +5 -6
  112. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  113. package/dist/orchestration/steps/run-eval-step.js +8 -7
  114. package/dist/pipeline/cache.d.ts +1 -1
  115. package/dist/pipeline/cache.js +36 -8
  116. package/dist/pipeline/calculate-scores.d.ts +5 -7
  117. package/dist/pipeline/calculate-scores.js +74 -153
  118. package/dist/pipeline/checks.js +2 -2
  119. package/dist/pipeline/compare.js +8 -8
  120. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  121. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  122. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  123. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  124. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  125. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  126. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  127. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  128. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  129. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
  130. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  131. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  132. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  133. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  134. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  135. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
  136. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  137. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  138. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  139. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  140. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  141. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  142. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  143. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  144. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  145. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  146. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  147. package/dist/pipeline/compiler/config-loader.js +111 -0
  148. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  149. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  150. package/dist/pipeline/compiler/hash.d.ts +11 -0
  151. package/dist/pipeline/compiler/hash.js +18 -0
  152. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  153. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  154. package/dist/pipeline/compiler/index.d.ts +29 -0
  155. package/dist/pipeline/compiler/index.js +45 -0
  156. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  157. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  158. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  159. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  160. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  161. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  162. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  163. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  164. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  165. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  166. package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
  167. package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
  168. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  169. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  170. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  171. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  172. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  173. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
  174. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
  175. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
  176. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  177. package/dist/pipeline/compiler/presets/index.js +8 -0
  178. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
  179. package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
  180. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  181. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  182. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  183. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  184. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  185. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  186. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  187. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  188. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  189. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  190. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  191. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  192. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  193. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  194. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  195. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  196. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  197. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  198. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  199. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  200. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  201. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  202. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  203. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  204. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  205. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  206. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  207. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  208. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  209. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  210. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  211. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  212. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  213. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  214. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  215. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  216. package/dist/pipeline/coverage-audit.d.ts +15 -5
  217. package/dist/pipeline/coverage-audit.js +41 -22
  218. package/dist/pipeline/eval-constants.d.ts +16 -6
  219. package/dist/pipeline/eval-constants.js +25 -4
  220. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  221. package/dist/pipeline/eval-fingerprint.js +8 -9
  222. package/dist/pipeline/expand-tasks.d.ts +23 -14
  223. package/dist/pipeline/expand-tasks.js +37 -31
  224. package/dist/pipeline/gap-analysis.d.ts +1 -1
  225. package/dist/pipeline/gap-analysis.js +2 -2
  226. package/dist/pipeline/generate-configs.d.ts +22 -4
  227. package/dist/pipeline/generate-configs.js +53 -24
  228. package/dist/pipeline/grader-api.d.ts +3 -3
  229. package/dist/pipeline/grader-api.js +5 -12
  230. package/dist/pipeline/grader-compare-runner.js +20 -27
  231. package/dist/pipeline/grader-comparison.d.ts +4 -8
  232. package/dist/pipeline/grader-comparison.js +11 -17
  233. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  234. package/dist/pipeline/grader-consistency-runner.js +18 -21
  235. package/dist/pipeline/grader-consistency.d.ts +6 -10
  236. package/dist/pipeline/grader-consistency.js +13 -32
  237. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  238. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  239. package/dist/pipeline/grader-sensitivity.js +10 -10
  240. package/dist/pipeline/grader-validate-runner.js +7 -5
  241. package/dist/pipeline/grader-validation.d.ts +2 -6
  242. package/dist/pipeline/grader-validation.js +14 -22
  243. package/dist/pipeline/map-request-to-config.js +6 -1
  244. package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
  245. package/dist/pipeline/mirror-repo-tasks.js +16 -15
  246. package/dist/pipeline/normalize-mode.d.ts +49 -0
  247. package/dist/pipeline/normalize-mode.js +64 -0
  248. package/dist/pipeline/plan.d.ts +5 -2
  249. package/dist/pipeline/plan.js +134 -78
  250. package/dist/pipeline/pr-comment.js +2 -0
  251. package/dist/pipeline/profile-resolution.d.ts +47 -0
  252. package/dist/pipeline/profile-resolution.js +91 -0
  253. package/dist/pipeline/provenance.d.ts +2 -2
  254. package/dist/pipeline/provenance.js +12 -17
  255. package/dist/pipeline/release-report.js +4 -4
  256. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  257. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  258. package/dist/pipeline/rubric-loader.d.ts +20 -0
  259. package/dist/pipeline/rubric-loader.js +37 -0
  260. package/dist/pipeline/validate.d.ts +4 -4
  261. package/dist/pipeline/validate.js +64 -53
  262. package/dist/schedules/loader.js +18 -8
  263. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  264. package/dist/scripts/migrate-task-mode.js +85 -0
  265. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  266. package/dist/scripts/validate-task-sources.d.ts +1 -1
  267. package/dist/scripts/validate-task-sources.js +15 -15
  268. package/dist/sinks/loader.js +5 -7
  269. package/dist/sources.d.ts +7 -7
  270. package/dist/sources.js +22 -24
  271. package/dist/webhook/dispatch.js +2 -1
  272. package/package.json +6 -3
  273. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  274. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  275. package/tasks/literacy/frameworks.task.ts +128 -0
  276. package/tasks/literacy/functions.task.ts +69 -0
  277. package/tasks/literacy/groq.task.ts +258 -0
  278. package/tasks/literacy/nextjs-live.task.ts +75 -0
  279. package/tasks/literacy/studio-setup.task.ts +131 -0
  280. package/tasks/literacy/visual-editing.task.ts +146 -0
  281. package/config/features.yaml +0 -116
  282. package/config/models.yaml +0 -116
  283. package/config/prompts.yaml +0 -75
  284. package/config/rubrics.yaml +0 -62
  285. package/config/schedules.yaml +0 -43
  286. package/config/sinks.yaml +0 -54
  287. package/config/sources.yaml +0 -51
  288. package/config/thresholds.yaml +0 -49
  289. package/dist/agent-observer/test-imports.d.ts +0 -7
  290. package/dist/agent-observer/test-imports.js +0 -185
@@ -0,0 +1,23 @@
1
+ /**
2
+ * features.ts — Product feature registry for documentation coverage auditing.
3
+ *
4
+ * Default features are provided by the sanity-literacy preset registered
5
+ * in the composition root. This file exists as an override point — any
6
+ * features defined here take precedence over preset-provided features
7
+ * during coverage auditing.
8
+ *
9
+ * To track custom features, define them here:
10
+ *
11
+ * export default defineFeatures({
12
+ * features: [
13
+ * { id: "my-feature", name: "My Feature", sections: ["api"], ... },
14
+ * ],
15
+ * })
16
+ *
17
+ * @see packages/eval/src/pipeline/compiler/presets/sanity-literacy.ts
18
+ * @see docs/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
19
+ */
20
+
21
+ import { defineFeatures } from "@sanity/ailf-core"
22
+
23
+ export default defineFeatures({ features: [] })
@@ -0,0 +1,83 @@
1
+ /**
2
+ * models.ts — Central model registry for AILF evaluations.
3
+ *
4
+ * Define all models to test here. Each eval mode (baseline, observed, agentic-naive, agentic-optimized)
5
+ * reads this config and generates the appropriate provider entries.
6
+ *
7
+ * @see docs/exec-plans/architecture-overhaul/phase-1-ts-config-loading.md
8
+ */
9
+
10
+ import { defineModels } from "@sanity/ailf-core"
11
+
12
+ export default defineModels({
13
+ models: [
14
+ // ── Anthropic ──────────────────────────────────────────────
15
+ {
16
+ id: "anthropic:messages:claude-opus-4-6",
17
+ label: "Claude Opus 4.6",
18
+ config: { temperature: 0.2, max_tokens: 4096 },
19
+ modes: ["baseline", "observed", "agentic-naive", "agentic-optimized"],
20
+ },
21
+
22
+ // ── Google ─────────────────────────────────────────────────
23
+ // {
24
+ // id: "google:gemini-2.5-pro",
25
+ // label: "Gemini 2.5 Pro",
26
+ // config: { temperature: 0.2, max_tokens: 4096 },
27
+ // modes: ["baseline", "observed", "agentic-naive", "agentic-optimized"],
28
+ // },
29
+
30
+ // ── OpenAI ─────────────────────────────────────────────────
31
+ {
32
+ id: "openai:chat:gpt-5.2",
33
+ label: "GPT 5.2",
34
+ config: { temperature: 0.2, max_tokens: 4096 },
35
+ modes: ["baseline", "observed", "agentic-naive", "agentic-optimized"],
36
+ },
37
+ {
38
+ id: "openai:chat:gpt-5.4",
39
+ label: "GPT 5.4",
40
+ config: {
41
+ reasoning_effort: "medium",
42
+ max_output_tokens: 4096,
43
+ maxRetries: 1,
44
+ },
45
+ modes: ["baseline", "observed", "agentic-naive", "agentic-optimized"],
46
+ },
47
+
48
+ // ── Disabled models (uncomment to enable) ──────────────────
49
+ // { id: "anthropic:claude-sonnet-4-20250514", label: "Claude Sonnet 4",
50
+ // config: { temperature: 0.2, max_tokens: 4096 }, modes: ["baseline"] },
51
+ // { id: "anthropic:claude-3-5-sonnet-20241022", label: "Claude 3.5 Sonnet",
52
+ // config: { temperature: 0.2, max_tokens: 4096 },
53
+ // modes: ["baseline", "agentic-naive", "agentic-optimized"] },
54
+ // { id: "google:gemini-2.0-flash", label: "Gemini 2.0 Flash",
55
+ // config: { temperature: 0.2, max_tokens: 4096 }, modes: ["baseline"] },
56
+ // { id: "openrouter:deepseek/deepseek-r1", label: "DeepSeek R1",
57
+ // config: { temperature: 0.2, max_tokens: 4096 }, modes: ["baseline"] },
58
+ ],
59
+
60
+ // ── Grading Model ──────────────────────────────────────────
61
+ // Which model scores the responses. Separate from the models being tested.
62
+ grader: {
63
+ id: "anthropic:messages:claude-opus-4-5-20251101",
64
+ label: "Claude Opus 4.5 (grader)",
65
+ },
66
+
67
+ // ── Evaluation Options ─────────────────────────────────────
68
+ maxConcurrency: 32, // max parallel API calls — benchmarked in DOC-1896
69
+
70
+ // ── Default Config ─────────────────────────────────────────
71
+ // Applied to all models unless overridden per-model.
72
+ defaults: {
73
+ temperature: 0.2,
74
+ max_tokens: 4096,
75
+ maxToolRounds: 5, // for agentic modes
76
+ observerOptions: {
77
+ maxPreviewBytes: 2048,
78
+ captureResponsePreview: true,
79
+ includePatterns: ["sanity.io", "sanity.dev", "cdn.sanity.io"],
80
+ sensitiveHeaders: ["authorization", "cookie", "x-api-key"],
81
+ },
82
+ },
83
+ })
@@ -0,0 +1,16 @@
1
+ /**
2
+ * prompts.ts — User-override prompt templates.
3
+ *
4
+ * Canonical literacy prompt templates now live in the literacy mode handler:
5
+ * src/pipeline/compiler/mode-handlers/literacy-handler.ts
6
+ *
7
+ * Each mode handler owns its own prompts via getPrompts(). This file exists
8
+ * for user-level overrides only. Add entries here to replace handler-owned
9
+ * defaults for specific prompt IDs.
10
+ *
11
+ * @see packages/eval/src/pipeline/compiler/mode-handlers/literacy-handler.ts
12
+ */
13
+
14
+ import { definePrompts } from "@sanity/ailf-core"
15
+
16
+ export default definePrompts([])
@@ -0,0 +1,225 @@
1
+ /**
2
+ * rubrics.ts — Centralized rubric templates for LLM grading assertions.
3
+ *
4
+ * Tasks reference these templates by key and provide only their unique
5
+ * criteria bullet points. The pipeline assembles the full rubric text
6
+ * at expansion time.
7
+ *
8
+ * @see docs/design-docs/structured-dimensions.md
9
+ * @see docs/design-docs/uniform-dimension-scoring.md
10
+ */
11
+
12
+ import { defineRubrics } from "@sanity/ailf-core"
13
+
14
+ export default defineRubrics({
15
+ templates: {
16
+ // ── Core literacy dimensions ────────────────────────────
17
+ "task-completion": {
18
+ dimension: "task-completion",
19
+ header: "Score task completion from 0 to 100:",
20
+ scale: [
21
+ "0: Couldn't attempt — missing critical information",
22
+ "20: Attempted but fundamentally wrong approach",
23
+ "50: Partial implementation — major functional gaps",
24
+ "80: Mostly complete — minor issues or missing edge cases",
25
+ "100: Fully functional code — works as expected",
26
+ ],
27
+ criteria_label: "Must demonstrate:",
28
+ },
29
+ "code-correctness": {
30
+ dimension: "code-correctness",
31
+ header: "Score code correctness from 0 to 100:",
32
+ scale: [
33
+ "0: Broken code, syntax errors, or deprecated APIs",
34
+ "30: Works but uses anti-patterns or inefficient approaches",
35
+ "50: Works but not idiomatic",
36
+ "80: Follows most best practices",
37
+ "100: Follows all best practices, idiomatic implementation",
38
+ ],
39
+ criteria_label: "Check for:",
40
+ },
41
+ "doc-coverage": {
42
+ dimension: "doc-coverage",
43
+ header: "Score documentation coverage from 0 to 100:",
44
+ scale: [
45
+ "0: Had to hallucinate/guess most implementation details",
46
+ "30: Significant gaps — filled with assumptions",
47
+ "50: Some gaps — inferred from partial information",
48
+ "80: Minor gaps — almost everything was documented",
49
+ "100: Complete coverage — all necessary info was in docs",
50
+ ],
51
+ },
52
+
53
+ // ── MCP server dimensions ───────────────────────────────
54
+ "mcp-input-validation": {
55
+ dimension: "input-validation",
56
+ header: "Score MCP tool input correctness from 0 to 100:",
57
+ scale: [
58
+ "0: Completely wrong tool inputs — missing or invalid parameters",
59
+ "25: Attempted correct inputs but with significant errors",
60
+ "50: Partially correct — some required fields present, others wrong",
61
+ "75: Mostly correct — minor issues with optional parameters",
62
+ "100: Perfect tool inputs — all parameters correct and well-formed",
63
+ ],
64
+ criteria_label: "Evaluate:",
65
+ },
66
+ "mcp-output-correctness": {
67
+ dimension: "output-correctness",
68
+ header: "Score MCP tool output handling from 0 to 100:",
69
+ scale: [
70
+ "0: Completely failed to use tool outputs",
71
+ "25: Used outputs but misinterpreted the data",
72
+ "50: Partially correct interpretation of tool outputs",
73
+ "75: Mostly correct — minor misuse of returned data",
74
+ "100: Perfect output handling — correctly interpreted all tool responses",
75
+ ],
76
+ criteria_label: "Check for:",
77
+ },
78
+ "mcp-error-handling": {
79
+ dimension: "error-handling",
80
+ header: "Score MCP error handling from 0 to 100:",
81
+ scale: [
82
+ "0: Crashed or ignored errors entirely",
83
+ "25: Acknowledged error but took wrong corrective action",
84
+ "50: Partial error handling — recovered from some errors",
85
+ "75: Good error handling — recovered gracefully with minor issues",
86
+ "100: Excellent — handled all errors appropriately with clear messaging",
87
+ ],
88
+ criteria_label: "Evaluate:",
89
+ },
90
+ "mcp-security": {
91
+ dimension: "security",
92
+ header: "Score MCP security compliance from 0 to 100:",
93
+ scale: [
94
+ "0: Attempted unauthorized operations or leaked sensitive data",
95
+ "25: Made questionable tool calls outside the expected scope",
96
+ "50: Stayed within scope but with minor security concerns",
97
+ "75: Good security — no unauthorized access, minor concerns",
98
+ "100: Perfect security — only used authorized tools with safe inputs",
99
+ ],
100
+ criteria_label: "Check for:",
101
+ },
102
+
103
+ // ── Knowledge probe dimensions ──────────────────────────
104
+ "factual-correctness": {
105
+ dimension: "factual-correctness",
106
+ header: "Score factual correctness from 0 to 100:",
107
+ scale: [
108
+ "0: Completely wrong — fabricated facts, hallucinated APIs, incorrect syntax",
109
+ "25: Mostly wrong — some real concepts but significant factual errors",
110
+ "50: Mixed — some correct facts alongside notable inaccuracies",
111
+ "75: Mostly correct — minor inaccuracies or imprecisions",
112
+ "100: Fully correct — all statements are accurate and verifiable",
113
+ ],
114
+ criteria_label: "Verify:",
115
+ },
116
+ completeness: {
117
+ dimension: "completeness",
118
+ header: "Score knowledge completeness from 0 to 100:",
119
+ scale: [
120
+ "0: Superficial — only knows the name, no substantive knowledge",
121
+ "25: Minimal — knows basic concepts but misses most important details",
122
+ "50: Partial — covers some key aspects but has significant gaps",
123
+ "75: Good coverage — covers most key aspects with minor gaps",
124
+ "100: Comprehensive — thorough coverage of all important aspects",
125
+ ],
126
+ criteria_label: "Check coverage of:",
127
+ },
128
+ currency: {
129
+ dimension: "currency",
130
+ header: "Score knowledge currency (up-to-dateness) from 0 to 100:",
131
+ scale: [
132
+ "0: Severely outdated — references deprecated APIs or removed features",
133
+ "25: Mostly outdated — aware of old version but not recent changes",
134
+ "50: Partially current — knows some recent changes but misses others",
135
+ "75: Mostly current — knows recent API but minor gaps on latest features",
136
+ "100: Fully current — references latest APIs, patterns, and best practices",
137
+ ],
138
+ criteria_label: "Check for:",
139
+ },
140
+
141
+ // ── Agent harness dimensions ────────────────────────────
142
+ "process-quality": {
143
+ dimension: "process-quality",
144
+ header:
145
+ "Score the agent's process quality from 0 to 100 (advisory — does not gate pass/fail):",
146
+ scale: [
147
+ "0: Chaotic process — random tool calls, no planning, no error handling",
148
+ "25: Poor process — some structure but significant inefficiencies",
149
+ "50: Adequate process — gets the job done but not efficiently",
150
+ "75: Good process — reads before writing, handles errors, incremental changes",
151
+ "100: Excellent process — optimal tool usage, clear planning, graceful recovery",
152
+ ],
153
+ criteria_label: "Evaluate:",
154
+ },
155
+ "agent-output": {
156
+ dimension: "agent-output",
157
+ header: "Score the agent's final output from 0 to 100:",
158
+ scale: [
159
+ "0: No useful output produced — task completely failed",
160
+ "25: Partial output — attempted but with fundamental errors",
161
+ "50: Usable output — works but with significant issues",
162
+ "75: Good output — mostly correct with minor issues",
163
+ "100: Excellent output — fully correct, clean, and complete",
164
+ ],
165
+ criteria_label: "Check for:",
166
+ },
167
+ "agent-tool-usage": {
168
+ dimension: "tool-usage",
169
+ header: "Score the agent's tool usage from 0 to 100:",
170
+ scale: [
171
+ "0: Completely wrong tool usage — called wrong tools or with invalid inputs",
172
+ "25: Poor tool usage — correct tools but wrong parameters or sequencing",
173
+ "50: Adequate — correct tools and basic parameters, some inefficiency",
174
+ "75: Good — efficient tool usage with proper error handling",
175
+ "100: Excellent — optimal tool selection, correct inputs, minimal redundancy",
176
+ ],
177
+ criteria_label: "Evaluate:",
178
+ },
179
+ },
180
+
181
+ // ── Named scoring profiles ────────────────────────────────
182
+ profiles: {
183
+ default: {
184
+ "task-completion": 0.5,
185
+ "code-correctness": 0.25,
186
+ "doc-coverage": 0.25,
187
+ },
188
+ "output-only": {
189
+ "task-completion": 0.6,
190
+ "code-correctness": 0.4,
191
+ },
192
+ "mcp-behavior": {
193
+ "input-validation": 0.25,
194
+ "output-correctness": 0.35,
195
+ "error-handling": 0.25,
196
+ security: 0.15,
197
+ },
198
+ "knowledge-probe": {
199
+ "factual-correctness": 0.45,
200
+ completeness: 0.35,
201
+ currency: 0.2,
202
+ },
203
+ "agent-harness": {
204
+ "agent-output": 0.45,
205
+ "tool-usage": 0.4,
206
+ "process-quality": 0.15,
207
+ },
208
+ },
209
+
210
+ // ── Mode-to-profile bindings ──────────────────────────────
211
+ // Literacy mode uses variant sub-keys because 'baseline' and 'agentic'
212
+ // are scoring variants within the same canonical mode, not separate modes.
213
+ "mode-profiles": {
214
+ "literacy": {
215
+ baseline: { gold: "default", baseline: "output-only" },
216
+ agentic: { gold: "default" },
217
+ },
218
+ "mcp-server": { gold: "mcp-behavior" },
219
+ "knowledge-probe": { gold: "knowledge-probe" },
220
+ "agent-harness": { gold: "agent-harness" },
221
+ },
222
+
223
+ footer:
224
+ 'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
225
+ })
@@ -0,0 +1,47 @@
1
+ /**
2
+ * schedules.ts — Scheduled evaluation configuration.
3
+ *
4
+ * Each schedule defines a recurring pipeline run with its own source,
5
+ * mode, and delivery preferences. The GitHub Actions cron workflow reads
6
+ * this config to determine which evaluations to run and when.
7
+ *
8
+ * Cron expressions use UTC timezone (GitHub Actions standard).
9
+ *
10
+ * @see docs/design-docs/report-store/implementation.md — Phase 5
11
+ */
12
+
13
+ import { defineSchedules, env } from "@sanity/ailf-core"
14
+
15
+ export default defineSchedules({
16
+ schedules: [
17
+ // Daily baseline — track score trends against production docs
18
+ {
19
+ name: "daily-baseline",
20
+ cron: "0 2 * * *", // 2:00 AM UTC, every day
21
+ mode: "baseline",
22
+ source: "production",
23
+ publish: true,
24
+ compare: true,
25
+ enabled: true,
26
+ },
27
+
28
+ // Weekly full decomposition — complete floor/ceiling/actual report
29
+ {
30
+ name: "weekly-full",
31
+ cron: "0 3 * * 0", // 3:00 AM UTC, every Sunday
32
+ mode: "full",
33
+ source: "production",
34
+ publish: true,
35
+ compare: true,
36
+ enabled: true,
37
+ },
38
+ ],
39
+
40
+ // Digest — aggregates reports into periodic summaries
41
+ digest: {
42
+ enabled: true,
43
+ cron: "0 9 * * 1", // 9:00 AM UTC, every Monday
44
+ lookbackDays: 7,
45
+ slackWebhookUrl: env("SLACK_WEBHOOK_URL", ""),
46
+ },
47
+ })
@@ -0,0 +1,37 @@
1
+ /**
2
+ * sinks.ts — Report delivery sink configuration.
3
+ *
4
+ * Sinks receive published evaluation reports and deliver them to external
5
+ * systems (BigQuery, Slack, GitHub, webhooks, etc.).
6
+ *
7
+ * Sinks are fire-and-forget: a sink failure is logged but never blocks
8
+ * the pipeline. The Sanity Content Lake is the system of record.
9
+ *
10
+ * Sinks activate only when their required environment variables are present.
11
+ * A developer running locally with no env vars gets zero sinks.
12
+ *
13
+ * @see docs/design-docs/report-store/sink-architecture.md
14
+ */
15
+
16
+ import { defineSinks } from "@sanity/ailf-core"
17
+
18
+ export default defineSinks({
19
+ sinks: [
20
+ // No sinks are currently active: every entry below is commented out, so this array is empty.
21
+ // Uncomment and configure as needed (the examples call env(), so also import `env` from "@sanity/ailf-core"):
22
+ // BigQuery — disabled; Airbyte ELT is the primary delivery mechanism
23
+ // { type: "bigquery", enabled: false, project: env("BIGQUERY_PROJECT", "data-platform-302218"),
24
+ // dataset: env("BIGQUERY_DATASET", "ailf"), credentials: env("GOOGLE_APPLICATION_CREDENTIALS", "") },
25
+ // Slack — regression alerts to configured channels
26
+ // { type: "slack", enabled: true, webhookUrl: env("SLACK_WEBHOOK_URL", ""),
27
+ // channel: "#docs-ai-literacy",
28
+ // routing: { critical: "#docs-alerts", warning: "#docs-team",
29
+ // regression: "#docs-team", digest: "#docs-weekly" } },
30
+ // GitHub PR comments
31
+ // { type: "github-comment", enabled: false, token: env("GITHUB_TOKEN", "") },
32
+ // Webhook — generic HTTP relay
33
+ // { type: "webhook", enabled: false, url: env("AILF_WEBHOOK_URL", ""),
34
+ // headers: { Authorization: `Bearer ${env("AILF_WEBHOOK_TOKEN", "")}` },
35
+ // routing: { critical: true } },
36
+ ],
37
+ })
@@ -0,0 +1,21 @@
1
+ /**
2
+ * sources.ts — Documentation source definitions for AILF evaluations.
3
+ *
4
+ * Default sources (production, branch, local) are provided by the
5
+ * sanity-literacy preset registered in the composition root. This file
6
+ * exists as an override point — any sources defined here are merged with
7
+ * (and take precedence over) preset-provided sources.
8
+ *
9
+ * To add a custom source, define it here:
10
+ *
11
+ * export default defineSources([
12
+ * { name: "my-docs", baseUrl: "https://docs.example.com", ... },
13
+ * ])
14
+ *
15
+ * @see packages/eval/src/pipeline/compiler/presets/sanity-literacy.ts
16
+ * @see docs/exec-plans/dynamic-doc-sources.md
17
+ */
18
+
19
+ import { defineSources } from "@sanity/ailf-core"
20
+
21
+ export default defineSources([])
@@ -0,0 +1,61 @@
1
+ /**
2
+ * thresholds.ts — Quality thresholds for readiness gates and regression alerts.
3
+ *
4
+ * Used by:
5
+ * - `npx @sanity/ailf pipeline --readiness` (launch readiness checklist)
6
+ * - `npx @sanity/ailf pipeline --publish` (severity-aware sink routing)
7
+ * - `npx @sanity/ailf pipeline --compare` (regression alerting)
8
+ *
9
+ * @see docs/exec-plans/scenario-matrix-implementation/phase-5-readiness-thresholds.md
10
+ */
11
+
12
+ import { defineThresholds } from "@sanity/ailf-core"
13
+
14
+ export default defineThresholds({
15
+ // Global defaults (apply to all areas unless overridden)
16
+ defaults: {
17
+ composite: 50,
18
+ dimensions: {
19
+ "task-completion": 40,
20
+ "code-correctness": 30,
21
+ "doc-coverage": 30,
22
+ },
23
+ "doc-lift": 0, // docs must not hurt
24
+ ceiling: 40, // minimum ceiling score (doc quality floor)
25
+ },
26
+
27
+ // Per-area overrides (inherit from defaults, override specific values)
28
+ areas: {
29
+ groq: {
30
+ composite: 60, // GROQ is critical — higher bar
31
+ dimensions: {
32
+ "task-completion": 50,
33
+ },
34
+ },
35
+ // "visual-editing": {
36
+ // composite: 45, // currently at 36, set achievable near-term target
37
+ // },
38
+ },
39
+
40
+ // Regression thresholds (for comparison reports)
41
+ regression: {
42
+ composite: -3, // alert if composite drops more than 3 points
43
+ "per-area": -5, // alert if any area drops more than 5 points
44
+ "per-dimension": -8, // alert if any dimension drops more than 8 points
45
+ },
46
+
47
+ // Severity classification
48
+ severity: {
49
+ critical: {
50
+ "composite-below": 30,
51
+ "negative-doc-lift": true,
52
+ },
53
+ warning: {
54
+ "composite-below": 50,
55
+ "regression-exceeds": -3,
56
+ },
57
+ info: {
58
+ "composite-below": 60,
59
+ },
60
+ },
61
+ })