@sanity/ailf 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288)
  1. package/config/features.ts +23 -0
  2. package/config/models.ts +83 -0
  3. package/config/prompts.ts +16 -0
  4. package/config/rubrics.ts +225 -0
  5. package/config/schedules.ts +47 -0
  6. package/config/sinks.ts +37 -0
  7. package/config/sources.ts +21 -0
  8. package/config/thresholds.ts +61 -0
  9. package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
  10. package/dist/_vendor/ailf-core/config-helpers.js +150 -0
  11. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  12. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  13. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  14. package/dist/_vendor/ailf-core/index.js +5 -0
  15. package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
  16. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  17. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  18. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  19. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  20. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  21. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  22. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
  23. package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
  24. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
  25. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
  26. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -29
  27. package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -8
  28. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  29. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  30. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  31. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  32. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  33. package/dist/_vendor/ailf-core/services/index.js +2 -1
  34. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  35. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  36. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  37. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  38. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  39. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  40. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  41. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  42. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
  43. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  44. package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
  45. package/dist/_vendor/ailf-core/types/index.js +8 -1
  46. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
  47. package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
  48. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  49. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  50. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  51. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  52. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  53. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  54. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  55. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  56. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  57. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  58. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  59. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  60. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  61. package/dist/_vendor/ailf-shared/index.js +0 -1
  62. package/dist/adapters/api-client/build-request.js +14 -13
  63. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  64. package/dist/adapters/config-sources/file-config-adapter.js +38 -12
  65. package/dist/adapters/config-sources/index.d.ts +2 -0
  66. package/dist/adapters/config-sources/index.js +1 -0
  67. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  68. package/dist/adapters/config-sources/ts-config-loader.js +133 -0
  69. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  70. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  71. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  72. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  73. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  74. package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
  75. package/dist/adapters/task-sources/index.d.ts +1 -0
  76. package/dist/adapters/task-sources/index.js +1 -0
  77. package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
  78. package/dist/adapters/task-sources/repo-task-source.js +69 -16
  79. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  80. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  81. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  82. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  83. package/dist/cli.js +0 -2
  84. package/dist/commands/baseline.js +4 -1
  85. package/dist/commands/calculate-scores.js +1 -1
  86. package/dist/commands/coverage-audit.js +7 -1
  87. package/dist/commands/explain-handler.js +25 -23
  88. package/dist/commands/fetch-docs.js +3 -2
  89. package/dist/commands/generate-configs.js +1 -1
  90. package/dist/commands/interactive.js +11 -7
  91. package/dist/commands/pipeline-action.d.ts +2 -0
  92. package/dist/commands/pipeline-action.js +16 -6
  93. package/dist/commands/pipeline.d.ts +1 -0
  94. package/dist/commands/pipeline.js +4 -2
  95. package/dist/commands/pr-comment.js +1 -1
  96. package/dist/commands/publish.js +2 -2
  97. package/dist/commands/readiness-report.js +13 -6
  98. package/dist/composition-root.d.ts +1 -1
  99. package/dist/composition-root.js +67 -4
  100. package/dist/orchestration/build-app-context.js +1 -0
  101. package/dist/orchestration/build-step-sequence.js +24 -6
  102. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  103. package/dist/orchestration/steps/fetch-docs-step.js +6 -4
  104. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  105. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  106. package/dist/orchestration/steps/generate-configs-step.js +245 -51
  107. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  108. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  109. package/dist/orchestration/steps/readiness-step.js +5 -6
  110. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  111. package/dist/orchestration/steps/run-eval-step.js +8 -7
  112. package/dist/pipeline/cache.d.ts +1 -1
  113. package/dist/pipeline/cache.js +36 -8
  114. package/dist/pipeline/calculate-scores.d.ts +2 -4
  115. package/dist/pipeline/calculate-scores.js +43 -113
  116. package/dist/pipeline/checks.js +2 -2
  117. package/dist/pipeline/compare.js +8 -8
  118. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  119. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  120. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  121. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  122. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  123. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  124. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  125. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  126. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  127. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
  128. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  129. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  130. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  131. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  132. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  133. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
  134. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  135. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  136. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  137. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  138. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  139. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  140. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  141. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  142. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  143. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  144. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  145. package/dist/pipeline/compiler/config-loader.js +111 -0
  146. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  147. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  148. package/dist/pipeline/compiler/hash.d.ts +11 -0
  149. package/dist/pipeline/compiler/hash.js +18 -0
  150. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  151. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  152. package/dist/pipeline/compiler/index.d.ts +29 -0
  153. package/dist/pipeline/compiler/index.js +45 -0
  154. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  155. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  156. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  157. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  158. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  159. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  160. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  161. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  162. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  163. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  164. package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
  165. package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
  166. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  167. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  168. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  169. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  170. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  171. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
  172. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
  173. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
  174. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  175. package/dist/pipeline/compiler/presets/index.js +8 -0
  176. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
  177. package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
  178. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  179. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  180. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  181. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  182. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  183. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  184. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  185. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  186. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  187. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  188. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  189. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  190. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  191. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  192. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  193. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  194. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  195. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  196. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  197. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  198. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  199. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  200. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  201. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  202. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  203. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  204. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  205. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  206. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  207. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  208. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  209. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  210. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  211. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  212. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  213. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  214. package/dist/pipeline/coverage-audit.d.ts +15 -5
  215. package/dist/pipeline/coverage-audit.js +41 -22
  216. package/dist/pipeline/eval-constants.d.ts +16 -6
  217. package/dist/pipeline/eval-constants.js +25 -4
  218. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  219. package/dist/pipeline/eval-fingerprint.js +8 -9
  220. package/dist/pipeline/expand-tasks.d.ts +19 -10
  221. package/dist/pipeline/expand-tasks.js +34 -28
  222. package/dist/pipeline/gap-analysis.d.ts +1 -1
  223. package/dist/pipeline/gap-analysis.js +2 -2
  224. package/dist/pipeline/generate-configs.d.ts +22 -4
  225. package/dist/pipeline/generate-configs.js +53 -24
  226. package/dist/pipeline/grader-api.d.ts +3 -3
  227. package/dist/pipeline/grader-api.js +5 -12
  228. package/dist/pipeline/grader-compare-runner.js +20 -27
  229. package/dist/pipeline/grader-comparison.d.ts +4 -8
  230. package/dist/pipeline/grader-comparison.js +11 -17
  231. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  232. package/dist/pipeline/grader-consistency-runner.js +16 -20
  233. package/dist/pipeline/grader-consistency.d.ts +6 -10
  234. package/dist/pipeline/grader-consistency.js +13 -32
  235. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  236. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  237. package/dist/pipeline/grader-sensitivity.js +10 -10
  238. package/dist/pipeline/grader-validate-runner.js +7 -5
  239. package/dist/pipeline/grader-validation.d.ts +2 -6
  240. package/dist/pipeline/grader-validation.js +14 -22
  241. package/dist/pipeline/map-request-to-config.js +6 -1
  242. package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
  243. package/dist/pipeline/mirror-repo-tasks.js +16 -15
  244. package/dist/pipeline/normalize-mode.d.ts +49 -0
  245. package/dist/pipeline/normalize-mode.js +64 -0
  246. package/dist/pipeline/plan.d.ts +5 -2
  247. package/dist/pipeline/plan.js +134 -78
  248. package/dist/pipeline/pr-comment.js +2 -0
  249. package/dist/pipeline/profile-resolution.d.ts +22 -14
  250. package/dist/pipeline/profile-resolution.js +41 -19
  251. package/dist/pipeline/provenance.d.ts +2 -2
  252. package/dist/pipeline/provenance.js +12 -17
  253. package/dist/pipeline/release-report.js +4 -4
  254. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  255. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  256. package/dist/pipeline/rubric-loader.d.ts +20 -0
  257. package/dist/pipeline/rubric-loader.js +37 -0
  258. package/dist/pipeline/validate.d.ts +4 -4
  259. package/dist/pipeline/validate.js +64 -53
  260. package/dist/schedules/loader.js +18 -8
  261. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  262. package/dist/scripts/migrate-task-mode.js +85 -0
  263. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  264. package/dist/scripts/validate-task-sources.d.ts +1 -1
  265. package/dist/scripts/validate-task-sources.js +15 -15
  266. package/dist/sinks/loader.js +5 -7
  267. package/dist/sources.d.ts +7 -7
  268. package/dist/sources.js +22 -24
  269. package/dist/webhook/dispatch.js +2 -1
  270. package/package.json +6 -3
  271. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  272. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  273. package/tasks/literacy/frameworks.task.ts +128 -0
  274. package/tasks/literacy/functions.task.ts +69 -0
  275. package/tasks/literacy/groq.task.ts +258 -0
  276. package/tasks/literacy/nextjs-live.task.ts +75 -0
  277. package/tasks/literacy/studio-setup.task.ts +131 -0
  278. package/tasks/literacy/visual-editing.task.ts +146 -0
  279. package/config/features.yaml +0 -116
  280. package/config/models.yaml +0 -116
  281. package/config/prompts.yaml +0 -75
  282. package/config/rubrics.yaml +0 -81
  283. package/config/schedules.yaml +0 -43
  284. package/config/sinks.yaml +0 -54
  285. package/config/sources.yaml +0 -51
  286. package/config/thresholds.yaml +0 -49
  287. package/dist/agent-observer/test-imports.d.ts +0 -7
  288. package/dist/agent-observer/test-imports.js +0 -185
package/config/features.yaml DELETED
@@ -1,116 +0,0 @@
1
- # features.yaml
2
- #
3
- # Product feature registry for documentation coverage auditing.
4
- # Each entry represents a product feature that should have evaluation
5
- # task coverage. Edit this file to track coverage gaps and priorities.
6
- #
7
- # The coverage audit (`pnpm coverage-audit`) cross-references this
8
- # registry against tasks/*.yaml to identify covered and uncovered features.
9
- #
10
- # Status values:
11
- # covered — has evaluation tasks in tasks/*.yaml
12
- # uncovered — no evaluation tasks yet
13
- # planned — tasks are planned but not yet written
14
- # out-of-scope — intentionally excluded from evaluation
15
- #
16
- # Phase 3c of the Scenario Matrix implementation.
17
- # See docs/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
18
-
19
- features:
20
- # === Currently covered (have evaluation tasks) ===
21
- - id: groq
22
- name: "GROQ Query Language"
23
- sections: [content-lake]
24
- status: covered
25
- area: groq
26
- priority: critical
27
- taskCount: 3
28
-
29
- - id: visual-editing
30
- name: "Visual Editing"
31
- sections: [visual-editing]
32
- status: covered
33
- area: visual-editing
34
- priority: critical
35
- taskCount: 1
36
-
37
- - id: nextjs-live
38
- name: "Next.js Live Preview"
39
- sections: [visual-editing]
40
- status: covered
41
- area: nextjs-live
42
- priority: high
43
- taskCount: 2
44
-
45
- - id: functions
46
- name: "Sanity Functions"
47
- sections: [compute-and-ai]
48
- status: covered
49
- area: functions
50
- priority: high
51
- taskCount: 2
52
-
53
- - id: studio-setup
54
- name: "Studio Configuration"
55
- sections: [studio]
56
- status: covered
57
- area: studio-setup
58
- priority: high
59
- taskCount: 1
60
-
61
- - id: frameworks
62
- name: "Framework Integration"
63
- sections: [developer-guides]
64
- status: covered
65
- area: frameworks
66
- priority: high
67
- taskCount: 2
68
-
69
- # === Uncovered (no evaluation tasks yet) ===
70
- - id: portable-text
71
- name: "Portable Text"
72
- sections: [content-lake, studio]
73
- status: uncovered
74
- priority: high
75
-
76
- - id: image-assets
77
- name: "Image & Asset Handling"
78
- sections: [content-lake, apis-and-sdks]
79
- status: uncovered
80
- priority: high
81
-
82
- - id: mutations
83
- name: "Mutations & Transactions"
84
- sections: [content-lake, apis-and-sdks]
85
- status: uncovered
86
- priority: high
87
-
88
- - id: schemas
89
- name: "Schema Types & Validation"
90
- sections: [studio, content-lake]
91
- status: uncovered
92
- priority: medium
93
-
94
- - id: authentication
95
- name: "Authentication & Access Control"
96
- sections: [apis-and-sdks]
97
- status: uncovered
98
- priority: medium
99
-
100
- - id: webhooks
101
- name: "Webhooks"
102
- sections: [content-lake]
103
- status: uncovered
104
- priority: medium
105
-
106
- - id: realtime
107
- name: "Real-time Listeners"
108
- sections: [apis-and-sdks]
109
- status: uncovered
110
- priority: low
111
-
112
- - id: ai-assist
113
- name: "AI Assist"
114
- sections: [compute-and-ai, studio]
115
- status: uncovered
116
- priority: medium
package/config/models.yaml DELETED
@@ -1,116 +0,0 @@
1
- # models.yaml
2
- #
3
- # Central model registry for ai-literacy-framework evaluations.
4
- #
5
- # Define all models you want to test here. Each eval mode (baseline, observed,
6
- # agentic) reads this file and generates the appropriate provider entries.
7
- #
8
- # Usage:
9
- # 1. Add/remove models below
10
- # 2. Run: pnpm generate-configs
11
- # 3. Run your eval: pnpm eval / pnpm eval:observed / pnpm eval:agentic
12
- #
13
- # Model entries support:
14
- # - id: Promptfoo provider identifier (e.g., "openai:gpt-4o")
15
- # - label: Human-readable label for results display
16
- # - config: Model-specific config (temperature, max_tokens, etc.)
17
- # - modes: Which eval modes to include this model in (default: all)
18
- # Options: baseline, observed, agentic-naive, agentic-optimized
19
- # - env: Environment variable name for the API key (auto-detected for
20
- # openai:* and anthropic:* providers)
21
- #
22
- # The generator script expands each model into the correct provider format
23
- # for each eval mode. Custom providers (observed, agentic) are wired up
24
- # automatically — you just specify the model name.
25
-
26
- models:
27
- # -- Anthropic
28
- - id: anthropic:messages:claude-opus-4-6
29
- label: Claude Opus 4.6
30
- config:
31
- temperature: 0.2
32
- max_tokens: 4096
33
- modes: [baseline, observed, agentic-naive, agentic-optimized]
34
- # -- Google
35
- # - id: google:gemini-2.5-pro
36
- # label: Gemini 2.5 Pro
37
- # config:
38
- # temperature: 0.2
39
- # max_tokens: 4096
40
- # modes: [baseline, observed, agentic-naive, agentic-optimized]
41
- # -- OpenAI
42
- - id: openai:chat:gpt-5.2
43
- label: GPT 5.2
44
- config:
45
- temperature: 0.2
46
- max_tokens: 4096
47
- modes: [baseline, observed, agentic-naive, agentic-optimized]
48
- - id: openai:chat:gpt-5.4
49
- label: GPT 5.4
50
- config:
51
- reasoning_effort: "medium"
52
- max_output_tokens: 4096
53
- maxRetries: 1
54
- modes: [baseline, observed, agentic-naive, agentic-optimized]
55
-
56
- # ── Anthropic ───────────────────────────────────────────────
57
- # - id: anthropic:claude-sonnet-4-20250514
58
- # label: Claude Sonnet 4
59
- # config:
60
- # temperature: 0.2
61
- # max_tokens: 4096
62
- # modes: [baseline]
63
-
64
- # - id: anthropic:claude-3.5-sonnet-20241022
65
- # label: Claude 3.5 Sonnet
66
- # config:
67
- # temperature: 0.2
68
- # max_tokens: 4096
69
- # modes: [baseline, agentic-naive, agentic-optimized]
70
-
71
- # ── Google ──────────────────────────────────────────────────
72
- # - id: google:gemini-2.0-flash
73
- # label: Gemini 2.0 Flash
74
- # config:
75
- # temperature: 0.2
76
- # max_tokens: 4096
77
- # modes: [baseline]
78
-
79
- # ── Other ───────────────────────────────────────────────────
80
- # - id: openrouter:deepseek/deepseek-r1
81
- # label: DeepSeek R1
82
- # config:
83
- # temperature: 0.2
84
- # max_tokens: 4096
85
- # modes: [baseline]
86
-
87
- # ── Grading Model ───────────────────────────────────────────
88
- # Which model scores the responses. Separate from the models being tested.
89
- grader:
90
- id: anthropic:messages:claude-opus-4-5-20251101
91
- label: Claude Opus 4.5 (grader)
92
- #grader:
93
- # id: openai:gpt-5-2025-08-07
94
- # label: GPT-5 (grader)
95
-
96
- # ── Evaluation Options ──────────────────────────────────────
97
- # Controls how promptfoo runs evaluations.
98
- maxConcurrency: 32 # max parallel API calls — benchmarked in DOC-1896
99
-
100
- # ── Default Config ──────────────────────────────────────────
101
- # Applied to all models unless overridden per-model.
102
- defaults:
103
- temperature: 0.2
104
- max_tokens: 4096
105
- maxToolRounds: 5 # for agentic modes
106
- observerOptions:
107
- maxPreviewBytes: 2048
108
- captureResponsePreview: true
109
- includePatterns:
110
- - "sanity.io"
111
- - "sanity.dev"
112
- - "cdn.sanity.io"
113
- sensitiveHeaders:
114
- - "authorization"
115
- - "cookie"
116
- - "x-api-key"
package/config/prompts.yaml DELETED
@@ -1,75 +0,0 @@
1
- # prompts.yaml
2
- #
3
- # Prompt templates used across all evaluation modes.
4
- # Edit these to change what instructions the LLM receives.
5
- #
6
- # Available template variables (injected from task vars):
7
- # {{task}} — the implementation task description
8
- # {{docs}} — documentation context (empty string for baseline tests)
9
- #
10
- # Each prompt has:
11
- # id: unique identifier (used in Promptfoo config)
12
- # label: human-readable name (shown in results)
13
- # template: the actual prompt text with {{variable}} placeholders
14
-
15
- with-docs:
16
- id: with-docs
17
- label: With Documentation
18
- template: |
19
- You are an expert Sanity.io developer. Use the following documentation to help implement the task.
20
-
21
- ## Sanity Documentation
22
- {{docs}}
23
-
24
- ## Task
25
- {{task}}
26
-
27
- ## Requirements
28
-
29
- 1. Use ONLY the APIs and patterns shown in the documentation
30
- 2. Provide a complete, working implementation
31
- 3. Include all necessary imports
32
- 4. Follow Sanity best practices as documented
33
-
34
- Provide your implementation:
35
-
36
- without-docs:
37
- id: without-docs
38
- label: Baseline (No Docs)
39
- template: |
40
- You are an expert Sanity.io developer.
41
-
42
- ## Task
43
- {{task}}
44
-
45
- ## Requirements
46
-
47
- 1. Provide a complete, working implementation
48
- 2. Include all necessary imports
49
- 3. Follow Sanity best practices
50
-
51
- Provide your implementation:
52
-
53
- agentic:
54
- id: agentic
55
- label: Agentic (self-retrieval)
56
- template: |
57
- You are an expert developer helping implement a Sanity.io feature.
58
- You have access to web search and page fetching tools.
59
-
60
- IMPORTANT: Before writing any code, search for and read the relevant
61
- Sanity.io documentation to ensure you are using the latest APIs and
62
- best practices. Do not rely on memory alone.
63
-
64
- ## Task
65
- {{task}}
66
-
67
- ## Requirements
68
-
69
- 1. Search for relevant Sanity documentation before implementing
70
- 2. Use ONLY the APIs and patterns from the current official docs
71
- 3. Provide a complete, working implementation
72
- 4. Include all necessary imports
73
- 5. Follow Sanity best practices as documented
74
-
75
- Provide your implementation:
package/config/rubrics.yaml DELETED
@@ -1,81 +0,0 @@
1
- # rubrics.yaml
2
- #
3
- # Centralized rubric templates for LLM grading assertions.
4
- # Tasks reference these templates by key and provide only their
5
- # unique criteria bullet points. The pipeline assembles the full
6
- # rubric text at expansion time.
7
- #
8
- # Each dimension is scored on a uniform 0–100 scale. Dimensions are
9
- # combined into a composite score using named scoring profiles below.
10
- #
11
- # Each template carries a `dimension` field that tags the scoring
12
- # dimension it belongs to. This metadata propagates through the
13
- # expansion pipeline into Promptfoo assertion metadata, enabling
14
- # structured dimension classification at scoring time instead of
15
- # fragile heuristic string matching.
16
- # See docs/design-docs/structured-dimensions.md for the design.
17
- #
18
- # See docs/exec-plans/rubric-templates.md for the design.
19
- # See docs/design-docs/uniform-dimension-scoring.md for the scoring model.
20
-
21
- templates:
22
- task-completion:
23
- dimension: task-completion
24
- header: "Score task completion from 0 to 100:"
25
- scale:
26
- - "0: Couldn't attempt — missing critical information"
27
- - "20: Attempted but fundamentally wrong approach"
28
- - "50: Partial implementation — major functional gaps"
29
- - "80: Mostly complete — minor issues or missing edge cases"
30
- - "100: Fully functional code — works as expected"
31
- criteria_label: "Must demonstrate:"
32
-
33
- code-correctness:
34
- dimension: code-correctness
35
- header: "Score code correctness from 0 to 100:"
36
- scale:
37
- - "0: Broken code, syntax errors, or deprecated APIs"
38
- - "30: Works but uses anti-patterns or inefficient approaches"
39
- - "50: Works but not idiomatic"
40
- - "80: Follows most best practices"
41
- - "100: Follows all best practices, idiomatic implementation"
42
- criteria_label: "Check for:"
43
-
44
- doc-coverage:
45
- dimension: doc-coverage
46
- header: "Score documentation coverage from 0 to 100:"
47
- scale:
48
- - "0: Had to hallucinate/guess most implementation details"
49
- - "30: Significant gaps — filled with assumptions"
50
- - "50: Some gaps — inferred from partial information"
51
- - "80: Minor gaps — almost everything was documented"
52
- - "100: Complete coverage — all necessary info was in docs"
53
-
54
- # Named scoring profiles — each is a dimension → weight map (must sum to 1.0).
55
- #
56
- # 'default': Full three-dimension composite for gold/ceiling entries (with docs).
57
- # 'output-only': Output quality dimensions only — excludes doc-coverage, which
58
- # is semantically undefined on without-docs entries.
59
- #
60
- # See docs/design-docs/named-scoring-profiles.md for the rationale.
61
- profiles:
62
- default:
63
- task-completion: 0.50
64
- code-correctness: 0.25
65
- doc-coverage: 0.25
66
- output-only:
67
- task-completion: 0.60
68
- code-correctness: 0.40
69
-
70
- # Mode-to-profile bindings — which profile to use for each (mode, variant) pair.
71
- # The scoring engine resolves: mode-profiles.<mode>.<variant> → profile name.
72
- # Falls back to 'default' when no explicit binding exists.
73
- mode-profiles:
74
- baseline:
75
- gold: default
76
- baseline: output-only
77
- agentic:
78
- gold: default
79
-
80
- footer:
81
- 'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}'
@@ -1,43 +0,0 @@
1
- # schedules.yaml
2
- #
3
- # Scheduled evaluation configuration for ai-literacy-framework.
4
- # Each schedule defines a recurring pipeline run with its own source,
5
- # mode, and delivery preferences.
6
- #
7
- # The GitHub Actions cron workflow (.github/workflows/scheduled-eval.yml)
8
- # reads this file to determine which evaluations to run and when.
9
- #
10
- # Schedule names are used as tags on published reports for easy filtering.
11
- # Cron expressions use UTC timezone (GitHub Actions standard).
12
- #
13
- # @see docs/design-docs/report-store/implementation.md — Phase 5
14
-
15
- schedules:
16
- # Daily baseline — track score trends against production docs
17
- - name: daily-baseline
18
- cron: "0 2 * * *" # 2:00 AM UTC, every day
19
- mode: baseline
20
- source: production
21
- publish: true
22
- compare: true
23
- enabled: true
24
-
25
- # Weekly full decomposition — complete floor/ceiling/actual report
26
- # Runs both baseline + agentic in a single pipeline invocation,
27
- # producing the three-layer decomposition with retrieval gap metrics.
28
- # Replaces the previous weekly-full (baseline) + weekly-agentic schedule pair.
29
- - name: weekly-full
30
- cron: "0 3 * * 0" # 3:00 AM UTC, every Sunday
31
- mode: full
32
- source: production
33
- publish: true
34
- compare: true
35
- enabled: true
36
-
37
- # Digest configuration — aggregates reports into periodic summaries
38
- digest:
39
- # Weekly digest — summarize score trends from the past 7 days
40
- enabled: true
41
- cron: "0 9 * * 1" # 9:00 AM UTC, every Monday
42
- lookbackDays: 7
43
- slackWebhookUrl: ${{ SLACK_WEBHOOK_URL }}
package/config/sinks.yaml DELETED
@@ -1,54 +0,0 @@
1
- # sinks.yaml
2
- #
3
- # Report delivery sink configuration for ai-literacy-framework.
4
- # Sinks receive published evaluation reports and deliver them to external
5
- # systems (BigQuery, Slack, GitHub, webhooks, etc.).
6
- #
7
- # Sinks are fire-and-forget (P6): a sink failure is logged but never blocks
8
- # the pipeline. The Sanity Content Lake is the system of record.
9
- #
10
- # Environment variables use ${{ VAR }} or ${{ VAR | default }} syntax.
11
- # A sink with `enabled: false` (or a falsy env var) is skipped entirely.
12
- #
13
- # Sinks activate only when their required environment variables are present.
14
- # A developer running locally with no env vars gets zero sinks.
15
- #
16
- # @see docs/design-docs/report-store/sink-architecture.md
17
-
18
- sinks:
19
- # BigQuery — disabled; Airbyte ELT is the primary BigQuery delivery mechanism.
20
- # The BigQuerySink can be re-enabled as a fallback if Airbyte is unavailable.
21
- # See config/airbyte/ for the active connector and config/bigquery/ for views.
22
- # - type: bigquery
23
- # enabled: false
24
- # project: ${{ BIGQUERY_PROJECT | data-platform-302218 }}
25
- # dataset: ${{ BIGQUERY_DATASET | ailf }}
26
- # credentials: ${{ GOOGLE_APPLICATION_CREDENTIALS }}
27
-
28
- # Slack — regression alerts to configured channels
29
- # Activates only when SLACK_WEBHOOK_URL is set in the environment.
30
- # By default only posts on regressions (avoids notification fatigue).
31
- # Routing controls which messages go where by severity level.
32
- # - type: slack
33
- # enabled: true
34
- # webhookUrl: ${{ SLACK_WEBHOOK_URL }}
35
- # channel: "#docs-ai-literacy"
36
- # routing:
37
- # critical: "#docs-alerts" # critical violations → dedicated alert channel
38
- # warning: "#docs-team" # warnings → team channel
39
- # regression: "#docs-team" # regressions → team channel
40
- # digest: "#docs-weekly" # weekly digests → broader channel
41
-
42
- # GitHub PR comments — score tables on CI-triggered eval runs
43
- # - type: github-comment
44
- # enabled: false
45
- # token: ${{ GITHUB_TOKEN }}
46
-
47
- # Webhook — generic HTTP relay for Airbyte, Zapier, custom services
48
- # - type: webhook
49
- # enabled: false
50
- # url: ${{ AILF_WEBHOOK_URL }}
51
- # headers:
52
- # Authorization: "Bearer ${{ AILF_WEBHOOK_TOKEN }}"
53
- # routing:
54
- # critical: true # webhooks fire on critical only
@@ -1,51 +0,0 @@
1
- # sources.yaml
2
- #
3
- # Documentation source definitions for ai-literacy-framework evaluations.
4
- # Each source defines where to find documentation for both evaluation modes:
5
- #
6
- # - Baseline/Observed: reads from Sanity CMS (projectId + dataset)
7
- # - Agentic: reads from a live URL (baseUrl, llms.txt derived automatically)
8
- #
9
- # Environment variables use ${{ VAR }} or ${{ VAR | default }} syntax.
10
- # Required vars (no default) will error if not set at load time.
11
- #
12
- # Available fields (all optional except baseUrl, projectId, dataset):
13
- # projectId: Sanity project ID
14
- # dataset: Sanity dataset name
15
- # baseUrl: Documentation site URL
16
- # perspective: Sanity release perspective ID (for evaluating content releases)
17
- # studioOrigin: Sanity Studio base URL (default: https://admin.sanity.io)
18
- # allowedOrigins: Origin allowlist for agentic sandboxing
19
- #
20
- # Fields settable only via CLI flags or env vars (not in this file):
21
- # documentIds: --sanity-document / SANITY_DOCUMENT_IDS
22
- # urls: --url / DOC_BASE_URL
23
- # headers: --header / DOC_HEADERS
24
- #
25
- # Usage:
26
- # pnpm eval --source production
27
- # pnpm eval:agentic --source branch
28
- # DOC_BASE_URL=https://my-branch.sanity.dev/docs pnpm eval:agentic
29
-
30
- sources:
31
- # Production Sanity docs — the default when no --source is specified
32
- production:
33
- projectId: ${{ SANITY_PROJECT_ID | 3do82whm }}
34
- dataset: ${{ SANITY_DATASET | next }}
35
- baseUrl: https://www.sanity.io/docs
36
-
37
- # Branch deploy — for testing doc changes before merge
38
- # Requires DOC_BASE_URL to be set in the environment
39
- # Headers can also be passed via --header flag or DOC_HEADERS env var
40
- branch:
41
- projectId: ${{ SANITY_PROJECT_ID | 3do82whm }}
42
- dataset: ${{ SANITY_DATASET | next }}
43
- baseUrl: ${{ DOC_BASE_URL }}
44
- allowedOrigins:
45
- - ${{ DOC_ALLOWED_ORIGIN | }}
46
-
47
- # Local dev server — serve docs locally for offline testing
48
- local:
49
- projectId: ${{ SANITY_PROJECT_ID | 3do82whm }}
50
- dataset: ${{ SANITY_DATASET | next }}
51
- baseUrl: http://localhost:${{ DOC_LOCAL_PORT | 3001 }}/docs
@@ -1,49 +0,0 @@
1
- # thresholds.yaml
2
- #
3
- # Quality thresholds for readiness gates and regression alerts.
4
- # Each threshold defines a minimum acceptable score. Violations are
5
- # classified by severity and routed to configured sinks.
6
- #
7
- # Used by:
8
- # - `pnpm pipeline --readiness` (launch readiness checklist)
9
- # - `pnpm pipeline --publish` (severity-aware sink routing)
10
- # - `pnpm pipeline --compare` (regression alerting)
11
- #
12
- # @see docs/exec-plans/scenario-matrix-implementation/phase-5-readiness-thresholds.md
13
-
14
- # Global defaults (apply to all areas unless overridden)
15
- defaults:
16
- composite: 50 # minimum composite score
17
- dimensions:
18
- task-completion: 40
19
- code-correctness: 30
20
- doc-coverage: 30
21
- doc-lift: 0 # minimum Doc Lift (0 = docs must not hurt)
22
- ceiling: 40 # minimum ceiling score (doc quality floor)
23
-
24
- # Per-area overrides (inherit from defaults, override specific values)
25
- areas:
26
- groq:
27
- composite: 60 # GROQ is critical — higher bar
28
- dimensions:
29
- task-completion: 50
30
- # visual-editing:
31
- # composite: 45 # currently at 36, set achievable near-term target
32
- # Areas not listed here use defaults
33
-
34
- # Regression thresholds (for comparison reports)
35
- regression:
36
- composite: -3 # alert if composite drops more than 3 points
37
- per-area: -5 # alert if any area drops more than 5 points
38
- per-dimension: -8 # alert if any dimension drops more than 8 points
39
-
40
- # Severity classification
41
- severity:
42
- critical: # blocks deployment, immediate notification
43
- composite-below: 30
44
- negative-doc-lift: true
45
- warning: # flags for review, non-blocking
46
- composite-below: 50
47
- regression-exceeds: -3
48
- info: # logged but not alerted
49
- composite-below: 60
@@ -1,7 +0,0 @@
1
- /**
2
- * Final validation — ensures all agent-observer modules work together
3
- * and the full data pipeline (record → classify → summarize) is correct.
4
- *
5
- * Run: tsx src/agent-observer/test-imports.ts
6
- */
7
- export {};