@sanity/ailf 1.0.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. package/README.md +0 -1
  2. package/config/models.ts +15 -3
  3. package/dist/_vendor/ailf-core/config-helpers.d.ts +14 -17
  4. package/dist/_vendor/ailf-core/config-helpers.js +22 -2
  5. package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
  6. package/dist/_vendor/ailf-core/examples/index.js +25 -0
  7. package/dist/_vendor/ailf-core/index.d.ts +2 -2
  8. package/dist/_vendor/ailf-core/index.js +1 -1
  9. package/dist/_vendor/ailf-core/ports/context.d.ts +2 -0
  10. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  11. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  12. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  13. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +2 -0
  14. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  15. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  16. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +13 -0
  17. package/dist/_vendor/ailf-core/types/index.d.ts +1 -3
  18. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +78 -23
  19. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  20. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  21. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  22. package/dist/adapters/task-sources/content-lake-task-source.js +17 -20
  23. package/dist/adapters/task-sources/index.d.ts +2 -2
  24. package/dist/adapters/task-sources/index.js +2 -2
  25. package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
  26. package/dist/adapters/task-sources/repo-schemas.js +227 -19
  27. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  28. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  29. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  30. package/dist/adapters/task-sources/repo-validation.js +126 -5
  31. package/dist/adapters/task-sources/task-file-loader.d.ts +2 -2
  32. package/dist/adapters/task-sources/task-file-loader.js +2 -2
  33. package/dist/commands/coverage-audit.js +3 -1
  34. package/dist/commands/init.d.ts +6 -4
  35. package/dist/commands/init.js +302 -23
  36. package/dist/commands/validate-tasks.d.ts +2 -2
  37. package/dist/commands/validate-tasks.js +26 -15
  38. package/dist/composition-root.d.ts +13 -1
  39. package/dist/composition-root.js +73 -41
  40. package/dist/index.d.ts +41 -0
  41. package/dist/index.js +48 -0
  42. package/dist/orchestration/build-step-sequence.js +4 -2
  43. package/dist/orchestration/steps/fetch-docs-step.js +2 -3
  44. package/dist/orchestration/steps/generate-configs-step.js +28 -12
  45. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  46. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  47. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  48. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +105 -68
  49. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  50. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  51. package/dist/pipeline/compiler/literacy-bridge.js +1 -1
  52. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  53. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  54. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  55. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  56. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  57. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  58. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
  59. package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
  60. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  61. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  62. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  63. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  64. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  65. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  66. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  67. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  68. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  69. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  70. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  71. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  72. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  73. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  74. package/dist/pipeline/compiler/mode-handlers/{agent-harness-handler.d.ts → agent-harness/types.d.ts} +3 -24
  75. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  76. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  77. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  78. package/dist/pipeline/compiler/mode-handlers/index.d.ts +4 -5
  79. package/dist/pipeline/compiler/mode-handlers/index.js +4 -6
  80. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  81. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  82. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  83. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  86. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  87. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  88. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  89. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  90. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  91. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  92. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  93. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  94. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  95. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  96. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  97. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  98. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  99. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  100. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  101. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  102. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  103. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  104. package/dist/pipeline/compiler/mode-handlers/{mcp-assertions.d.ts → mcp-server/assertions.d.ts} +2 -10
  105. package/dist/pipeline/compiler/mode-handlers/{mcp-assertions.js → mcp-server/assertions.js} +63 -6
  106. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  107. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  108. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  109. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  110. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  111. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  112. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  113. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
  114. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  115. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  116. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  117. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  118. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  119. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
  120. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  121. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
  122. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  123. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  124. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
  125. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
  126. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
  127. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  128. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  129. package/dist/pipeline/compiler/preset-loader.js +99 -0
  130. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +6 -9
  131. package/dist/pipeline/compiler/presets/sanity-literacy.js +10 -156
  132. package/dist/pipeline/expand-tasks.d.ts +2 -2
  133. package/dist/pipeline/expand-tasks.js +2 -2
  134. package/dist/pipeline/generate-configs.js +1 -1
  135. package/dist/pipeline/map-request-to-config.js +1 -0
  136. package/dist/pipeline/mirror-repo-tasks.d.ts +7 -7
  137. package/dist/pipeline/mirror-repo-tasks.js +9 -9
  138. package/dist/pipeline/plan.js +1 -1
  139. package/package.json +11 -3
  140. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  141. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  142. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  143. package/dist/_vendor/ailf-tasks/index.js +0 -16
  144. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  145. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  146. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  147. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  148. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  149. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  150. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  151. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  152. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  153. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  154. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  155. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -67
  156. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -309
package/README.md CHANGED
@@ -17,5 +17,4 @@ for installation, quick start, configuration, and usage guides.
17
17
 
18
18
  | Package | Description |
19
19
  | -------------------------------------------------------------------------- | -------------------------------------------------- |
20
- | [`@sanity/ailf-tasks`](https://www.npmjs.com/package/@sanity/ailf-tasks) | Lightweight task validator — schemas + YAML parser |
21
20
  | [`@sanity/ailf-studio`](https://www.npmjs.com/package/@sanity/ailf-studio) | Sanity Studio dashboard plugin for viewing reports |
package/config/models.ts CHANGED
@@ -16,7 +16,13 @@ export default defineModels({
16
16
  id: "anthropic:messages:claude-opus-4-6",
17
17
  label: "Claude Opus 4.6",
18
18
  config: { temperature: 0.2, max_tokens: 4096 },
19
- modes: ["baseline", "observed", "agentic-naive", "agentic-optimized"],
19
+ modes: [
20
+ "baseline",
21
+ "observed",
22
+ "agentic-naive",
23
+ "agentic-optimized",
24
+ "mcp-server",
25
+ ],
20
26
  },
21
27
 
22
28
  // ── Google ─────────────────────────────────────────────────
@@ -35,14 +41,20 @@ export default defineModels({
35
41
  modes: ["baseline", "observed", "agentic-naive", "agentic-optimized"],
36
42
  },
37
43
  {
38
- id: "openai:chat:gpt-5.4",
44
+ id: "openai:responses:gpt-5.4",
39
45
  label: "GPT 5.4",
40
46
  config: {
41
47
  reasoning_effort: "medium",
42
48
  max_output_tokens: 4096,
43
49
  maxRetries: 1,
44
50
  },
45
- modes: ["baseline", "observed", "agentic-naive", "agentic-optimized"],
51
+ modes: [
52
+ "baseline",
53
+ "observed",
54
+ "agentic-naive",
55
+ "agentic-optimized",
56
+ "mcp-server",
57
+ ],
46
58
  },
47
59
 
48
60
  // ── Disabled models (uncomment to enable) ──────────────────
@@ -31,6 +31,7 @@ import type { SchedulesFile } from "./schemas/schedules.js";
31
31
  import type { SinksFile } from "./schemas/sinks.js";
32
32
  import type { ModelsConfig } from "./types/index.js";
33
33
  import type { GeneralizedTaskDefinition } from "./types/generalized-task.js";
34
+ import type { ModeBase, PresetDefinition } from "./types/plugin-registry.js";
34
35
  /**
35
36
  * Define an AILF evaluation configuration.
36
37
  *
@@ -132,27 +133,23 @@ export interface PromptEntry {
132
133
  */
133
134
  export declare function definePrompts(prompts: PromptEntry[]): PromptEntry[];
134
135
  /**
135
- * A preset is a named bundle of configuration that can be referenced
136
- * by tasks or suites to apply shared settings.
136
+ * Define a domain preset — targets a mode base and adds domain-specific config.
137
+ *
138
+ * A domain preset declares which mode it targets (by ID), provides domain
139
+ * configuration (sources, features, doc fetcher), and can optionally override
140
+ * mode base defaults (rubrics, scoring profiles, prompt templates).
141
+ *
142
+ * Used in `.ailf/preset.ts` or npm packages for typed preset authoring.
137
143
  */
138
- export interface PresetConfig {
139
- /** Preset name */
140
- name: string;
141
- /** Default providers for tasks using this preset */
142
- providers?: string[];
143
- /** Default rubric template */
144
- rubric?: string;
145
- /** Default assertion templates */
146
- assertions?: unknown[];
147
- /** Default options */
148
- options?: Record<string, unknown>;
149
- }
144
+ export declare function definePreset(preset: PresetDefinition): PresetDefinition;
150
145
  /**
151
- * Define reusable configuration presets.
146
+ * Define a mode base — shared evaluation methodology for a mode.
152
147
  *
153
- * Used in `config/presets.ts` for typed preset configuration.
148
+ * A mode base defines HOW you evaluate (rubrics, scoring, prompts)
149
+ * independently of WHAT you're evaluating. Domain presets target a mode
150
+ * base and inherit its defaults.
154
151
  */
155
- export declare function definePreset(preset: PresetConfig): PresetConfig;
152
+ export declare function defineModeBase(base: ModeBase): ModeBase;
156
153
  /**
157
154
  * A pricing table entry for cost estimation.
158
155
  */
@@ -132,14 +132,34 @@ export function defineSources(sources) {
132
132
  export function definePrompts(prompts) {
133
133
  return prompts;
134
134
  }
135
+ // ---------------------------------------------------------------------------
136
+ // Preset helpers
137
+ // ---------------------------------------------------------------------------
135
138
  /**
136
- * Define reusable configuration presets.
139
+ * Define a domain preset — targets a mode base and adds domain-specific config.
140
+ *
141
+ * A domain preset declares which mode it targets (by ID), provides domain
142
+ * configuration (sources, features, doc fetcher), and can optionally override
143
+ * mode base defaults (rubrics, scoring profiles, prompt templates).
137
144
  *
138
- * Used in `config/presets.ts` for typed preset configuration.
145
+ * Used in `.ailf/preset.ts` or npm packages for typed preset authoring.
139
146
  */
140
147
  export function definePreset(preset) {
141
148
  return preset;
142
149
  }
150
+ // ---------------------------------------------------------------------------
151
+ // Mode base helpers
152
+ // ---------------------------------------------------------------------------
153
+ /**
154
+ * Define a mode base — shared evaluation methodology for a mode.
155
+ *
156
+ * A mode base defines HOW you evaluate (rubrics, scoring, prompts)
157
+ * independently of WHAT you're evaluating. Domain presets target a mode
158
+ * base and inherit its defaults.
159
+ */
160
+ export function defineModeBase(base) {
161
+ return base;
162
+ }
143
163
  /**
144
164
  * Define a pricing table for cost estimation.
145
165
  *
@@ -301,3 +301,19 @@ export interface ExampleRecord {
301
301
  export declare const EXAMPLES: Record<ExampleType, ExampleRecord>;
302
302
  /** GitHub Actions workflow template for AI Literacy evaluation */
303
303
  export declare const workflowYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# AI Literacy Evaluation \u2014 GitHub Actions workflow\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# Evaluates your documentation quality on every pull request.\n# The AILF CLI reads your .ailf/tasks/ definitions, submits them\n# to the AILF API for evaluation, and writes a score report.\n#\n# Prerequisites:\n# Add one secret to your repository (Settings \u2192 Secrets \u2192 Actions):\n# AILF_API_KEY \u2014 your API key (starts with ailf_live_sk_)\n# NPM_TOKEN \u2014 npm token with read access to @sanity scope\n#\n# Customization:\n# - Narrow the trigger paths to reduce cost (see comment below)\n# - Check debug_mode for faster iteration (fewer tests)\n# - See: https://github.com/sanity-labs/ai-literacy-framework\n# 
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\nname: AI Literacy Eval\n\non:\n pull_request:\n branches: [main]\n # Runs on every PR to main by default. To reduce cost:\n # paths: [\".ailf/**\", \"docs/**\"]\n\n workflow_dispatch:\n inputs:\n debug_mode:\n description: \"Run in debug mode (fewer tests, faster iteration)\"\n type: boolean\n default: false\n\nconcurrency:\n group: ailf-eval-${{ github.event.pull_request.number || github.ref }}\n cancel-in-progress: true\n\njobs:\n evaluate:\n name: AI Literacy Evaluation\n runs-on: ubuntu-latest\n permissions:\n contents: read\n pull-requests: write\n steps:\n - uses: actions/checkout@v4\n\n - name: Configure npm for @sanity scope\n run:\n echo \"//registry.npmjs.org/:_authToken=${{ secrets.NPM_TOKEN }}\" >>\n ~/.npmrc\n\n - name: Run evaluation\n id: eval\n env:\n AILF_API_KEY: ${{ secrets.AILF_API_KEY }}\n run: |\n npx @sanity/ailf@latest pipeline --remote \\\n --output /tmp/ailf-report.md \\\n ${{ inputs.debug_mode && '--debug' || '' }}\n\n - name: Post PR comment\n if: always() && github.event_name == 'pull_request'\n uses: actions/github-script@v7\n with:\n script: |\n const fs = require('fs');\n\n // --- Constants ---\n const MARKER = '<!-- ailf-score-report -->';\n const HISTORY_START = '<!-- ailf-score-history -->';\n const HISTORY_END = '<!-- /ailf-score-history -->';\n const MAX_HISTORY = 3; // keep at most 3 prior runs\n\n // --- Read new report ---\n let newReport;\n try {\n newReport = fs.readFileSync('/tmp/ailf-report.md', 'utf-8');\n } catch {\n newReport = `## \u26A0\uFE0F AI Literacy Evaluation\\n\\nNo report generated. 
Check the [workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}).`;\n }\n\n const prNumber = context.issue?.number || context.payload?.pull_request?.number;\n if (!prNumber) {\n console.log('No PR number found, skipping comment');\n return;\n }\n\n // --- Find existing comment ---\n const { data: comments } = await github.rest.issues.listComments({\n owner: context.repo.owner, repo: context.repo.repo,\n issue_number: prNumber,\n });\n const existing = comments.find(c => c.body?.includes(MARKER));\n\n // --- Build history from previous comment ---\n let historyEntries = [];\n if (existing) {\n const oldBody = existing.body || '';\n\n // Collect existing collapsed history entries\n const histStart = oldBody.indexOf(HISTORY_START);\n const histEnd = oldBody.indexOf(HISTORY_END);\n if (histStart !== -1 && histEnd !== -1) {\n const historyContent = oldBody.slice(histStart + HISTORY_START.length, histEnd).trim();\n // Split on </details> boundaries to get individual entries\n if (historyContent) {\n historyEntries = historyContent\n .split(/<\\/details>\\s*/)\n .map(s => s.trim())\n .filter(s => s.startsWith('<details>'))\n .map(s => s + '\\n</details>');\n }\n }\n\n // Extract the current report (will become the newest history entry)\n let previousReport = '';\n if (histStart !== -1) {\n // Report is between MARKER and the \"Previous runs\" heading (or history section)\n const markerIdx = oldBody.indexOf(MARKER);\n // Find the --- separator before history\n const separatorIdx = oldBody.lastIndexOf('---', histStart);\n const endIdx = separatorIdx > markerIdx ? 
separatorIdx : histStart;\n previousReport = oldBody.slice(markerIdx + MARKER.length, endIdx).trim();\n } else {\n // No history yet \u2014 everything after MARKER is the report\n const markerIdx = oldBody.indexOf(MARKER);\n if (markerIdx !== -1) {\n previousReport = oldBody.slice(markerIdx + MARKER.length).trim();\n }\n }\n\n // Collapse the previous report into a <details> entry\n if (previousReport) {\n const scoreMatch = previousReport.match(/Overall:\\s*(\\d+)\\/100/);\n const score = scoreMatch ? scoreMatch[1] : '?';\n const dateMatch = previousReport.match(/Generated by.*?\u00B7\\s*([^\u00B7<\\n*]+)/);\n const date = dateMatch\n ? dateMatch[1].trim()\n : new Date().toISOString().slice(0, 16).replace('T', ' ') + ' UTC';\n const entry = `<details>\\n<summary>\uD83D\uDCDC ${date} \u2014 ${score}/100</summary>\\n\\n${previousReport}\\n\\n</details>`;\n historyEntries.unshift(entry); // newest first\n }\n\n // Enforce max history limit\n historyEntries = historyEntries.slice(0, MAX_HISTORY);\n }\n\n // --- Assemble final comment ---\n const historySection = historyEntries.length > 0\n ? 
`\\n\\n---\\n\\n### \uD83D\uDCDC Previous runs\\n\\n${HISTORY_START}\\n${historyEntries.join('\\n\\n')}\\n${HISTORY_END}`\n : '';\n const finalBody = `${MARKER}\\n${newReport}${historySection}`;\n\n if (existing) {\n await github.rest.issues.updateComment({\n owner: context.repo.owner, repo: context.repo.repo,\n comment_id: existing.id, body: finalBody,\n });\n console.log(`Updated comment (${historyEntries.length} history entries)`);\n } else {\n await github.rest.issues.createComment({\n owner: context.repo.owner, repo: context.repo.repo,\n issue_number: prNumber, body: finalBody,\n });\n console.log('Created new PR comment');\n }\n\n - name: Summary\n if: always()\n run: |\n if [ -f /tmp/ailf-report.md ]; then\n cat /tmp/ailf-report.md >> \"$GITHUB_STEP_SUMMARY\"\n else\n echo \"## \u26A0\uFE0F AI Literacy Evaluation\" >> \"$GITHUB_STEP_SUMMARY\"\n echo \"\" >> \"$GITHUB_STEP_SUMMARY\"\n echo \"No report generated. Check the workflow logs.\" >> \"$GITHUB_STEP_SUMMARY\"\n fi\n";
304
+ /** TypeScript project configuration template (ailf.config.ts) */
305
+ export declare const ailfConfigTs = "/**\n * .ailf/ailf.config.ts \u2014 AI Literacy Framework project configuration.\n *\n * This file configures how the AILF evaluation pipeline runs in this\n * repository. Place it at .ailf/ailf.config.ts in your project root.\n *\n * Evaluations are submitted to the AILF API (ailf-api.sanity.build).\n * The API handles LLM calls, doc fetching, grading, and report\n * publishing. Your repo only needs one secret: AILF_API_KEY.\n *\n * Docs: https://github.com/sanity-labs/ai-literacy-framework\n */\n\nexport default {\n /**\n * Documentation source \u2014 which docs are being evaluated.\n *\n * This tells the pipeline which Sanity project and dataset contain\n * the documentation under test. For most users, this is Sanity's own\n * docs project.\n */\n source: {\n /** Sanity project ID (find yours at sanity.io/manage) */\n projectId: \"3do82whm\",\n /** The dataset to query (e.g., \"production\", \"next\") */\n dataset: \"next\",\n /**\n * The public URL of your documentation site.\n * Used by agentic mode to test agent discoverability.\n */\n baseUrl: \"https://www.sanity.io/docs\",\n },\n\n /**\n * Trigger configuration \u2014 when evaluations run automatically.\n *\n * Each key is a trigger context. The pipeline checks which trigger\n * matches the current execution context (PR, merge, schedule, etc.)\n * and applies its settings.\n *\n * Mode options:\n * \"validate-only\" \u2014 check that task files parse correctly (fast, no LLM calls)\n * \"eval\" \u2014 run the full evaluation pipeline\n */\n triggers: {\n /** On pull requests: just validate task files parse correctly. */\n pr: {\n mode: \"validate-only\",\n },\n\n /** When .ailf/ files change in a PR: run a real evaluation. */\n \"pr-task-change\": {\n mode: \"eval\",\n paths: [\".ailf/**\"],\n },\n\n /** On merge to main: run evaluation (non-blocking). */\n main: {\n mode: \"eval\",\n blocking: false,\n notify: true,\n },\n },\n}\n";
306
+ /** TypeScript task template for example-groq-blog-listing */
307
+ export declare const exampleGroqBlogListingTs = "/**\n * Example Task: Blog listing with GROQ queries.\n *\n * This is a starter template \u2014 edit it for your own documentation.\n * Each task evaluates whether an AI coding agent can implement a feature\n * using your docs as context. Delete this file or replace it entirely.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * Full field reference:\n * https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/CONTRIBUTING_TASKS.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n // \u2500\u2500 Mode \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // \"literacy\" tests whether AI coding tools can implement features\n // using your docs as context. 
Other modes: \"mcp-server\",\n // \"knowledge-probe\", \"agent-harness\", \"custom\".\n mode: \"literacy\",\n\n // \u2500\u2500 Identity \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Unique identifier \u2014 lowercase alphanumeric with hyphens.\n // Must be unique across all task files in .ailf/tasks/.\n id: \"example-groq-blog-listing\",\n title: \"Blog listing with GROQ queries\",\n description: \"Example \u2014 tests GROQ blog listing implementation\",\n\n // Feature area this task belongs to. Tasks with the same area are\n // grouped together in score summaries.\n area: \"groq\",\n\n // \u2500\u2500 Documentation context \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Canonical doc references for this task. 
The pipeline fetches these\n // from Sanity and injects them into the prompt for baseline evaluation.\n //\n // This example uses slug-based references \u2014 the simplest form.\n // See the other example tasks for path, id, and perspective references.\n context: {\n docs: [\n {\n slug: \"groq-introduction\",\n reason: \"Core GROQ syntax and query language reference\",\n },\n {\n slug: \"how-queries-work\",\n reason: \"Query execution model and best practices\",\n },\n ],\n },\n\n // When true, the pipeline auto-generates an additional rubric that\n // checks whether the LLM's response actually used the provided docs.\n docCoverage: true,\n\n // Path to a gold-standard implementation, relative to canonical/.\n // The grader uses this as a reference when scoring code correctness.\n referenceSolution: \"canonical/example-groq-blog-listing.ts\",\n\n // \u2500\u2500 Prompt \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // prompt.text \u2014 the implementation prompt given to the LLM.\n // Write this as if you're asking a developer to build the feature.\n // Be specific about requirements so the grader can evaluate clearly.\n prompt: {\n text: `Create a Next.js page component that lists blog posts from Sanity\nusing GROQ. The page should display the title, slug, and published\ndate for each post, sorted by most recent first. 
Use the Sanity\nclient to fetch data.`,\n },\n\n // \u2500\u2500 Assertions \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Grading assertions \u2014 how the LLM's response is scored.\n //\n // \"llm-rubric\" assertions use a grader LLM to score against criteria.\n // The \"template\" references a rubric template (e.g. task-completion).\n //\n // Available templates:\n // task-completion \u2014 did the LLM implement the feature? (weight: 0.50)\n // code-correctness \u2014 is the code idiomatic and correct? (weight: 0.25)\n //\n // You can also use value-based assertions:\n // { type: \"contains\", value: \"client.fetch\" }\n // { type: \"contains-any\", value: [\"createClient\", \"sanityClient\"] }\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Uses the groq tagged template literal\",\n \"Fetches blog posts with title, slug, and publishedAt fields\",\n \"Orders results by publishedAt in descending order\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"Uses createClient from @sanity/client or next-sanity\",\n \"Exports a valid Next.js page component\",\n ],\n },\n ],\n\n // \u2500\u2500 Baseline variant \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // enabled \u2014 set to false to skip this task entirely\n // rubric \u2014 \"full\" (default), \"abbreviated\" (faster), or \"none\"\n baseline: {\n enabled: true,\n rubric: \"full\",\n },\n\n // Example tasks 
ship as drafts so they don't run in production evals.\n // Change to \"active\" (or remove this field) to activate.\n status: \"draft\",\n})\n";
308
+ /** TypeScript task template for example-id-based-ref */
309
+ export declare const exampleIdBasedRefTs = "/**\n * Example Task: Document ID-based canonical doc references.\n *\n * Demonstrates using `id` to reference canonical documentation by\n * Sanity document `_id`. This is useful for:\n * - Draft documents that don't have a stable slug yet\n * - Programmatic references from imports or migrations\n * - Documents where you know the _id but not the slug\n *\n * The `id` ref type can also carry optional `slug` and `path` fields\n * as human-readable annotations \u2014 these are NOT used for resolution,\n * only for display in logs and reports.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"literacy\",\n id: \"example-id-based-ref\",\n title: \"GROQ feature support (ID-based doc references)\",\n description: \"Example \u2014 demonstrates ID-based canonical doc references\",\n\n area: \"groq\",\n\n // ID-based canonical doc references.\n //\n // Use the Sanity document _id to reference articles directly.\n // Optional slug/path annotations help humans reading the file\n // but are NOT used for resolution \u2014 only the `id` field matters.\n //\n // These IDs reference real articles in the Sanity docs (next dataset):\n // 0ba88f1b... = \"GROQ feature support across Sanity\"\n // 5b9c2863... 
= \"Custom GROQ functions\"\n context: {\n docs: [\n {\n id: \"0ba88f1b-d1a7-418a-9267-2e343d01886a\",\n slug: \"groq-feature-support-by-context\", // annotation only\n reason: \"GROQ feature support across different Sanity contexts\",\n },\n {\n id: \"5b9c2863-ef01-4565-af8e-ee54e081ee74\",\n slug: \"custom-groq-functions\", // annotation only\n reason: \"Custom GROQ functions and pipelines\",\n },\n ],\n },\n\n docCoverage: true,\n\n prompt: {\n text: `Explain how GROQ is used across different Sanity contexts.\nCover the following:\n1. Which GROQ features are available in each context (API queries,\n webhooks, custom functions, access control)\n2. How to create and use custom GROQ functions\n3. Any differences in GROQ support between contexts\nProvide examples demonstrating context-specific GROQ patterns.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Explains GROQ availability across different Sanity contexts\",\n \"Describes custom GROQ function creation and usage\",\n \"Notes differences in GROQ support between contexts\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"GROQ examples use valid syntax\",\n \"Custom function examples follow the correct API pattern\",\n ],\n },\n ],\n\n baseline: { enabled: true, rubric: \"full\" },\n status: \"draft\",\n})\n";
310
+ /** TypeScript task template for example-path-based-ref */
311
+ export declare const examplePathBasedRefTs = "/**\n * Example Task: Path-based canonical doc references.\n *\n * Demonstrates using `path` to reference canonical documentation.\n * Paths are the preferred reference type because they uniquely identify\n * an article across sections (unlike slugs, which can collide).\n *\n * Path format:\n * - Simple: \"webhooks\" \u2192 resolves by slug lookup\n * - Sectioned: \"content-lake/webhooks\" \u2192 disambiguates by section + slug\n *\n * This example demonstrates why paths matter: the slug \"documents\"\n * exists in both the \"content-lake\" and \"cli-reference\" sections.\n * Using \"content-lake/documents\" ensures we get the right one.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"literacy\",\n id: \"example-path-based-ref\",\n title: \"GROQ mutations (path-based doc references)\",\n description: \"Example \u2014 demonstrates path-based canonical doc references\",\n\n area: \"groq\",\n\n // Path-based canonical doc references.\n //\n // Use \"section/slug\" format to uniquely identify articles:\n // - \"content-lake/mutations-introduction\" \u2192 the mutations article\n // - \"content-lake/documents\" \u2192 the documents article in Content Lake\n // (not the CLI \"documents\" article in cli-reference section)\n //\n // The \"documents\" slug exists in two sections \u2014 this is exactly why\n // path-based references are preferred over slug-based references.\n context: {\n docs: [\n {\n path: \"content-lake/mutations-introduction\",\n reason: \"Introduction to document mutations in the Content Lake\",\n },\n {\n path: \"content-lake/documents\",\n reason:\n \"Document structure and types (Content 
Lake, not CLI reference)\",\n },\n ],\n },\n\n docCoverage: true,\n\n prompt: {\n text: `Explain how to create, update, and delete documents in Sanity's\nContent Lake using mutations. Cover:\n1. The different mutation types (create, createOrReplace, patch, delete)\n2. Document structure and required fields (_id, _type)\n3. How to use patch operations to update specific fields\n4. Best practices for mutation patterns\nProvide working code examples using @sanity/client.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Explains create, createOrReplace, patch, and delete mutations\",\n \"Describes required document fields (_id, _type)\",\n \"Shows patch operations for field-level updates\",\n \"Includes practical code examples\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"Uses correct @sanity/client mutation API\",\n \"Patch operations use valid set/unset/inc syntax\",\n ],\n },\n ],\n\n baseline: { enabled: true, rubric: \"full\" },\n status: \"draft\",\n})\n";
312
+ /** TypeScript task template for example-perspective-ref */
313
+ export declare const examplePerspectiveRefTs = "/**\n * Example Task: Perspective / content release doc references.\n *\n * Demonstrates using `perspective` to reference all documentation\n * articles within a content release. This is the key capability for\n * evaluating NEW feature documentation before it's published.\n *\n * How it works:\n * - A perspective ref is one-to-many: the doc fetcher queries the\n * named release and expands it to ALL articles versioned within it.\n * - Downstream consumers see the same flat DocContext[] regardless\n * of how docs were resolved.\n * - When the release is published, the perspective entry becomes a\n * no-op (articles are now in published). Migrate to explicit path\n * or slug refs at your convenience.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"literacy\",\n id: \"example-perspective-ref\",\n title:\n \"GROQ features from content release (perspective-based doc references)\",\n description:\n \"Example \u2014 demonstrates perspective-based canonical doc references\",\n\n area: \"groq\",\n\n // Perspective-based canonical doc reference.\n //\n // The perspective ID references a content release in the Sanity\n // Content Lake. 
At evaluation time, the doc fetcher auto-discovers\n // all articles versioned in this release and includes them as\n // canonical documentation context.\n //\n // Release rE9TSJvR4 contains:\n // - \"GROQ-powered webhooks\" (webhooks)\n // - \"Query Cheat Sheet - GROQ\" (query-cheat-sheet)\n // - \"GROQ joins\" (groq-joins)\n //\n // You can combine perspective refs with explicit slug/path/id refs\n // to include foundational published docs alongside release content.\n // Here we add groq-data-types as a complementary published reference.\n context: {\n docs: [\n {\n perspective: \"rE9TSJvR4\",\n reason: \"All GROQ documentation updates in the test content release\",\n },\n {\n slug: \"groq-data-types\",\n reason: \"GROQ data type reference (published, stable)\",\n },\n ],\n },\n\n docCoverage: true,\n\n prompt: {\n text: `Using GROQ, demonstrate advanced query patterns including:\n1. Joining data across document types using references\n2. Filtering webhook payloads with GROQ projections\n3. Using the query cheat sheet patterns for common operations\n4. Working with different GROQ data types in filters\nProvide working GROQ query examples for each pattern.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Demonstrates GROQ join syntax for cross-document queries\",\n \"Shows GROQ filter patterns for webhook configuration\",\n \"Includes practical query examples from cheat sheet patterns\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"All GROQ queries use valid syntax\",\n \"Reference joins use correct dereference operator (->)\",\n ],\n },\n ],\n\n baseline: { enabled: true, rubric: \"full\" },\n status: \"draft\",\n})\n";
314
+ /** TypeScript task template for example-studio-custom-input */
315
+ export declare const exampleStudioCustomInputTs = "/**\n * Example Task: Custom input component in Sanity Studio.\n *\n * This is a starter template \u2014 edit it for your own documentation.\n * Delete this file or replace it with your own tasks.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"literacy\",\n id: \"example-studio-custom-input\",\n title: \"Custom input component in Sanity Studio\",\n description: \"Example \u2014 tests Studio custom input implementation\",\n\n area: \"studio\",\n\n context: {\n docs: [\n {\n slug: \"custom-input-widgets\",\n reason: \"Guide for building custom form inputs in Sanity Studio\",\n },\n {\n slug: \"form-components\",\n reason: \"Form component API and customization patterns\",\n },\n ],\n },\n\n docCoverage: true,\n referenceSolution: \"canonical/example-studio-custom-input.ts\",\n\n prompt: {\n text: `Build a custom string input component for Sanity Studio that shows\na character count below the input field. The component should accept\na maxLength option from the field schema and display a warning when\nthe text exceeds the limit.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Implements a React component that renders a text input\",\n \"Displays a live character count\",\n \"Reads maxLength from schema options\",\n \"Shows a visual warning when limit is exceeded\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"Uses the Sanity UI library for styling\",\n \"Calls onChange with patch operations\",\n ],\n },\n ],\n\n baseline: { enabled: true, rubric: \"full\" },\n status: \"draft\",\n})\n";
316
+ /** Map of task ID (filename stem) → raw TypeScript source */
317
+ export declare const taskTsFiles: Record<string, string>;
318
+ /** List of TS task file stems, in alphabetical order */
319
+ export declare const TASK_TS_FILE_NAMES: readonly ["example-groq-blog-listing", "example-id-based-ref", "example-path-based-ref", "example-perspective-ref", "example-studio-custom-input"];
@@ -452,3 +452,28 @@ export const EXAMPLES = {
452
452
  // ---------------------------------------------------------------------------
453
453
  /** GitHub Actions workflow template for AI Literacy evaluation */
454
454
  export const workflowYaml = "# ──────────────────────────────────────────────────────────────────────\n# AI Literacy Evaluation — GitHub Actions workflow\n# ──────────────────────────────────────────────────────────────────────\n#\n# Evaluates your documentation quality on every pull request.\n# The AILF CLI reads your .ailf/tasks/ definitions, submits them\n# to the AILF API for evaluation, and writes a score report.\n#\n# Prerequisites:\n# Add one secret to your repository (Settings → Secrets → Actions):\n# AILF_API_KEY — your API key (starts with ailf_live_sk_)\n# NPM_TOKEN — npm token with read access to @sanity scope\n#\n# Customization:\n# - Narrow the trigger paths to reduce cost (see comment below)\n# - Check debug_mode for faster iteration (fewer tests)\n# - See: https://github.com/sanity-labs/ai-literacy-framework\n# ──────────────────────────────────────────────────────────────────────\n\nname: AI Literacy Eval\n\non:\n pull_request:\n branches: [main]\n # Runs on every PR to main by default. 
To reduce cost:\n # paths: [\".ailf/**\", \"docs/**\"]\n\n workflow_dispatch:\n inputs:\n debug_mode:\n description: \"Run in debug mode (fewer tests, faster iteration)\"\n type: boolean\n default: false\n\nconcurrency:\n group: ailf-eval-${{ github.event.pull_request.number || github.ref }}\n cancel-in-progress: true\n\njobs:\n evaluate:\n name: AI Literacy Evaluation\n runs-on: ubuntu-latest\n permissions:\n contents: read\n pull-requests: write\n steps:\n - uses: actions/checkout@v4\n\n - name: Configure npm for @sanity scope\n run:\n echo \"//registry.npmjs.org/:_authToken=${{ secrets.NPM_TOKEN }}\" >>\n ~/.npmrc\n\n - name: Run evaluation\n id: eval\n env:\n AILF_API_KEY: ${{ secrets.AILF_API_KEY }}\n run: |\n npx @sanity/ailf@latest pipeline --remote \\\n --output /tmp/ailf-report.md \\\n ${{ inputs.debug_mode && '--debug' || '' }}\n\n - name: Post PR comment\n if: always() && github.event_name == 'pull_request'\n uses: actions/github-script@v7\n with:\n script: |\n const fs = require('fs');\n\n // --- Constants ---\n const MARKER = '<!-- ailf-score-report -->';\n const HISTORY_START = '<!-- ailf-score-history -->';\n const HISTORY_END = '<!-- /ailf-score-history -->';\n const MAX_HISTORY = 3; // keep at most 3 prior runs\n\n // --- Read new report ---\n let newReport;\n try {\n newReport = fs.readFileSync('/tmp/ailf-report.md', 'utf-8');\n } catch {\n newReport = `## ⚠️ AI Literacy Evaluation\\n\\nNo report generated. 
Check the [workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}).`;\n }\n\n const prNumber = context.issue?.number || context.payload?.pull_request?.number;\n if (!prNumber) {\n console.log('No PR number found, skipping comment');\n return;\n }\n\n // --- Find existing comment ---\n const { data: comments } = await github.rest.issues.listComments({\n owner: context.repo.owner, repo: context.repo.repo,\n issue_number: prNumber,\n });\n const existing = comments.find(c => c.body?.includes(MARKER));\n\n // --- Build history from previous comment ---\n let historyEntries = [];\n if (existing) {\n const oldBody = existing.body || '';\n\n // Collect existing collapsed history entries\n const histStart = oldBody.indexOf(HISTORY_START);\n const histEnd = oldBody.indexOf(HISTORY_END);\n if (histStart !== -1 && histEnd !== -1) {\n const historyContent = oldBody.slice(histStart + HISTORY_START.length, histEnd).trim();\n // Split on </details> boundaries to get individual entries\n if (historyContent) {\n historyEntries = historyContent\n .split(/<\\/details>\\s*/)\n .map(s => s.trim())\n .filter(s => s.startsWith('<details>'))\n .map(s => s + '\\n</details>');\n }\n }\n\n // Extract the current report (will become the newest history entry)\n let previousReport = '';\n if (histStart !== -1) {\n // Report is between MARKER and the \"Previous runs\" heading (or history section)\n const markerIdx = oldBody.indexOf(MARKER);\n // Find the --- separator before history\n const separatorIdx = oldBody.lastIndexOf('---', histStart);\n const endIdx = separatorIdx > markerIdx ? 
separatorIdx : histStart;\n previousReport = oldBody.slice(markerIdx + MARKER.length, endIdx).trim();\n } else {\n // No history yet — everything after MARKER is the report\n const markerIdx = oldBody.indexOf(MARKER);\n if (markerIdx !== -1) {\n previousReport = oldBody.slice(markerIdx + MARKER.length).trim();\n }\n }\n\n // Collapse the previous report into a <details> entry\n if (previousReport) {\n const scoreMatch = previousReport.match(/Overall:\\s*(\\d+)\\/100/);\n const score = scoreMatch ? scoreMatch[1] : '?';\n const dateMatch = previousReport.match(/Generated by.*?·\\s*([^·<\\n*]+)/);\n const date = dateMatch\n ? dateMatch[1].trim()\n : new Date().toISOString().slice(0, 16).replace('T', ' ') + ' UTC';\n const entry = `<details>\\n<summary>📜 ${date} — ${score}/100</summary>\\n\\n${previousReport}\\n\\n</details>`;\n historyEntries.unshift(entry); // newest first\n }\n\n // Enforce max history limit\n historyEntries = historyEntries.slice(0, MAX_HISTORY);\n }\n\n // --- Assemble final comment ---\n const historySection = historyEntries.length > 0\n ? `\\n\\n---\\n\\n### 📜 Previous runs\\n\\n${HISTORY_START}\\n${historyEntries.join('\\n\\n')}\\n${HISTORY_END}`\n : '';\n const finalBody = `${MARKER}\\n${newReport}${historySection}`;\n\n if (existing) {\n await github.rest.issues.updateComment({\n owner: context.repo.owner, repo: context.repo.repo,\n comment_id: existing.id, body: finalBody,\n });\n console.log(`Updated comment (${historyEntries.length} history entries)`);\n } else {\n await github.rest.issues.createComment({\n owner: context.repo.owner, repo: context.repo.repo,\n issue_number: prNumber, body: finalBody,\n });\n console.log('Created new PR comment');\n }\n\n - name: Summary\n if: always()\n run: |\n if [ -f /tmp/ailf-report.md ]; then\n cat /tmp/ailf-report.md >> \"$GITHUB_STEP_SUMMARY\"\n else\n echo \"## ⚠️ AI Literacy Evaluation\" >> \"$GITHUB_STEP_SUMMARY\"\n echo \"\" >> \"$GITHUB_STEP_SUMMARY\"\n echo \"No report generated. 
Check the workflow logs.\" >> \"$GITHUB_STEP_SUMMARY\"\n fi\n";
455
+ // ---------------------------------------------------------------------------
456
+ // TypeScript template exports (for ailf init --output-format ts)
457
+ // ---------------------------------------------------------------------------
458
+ /** TypeScript project configuration template (ailf.config.ts) */
459
+ export const ailfConfigTs = "/**\n * .ailf/ailf.config.ts — AI Literacy Framework project configuration.\n *\n * This file configures how the AILF evaluation pipeline runs in this\n * repository. Place it at .ailf/ailf.config.ts in your project root.\n *\n * Evaluations are submitted to the AILF API (ailf-api.sanity.build).\n * The API handles LLM calls, doc fetching, grading, and report\n * publishing. Your repo only needs one secret: AILF_API_KEY.\n *\n * Docs: https://github.com/sanity-labs/ai-literacy-framework\n */\n\nexport default {\n /**\n * Documentation source — which docs are being evaluated.\n *\n * This tells the pipeline which Sanity project and dataset contain\n * the documentation under test. For most users, this is Sanity's own\n * docs project.\n */\n source: {\n /** Sanity project ID (find yours at sanity.io/manage) */\n projectId: \"3do82whm\",\n /** The dataset to query (e.g., \"production\", \"next\") */\n dataset: \"next\",\n /**\n * The public URL of your documentation site.\n * Used by agentic mode to test agent discoverability.\n */\n baseUrl: \"https://www.sanity.io/docs\",\n },\n\n /**\n * Trigger configuration — when evaluations run automatically.\n *\n * Each key is a trigger context. The pipeline checks which trigger\n * matches the current execution context (PR, merge, schedule, etc.)\n * and applies its settings.\n *\n * Mode options:\n * \"validate-only\" — check that task files parse correctly (fast, no LLM calls)\n * \"eval\" — run the full evaluation pipeline\n */\n triggers: {\n /** On pull requests: just validate task files parse correctly. */\n pr: {\n mode: \"validate-only\",\n },\n\n /** When .ailf/ files change in a PR: run a real evaluation. */\n \"pr-task-change\": {\n mode: \"eval\",\n paths: [\".ailf/**\"],\n },\n\n /** On merge to main: run evaluation (non-blocking). */\n main: {\n mode: \"eval\",\n blocking: false,\n notify: true,\n },\n },\n}\n";
460
+ /** TypeScript task template for example-groq-blog-listing */
461
+ export const exampleGroqBlogListingTs = "/**\n * Example Task: Blog listing with GROQ queries.\n *\n * This is a starter template — edit it for your own documentation.\n * Each task evaluates whether an AI coding agent can implement a feature\n * using your docs as context. Delete this file or replace it entirely.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * Full field reference:\n * https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/CONTRIBUTING_TASKS.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n // ── Mode ────────────────────────────────────────────────────────────\n // \"literacy\" tests whether AI coding tools can implement features\n // using your docs as context. Other modes: \"mcp-server\",\n // \"knowledge-probe\", \"agent-harness\", \"custom\".\n mode: \"literacy\",\n\n // ── Identity ────────────────────────────────────────────────────────\n // Unique identifier — lowercase alphanumeric with hyphens.\n // Must be unique across all task files in .ailf/tasks/.\n id: \"example-groq-blog-listing\",\n title: \"Blog listing with GROQ queries\",\n description: \"Example — tests GROQ blog listing implementation\",\n\n // Feature area this task belongs to. Tasks with the same area are\n // grouped together in score summaries.\n area: \"groq\",\n\n // ── Documentation context ───────────────────────────────────────────\n // Canonical doc references for this task. 
The pipeline fetches these\n // from Sanity and injects them into the prompt for baseline evaluation.\n //\n // This example uses slug-based references — the simplest form.\n // See the other example tasks for path, id, and perspective references.\n context: {\n docs: [\n {\n slug: \"groq-introduction\",\n reason: \"Core GROQ syntax and query language reference\",\n },\n {\n slug: \"how-queries-work\",\n reason: \"Query execution model and best practices\",\n },\n ],\n },\n\n // When true, the pipeline auto-generates an additional rubric that\n // checks whether the LLM's response actually used the provided docs.\n docCoverage: true,\n\n // Path to a gold-standard implementation, relative to canonical/.\n // The grader uses this as a reference when scoring code correctness.\n referenceSolution: \"canonical/example-groq-blog-listing.ts\",\n\n // ── Prompt ──────────────────────────────────────────────────────────\n // prompt.text — the implementation prompt given to the LLM.\n // Write this as if you're asking a developer to build the feature.\n // Be specific about requirements so the grader can evaluate clearly.\n prompt: {\n text: `Create a Next.js page component that lists blog posts from Sanity\nusing GROQ. The page should display the title, slug, and published\ndate for each post, sorted by most recent first. Use the Sanity\nclient to fetch data.`,\n },\n\n // ── Assertions ──────────────────────────────────────────────────────\n // Grading assertions — how the LLM's response is scored.\n //\n // \"llm-rubric\" assertions use a grader LLM to score against criteria.\n // The \"template\" references a rubric template (e.g. task-completion).\n //\n // Available templates:\n // task-completion — did the LLM implement the feature? (weight: 0.50)\n // code-correctness — is the code idiomatic and correct? 
(weight: 0.25)\n //\n // You can also use value-based assertions:\n // { type: \"contains\", value: \"client.fetch\" }\n // { type: \"contains-any\", value: [\"createClient\", \"sanityClient\"] }\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Uses the groq tagged template literal\",\n \"Fetches blog posts with title, slug, and publishedAt fields\",\n \"Orders results by publishedAt in descending order\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"Uses createClient from @sanity/client or next-sanity\",\n \"Exports a valid Next.js page component\",\n ],\n },\n ],\n\n // ── Baseline variant ────────────────────────────────────────────────\n // enabled — set to false to skip this task entirely\n // rubric — \"full\" (default), \"abbreviated\" (faster), or \"none\"\n baseline: {\n enabled: true,\n rubric: \"full\",\n },\n\n // Example tasks ship as drafts so they don't run in production evals.\n // Change to \"active\" (or remove this field) to activate.\n status: \"draft\",\n})\n";
462
+ /** TypeScript task template for example-id-based-ref */
463
+ export const exampleIdBasedRefTs = "/**\n * Example Task: Document ID-based canonical doc references.\n *\n * Demonstrates using `id` to reference canonical documentation by\n * Sanity document `_id`. This is useful for:\n * - Draft documents that don't have a stable slug yet\n * - Programmatic references from imports or migrations\n * - Documents where you know the _id but not the slug\n *\n * The `id` ref type can also carry optional `slug` and `path` fields\n * as human-readable annotations — these are NOT used for resolution,\n * only for display in logs and reports.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"literacy\",\n id: \"example-id-based-ref\",\n title: \"GROQ feature support (ID-based doc references)\",\n description: \"Example — demonstrates ID-based canonical doc references\",\n\n area: \"groq\",\n\n // ID-based canonical doc references.\n //\n // Use the Sanity document _id to reference articles directly.\n // Optional slug/path annotations help humans reading the file\n // but are NOT used for resolution — only the `id` field matters.\n //\n // These IDs reference real articles in the Sanity docs (next dataset):\n // 0ba88f1b... = \"GROQ feature support across Sanity\"\n // 5b9c2863... 
= \"Custom GROQ functions\"\n context: {\n docs: [\n {\n id: \"0ba88f1b-d1a7-418a-9267-2e343d01886a\",\n slug: \"groq-feature-support-by-context\", // annotation only\n reason: \"GROQ feature support across different Sanity contexts\",\n },\n {\n id: \"5b9c2863-ef01-4565-af8e-ee54e081ee74\",\n slug: \"custom-groq-functions\", // annotation only\n reason: \"Custom GROQ functions and pipelines\",\n },\n ],\n },\n\n docCoverage: true,\n\n prompt: {\n text: `Explain how GROQ is used across different Sanity contexts.\nCover the following:\n1. Which GROQ features are available in each context (API queries,\n webhooks, custom functions, access control)\n2. How to create and use custom GROQ functions\n3. Any differences in GROQ support between contexts\nProvide examples demonstrating context-specific GROQ patterns.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Explains GROQ availability across different Sanity contexts\",\n \"Describes custom GROQ function creation and usage\",\n \"Notes differences in GROQ support between contexts\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"GROQ examples use valid syntax\",\n \"Custom function examples follow the correct API pattern\",\n ],\n },\n ],\n\n baseline: { enabled: true, rubric: \"full\" },\n status: \"draft\",\n})\n";
464
+ /** TypeScript task template for example-path-based-ref */
465
+ export const examplePathBasedRefTs = "/**\n * Example Task: Path-based canonical doc references.\n *\n * Demonstrates using `path` to reference canonical documentation.\n * Paths are the preferred reference type because they uniquely identify\n * an article across sections (unlike slugs, which can collide).\n *\n * Path format:\n * - Simple: \"webhooks\" → resolves by slug lookup\n * - Sectioned: \"content-lake/webhooks\" → disambiguates by section + slug\n *\n * This example demonstrates why paths matter: the slug \"documents\"\n * exists in both the \"content-lake\" and \"cli-reference\" sections.\n * Using \"content-lake/documents\" ensures we get the right one.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"literacy\",\n id: \"example-path-based-ref\",\n title: \"GROQ mutations (path-based doc references)\",\n description: \"Example — demonstrates path-based canonical doc references\",\n\n area: \"groq\",\n\n // Path-based canonical doc references.\n //\n // Use \"section/slug\" format to uniquely identify articles:\n // - \"content-lake/mutations-introduction\" → the mutations article\n // - \"content-lake/documents\" → the documents article in Content Lake\n // (not the CLI \"documents\" article in cli-reference section)\n //\n // The \"documents\" slug exists in two sections — this is exactly why\n // path-based references are preferred over slug-based references.\n context: {\n docs: [\n {\n path: \"content-lake/mutations-introduction\",\n reason: \"Introduction to document mutations in the Content Lake\",\n },\n {\n path: \"content-lake/documents\",\n reason:\n \"Document structure and types (Content Lake, not CLI reference)\",\n },\n 
],\n },\n\n docCoverage: true,\n\n prompt: {\n text: `Explain how to create, update, and delete documents in Sanity's\nContent Lake using mutations. Cover:\n1. The different mutation types (create, createOrReplace, patch, delete)\n2. Document structure and required fields (_id, _type)\n3. How to use patch operations to update specific fields\n4. Best practices for mutation patterns\nProvide working code examples using @sanity/client.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Explains create, createOrReplace, patch, and delete mutations\",\n \"Describes required document fields (_id, _type)\",\n \"Shows patch operations for field-level updates\",\n \"Includes practical code examples\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"Uses correct @sanity/client mutation API\",\n \"Patch operations use valid set/unset/inc syntax\",\n ],\n },\n ],\n\n baseline: { enabled: true, rubric: \"full\" },\n status: \"draft\",\n})\n";
466
+ /** TypeScript task template for example-perspective-ref */
467
+ export const examplePerspectiveRefTs = "/**\n * Example Task: Perspective / content release doc references.\n *\n * Demonstrates using `perspective` to reference all documentation\n * articles within a content release. This is the key capability for\n * evaluating NEW feature documentation before it's published.\n *\n * How it works:\n * - A perspective ref is one-to-many: the doc fetcher queries the\n * named release and expands it to ALL articles versioned within it.\n * - Downstream consumers see the same flat DocContext[] regardless\n * of how docs were resolved.\n * - When the release is published, the perspective entry becomes a\n * no-op (articles are now in published). Migrate to explicit path\n * or slug refs at your convenience.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"literacy\",\n id: \"example-perspective-ref\",\n title:\n \"GROQ features from content release (perspective-based doc references)\",\n description:\n \"Example — demonstrates perspective-based canonical doc references\",\n\n area: \"groq\",\n\n // Perspective-based canonical doc reference.\n //\n // The perspective ID references a content release in the Sanity\n // Content Lake. 
At evaluation time, the doc fetcher auto-discovers\n // all articles versioned in this release and includes them as\n // canonical documentation context.\n //\n // Release rE9TSJvR4 contains:\n // - \"GROQ-powered webhooks\" (webhooks)\n // - \"Query Cheat Sheet - GROQ\" (query-cheat-sheet)\n // - \"GROQ joins\" (groq-joins)\n //\n // You can combine perspective refs with explicit slug/path/id refs\n // to include foundational published docs alongside release content.\n // Here we add groq-data-types as a complementary published reference.\n context: {\n docs: [\n {\n perspective: \"rE9TSJvR4\",\n reason: \"All GROQ documentation updates in the test content release\",\n },\n {\n slug: \"groq-data-types\",\n reason: \"GROQ data type reference (published, stable)\",\n },\n ],\n },\n\n docCoverage: true,\n\n prompt: {\n text: `Using GROQ, demonstrate advanced query patterns including:\n1. Joining data across document types using references\n2. Filtering webhook payloads with GROQ projections\n3. Using the query cheat sheet patterns for common operations\n4. Working with different GROQ data types in filters\nProvide working GROQ query examples for each pattern.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Demonstrates GROQ join syntax for cross-document queries\",\n \"Shows GROQ filter patterns for webhook configuration\",\n \"Includes practical query examples from cheat sheet patterns\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"All GROQ queries use valid syntax\",\n \"Reference joins use correct dereference operator (->)\",\n ],\n },\n ],\n\n baseline: { enabled: true, rubric: \"full\" },\n status: \"draft\",\n})\n";
468
+ /** TypeScript task template for example-studio-custom-input */
469
+ export const exampleStudioCustomInputTs = "/**\n * Example Task: Custom input component in Sanity Studio.\n *\n * This is a starter template — edit it for your own documentation.\n * Delete this file or replace it with your own tasks.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"literacy\",\n id: \"example-studio-custom-input\",\n title: \"Custom input component in Sanity Studio\",\n description: \"Example — tests Studio custom input implementation\",\n\n area: \"studio\",\n\n context: {\n docs: [\n {\n slug: \"custom-input-widgets\",\n reason: \"Guide for building custom form inputs in Sanity Studio\",\n },\n {\n slug: \"form-components\",\n reason: \"Form component API and customization patterns\",\n },\n ],\n },\n\n docCoverage: true,\n referenceSolution: \"canonical/example-studio-custom-input.ts\",\n\n prompt: {\n text: `Build a custom string input component for Sanity Studio that shows\na character count below the input field. The component should accept\na maxLength option from the field schema and display a warning when\nthe text exceeds the limit.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Implements a React component that renders a text input\",\n \"Displays a live character count\",\n \"Reads maxLength from schema options\",\n \"Shows a visual warning when limit is exceeded\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"Uses the Sanity UI library for styling\",\n \"Calls onChange with patch operations\",\n ],\n },\n ],\n\n baseline: { enabled: true, rubric: \"full\" },\n status: \"draft\",\n})\n";
470
+ /** Map of task ID (filename stem) → raw TypeScript source */
471
+ export const taskTsFiles = {
472
+ "example-groq-blog-listing": exampleGroqBlogListingTs,
473
+ "example-id-based-ref": exampleIdBasedRefTs,
474
+ "example-path-based-ref": examplePathBasedRefTs,
475
+ "example-perspective-ref": examplePerspectiveRefTs,
476
+ "example-studio-custom-input": exampleStudioCustomInputTs,
477
+ };
478
+ /** List of TS task file stems, in alphabetical order */
479
+ export const TASK_TS_FILE_NAMES = ["example-groq-blog-listing", "example-id-based-ref", "example-path-based-ref", "example-perspective-ref", "example-studio-custom-input"];
@@ -15,6 +15,6 @@ export * from "./schemas/index.js";
15
15
  export * from "./ports/index.js";
16
16
  export * from "./services/index.js";
17
17
  export * from "./examples/index.js";
18
- export { defineConfig, defineFeatures, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
19
- export type { PresetConfig, PricingEntry, PromptEntry, SourceEntry, } from "./config-helpers.js";
18
+ export { defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
19
+ export type { PricingEntry, PromptEntry, SourceEntry, } from "./config-helpers.js";
20
20
  export { env } from "./env-helper.js";
@@ -18,5 +18,5 @@ export * from "./examples/index.js";
18
18
  // ---------------------------------------------------------------------------
19
19
  // Architecture overhaul — Phase 0 helpers
20
20
  // ---------------------------------------------------------------------------
21
- export { defineConfig, defineFeatures, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
21
+ export { defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
22
22
  export { env } from "./env-helper.js";
@@ -140,6 +140,8 @@ export interface ResolvedConfig {
140
140
  apiUrl: string;
141
141
  /** AILF API key (from AILF_API_KEY env var) */
142
142
  apiKey?: string;
143
+ /** External preset file paths or npm package names to load */
144
+ presets?: string[];
143
145
  }
144
146
  /**
145
147
  * Application context — the complete dependency carrier.
@@ -57,5 +57,6 @@ export declare const EvalConfigSchema: z.ZodObject<{
57
57
  source: z.ZodOptional<z.ZodString>;
58
58
  tasks: z.ZodOptional<z.ZodArray<z.ZodString>>;
59
59
  urls: z.ZodOptional<z.ZodArray<z.ZodString>>;
60
+ presets: z.ZodOptional<z.ZodArray<z.ZodString>>;
60
61
  }, z.core.$strict>;
61
62
  export type EvalConfig = z.infer<typeof EvalConfigSchema>;
@@ -81,5 +81,15 @@ export const EvalConfigSchema = z
81
81
  tasks: z.array(z.string()).optional(),
82
82
  /** Doc source URL overrides */
83
83
  urls: z.array(z.string().url()).optional(),
84
+ /**
85
+ * External presets to load — file paths or npm package names.
86
+ *
87
+ * Each entry is resolved as:
88
+ * - Relative path (./foo or ../foo): loaded from disk via jiti
89
+ * - Package name: resolved via Node require
90
+ *
91
+ * Presets are registered in order after built-in presets.
92
+ */
93
+ presets: z.array(z.string()).optional(),
84
94
  })
85
95
  .strict();
@@ -81,6 +81,7 @@ export declare const PipelineRequestSchema: z.ZodObject<{
81
81
  }>>;
82
82
  tasks: z.ZodOptional<z.ZodArray<z.ZodString>>;
83
83
  urls: z.ZodOptional<z.ZodArray<z.ZodString>>;
84
+ presets: z.ZodOptional<z.ZodArray<z.ZodString>>;
84
85
  }, z.core.$strip>;
85
86
  /** Inferred TypeScript type for a pipeline request payload. */
86
87
  export type PipelineRequest = z.infer<typeof PipelineRequestSchema>;
@@ -89,4 +89,6 @@ export const PipelineRequestSchema = z.object({
89
89
  taskMode: z.enum(["content-lake", "yaml", "inline"]).optional(),
90
90
  tasks: z.array(z.string()).optional(),
91
91
  urls: z.array(z.string().url()).optional(),
92
+ /** External preset file paths or npm package names to load */
93
+ presets: z.array(z.string()).optional(),
92
94
  });
@@ -79,7 +79,6 @@ export declare const FeatureSchema: z.ZodObject<{
79
79
  planned: "planned";
80
80
  "out-of-scope": "out-of-scope";
81
81
  }>;
82
- taskCount: z.ZodOptional<z.ZodNumber>;
83
82
  }, z.core.$strip>;
84
83
  /** Inferred TypeScript type for a product feature. */
85
84
  export type Feature = z.infer<typeof FeatureSchema>;
@@ -104,7 +103,6 @@ export declare const FeatureRegistrySchema: z.ZodObject<{
104
103
  planned: "planned";
105
104
  "out-of-scope": "out-of-scope";
106
105
  }>;
107
- taskCount: z.ZodOptional<z.ZodNumber>;
108
106
  }, z.core.$strip>>;
109
107
  }, z.core.$strip>;
110
108
  /** Inferred TypeScript type for the feature registry. */
@@ -108,7 +108,6 @@ export const FeatureSchema = z.object({
108
108
  priority: z.enum(["critical", "high", "medium", "low"]),
109
109
  sections: z.array(z.string().min(1)).min(1),
110
110
  status: z.enum(["covered", "uncovered", "planned", "out-of-scope"]),
111
- taskCount: z.number().int().min(0).optional(),
112
111
  });
113
112
  /**
114
113
  * Schema for the full config/features.yaml config file.
@@ -226,6 +226,19 @@ export interface MCPServerTaskDefinition extends TaskCommonFields {
226
226
  content: string;
227
227
  }[];
228
228
  };
229
+ /**
230
+ * Override model list (provider IDs). When set, only these models are
231
+ * used instead of models filtered from the registry by `mcp-server` mode.
232
+ *
233
+ * @example ["anthropic:messages:claude-opus-4-6"]
234
+ */
235
+ models?: string[];
236
+ /**
237
+ * Maximum number of tool call rounds before forcing a final text response.
238
+ * Each round may include multiple parallel tool calls.
239
+ * @default 5
240
+ */
241
+ maxToolRounds?: number;
229
242
  }
230
243
  /**
231
244
  * Agent harness mode — evaluates autonomous agents in a sandbox.
@@ -14,7 +14,7 @@ export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from
14
14
  export type { DocumentRef } from "../../ailf-shared/index.d.ts";
15
15
  export type { StoredBaseline, StoredReport, StoredRun, StoredTaskResult, StoredTrace, SchemaVersioned, } from "./storage-schema.js";
16
16
  export { CURRENT_SCHEMA_VERSION, isSchemaVersioned, migrateDocument, } from "./storage-schema.js";
17
- export type { AssertionRegistration, FixtureResolverRegistration, ModeRegistration, PluginManifest, PluginRegistry, PresetDefinition, ReportSinkRegistration, RubricTemplateRegistration, } from "./plugin-registry.js";
17
+ export type { AssertionRegistration, FixtureResolverRegistration, ModeBase, ModeRegistration, PluginManifest, PluginRegistry, PresetDefinition, ReportSinkRegistration, RubricTemplateRegistration, } from "./plugin-registry.js";
18
18
  export { InMemoryPluginRegistry } from "./plugin-registry.js";
19
19
  export type { AgentHarnessConfig, AgentHarnessModeConfig, CustomModeConfig, EvalModeConfig, EvalModeType, KnowledgeBaseRef, KnowledgeProbeModeConfig, LiteracyModeConfig, MCPServerConfig, MCPServerModeConfig, ProbeStrategy, SandboxConfig, ToolDef, } from "./eval-mode-config.js";
20
20
  export { evalModeType } from "./eval-mode-config.js";
@@ -575,8 +575,6 @@ export interface ProductFeature {
575
575
  sections: string[];
576
576
  /** Coverage status */
577
577
  status: "covered" | "out-of-scope" | "planned" | "uncovered";
578
- /** Number of evaluation tasks (if covered) */
579
- taskCount?: number;
580
578
  }
581
579
  /** Full classification of a content release for evaluation */
582
580
  export interface ReleaseClassification {
@@ -1,12 +1,12 @@
1
1
  /**
2
2
  * Plugin registry — typed extension points for AILF evaluation capabilities.
3
3
  *
4
- * Twelve extension points: evaluation modes, providers, assertions,
5
- * rubric templates, fixture resolvers, report sinks, dashboard renderers,
6
- * prompt templates, scoring profiles, doc fetcher factory, source defs,
7
- * and feature defs.
8
- *
9
- * Presets bundle multiple extensions into a single installable unit.
4
+ * Three-tier architecture:
5
+ * - **Mode bases** define evaluation methodology (rubrics, scoring, prompts)
6
+ * - **Domain presets** target a mode base and add domain config (sources,
7
+ * features, doc fetcher)
8
+ * - **Framework assertions** are generic evaluation primitives available to
9
+ * all modes
10
10
  *
11
11
  * @see docs/design-docs/architecture-overhaul/extensibility-plugins.md
12
12
  */
@@ -33,8 +33,12 @@ export interface AssertionRegistration {
33
33
  type: string;
34
34
  /** Human-readable label */
35
35
  label: string;
36
- /** Which modes this assertion is compatible with */
37
- compatibleModes: string[];
36
+ /**
37
+ * Which modes this assertion is compatible with.
38
+ * When omitted, the assertion is compatible with all modes.
39
+ * When specified, acts as a whitelist of mode IDs.
40
+ */
41
+ compatibleModes?: string[];
38
42
  /** Assertion handler module path */
39
43
  handlerModule: string;
40
44
  }
@@ -65,6 +69,30 @@ export interface ReportSinkRegistration {
65
69
  /** Sink module path */
66
70
  handlerModule: string;
67
71
  }
72
+ /**
73
+ * ModeBase — shared evaluation methodology for a mode.
74
+ *
75
+ * Defines HOW you evaluate (rubrics, scoring, prompts) independently of
76
+ * WHAT you're evaluating (sources, features, docs). Multiple domain presets
77
+ * can target the same mode base and inherit its defaults.
78
+ *
79
+ * Example: the "literacy" mode base defines rubric templates for
80
+ * task-completion, code-correctness, and doc-coverage. Both a Sanity docs
81
+ * preset and an external docs preset can target "literacy" and inherit
82
+ * these rubrics without redefining them.
83
+ */
84
+ export interface ModeBase {
85
+ /** The mode registration (handler, provider patterns, rubric template IDs) */
86
+ mode: ModeRegistration;
87
+ /** Default rubric templates for this mode */
88
+ rubricTemplates?: RubricTemplateRegistration[];
89
+ /** Default scoring profiles for this mode (profile name → dimension weights) */
90
+ scoringProfiles?: Record<string, Record<string, number>>;
91
+ /** Default prompt templates for this mode (template name → template) */
92
+ promptTemplates?: Record<string, PromptTemplate>;
93
+ /** Mode-specific assertion types (beyond framework builtins) */
94
+ assertions?: AssertionRegistration[];
95
+ }
68
96
  /** Plugin manifest describing a single plugin */
69
97
  export interface PluginManifest {
70
98
  /** Plugin name (npm package style) */
@@ -80,32 +108,49 @@ export interface PluginManifest {
80
108
  /** Dependencies on other plugins */
81
109
  requires?: string[];
82
110
  }
83
- /** A preset bundles multiple extensions into an installable unit */
111
+ /**
112
+ * A domain preset targets a mode base and adds domain-specific configuration.
113
+ *
114
+ * The preset inherits evaluation methodology (rubrics, scoring, prompts) from
115
+ * its mode base. It can optionally override any inherited values.
116
+ */
84
117
  export interface PresetDefinition {
85
- /** Preset name */
118
+ /** Preset name (unique identifier) */
86
119
  name: string;
87
120
  /** Plugin manifest */
88
121
  manifest: PluginManifest;
89
- /** Evaluation modes to register */
90
- modes?: ModeRegistration[];
91
- /** Assertion types to register */
92
- assertions?: AssertionRegistration[];
93
- /** Rubric templates to register */
94
- rubricTemplates?: RubricTemplateRegistration[];
95
- /** Fixture resolvers to register */
122
+ /**
123
+ * Lifecycle status — mirrors task status semantics.
124
+ * active: registered and used in evaluations (default)
125
+ * draft: registered but skipped unless explicitly targeted
126
+ * paused: registered but skipped (can be resumed)
127
+ * archived: not registered
128
+ */
129
+ status?: "active" | "archived" | "draft" | "paused";
130
+ /**
131
+ * Which mode this preset targets (by mode ID).
132
+ * Links to a registered ModeBase. The preset inherits rubrics,
133
+ * scoring profiles, and prompt templates from the base.
134
+ */
135
+ mode: string;
136
+ /** Fixture resolvers */
96
137
  fixtureResolvers?: FixtureResolverRegistration[];
97
- /** Report sinks to register */
138
+ /** Report sinks */
98
139
  reportSinks?: ReportSinkRegistration[];
99
- /** Prompt templates keyed by template name (e.g. "with-docs", "agentic") */
100
- promptTemplates?: Record<string, PromptTemplate>;
101
- /** Scoring profiles mapping profile name to dimension-weight pairs */
102
- scoringProfiles?: Record<string, Record<string, number>>;
103
140
  /** Factory function that creates a DocFetcher instance */
104
141
  docFetcher?: () => DocFetcher;
105
142
  /** Documentation source definitions (production, branch, local, etc.) */
106
143
  sourceDefs?: SourceEntry[];
107
144
  /** Product feature registry for coverage tracking */
108
145
  featureDefs?: FeatureRegistry;
146
+ /** Override rubric templates (merged by ID with mode base) */
147
+ rubricTemplates?: RubricTemplateRegistration[];
148
+ /** Override scoring profiles (merged by name with mode base) */
149
+ scoringProfiles?: Record<string, Record<string, number>>;
150
+ /** Override prompt templates (merged by name with mode base) */
151
+ promptTemplates?: Record<string, PromptTemplate>;
152
+ /** Additional mode-specific assertions */
153
+ assertions?: AssertionRegistration[];
109
154
  }
110
155
  /**
111
156
  * PluginRegistry — central registry for all AILF extensions.
@@ -154,10 +199,16 @@ export interface PluginRegistry {
154
199
  registerSourceDefs(sources: SourceEntry[]): void;
155
200
  /** Get all registered source definitions */
156
201
  getSourceDefs(): SourceEntry[];
157
- /** Register a feature registry (last-write-wins) */
202
+ /** Register a feature registry (merged by feature ID with existing) */
158
203
  registerFeatureDefs(features: FeatureRegistry): void;
159
204
  /** Get the registered feature registry, if any */
160
205
  getFeatureDefs(): FeatureRegistry | undefined;
206
+ /** Register a mode base (evaluation methodology) */
207
+ registerModeBase(base: ModeBase): void;
208
+ /** Get a mode base by mode ID */
209
+ getModeBase(modeId: string): ModeBase | undefined;
210
+ /** Get all registered mode bases */
211
+ getModeBases(): ModeBase[];
161
212
  /** Get all registered presets */
162
213
  getPresets(): PresetDefinition[];
163
214
  }
@@ -170,6 +221,7 @@ export declare class InMemoryPluginRegistry implements PluginRegistry {
170
221
  private readonly rubricTemplates_;
171
222
  private readonly fixtureResolvers_;
172
223
  private readonly reportSinks_;
224
+ private readonly modeBases_;
173
225
  private readonly presets_;
174
226
  private promptTemplates_;
175
227
  private scoringProfiles_;
@@ -199,4 +251,7 @@ export declare class InMemoryPluginRegistry implements PluginRegistry {
199
251
  getSourceDefs(): SourceEntry[];
200
252
  registerFeatureDefs(features: FeatureRegistry): void;
201
253
  getFeatureDefs(): FeatureRegistry | undefined;
254
+ registerModeBase(base: ModeBase): void;
255
+ getModeBase(modeId: string): ModeBase | undefined;
256
+ getModeBases(): ModeBase[];
202
257
  }