@sanity/ailf 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. package/dist/_vendor/ailf-core/examples/index.d.ts +16 -12
  2. package/dist/_vendor/ailf-core/examples/index.js +19 -12
  3. package/dist/_vendor/ailf-core/ports/context.d.ts +4 -0
  4. package/dist/adapters/task-sources/repo-schemas.d.ts +12 -2
  5. package/dist/adapters/task-sources/repo-schemas.js +28 -2
  6. package/dist/cli.js +0 -0
  7. package/dist/commands/init.js +17 -5
  8. package/dist/commands/pipeline-action.js +44 -6
  9. package/dist/commands/publish.js +2 -1
  10. package/dist/commands/validate-tasks.js +4 -1
  11. package/dist/composition-root.js +9 -5
  12. package/dist/orchestration/build-app-context.js +2 -0
  13. package/package.json +1 -1
  14. package/dist/commands/update-quality-scores.d.ts +0 -5
  15. package/dist/commands/update-quality-scores.js +0 -20
  16. package/dist/lib/agent-behavior-report.d.ts +0 -8
  17. package/dist/lib/agent-behavior-report.js +0 -185
  18. package/dist/lib/baseline.d.ts +0 -19
  19. package/dist/lib/baseline.js +0 -153
  20. package/dist/lib/calculate-scores.d.ts +0 -23
  21. package/dist/lib/calculate-scores.js +0 -42
  22. package/dist/lib/compare.d.ts +0 -18
  23. package/dist/lib/compare.js +0 -170
  24. package/dist/lib/coverage-audit.d.ts +0 -4
  25. package/dist/lib/coverage-audit.js +0 -42
  26. package/dist/lib/discovery-report.d.ts +0 -13
  27. package/dist/lib/discovery-report.js +0 -57
  28. package/dist/lib/fetch-docs.d.ts +0 -30
  29. package/dist/lib/fetch-docs.js +0 -171
  30. package/dist/lib/generate-configs.d.ts +0 -25
  31. package/dist/lib/generate-configs.js +0 -42
  32. package/dist/lib/grader-api.d.ts +0 -21
  33. package/dist/lib/grader-api.js +0 -34
  34. package/dist/lib/grader-compare.d.ts +0 -19
  35. package/dist/lib/grader-compare.js +0 -91
  36. package/dist/lib/grader-consistency.d.ts +0 -27
  37. package/dist/lib/grader-consistency.js +0 -79
  38. package/dist/lib/grader-sensitivity.d.ts +0 -19
  39. package/dist/lib/grader-sensitivity.js +0 -75
  40. package/dist/lib/grader-validate.d.ts +0 -19
  41. package/dist/lib/grader-validate.js +0 -78
  42. package/dist/lib/measure-retrieval.d.ts +0 -14
  43. package/dist/lib/measure-retrieval.js +0 -71
  44. package/dist/lib/pr-comment.d.ts +0 -16
  45. package/dist/lib/pr-comment.js +0 -28
  46. package/dist/lib/readiness-report.d.ts +0 -13
  47. package/dist/lib/readiness-report.js +0 -108
  48. package/dist/lib/webhook-server.d.ts +0 -11
  49. package/dist/lib/webhook-server.js +0 -24
  50. package/dist/lib/weekly-digest.d.ts +0 -24
  51. package/dist/lib/weekly-digest.js +0 -148
  52. package/dist/orchestration/env-bridge.d.ts +0 -21
  53. package/dist/orchestration/env-bridge.js +0 -66
  54. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  55. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  56. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  57. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  58. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  59. package/dist/pipeline/steps/compare-step.js +0 -90
  60. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  61. package/dist/pipeline/steps/eval-step.js +0 -347
  62. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  63. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  64. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  65. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  66. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  67. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  68. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  69. package/dist/pipeline/steps/publish-report-step.js +0 -243
  70. package/dist/pipeline/steps/report-step.d.ts +0 -13
  71. package/dist/pipeline/steps/report-step.js +0 -56
  72. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  73. package/dist/pipeline/steps/update-scores-step.js +0 -42
  74. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  75. package/dist/scripts/agent-behavior-report.js +0 -315
  76. package/dist/scripts/baseline.d.ts +0 -43
  77. package/dist/scripts/baseline.js +0 -267
  78. package/dist/scripts/calculate-scores.d.ts +0 -166
  79. package/dist/scripts/calculate-scores.js +0 -1296
  80. package/dist/scripts/compare.d.ts +0 -22
  81. package/dist/scripts/compare.js +0 -334
  82. package/dist/scripts/coverage-audit.d.ts +0 -44
  83. package/dist/scripts/coverage-audit.js +0 -209
  84. package/dist/scripts/debug-eval.d.ts +0 -19
  85. package/dist/scripts/debug-eval.js +0 -73
  86. package/dist/scripts/discovery-report.d.ts +0 -58
  87. package/dist/scripts/discovery-report.js +0 -250
  88. package/dist/scripts/fetch-docs.d.ts +0 -35
  89. package/dist/scripts/fetch-docs.js +0 -472
  90. package/dist/scripts/generate-configs.d.ts +0 -66
  91. package/dist/scripts/generate-configs.js +0 -459
  92. package/dist/scripts/grader-api.d.ts +0 -27
  93. package/dist/scripts/grader-api.js +0 -206
  94. package/dist/scripts/grader-compare.d.ts +0 -22
  95. package/dist/scripts/grader-compare.js +0 -368
  96. package/dist/scripts/grader-consistency.d.ts +0 -20
  97. package/dist/scripts/grader-consistency.js +0 -313
  98. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  99. package/dist/scripts/grader-sensitivity.js +0 -354
  100. package/dist/scripts/grader-validate.d.ts +0 -19
  101. package/dist/scripts/grader-validate.js +0 -267
  102. package/dist/scripts/measure-retrieval.d.ts +0 -10
  103. package/dist/scripts/measure-retrieval.js +0 -145
  104. package/dist/scripts/pipeline.d.ts +0 -76
  105. package/dist/scripts/pipeline.js +0 -1031
  106. package/dist/scripts/pr-comment.d.ts +0 -10
  107. package/dist/scripts/pr-comment.js +0 -510
  108. package/dist/scripts/readiness-report.d.ts +0 -88
  109. package/dist/scripts/readiness-report.js +0 -342
  110. package/dist/scripts/update-quality-scores.d.ts +0 -15
  111. package/dist/scripts/update-quality-scores.js +0 -184
  112. package/dist/scripts/validate.d.ts +0 -13
  113. package/dist/scripts/validate.js +0 -79
  114. package/dist/scripts/webhook-server.d.ts +0 -26
  115. package/dist/scripts/webhook-server.js +0 -147
  116. package/dist/scripts/weekly-digest.d.ts +0 -24
  117. package/dist/scripts/weekly-digest.js +0 -144
  118. package/dist/sinks/format-slack.d.ts +0 -64
  119. package/dist/sinks/format-slack.js +0 -306
  120. package/dist/sinks/slack-sink.d.ts +0 -27
  121. package/dist/sinks/slack-sink.js +0 -78
  122. package/dist/sinks/webhook-sink.d.ts +0 -19
  123. package/dist/sinks/webhook-sink.js +0 -50
  124. package/tasks/.expanded.agentic.yaml +0 -51
  125. package/tasks/.expanded.yaml +0 -66
@@ -90,9 +90,9 @@ export declare const thresholdYaml = "# Example quality threshold configuration.
90
90
  /** Parsed ailf-config example data (JSON-safe) */
91
91
  export declare const ailfConfigData: {
92
92
  readonly source: {
93
- readonly projectId: "your-project-id";
94
- readonly dataset: "production";
95
- readonly baseUrl: "https://your-site.example.com/docs";
93
+ readonly projectId: "3do82whm";
94
+ readonly dataset: "next";
95
+ readonly baseUrl: "https://www.sanity.io/docs";
96
96
  };
97
97
  readonly triggers: {
98
98
  readonly pr: {
@@ -110,20 +110,21 @@ export declare const ailfConfigData: {
110
110
  };
111
111
  };
112
112
  /** Raw YAML string for ailf-config example (preserves comments) */
113
- export declare const ailfConfigYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# .ailf/config.yaml \u2014 AI Literacy Framework project configuration\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# This file configures how the AILF evaluation pipeline runs in this\n# repository. 
Place it at .ailf/config.yaml in your project root.\n#\n# Docs: https://github.com/sanity-io/ai-literacy-framework\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n# Documentation source \u2014 where to fetch content for evaluation.\n#\n# projectId \u2014 your Sanity project ID (find it in sanity.io/manage)\n# dataset \u2014 the dataset to query (e.g., \"production\", \"staging\")\n# baseUrl \u2014 the public URL of your documentation site\n# (used by agentic mode to test agent discoverability)\nsource:\n projectId: \"your-project-id\"\n dataset: production\n baseUrl: \"https://your-site.example.com/docs\"\n\n# Trigger configuration \u2014 when evaluations run automatically.\n#\n# Each key is a trigger context. The pipeline checks which trigger\n# matches the current execution context (PR, merge, schedule, etc.)\n# and applies its settings.\n#\n# mode options:\n# validate-only \u2014 check that task YAML parses correctly (fast, no LLM calls)\n# eval \u2014 run the full evaluation pipeline\n#\n# paths \u2014 only trigger when files matching these globs change\n# blocking \u2014 if true, a failing eval blocks the PR merge\n# notify \u2014 if true, post results to configured notification channels\ntriggers:\n # On pull requests: just validate task files parse correctly\n pr:\n mode: validate-only\n\n # When .ailf/ files change in a PR: run a real evaluation\n pr-task-change:\n mode: eval\n paths: [\".ailf/**\"]\n\n # On merge to main: run evaluation (non-blocking)\n main:\n mode: eval\n blocking: false\n notify: true\n";
113
+ export declare const ailfConfigYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# .ailf/config.yaml \u2014 AI Literacy Framework project configuration\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# This file configures how the AILF evaluation pipeline runs in this\n# repository. Place it at .ailf/config.yaml in your project root.\n#\n# Evaluations are submitted to the AILF API (ailf-api.sanity.build).\n# The API handles LLM calls, doc fetching, grading, and report\n# publishing. Your repo only needs one secret: AILF_API_KEY.\n#\n# Docs: https://github.com/sanity-io/ai-literacy-framework\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n# Documentation source \u2014 which docs are being evaluated.\n#\n# This tells the pipeline which Sanity project and dataset contain\n# the documentation under test. 
For most users, this is Sanity's own\n# docs project.\n#\n# projectId \u2014 Sanity project ID (find yours at sanity.io/manage)\n# dataset \u2014 the dataset to query (e.g., \"production\", \"next\")\n# baseUrl \u2014 the public URL of your documentation site\n# (used by agentic mode to test agent discoverability)\nsource:\n projectId: \"3do82whm\"\n dataset: next\n baseUrl: \"https://www.sanity.io/docs\"\n\n# Trigger configuration \u2014 when evaluations run automatically.\n#\n# Each key is a trigger context. The pipeline checks which trigger\n# matches the current execution context (PR, merge, schedule, etc.)\n# and applies its settings.\n#\n# mode options:\n# validate-only \u2014 check that task YAML parses correctly (fast, no LLM calls)\n# eval \u2014 run the full evaluation pipeline\n#\n# paths \u2014 only trigger when files matching these globs change\n# blocking \u2014 if true, a failing eval blocks the PR merge\n# notify \u2014 if true, post results to configured notification channels\ntriggers:\n # On pull requests: just validate task files parse correctly\n pr:\n mode: validate-only\n\n # When .ailf/ files change in a PR: run a real evaluation\n pr-task-change:\n mode: eval\n paths: [\".ailf/**\"]\n\n # On merge to main: run evaluation (non-blocking)\n main:\n mode: eval\n blocking: false\n notify: true\n";
114
114
  /** Parsed task data for example-groq-blog-listing (JSON-safe) */
115
115
  export declare const exampleGroqBlogListingData: readonly [{
116
116
  readonly id: "example-groq-blog-listing";
117
117
  readonly description: "Example — Blog listing with GROQ queries";
118
- readonly canonical_docs: readonly [{
118
+ readonly featureArea: "groq";
119
+ readonly canonicalDocs: readonly [{
119
120
  readonly slug: "groq-introduction";
120
121
  readonly reason: "Core GROQ syntax and query language reference";
121
122
  }, {
122
123
  readonly slug: "how-queries-work";
123
124
  readonly reason: "Query execution model and best practices";
124
125
  }];
125
- readonly doc_coverage: true;
126
- readonly reference_solution: "canonical/example-groq-blog-listing.ts";
126
+ readonly docCoverage: true;
127
+ readonly referenceSolution: "canonical/example-groq-blog-listing.ts";
127
128
  readonly vars: {
128
129
  readonly task: "Create a Next.js page component that lists blog posts from Sanity\nusing GROQ. The page should display the title, slug, and published\ndate for each post, sorted by most recent first. Use the Sanity\nclient to fetch data.\n";
129
130
  readonly docs: "";
@@ -143,17 +144,18 @@ export declare const exampleGroqBlogListingData: readonly [{
143
144
  };
144
145
  }];
145
146
  /** Raw YAML string for example-groq-blog-listing (preserves comments) */
146
- export declare const exampleGroqBlogListingYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Blog listing with GROQ queries\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# This is a starter template \u2014 edit it for your own documentation.\n# Each task evaluates whether an AI coding agent can implement a feature\n# using your docs as context. Delete this file or replace it entirely.\n#\n# To disable this task without deleting the file, set:\n# baseline:\n# enabled: false\n#\n# Full field reference:\n# https://github.com/sanity-io/ai-literacy-framework/blob/main/docs/CONTRIBUTING_TASKS.md\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n# Unique identifier \u2014 lowercase alphanumeric with hyphens.\n# Must be unique across all task files in .ailf/tasks/.\n- id: example-groq-blog-listing\n\n # Short human-readable summary. 
Shown in score tables and reports.\n description: \"Example \u2014 Blog listing with GROQ queries\"\n\n # Feature area this task belongs to. Tasks with the same area are\n # grouped together in score summaries. Use a short kebab-case name.\n # featureArea is inferred from the filename by default, but you can\n # set it explicitly here.\n # featureArea: groq\n\n # Gold-standard documentation articles for this task. The pipeline\n # fetches these from Sanity and injects them into the prompt for\n # baseline evaluation. Each entry needs:\n # slug \u2014 the article's URL slug in your docs site\n # reason \u2014 why this doc is relevant (helps with auditing)\n canonical_docs:\n - slug: groq-introduction\n reason: \"Core GROQ syntax and query language reference\"\n - slug: how-queries-work\n reason: \"Query execution model and best practices\"\n\n # When true, the pipeline auto-generates an additional rubric that\n # checks whether the LLM's response actually used the provided docs.\n doc_coverage: true\n\n # Path to a gold-standard implementation, relative to canonical/.\n # The grader uses this as a reference when scoring code correctness.\n reference_solution: canonical/example-groq-blog-listing.ts\n\n # vars.task \u2014 the implementation prompt given to the LLM.\n # Write this as if you're asking a developer to build the feature.\n # Be specific about requirements so the grader can evaluate clearly.\n #\n # vars.docs \u2014 leave empty (\"\"). The pipeline fills this in:\n # \u2022 Gold variant: injected with canonical doc content\n # \u2022 Baseline variant: left empty (tests model knowledge alone)\n vars:\n task: |\n Create a Next.js page component that lists blog posts from Sanity\n using GROQ. The page should display the title, slug, and published\n date for each post, sorted by most recent first. 
Use the Sanity\n client to fetch data.\n docs: \"\"\n\n # Grading assertions \u2014 how the LLM's response is scored.\n #\n # \"llm-rubric\" assertions use a grader LLM to score against criteria.\n # The \"template\" references a rubric from config/rubrics.yaml.\n # The \"criteria\" are task-specific bullets injected into the template.\n #\n # Available templates:\n # task-completion \u2014 did the LLM implement the feature? (weight: 0.50)\n # code-correctness \u2014 is the code idiomatic and correct? (weight: 0.25)\n #\n # You can also use value-based assertions:\n # - type: contains\n # value: \"client.fetch\"\n # - type: contains-any\n # value: [\"createClient\", \"sanityClient\"]\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Uses the groq tagged template literal\"\n - \"Fetches blog posts with title, slug, and publishedAt fields\"\n - \"Orders results by publishedAt in descending order\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses createClient from @sanity/client or next-sanity\"\n - \"Exports a valid Next.js page component\"\n\n # Baseline variant configuration.\n # enabled \u2014 set to false to skip this task entirely\n # rubric \u2014 \"abbreviated\" (faster, default), \"full\", or \"none\"\n baseline:\n enabled: true\n rubric: abbreviated\n";
147
+ export declare const exampleGroqBlogListingYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Blog listing with GROQ queries\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# This is a starter template \u2014 edit it for your own documentation.\n# Each task evaluates whether an AI coding agent can implement a feature\n# using your docs as context. Delete this file or replace it entirely.\n#\n# To disable this task without deleting the file, set:\n# baseline:\n# enabled: false\n#\n# Full field reference:\n# https://github.com/sanity-io/ai-literacy-framework/blob/main/docs/CONTRIBUTING_TASKS.md\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n# Unique identifier \u2014 lowercase alphanumeric with hyphens.\n# Must be unique across all task files in .ailf/tasks/.\n- id: example-groq-blog-listing\n\n # Short human-readable summary. 
Shown in score tables and reports.\n description: \"Example \u2014 Blog listing with GROQ queries\"\n\n # Feature area this task belongs to. Tasks with the same area are\n # grouped together in score summaries. Use a short kebab-case name.\n featureArea: groq\n\n # Gold-standard documentation articles for this task. The pipeline\n # fetches these from Sanity and injects them into the prompt for\n # baseline evaluation. Each entry needs:\n # slug \u2014 the article's URL slug in your docs site\n # reason \u2014 why this doc is relevant (helps with auditing)\n canonicalDocs:\n - slug: groq-introduction\n reason: \"Core GROQ syntax and query language reference\"\n - slug: how-queries-work\n reason: \"Query execution model and best practices\"\n\n # When true, the pipeline auto-generates an additional rubric that\n # checks whether the LLM's response actually used the provided docs.\n docCoverage: true\n\n # Path to a gold-standard implementation, relative to canonical/.\n # The grader uses this as a reference when scoring code correctness.\n referenceSolution: canonical/example-groq-blog-listing.ts\n\n # vars.task \u2014 the implementation prompt given to the LLM.\n # Write this as if you're asking a developer to build the feature.\n # Be specific about requirements so the grader can evaluate clearly.\n #\n # vars.docs \u2014 leave empty (\"\"). The pipeline fills this in:\n # \u2022 Gold variant: injected with canonical doc content\n # \u2022 Baseline variant: left empty (tests model knowledge alone)\n vars:\n task: |\n Create a Next.js page component that lists blog posts from Sanity\n using GROQ. The page should display the title, slug, and published\n date for each post, sorted by most recent first. 
Use the Sanity\n client to fetch data.\n docs: \"\"\n\n # Grading assertions \u2014 how the LLM's response is scored.\n #\n # \"llm-rubric\" assertions use a grader LLM to score against criteria.\n # The \"template\" references a rubric from config/rubrics.yaml.\n # The \"criteria\" are task-specific bullets injected into the template.\n #\n # Available templates:\n # task-completion \u2014 did the LLM implement the feature? (weight: 0.50)\n # code-correctness \u2014 is the code idiomatic and correct? (weight: 0.25)\n #\n # You can also use value-based assertions:\n # - type: contains\n # value: \"client.fetch\"\n # - type: contains-any\n # value: [\"createClient\", \"sanityClient\"]\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Uses the groq tagged template literal\"\n - \"Fetches blog posts with title, slug, and publishedAt fields\"\n - \"Orders results by publishedAt in descending order\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses createClient from @sanity/client or next-sanity\"\n - \"Exports a valid Next.js page component\"\n\n # Baseline variant configuration.\n # enabled \u2014 set to false to skip this task entirely\n # rubric \u2014 \"abbreviated\" (faster, default), \"full\", or \"none\"\n baseline:\n enabled: true\n rubric: abbreviated\n";
147
148
  /** Parsed task data for example-studio-custom-input (JSON-safe) */
148
149
  export declare const exampleStudioCustomInputData: readonly [{
149
150
  readonly id: "example-studio-custom-input";
150
151
  readonly description: "Example — Custom input component in Sanity Studio";
151
- readonly canonical_docs: readonly [{
152
+ readonly featureArea: "studio";
153
+ readonly canonicalDocs: readonly [{
152
154
  readonly slug: "custom-input-components";
153
155
  readonly reason: "Guide for building custom form inputs in Sanity Studio";
154
156
  }];
155
- readonly doc_coverage: true;
156
- readonly reference_solution: "canonical/example-studio-custom-input.ts";
157
+ readonly docCoverage: true;
158
+ readonly referenceSolution: "canonical/example-studio-custom-input.ts";
157
159
  readonly vars: {
158
160
  readonly task: "Build a custom string input component for Sanity Studio that shows\na character count below the input field. The component should accept\na maxLength option from the field schema and display a warning when\nthe text exceeds the limit.\n";
159
161
  readonly docs: "";
@@ -173,7 +175,7 @@ export declare const exampleStudioCustomInputData: readonly [{
173
175
  };
174
176
  }];
175
177
  /** Raw YAML string for example-studio-custom-input (preserves comments) */
176
- export declare const exampleStudioCustomInputYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Custom input component in Sanity Studio\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# This is a starter template \u2014 edit it for your own documentation.\n# Delete this file or replace it with your own tasks.\n#\n# To disable without deleting:\n# baseline:\n# enabled: false\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n- id: example-studio-custom-input\n description: \"Example \u2014 Custom input component in Sanity Studio\"\n\n canonical_docs:\n - slug: custom-input-components\n reason: \"Guide for building custom form inputs in Sanity Studio\"\n\n doc_coverage: true\n reference_solution: canonical/example-studio-custom-input.ts\n\n vars:\n task: |\n Build a custom string input component for Sanity Studio that shows\n a 
character count below the input field. The component should accept\n a maxLength option from the field schema and display a warning when\n the text exceeds the limit.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Implements a React component that renders a text input\"\n - \"Displays a live character count\"\n - \"Reads maxLength from schema options\"\n - \"Shows a visual warning when limit is exceeded\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses the Sanity UI library for styling\"\n - \"Calls onChange with patch operations\"\n\n baseline:\n enabled: true\n rubric: abbreviated\n";
178
+ export declare const exampleStudioCustomInputYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Custom input component in Sanity Studio\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# This is a starter template \u2014 edit it for your own documentation.\n# Delete this file or replace it with your own tasks.\n#\n# To disable without deleting:\n# baseline:\n# enabled: false\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n- id: example-studio-custom-input\n description: \"Example \u2014 Custom input component in Sanity Studio\"\n\n featureArea: studio\n\n canonicalDocs:\n - slug: custom-input-components\n reason: \"Guide for building custom form inputs in Sanity Studio\"\n\n docCoverage: true\n referenceSolution: canonical/example-studio-custom-input.ts\n\n vars:\n task: |\n Build a custom string input component for Sanity Studio 
that shows\n a character count below the input field. The component should accept\n a maxLength option from the field schema and display a warning when\n the text exceeds the limit.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Implements a React component that renders a text input\"\n - \"Displays a live character count\"\n - \"Reads maxLength from schema options\"\n - \"Shows a visual warning when limit is exceeded\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses the Sanity UI library for styling\"\n - \"Calls onChange with patch operations\"\n\n baseline:\n enabled: true\n rubric: abbreviated\n";
177
179
  /** All task example data as a flat array (JSON-safe) */
178
180
  export declare const allTaskData: readonly unknown[];
179
181
  /** Map of task ID (filename stem) → raw YAML string (preserves comments) */
@@ -188,3 +190,5 @@ export interface ExampleRecord {
188
190
  yaml: string;
189
191
  }
190
192
  export declare const EXAMPLES: Record<ExampleType, ExampleRecord>;
193
+ /** GitHub Actions workflow template for AI Literacy evaluation */
194
+ export declare const workflowYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# AI Literacy Evaluation \u2014 GitHub Actions workflow\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# This workflow submits evaluations to the AILF API when task or config\n# files change in a pull request. 
The API handles all processing\n# (LLM calls, doc fetching, grading, report publishing).\n#\n# Prerequisites:\n# Add one secret to your repository (Settings \u2192 Secrets \u2192 Actions):\n# AILF_API_KEY \u2014 your API key (starts with ailf_live_sk_)\n#\n# Customization:\n# - Adjust `paths` to match your documentation file locations\n# - Set full_eval to true for comprehensive (slower) evaluation\n# - See: https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/API_GATEWAY.md\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\nname: AI Literacy Eval\n\non:\n pull_request:\n branches: [main]\n paths:\n - \".ailf/**\"\n\n # Manual trigger from the Actions tab\n workflow_dispatch:\n inputs:\n full_eval:\n description: \"Run full evaluation (all tests, slower)\"\n type: boolean\n default: false\n\nconcurrency:\n group: ailf-eval-${{ github.event.pull_request.number || github.ref }}\n cancel-in-progress: true\n\njobs:\n evaluate:\n name: AI Literacy Evaluation\n runs-on: ubuntu-latest\n permissions:\n pull-requests: write\n steps:\n # \u2500\u2500\u2500 Submit evaluation to the AILF API \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n - name: Submit evaluation\n id: submit\n env:\n AILF_API_KEY: ${{ secrets.AILF_API_KEY }}\n FULL_EVAL: ${{ inputs.full_eval || 'false' }}\n run: |\n if [ \"$FULL_EVAL\" = \"true\" ]; then\n DEBUG_FIELD=\"\"\n else\n DEBUG_FIELD='\"debug\": { \"enabled\": true, \"firstN\": 2 },'\n fi\n\n PAYLOAD=$(cat <<EOF\n {\n \"mode\": \"baseline\",\n ${DEBUG_FIELD}\n \"publish\": true,\n 
\"compare\": true\n }\n EOF\n )\n\n RESPONSE=$(curl -sf -X POST \\\n -H \"Authorization: Bearer $AILF_API_KEY\" \\\n -H \"Content-Type: application/json\" \\\n https://ailf-api.sanity.build/v1/pipeline \\\n -d \"$PAYLOAD\")\n\n JOB_ID=$(echo \"$RESPONSE\" | jq -r '.jobId')\n echo \"job_id=$JOB_ID\" >> $GITHUB_OUTPUT\n echo \"\uD83D\uDCCB Submitted job: $JOB_ID\"\n\n # \u2500\u2500\u2500 Poll for results (long-polling) \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n - name: Wait for results\n id: results\n env:\n AILF_API_KEY: ${{ secrets.AILF_API_KEY }}\n JOB_ID: ${{ steps.submit.outputs.job_id }}\n run: |\n for i in $(seq 1 40); do\n RESPONSE=$(curl -s \\\n -H \"Authorization: Bearer $AILF_API_KEY\" \\\n -H \"Prefer: wait=25\" \\\n \"https://ailf-api.sanity.build/v1/jobs/$JOB_ID\")\n\n STATUS=$(echo \"$RESPONSE\" | jq -r '.status')\n\n case \"$STATUS\" in\n completed)\n echo \"status=completed\" >> $GITHUB_OUTPUT\n echo \"report_id=$(echo $RESPONSE | jq -r '.reportId // empty')\" >> $GITHUB_OUTPUT\n echo \"score=$(echo $RESPONSE | jq -r '.score // empty')\" >> $GITHUB_OUTPUT\n echo \"\u2705 Evaluation completed\"\n exit 0\n ;;\n failed|timed-out)\n echo \"status=$STATUS\" >> $GITHUB_OUTPUT\n echo \"::error::Evaluation $STATUS\"\n exit 1\n ;;\n *)\n echo \"\u23F3 [$i/40] $STATUS\"\n ;;\n esac\n done\n\n echo \"::error::Timed out waiting for evaluation\"\n exit 1\n\n # \u2500\u2500\u2500 Post results to PR \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n - name: Post PR comment\n if: >-\n always() && github.event_name == 'pull_request' &&\n steps.submit.outputs.job_id != ''\n uses: actions/github-script@v7\n env:\n JOB_STATUS: ${{ steps.results.outputs.status || 'unknown' }}\n REPORT_ID: ${{ 
steps.results.outputs.report_id || '' }}\n JOB_ID: ${{ steps.submit.outputs.job_id }}\n SCORE: ${{ steps.results.outputs.score || '' }}\n with:\n script: |\n const marker = '<!-- ailf-score-report -->';\n const status = process.env.JOB_STATUS;\n const reportId = process.env.REPORT_ID;\n const jobId = process.env.JOB_ID;\n const score = process.env.SCORE;\n\n let icon, message;\n if (status === 'completed') {\n icon = '\u2705';\n message = score\n ? `Evaluation completed \u2014 score: **${score}/100**`\n : 'Evaluation completed successfully.';\n } else if (status === 'failed' || status === 'timed-out') {\n icon = '\u26A0\uFE0F';\n message = `Evaluation ${status}.`;\n } else {\n icon = '\u23F3';\n message = 'Evaluation status unknown (may still be running).';\n }\n\n let body = `${marker}\\n## ${icon} AI Literacy Evaluation\\n\\n${message}\\n`;\n if (reportId) {\n body += `\\n\uD83D\uDD17 [View detailed report](https://ailf-api.sanity.build/v1/reports/${reportId})\\n`;\n }\n body += `\\n<sub>Job: \\`${jobId}\\`</sub>\\n`;\n\n const { data: comments } = await github.rest.issues.listComments({\n owner: context.repo.owner,\n repo: context.repo.repo,\n issue_number: context.issue.number,\n });\n const existing = comments.find(c => c.body?.includes(marker));\n\n if (existing) {\n await github.rest.issues.updateComment({\n owner: context.repo.owner,\n repo: context.repo.repo,\n comment_id: existing.id,\n body,\n });\n } else {\n await github.rest.issues.createComment({\n owner: context.repo.owner,\n repo: context.repo.repo,\n issue_number: context.issue.number,\n body,\n });\n }\n\n # \u2500\u2500\u2500 Job summary \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n - name: Summary\n if: always()\n env:\n JOB_STATUS: ${{ steps.results.outputs.status || 'unknown' }}\n 
REPORT_ID: ${{ steps.results.outputs.report_id || '' }}\n JOB_ID: ${{ steps.submit.outputs.job_id }}\n SCORE: ${{ steps.results.outputs.score || '' }}\n run: |\n {\n echo \"## \uD83D\uDCCA AI Literacy Evaluation\"\n echo \"\"\n echo \"| Field | Value |\"\n echo \"|-------|-------|\"\n echo \"| Job | \\`$JOB_ID\\` |\"\n echo \"| Status | $JOB_STATUS |\"\n [ -n \"$SCORE\" ] && echo \"| Score | $SCORE/100 |\"\n [ -n \"$REPORT_ID\" ] && echo \"| Report | [$REPORT_ID](https://ailf-api.sanity.build/v1/reports/$REPORT_ID) |\"\n } >> \"$GITHUB_STEP_SUMMARY\"\n";
@@ -119,9 +119,9 @@ export const thresholdYaml = "# Example quality threshold configuration.\n#\n# T
119
119
  /** Parsed ailf-config example data (JSON-safe) */
120
120
  export const ailfConfigData = {
121
121
  "source": {
122
- "projectId": "your-project-id",
123
- "dataset": "production",
124
- "baseUrl": "https://your-site.example.com/docs"
122
+ "projectId": "3do82whm",
123
+ "dataset": "next",
124
+ "baseUrl": "https://www.sanity.io/docs"
125
125
  },
126
126
  "triggers": {
127
127
  "pr": {
@@ -141,13 +141,14 @@ export const ailfConfigData = {
141
141
  }
142
142
  };
143
143
  /** Raw YAML string for ailf-config example (preserves comments) */
144
- export const ailfConfigYaml = "# ──────────────────────────────────────────────────────────────────────\n# .ailf/config.yaml — AI Literacy Framework project configuration\n# ──────────────────────────────────────────────────────────────────────\n#\n# This file configures how the AILF evaluation pipeline runs in this\n# repository. Place it at .ailf/config.yaml in your project root.\n#\n# Docs: https://github.com/sanity-io/ai-literacy-framework\n# ──────────────────────────────────────────────────────────────────────\n\n# Documentation source — where to fetch content for evaluation.\n#\n# projectId — your Sanity project ID (find it in sanity.io/manage)\n# dataset — the dataset to query (e.g., \"production\", \"staging\")\n# baseUrl — the public URL of your documentation site\n# (used by agentic mode to test agent discoverability)\nsource:\n projectId: \"your-project-id\"\n dataset: production\n baseUrl: \"https://your-site.example.com/docs\"\n\n# Trigger configuration — when evaluations run automatically.\n#\n# Each key is a trigger context. The pipeline checks which trigger\n# matches the current execution context (PR, merge, schedule, etc.)\n# and applies its settings.\n#\n# mode options:\n# validate-only — check that task YAML parses correctly (fast, no LLM calls)\n# eval — run the full evaluation pipeline\n#\n# paths — only trigger when files matching these globs change\n# blocking — if true, a failing eval blocks the PR merge\n# notify — if true, post results to configured notification channels\ntriggers:\n # On pull requests: just validate task files parse correctly\n pr:\n mode: validate-only\n\n # When .ailf/ files change in a PR: run a real evaluation\n pr-task-change:\n mode: eval\n paths: [\".ailf/**\"]\n\n # On merge to main: run evaluation (non-blocking)\n main:\n mode: eval\n blocking: false\n notify: true\n";
144
+ export const ailfConfigYaml = "# ──────────────────────────────────────────────────────────────────────\n# .ailf/config.yaml — AI Literacy Framework project configuration\n# ──────────────────────────────────────────────────────────────────────\n#\n# This file configures how the AILF evaluation pipeline runs in this\n# repository. Place it at .ailf/config.yaml in your project root.\n#\n# Evaluations are submitted to the AILF API (ailf-api.sanity.build).\n# The API handles LLM calls, doc fetching, grading, and report\n# publishing. Your repo only needs one secret: AILF_API_KEY.\n#\n# Docs: https://github.com/sanity-io/ai-literacy-framework\n# ──────────────────────────────────────────────────────────────────────\n\n# Documentation source — which docs are being evaluated.\n#\n# This tells the pipeline which Sanity project and dataset contain\n# the documentation under test. For most users, this is Sanity's own\n# docs project.\n#\n# projectId — Sanity project ID (find yours at sanity.io/manage)\n# dataset — the dataset to query (e.g., \"production\", \"next\")\n# baseUrl — the public URL of your documentation site\n# (used by agentic mode to test agent discoverability)\nsource:\n projectId: \"3do82whm\"\n dataset: next\n baseUrl: \"https://www.sanity.io/docs\"\n\n# Trigger configuration — when evaluations run automatically.\n#\n# Each key is a trigger context. 
The pipeline checks which trigger\n# matches the current execution context (PR, merge, schedule, etc.)\n# and applies its settings.\n#\n# mode options:\n# validate-only — check that task YAML parses correctly (fast, no LLM calls)\n# eval — run the full evaluation pipeline\n#\n# paths — only trigger when files matching these globs change\n# blocking — if true, a failing eval blocks the PR merge\n# notify — if true, post results to configured notification channels\ntriggers:\n # On pull requests: just validate task files parse correctly\n pr:\n mode: validate-only\n\n # When .ailf/ files change in a PR: run a real evaluation\n pr-task-change:\n mode: eval\n paths: [\".ailf/**\"]\n\n # On merge to main: run evaluation (non-blocking)\n main:\n mode: eval\n blocking: false\n notify: true\n";
145
145
  /** Parsed task data for example-groq-blog-listing (JSON-safe) */
146
146
  export const exampleGroqBlogListingData = [
147
147
  {
148
148
  "id": "example-groq-blog-listing",
149
149
  "description": "Example — Blog listing with GROQ queries",
150
- "canonical_docs": [
150
+ "featureArea": "groq",
151
+ "canonicalDocs": [
151
152
  {
152
153
  "slug": "groq-introduction",
153
154
  "reason": "Core GROQ syntax and query language reference"
@@ -157,8 +158,8 @@ export const exampleGroqBlogListingData = [
157
158
  "reason": "Query execution model and best practices"
158
159
  }
159
160
  ],
160
- "doc_coverage": true,
161
- "reference_solution": "canonical/example-groq-blog-listing.ts",
161
+ "docCoverage": true,
162
+ "referenceSolution": "canonical/example-groq-blog-listing.ts",
162
163
  "vars": {
163
164
  "task": "Create a Next.js page component that lists blog posts from Sanity\nusing GROQ. The page should display the title, slug, and published\ndate for each post, sorted by most recent first. Use the Sanity\nclient to fetch data.\n",
164
165
  "docs": ""
@@ -189,20 +190,21 @@ export const exampleGroqBlogListingData = [
189
190
  }
190
191
  ];
191
192
  /** Raw YAML string for example-groq-blog-listing (preserves comments) */
192
- export const exampleGroqBlogListingYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Blog listing with GROQ queries\n# ──────────────────────────────────────────────────────────────────────\n#\n# This is a starter template — edit it for your own documentation.\n# Each task evaluates whether an AI coding agent can implement a feature\n# using your docs as context. Delete this file or replace it entirely.\n#\n# To disable this task without deleting the file, set:\n# baseline:\n# enabled: false\n#\n# Full field reference:\n# https://github.com/sanity-io/ai-literacy-framework/blob/main/docs/CONTRIBUTING_TASKS.md\n# ──────────────────────────────────────────────────────────────────────\n\n# Unique identifier — lowercase alphanumeric with hyphens.\n# Must be unique across all task files in .ailf/tasks/.\n- id: example-groq-blog-listing\n\n # Short human-readable summary. Shown in score tables and reports.\n description: \"Example — Blog listing with GROQ queries\"\n\n # Feature area this task belongs to. Tasks with the same area are\n # grouped together in score summaries. Use a short kebab-case name.\n # featureArea is inferred from the filename by default, but you can\n # set it explicitly here.\n # featureArea: groq\n\n # Gold-standard documentation articles for this task. The pipeline\n # fetches these from Sanity and injects them into the prompt for\n # baseline evaluation. 
Each entry needs:\n # slug — the article's URL slug in your docs site\n # reason — why this doc is relevant (helps with auditing)\n canonical_docs:\n - slug: groq-introduction\n reason: \"Core GROQ syntax and query language reference\"\n - slug: how-queries-work\n reason: \"Query execution model and best practices\"\n\n # When true, the pipeline auto-generates an additional rubric that\n # checks whether the LLM's response actually used the provided docs.\n doc_coverage: true\n\n # Path to a gold-standard implementation, relative to canonical/.\n # The grader uses this as a reference when scoring code correctness.\n reference_solution: canonical/example-groq-blog-listing.ts\n\n # vars.task — the implementation prompt given to the LLM.\n # Write this as if you're asking a developer to build the feature.\n # Be specific about requirements so the grader can evaluate clearly.\n #\n # vars.docs — leave empty (\"\"). The pipeline fills this in:\n # • Gold variant: injected with canonical doc content\n # • Baseline variant: left empty (tests model knowledge alone)\n vars:\n task: |\n Create a Next.js page component that lists blog posts from Sanity\n using GROQ. The page should display the title, slug, and published\n date for each post, sorted by most recent first. Use the Sanity\n client to fetch data.\n docs: \"\"\n\n # Grading assertions — how the LLM's response is scored.\n #\n # \"llm-rubric\" assertions use a grader LLM to score against criteria.\n # The \"template\" references a rubric from config/rubrics.yaml.\n # The \"criteria\" are task-specific bullets injected into the template.\n #\n # Available templates:\n # task-completion — did the LLM implement the feature? (weight: 0.50)\n # code-correctness — is the code idiomatic and correct? 
(weight: 0.25)\n #\n # You can also use value-based assertions:\n # - type: contains\n # value: \"client.fetch\"\n # - type: contains-any\n # value: [\"createClient\", \"sanityClient\"]\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Uses the groq tagged template literal\"\n - \"Fetches blog posts with title, slug, and publishedAt fields\"\n - \"Orders results by publishedAt in descending order\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses createClient from @sanity/client or next-sanity\"\n - \"Exports a valid Next.js page component\"\n\n # Baseline variant configuration.\n # enabled — set to false to skip this task entirely\n # rubric — \"abbreviated\" (faster, default), \"full\", or \"none\"\n baseline:\n enabled: true\n rubric: abbreviated\n";
193
+ export const exampleGroqBlogListingYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Blog listing with GROQ queries\n# ──────────────────────────────────────────────────────────────────────\n#\n# This is a starter template — edit it for your own documentation.\n# Each task evaluates whether an AI coding agent can implement a feature\n# using your docs as context. Delete this file or replace it entirely.\n#\n# To disable this task without deleting the file, set:\n# baseline:\n# enabled: false\n#\n# Full field reference:\n# https://github.com/sanity-io/ai-literacy-framework/blob/main/docs/CONTRIBUTING_TASKS.md\n# ──────────────────────────────────────────────────────────────────────\n\n# Unique identifier — lowercase alphanumeric with hyphens.\n# Must be unique across all task files in .ailf/tasks/.\n- id: example-groq-blog-listing\n\n # Short human-readable summary. Shown in score tables and reports.\n description: \"Example — Blog listing with GROQ queries\"\n\n # Feature area this task belongs to. Tasks with the same area are\n # grouped together in score summaries. Use a short kebab-case name.\n featureArea: groq\n\n # Gold-standard documentation articles for this task. The pipeline\n # fetches these from Sanity and injects them into the prompt for\n # baseline evaluation. 
Each entry needs:\n # slug — the article's URL slug in your docs site\n # reason — why this doc is relevant (helps with auditing)\n canonicalDocs:\n - slug: groq-introduction\n reason: \"Core GROQ syntax and query language reference\"\n - slug: how-queries-work\n reason: \"Query execution model and best practices\"\n\n # When true, the pipeline auto-generates an additional rubric that\n # checks whether the LLM's response actually used the provided docs.\n docCoverage: true\n\n # Path to a gold-standard implementation, relative to canonical/.\n # The grader uses this as a reference when scoring code correctness.\n referenceSolution: canonical/example-groq-blog-listing.ts\n\n # vars.task — the implementation prompt given to the LLM.\n # Write this as if you're asking a developer to build the feature.\n # Be specific about requirements so the grader can evaluate clearly.\n #\n # vars.docs — leave empty (\"\"). The pipeline fills this in:\n # • Gold variant: injected with canonical doc content\n # • Baseline variant: left empty (tests model knowledge alone)\n vars:\n task: |\n Create a Next.js page component that lists blog posts from Sanity\n using GROQ. The page should display the title, slug, and published\n date for each post, sorted by most recent first. Use the Sanity\n client to fetch data.\n docs: \"\"\n\n # Grading assertions — how the LLM's response is scored.\n #\n # \"llm-rubric\" assertions use a grader LLM to score against criteria.\n # The \"template\" references a rubric from config/rubrics.yaml.\n # The \"criteria\" are task-specific bullets injected into the template.\n #\n # Available templates:\n # task-completion — did the LLM implement the feature? (weight: 0.50)\n # code-correctness — is the code idiomatic and correct? 
(weight: 0.25)\n #\n # You can also use value-based assertions:\n # - type: contains\n # value: \"client.fetch\"\n # - type: contains-any\n # value: [\"createClient\", \"sanityClient\"]\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Uses the groq tagged template literal\"\n - \"Fetches blog posts with title, slug, and publishedAt fields\"\n - \"Orders results by publishedAt in descending order\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses createClient from @sanity/client or next-sanity\"\n - \"Exports a valid Next.js page component\"\n\n # Baseline variant configuration.\n # enabled — set to false to skip this task entirely\n # rubric — \"abbreviated\" (faster, default), \"full\", or \"none\"\n baseline:\n enabled: true\n rubric: abbreviated\n";
193
194
  /** Parsed task data for example-studio-custom-input (JSON-safe) */
194
195
  export const exampleStudioCustomInputData = [
195
196
  {
196
197
  "id": "example-studio-custom-input",
197
198
  "description": "Example — Custom input component in Sanity Studio",
198
- "canonical_docs": [
199
+ "featureArea": "studio",
200
+ "canonicalDocs": [
199
201
  {
200
202
  "slug": "custom-input-components",
201
203
  "reason": "Guide for building custom form inputs in Sanity Studio"
202
204
  }
203
205
  ],
204
- "doc_coverage": true,
205
- "reference_solution": "canonical/example-studio-custom-input.ts",
206
+ "docCoverage": true,
207
+ "referenceSolution": "canonical/example-studio-custom-input.ts",
206
208
  "vars": {
207
209
  "task": "Build a custom string input component for Sanity Studio that shows\na character count below the input field. The component should accept\na maxLength option from the field schema and display a warning when\nthe text exceeds the limit.\n",
208
210
  "docs": ""
@@ -234,7 +236,7 @@ export const exampleStudioCustomInputData = [
234
236
  }
235
237
  ];
236
238
  /** Raw YAML string for example-studio-custom-input (preserves comments) */
237
- export const exampleStudioCustomInputYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Custom input component in Sanity Studio\n# ──────────────────────────────────────────────────────────────────────\n#\n# This is a starter template — edit it for your own documentation.\n# Delete this file or replace it with your own tasks.\n#\n# To disable without deleting:\n# baseline:\n# enabled: false\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-studio-custom-input\n description: \"Example — Custom input component in Sanity Studio\"\n\n canonical_docs:\n - slug: custom-input-components\n reason: \"Guide for building custom form inputs in Sanity Studio\"\n\n doc_coverage: true\n reference_solution: canonical/example-studio-custom-input.ts\n\n vars:\n task: |\n Build a custom string input component for Sanity Studio that shows\n a character count below the input field. The component should accept\n a maxLength option from the field schema and display a warning when\n the text exceeds the limit.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Implements a React component that renders a text input\"\n - \"Displays a live character count\"\n - \"Reads maxLength from schema options\"\n - \"Shows a visual warning when limit is exceeded\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses the Sanity UI library for styling\"\n - \"Calls onChange with patch operations\"\n\n baseline:\n enabled: true\n rubric: abbreviated\n";
239
+ export const exampleStudioCustomInputYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Custom input component in Sanity Studio\n# ──────────────────────────────────────────────────────────────────────\n#\n# This is a starter template — edit it for your own documentation.\n# Delete this file or replace it with your own tasks.\n#\n# To disable without deleting:\n# baseline:\n# enabled: false\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-studio-custom-input\n description: \"Example — Custom input component in Sanity Studio\"\n\n featureArea: studio\n\n canonicalDocs:\n - slug: custom-input-components\n reason: \"Guide for building custom form inputs in Sanity Studio\"\n\n docCoverage: true\n referenceSolution: canonical/example-studio-custom-input.ts\n\n vars:\n task: |\n Build a custom string input component for Sanity Studio that shows\n a character count below the input field. The component should accept\n a maxLength option from the field schema and display a warning when\n the text exceeds the limit.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Implements a React component that renders a text input\"\n - \"Displays a live character count\"\n - \"Reads maxLength from schema options\"\n - \"Shows a visual warning when limit is exceeded\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses the Sanity UI library for styling\"\n - \"Calls onChange with patch operations\"\n\n baseline:\n enabled: true\n rubric: abbreviated\n";
238
240
  // ---------------------------------------------------------------------------
239
241
  // Aggregate task exports
240
242
  // ---------------------------------------------------------------------------
@@ -283,3 +285,8 @@ export const EXAMPLES = {
283
285
  yaml: Object.values(taskYamlFiles).join("\n"),
284
286
  },
285
287
  };
288
+ // ---------------------------------------------------------------------------
289
+ // Raw file exports (non-data files, exported as raw strings)
290
+ // ---------------------------------------------------------------------------
291
+ /** GitHub Actions workflow template for AI Literacy evaluation */
292
+ export const workflowYaml = "# ──────────────────────────────────────────────────────────────────────\n# AI Literacy Evaluation — GitHub Actions workflow\n# ──────────────────────────────────────────────────────────────────────\n#\n# This workflow submits evaluations to the AILF API when task or config\n# files change in a pull request. The API handles all processing\n# (LLM calls, doc fetching, grading, report publishing).\n#\n# Prerequisites:\n# Add one secret to your repository (Settings → Secrets → Actions):\n# AILF_API_KEY — your API key (starts with ailf_live_sk_)\n#\n# Customization:\n# - Adjust `paths` to match your documentation file locations\n# - Set full_eval to true for comprehensive (slower) evaluation\n# - See: https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/API_GATEWAY.md\n# ──────────────────────────────────────────────────────────────────────\n\nname: AI Literacy Eval\n\non:\n pull_request:\n branches: [main]\n paths:\n - \".ailf/**\"\n\n # Manual trigger from the Actions tab\n workflow_dispatch:\n inputs:\n full_eval:\n description: \"Run full evaluation (all tests, slower)\"\n type: boolean\n default: false\n\nconcurrency:\n group: ailf-eval-${{ github.event.pull_request.number || github.ref }}\n cancel-in-progress: true\n\njobs:\n evaluate:\n name: AI Literacy Evaluation\n runs-on: ubuntu-latest\n permissions:\n pull-requests: write\n steps:\n # ─── Submit evaluation to the AILF API ─────────────────────\n - name: Submit evaluation\n id: submit\n env:\n AILF_API_KEY: ${{ secrets.AILF_API_KEY }}\n FULL_EVAL: ${{ inputs.full_eval || 'false' }}\n run: |\n if [ \"$FULL_EVAL\" = \"true\" ]; then\n DEBUG_FIELD=\"\"\n else\n DEBUG_FIELD='\"debug\": { \"enabled\": true, \"firstN\": 2 },'\n fi\n\n PAYLOAD=$(cat <<EOF\n {\n \"mode\": \"baseline\",\n ${DEBUG_FIELD}\n \"publish\": true,\n \"compare\": true\n }\n EOF\n )\n\n RESPONSE=$(curl -sf -X POST \\\n -H \"Authorization: Bearer $AILF_API_KEY\" \\\n -H \"Content-Type: 
application/json\" \\\n https://ailf-api.sanity.build/v1/pipeline \\\n -d \"$PAYLOAD\")\n\n JOB_ID=$(echo \"$RESPONSE\" | jq -r '.jobId')\n echo \"job_id=$JOB_ID\" >> $GITHUB_OUTPUT\n echo \"📋 Submitted job: $JOB_ID\"\n\n # ─── Poll for results (long-polling) ───────────────────────\n - name: Wait for results\n id: results\n env:\n AILF_API_KEY: ${{ secrets.AILF_API_KEY }}\n JOB_ID: ${{ steps.submit.outputs.job_id }}\n run: |\n for i in $(seq 1 40); do\n RESPONSE=$(curl -s \\\n -H \"Authorization: Bearer $AILF_API_KEY\" \\\n -H \"Prefer: wait=25\" \\\n \"https://ailf-api.sanity.build/v1/jobs/$JOB_ID\")\n\n STATUS=$(echo \"$RESPONSE\" | jq -r '.status')\n\n case \"$STATUS\" in\n completed)\n echo \"status=completed\" >> $GITHUB_OUTPUT\n echo \"report_id=$(echo $RESPONSE | jq -r '.reportId // empty')\" >> $GITHUB_OUTPUT\n echo \"score=$(echo $RESPONSE | jq -r '.score // empty')\" >> $GITHUB_OUTPUT\n echo \"✅ Evaluation completed\"\n exit 0\n ;;\n failed|timed-out)\n echo \"status=$STATUS\" >> $GITHUB_OUTPUT\n echo \"::error::Evaluation $STATUS\"\n exit 1\n ;;\n *)\n echo \"⏳ [$i/40] $STATUS\"\n ;;\n esac\n done\n\n echo \"::error::Timed out waiting for evaluation\"\n exit 1\n\n # ─── Post results to PR ────────────────────────────────────\n - name: Post PR comment\n if: >-\n always() && github.event_name == 'pull_request' &&\n steps.submit.outputs.job_id != ''\n uses: actions/github-script@v7\n env:\n JOB_STATUS: ${{ steps.results.outputs.status || 'unknown' }}\n REPORT_ID: ${{ steps.results.outputs.report_id || '' }}\n JOB_ID: ${{ steps.submit.outputs.job_id }}\n SCORE: ${{ steps.results.outputs.score || '' }}\n with:\n script: |\n const marker = '<!-- ailf-score-report -->';\n const status = process.env.JOB_STATUS;\n const reportId = process.env.REPORT_ID;\n const jobId = process.env.JOB_ID;\n const score = process.env.SCORE;\n\n let icon, message;\n if (status === 'completed') {\n icon = '✅';\n message = score\n ? 
`Evaluation completed — score: **${score}/100**`\n : 'Evaluation completed successfully.';\n } else if (status === 'failed' || status === 'timed-out') {\n icon = '⚠️';\n message = `Evaluation ${status}.`;\n } else {\n icon = '⏳';\n message = 'Evaluation status unknown (may still be running).';\n }\n\n let body = `${marker}\\n## ${icon} AI Literacy Evaluation\\n\\n${message}\\n`;\n if (reportId) {\n body += `\\n🔗 [View detailed report](https://ailf-api.sanity.build/v1/reports/${reportId})\\n`;\n }\n body += `\\n<sub>Job: \\`${jobId}\\`</sub>\\n`;\n\n const { data: comments } = await github.rest.issues.listComments({\n owner: context.repo.owner,\n repo: context.repo.repo,\n issue_number: context.issue.number,\n });\n const existing = comments.find(c => c.body?.includes(marker));\n\n if (existing) {\n await github.rest.issues.updateComment({\n owner: context.repo.owner,\n repo: context.repo.repo,\n comment_id: existing.id,\n body,\n });\n } else {\n await github.rest.issues.createComment({\n owner: context.repo.owner,\n repo: context.repo.repo,\n issue_number: context.issue.number,\n body,\n });\n }\n\n # ─── Job summary ───────────────────────────────────────────\n - name: Summary\n if: always()\n env:\n JOB_STATUS: ${{ steps.results.outputs.status || 'unknown' }}\n REPORT_ID: ${{ steps.results.outputs.report_id || '' }}\n JOB_ID: ${{ steps.submit.outputs.job_id }}\n SCORE: ${{ steps.results.outputs.score || '' }}\n run: |\n {\n echo \"## 📊 AI Literacy Evaluation\"\n echo \"\"\n echo \"| Field | Value |\"\n echo \"|-------|-------|\"\n echo \"| Job | \\`$JOB_ID\\` |\"\n echo \"| Status | $JOB_STATUS |\"\n [ -n \"$SCORE\" ] && echo \"| Score | $SCORE/100 |\"\n [ -n \"$REPORT_ID\" ] && echo \"| Report | [$REPORT_ID](https://ailf-api.sanity.build/v1/reports/$REPORT_ID) |\"\n } >> \"$GITHUB_STEP_SUMMARY\"\n";
@@ -95,6 +95,10 @@ export interface ResolvedConfig {
95
95
  taskSourceType?: "content-lake" | "yaml";
96
96
  /** Path to repo-based tasks directory (e.g., .ailf/tasks/) */
97
97
  repoTasksPath?: string;
98
+ /** Report store project ID from .ailf/config.yaml reportStore block */
99
+ reportStoreProjectId?: string;
100
+ /** Report store dataset from .ailf/config.yaml reportStore block */
101
+ reportStoreDataset?: string;
98
102
  /** Callback URL configuration for API-triggered evaluations */
99
103
  callback?: {
100
104
  url: string;
@@ -185,10 +185,20 @@ export declare const RepoTaskFileSchema: z.ZodArray<z.ZodObject<{
185
185
  }, z.core.$strip>>;
186
186
  }, z.core.$strip>>;
187
187
  /**
188
- * Zod schema for .ailf/config.yaml — controls how and when evaluations
189
- * are triggered from an external repository.
188
+ * Zod schema for .ailf/config.yaml — controls documentation source,
189
+ * report destination, and trigger behavior for evaluations from an
190
+ * external repository.
190
191
  */
191
192
  export declare const RepoConfigSchema: z.ZodObject<{
193
+ source: z.ZodOptional<z.ZodObject<{
194
+ projectId: z.ZodOptional<z.ZodString>;
195
+ dataset: z.ZodOptional<z.ZodString>;
196
+ baseUrl: z.ZodOptional<z.ZodString>;
197
+ }, z.core.$strip>>;
198
+ reportStore: z.ZodOptional<z.ZodObject<{
199
+ projectId: z.ZodString;
200
+ dataset: z.ZodString;
201
+ }, z.core.$strip>>;
192
202
  triggers: z.ZodOptional<z.ZodObject<{
193
203
  pr: z.ZodOptional<z.ZodObject<{
194
204
  mode: z.ZodDefault<z.ZodEnum<{
@@ -189,10 +189,36 @@ const ScheduleTriggerSchema = TriggerConfigSchema.extend({
189
189
  cron: z.string().min(1),
190
190
  });
191
191
  /**
192
- * Zod schema for .ailf/config.yaml — controls how and when evaluations
193
- * are triggered from an external repository.
192
+ * Documentation source configuration.
193
+ * Defines which Sanity project holds the documentation being evaluated.
194
+ */
195
// Every field is optional, and the `source` block as a whole may be omitted.
// `projectId` and `dataset` must be non-empty when present; `baseUrl` must
// be a valid URL when present.
const SourceConfigSchema = z.optional(z.object({
    projectId: z.optional(z.string().min(1)),
    dataset: z.optional(z.string().min(1)),
    baseUrl: z.optional(z.string().url()),
}));
202
/**
 * Report store configuration.
 *
 * Names the Sanity project and dataset that receive `ailf.report`
 * documents; this should match the project/dataset configured in the
 * user's Studio. The block itself may be omitted, but when it is present
 * both `projectId` and `dataset` are required, non-empty strings.
 * The API token comes from the AILF_REPORT_SANITY_API_TOKEN env var.
 */
const ReportStoreConfigSchema = z.optional(z.object({
    projectId: z.string().min(1),
    dataset: z.string().min(1),
}));
214
+ /**
215
+ * Zod schema for .ailf/config.yaml — controls documentation source,
216
+ * report destination, and trigger behavior for evaluations from an
217
+ * external repository.
194
218
  */
195
219
  export const RepoConfigSchema = z.object({
220
+ source: SourceConfigSchema,
221
+ reportStore: ReportStoreConfigSchema,
196
222
  triggers: z
197
223
  .object({
198
224
  pr: TriggerConfigSchema.optional(),
package/dist/cli.js CHANGED
File without changes
@@ -18,7 +18,7 @@
18
18
  import { Command } from "commander";
19
19
  import { existsSync, mkdirSync, writeFileSync } from "fs";
20
20
  import { resolve, relative } from "path";
21
- import { ailfConfigData, ailfConfigYaml, taskYamlFiles, TASK_FILE_NAMES, allTaskData, } from "../_vendor/ailf-core/index.js";
21
+ import { ailfConfigData, ailfConfigYaml, taskYamlFiles, TASK_FILE_NAMES, allTaskData, workflowYaml, } from "../_vendor/ailf-core/index.js";
22
22
  // ---------------------------------------------------------------------------
23
23
  // Command factory
24
24
  // ---------------------------------------------------------------------------
@@ -127,7 +127,17 @@ async function runInit(opts) {
127
127
  else {
128
128
  skipped.push(rel(targetDir, gitignorePath));
129
129
  }
130
- // 5. Summary
130
+ // 5. Write GitHub Actions workflow
131
+ const workflowDir = resolve(targetDir, ".github", "workflows");
132
+ const workflowPath = resolve(workflowDir, "ailf-eval.yml");
133
+ mkdirSync(workflowDir, { recursive: true });
134
+ if (writeIfNew(workflowPath, workflowYaml, force)) {
135
+ written.push(rel(targetDir, workflowPath));
136
+ }
137
+ else {
138
+ skipped.push(rel(targetDir, workflowPath));
139
+ }
140
+ // 6. Summary
131
141
  console.log();
132
142
  if (written.length > 0) {
133
143
  for (const f of written) {
@@ -143,8 +153,10 @@ async function runInit(opts) {
143
153
  console.log();
144
154
  console.log(" Next steps:");
145
155
  console.log();
146
- console.log(` 1. Edit ${rel(targetDir, resolve(ailfDir, `config${ext}`))} with your Sanity project settings`);
147
- console.log(` 2. Customize the example tasks in ${rel(targetDir, tasksDir)}/`);
148
- console.log(" 3. Run: ailf pipeline --repo-tasks-path .ailf/tasks/");
156
+ console.log(` 1. Customize the example tasks in ${rel(targetDir, tasksDir)}/`);
157
+ console.log(" 2. Validate: npx @sanity/ailf validate-tasks .ailf/tasks/");
158
+ console.log(" 3. Set AILF_API_KEY in your environment (e.g. in a local .env file)");
159
+ console.log(" and add it as a GitHub Actions secret (Settings → Secrets)");
160
+ console.log(" 4. Push — the workflow at .github/workflows/ailf-eval.yml handles the rest");
149
161
  console.log();
150
162
  }
@@ -10,7 +10,7 @@
10
10
  *
11
11
  * @see packages/eval/src/orchestration/ for the step-based pipeline
12
12
  */
13
- import { writeFileSync } from "fs";
13
+ import { existsSync, readFileSync, writeFileSync } from "fs";
14
14
  import { dirname, resolve } from "path";
15
15
  import { fileURLToPath } from "url";
16
16
  import { classifyUrls } from "../pipeline/classify-url.js";
@@ -18,6 +18,8 @@ import { assessImpact, buildReverseMapping, } from "../pipeline/reverse-mapping.
18
18
  import { buildAppContext } from "../orchestration/build-app-context.js";
19
19
  import { buildStepSequence } from "../orchestration/build-step-sequence.js";
20
20
  import { orchestratePipeline } from "../orchestration/pipeline-orchestrator.js";
21
+ import { load } from "js-yaml";
22
+ import { parseRepoConfig, } from "../adapters/task-sources/repo-schemas.js";
21
23
  const __dirname = dirname(fileURLToPath(import.meta.url));
22
24
  const ROOT = resolve(__dirname, "..", "..");
23
25
  // ---------------------------------------------------------------------------
@@ -32,6 +34,8 @@ const VALID_SEARCH_MODES = ["open", "origin-only", "off"];
32
34
  * Exported so the plan builder can call it independently.
33
35
  */
34
36
  export function computeResolvedOptions(opts) {
37
+ // Resolve paths relative to the caller's cwd, not the eval package root
38
+ const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
35
39
  // Validate mode
36
40
  const mode = opts.mode;
37
41
  if (!VALID_MODES.includes(mode)) {
@@ -163,14 +167,21 @@ export function computeResolvedOptions(opts) {
163
167
  // Smart default: full runs auto-publish when store is configured
164
168
  publishEnabled = reportStoreConfigured && !debugEnabled;
165
169
  }
166
- // Report store overrides — fall back to the eval dataset so that
167
- // perspective evaluations publish reports to the same dataset the
168
- // Studio is reading from. AILF_REPORT_DATASET wins when set explicitly.
170
+ // Report store overrides — resolution order:
171
+ // 1. Explicit CLI flags (--report-dataset, --report-project)
172
+ // 2. Environment variables (AILF_REPORT_DATASET, AILF_REPORT_PROJECT_ID)
173
+ // 3. .ailf/config.yaml reportStore block (when --repo-tasks-path is set)
174
+ // 4. Eval dataset override (so perspective evals publish to the same dataset)
175
+ const repoConfig = loadRepoConfigIfPresent(opts.repoTasksPath);
169
176
  const reportDataset = opts.reportDataset ??
170
177
  process.env.AILF_REPORT_DATASET ??
178
+ repoConfig?.reportStore?.dataset ??
171
179
  datasetOverride ??
172
180
  undefined;
173
- const reportProjectId = opts.reportProject ?? process.env.AILF_REPORT_PROJECT_ID ?? undefined;
181
+ const reportProjectId = opts.reportProject ??
182
+ process.env.AILF_REPORT_PROJECT_ID ??
183
+ repoConfig?.reportStore?.projectId ??
184
+ undefined;
174
185
  return {
175
186
  allowedOriginArgs,
176
187
  areaOption,
@@ -206,7 +217,9 @@ export function computeResolvedOptions(opts) {
206
217
  skipFetch: opts.skipFetch,
207
218
  source: opts.source,
208
219
  studioOriginOverride,
209
- repoTasksPath: opts.repoTasksPath,
220
+ repoTasksPath: opts.repoTasksPath
221
+ ? resolve(callerCwd, opts.repoTasksPath)
222
+ : undefined,
210
223
  taskOption,
211
224
  taskSourceType: resolveTaskSourceType(opts.taskSource),
212
225
  urlArgs,
@@ -303,3 +316,28 @@ function writePipelineResult(result) {
303
316
  // results/latest/ may not exist yet — not critical
304
317
  }
305
318
  }
319
+ /**
320
+ * Load .ailf/config.yaml if --repo-tasks-path is set and the config file
321
+ * exists. Returns null if not applicable.
322
+ *
323
+ * The config.yaml lives one level up from the tasks/ directory:
324
+ * .ailf/config.yaml ← config
325
+ * .ailf/tasks/ ← repoTasksPath
326
+ */
327
+ function loadRepoConfigIfPresent(repoTasksPath) {
328
+ if (!repoTasksPath)
329
+ return null;
330
+ // .ailf/tasks/ → .ailf/config.yaml
331
+ const configPath = resolve(repoTasksPath, "..", "config.yaml");
332
+ if (!existsSync(configPath))
333
+ return null;
334
+ try {
335
+ const raw = readFileSync(configPath, "utf-8");
336
+ const parsed = load(raw);
337
+ return parseRepoConfig(parsed);
338
+ }
339
+ catch (err) {
340
+ console.warn(` ⚠️ Failed to parse ${configPath}: ${err instanceof Error ? err.message : String(err)}`);
341
+ return null;
342
+ }
343
+ }
@@ -101,7 +101,8 @@ async function runPublishCommand(summaryPath, opts) {
101
101
  // -----------------------------------------------------------------------
102
102
  // 1. Resolve and read the score summary
103
103
  // -----------------------------------------------------------------------
104
- const resolvedPath = resolve(summaryPath);
104
+ const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
105
+ const resolvedPath = resolve(callerCwd, summaryPath);
105
106
  if (!existsSync(resolvedPath)) {
106
107
  console.error(` ✖ File not found: ${resolvedPath}`);
107
108
  console.error();
@@ -24,7 +24,10 @@ export function createValidateTasksCommand() {
24
24
  .argument("[path]", "Path to tasks directory (default: .ailf/tasks/)", ".ailf/tasks")
25
25
  .option("--strict", "Treat warnings as errors", false)
26
26
  .action(async (tasksPath, opts) => {
27
- const resolvedPath = resolve(tasksPath);
27
+ // Resolve relative to the caller's working directory, not the
28
+ // eval package root (which differs when run via bin/ailf.js)
29
+ const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
30
+ const resolvedPath = resolve(callerCwd, tasksPath);
28
31
  if (!existsSync(resolvedPath)) {
29
32
  console.error(`❌ Directory not found: ${resolvedPath}`);
30
33
  process.exit(1);
@@ -43,7 +43,7 @@ export function createAppContext(config) {
43
43
  // Eval runner — Promptfoo subprocess
44
44
  const evalRunner = new PromptfooEvalAdapter(config.rootDir);
45
45
  // Report store — Sanity Content Lake (for publish + auto-compare)
46
- const reportStore = createReportStore();
46
+ const reportStore = createReportStore(config);
47
47
  // Sinks — loaded from config/sinks.yaml
48
48
  const sinks = loadSinks();
49
49
  return {
@@ -75,7 +75,7 @@ function createCache(config) {
75
75
  const token = process.env.AILF_REPORT_SANITY_API_TOKEN ?? process.env.SANITY_API_TOKEN;
76
76
  if (!token)
77
77
  return local;
78
- return new ContentLakeCacheAdapter(local, createReportStore());
78
+ return new ContentLakeCacheAdapter(local, createReportStore(config));
79
79
  }
80
80
  function createTaskSource(config) {
81
81
  // Primary source — selected by config.taskSourceType
@@ -96,10 +96,14 @@ function createTaskSource(config) {
96
96
  }
97
97
  return primary;
98
98
  }
99
- function createReportStore() {
99
+ function createReportStore(config) {
100
100
  return new ReportStore({
101
- dataset: process.env.AILF_REPORT_DATASET ?? undefined,
102
- projectId: process.env.AILF_REPORT_PROJECT_ID ?? undefined,
101
+ dataset: process.env.AILF_REPORT_DATASET ??
102
+ config?.reportStoreDataset ??
103
+ undefined,
104
+ projectId: process.env.AILF_REPORT_PROJECT_ID ??
105
+ config?.reportStoreProjectId ??
106
+ undefined,
103
107
  token: process.env.AILF_REPORT_SANITY_API_TOKEN ??
104
108
  process.env.SANITY_API_TOKEN ??
105
109
  undefined,
@@ -67,6 +67,8 @@ export function mapToResolvedConfig(opts, rootDir) {
67
67
  beforeOption: opts.beforeOption,
68
68
  taskSourceType: opts.taskSourceType,
69
69
  repoTasksPath: opts.repoTasksPath,
70
+ reportStoreProjectId: opts.reportProjectId,
71
+ reportStoreDataset: opts.reportDataset,
70
72
  };
71
73
  }
72
74
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "0.1.0",
3
+ "version": "0.1.2",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "restricted"
@@ -1,5 +0,0 @@
1
- /**
2
- * update-quality-scores command — update QUALITY_SCORE.md from scores.
3
- */
4
- import { Command } from "commander";
5
- export declare function createUpdateQualityScoresCommand(): Command;
@@ -1,20 +0,0 @@
1
- /**
2
- * update-quality-scores command — update QUALITY_SCORE.md from scores.
3
- */
4
- import { Command } from "commander";
5
- export function createUpdateQualityScoresCommand() {
6
- return new Command("update-quality-scores")
7
- .description("Update docs/QUALITY_SCORE.md from score-summary.json")
8
- .action(async () => {
9
- const { updateQualityScores } = await import("../scripts/update-quality-scores.js");
10
- console.log("=== Updating QUALITY_SCORE.md from score-summary.json ===\n");
11
- const result = updateQualityScores();
12
- if (result.success) {
13
- console.log(` ✅ ${result.message}`);
14
- }
15
- else {
16
- console.error(` ❌ ${result.message}`);
17
- process.exit(1);
18
- }
19
- });
20
- }
@@ -1,8 +0,0 @@
1
- /**
2
- * lib/agent-behavior-report.ts — DEPRECATED re-export shim.
3
- * @deprecated Import from ../pipeline/agent-behavior-report.js instead.
4
- */
5
- import "dotenv/config";
6
- export { analyzeResults, CANONICAL_DOC_MAP, detectFeatureArea, } from "../pipeline/agent-behavior-report.js";
7
- export type { AnalysisResult, FeatureAnalysis, TaskBehavior, TestResult, } from "../pipeline/agent-behavior-report.js";
8
- export declare function main(resultsPathArg?: string): void;