@sanity/ailf 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/rubrics.yaml +26 -7
- package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
- package/dist/_vendor/ailf-core/examples/index.js +10 -10
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +1 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +19 -3
- package/dist/_vendor/ailf-core/schemas/pipeline.js +38 -7
- package/dist/_vendor/ailf-core/services/scoring.d.ts +13 -1
- package/dist/_vendor/ailf-core/services/scoring.js +42 -11
- package/dist/adapters/task-sources/content-lake-task-source.js +1 -1
- package/dist/pipeline/calculate-scores.d.ts +3 -3
- package/dist/pipeline/calculate-scores.js +118 -127
- package/dist/pipeline/expand-tasks.d.ts +4 -4
- package/dist/pipeline/expand-tasks.js +3 -3
- package/dist/pipeline/grader-consistency-runner.js +2 -1
- package/dist/pipeline/profile-resolution.d.ts +39 -0
- package/dist/pipeline/profile-resolution.js +69 -0
- package/package.json +3 -3
package/config/rubrics.yaml
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
# rubric text at expansion time.
|
|
7
7
|
#
|
|
8
8
|
# Each dimension is scored on a uniform 0–100 scale. Dimensions are
|
|
9
|
-
# combined into a composite score using
|
|
9
|
+
# combined into a composite score using named scoring profiles below.
|
|
10
10
|
#
|
|
11
11
|
# Each template carries a `dimension` field that tags the scoring
|
|
12
12
|
# dimension it belongs to. This metadata propagates through the
|
|
@@ -51,12 +51,31 @@ templates:
|
|
|
51
51
|
- "80: Minor gaps — almost everything was documented"
|
|
52
52
|
- "100: Complete coverage — all necessary info was in docs"
|
|
53
53
|
|
|
54
|
-
#
|
|
55
|
-
#
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
54
|
+
# Named scoring profiles — each maps dimension → weight; the weights in a profile must sum to 1.0.
|
|
55
|
+
#
|
|
56
|
+
# 'default': Full three-dimension composite for gold/ceiling entries (with docs).
|
|
57
|
+
# 'output-only': Output quality dimensions only — excludes doc-coverage, which
|
|
58
|
+
# is semantically undefined for entries evaluated without docs.
|
|
59
|
+
#
|
|
60
|
+
# See docs/design-docs/named-scoring-profiles.md for the rationale.
|
|
61
|
+
profiles:
|
|
62
|
+
default:
|
|
63
|
+
task-completion: 0.50
|
|
64
|
+
code-correctness: 0.25
|
|
65
|
+
doc-coverage: 0.25
|
|
66
|
+
output-only:
|
|
67
|
+
task-completion: 0.60
|
|
68
|
+
code-correctness: 0.40
|
|
69
|
+
|
|
70
|
+
# Mode-to-profile bindings — which profile to use for each (mode, variant) pair.
|
|
71
|
+
# The scoring engine resolves: mode-profiles.<mode>.<variant> → profile name.
|
|
72
|
+
# Falls back to 'default' when no explicit binding exists.
|
|
73
|
+
mode-profiles:
|
|
74
|
+
baseline:
|
|
75
|
+
gold: default
|
|
76
|
+
baseline: output-only
|
|
77
|
+
agentic:
|
|
78
|
+
gold: default
|
|
60
79
|
|
|
61
80
|
footer:
|
|
62
81
|
'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}'
|
|
@@ -140,12 +140,12 @@ export declare const exampleGroqBlogListingData: readonly [{
|
|
|
140
140
|
}];
|
|
141
141
|
readonly baseline: {
|
|
142
142
|
readonly enabled: true;
|
|
143
|
-
readonly rubric: "
|
|
143
|
+
readonly rubric: "full";
|
|
144
144
|
};
|
|
145
145
|
readonly status: "draft";
|
|
146
146
|
}];
|
|
147
147
|
/** Raw YAML string for example-groq-blog-listing (preserves comments) */
|
|
148
|
-
export declare const exampleGroqBlogListingYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Blog listing with GROQ queries\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# This is a starter template \u2014 edit it for your own documentation.\n# Each task evaluates whether an AI coding agent can implement a feature\n# using your docs as context. Delete this file or replace it entirely.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. 
To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# Full field reference:\n# https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/CONTRIBUTING_TASKS.md\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n# Unique identifier \u2014 lowercase alphanumeric with hyphens.\n# Must be unique across all task files in .ailf/tasks/.\n- id: example-groq-blog-listing\n\n # Short human-readable summary. Shown in score tables and reports.\n description: \"Example \u2014 Blog listing with GROQ queries\"\n\n # Feature area this task belongs to. Tasks with the same area are\n # grouped together in score summaries. Use a short kebab-case name.\n featureArea: groq\n\n # Gold-standard documentation articles for this task. The pipeline\n # fetches these from Sanity and injects them into the prompt for\n # baseline evaluation. 
Each entry needs:\n # slug \u2014 the article's URL slug in your docs site\n # reason \u2014 why this doc is relevant (helps with auditing)\n #\n # This example uses slug-based references \u2014 the simplest form.\n # See the other example tasks for path, id, and perspective references.\n canonicalDocs:\n - slug: groq-introduction\n reason: \"Core GROQ syntax and query language reference\"\n - slug: how-queries-work\n reason: \"Query execution model and best practices\"\n\n # When true, the pipeline auto-generates an additional rubric that\n # checks whether the LLM's response actually used the provided docs.\n docCoverage: true\n\n # Path to a gold-standard implementation, relative to canonical/.\n # The grader uses this as a reference when scoring code correctness.\n referenceSolution: canonical/example-groq-blog-listing.ts\n\n # vars.task \u2014 the implementation prompt given to the LLM.\n # Write this as if you're asking a developer to build the feature.\n # Be specific about requirements so the grader can evaluate clearly.\n #\n # vars.docs \u2014 leave empty (\"\"). The pipeline fills this in:\n # \u2022 Gold variant: injected with canonical doc content\n # \u2022 Baseline variant: left empty (tests model knowledge alone)\n vars:\n task: |\n Create a Next.js page component that lists blog posts from Sanity\n using GROQ. The page should display the title, slug, and published\n date for each post, sorted by most recent first. Use the Sanity\n client to fetch data.\n docs: \"\"\n\n # Grading assertions \u2014 how the LLM's response is scored.\n #\n # \"llm-rubric\" assertions use a grader LLM to score against criteria.\n # The \"template\" references a rubric from config/rubrics.yaml.\n # The \"criteria\" are task-specific bullets injected into the template.\n #\n # Available templates:\n # task-completion \u2014 did the LLM implement the feature? (weight: 0.50)\n # code-correctness \u2014 is the code idiomatic and correct? 
(weight: 0.25)\n #\n # You can also use value-based assertions:\n # - type: contains\n # value: \"client.fetch\"\n # - type: contains-any\n # value: [\"createClient\", \"sanityClient\"]\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Uses the groq tagged template literal\"\n - \"Fetches blog posts with title, slug, and publishedAt fields\"\n - \"Orders results by publishedAt in descending order\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses createClient from @sanity/client or next-sanity\"\n - \"Exports a valid Next.js page component\"\n\n # Baseline variant configuration.\n # enabled \u2014 set to false to skip this task entirely\n # rubric \u2014 \"
|
|
148
|
+
export declare const exampleGroqBlogListingYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Blog listing with GROQ queries\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# This is a starter template \u2014 edit it for your own documentation.\n# Each task evaluates whether an AI coding agent can implement a feature\n# using your docs as context. Delete this file or replace it entirely.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. 
To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# Full field reference:\n# https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/CONTRIBUTING_TASKS.md\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n# Unique identifier \u2014 lowercase alphanumeric with hyphens.\n# Must be unique across all task files in .ailf/tasks/.\n- id: example-groq-blog-listing\n\n # Short human-readable summary. Shown in score tables and reports.\n description: \"Example \u2014 Blog listing with GROQ queries\"\n\n # Feature area this task belongs to. Tasks with the same area are\n # grouped together in score summaries. Use a short kebab-case name.\n featureArea: groq\n\n # Gold-standard documentation articles for this task. The pipeline\n # fetches these from Sanity and injects them into the prompt for\n # baseline evaluation. 
Each entry needs:\n # slug \u2014 the article's URL slug in your docs site\n # reason \u2014 why this doc is relevant (helps with auditing)\n #\n # This example uses slug-based references \u2014 the simplest form.\n # See the other example tasks for path, id, and perspective references.\n canonicalDocs:\n - slug: groq-introduction\n reason: \"Core GROQ syntax and query language reference\"\n - slug: how-queries-work\n reason: \"Query execution model and best practices\"\n\n # When true, the pipeline auto-generates an additional rubric that\n # checks whether the LLM's response actually used the provided docs.\n docCoverage: true\n\n # Path to a gold-standard implementation, relative to canonical/.\n # The grader uses this as a reference when scoring code correctness.\n referenceSolution: canonical/example-groq-blog-listing.ts\n\n # vars.task \u2014 the implementation prompt given to the LLM.\n # Write this as if you're asking a developer to build the feature.\n # Be specific about requirements so the grader can evaluate clearly.\n #\n # vars.docs \u2014 leave empty (\"\"). The pipeline fills this in:\n # \u2022 Gold variant: injected with canonical doc content\n # \u2022 Baseline variant: left empty (tests model knowledge alone)\n vars:\n task: |\n Create a Next.js page component that lists blog posts from Sanity\n using GROQ. The page should display the title, slug, and published\n date for each post, sorted by most recent first. Use the Sanity\n client to fetch data.\n docs: \"\"\n\n # Grading assertions \u2014 how the LLM's response is scored.\n #\n # \"llm-rubric\" assertions use a grader LLM to score against criteria.\n # The \"template\" references a rubric from config/rubrics.yaml.\n # The \"criteria\" are task-specific bullets injected into the template.\n #\n # Available templates:\n # task-completion \u2014 did the LLM implement the feature? (weight: 0.50)\n # code-correctness \u2014 is the code idiomatic and correct? 
(weight: 0.25)\n #\n # You can also use value-based assertions:\n # - type: contains\n # value: \"client.fetch\"\n # - type: contains-any\n # value: [\"createClient\", \"sanityClient\"]\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Uses the groq tagged template literal\"\n - \"Fetches blog posts with title, slug, and publishedAt fields\"\n - \"Orders results by publishedAt in descending order\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses createClient from @sanity/client or next-sanity\"\n - \"Exports a valid Next.js page component\"\n\n # Baseline variant configuration.\n # enabled \u2014 set to false to skip this task entirely\n # rubric \u2014 \"full\" (default), \"abbreviated\" (faster), or \"none\"\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
|
|
149
149
|
/** Parsed task data for example-id-based-ref (JSON-safe) */
|
|
150
150
|
export declare const exampleIdBasedRefData: readonly [{
|
|
151
151
|
readonly id: "example-id-based-ref";
|
|
@@ -176,12 +176,12 @@ export declare const exampleIdBasedRefData: readonly [{
|
|
|
176
176
|
}];
|
|
177
177
|
readonly baseline: {
|
|
178
178
|
readonly enabled: true;
|
|
179
|
-
readonly rubric: "
|
|
179
|
+
readonly rubric: "full";
|
|
180
180
|
};
|
|
181
181
|
readonly status: "draft";
|
|
182
182
|
}];
|
|
183
183
|
/** Raw YAML string for example-id-based-ref (preserves comments) */
|
|
184
|
-
export declare const exampleIdBasedRefYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Document ID-based canonical doc references\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# Demonstrates using `id` to reference canonical documentation by\n# Sanity document `_id`. This is useful for:\n# - Draft documents that don't have a stable slug yet\n# - Programmatic references from imports or migrations\n# - Documents where you know the _id but not the slug\n#\n# The `id` ref type can also carry optional `slug` and `path` fields\n# as human-readable annotations \u2014 these are NOT used for resolution,\n# only for display in logs and reports.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. 
To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n- id: example-id-based-ref\n description: \"Example \u2014 GROQ feature support (ID-based doc references)\"\n\n featureArea: groq\n\n # ID-based canonical doc references.\n #\n # Use the Sanity document _id to reference articles directly.\n # Optional slug/path annotations help humans reading the YAML\n # but are NOT used for resolution \u2014 only the `id` field matters.\n #\n # These IDs reference real articles in the Sanity docs (next dataset):\n # 0ba88f1b... = \"GROQ feature support across Sanity\"\n # 5b9c2863... = \"Custom GROQ functions\"\n canonicalDocs:\n - id: \"0ba88f1b-d1a7-418a-9267-2e343d01886a\"\n slug: groq-feature-support-by-context # annotation only \u2014 not used for resolution\n reason: \"GROQ feature support across different Sanity contexts\"\n - id: \"5b9c2863-ef01-4565-af8e-ee54e081ee74\"\n slug: custom-groq-functions # annotation only \u2014 not used for resolution\n reason: \"Custom GROQ functions and pipelines\"\n\n docCoverage: true\n\n vars:\n task: |\n Explain how GROQ is used across different Sanity contexts.\n Cover the following:\n 1. Which GROQ features are available in each context (API queries,\n webhooks, custom functions, access control)\n 2. How to create and use custom GROQ functions\n 3. 
Any differences in GROQ support between contexts\n Provide examples demonstrating context-specific GROQ patterns.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Explains GROQ availability across different Sanity contexts\"\n - \"Describes custom GROQ function creation and usage\"\n - \"Notes differences in GROQ support between contexts\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"GROQ examples use valid syntax\"\n - \"Custom function examples follow the correct API pattern\"\n\n baseline:\n enabled: true\n rubric:
|
|
184
|
+
export declare const exampleIdBasedRefYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Document ID-based canonical doc references\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# Demonstrates using `id` to reference canonical documentation by\n# Sanity document `_id`. This is useful for:\n# - Draft documents that don't have a stable slug yet\n# - Programmatic references from imports or migrations\n# - Documents where you know the _id but not the slug\n#\n# The `id` ref type can also carry optional `slug` and `path` fields\n# as human-readable annotations \u2014 these are NOT used for resolution,\n# only for display in logs and reports.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. 
To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n- id: example-id-based-ref\n description: \"Example \u2014 GROQ feature support (ID-based doc references)\"\n\n featureArea: groq\n\n # ID-based canonical doc references.\n #\n # Use the Sanity document _id to reference articles directly.\n # Optional slug/path annotations help humans reading the YAML\n # but are NOT used for resolution \u2014 only the `id` field matters.\n #\n # These IDs reference real articles in the Sanity docs (next dataset):\n # 0ba88f1b... = \"GROQ feature support across Sanity\"\n # 5b9c2863... = \"Custom GROQ functions\"\n canonicalDocs:\n - id: \"0ba88f1b-d1a7-418a-9267-2e343d01886a\"\n slug: groq-feature-support-by-context # annotation only \u2014 not used for resolution\n reason: \"GROQ feature support across different Sanity contexts\"\n - id: \"5b9c2863-ef01-4565-af8e-ee54e081ee74\"\n slug: custom-groq-functions # annotation only \u2014 not used for resolution\n reason: \"Custom GROQ functions and pipelines\"\n\n docCoverage: true\n\n vars:\n task: |\n Explain how GROQ is used across different Sanity contexts.\n Cover the following:\n 1. Which GROQ features are available in each context (API queries,\n webhooks, custom functions, access control)\n 2. How to create and use custom GROQ functions\n 3. 
Any differences in GROQ support between contexts\n Provide examples demonstrating context-specific GROQ patterns.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Explains GROQ availability across different Sanity contexts\"\n - \"Describes custom GROQ function creation and usage\"\n - \"Notes differences in GROQ support between contexts\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"GROQ examples use valid syntax\"\n - \"Custom function examples follow the correct API pattern\"\n\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
|
|
185
185
|
/** Parsed task data for example-path-based-ref (JSON-safe) */
|
|
186
186
|
export declare const examplePathBasedRefData: readonly [{
|
|
187
187
|
readonly id: "example-path-based-ref";
|
|
@@ -210,12 +210,12 @@ export declare const examplePathBasedRefData: readonly [{
|
|
|
210
210
|
}];
|
|
211
211
|
readonly baseline: {
|
|
212
212
|
readonly enabled: true;
|
|
213
|
-
readonly rubric: "
|
|
213
|
+
readonly rubric: "full";
|
|
214
214
|
};
|
|
215
215
|
readonly status: "draft";
|
|
216
216
|
}];
|
|
217
217
|
/** Raw YAML string for example-path-based-ref (preserves comments) */
|
|
218
|
-
export declare const examplePathBasedRefYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Path-based canonical doc references\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# Demonstrates using `path` to reference canonical documentation.\n# Paths are the preferred reference type because they uniquely identify\n# an article across sections (unlike slugs, which can collide).\n#\n# Path format:\n# - Simple: \"webhooks\" \u2192 resolves by slug lookup\n# - Sectioned: \"content-lake/webhooks\" \u2192 disambiguates by section + slug\n#\n# This example demonstrates why paths matter: the slug \"documents\"\n# exists in both the \"content-lake\" and \"cli-reference\" sections.\n# Using \"content-lake/documents\" ensures we get the right one.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. 
To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n- id: example-path-based-ref\n description: \"Example \u2014 GROQ mutations (path-based doc references)\"\n\n featureArea: groq\n\n # Path-based canonical doc references.\n #\n # Use \"section/slug\" format to uniquely identify articles:\n # - \"content-lake/mutations-introduction\" \u2192 the mutations article\n # - \"content-lake/documents\" \u2192 the documents article in Content Lake\n # (not the CLI \"documents\" article in cli-reference section)\n #\n # The \"documents\" slug exists in two sections \u2014 this is exactly why\n # path-based references are preferred over slug-based references.\n canonicalDocs:\n - path: content-lake/mutations-introduction\n reason: \"Introduction to document mutations in the Content Lake\"\n - path: content-lake/documents\n reason: \"Document structure and types (Content Lake, not CLI reference)\"\n\n docCoverage: true\n\n vars:\n task: |\n Explain how to create, update, and delete documents in Sanity's\n Content Lake using mutations. Cover:\n 1. The different mutation types (create, createOrReplace, patch, delete)\n 2. Document structure and required fields (_id, _type)\n 3. How to use patch operations to update specific fields\n 4. 
Best practices for mutation patterns\n Provide working code examples using @sanity/client.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Explains create, createOrReplace, patch, and delete mutations\"\n - \"Describes required document fields (_id, _type)\"\n - \"Shows patch operations for field-level updates\"\n - \"Includes practical code examples\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses correct @sanity/client mutation API\"\n - \"Patch operations use valid set/unset/inc syntax\"\n\n baseline:\n enabled: true\n rubric:
|
|
218
|
+
export declare const examplePathBasedRefYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Path-based canonical doc references\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# Demonstrates using `path` to reference canonical documentation.\n# Paths are the preferred reference type because they uniquely identify\n# an article across sections (unlike slugs, which can collide).\n#\n# Path format:\n# - Simple: \"webhooks\" \u2192 resolves by slug lookup\n# - Sectioned: \"content-lake/webhooks\" \u2192 disambiguates by section + slug\n#\n# This example demonstrates why paths matter: the slug \"documents\"\n# exists in both the \"content-lake\" and \"cli-reference\" sections.\n# Using \"content-lake/documents\" ensures we get the right one.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. 
To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n- id: example-path-based-ref\n description: \"Example \u2014 GROQ mutations (path-based doc references)\"\n\n featureArea: groq\n\n # Path-based canonical doc references.\n #\n # Use \"section/slug\" format to uniquely identify articles:\n # - \"content-lake/mutations-introduction\" \u2192 the mutations article\n # - \"content-lake/documents\" \u2192 the documents article in Content Lake\n # (not the CLI \"documents\" article in cli-reference section)\n #\n # The \"documents\" slug exists in two sections \u2014 this is exactly why\n # path-based references are preferred over slug-based references.\n canonicalDocs:\n - path: content-lake/mutations-introduction\n reason: \"Introduction to document mutations in the Content Lake\"\n - path: content-lake/documents\n reason: \"Document structure and types (Content Lake, not CLI reference)\"\n\n docCoverage: true\n\n vars:\n task: |\n Explain how to create, update, and delete documents in Sanity's\n Content Lake using mutations. Cover:\n 1. The different mutation types (create, createOrReplace, patch, delete)\n 2. Document structure and required fields (_id, _type)\n 3. How to use patch operations to update specific fields\n 4. 
Best practices for mutation patterns\n Provide working code examples using @sanity/client.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Explains create, createOrReplace, patch, and delete mutations\"\n - \"Describes required document fields (_id, _type)\"\n - \"Shows patch operations for field-level updates\"\n - \"Includes practical code examples\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses correct @sanity/client mutation API\"\n - \"Patch operations use valid set/unset/inc syntax\"\n\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
|
|
219
219
|
/** Parsed task data for example-perspective-ref (JSON-safe) */
|
|
220
220
|
export declare const examplePerspectiveRefData: readonly [{
|
|
221
221
|
readonly id: "example-perspective-ref";
|
|
@@ -244,12 +244,12 @@ export declare const examplePerspectiveRefData: readonly [{
|
|
|
244
244
|
}];
|
|
245
245
|
readonly baseline: {
|
|
246
246
|
readonly enabled: true;
|
|
247
|
-
readonly rubric: "
|
|
247
|
+
readonly rubric: "full";
|
|
248
248
|
};
|
|
249
249
|
readonly status: "draft";
|
|
250
250
|
}];
|
|
251
251
|
/** Raw YAML string for example-perspective-ref (preserves comments) */
|
|
252
|
-
export declare const examplePerspectiveRefYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Perspective / content release doc references\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# Demonstrates using `perspective` to reference all documentation\n# articles within a content release. This is the key capability for\n# evaluating NEW feature documentation before it's published.\n#\n# How it works:\n# - A perspective ref is one-to-many: the doc fetcher queries the\n# named release and expands it to ALL articles versioned within it.\n# - Downstream consumers see the same flat DocContext[] regardless\n# of how docs were resolved.\n# - When the release is published, the perspective entry becomes a\n# no-op (articles are now in published). Migrate to explicit path\n# or slug refs at your convenience.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. 
To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n- id: example-perspective-ref\n description:\n \"Example \u2014 GROQ features from content release (perspective-based doc\n references)\"\n\n featureArea: groq\n\n # Perspective-based canonical doc reference.\n #\n # The perspective ID references a content release in the Sanity\n # Content Lake. At evaluation time, the doc fetcher auto-discovers\n # all articles versioned in this release and includes them as\n # canonical documentation context.\n #\n # Release rE9TSJvR4 contains:\n # - \"GROQ-powered webhooks\" (webhooks)\n # - \"Query Cheat Sheet - GROQ\" (query-cheat-sheet)\n # - \"GROQ joins\" (groq-joins)\n #\n # You can combine perspective refs with explicit slug/path/id refs\n # to include foundational published docs alongside release content.\n # Here we add groq-data-types as a complementary published reference.\n canonicalDocs:\n - perspective: rE9TSJvR4\n reason: \"All GROQ documentation updates in the test content release\"\n - slug: groq-data-types\n reason: \"GROQ data type reference (published, stable)\"\n\n docCoverage: true\n\n vars:\n task: |\n Using GROQ, demonstrate advanced query patterns including:\n 1. Joining data across document types using references\n 2. Filtering webhook payloads with GROQ projections\n 3. Using the query cheat sheet patterns for common operations\n 4. 
Working with different GROQ data types in filters\n Provide working GROQ query examples for each pattern.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Demonstrates GROQ join syntax for cross-document queries\"\n - \"Shows GROQ filter patterns for webhook configuration\"\n - \"Includes practical query examples from cheat sheet patterns\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"All GROQ queries use valid syntax\"\n - \"Reference joins use correct dereference operator (->)\"\n\n baseline:\n enabled: true\n rubric:
|
|
252
|
+
export declare const examplePerspectiveRefYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Perspective / content release doc references\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# Demonstrates using `perspective` to reference all documentation\n# articles within a content release. This is the key capability for\n# evaluating NEW feature documentation before it's published.\n#\n# How it works:\n# - A perspective ref is one-to-many: the doc fetcher queries the\n# named release and expands it to ALL articles versioned within it.\n# - Downstream consumers see the same flat DocContext[] regardless\n# of how docs were resolved.\n# - When the release is published, the perspective entry becomes a\n# no-op (articles are now in published). Migrate to explicit path\n# or slug refs at your convenience.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. 
To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n- id: example-perspective-ref\n description:\n \"Example \u2014 GROQ features from content release (perspective-based doc\n references)\"\n\n featureArea: groq\n\n # Perspective-based canonical doc reference.\n #\n # The perspective ID references a content release in the Sanity\n # Content Lake. At evaluation time, the doc fetcher auto-discovers\n # all articles versioned in this release and includes them as\n # canonical documentation context.\n #\n # Release rE9TSJvR4 contains:\n # - \"GROQ-powered webhooks\" (webhooks)\n # - \"Query Cheat Sheet - GROQ\" (query-cheat-sheet)\n # - \"GROQ joins\" (groq-joins)\n #\n # You can combine perspective refs with explicit slug/path/id refs\n # to include foundational published docs alongside release content.\n # Here we add groq-data-types as a complementary published reference.\n canonicalDocs:\n - perspective: rE9TSJvR4\n reason: \"All GROQ documentation updates in the test content release\"\n - slug: groq-data-types\n reason: \"GROQ data type reference (published, stable)\"\n\n docCoverage: true\n\n vars:\n task: |\n Using GROQ, demonstrate advanced query patterns including:\n 1. Joining data across document types using references\n 2. Filtering webhook payloads with GROQ projections\n 3. Using the query cheat sheet patterns for common operations\n 4. 
Working with different GROQ data types in filters\n Provide working GROQ query examples for each pattern.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Demonstrates GROQ join syntax for cross-document queries\"\n - \"Shows GROQ filter patterns for webhook configuration\"\n - \"Includes practical query examples from cheat sheet patterns\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"All GROQ queries use valid syntax\"\n - \"Reference joins use correct dereference operator (->)\"\n\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
|
|
253
253
|
/** Parsed task data for example-studio-custom-input (JSON-safe) */
|
|
254
254
|
export declare const exampleStudioCustomInputData: readonly [{
|
|
255
255
|
readonly id: "example-studio-custom-input";
|
|
@@ -279,12 +279,12 @@ export declare const exampleStudioCustomInputData: readonly [{
|
|
|
279
279
|
}];
|
|
280
280
|
readonly baseline: {
|
|
281
281
|
readonly enabled: true;
|
|
282
|
-
readonly rubric: "
|
|
282
|
+
readonly rubric: "full";
|
|
283
283
|
};
|
|
284
284
|
readonly status: "draft";
|
|
285
285
|
}];
|
|
286
286
|
/** Raw YAML string for example-studio-custom-input (preserves comments) */
|
|
287
|
-
export declare const exampleStudioCustomInputYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Custom input component in Sanity Studio\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# This is a starter template \u2014 edit it for your own documentation.\n# Delete this file or replace it with your own tasks.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. 
To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n- id: example-studio-custom-input\n description: \"Example \u2014 Custom input component in Sanity Studio\"\n\n featureArea: studio\n\n # Slug-based canonical doc references.\n canonicalDocs:\n - slug: custom-input-widgets\n reason: \"Guide for building custom form inputs in Sanity Studio\"\n - slug: form-components\n reason: \"Form component API and customization patterns\"\n\n docCoverage: true\n referenceSolution: canonical/example-studio-custom-input.ts\n\n vars:\n task: |\n Build a custom string input component for Sanity Studio that shows\n a character count below the input field. The component should accept\n a maxLength option from the field schema and display a warning when\n the text exceeds the limit.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Implements a React component that renders a text input\"\n - \"Displays a live character count\"\n - \"Reads maxLength from schema options\"\n - \"Shows a visual warning when limit is exceeded\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses the Sanity UI library for styling\"\n - \"Calls onChange with patch operations\"\n\n baseline:\n enabled: true\n rubric:
|
|
287
|
+
export declare const exampleStudioCustomInputYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Custom input component in Sanity Studio\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# This is a starter template \u2014 edit it for your own documentation.\n# Delete this file or replace it with your own tasks.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. 
To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n- id: example-studio-custom-input\n description: \"Example \u2014 Custom input component in Sanity Studio\"\n\n featureArea: studio\n\n # Slug-based canonical doc references.\n canonicalDocs:\n - slug: custom-input-widgets\n reason: \"Guide for building custom form inputs in Sanity Studio\"\n - slug: form-components\n reason: \"Form component API and customization patterns\"\n\n docCoverage: true\n referenceSolution: canonical/example-studio-custom-input.ts\n\n vars:\n task: |\n Build a custom string input component for Sanity Studio that shows\n a character count below the input field. The component should accept\n a maxLength option from the field schema and display a warning when\n the text exceeds the limit.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Implements a React component that renders a text input\"\n - \"Displays a live character count\"\n - \"Reads maxLength from schema options\"\n - \"Shows a visual warning when limit is exceeded\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses the Sanity UI library for styling\"\n - \"Calls onChange with patch operations\"\n\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
|
|
288
288
|
/** All task example data as a flat array (JSON-safe) */
|
|
289
289
|
export declare const allTaskData: readonly unknown[];
|
|
290
290
|
/** Map of task ID (filename stem) → raw YAML string (preserves comments) */
|
|
@@ -185,13 +185,13 @@ export const exampleGroqBlogListingData = [
|
|
|
185
185
|
],
|
|
186
186
|
"baseline": {
|
|
187
187
|
"enabled": true,
|
|
188
|
-
"rubric": "
|
|
188
|
+
"rubric": "full"
|
|
189
189
|
},
|
|
190
190
|
"status": "draft"
|
|
191
191
|
}
|
|
192
192
|
];
|
|
193
193
|
/** Raw YAML string for example-groq-blog-listing (preserves comments) */
|
|
194
|
-
export const exampleGroqBlogListingYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Blog listing with GROQ queries\n# ──────────────────────────────────────────────────────────────────────\n#\n# This is a starter template — edit it for your own documentation.\n# Each task evaluates whether an AI coding agent can implement a feature\n# using your docs as context. Delete this file or replace it entirely.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# Full field reference:\n# https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/CONTRIBUTING_TASKS.md\n# ──────────────────────────────────────────────────────────────────────\n\n# Unique identifier — lowercase alphanumeric with hyphens.\n# Must be unique across all task files in .ailf/tasks/.\n- id: example-groq-blog-listing\n\n # Short human-readable summary. Shown in score tables and reports.\n description: \"Example — Blog listing with GROQ queries\"\n\n # Feature area this task belongs to. Tasks with the same area are\n # grouped together in score summaries. Use a short kebab-case name.\n featureArea: groq\n\n # Gold-standard documentation articles for this task. The pipeline\n # fetches these from Sanity and injects them into the prompt for\n # baseline evaluation. 
Each entry needs:\n # slug — the article's URL slug in your docs site\n # reason — why this doc is relevant (helps with auditing)\n #\n # This example uses slug-based references — the simplest form.\n # See the other example tasks for path, id, and perspective references.\n canonicalDocs:\n - slug: groq-introduction\n reason: \"Core GROQ syntax and query language reference\"\n - slug: how-queries-work\n reason: \"Query execution model and best practices\"\n\n # When true, the pipeline auto-generates an additional rubric that\n # checks whether the LLM's response actually used the provided docs.\n docCoverage: true\n\n # Path to a gold-standard implementation, relative to canonical/.\n # The grader uses this as a reference when scoring code correctness.\n referenceSolution: canonical/example-groq-blog-listing.ts\n\n # vars.task — the implementation prompt given to the LLM.\n # Write this as if you're asking a developer to build the feature.\n # Be specific about requirements so the grader can evaluate clearly.\n #\n # vars.docs — leave empty (\"\"). The pipeline fills this in:\n # • Gold variant: injected with canonical doc content\n # • Baseline variant: left empty (tests model knowledge alone)\n vars:\n task: |\n Create a Next.js page component that lists blog posts from Sanity\n using GROQ. The page should display the title, slug, and published\n date for each post, sorted by most recent first. Use the Sanity\n client to fetch data.\n docs: \"\"\n\n # Grading assertions — how the LLM's response is scored.\n #\n # \"llm-rubric\" assertions use a grader LLM to score against criteria.\n # The \"template\" references a rubric from config/rubrics.yaml.\n # The \"criteria\" are task-specific bullets injected into the template.\n #\n # Available templates:\n # task-completion — did the LLM implement the feature? (weight: 0.50)\n # code-correctness — is the code idiomatic and correct? 
(weight: 0.25)\n #\n # You can also use value-based assertions:\n # - type: contains\n # value: \"client.fetch\"\n # - type: contains-any\n # value: [\"createClient\", \"sanityClient\"]\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Uses the groq tagged template literal\"\n - \"Fetches blog posts with title, slug, and publishedAt fields\"\n - \"Orders results by publishedAt in descending order\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses createClient from @sanity/client or next-sanity\"\n - \"Exports a valid Next.js page component\"\n\n # Baseline variant configuration.\n # enabled — set to false to skip this task entirely\n # rubric — \"
|
|
194
|
+
export const exampleGroqBlogListingYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Blog listing with GROQ queries\n# ──────────────────────────────────────────────────────────────────────\n#\n# This is a starter template — edit it for your own documentation.\n# Each task evaluates whether an AI coding agent can implement a feature\n# using your docs as context. Delete this file or replace it entirely.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# Full field reference:\n# https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/CONTRIBUTING_TASKS.md\n# ──────────────────────────────────────────────────────────────────────\n\n# Unique identifier — lowercase alphanumeric with hyphens.\n# Must be unique across all task files in .ailf/tasks/.\n- id: example-groq-blog-listing\n\n # Short human-readable summary. Shown in score tables and reports.\n description: \"Example — Blog listing with GROQ queries\"\n\n # Feature area this task belongs to. Tasks with the same area are\n # grouped together in score summaries. Use a short kebab-case name.\n featureArea: groq\n\n # Gold-standard documentation articles for this task. The pipeline\n # fetches these from Sanity and injects them into the prompt for\n # baseline evaluation. 
Each entry needs:\n # slug — the article's URL slug in your docs site\n # reason — why this doc is relevant (helps with auditing)\n #\n # This example uses slug-based references — the simplest form.\n # See the other example tasks for path, id, and perspective references.\n canonicalDocs:\n - slug: groq-introduction\n reason: \"Core GROQ syntax and query language reference\"\n - slug: how-queries-work\n reason: \"Query execution model and best practices\"\n\n # When true, the pipeline auto-generates an additional rubric that\n # checks whether the LLM's response actually used the provided docs.\n docCoverage: true\n\n # Path to a gold-standard implementation, relative to canonical/.\n # The grader uses this as a reference when scoring code correctness.\n referenceSolution: canonical/example-groq-blog-listing.ts\n\n # vars.task — the implementation prompt given to the LLM.\n # Write this as if you're asking a developer to build the feature.\n # Be specific about requirements so the grader can evaluate clearly.\n #\n # vars.docs — leave empty (\"\"). The pipeline fills this in:\n # • Gold variant: injected with canonical doc content\n # • Baseline variant: left empty (tests model knowledge alone)\n vars:\n task: |\n Create a Next.js page component that lists blog posts from Sanity\n using GROQ. The page should display the title, slug, and published\n date for each post, sorted by most recent first. Use the Sanity\n client to fetch data.\n docs: \"\"\n\n # Grading assertions — how the LLM's response is scored.\n #\n # \"llm-rubric\" assertions use a grader LLM to score against criteria.\n # The \"template\" references a rubric from config/rubrics.yaml.\n # The \"criteria\" are task-specific bullets injected into the template.\n #\n # Available templates:\n # task-completion — did the LLM implement the feature? (weight: 0.50)\n # code-correctness — is the code idiomatic and correct? 
(weight: 0.25)\n #\n # You can also use value-based assertions:\n # - type: contains\n # value: \"client.fetch\"\n # - type: contains-any\n # value: [\"createClient\", \"sanityClient\"]\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Uses the groq tagged template literal\"\n - \"Fetches blog posts with title, slug, and publishedAt fields\"\n - \"Orders results by publishedAt in descending order\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses createClient from @sanity/client or next-sanity\"\n - \"Exports a valid Next.js page component\"\n\n # Baseline variant configuration.\n # enabled — set to false to skip this task entirely\n # rubric — \"full\" (default), \"abbreviated\" (faster), or \"none\"\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
|
|
195
195
|
/** Parsed task data for example-id-based-ref (JSON-safe) */
|
|
196
196
|
export const exampleIdBasedRefData = [
|
|
197
197
|
{
|
|
@@ -236,13 +236,13 @@ export const exampleIdBasedRefData = [
|
|
|
236
236
|
],
|
|
237
237
|
"baseline": {
|
|
238
238
|
"enabled": true,
|
|
239
|
-
"rubric": "
|
|
239
|
+
"rubric": "full"
|
|
240
240
|
},
|
|
241
241
|
"status": "draft"
|
|
242
242
|
}
|
|
243
243
|
];
|
|
244
244
|
/** Raw YAML string for example-id-based-ref (preserves comments) */
|
|
245
|
-
export const exampleIdBasedRefYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Document ID-based canonical doc references\n# ──────────────────────────────────────────────────────────────────────\n#\n# Demonstrates using `id` to reference canonical documentation by\n# Sanity document `_id`. This is useful for:\n# - Draft documents that don't have a stable slug yet\n# - Programmatic references from imports or migrations\n# - Documents where you know the _id but not the slug\n#\n# The `id` ref type can also carry optional `slug` and `path` fields\n# as human-readable annotations — these are NOT used for resolution,\n# only for display in logs and reports.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-id-based-ref\n description: \"Example — GROQ feature support (ID-based doc references)\"\n\n featureArea: groq\n\n # ID-based canonical doc references.\n #\n # Use the Sanity document _id to reference articles directly.\n # Optional slug/path annotations help humans reading the YAML\n # but are NOT used for resolution — only the `id` field matters.\n #\n # These IDs reference real articles in the Sanity docs (next dataset):\n # 0ba88f1b... = \"GROQ feature support across Sanity\"\n # 5b9c2863... 
= \"Custom GROQ functions\"\n canonicalDocs:\n - id: \"0ba88f1b-d1a7-418a-9267-2e343d01886a\"\n slug: groq-feature-support-by-context # annotation only — not used for resolution\n reason: \"GROQ feature support across different Sanity contexts\"\n - id: \"5b9c2863-ef01-4565-af8e-ee54e081ee74\"\n slug: custom-groq-functions # annotation only — not used for resolution\n reason: \"Custom GROQ functions and pipelines\"\n\n docCoverage: true\n\n vars:\n task: |\n Explain how GROQ is used across different Sanity contexts.\n Cover the following:\n 1. Which GROQ features are available in each context (API queries,\n webhooks, custom functions, access control)\n 2. How to create and use custom GROQ functions\n 3. Any differences in GROQ support between contexts\n Provide examples demonstrating context-specific GROQ patterns.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Explains GROQ availability across different Sanity contexts\"\n - \"Describes custom GROQ function creation and usage\"\n - \"Notes differences in GROQ support between contexts\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"GROQ examples use valid syntax\"\n - \"Custom function examples follow the correct API pattern\"\n\n baseline:\n enabled: true\n rubric:
|
|
245
|
+
export const exampleIdBasedRefYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Document ID-based canonical doc references\n# ──────────────────────────────────────────────────────────────────────\n#\n# Demonstrates using `id` to reference canonical documentation by\n# Sanity document `_id`. This is useful for:\n# - Draft documents that don't have a stable slug yet\n# - Programmatic references from imports or migrations\n# - Documents where you know the _id but not the slug\n#\n# The `id` ref type can also carry optional `slug` and `path` fields\n# as human-readable annotations — these are NOT used for resolution,\n# only for display in logs and reports.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-id-based-ref\n description: \"Example — GROQ feature support (ID-based doc references)\"\n\n featureArea: groq\n\n # ID-based canonical doc references.\n #\n # Use the Sanity document _id to reference articles directly.\n # Optional slug/path annotations help humans reading the YAML\n # but are NOT used for resolution — only the `id` field matters.\n #\n # These IDs reference real articles in the Sanity docs (next dataset):\n # 0ba88f1b... = \"GROQ feature support across Sanity\"\n # 5b9c2863... 
= \"Custom GROQ functions\"\n canonicalDocs:\n - id: \"0ba88f1b-d1a7-418a-9267-2e343d01886a\"\n slug: groq-feature-support-by-context # annotation only — not used for resolution\n reason: \"GROQ feature support across different Sanity contexts\"\n - id: \"5b9c2863-ef01-4565-af8e-ee54e081ee74\"\n slug: custom-groq-functions # annotation only — not used for resolution\n reason: \"Custom GROQ functions and pipelines\"\n\n docCoverage: true\n\n vars:\n task: |\n Explain how GROQ is used across different Sanity contexts.\n Cover the following:\n 1. Which GROQ features are available in each context (API queries,\n webhooks, custom functions, access control)\n 2. How to create and use custom GROQ functions\n 3. Any differences in GROQ support between contexts\n Provide examples demonstrating context-specific GROQ patterns.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Explains GROQ availability across different Sanity contexts\"\n - \"Describes custom GROQ function creation and usage\"\n - \"Notes differences in GROQ support between contexts\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"GROQ examples use valid syntax\"\n - \"Custom function examples follow the correct API pattern\"\n\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
|
|
246
246
|
/** Parsed task data for example-path-based-ref (JSON-safe) */
|
|
247
247
|
export const examplePathBasedRefData = [
|
|
248
248
|
{
|
|
@@ -286,13 +286,13 @@ export const examplePathBasedRefData = [
|
|
|
286
286
|
],
|
|
287
287
|
"baseline": {
|
|
288
288
|
"enabled": true,
|
|
289
|
-
"rubric": "
|
|
289
|
+
"rubric": "full"
|
|
290
290
|
},
|
|
291
291
|
"status": "draft"
|
|
292
292
|
}
|
|
293
293
|
];
|
|
294
294
|
/** Raw YAML string for example-path-based-ref (preserves comments) */
|
|
295
|
-
export const examplePathBasedRefYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Path-based canonical doc references\n# ──────────────────────────────────────────────────────────────────────\n#\n# Demonstrates using `path` to reference canonical documentation.\n# Paths are the preferred reference type because they uniquely identify\n# an article across sections (unlike slugs, which can collide).\n#\n# Path format:\n# - Simple: \"webhooks\" → resolves by slug lookup\n# - Sectioned: \"content-lake/webhooks\" → disambiguates by section + slug\n#\n# This example demonstrates why paths matter: the slug \"documents\"\n# exists in both the \"content-lake\" and \"cli-reference\" sections.\n# Using \"content-lake/documents\" ensures we get the right one.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-path-based-ref\n description: \"Example — GROQ mutations (path-based doc references)\"\n\n featureArea: groq\n\n # Path-based canonical doc references.\n #\n # Use \"section/slug\" format to uniquely identify articles:\n # - \"content-lake/mutations-introduction\" → the mutations article\n # - \"content-lake/documents\" → the documents article in Content Lake\n # (not the CLI \"documents\" article in cli-reference section)\n #\n # The \"documents\" slug exists in two sections — this is exactly why\n # path-based references are preferred over slug-based references.\n canonicalDocs:\n - path: content-lake/mutations-introduction\n reason: \"Introduction to document mutations in the Content Lake\"\n - path: content-lake/documents\n reason: \"Document structure and types (Content Lake, not CLI reference)\"\n\n docCoverage: true\n\n 
vars:\n task: |\n Explain how to create, update, and delete documents in Sanity's\n Content Lake using mutations. Cover:\n 1. The different mutation types (create, createOrReplace, patch, delete)\n 2. Document structure and required fields (_id, _type)\n 3. How to use patch operations to update specific fields\n 4. Best practices for mutation patterns\n Provide working code examples using @sanity/client.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Explains create, createOrReplace, patch, and delete mutations\"\n - \"Describes required document fields (_id, _type)\"\n - \"Shows patch operations for field-level updates\"\n - \"Includes practical code examples\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses correct @sanity/client mutation API\"\n - \"Patch operations use valid set/unset/inc syntax\"\n\n baseline:\n enabled: true\n rubric:
|
|
295
|
+
export const examplePathBasedRefYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Path-based canonical doc references\n# ──────────────────────────────────────────────────────────────────────\n#\n# Demonstrates using `path` to reference canonical documentation.\n# Paths are the preferred reference type because they uniquely identify\n# an article across sections (unlike slugs, which can collide).\n#\n# Path format:\n# - Simple: \"webhooks\" → resolves by slug lookup\n# - Sectioned: \"content-lake/webhooks\" → disambiguates by section + slug\n#\n# This example demonstrates why paths matter: the slug \"documents\"\n# exists in both the \"content-lake\" and \"cli-reference\" sections.\n# Using \"content-lake/documents\" ensures we get the right one.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-path-based-ref\n description: \"Example — GROQ mutations (path-based doc references)\"\n\n featureArea: groq\n\n # Path-based canonical doc references.\n #\n # Use \"section/slug\" format to uniquely identify articles:\n # - \"content-lake/mutations-introduction\" → the mutations article\n # - \"content-lake/documents\" → the documents article in Content Lake\n # (not the CLI \"documents\" article in cli-reference section)\n #\n # The \"documents\" slug exists in two sections — this is exactly why\n # path-based references are preferred over slug-based references.\n canonicalDocs:\n - path: content-lake/mutations-introduction\n reason: \"Introduction to document mutations in the Content Lake\"\n - path: content-lake/documents\n reason: \"Document structure and types (Content Lake, not CLI reference)\"\n\n docCoverage: true\n\n 
vars:\n task: |\n Explain how to create, update, and delete documents in Sanity's\n Content Lake using mutations. Cover:\n 1. The different mutation types (create, createOrReplace, patch, delete)\n 2. Document structure and required fields (_id, _type)\n 3. How to use patch operations to update specific fields\n 4. Best practices for mutation patterns\n Provide working code examples using @sanity/client.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Explains create, createOrReplace, patch, and delete mutations\"\n - \"Describes required document fields (_id, _type)\"\n - \"Shows patch operations for field-level updates\"\n - \"Includes practical code examples\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses correct @sanity/client mutation API\"\n - \"Patch operations use valid set/unset/inc syntax\"\n\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
|
|
296
296
|
/** Parsed task data for example-perspective-ref (JSON-safe) */
|
|
297
297
|
export const examplePerspectiveRefData = [
|
|
298
298
|
{
|
|
@@ -335,13 +335,13 @@ export const examplePerspectiveRefData = [
|
|
|
335
335
|
],
|
|
336
336
|
"baseline": {
|
|
337
337
|
"enabled": true,
|
|
338
|
-
"rubric": "
|
|
338
|
+
"rubric": "full"
|
|
339
339
|
},
|
|
340
340
|
"status": "draft"
|
|
341
341
|
}
|
|
342
342
|
];
|
|
343
343
|
/** Raw YAML string for example-perspective-ref (preserves comments) */
|
|
344
|
-
export const examplePerspectiveRefYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Perspective / content release doc references\n# ──────────────────────────────────────────────────────────────────────\n#\n# Demonstrates using `perspective` to reference all documentation\n# articles within a content release. This is the key capability for\n# evaluating NEW feature documentation before it's published.\n#\n# How it works:\n# - A perspective ref is one-to-many: the doc fetcher queries the\n# named release and expands it to ALL articles versioned within it.\n# - Downstream consumers see the same flat DocContext[] regardless\n# of how docs were resolved.\n# - When the release is published, the perspective entry becomes a\n# no-op (articles are now in published). Migrate to explicit path\n# or slug refs at your convenience.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-perspective-ref\n description:\n \"Example — GROQ features from content release (perspective-based doc\n references)\"\n\n featureArea: groq\n\n # Perspective-based canonical doc reference.\n #\n # The perspective ID references a content release in the Sanity\n # Content Lake. 
At evaluation time, the doc fetcher auto-discovers\n # all articles versioned in this release and includes them as\n # canonical documentation context.\n #\n # Release rE9TSJvR4 contains:\n # - \"GROQ-powered webhooks\" (webhooks)\n # - \"Query Cheat Sheet - GROQ\" (query-cheat-sheet)\n # - \"GROQ joins\" (groq-joins)\n #\n # You can combine perspective refs with explicit slug/path/id refs\n # to include foundational published docs alongside release content.\n # Here we add groq-data-types as a complementary published reference.\n canonicalDocs:\n - perspective: rE9TSJvR4\n reason: \"All GROQ documentation updates in the test content release\"\n - slug: groq-data-types\n reason: \"GROQ data type reference (published, stable)\"\n\n docCoverage: true\n\n vars:\n task: |\n Using GROQ, demonstrate advanced query patterns including:\n 1. Joining data across document types using references\n 2. Filtering webhook payloads with GROQ projections\n 3. Using the query cheat sheet patterns for common operations\n 4. Working with different GROQ data types in filters\n Provide working GROQ query examples for each pattern.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Demonstrates GROQ join syntax for cross-document queries\"\n - \"Shows GROQ filter patterns for webhook configuration\"\n - \"Includes practical query examples from cheat sheet patterns\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"All GROQ queries use valid syntax\"\n - \"Reference joins use correct dereference operator (->)\"\n\n baseline:\n enabled: true\n rubric:
|
|
344
|
+
export const examplePerspectiveRefYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Perspective / content release doc references\n# ──────────────────────────────────────────────────────────────────────\n#\n# Demonstrates using `perspective` to reference all documentation\n# articles within a content release. This is the key capability for\n# evaluating NEW feature documentation before it's published.\n#\n# How it works:\n# - A perspective ref is one-to-many: the doc fetcher queries the\n# named release and expands it to ALL articles versioned within it.\n# - Downstream consumers see the same flat DocContext[] regardless\n# of how docs were resolved.\n# - When the release is published, the perspective entry becomes a\n# no-op (articles are now in published). Migrate to explicit path\n# or slug refs at your convenience.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-perspective-ref\n description:\n \"Example — GROQ features from content release (perspective-based doc\n references)\"\n\n featureArea: groq\n\n # Perspective-based canonical doc reference.\n #\n # The perspective ID references a content release in the Sanity\n # Content Lake. 
At evaluation time, the doc fetcher auto-discovers\n # all articles versioned in this release and includes them as\n # canonical documentation context.\n #\n # Release rE9TSJvR4 contains:\n # - \"GROQ-powered webhooks\" (webhooks)\n # - \"Query Cheat Sheet - GROQ\" (query-cheat-sheet)\n # - \"GROQ joins\" (groq-joins)\n #\n # You can combine perspective refs with explicit slug/path/id refs\n # to include foundational published docs alongside release content.\n # Here we add groq-data-types as a complementary published reference.\n canonicalDocs:\n - perspective: rE9TSJvR4\n reason: \"All GROQ documentation updates in the test content release\"\n - slug: groq-data-types\n reason: \"GROQ data type reference (published, stable)\"\n\n docCoverage: true\n\n vars:\n task: |\n Using GROQ, demonstrate advanced query patterns including:\n 1. Joining data across document types using references\n 2. Filtering webhook payloads with GROQ projections\n 3. Using the query cheat sheet patterns for common operations\n 4. Working with different GROQ data types in filters\n Provide working GROQ query examples for each pattern.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Demonstrates GROQ join syntax for cross-document queries\"\n - \"Shows GROQ filter patterns for webhook configuration\"\n - \"Includes practical query examples from cheat sheet patterns\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"All GROQ queries use valid syntax\"\n - \"Reference joins use correct dereference operator (->)\"\n\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
|
|
345
345
|
/** Parsed task data for example-studio-custom-input (JSON-safe) */
|
|
346
346
|
export const exampleStudioCustomInputData = [
|
|
347
347
|
{
|
|
@@ -386,13 +386,13 @@ export const exampleStudioCustomInputData = [
|
|
|
386
386
|
],
|
|
387
387
|
"baseline": {
|
|
388
388
|
"enabled": true,
|
|
389
|
-
"rubric": "
|
|
389
|
+
"rubric": "full"
|
|
390
390
|
},
|
|
391
391
|
"status": "draft"
|
|
392
392
|
}
|
|
393
393
|
];
|
|
394
394
|
/** Raw YAML string for example-studio-custom-input (preserves comments) */
|
|
395
|
-
export const exampleStudioCustomInputYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Custom input component in Sanity Studio\n# ──────────────────────────────────────────────────────────────────────\n#\n# This is a starter template — edit it for your own documentation.\n# Delete this file or replace it with your own tasks.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-studio-custom-input\n description: \"Example — Custom input component in Sanity Studio\"\n\n featureArea: studio\n\n # Slug-based canonical doc references.\n canonicalDocs:\n - slug: custom-input-widgets\n reason: \"Guide for building custom form inputs in Sanity Studio\"\n - slug: form-components\n reason: \"Form component API and customization patterns\"\n\n docCoverage: true\n referenceSolution: canonical/example-studio-custom-input.ts\n\n vars:\n task: |\n Build a custom string input component for Sanity Studio that shows\n a character count below the input field. The component should accept\n a maxLength option from the field schema and display a warning when\n the text exceeds the limit.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Implements a React component that renders a text input\"\n - \"Displays a live character count\"\n - \"Reads maxLength from schema options\"\n - \"Shows a visual warning when limit is exceeded\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses the Sanity UI library for styling\"\n - \"Calls onChange with patch operations\"\n\n baseline:\n enabled: true\n rubric:
|
|
395
|
+
export const exampleStudioCustomInputYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Custom input component in Sanity Studio\n# ──────────────────────────────────────────────────────────────────────\n#\n# This is a starter template — edit it for your own documentation.\n# Delete this file or replace it with your own tasks.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-studio-custom-input\n description: \"Example — Custom input component in Sanity Studio\"\n\n featureArea: studio\n\n # Slug-based canonical doc references.\n canonicalDocs:\n - slug: custom-input-widgets\n reason: \"Guide for building custom form inputs in Sanity Studio\"\n - slug: form-components\n reason: \"Form component API and customization patterns\"\n\n docCoverage: true\n referenceSolution: canonical/example-studio-custom-input.ts\n\n vars:\n task: |\n Build a custom string input component for Sanity Studio that shows\n a character count below the input field. 
The component should accept\n a maxLength option from the field schema and display a warning when\n the text exceeds the limit.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Implements a React component that renders a text input\"\n - \"Displays a live character count\"\n - \"Reads maxLength from schema options\"\n - \"Shows a visual warning when limit is exceeded\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses the Sanity UI library for styling\"\n - \"Calls onChange with patch operations\"\n\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
|
|
396
396
|
// ---------------------------------------------------------------------------
|
|
397
397
|
// Aggregate task exports
|
|
398
398
|
// ---------------------------------------------------------------------------
|
|
@@ -30,7 +30,7 @@ export type AssertionDefinition = TemplatedAssertion | ValueAssertion;
|
|
|
30
30
|
export interface BaselineConfig {
|
|
31
31
|
/** Whether to generate a baseline variant. Default: true */
|
|
32
32
|
enabled?: boolean;
|
|
33
|
-
/** Rubric mode for baseline. Default: "
|
|
33
|
+
/** Rubric mode for baseline. Default: "full" */
|
|
34
34
|
rubric?: "abbreviated" | "full" | "none";
|
|
35
35
|
}
|
|
36
36
|
/**
|
|
@@ -25,21 +25,37 @@ export declare const RubricTemplateSchema: z.ZodObject<{
|
|
|
25
25
|
}, z.core.$strip>;
|
|
26
26
|
/** Inferred TypeScript type for a rubric template. */
|
|
27
27
|
export type RubricTemplate = z.infer<typeof RubricTemplateSchema>;
|
|
28
|
+
/**
|
|
29
|
+
* A named weight profile — maps dimension names to weights (must sum to 1.0).
|
|
30
|
+
* Each profile is a self-contained scoring formula used for a specific
|
|
31
|
+
* (mode, variant) pair.
|
|
32
|
+
*/
|
|
33
|
+
declare const WeightProfileSchema: z.ZodRecord<z.ZodString, z.ZodNumber>;
|
|
34
|
+
/** Inferred type for a single weight profile. */
|
|
35
|
+
export type WeightProfile = z.infer<typeof WeightProfileSchema>;
|
|
28
36
|
/**
|
|
29
37
|
* Schema for the full config/rubrics.yaml config file.
|
|
30
38
|
*
|
|
31
|
-
* Each dimension is scored on a uniform 0–100 scale.
|
|
32
|
-
*
|
|
39
|
+
* Each dimension is scored on a uniform 0–100 scale. Named scoring profiles
|
|
40
|
+
* define how dimensions are combined into composite scores. Mode-profile
|
|
41
|
+
* bindings declare which profile to use for each (mode, variant) pair.
|
|
42
|
+
*
|
|
43
|
+
* Supports both the new `profiles` format and the legacy flat `weights`
|
|
44
|
+
* format for backward compatibility.
|
|
45
|
+
*
|
|
46
|
+
* @see docs/design-docs/named-scoring-profiles.md
|
|
33
47
|
*/
|
|
34
48
|
export declare const RubricConfigSchema: z.ZodObject<{
|
|
35
49
|
footer: z.ZodString;
|
|
50
|
+
"mode-profiles": z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodString>>>;
|
|
51
|
+
profiles: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodNumber>>>;
|
|
36
52
|
templates: z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
37
53
|
criteria_label: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
38
54
|
dimension: z.ZodOptional<z.ZodString>;
|
|
39
55
|
header: z.ZodString;
|
|
40
56
|
scale: z.ZodArray<z.ZodString>;
|
|
41
57
|
}, z.core.$strip>>;
|
|
42
|
-
weights: z.ZodRecord<z.ZodString, z.ZodNumber
|
|
58
|
+
weights: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
|
|
43
59
|
}, z.core.$strip>;
|
|
44
60
|
/** Inferred TypeScript type for the rubrics config. */
|
|
45
61
|
export type RubricConfig = z.infer<typeof RubricConfigSchema>;
|
|
@@ -31,23 +31,54 @@ export const RubricTemplateSchema = z.object({
|
|
|
31
31
|
.array(z.string().min(1))
|
|
32
32
|
.min(1, "scale must have at least one entry"),
|
|
33
33
|
});
|
|
34
|
+
/**
|
|
35
|
+
* A named weight profile — maps dimension names to weights (must sum to 1.0).
|
|
36
|
+
* Each profile is a self-contained scoring formula used for a specific
|
|
37
|
+
* (mode, variant) pair.
|
|
38
|
+
*/
|
|
39
|
+
const WeightProfileSchema = z
|
|
40
|
+
.record(z.string(), z.number().min(0).max(1))
|
|
41
|
+
.refine((w) => {
|
|
42
|
+
const sum = Object.values(w).reduce((s, v) => s + v, 0);
|
|
43
|
+
return Math.abs(sum - 1.0) < 0.001;
|
|
44
|
+
}, { message: "profile weights must sum to 1.0" });
|
|
45
|
+
/**
|
|
46
|
+
* Mode-to-profile bindings — maps (mode, variant) pairs to profile names.
|
|
47
|
+
* Example: { baseline: { gold: "default", baseline: "output-only" } }
|
|
48
|
+
*/
|
|
49
|
+
const ModeProfilesSchema = z.record(z.string(), z.record(z.string(), z.string()));
|
|
34
50
|
/**
|
|
35
51
|
* Schema for the full config/rubrics.yaml config file.
|
|
36
52
|
*
|
|
37
|
-
* Each dimension is scored on a uniform 0–100 scale.
|
|
38
|
-
*
|
|
53
|
+
* Each dimension is scored on a uniform 0–100 scale. Named scoring profiles
|
|
54
|
+
* define how dimensions are combined into composite scores. Mode-profile
|
|
55
|
+
* bindings declare which profile to use for each (mode, variant) pair.
|
|
56
|
+
*
|
|
57
|
+
* Supports both the new `profiles` format and the legacy flat `weights`
|
|
58
|
+
* format for backward compatibility.
|
|
59
|
+
*
|
|
60
|
+
* @see docs/design-docs/named-scoring-profiles.md
|
|
39
61
|
*/
|
|
40
|
-
export const RubricConfigSchema = z
|
|
62
|
+
export const RubricConfigSchema = z
|
|
63
|
+
.object({
|
|
41
64
|
footer: z.string().min(1, "footer must be a non-empty string"),
|
|
65
|
+
"mode-profiles": ModeProfilesSchema.optional(),
|
|
66
|
+
profiles: z
|
|
67
|
+
.record(z.string(), WeightProfileSchema)
|
|
68
|
+
.refine((p) => "default" in p, {
|
|
69
|
+
message: "profiles must include a 'default' profile",
|
|
70
|
+
})
|
|
71
|
+
.optional(),
|
|
42
72
|
templates: z
|
|
43
73
|
.record(z.string(), RubricTemplateSchema)
|
|
44
74
|
.refine((t) => Object.keys(t).length > 0, {
|
|
45
75
|
message: "templates must have at least one entry",
|
|
46
76
|
}),
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
77
|
+
// Legacy: flat weight map. Treated as a single profile named "default".
|
|
78
|
+
weights: WeightProfileSchema.optional(),
|
|
79
|
+
})
|
|
80
|
+
.refine((c) => c.profiles !== undefined || c.weights !== undefined, {
|
|
81
|
+
message: "rubrics.yaml must have either 'profiles' or 'weights'",
|
|
51
82
|
});
|
|
52
83
|
// ---------------------------------------------------------------------------
|
|
53
84
|
// Feature registry schema — validates config/features.yaml (Phase 3c)
|
|
@@ -25,7 +25,19 @@ export declare function detectFeatureArea(description: string): string;
|
|
|
25
25
|
/**
|
|
26
26
|
* Extract a numeric score (0–100) from a grading component result.
|
|
27
27
|
*
|
|
28
|
-
* Tries
|
|
28
|
+
* Tries (in order):
|
|
29
|
+
* 1. JSON-parsed reason (grader's native 0–100 scale — most reliable)
|
|
30
|
+
* 2. Direct score field (may be Promptfoo-normalized to 0–1)
|
|
31
|
+
* 3. Bare number in reason text
|
|
32
|
+
*
|
|
33
|
+
* Promptfoo's `llm-rubric` assertion normalizes `component.score` to
|
|
34
|
+
* the 0–1 range for some providers (notably GPT models, ~50% of the
|
|
35
|
+
* time) while leaving others in the grader's native 0–100 range. The
|
|
36
|
+
`reason` field normally contains the raw grader JSON, so we prefer it.
|
|
37
|
+
*
|
|
38
|
+
* When falling back to `component.score`, values in (0, 1] are rescaled
|
|
39
|
+
* to 0–100 since the rubric explicitly requests a 0–100 score and a
|
|
40
|
+
true nonzero score of 1 or less out of 100 is vanishingly unlikely.
|
|
29
41
|
*/
|
|
30
42
|
export declare function parseRubricScore(component: ComponentResult): number;
|
|
31
43
|
/**
|
|
@@ -85,14 +85,22 @@ export function detectFeatureArea(description) {
|
|
|
85
85
|
/**
|
|
86
86
|
* Extract a numeric score (0–100) from a grading component result.
|
|
87
87
|
*
|
|
88
|
-
* Tries
|
|
88
|
+
* Tries (in order):
|
|
89
|
+
* 1. JSON-parsed reason (grader's native 0–100 scale — most reliable)
|
|
90
|
+
* 2. Direct score field (may be Promptfoo-normalized to 0–1)
|
|
91
|
+
* 3. Bare number in reason text
|
|
92
|
+
*
|
|
93
|
+
* Promptfoo's `llm-rubric` assertion normalizes `component.score` to
|
|
94
|
+
* the 0–1 range for some providers (notably GPT models, ~50% of the
|
|
95
|
+
* time) while leaving others in the grader's native 0–100 range. The
|
|
96
|
+
`reason` field normally contains the raw grader JSON, so we prefer it.
|
|
97
|
+
*
|
|
98
|
+
* When falling back to `component.score`, values in (0, 1] are rescaled
|
|
99
|
+
* to 0–100 since the rubric explicitly requests a 0–100 score and a
|
|
100
|
+
true nonzero score of 1 or less out of 100 is vanishingly unlikely.
|
|
89
101
|
*/
|
|
90
102
|
export function parseRubricScore(component) {
|
|
91
|
-
//
|
|
92
|
-
if (typeof component.score === "number") {
|
|
93
|
-
return component.score;
|
|
94
|
-
}
|
|
95
|
-
// Try to extract from reason (LLM rubric returns JSON)
|
|
103
|
+
// 1. Prefer reason-extracted score — always in the grader's native 0–100 scale
|
|
96
104
|
if (component.reason) {
|
|
97
105
|
try {
|
|
98
106
|
const parsed = JSON.parse(component.reason);
|
|
@@ -102,15 +110,38 @@ export function parseRubricScore(component) {
|
|
|
102
110
|
}
|
|
103
111
|
}
|
|
104
112
|
catch {
|
|
105
|
-
//
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
113
|
+
// Not valid JSON — fall through to direct score or bare number extraction
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
// 2. Direct score field — may be Promptfoo-normalized to 0–1
|
|
117
|
+
if (typeof component.score === "number") {
|
|
118
|
+
return normalizeScore(component.score);
|
|
119
|
+
}
|
|
120
|
+
// 3. Last resort: bare number in reason text
|
|
121
|
+
if (component.reason) {
|
|
122
|
+
const match = component.reason.match(/(\d+)/);
|
|
123
|
+
if (match) {
|
|
124
|
+
return parseInt(match[1], 10);
|
|
110
125
|
}
|
|
111
126
|
}
|
|
112
127
|
return 0;
|
|
113
128
|
}
|
|
129
|
+
/**
|
|
130
|
+
* Normalize a score that may be in either the 0–1 or 0–100 range.
|
|
131
|
+
*
|
|
132
|
+
* Promptfoo's `llm-rubric` assertion inconsistently normalizes
|
|
133
|
+
* `component.score` to 0–1 for some providers. Since the rubric
|
|
134
|
+
* explicitly requests scores on a 0–100 scale:
|
|
135
|
+
* - Scores in (0, 1] are rescaled to 0–100 (e.g., 0.95 → 95)
|
|
136
|
+
* - Score of exactly 0 stays 0 (genuine zero)
|
|
137
|
+
* - Scores > 1 are already on the 0–100 scale
|
|
138
|
+
*/
|
|
139
|
+
function normalizeScore(score) {
|
|
140
|
+
if (score > 0 && score <= 1) {
|
|
141
|
+
return Math.round(score * 100);
|
|
142
|
+
}
|
|
143
|
+
return score;
|
|
144
|
+
}
|
|
114
145
|
// ---------------------------------------------------------------------------
|
|
115
146
|
// URL metadata extraction
|
|
116
147
|
// ---------------------------------------------------------------------------
|
|
@@ -131,7 +131,7 @@ function mapToTaskDefinition(raw) {
|
|
|
131
131
|
// assertion types that aren't in the curated list). These bypass template
|
|
132
132
|
// resolution and flow directly into the expanded Promptfoo test case as
|
|
133
133
|
// value-based assertions. In baseline mode, buildBaselineAsserts() with
|
|
134
|
-
// "
|
|
134
|
+
// "full" (the default) copies all assertions as-is, so rawAssert
|
|
135
135
|
// entries run in both variants (TODO confirm in buildBaselineAsserts) — consistent with how regular
|
|
136
136
|
// value-based assertions like `contains` or `regex` behave.
|
|
137
137
|
const rawAssertions = (raw.rawAssert ?? [])
|
|
@@ -64,7 +64,7 @@ export interface RawTestResult {
|
|
|
64
64
|
* @returns Record keyed by model ID, or null if only one model was used
|
|
65
65
|
* (per-model breakdown is redundant when there's only one model).
|
|
66
66
|
*/
|
|
67
|
-
export declare function calculateScoresPerModel(resultsPath: string,
|
|
67
|
+
export declare function calculateScoresPerModel(resultsPath: string, goldProfile: Record<string, number>, baselineProfile: Record<string, number>): null | PerModelEntry[];
|
|
68
68
|
/**
|
|
69
69
|
* Extract grader judgments (reason text + scores) from evaluation results.
|
|
70
70
|
*
|
|
@@ -82,7 +82,7 @@ export declare function extractGraderJudgments(resultsPath: string): GraderJudgm
|
|
|
82
82
|
*
|
|
83
83
|
* Returns a record keyed by feature area with the composite actual score.
|
|
84
84
|
*/
|
|
85
|
-
export declare function scoreAgenticResults(resultsPath: string,
|
|
85
|
+
export declare function scoreAgenticResults(resultsPath: string, profile: Record<string, number>): Record<string, ActualScoreEntry>;
|
|
86
86
|
/**
|
|
87
87
|
* Score agentic results broken down by model.
|
|
88
88
|
*
|
|
@@ -90,7 +90,7 @@ export declare function scoreAgenticResults(resultsPath: string, weights: Record
|
|
|
90
90
|
* producing a map of model → feature → ActualScoreEntry.
|
|
91
91
|
* Used to enrich the per-model breakdown with actual scores in full mode.
|
|
92
92
|
*/
|
|
93
|
-
export declare function scoreAgenticResultsPerModel(resultsPath: string,
|
|
93
|
+
export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>): Record<string, Record<string, ActualScoreEntry>>;
|
|
94
94
|
/** Options for the calculate-scores main() function. */
|
|
95
95
|
export interface CalculateScoresOptions {
|
|
96
96
|
/** Allowed origins for source isolation reporting */
|
|
@@ -8,8 +8,11 @@
|
|
|
8
8
|
* Code Correctness (0–100) — Is the code idiomatic and correct?
|
|
9
9
|
* Doc Coverage (0–100) — Did docs provide the needed info?
|
|
10
10
|
*
|
|
11
|
-
* Dimensions are combined into a weighted composite (0–100) using
|
|
12
|
-
* from config/rubrics.yaml
|
|
11
|
+
* Dimensions are combined into a weighted composite (0–100) using named
|
|
12
|
+
* scoring profiles from config/rubrics.yaml. Gold (with-docs) entries use
|
|
13
|
+
* the "default" profile; baseline (without-docs) entries use "output-only"
|
|
14
|
+
* which excludes doc-coverage (undefined without docs).
|
|
15
|
+
* See docs/design-docs/named-scoring-profiles.md.
|
|
13
16
|
*
|
|
14
17
|
* Additionally compares with-docs vs without-docs scores to calculate
|
|
15
18
|
* the "Doc Lift" — how much documentation helps vs parametric knowledge.
|
|
@@ -30,6 +33,7 @@ import { calculateCost } from "../agent-observer/pricing.js";
|
|
|
30
33
|
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
31
34
|
import { checkResultsExist } from "./checks.js";
|
|
32
35
|
import { loadRubricTemplates } from "./expand-tasks.js";
|
|
36
|
+
import { resolveProfile } from "./profile-resolution.js";
|
|
33
37
|
import { loadSource } from "../sources.js";
|
|
34
38
|
import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
|
|
35
39
|
import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
|
|
@@ -46,7 +50,7 @@ export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, par
|
|
|
46
50
|
* @returns Record keyed by model ID, or null if only one model was used
|
|
47
51
|
* (per-model breakdown is redundant when there's only one model).
|
|
48
52
|
*/
|
|
49
|
-
export function calculateScoresPerModel(resultsPath,
|
|
53
|
+
export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfile) {
|
|
50
54
|
const results = readAndNormalizeResults(resultsPath);
|
|
51
55
|
// Group results by provider
|
|
52
56
|
const byModel = {};
|
|
@@ -66,7 +70,7 @@ export function calculateScoresPerModel(resultsPath, weights) {
|
|
|
66
70
|
}
|
|
67
71
|
const perModel = [];
|
|
68
72
|
for (const [modelId, { label, results: modelResults }] of Object.entries(byModel)) {
|
|
69
|
-
const scores = scoreResults(modelResults,
|
|
73
|
+
const scores = scoreResults(modelResults, goldProfile, baselineProfile, modelId);
|
|
70
74
|
const totalTests = scores.reduce((s, sc) => s + sc.testCount, 0);
|
|
71
75
|
const totalCost = scores.reduce((s, sc) => s + sc.totalCost, 0);
|
|
72
76
|
const avgScore = scores.length > 0
|
|
@@ -318,9 +322,9 @@ function buildSourceVerification(root, source, verificationCtx) {
|
|
|
318
322
|
* Calculate overall scores (all models combined).
|
|
319
323
|
* This is the original scoring path — backward compatible.
|
|
320
324
|
*/
|
|
321
|
-
function calculateScores(resultsPath,
|
|
325
|
+
function calculateScores(resultsPath, goldProfile, baselineProfile) {
|
|
322
326
|
const results = readAndNormalizeResults(resultsPath);
|
|
323
|
-
return scoreResults(results,
|
|
327
|
+
return scoreResults(results, goldProfile, baselineProfile);
|
|
324
328
|
}
|
|
325
329
|
/**
|
|
326
330
|
* Extracts agent behavior summary from a test result's metadata.
|
|
@@ -489,19 +493,73 @@ function readAndNormalizeResults(resultsPath, log) {
|
|
|
489
493
|
}
|
|
490
494
|
return valid;
|
|
491
495
|
}
|
|
496
|
+
/**
 * Sum rubric scores per dimension, and sum cost, over a list of test results.
 *
 * Only component results whose assertion type is "llm-rubric" are counted.
 * Each one is scored with parseRubricScore() and bucketed under the key that
 * classifyRubric() returns; components with a falsy classification are
 * dropped. No dimension names are hard-coded here.
 *
 * @param tests Test results; each is read for `cost` and
 *   `gradingResult.componentResults`.
 * @returns `{ dimensions, totalCost }` — per-dimension score sums (not yet
 *   averaged) and the summed `cost` of every test.
 */
function accumulateDimensions(tests) {
    const sums = {};
    let costSum = 0;
    for (const entry of tests) {
        costSum += entry.cost;
        for (const component of entry.gradingResult.componentResults) {
            const isRubric = component.assertion?.type === "llm-rubric";
            if (isRubric) {
                const points = parseRubricScore(component);
                const dimension = classifyRubric(component);
                if (dimension) {
                    sums[dimension] = (sums[dimension] ?? 0) + points;
                }
            }
        }
    }
    return { dimensions: sums, totalCost: costSum };
}
|
|
517
|
+
/**
 * Turn accumulated per-dimension score sums into per-dimension averages.
 *
 * @param accumulated Object with a `dimensions` map (dimension → summed score),
 *   as produced by accumulateDimensions().
 * @param count Number of test results the sums were accumulated over.
 *   Anything that is not a positive number (0, negative, NaN) is treated
 *   as 1 so the result never contains NaN or Infinity.
 * @returns A new dimension → average score map; `accumulated` is not mutated.
 */
function averageDimensions(accumulated, count) {
    // Guard the divisor here so call sites cannot forget their `|| 1` fallback.
    const divisor = count > 0 ? count : 1;
    return Object.fromEntries(Object.entries(accumulated.dimensions).map(([dim, total]) => [dim, total / divisor]));
}
|
|
528
|
+
/**
 * Compute a weighted composite score from dimension averages and a profile.
 *
 * Only dimensions named in the profile contribute; averages for dimensions
 * the profile does not mention are ignored (e.g. doc-coverage on baseline).
 * A profile dimension with no recorded average contributes 0.
 *
 * The profile is keyed by kebab-case dimension names as written in
 * rubrics.yaml ("task-completion"), while `dimensionAverages` is keyed by the
 * camelCase names produced by classifyRubric ("taskCompletion"). Each profile
 * key is converted to camelCase generically, so dimensions added to
 * rubrics.yaml work without touching this function. If no camelCase entry
 * exists, the raw profile key is tried as a fallback.
 *
 * @param dimensionAverages Map of dimension name → average score.
 * @param profile Map of profile key → weight.
 * @returns Sum of average × weight over the profile's dimensions. Weights are
 *   used as given and are not normalized.
 */
function weightedComposite(dimensionAverages, profile) {
    let total = 0;
    for (const [profileKey, weight] of Object.entries(profile)) {
        // "code-correctness" → "codeCorrectness"; keys without hyphens are unchanged.
        const dimKey = profileKey.replace(/-([a-z0-9])/g, (_match, ch) => ch.toUpperCase());
        const average = dimensionAverages[dimKey] ?? dimensionAverages[profileKey] ?? 0;
        total += average * weight;
    }
    return total;
}
|
|
492
552
|
/**
|
|
493
553
|
* Core scoring logic: takes a pre-filtered array of TestResult and produces
|
|
494
554
|
* FeatureScore[] grouped by feature area. This is the shared implementation
|
|
495
555
|
* used by both the overall scoring and per-model scoring paths.
|
|
496
556
|
*
|
|
497
557
|
* @param results Pre-filtered (valid) test results
|
|
498
|
-
* @param
|
|
499
|
-
* @param
|
|
558
|
+
* @param goldProfile Weight profile for gold (with-docs) entries
|
|
559
|
+
* @param baselineProfile Weight profile for baseline (without-docs) entries
|
|
560
|
+
* @param modelId Optional model identifier to tag each FeatureScore
|
|
500
561
|
*/
|
|
501
|
-
function scoreResults(results,
|
|
502
|
-
const wTask = weights["task-completion"] ?? 0.5;
|
|
503
|
-
const wCode = weights["code-correctness"] ?? 0.25;
|
|
504
|
-
const wDoc = weights["doc-coverage"] ?? 0.25;
|
|
562
|
+
function scoreResults(results, goldProfile, baselineProfile, modelId) {
|
|
505
563
|
// Group by feature + docs/no-docs
|
|
506
564
|
const byFeature = {};
|
|
507
565
|
for (const result of results) {
|
|
@@ -519,65 +577,35 @@ function scoreResults(results, weights, modelId) {
|
|
|
519
577
|
}
|
|
520
578
|
const scores = [];
|
|
521
579
|
for (const [feature, data] of Object.entries(byFeature)) {
|
|
522
|
-
// --- With docs ---
|
|
523
|
-
|
|
524
|
-
let
|
|
525
|
-
let totalDoc = 0;
|
|
526
|
-
let featureCost = 0;
|
|
580
|
+
// --- With docs (gold / ceiling) ---
|
|
581
|
+
const goldDims = accumulateDimensions(data.withDocs);
|
|
582
|
+
let featureCost = goldDims.totalCost;
|
|
527
583
|
const countWithDocs = data.withDocs.length || 1;
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
else if (kind === "codeCorrectness") {
|
|
540
|
-
totalCode += score;
|
|
541
|
-
}
|
|
542
|
-
else if (kind === "docCoverage") {
|
|
543
|
-
totalDoc += score;
|
|
544
|
-
}
|
|
545
|
-
}
|
|
546
|
-
}
|
|
547
|
-
// Per-dimension averages (each 0–100)
|
|
548
|
-
const avgTask = totalTask / countWithDocs;
|
|
549
|
-
const avgCode = totalCode / countWithDocs;
|
|
550
|
-
const avgDoc = totalDoc / countWithDocs;
|
|
551
|
-
// Weighted composite (0–100)
|
|
552
|
-
const withDocsTotal = avgTask * wTask + avgCode * wCode + avgDoc * wDoc;
|
|
553
|
-
// --- Without docs (baseline) ---
|
|
554
|
-
let baselineTotal = 0;
|
|
555
|
-
let baselineCount = 0;
|
|
556
|
-
for (const test of data.withoutDocs) {
|
|
557
|
-
featureCost += test.cost;
|
|
558
|
-
for (const comp of test.gradingResult.componentResults) {
|
|
559
|
-
if (comp.assertion?.type !== "llm-rubric") {
|
|
560
|
-
continue;
|
|
561
|
-
}
|
|
562
|
-
baselineTotal += parseRubricScore(comp);
|
|
563
|
-
baselineCount++;
|
|
564
|
-
}
|
|
565
|
-
}
|
|
566
|
-
const withoutDocsScore = baselineCount > 0 ? baselineTotal / baselineCount : 0;
|
|
584
|
+
const avgGold = averageDimensions(goldDims, countWithDocs);
|
|
585
|
+
const withDocsTotal = weightedComposite(avgGold, goldProfile);
|
|
586
|
+
// --- Without docs (baseline / floor) ---
|
|
587
|
+
// Uses the baseline profile (e.g. "output-only") which may exclude
|
|
588
|
+
// dimensions like doc-coverage that are undefined without docs.
|
|
589
|
+
// See docs/design-docs/named-scoring-profiles.md.
|
|
590
|
+
const baselineDims = accumulateDimensions(data.withoutDocs);
|
|
591
|
+
featureCost += baselineDims.totalCost;
|
|
592
|
+
const countWithoutDocs = data.withoutDocs.length || 1;
|
|
593
|
+
const avgBaseline = averageDimensions(baselineDims, countWithoutDocs);
|
|
594
|
+
const withoutDocsScore = weightedComposite(avgBaseline, baselineProfile);
|
|
567
595
|
const ceilingScore = Math.round(withDocsTotal);
|
|
568
596
|
const floorScore = Math.round(withoutDocsScore);
|
|
569
597
|
const docLift = ceilingScore - floorScore;
|
|
570
598
|
const featureScore = {
|
|
571
599
|
ceilingScore,
|
|
572
|
-
codeCorrectness: Math.round(
|
|
573
|
-
docCoverage: Math.round(
|
|
600
|
+
codeCorrectness: Math.round(avgGold.codeCorrectness ?? 0),
|
|
601
|
+
docCoverage: Math.round(avgGold.docCoverage ?? 0),
|
|
574
602
|
docLift,
|
|
575
603
|
docQualityGap: 100 - ceilingScore,
|
|
576
604
|
feature,
|
|
577
605
|
floorScore,
|
|
578
606
|
...(modelId && { modelId }),
|
|
579
607
|
negativeDocLift: docLift < 0,
|
|
580
|
-
taskCompletion: Math.round(
|
|
608
|
+
taskCompletion: Math.round(avgGold.taskCompletion ?? 0),
|
|
581
609
|
testCount: data.withDocs.length,
|
|
582
610
|
totalCost: featureCost,
|
|
583
611
|
totalScore: ceilingScore,
|
|
@@ -597,11 +625,8 @@ function scoreResults(results, weights, modelId) {
|
|
|
597
625
|
* Returns a record keyed by feature area with the composite actual score.
|
|
598
626
|
*/
|
|
599
627
|
// ActualScoreEntry — imported from @sanity/ailf-core via pipeline/types.js
|
|
600
|
-
export function scoreAgenticResults(resultsPath,
|
|
628
|
+
export function scoreAgenticResults(resultsPath, profile) {
|
|
601
629
|
const results = readAndNormalizeResults(resultsPath);
|
|
602
|
-
const wTask = weights["task-completion"] ?? 0.5;
|
|
603
|
-
const wCode = weights["code-correctness"] ?? 0.25;
|
|
604
|
-
const wDoc = weights["doc-coverage"] ?? 0.25;
|
|
605
630
|
// Group by feature area
|
|
606
631
|
const byFeature = {};
|
|
607
632
|
for (const result of results) {
|
|
@@ -613,37 +638,17 @@ export function scoreAgenticResults(resultsPath, weights) {
|
|
|
613
638
|
}
|
|
614
639
|
const entries = {};
|
|
615
640
|
for (const [feature, featureResults] of Object.entries(byFeature)) {
|
|
616
|
-
let totalTask = 0;
|
|
617
|
-
let totalCode = 0;
|
|
618
|
-
let totalDoc = 0;
|
|
619
|
-
let featureCost = 0;
|
|
620
641
|
const count = featureResults.length || 1;
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
if (comp.assertion?.type !== "llm-rubric")
|
|
625
|
-
continue;
|
|
626
|
-
const score = parseRubricScore(comp);
|
|
627
|
-
const kind = classifyRubric(comp);
|
|
628
|
-
if (kind === "taskCompletion")
|
|
629
|
-
totalTask += score;
|
|
630
|
-
else if (kind === "codeCorrectness")
|
|
631
|
-
totalCode += score;
|
|
632
|
-
else if (kind === "docCoverage")
|
|
633
|
-
totalDoc += score;
|
|
634
|
-
}
|
|
635
|
-
}
|
|
636
|
-
const avgTask = totalTask / count;
|
|
637
|
-
const avgCode = totalCode / count;
|
|
638
|
-
const avgDoc = totalDoc / count;
|
|
639
|
-
const actualScore = Math.round(avgTask * wTask + avgCode * wCode + avgDoc * wDoc);
|
|
642
|
+
const accumulated = accumulateDimensions(featureResults);
|
|
643
|
+
const avg = averageDimensions(accumulated, count);
|
|
644
|
+
const actualScore = Math.round(weightedComposite(avg, profile));
|
|
640
645
|
entries[feature] = {
|
|
641
646
|
actualScore,
|
|
642
|
-
codeCorrectness: Math.round(
|
|
643
|
-
docCoverage: Math.round(
|
|
644
|
-
taskCompletion: Math.round(
|
|
647
|
+
codeCorrectness: Math.round(avg.codeCorrectness ?? 0),
|
|
648
|
+
docCoverage: Math.round(avg.docCoverage ?? 0),
|
|
649
|
+
taskCompletion: Math.round(avg.taskCompletion ?? 0),
|
|
645
650
|
testCount: featureResults.length,
|
|
646
|
-
totalCost:
|
|
651
|
+
totalCost: accumulated.totalCost,
|
|
647
652
|
};
|
|
648
653
|
}
|
|
649
654
|
return entries;
|
|
@@ -655,11 +660,8 @@ export function scoreAgenticResults(resultsPath, weights) {
|
|
|
655
660
|
* producing a map of model → feature → ActualScoreEntry.
|
|
656
661
|
* Used to enrich the per-model breakdown with actual scores in full mode.
|
|
657
662
|
*/
|
|
658
|
-
export function scoreAgenticResultsPerModel(resultsPath,
|
|
663
|
+
export function scoreAgenticResultsPerModel(resultsPath, profile) {
|
|
659
664
|
const results = readAndNormalizeResults(resultsPath);
|
|
660
|
-
const wTask = weights["task-completion"] ?? 0.5;
|
|
661
|
-
const wCode = weights["code-correctness"] ?? 0.25;
|
|
662
|
-
const wDoc = weights["doc-coverage"] ?? 0.25;
|
|
663
665
|
// Group by model, then feature
|
|
664
666
|
const byModel = {};
|
|
665
667
|
for (const result of results) {
|
|
@@ -675,37 +677,17 @@ export function scoreAgenticResultsPerModel(resultsPath, weights) {
|
|
|
675
677
|
for (const [modelId, features] of Object.entries(byModel)) {
|
|
676
678
|
perModel[modelId] = {};
|
|
677
679
|
for (const [feature, featureResults] of Object.entries(features)) {
|
|
678
|
-
let totalTask = 0;
|
|
679
|
-
let totalCode = 0;
|
|
680
|
-
let totalDoc = 0;
|
|
681
|
-
let featureCost = 0;
|
|
682
680
|
const count = featureResults.length || 1;
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
if (comp.assertion?.type !== "llm-rubric")
|
|
687
|
-
continue;
|
|
688
|
-
const score = parseRubricScore(comp);
|
|
689
|
-
const kind = classifyRubric(comp);
|
|
690
|
-
if (kind === "taskCompletion")
|
|
691
|
-
totalTask += score;
|
|
692
|
-
else if (kind === "codeCorrectness")
|
|
693
|
-
totalCode += score;
|
|
694
|
-
else if (kind === "docCoverage")
|
|
695
|
-
totalDoc += score;
|
|
696
|
-
}
|
|
697
|
-
}
|
|
698
|
-
const avgTask = totalTask / count;
|
|
699
|
-
const avgCode = totalCode / count;
|
|
700
|
-
const avgDoc = totalDoc / count;
|
|
701
|
-
const actualScore = Math.round(avgTask * wTask + avgCode * wCode + avgDoc * wDoc);
|
|
681
|
+
const accumulated = accumulateDimensions(featureResults);
|
|
682
|
+
const avg = averageDimensions(accumulated, count);
|
|
683
|
+
const actualScore = Math.round(weightedComposite(avg, profile));
|
|
702
684
|
perModel[modelId][feature] = {
|
|
703
685
|
actualScore,
|
|
704
|
-
codeCorrectness: Math.round(
|
|
705
|
-
docCoverage: Math.round(
|
|
706
|
-
taskCompletion: Math.round(
|
|
686
|
+
codeCorrectness: Math.round(avg.codeCorrectness ?? 0),
|
|
687
|
+
docCoverage: Math.round(avg.docCoverage ?? 0),
|
|
688
|
+
taskCompletion: Math.round(avg.taskCompletion ?? 0),
|
|
707
689
|
testCount: featureResults.length,
|
|
708
|
-
totalCost:
|
|
690
|
+
totalCost: accumulated.totalCost,
|
|
709
691
|
};
|
|
710
692
|
}
|
|
711
693
|
}
|
|
@@ -760,10 +742,18 @@ export function calculateAndWriteScores(options) {
|
|
|
760
742
|
if (source) {
|
|
761
743
|
log.info(`Source: ${sourceName} (${source.baseUrl})`);
|
|
762
744
|
}
|
|
763
|
-
// Load
|
|
745
|
+
// Load rubric config and resolve scoring profiles per variant.
|
|
746
|
+
// Gold (with-docs) entries use the "default" profile (3 dimensions).
|
|
747
|
+
// Baseline (without-docs) entries use "output-only" (2 dimensions,
|
|
748
|
+
// doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
|
|
764
749
|
const rubricConfig = loadRubricTemplates(ROOT);
|
|
765
|
-
|
|
766
|
-
const
|
|
750
|
+
const goldProfile = resolveProfile("baseline", "gold", rubricConfig);
|
|
751
|
+
const baselineProfileWeights = resolveProfile("baseline", "baseline", rubricConfig);
|
|
752
|
+
log.debug("Loaded scoring profiles", {
|
|
753
|
+
gold: goldProfile,
|
|
754
|
+
baseline: baselineProfileWeights,
|
|
755
|
+
});
|
|
756
|
+
const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights);
|
|
767
757
|
log.debug("Baseline scores calculated", {
|
|
768
758
|
featureCount: baselineScores.length,
|
|
769
759
|
features: baselineScores.map((s) => ({
|
|
@@ -773,7 +763,7 @@ export function calculateAndWriteScores(options) {
|
|
|
773
763
|
docLift: s.docLift,
|
|
774
764
|
})),
|
|
775
765
|
});
|
|
776
|
-
const perModel = calculateScoresPerModel(baselineResultsPath,
|
|
766
|
+
const perModel = calculateScoresPerModel(baselineResultsPath, goldProfile, baselineProfileWeights);
|
|
777
767
|
const urlRefs = aggregateUrlReferences(baselineResultsPath);
|
|
778
768
|
const sourceVerification = buildSourceVerification(ROOT, source, {
|
|
779
769
|
allowedOrigins: options.allowedOrigins,
|
|
@@ -788,7 +778,8 @@ export function calculateAndWriteScores(options) {
|
|
|
788
778
|
let evaluationMode;
|
|
789
779
|
if (mode === "full" && existsSync(agenticResultsPath)) {
|
|
790
780
|
log.info(`\nReading agentic results from: ${agenticResultsPath}`);
|
|
791
|
-
const
|
|
781
|
+
const agenticProfile = resolveProfile("agentic", "gold", rubricConfig);
|
|
782
|
+
const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile);
|
|
792
783
|
log.debug("Agentic scores calculated", {
|
|
793
784
|
featureCount: Object.keys(agenticScores).length,
|
|
794
785
|
features: Object.entries(agenticScores).map(([f, s]) => ({
|
|
@@ -801,7 +792,7 @@ export function calculateAndWriteScores(options) {
|
|
|
801
792
|
evaluationMode = "full";
|
|
802
793
|
// Merge agentic actual scores into the per-model breakdown
|
|
803
794
|
if (perModel) {
|
|
804
|
-
const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath,
|
|
795
|
+
const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile);
|
|
805
796
|
for (const entry of perModel) {
|
|
806
797
|
const modelAgentic = agenticPerModel[entry.modelId];
|
|
807
798
|
if (modelAgentic) {
|
|
@@ -34,11 +34,11 @@
|
|
|
34
34
|
* value: ["client.fetch", "createClient"]
|
|
35
35
|
* baseline:
|
|
36
36
|
* enabled: true
|
|
37
|
-
* rubric:
|
|
37
|
+
* rubric: full
|
|
38
38
|
*
|
|
39
39
|
* Expands to:
|
|
40
40
|
* 1. Gold entry — uses vars.docs as-is, resolves templates, appends doc-coverage
|
|
41
|
-
* 2. Baseline entry — sets docs: "",
|
|
41
|
+
* 2. Baseline entry — sets docs: "", uses full rubric (same assertions as gold)
|
|
42
42
|
*/
|
|
43
43
|
import type { TaskDefinition } from "../_vendor/ailf-core/index.d.ts";
|
|
44
44
|
import type { Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
@@ -72,14 +72,14 @@ export interface LegacyTaskEntry {
|
|
|
72
72
|
}
|
|
73
73
|
/** A single task definition in the new format (input). */
|
|
74
74
|
export interface SingleTaskDefinition {
|
|
75
|
-
/** Grading assertions (applied to gold
|
|
75
|
+
/** Grading assertions (applied to both gold and baseline by default). */
|
|
76
76
|
assert: AssertEntry[];
|
|
77
77
|
/** Baseline generation options. */
|
|
78
78
|
baseline?: {
|
|
79
79
|
/** Whether to generate a baseline variant. Default: true. */
|
|
80
80
|
enabled?: boolean;
|
|
81
81
|
/** Rubric mode: 'full' copies all asserts, 'abbreviated' generates a
|
|
82
|
-
* summary rubric, 'none' omits rubric asserts. Default: '
|
|
82
|
+
* summary rubric, 'none' omits rubric asserts. Default: 'full'. */
|
|
83
83
|
rubric?: "abbreviated" | "full" | "none";
|
|
84
84
|
};
|
|
85
85
|
/** Human-readable description of what this task tests. */
|
|
@@ -34,11 +34,11 @@
|
|
|
34
34
|
* value: ["client.fetch", "createClient"]
|
|
35
35
|
* baseline:
|
|
36
36
|
* enabled: true
|
|
37
|
-
* rubric:
|
|
37
|
+
* rubric: full
|
|
38
38
|
*
|
|
39
39
|
* Expands to:
|
|
40
40
|
* 1. Gold entry — uses vars.docs as-is, resolves templates, appends doc-coverage
|
|
41
|
-
* 2. Baseline entry — sets docs: "",
|
|
41
|
+
* 2. Baseline entry — sets docs: "", uses full rubric (same assertions as gold)
|
|
42
42
|
*/
|
|
43
43
|
import { existsSync, readFileSync, readdirSync } from "fs";
|
|
44
44
|
import { resolve } from "path";
|
|
@@ -181,7 +181,7 @@ export function expandTask(task, rubricConfig, mode = "baseline") {
|
|
|
181
181
|
// Restricted to the 'without-docs' prompt. Unless explicitly disabled.
|
|
182
182
|
const baselineEnabled = task.baseline?.enabled !== false;
|
|
183
183
|
if (baselineEnabled) {
|
|
184
|
-
const rubricMode = task.baseline?.rubric ?? "
|
|
184
|
+
const rubricMode = task.baseline?.rubric ?? "full";
|
|
185
185
|
const baselineAsserts = buildBaselineAsserts(resolvedAsserts, rubricMode);
|
|
186
186
|
entries.push({
|
|
187
187
|
description: `${task.description} (baseline)`,
|
|
@@ -81,7 +81,8 @@ export function extractGradingJudgments(file) {
|
|
|
81
81
|
continue;
|
|
82
82
|
const description = result.testCase?.description ?? "unknown";
|
|
83
83
|
const hasDocs = result.vars?.docs && result.vars.docs.trim().length > 0;
|
|
84
|
-
// Only grade "gold" (with-docs) tests — baseline tests
|
|
84
|
+
// Only grade "gold" (with-docs) tests — baseline tests use the output-only
|
|
85
|
+
// scoring profile and doc-coverage is undefined without docs
|
|
85
86
|
if (!hasDocs)
|
|
86
87
|
continue;
|
|
87
88
|
const area = detectFeatureArea(description);
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/profile-resolution.ts
|
|
3
|
+
*
|
|
4
|
+
* Resolves the correct weight profile for a given (mode, variant) pair.
|
|
5
|
+
* The scoring engine calls this to determine which dimensions and weights
|
|
6
|
+
* apply to each test entry's composite score.
|
|
7
|
+
*
|
|
8
|
+
* Resolution order:
|
|
9
|
+
* 1. Explicit binding: mode-profiles.<mode>.<variant> → profile name
|
|
10
|
+
* 2. Fallback: the "default" profile
|
|
11
|
+
*
|
|
12
|
+
* Supports both the new `profiles` format and the legacy flat `weights`
|
|
13
|
+
* format (treated as a single profile named "default").
|
|
14
|
+
*
|
|
15
|
+
* @see docs/design-docs/named-scoring-profiles.md
|
|
16
|
+
*/
|
|
17
|
+
import type { RubricConfig, WeightProfile } from "../_vendor/ailf-core/index.d.ts";
|
|
18
|
+
/**
|
|
19
|
+
* Resolve all named profiles from a RubricConfig, normalizing the legacy
|
|
20
|
+
* flat `weights` format into a `profiles` map with a single "default" entry.
|
|
21
|
+
*
|
|
22
|
+
* @returns A map of profile names → weight maps. Contains "default" when the legacy `weights` format is used; an existing `profiles` map is returned as-is.
|
|
23
|
+
*/
|
|
24
|
+
export declare function resolveProfiles(config: RubricConfig): Record<string, WeightProfile>;
|
|
25
|
+
/**
|
|
26
|
+
* Resolve the weight profile for a specific (mode, variant) pair.
|
|
27
|
+
*
|
|
28
|
+
* @param mode - Evaluation mode (e.g., "baseline", "agentic", "agent-task")
|
|
29
|
+
* @param variant - Entry variant: "gold" (with docs) or "baseline" (without docs)
|
|
30
|
+
* @param config - Parsed rubrics.yaml config
|
|
31
|
+
* @returns The resolved weight profile (dimension → weight map)
|
|
32
|
+
*
|
|
33
|
+
* @example
|
|
34
|
+
* resolveProfile("baseline", "gold", config) // → default profile
|
|
35
|
+
* resolveProfile("baseline", "baseline", config) // → output-only profile
|
|
36
|
+
* resolveProfile("agentic", "gold", config) // → default profile
|
|
37
|
+
* resolveProfile("unknown-mode", "gold", config) // → default (fallback)
|
|
38
|
+
*/
|
|
39
|
+
export declare function resolveProfile(mode: string, variant: string, config: RubricConfig): WeightProfile;
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/profile-resolution.ts
|
|
3
|
+
*
|
|
4
|
+
* Resolves the correct weight profile for a given (mode, variant) pair.
|
|
5
|
+
* The scoring engine calls this to determine which dimensions and weights
|
|
6
|
+
* apply to each test entry's composite score.
|
|
7
|
+
*
|
|
8
|
+
* Resolution order:
|
|
9
|
+
* 1. Explicit binding: mode-profiles.<mode>.<variant> → profile name
|
|
10
|
+
* 2. Fallback: the "default" profile
|
|
11
|
+
*
|
|
12
|
+
* Supports both the new `profiles` format and the legacy flat `weights`
|
|
13
|
+
* format (treated as a single profile named "default").
|
|
14
|
+
*
|
|
15
|
+
* @see docs/design-docs/named-scoring-profiles.md
|
|
16
|
+
*/
|
|
17
|
+
/**
 * Produce the map of named scoring profiles described by a rubric config.
 *
 * A config carrying `profiles` is returned through unchanged. A legacy config
 * carrying only flat `weights` is wrapped as a single profile named "default".
 *
 * @param config Parsed rubrics.yaml config.
 * @returns Map of profile name → weight map. In the legacy case it has exactly
 *   one entry, "default"; otherwise it is `config.profiles` as-is.
 * @throws Error when the config has neither `profiles` nor `weights`.
 */
export function resolveProfiles(config) {
    const named = config.profiles;
    if (named) {
        return named;
    }
    const legacyWeights = config.weights;
    if (!legacyWeights) {
        // Schema validation is expected to rule this out; fail loudly if it did not.
        throw new Error("rubrics.yaml has neither 'profiles' nor 'weights' — cannot resolve scoring profiles");
    }
    return { default: legacyWeights };
}
|
|
34
|
+
/**
 * Pick the weight profile that applies to one (mode, variant) pair.
 *
 * Lookup order:
 *   1. The profile named by `mode-profiles.<mode>.<variant>` in the config.
 *   2. Otherwise, the profile named "default".
 *
 * @param mode - Evaluation mode (e.g., "baseline", "agentic", "agent-task")
 * @param variant - Entry variant: "gold" (with docs) or "baseline" (without docs)
 * @param config - Parsed rubrics.yaml config
 * @returns The resolved weight profile (dimension → weight map)
 * @throws Error when the binding names a profile that does not exist, or when
 *   there is no binding and no "default" profile.
 *
 * @example
 * resolveProfile("baseline", "gold", config)     // → default profile
 * resolveProfile("baseline", "baseline", config) // → output-only profile
 * resolveProfile("agentic", "gold", config)      // → default profile
 * resolveProfile("unknown-mode", "gold", config) // → default (fallback)
 */
export function resolveProfile(mode, variant, config) {
    const available = resolveProfiles(config);
    const boundName = config["mode-profiles"]?.[mode]?.[variant];
    if (!boundName) {
        // No explicit binding for this pair: use the "default" profile.
        const fallback = available["default"];
        if (fallback) {
            return fallback;
        }
        throw new Error(`No scoring profile found for mode="${mode}" variant="${variant}" ` +
            `and no "default" profile exists. ` +
            `Available profiles: ${Object.keys(available).join(", ")}`);
    }
    const bound = available[boundName];
    if (bound) {
        return bound;
    }
    throw new Error(`mode-profiles.${mode}.${variant} references profile "${boundName}" ` +
        `which does not exist. Available profiles: ${Object.keys(available).join(", ")}`);
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sanity/ailf",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.5.0",
|
|
4
4
|
"private": false,
|
|
5
5
|
"publishConfig": {
|
|
6
6
|
"access": "restricted"
|
|
@@ -41,8 +41,8 @@
|
|
|
41
41
|
"tsx": "^4.19.2",
|
|
42
42
|
"typescript": "^5.7.3",
|
|
43
43
|
"@sanity/ailf-core": "0.1.0",
|
|
44
|
-
"@sanity/ailf-
|
|
45
|
-
"@sanity/ailf-
|
|
44
|
+
"@sanity/ailf-tasks": "0.1.4",
|
|
45
|
+
"@sanity/ailf-shared": "0.1.0"
|
|
46
46
|
},
|
|
47
47
|
"scripts": {
|
|
48
48
|
"build": "tsc && tsx scripts/bundle-workspace-deps.ts",
|