npm - @sanity/ailf - Versions diffs - 3.8.1 → 4.0.0 - Mend

@sanity/ailf 3.8.1 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

package/config/canary-tasks.ts +64 -0
package/config/models.ts +32 -4
package/config/test-budgets.ts +24 -0
package/dist/_vendor/ailf-core/config-helpers.d.ts +26 -1
package/dist/_vendor/ailf-core/config-helpers.js +81 -1
package/dist/_vendor/ailf-core/index.d.ts +1 -1
package/dist/_vendor/ailf-core/index.js +1 -1
package/dist/_vendor/ailf-core/schemas/canary-tasks.d.ts +52 -0
package/dist/_vendor/ailf-core/schemas/canary-tasks.js +46 -0
package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
package/dist/_vendor/ailf-core/schemas/index.js +2 -0
package/dist/_vendor/ailf-core/schemas/test-budgets.d.ts +19 -0
package/dist/_vendor/ailf-core/schemas/test-budgets.js +34 -0
package/dist/_vendor/ailf-shared/canary-drift.d.ts +84 -0
package/dist/_vendor/ailf-shared/canary-drift.js +86 -0
package/dist/_vendor/ailf-shared/index.d.ts +16 -9
package/dist/_vendor/ailf-shared/index.js +13 -9
package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
package/dist/agent-observer/agentic-provider.js +28 -23
package/dist/agent-observer/classifier.js +7 -2
package/dist/agent-observer/proxy.d.ts +88 -3
package/dist/agent-observer/proxy.js +174 -16
package/dist/agent-observer/types.d.ts +23 -5
package/dist/cli-program.js +1 -1
package/dist/commands/baseline.d.ts +3 -1
package/dist/commands/baseline.js +29 -9
package/dist/commands/cache.d.ts +5 -1
package/dist/commands/cache.js +31 -15
package/dist/commands/compare.js +11 -4
package/dist/commands/explain-handler.js +2 -2
package/dist/config/canary-tasks.ts +64 -0
package/dist/config/models.ts +32 -4
package/dist/config/test-budgets.ts +24 -0
package/dist/pipeline/baseline.d.ts +14 -3
package/dist/pipeline/baseline.js +7 -13
package/dist/pipeline/calculate-scores.d.ts +17 -2
package/dist/pipeline/calculate-scores.js +139 -1
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +5 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +25 -2
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +5 -1
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +4 -0
package/dist/pipeline/compiler/promptfoo-compiler.js +23 -0
package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
package/dist/pipeline/compiler/provider-assembler.js +37 -2
package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
package/dist/tasks/knowledge-probe/groq-projections.task.ts +29 -11
package/package.json +2 -1
package/tasks/knowledge-probe/groq-projections.task.ts +29 -11

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -531,6 +531,45 @@ function printAgentBehaviorReport(agentBehavior, log) {
 // ---------------------------------------------------------------------------
 // Grader cost extraction
 // ---------------------------------------------------------------------------
+/**
+ * Determine the cost to record for one Promptfoo test result.
+ *
+ * A positive `r.cost` reported by Promptfoo is returned unchanged. Promptfoo
+ * can leave `cost` at 0 for `openai:responses:` (and occasionally
+ * `openai:chat:`) on newer models because its pricing table can lag the
+ * model launch. In that case, when `response.tokenUsage` is recorded and the
+ * provider id yields a model name, the cost is recomputed from AILF's local
+ * pricing table via `calculateCost` so the per-model rollup row isn't
+ * dropped on the floor. See W0123.
+ *
+ * @param r Raw Promptfoo result; reads `cost`, `response.tokenUsage` and
+ *   `provider.id`.
+ * @returns The Promptfoo cost when positive; otherwise the locally computed
+ *   cost, or the original non-positive value when no fallback is possible.
+ */
+function resolveTestCost(r) {
+    const reported = r.cost ?? 0;
+    if (reported > 0) {
+        return reported;
+    }
+    const usage = r.response?.tokenUsage;
+    // Only look up a model when there are token counts to price.
+    const modelName = usage
+        ? extractModelFromProviderId(r.provider?.id)
+        : undefined;
+    return modelName
+        ? calculateCost(modelName, usage.prompt ?? 0, usage.completion ?? 0)
+        : reported;
+}
+/**
+ * Extract the model name from a Promptfoo provider id. Provider ids are
+ * colon-segmented `<vendor>:<surface>:<model>` (e.g. `openai:responses:gpt-5.4`,
+ * `anthropic:messages:claude-opus-4-6`); the model is the trailing segment.
+ *
+ * Returns undefined for ids that don't carry a model segment: empty ids, ids
+ * without a colon, ids with an empty trailing segment, and ids that embed a
+ * URL (e.g. `file:///abs/provider.cjs`, `agentic:file:///x`, `https://…`).
+ *
+ * @param providerId Provider id string, possibly undefined.
+ * @returns The trailing model segment, or undefined when there is none.
+ */
+function extractModelFromProviderId(providerId) {
+    if (!providerId)
+        return undefined;
+    // URL detection has to happen on the whole id: a URL's scheme separator
+    // is itself a colon, so after split(":") the last segment of
+    // `file:///x` is `///x` and a `startsWith("file://")` test on it can
+    // never match.
+    if (providerId.includes("://"))
+        return undefined;
+    const parts = providerId.split(":");
+    if (parts.length < 2)
+        return undefined;
+    const last = parts[parts.length - 1];
+    // The `http` prefix check is kept so ids rejected before this fix are
+    // still rejected.
+    if (!last || last.startsWith("http")) {
+        return undefined;
+    }
+    return last;
+}
 /**
  * Reads the raw Promptfoo output file and normalizes each result so that
  * `description` is always a top-level field (pulled from `testCase` if needed).
@@ -551,7 +590,7 @@ function readAndNormalizeResults(resultsPath, log) {
     let synthesizedCount = 0;
     for (const r of wrapper.results) {
         const base = {
-            cost: r.cost ?? 0,
+            cost: resolveTestCost(r),
             description: r.testCase?.description ?? "unknown",
             latencyMs: r.latencyMs,
             metadata: r.metadata,
@@ -719,6 +758,55 @@ function extractTaskId(description) {
     return description.trim() || "unknown";
 }
 // ---------------------------------------------------------------------------
+// Knowledge-probe scoring — closed-book recall with no docs context
+// ---------------------------------------------------------------------------
+/**
+ * Score knowledge-probe evaluation results.
+ *
+ * Knowledge-probe mode evaluates parametric recall: the model has no `docs`
+ * var and answers from training-data knowledge alone. The compiler explicitly
+ * deletes `vars.docs`, so every result lands in the without-docs bucket of
+ * the literacy scoring path — collapsing testCount and ceilingScore to zero.
+ *
+ * This branch mirrors the shape of `scoreAgentHarnessResults` but groups by
+ * feature area (KP results carry `__featureArea` from the compiler), and
+ * uses the `knowledge-probe` profile (factual-correctness / completeness /
+ * currency). Literacy-specific fields (ceilingScore, floorScore, docLift,
+ * docQualityGap) are zero — KP has no with-docs/without-docs decomposition.
+ *
+ * @param results Normalized test results; each is grouped by
+ *   `vars.__featureArea`, falling back to `detectFeatureArea(description)`.
+ * @param profile Scoring profile handed to `scoreTestGroup` for every group.
+ * @returns One score entry per feature area, sorted by feature name.
+ */
+export function scoreKnowledgeProbeResults(results, profile) {
+    // A Map rather than a plain object: feature names are arbitrary strings,
+    // and keys such as "constructor" or "__proto__" would resolve to
+    // inherited Object.prototype members instead of a fresh bucket.
+    const byFeature = new Map();
+    for (const result of results) {
+        // Optional chaining lets a result without `vars` use the
+        // description-based fallback instead of throwing.
+        const feature = String(result.vars?.__featureArea || detectFeatureArea(result.description));
+        const bucket = byFeature.get(feature);
+        if (bucket) {
+            bucket.push(result);
+        }
+        else {
+            byFeature.set(feature, [result]);
+        }
+    }
+    const scores = [];
+    for (const [feature, featureResults] of byFeature) {
+        const scored = scoreTestGroup(featureResults, profile, feature);
+        scores.push({
+            assertionPassRate: scored.dimensions.assertionPassRate,
+            ceilingScore: 0,
+            codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
+            dimensions: scored.dimensions,
+            docCoverage: scored.dimensions.docCoverage ?? 0,
+            docLift: 0,
+            docQualityGap: 0,
+            feature,
+            floorScore: 0,
+            groupType: "feature",
+            negativeDocLift: false,
+            taskCompletion: scored.dimensions.taskCompletion ?? 0,
+            testCount: featureResults.length,
+            totalCost: scored.totalCost,
+            totalScore: scored.composite,
+        });
+    }
+    return scores.sort((a, b) => a.feature.localeCompare(b.feature));
+}
+// ---------------------------------------------------------------------------
 // Agentic scoring — all results are "actual" (agent retrieves docs via tools)
 // ---------------------------------------------------------------------------
 /**
@@ -893,6 +981,56 @@ export function calculateAndWriteScores(options) {
         const testSummary = computeTestSummary(baselineResultsPath);
         return { belowCritical: summary.belowCritical, testSummary };
     }
+    // ── Knowledge-probe scoring path ────────────────────────────
+    // Knowledge-probe mode evaluates parametric recall (no docs context).
+    // The KP compiler deletes `vars.docs`, so the literacy path would bucket
+    // every result into `withoutDocs` and collapse testCount + dimensions
+    // to zero. This branch groups by feature area only and uses the
+    // `knowledge-probe` profile (factual-correctness / completeness /
+    // currency). See docs/design-docs/mode-agnostic-scoring.md.
+    if (mode === "knowledge-probe") {
+        const probeProfile = resolveProfile("knowledge-probe", "gold", rubricConfig);
+        log.debug("Knowledge-probe scoring profile", probeProfile);
+        const results = readAndNormalizeResults(baselineResultsPath); // NOTE(review): readAndNormalizeResults is declared as (resultsPath, log) in this file but is called here without `log` — confirm the helper tolerates an undefined logger
+        const scores = scoreKnowledgeProbeResults(results, probeProfile);
+        log.debug("Knowledge-probe scores calculated", {
+            featureCount: scores.length,
+            features: scores.map((s) => ({
+                feature: s.feature,
+                totalScore: s.totalScore,
+                testCount: s.testCount,
+                dimensions: s.dimensions,
+            })),
+        });
+        const urlRefs = aggregateUrlReferences(baselineResultsPath);
+        const sourceVerification = buildSourceVerification(ROOT, source, {
+            allowedOrigins: options.allowedOrigins,
+            mode,
+            searchMode: options.searchMode,
+        });
+        const graderCost = extractGraderCost(baselineResultsPath);
+        const summary = printReport(scores, urlRefs, source, null, // no agent behavior — KP is closed-book
+        graderCost, null, // no per-model breakdown for now
+        null, // no source isolation — KP doesn't fetch sources
+        sourceVerification, "knowledge-probe", log);
+        // Persist
+        const outDir = join(ROOT, "results", "latest");
+        mkdirSync(outDir, { recursive: true });
+        writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
+        log.info("Score summary written to results/latest/score-summary.json");
+        const judgments = extractGraderJudgments(baselineResultsPath);
+        if (judgments.length > 0) { // grader-judgments.json is only written when there is at least one judgment
+            writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
+            log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
+        }
+        const testResults = extractStoredTestResults(baselineResultsPath);
+        if (testResults.length > 0) { // test-results.json is likewise skipped when empty
+            writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
+            log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
+        }
+        const testSummary = computeTestSummary(baselineResultsPath);
+        return { belowCritical: summary.belowCritical, testSummary };
+    }
     // ── Literacy scoring path ───────────────────────────────────
     // Gold (with-docs) entries use the "default" profile (3 dimensions).
     // Baseline (without-docs) entries use "output-only" (2 dimensions,

package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts CHANGED Viewed

@@ -9,6 +9,11 @@ import type { KnowledgeProbeCompileOptions } from "./types.js";
  * Tool-use assertions are rejected (knowledge probes don't use tools).
  * LLM-graded assertions receive the configured grader provider.
  * All other assertions are passed through.
+ *
+ * Templated `llm-rubric` assertions (those with `template` + `criteria`) go
+ * through the shared rubric resolver so the compiled assertion carries
+ * `metadata.dimension` — without this, the scoring engine can't classify
+ * KP grader output and dimension scores collapse to zero (W0128 / DOC-2077).
  */
 export declare function mapKnowledgeProbeAssertion(assertion: {
     type: string;

package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js CHANGED Viewed

@@ -1,12 +1,18 @@
 /**
  * Assertion mapping for knowledge probe evaluations.
  */
+import { resolveTemplatedAssertion } from "../../rubric-resolution.js";
 /**
  * Map a raw knowledge probe assertion to a Promptfoo assertion.
  *
  * Tool-use assertions are rejected (knowledge probes don't use tools).
  * LLM-graded assertions receive the configured grader provider.
  * All other assertions are passed through.
+ *
+ * Templated `llm-rubric` assertions (those with `template` + `criteria`) go
+ * through the shared rubric resolver so the compiled assertion carries
+ * `metadata.dimension` — without this, the scoring engine can't classify
+ * KP grader output and dimension scores collapse to zero (W0128 / DOC-2077).
  */
 export function mapKnowledgeProbeAssertion(assertion, options, warnings) {
     switch (assertion.type) {
@@ -27,9 +33,26 @@ export function mapKnowledgeProbeAssertion(assertion, options, warnings) {
                     ? { weight: assertion.weight }
                     : {}),
             };
-        // LLM-graded assertions — add grader provider
-        case "g-eval":
         case "llm-rubric":
+            // Templated form (template + criteria) → resolve to full rubric text
+            // with dimension metadata attached.
+            if ("template" in assertion && "criteria" in assertion) {
+                return resolveTemplatedAssertion(assertion, options?.rubricConfig, options?.graderProvider, warnings);
+            }
+            // Inline value form — pass through with grader provider, no metadata.
+            // Back-compat for tasks not yet migrated to the templated form.
+            return {
+                type: "llm-rubric",
+                ...("value" in assertion ? { value: assertion.value } : {}),
+                ...(typeof assertion.weight === "number"
+                    ? { weight: assertion.weight }
+                    : {}),
+                ...(options?.graderProvider
+                    ? { provider: options.graderProvider }
+                    : {}),
+            };
+        // Other LLM-graded assertions — add grader provider
+        case "g-eval":
         case "model-graded-closedqa":
         case "model-graded-factuality":
             return {

package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js CHANGED Viewed

@@ -37,7 +37,11 @@ export const handler = {
         if (!("mode" in task) || task.mode !== "knowledge-probe") {
             throw new Error(`Knowledge probe handler received task with mode "${task.mode ?? "undefined"}" — expected "knowledge-probe"`);
         }
-        const result = compileKnowledgeProbeTask(task, { graderProvider: ctx.graderProvider, models: ctx.models });
+        const result = compileKnowledgeProbeTask(task, {
+            graderProvider: ctx.graderProvider,
+            models: ctx.models,
+            rubricConfig: ctx.rubricConfig,
+        });
         return {
             providers: result.providers,
             tests: result.tests,

package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts CHANGED Viewed

@@ -2,6 +2,7 @@
  * Public types for the knowledge-probe mode handler.
  */
 import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
+import type { RubricConfig } from "../../rubric-resolution.js";
 /** Options for compiling a knowledge probe task */
 export interface KnowledgeProbeCompileOptions {
     /** Grader provider for LLM-graded assertions */
@@ -12,6 +13,9 @@ export interface KnowledgeProbeCompileOptions {
         label: string;
         config?: Record<string, unknown>;
     }[];
+    /** Rubric config (templates, weights, profiles) — needed to resolve
+     * templated `llm-rubric` assertions to dimension metadata. */
+    rubricConfig?: RubricConfig;
 }
 /** Result of compiling a single knowledge probe task */
 export interface KnowledgeProbeCompileResult {

package/dist/pipeline/compiler/promptfoo-compiler.js CHANGED Viewed

@@ -11,10 +11,20 @@
  *
  * @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
  */
+import { dirname, resolve as resolvePath } from "node:path";
+import { fileURLToPath } from "node:url";
 import { mapAssertions } from "./assertion-mapper.js";
 import { resolveTaskFixtures } from "./fixture-resolver.js";
 import { LiteracyVariant } from "../normalize-mode.js";
 import { resolveVariables } from "./variable-resolver.js";
+/** Directory containing this module, derived from `import.meta.url`. */
+const __dirname = dirname(fileURLToPath(import.meta.url));
+/**
+ * Absolute filesystem path to the AILF mock Promptfoo provider, computed
+ * once at module load by walking two directories up from this file.
+ * Promptfoo's `file://` provider loader requires an absolute path. See
+ * buildProviders for the env-var gate that swaps real providers for this
+ * mock.
+ */
+const MOCK_PROVIDER_ABSPATH = resolvePath(__dirname, "../../promptfoo-providers/mock-provider.cjs");
 // ---------------------------------------------------------------------------
 // Public API
 // ---------------------------------------------------------------------------
@@ -143,6 +153,19 @@ function buildProviders(models, mode) {
             },
         });
     }
+    // Replay swap — when AILF_REPLAY_LLMS=1 is set, rewrite every provider's
+    // `id` to the file-based AILF mock provider so the Promptfoo subprocess
+    // never makes a live LLM call. We preserve `label` and stash the
+    // original `id` in `config.originalId` so the mock provider can surface
+    // model identity in its output and reports remain interpretable.
+    // See W0110 (M5.1) and docs/design-docs/testing-strategy.md.
+    if (process.env.AILF_REPLAY_LLMS === "1") {
+        return providers.map((p) => ({
+            id: `file://${MOCK_PROVIDER_ABSPATH}`,
+            label: p.label,
+            config: { ...p.config, originalId: p.id },
+        }));
+    }
     return providers;
 }
 /**

package/dist/pipeline/compiler/provider-assembler.d.ts CHANGED Viewed

@@ -6,6 +6,21 @@
  *
  * Separated into its own module so GenerateConfigsStep can import it
  * without pulling in the full legacy generate-configs machinery.
+ *
+ * W0134 — per-mode maxToolRounds
+ *
+ * The agentic naive variant gets a higher round budget than agentic
+ * optimized: naive simulates current real-world agent behavior under
+ * retrieval pressure (it spends rounds on retries when fetches fail) and
+ * benefits from more headroom; optimized bypasses Jina via the .md-direct
+ * branch and rarely needs more than a couple of rounds. Bumping globally
+ * would inflate optimized cost without changing its measured behavior.
+ *
+ * Resolution order (most specific wins):
+ *   1. `model.config.maxToolRounds` — per-model override.
+ *   2. `defaults.modeMaxToolRounds[variant]` — per-variant override.
+ *   3. `defaults.maxToolRounds` — global default.
+ *   4. Hard fallback (5).
  */
 import { type ModelsConfig } from "../../_vendor/ailf-core/index.d.ts";
 import type { ResolvedSourceConfig } from "../../sources.js";
@@ -37,3 +52,11 @@ export interface ModelsAndProviders {
  * the per-variant promptfoo config files.
  */
 export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[]): ModelsAndProviders;
+/**
+ * Resolve `maxToolRounds` for an agentic variant (W0134).
+ *
+ * Most-specific wins: per-model `config.maxToolRounds` > per-variant
+ * `defaults.modeMaxToolRounds[variant]` > global `defaults.maxToolRounds`
+ * > hard fallback (5).
+ *
+ * @param models Loaded models config; its `defaults` supply the per-variant
+ *   and global values.
+ * @param model The model entry whose optional `config.maxToolRounds` takes
+ *   precedence over every default.
+ * @param variant Agentic variant used as the key into
+ *   `defaults.modeMaxToolRounds`.
+ * @returns The resolved round budget.
+ */
+export declare function resolveMaxToolRounds(models: ModelsConfig, model: ModelsConfig["models"][number], variant: "agentic-naive" | "agentic-optimized"): number;

package/dist/pipeline/compiler/provider-assembler.js CHANGED Viewed

@@ -6,6 +6,21 @@
  *
  * Separated into its own module so GenerateConfigsStep can import it
  * without pulling in the full legacy generate-configs machinery.
+ *
+ * W0134 — per-mode maxToolRounds
+ *
+ * The agentic naive variant gets a higher round budget than agentic
+ * optimized: naive simulates current real-world agent behavior under
+ * retrieval pressure (it spends rounds on retries when fetches fail) and
+ * benefits from more headroom; optimized bypasses Jina via the .md-direct
+ * branch and rarely needs more than a couple of rounds. Bumping globally
+ * would inflate optimized cost without changing its measured behavior.
+ *
+ * Resolution order (most specific wins):
+ *   1. `model.config.maxToolRounds` — per-model override.
+ *   2. `defaults.modeMaxToolRounds[variant]` — per-variant override.
+ *   3. `defaults.maxToolRounds` — global default.
+ *   4. Hard fallback (5).
  */
 import { extractModelName, extractProvider, mergeConfig, } from "../../_vendor/ailf-core/index.js";
 import { loadConfigFile } from "./config-loader.js";
@@ -100,7 +115,7 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
             config: {
                 ...mergeConfig(models.defaults, model.config, {
                     agentMode: "naive",
-                    maxToolRounds: models.defaults.maxToolRounds ?? 5,
+                    maxToolRounds: resolveMaxToolRounds(models, model, "agentic-naive"),
                     model: modelName,
                     provider,
                 }),
@@ -120,7 +135,7 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
             config: {
                 ...mergeConfig(models.defaults, model.config, {
                     agentMode: "optimized",
-                    maxToolRounds: models.defaults.maxToolRounds ?? 5,
+                    maxToolRounds: resolveMaxToolRounds(models, model, "agentic-optimized"),
                     model: modelName,
                     provider,
                 }),
@@ -135,6 +150,26 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
     }
     return providers;
 }
+/**
+ * Pick the `maxToolRounds` budget for one agentic variant (W0134).
+ *
+ * Candidates are consulted from most to least specific and the first one
+ * that is a number wins:
+ *   1. `model.config.maxToolRounds` — per-model override.
+ *   2. `models.defaults.modeMaxToolRounds[variant]` — per-variant override.
+ *   3. `models.defaults.maxToolRounds` — global default.
+ * When none is a number the hard fallback of 5 is returned.
+ *
+ * @param models Models config providing `defaults`.
+ * @param model Model entry that may carry its own `config.maxToolRounds`.
+ * @param variant Key into `defaults.modeMaxToolRounds`.
+ * @returns The resolved number of tool rounds.
+ */
+export function resolveMaxToolRounds(models, model, variant) {
+    // Thunks keep the lookups lazy, so `models.defaults` is only touched
+    // when the per-model value is not a number.
+    const lookups = [
+        () => model.config?.maxToolRounds,
+        () => models.defaults.modeMaxToolRounds?.[variant],
+        () => models.defaults.maxToolRounds,
+    ];
+    for (const lookup of lookups) {
+        const candidate = lookup();
+        if (typeof candidate === "number") {
+            return candidate;
+        }
+    }
+    return 5;
+}
 // ---------------------------------------------------------------------------
 // Helpers
 // ---------------------------------------------------------------------------

package/dist/pipeline/mirror-repo-tasks.d.ts CHANGED Viewed

@@ -107,7 +107,7 @@ export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts:
     slugToDocId: Map<string, string>;
 }): {
     baseline?: {
-        rubric?: "abbreviated" | "full" | "none" | undefined;
+        rubric?: "full" | "abbreviated" | "none" | undefined;
         enabled?: boolean | undefined;
     } | undefined;
     _id: string;

package/dist/tasks/knowledge-probe/groq-projections.task.ts CHANGED Viewed

@@ -41,22 +41,40 @@ export default defineTask({
   assertions: [
     { type: "contains", value: "->" },
     { type: "contains", value: "select(" },
+    // Templated rubrics so the compiled assertions carry `metadata.dimension`
+    // and the scoring engine can populate per-dimension scores from the KP
+    // profile (factual-correctness 0.45 / completeness 0.35 / currency 0.20).
     {
       type: "llm-rubric",
-      value:
-        "The response should demonstrate accurate knowledge of GROQ " +
-        "projection syntax with working code examples. Check that the " +
-        "dereference operator, spread syntax, and select() are correctly " +
-        "explained with valid GROQ code.",
-      weight: 0.6,
+      template: "factual-correctness",
+      criteria: [
+        "The dereference operator `->` is correctly explained for following references",
+        "The spread operator `...` is shown in a valid projection example",
+        "`select()` is used with valid syntax for conditional projections",
+        'Computed field names (e.g., `"label": title`) are demonstrated correctly',
+        "Code examples use valid GROQ — no fabricated operators or deprecated syntax",
+      ],
     },
     {
       type: "llm-rubric",
-      value:
-        "Evaluate whether the response reflects current GROQ syntax " +
-        "(post-2023). Check for deprecated patterns or outdated " +
-        "recommendations.",
-      weight: 0.4,
+      template: "completeness",
+      criteria: [
+        "Basic object projection with `{}` is covered",
+        "Nested projections and the spread operator are both addressed",
+        "Computed/aliased field names are demonstrated",
+        "The dereference operator `->` is included with a worked example",
+        "Both inclusive (`[0..5]`) and exclusive (`[0...5]`) array slicing are explained",
+        "Conditional projections via `select()` are covered",
+      ],
+    },
+    {
+      type: "llm-rubric",
+      template: "currency",
+      criteria: [
+        "Examples reflect current GROQ syntax (post-2023) — no deprecated patterns",
+        "Recommendations don't reference removed or legacy query forms",
+        "Modern projection idioms are used (e.g., spread + override)",
+      ],
     },
   ],
 })

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sanity/ailf",
-  "version": "3.8.1",
+  "version": "4.0.0",
   "private": false,
   "publishConfig": {
     "access": "public"
@@ -77,6 +77,7 @@
     "test": "tsx --test src/__tests__/*.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
     "test:e2e": "AILF_E2E=1 tsx --test src/__tests__/e2e/*.e2e.test.ts",
     "test:e2e:adapters": "AILF_E2E=1 tsx --test src/adapters/**/__tests__/*.adapter.test.ts",
+    "test:e2e:api": "AILF_E2E_API=1 tsx --test src/__tests__/api-tier2-tenant-integration.test.ts",
     "test:all": "AILF_E2E=1 tsx --test src/__tests__/*.test.ts src/pipeline/compiler/__tests__/*.test.ts src/__tests__/e2e/*.e2e.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
     "pr-comment": "tsx src/cli.ts pr-comment",
     "coverage-audit": "tsx src/cli.ts report coverage",

package/tasks/knowledge-probe/groq-projections.task.ts CHANGED Viewed

@@ -41,22 +41,40 @@ export default defineTask({
   assertions: [
     { type: "contains", value: "->" },
     { type: "contains", value: "select(" },
+    // Templated rubrics so the compiled assertions carry `metadata.dimension`
+    // and the scoring engine can populate per-dimension scores from the KP
+    // profile (factual-correctness 0.45 / completeness 0.35 / currency 0.20).
     {
       type: "llm-rubric",
-      value:
-        "The response should demonstrate accurate knowledge of GROQ " +
-        "projection syntax with working code examples. Check that the " +
-        "dereference operator, spread syntax, and select() are correctly " +
-        "explained with valid GROQ code.",
-      weight: 0.6,
+      template: "factual-correctness",
+      criteria: [
+        "The dereference operator `->` is correctly explained for following references",
+        "The spread operator `...` is shown in a valid projection example",
+        "`select()` is used with valid syntax for conditional projections",
+        'Computed field names (e.g., `"label": title`) are demonstrated correctly',
+        "Code examples use valid GROQ — no fabricated operators or deprecated syntax",
+      ],
     },
     {
       type: "llm-rubric",
-      value:
-        "Evaluate whether the response reflects current GROQ syntax " +
-        "(post-2023). Check for deprecated patterns or outdated " +
-        "recommendations.",
-      weight: 0.4,
+      template: "completeness",
+      criteria: [
+        "Basic object projection with `{}` is covered",
+        "Nested projections and the spread operator are both addressed",
+        "Computed/aliased field names are demonstrated",
+        "The dereference operator `->` is included with a worked example",
+        "Both inclusive (`[0..5]`) and exclusive (`[0...5]`) array slicing are explained",
+        "Conditional projections via `select()` are covered",
+      ],
+    },
+    {
+      type: "llm-rubric",
+      template: "currency",
+      criteria: [
+        "Examples reflect current GROQ syntax (post-2023) — no deprecated patterns",
+        "Recommendations don't reference removed or legacy query forms",
+        "Modern projection idioms are used (e.g., spread + override)",
+      ],
     },
   ],
 })