npm - @sanity/ailf - Versions diffs - 7.0.1 → 7.1.0 - Mend

@sanity/ailf 7.0.1 → 7.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72) hide show

package/config/rubrics.ts +12 -13
package/dist/_vendor/ailf-core/ports/context.d.ts +45 -3
package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +9 -1
package/dist/_vendor/ailf-core/schemas/branded-string.js +16 -6
package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -0
package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
package/dist/_vendor/ailf-core/schemas/report.d.ts +12 -0
package/dist/_vendor/ailf-core/schemas/report.js +2 -0
package/dist/_vendor/ailf-core/schemas/team.d.ts +22 -0
package/dist/_vendor/ailf-core/schemas/team.js +63 -0
package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +51 -0
package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +17 -0
package/dist/_vendor/ailf-core/types/team.d.ts +65 -0
package/dist/_vendor/ailf-core/types/team.js +1 -0
package/dist/_vendor/ailf-shared/eval-modes.d.ts +2 -0
package/dist/_vendor/ailf-shared/eval-modes.js +5 -0
package/dist/_vendor/ailf-shared/event-types.d.ts +15 -0
package/dist/_vendor/ailf-shared/event-types.js +23 -0
package/dist/_vendor/ailf-shared/generated/help-content.js +2 -2
package/dist/_vendor/ailf-shared/index.d.ts +4 -2
package/dist/_vendor/ailf-shared/index.js +4 -2
package/dist/_vendor/ailf-shared/member-roles.d.ts +16 -0
package/dist/_vendor/ailf-shared/member-roles.js +16 -0
package/dist/_vendor/ailf-shared/owner-teams.d.ts +19 -0
package/dist/_vendor/ailf-shared/owner-teams.js +7 -0
package/dist/_vendor/ailf-shared/run-context.d.ts +8 -1
package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +65 -1
package/dist/adapters/grader-outputs/promptfoo-grader-output.js +35 -0
package/dist/adapters/task-sources/changed-docs-filter.d.ts +12 -0
package/dist/adapters/task-sources/changed-docs-filter.js +30 -0
package/dist/adapters/task-sources/content-lake-task-source.js +2 -1
package/dist/adapters/task-sources/repo-task-source.js +2 -1
package/dist/commands/pipeline-action.d.ts +4 -3
package/dist/commands/pipeline-action.js +7 -5
package/dist/commands/run.js +2 -2
package/dist/config/rubrics.ts +12 -13
package/dist/job-store.d.ts +18 -0
package/dist/job-store.js +34 -0
package/dist/orchestration/build-app-context.js +8 -1
package/dist/orchestration/pipeline-orchestrator.js +46 -1
package/dist/orchestration/steps/compare-step.d.ts +7 -0
package/dist/orchestration/steps/compare-step.js +59 -23
package/dist/orchestration/steps/fetch-docs-step.js +3 -0
package/dist/orchestration/steps/finalize-run-step.js +2 -0
package/dist/orchestration/steps/generate-configs-step.d.ts +32 -1
package/dist/orchestration/steps/generate-configs-step.js +47 -13
package/dist/orchestration/steps/grader-consistency-step.js +11 -0
package/dist/orchestration/steps/publish-report-step.d.ts +12 -1
package/dist/orchestration/steps/publish-report-step.js +19 -3
package/dist/pipeline/cache-hit-restore.d.ts +14 -1
package/dist/pipeline/cache-hit-restore.js +17 -0
package/dist/pipeline/calculate-scores.js +57 -21
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +7 -2
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +13 -4
package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +1 -1
package/dist/pipeline/compiler/provider-assembler.d.ts +15 -1
package/dist/pipeline/compiler/provider-assembler.js +16 -3
package/dist/pipeline/failure-modes.d.ts +20 -10
package/dist/pipeline/failure-modes.js +84 -15
package/dist/pipeline/map-request-to-config.js +2 -0
package/dist/pipeline/normalize-mode.d.ts +1 -1
package/dist/pipeline/normalize-mode.js +2 -0
package/dist/pipeline/run-context.d.ts +16 -1
package/dist/pipeline/run-context.js +12 -1
package/dist/pipeline/validate.d.ts +8 -4
package/dist/pipeline/validate.js +8 -18
package/dist/report-store.d.ts +14 -1
package/dist/report-store.js +32 -0
package/dist/sanity/client.js +2 -2
package/package.json +1 -1

package/config/rubrics.ts CHANGED Viewed

@@ -15,10 +15,6 @@ import { defineRubrics } from "@sanity/ailf-core"
 // template entry below. Source of truth lives in packages/eval/src/grader/;
 // the helper picks the right list by dimension family.
 import { failureModesForDimension } from "../src/grader/index.js"
-// Single source of truth for the wire-format version stamped into the
-// grader-prompt footer (VER-01 D-02). Interpolated below so the
-// announced version cannot drift from the schema's expected value.
-import { graderJudgmentsVersion } from "../src/adapters/grader-outputs/index.js"
 export default defineRubrics({
   templates: {
@@ -242,20 +238,23 @@ export default defineRubrics({
     "agent-harness": { gold: "agent-harness" },
   },
-  // Phase 3 GRAD-05 (Plan 03-01) — structured GraderJudgment JSON sketch.
-  // Documents the target wire format the grader emits. The strict schema's
-  // GRAD-02 additive fields stay optional in this plan; Plan 03-04 flips
-  // them to required and bumps graderJudgmentsVersion to 1.0.0.
+  // W0273 — the footer documents the wire-format subset of GraderJudgment
+  // that the grader LLM actually controls. The pipeline parses this against
+  // GraderEmittedJudgmentSchema and then synthesizes the four pipeline-owned
+  // fields (judgmentId, metadata.{graderModel,graderJudgmentsVersion},
+  // hallucinationCheckedAgainst) to build the storage GraderJudgment.
+  //
+  // See docs/audits/2026-05-22-empty-gap-analysis-regression.md for the
+  // rationale (Phase 3 GRAD-05 made these fields required + .strict(),
+  // and asking the LLM for pipeline-owned values caused 100% parse
+  // failures starting 2026-05-11).
   footer: `Return ONLY a JSON object with this exact shape:
 {
-  "judgmentId": "<string>",
   "score": <number 0-100>,
   "reason": "<explanation, ≤500 chars>",
+  "failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
   "subJudgments": [{ "criterionId": "<id>", "met": <bool>, "evidence": "<≤280 chars>", "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" } }],
   "docCitations": [{ "documentId": "<id>", "slug": "<optional slug>", "role": "supports|contradicts|missing|irrelevant", "hallucinated": <bool> }],
-  "failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
-  "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" },
-  "hallucinationCheckedAgainst": ["<doc id>"],
-  "metadata": { "graderModel": "<string>", "graderJudgmentsVersion": "${graderJudgmentsVersion}" }
+  "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" }
 }`,
 })

package/dist/_vendor/ailf-core/ports/context.d.ts CHANGED Viewed

@@ -11,7 +11,7 @@
  * Fields marked optional are transitional — they will become required
  * as downstream consumers are converted to use them.
  */
-import type { RunClassification, RunExecutorSurface } from "../../ailf-shared/index.d.ts";
+import type { LiteracyVariant, RunClassification, RunExecutorSurface } from "../../ailf-shared/index.d.ts";
 import type { RunId } from "../types/branded-ids.js";
 import type { DebugOptions, EvalMode, PluginRegistry } from "../types/index.js";
 import type { ArtifactWriter } from "./artifact-writer.js";
@@ -42,12 +42,20 @@ export interface ResolvedConfig {
      * `mode: "literacy", variant: "baseline"`. This keeps the pipeline
      * mode-agnostic while preserving literacy's multi-variant behavior.
      *
-     * Values: "baseline" | "agentic" | "observed" | "full" | undefined
      * Undefined means "use the default variant for the mode" (baseline for literacy).
      */
-    variant?: string;
+    variant?: LiteracyVariant;
     /** Debug options */
     debug?: DebugOptions;
+    /**
+     * Filter the evaluated cohort to a subset of the configured model IDs.
+     *
+     * Each entry must match the `id` of a model declared in
+     * `config/models.ts`. Unknown IDs are dropped at the runner with a
+     * structured warning AND surfaced on the job's `error` field so callers
+     * can detect typos — silent strips are not acceptable.
+     */
+    models?: string[];
     /** Feature area filter */
     areas?: string[];
     /** Task ID filter */
@@ -68,6 +76,12 @@ export interface ResolvedConfig {
     compareThreshold?: number;
     /** Comparison baseline path */
     compareBaseline?: string;
+    /**
+     * Comparison baseline expressed as a previously-published
+     * `ailf.report` document id. Takes precedence over `compareBaseline`
+     * when both are set.
+     */
+    compareBaselineReportId?: string;
     /** Whether gap analysis is enabled */
     gapAnalysisEnabled: boolean;
     /** Whether publishing is enabled */
@@ -323,6 +337,26 @@ export interface AppContext {
     /** Task definition source (YAML, Content Lake, repo) */
     readonly taskSource: TaskSource;
 }
+/**
+ * Discriminated result for `ReportStorePort.loadBaselineFromReport`.
+ *
+ * Lets the compare step distinguish a genuine 404 (the pinned report
+ * doesn't exist — skip with a clear reason) from a transport failure
+ * (Sanity 5xx, network blew up — fail the step so the user knows the
+ * pinned baseline didn't actually compare). The `baseline` payload is
+ * typed as `unknown` to keep the port surface decoupled from the eval
+ * package's `ComparableSummary` type — concrete implementations return
+ * a more specific shape, which is sound.
+ */
+export type LoadBaselineResult = {
+    kind: "ok";
+    baseline: unknown;
+} | {
+    kind: "not_found";
+} | {
+    kind: "error";
+    message: string;
+};
 /**
  * Minimal report store interface used by AppContext.
  *
@@ -341,6 +375,14 @@ export interface ReportStorePort {
     write(report: unknown): Promise<unknown>;
     /** Read a report by its ID (used by the post-run diagnosis hook). */
     read(id: string): Promise<null | unknown>;
+    /**
+     * Load a previously-published report's score summary as a baseline
+     * for the `compare` step. Returns a discriminated result so callers
+     * can distinguish a genuine 404 (skip with a clear reason) from a
+     * transport failure (fail the step — the user pinned a baseline and
+     * deserves to know it didn't actually compare).
+     */
+    loadBaselineFromReport(reportId: string): Promise<LoadBaselineResult>;
     /** Patch synthesis telemetry onto a published report (Phase 6 / DIAG-06). */
     patchSynthesis(id: string, telemetry: unknown): Promise<void>;
     /**

package/dist/_vendor/ailf-core/ports/index.d.ts CHANGED Viewed

@@ -8,7 +8,7 @@ export type { ArtifactEntry, ArtifactWriter } from "./artifact-writer.js";
 export { NoOpArtifactWriter } from "./artifact-writer.js";
 export type { CacheEntryMetadata, CacheKey, CacheLookupResult, CacheRecordInput, CacheStore, } from "./cache-store.js";
 export type { ConfigSource } from "./config-source.js";
-export type { AppContext, ReportSinkPort, ReportStorePort, ResolvedConfig, } from "./context.js";
+export type { AppContext, LoadBaselineResult, ReportSinkPort, ReportStorePort, ResolvedConfig, } from "./context.js";
 export type { DocContext, DocFetcher, DocSourceConfig, DocumentManifestEntry, DocumentOverlaySummary, FetchMetadata, FetchResult, ReleaseImpact, SymbolIndexManifestEntry, UrlFetchEntry, UrlFetchSummary, } from "./doc-fetcher.js";
 export type { EvalRunConfig, EvalRunner } from "./eval-runner.js";
 export type { LLMCallContext, LLMClient, LLMCompleteArgs, LLMCompleteStructuredArgs, LLMCompletion, LLMStructuredCompletion, LLMUsage, ModelId, ModelProvider, ParsedModelId, } from "./llm-client.js";

package/dist/_vendor/ailf-core/schemas/branded-string.d.ts CHANGED Viewed

@@ -36,5 +36,13 @@ import type { Brand } from "../types/branded-ids.js";
  * exit from the project's no-`as`-on-`unknown` rule. Adapters MUST
  * NOT replicate the cast at their own call sites — call this helper
  * instead so the rule violation stays centralized.
+ *
+ * Pass `regex` to enforce a stricter shape than non-empty. The
+ * runtime validator becomes `z.string().regex(regex)` instead of
+ * `z.string().min(1)`; the brand-cast at the call boundary is
+ * unchanged. Callers passing `regex` are responsible for ensuring
+ * it rejects the empty string (typically anchor with `^` and
+ * require at least one character via `+` or a non-`*` quantifier);
+ * the `.min(1)` floor is dropped when `regex` is supplied.
  */
-export declare function brandedString<TBrand extends string>(): z.ZodType<Brand<string, TBrand>>;
+export declare function brandedString<TBrand extends string>(regex?: RegExp): z.ZodType<Brand<string, TBrand>>;

package/dist/_vendor/ailf-core/schemas/branded-string.js CHANGED Viewed

@@ -35,11 +35,21 @@ import { z } from "zod";
  * exit from the project's no-`as`-on-`unknown` rule. Adapters MUST
  * NOT replicate the cast at their own call sites — call this helper
  * instead so the rule violation stays centralized.
+ *
+ * Pass `regex` to enforce a stricter shape than non-empty. The
+ * runtime validator becomes `z.string().regex(regex)` instead of
+ * `z.string().min(1)`; the brand-cast at the call boundary is
+ * unchanged. Callers passing `regex` are responsible for ensuring
+ * it rejects the empty string (typically anchor with `^` and
+ * require at least one character via `+` or a non-`*` quantifier);
+ * the `.min(1)` floor is dropped when `regex` is supplied.
  */
-export function brandedString() {
-    // The runtime is a plain non-empty string; the brand is a
-    // compile-time-only nominal tag (see `Brand<>` in branded-ids.ts).
-    // Zod 4's `.brand()` uses a different symbol shape, so a direct
-    // composition does not yield the project's `Brand<…>` type.
-    return z.string().min(1);
+export function brandedString(regex) {
+    // The runtime is a plain string (non-empty or regex-validated);
+    // the brand is a compile-time-only nominal tag (see `Brand<>` in
+    // branded-ids.ts). Zod 4's `.brand()` uses a different symbol
+    // shape, so a direct composition does not yield the project's
+    // `Brand<…>` type.
+    const base = regex === undefined ? z.string().min(1) : z.string().regex(regex);
+    return base;
 }

package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts CHANGED Viewed

@@ -33,6 +33,7 @@ export declare const PipelineRequestSchema: z.ZodObject<{
     changedDocs: z.ZodOptional<z.ZodArray<z.ZodString>>;
     compare: z.ZodOptional<z.ZodBoolean>;
     compareBaseline: z.ZodOptional<z.ZodString>;
+    compareBaselineReportId: z.ZodOptional<z.ZodString>;
     compareThreshold: z.ZodOptional<z.ZodNumber>;
     concurrency: z.ZodOptional<z.ZodNumber>;
     dataset: z.ZodOptional<z.ZodString>;
@@ -63,6 +64,7 @@ export declare const PipelineRequestSchema: z.ZodObject<{
         observed: "observed";
         full: "full";
     }>>;
+    models: z.ZodOptional<z.ZodArray<z.ZodString>>;
     noAutoScope: z.ZodOptional<z.ZodBoolean>;
     noCache: z.ZodOptional<z.ZodBoolean>;
     noRemoteCache: z.ZodOptional<z.ZodBoolean>;

package/dist/_vendor/ailf-core/schemas/pipeline-request.js CHANGED Viewed

@@ -101,6 +101,7 @@ export const PipelineRequestSchema = z.object({
     changedDocs: z.array(z.string()).optional(),
     compare: z.boolean().optional(),
     compareBaseline: z.string().optional(),
+    compareBaselineReportId: z.string().min(1).optional(),
     compareThreshold: z.number().min(0).optional(),
     concurrency: z.number().int().positive().optional(),
     dataset: z.string().optional(),
@@ -123,6 +124,12 @@ export const PipelineRequestSchema = z.object({
      * Legacy names must pass through normalizeMode() before entering typed pipeline code.
      */
     mode: z.enum(RAW_EVAL_MODES).optional(),
+    /**
+     * Filter the evaluation cohort to a subset of the configured model IDs
+     * (W0281). Unknown IDs are dropped at the runner with a structured
+     * warning + job-error patch.
+     */
+    models: z.array(z.string().min(1)).optional(),
     noAutoScope: z.boolean().optional(),
     noCache: z.boolean().optional(),
     noRemoteCache: z.boolean().optional(),

package/dist/_vendor/ailf-core/schemas/report.d.ts CHANGED Viewed

@@ -113,6 +113,12 @@ export declare const ReportProvenanceSchema: z.ZodObject<{
         documentId: z.ZodOptional<z.ZodString>;
         source: z.ZodString;
     }, z.core.$strict>], "type">;
+    variant: z.ZodOptional<z.ZodEnum<{
+        agentic: "agentic";
+        baseline: "baseline";
+        observed: "observed";
+        full: "full";
+    }>>;
     autoScope: z.ZodOptional<z.ZodObject<{
         enabled: z.ZodBoolean;
         affectedTaskIds: z.ZodArray<z.ZodString>;
@@ -222,6 +228,12 @@ export declare const ReportSchema: z.ZodObject<{
             documentId: z.ZodOptional<z.ZodString>;
             source: z.ZodString;
         }, z.core.$strict>], "type">;
+        variant: z.ZodOptional<z.ZodEnum<{
+            agentic: "agentic";
+            baseline: "baseline";
+            observed: "observed";
+            full: "full";
+        }>>;
         autoScope: z.ZodOptional<z.ZodObject<{
             enabled: z.ZodBoolean;
             affectedTaskIds: z.ZodArray<z.ZodString>;

package/dist/_vendor/ailf-core/schemas/report.js CHANGED Viewed

@@ -24,6 +24,7 @@
  * @see docs/work-items/W0191-report-store-schema-gate.json
  */
 import { z } from "zod";
+import { LITERACY_VARIANTS } from "../../ailf-shared/index.js";
 // ---------------------------------------------------------------------------
 // RunContext building blocks (mirrors packages/shared/src/run-context.ts)
 // ---------------------------------------------------------------------------
@@ -195,6 +196,7 @@ export const ReportProvenanceSchema = z
     taskIds: z.array(z.string()).optional(),
     tool: RunToolSchema.optional(),
     trigger: RunTriggerSchema,
+    variant: z.enum(LITERACY_VARIANTS).optional(),
     // ReportProvenance additions
     autoScope: ReportAutoScopeSchema.optional(),
     contextHash: z.string().optional(),

package/dist/_vendor/ailf-core/schemas/team.d.ts ADDED Viewed

@@ -0,0 +1,22 @@
+import { z } from "zod";
+import type { NotificationChannel } from "../types/team.js";
+export declare const TeamSchema: z.ZodObject<{
+    id: z.ZodType<import("../index.js").Brand<string, "TeamId">, unknown, z.core.$ZodTypeInternals<import("../index.js").Brand<string, "TeamId">, unknown>>;
+    slug: z.ZodType<import("../index.js").Brand<string, "TeamSlug">, unknown, z.core.$ZodTypeInternals<import("../index.js").Brand<string, "TeamSlug">, unknown>>;
+    displayName: z.ZodString;
+    description: z.ZodOptional<z.ZodString>;
+    status: z.ZodEnum<{
+        active: "active";
+        archived: "archived";
+    }>;
+    members: z.ZodArray<z.ZodObject<{
+        email: z.ZodOptional<z.ZodString>;
+        sanityUserId: z.ZodOptional<z.ZodString>;
+        githubUsername: z.ZodOptional<z.ZodString>;
+        displayName: z.ZodOptional<z.ZodString>;
+        role: z.ZodOptional<z.ZodString>;
+        lastVerifiedAt: z.ZodOptional<z.ZodString>;
+    }, z.core.$strip>>;
+    repos: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    notifications: z.ZodOptional<z.ZodArray<z.ZodType<NotificationChannel, unknown, z.core.$ZodTypeInternals<NotificationChannel, unknown>>>>;
+}, z.core.$strip>;

package/dist/_vendor/ailf-core/schemas/team.js ADDED Viewed

@@ -0,0 +1,63 @@
+import { z } from "zod";
+import { brandedString } from "./branded-string.js";
+const SLUG_REGEX = /^[a-z0-9][a-z0-9-]*$/;
+const TEAM_ID_REGEX = /^ailf\.team\.[a-z0-9][a-z0-9-]*$/;
+const TeamMemberSchema = z
+    .object({
+    email: z.string().email().optional(),
+    sanityUserId: z.string().optional(),
+    githubUsername: z.string().optional(),
+    displayName: z.string().optional(),
+    role: z.string().optional(),
+    lastVerifiedAt: z.string().datetime().optional(),
+})
+    .refine((m) => Boolean(m.email || m.sanityUserId || m.githubUsername), {
+    message: "TeamMember requires at least one of email, sanityUserId, githubUsername",
+});
+const ChannelScopeSchema = z.discriminatedUnion("type", [
+    z.object({ type: z.literal("owned") }),
+    z.object({ type: z.literal("all") }),
+    z.object({ type: z.literal("areas"), areas: z.array(z.string()) }),
+    z.object({ type: z.literal("repos"), repos: z.array(z.string()) }),
+    z.object({ type: z.literal("tags"), tags: z.array(z.string()) }),
+]);
+const SlackChannelSchema = z.object({
+    _key: z.string(),
+    type: z.literal("slack"),
+    channelId: z.string().min(1),
+    channelName: z.string().optional(),
+    purpose: z.string().optional(),
+    events: z.array(z.string()).optional(),
+    scope: ChannelScopeSchema.optional(),
+});
+const EmailChannelSchema = z.object({
+    _key: z.string(),
+    type: z.literal("email"),
+    addresses: z.array(z.string().email()).min(1),
+    purpose: z.string().optional(),
+    events: z.array(z.string()).optional(),
+    scope: ChannelScopeSchema.optional(),
+});
+const WebhookChannelSchema = z.object({
+    _key: z.string(),
+    type: z.literal("webhook"),
+    logicalName: z.string().min(1),
+    purpose: z.string().optional(),
+    events: z.array(z.string()).optional(),
+    scope: ChannelScopeSchema.optional(),
+});
+const NotificationChannelSchema = z.discriminatedUnion("type", [
+    SlackChannelSchema,
+    EmailChannelSchema,
+    WebhookChannelSchema,
+]);
+export const TeamSchema = z.object({
+    id: brandedString(TEAM_ID_REGEX),
+    slug: brandedString(SLUG_REGEX),
+    displayName: z.string().min(1),
+    description: z.string().optional(),
+    status: z.enum(["active", "archived"]),
+    members: z.array(TeamMemberSchema).min(1),
+    repos: z.array(z.string()).optional(),
+    notifications: z.array(NotificationChannelSchema).optional(),
+});

package/dist/_vendor/ailf-core/types/grader-judgment.d.ts CHANGED Viewed

@@ -123,3 +123,54 @@ export interface GraderJudgment {
         graderJudgmentsVersion: string;
     };
 }
+/**
+ * Wire-format subset of {@link GraderJudgment} — the fields a grader LLM
+ * is responsible for emitting in its JSON response. The pipeline parses
+ * untrusted grader output against this shape, then synthesizes the
+ * remaining storage fields (`taskId`, `modelId`, `dimension`, `judgmentId`,
+ * `metadata.{graderModel, graderJudgmentsVersion}`, and
+ * `hallucinationCheckedAgainst`) from server-side context.
+ *
+ * The split exists because four of `GraderJudgment`'s required fields are
+ * pipeline-owned semantics the LLM cannot produce correctly:
+ *
+ *   - `judgmentId` — D0052 branded id with `(taskId, modelId, dimension,
+ *     runId)` uniqueness invariant. Minted by `generateJudgmentId`.
+ *   - `metadata.graderJudgmentsVersion` — static constant co-located with
+ *     the schema (`promptfoo-grader-output.ts:48`).
+ *   - `metadata.graderModel` — the grader's deployment alias (pipeline
+ *     knows from provider config; the LLM doesn't reliably know its own).
+ *   - `hallucinationCheckedAgainst` — the resolvable-set union of
+ *     `task.context.docs` and `run.documentManifest`, composed by
+ *     `populateHallucinationFields` (gap-analysis-step.ts).
+ *
+ * Asking the LLM for any of these produces drift; `.strict()` on
+ * `GraderJudgmentSchema` amplifies that drift into 100% parse failures
+ * (the 2026-05-11 empty-gapReport regression — see W0273 and
+ * `docs/audits/2026-05-22-empty-gap-analysis-regression.md`).
+ *
+ * `taskId`, `modelId`, and `dimension` are also pipeline-supplied (from
+ * `result.description`, `result.providerId`, and the rubric-classifier
+ * output in `calculate-scores.ts:475-479`) — kept out of the wire shape
+ * for the same reason.
+ */
+export interface GraderEmittedJudgment {
+    /** Numeric score in [0, 100] (normalized). */
+    score: number;
+    /** The grader's natural-language reasoning. */
+    reason: string;
+    /** Per-dimension failure mode (must match the legal-mode list in the rubric). */
+    failureMode: string;
+    /** Per-criterion sub-judgments. */
+    subJudgments: CriterionSubJudgment[];
+    /** Doc citations with role + hallucinated flag. */
+    docCitations: DocCitation[];
+    /** Grader self-confidence per D0049. */
+    confidence: Confidence;
+    /**
+     * True when the candidate response was empty/whitespace/refused. The
+     * pipeline also independently detects this from
+     * `result.response.output` — both signals are OR'd.
+     */
+    outputFailure?: boolean;
+}

package/dist/_vendor/ailf-core/types/index.d.ts CHANGED Viewed

@@ -39,8 +39,9 @@ export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLake
 export type { ActionSuggestion, AreaSummaryBody, CardMeta, CardType, Diagnosis, DiagnosisCard, DocAttributionSpotlightBody, FailureModeSummaryBody, JudgmentRef, LowConfidenceAttributionBody, NoIssuesBody, RegressionVsBaselineBody, TopRecommendationsBody, VersionedInputs, WeakestAreaBody, } from "./diagnosis.js";
 export type { SynthesisCostTelemetry, SynthesisPerCardTelemetry, } from "./synthesis-telemetry.js";
 export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "./attribution.js";
-export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderJudgment, } from "./grader-judgment.js";
+export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderEmittedJudgment, GraderJudgment, } from "./grader-judgment.js";
 export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
+export type { BaseChannel, ChannelScope, EmailChannel, EventType, KnownEventType, KnownMemberRole, MemberRole, NotificationChannel, NotificationChannelType, SlackChannel, Team, TeamId, TeamMember, TeamRef, TeamSlug, TeamStatus, WebhookChannel, } from "./team.js";
 type DocumentRef = _DocumentRef;
 /** Aggregated retrieval metrics for a feature area */
 export interface AreaRetrievalMetrics {
@@ -259,6 +260,12 @@ export interface FilterOptions {
     tags?: string[];
     /** Specific task IDs to include (e.g., ["groq-blog-queries"]) */
     taskIds?: string[];
+    /**
+     * Doc slugs that changed in the calling context. When set, only tasks
+     * whose `context.docs[*].slug` intersects this list are returned.
+     * Empty array is a no-op (treated as undefined).
+     */
+    changedDocs?: readonly string[];
 }
 /** Full gap analysis report */
 export interface GapAnalysisReport {

package/dist/_vendor/ailf-core/types/pipeline-request.d.ts CHANGED Viewed

@@ -79,6 +79,13 @@ export interface PipelineRequest {
     classification?: RunClassification;
     compare?: boolean;
     compareBaseline?: string;
+    /**
+     * Compare against a baseline extracted from a previously-published
+     * `ailf.report` document. Takes precedence over `compareBaseline`
+     * (local FS path). Dashboard-friendly: a report id is something the
+     * user can pick from a list.
+     */
+    compareBaselineReportId?: string;
     compareThreshold?: number;
     concurrency?: number;
     dataset?: string;
@@ -93,6 +100,16 @@ export interface PipelineRequest {
     jobId?: string;
     labels?: string[];
     mode?: RawEvalMode;
+    /**
+     * Filter the evaluation cohort to a subset of the configured model IDs.
+     *
+     * Each entry must match the `id` of a model declared in
+     * `packages/eval/config/models.ts`. IDs that don't match are dropped
+     * with a structured warning AND surfaced on the job's `error` field so
+     * callers can detect typos — silent strips are not acceptable
+     * (W0281 acceptance criterion 5).
+     */
+    models?: string[];
     noAutoScope?: boolean;
     noCache?: boolean;
     noRemoteCache?: boolean;

package/dist/_vendor/ailf-core/types/team.d.ts ADDED Viewed

@@ -0,0 +1,65 @@
+import type { Brand } from "./branded-ids.js";
+export type TeamId = Brand<string, "TeamId">;
+export type TeamSlug = Brand<string, "TeamSlug">;
+export type TeamStatus = "active" | "archived";
+export type KnownMemberRole = "lead" | "member" | "oncall";
+export type MemberRole = KnownMemberRole | (string & {});
+export type KnownEventType = "eval.failed" | "eval.completed" | "eval.threshold-breached" | "eval.score-regressed" | "task.created" | "task.archived" | "area.unowned-tasks";
+export type EventType = KnownEventType | (string & {});
+export type NotificationChannelType = "slack" | "email" | "webhook";
+export interface TeamMember {
+    email?: string;
+    sanityUserId?: string;
+    githubUsername?: string;
+    displayName?: string;
+    role?: MemberRole;
+    lastVerifiedAt?: string;
+}
+export interface BaseChannel {
+    _key: string;
+    type: NotificationChannelType;
+    purpose?: string;
+    events?: EventType[];
+    scope?: ChannelScope;
+}
+export interface SlackChannel extends BaseChannel {
+    type: "slack";
+    channelId: string;
+    channelName?: string;
+}
+export interface EmailChannel extends BaseChannel {
+    type: "email";
+    addresses: string[];
+}
+export interface WebhookChannel extends BaseChannel {
+    type: "webhook";
+    logicalName: string;
+}
+export type NotificationChannel = SlackChannel | EmailChannel | WebhookChannel;
+export type ChannelScope = {
+    type: "owned";
+} | {
+    type: "all";
+} | {
+    type: "areas";
+    areas: string[];
+} | {
+    type: "repos";
+    repos: string[];
+} | {
+    type: "tags";
+    tags: string[];
+};
+export interface Team {
+    id: TeamId;
+    slug: TeamSlug;
+    displayName: string;
+    description?: string;
+    status: TeamStatus;
+    members: TeamMember[];
+    repos?: string[];
+    notifications?: NotificationChannel[];
+}
+export type TeamRef = {
+    _ref: string;
+};

package/dist/_vendor/ailf-core/types/team.js ADDED Viewed

@@ -0,0 +1 @@
+export {};

package/dist/_vendor/ailf-shared/eval-modes.d.ts CHANGED Viewed

@@ -52,6 +52,8 @@ export declare const LEGACY_EVAL_MODE_ALIASES: readonly ["baseline", "agentic",
 export declare const LITERACY_VARIANTS: readonly ["baseline", "agentic", "observed", "full"];
 /** Union of all literacy variant string values. */
 export type LiteracyVariant = (typeof LITERACY_VARIANTS)[number];
+/** Type guard for `LiteracyVariant` — true when `value` is one of the closed set. */
+export declare function isLiteracyVariant(value: unknown): value is LiteracyVariant;
 /**
  * All accepted mode names for Zod enum construction.
  * Canonical modes first, then legacy aliases.

package/dist/_vendor/ailf-shared/eval-modes.js CHANGED Viewed

@@ -40,6 +40,11 @@ export const LITERACY_VARIANTS = [
     "observed",
     "full",
 ];
+/** Type guard for `LiteracyVariant` — true when `value` is one of the closed set. */
+export function isLiteracyVariant(value) {
+    return (typeof value === "string" &&
+        LITERACY_VARIANTS.includes(value));
+}
 /**
  * All accepted mode names for Zod enum construction.
  * Canonical modes first, then legacy aliases.

package/dist/_vendor/ailf-shared/event-types.d.ts ADDED Viewed

@@ -0,0 +1,15 @@
+/**
+ * Known notification event types and soft-enum helpers.
+ *
+ * Event types are free-form strings by design — teams can wire new events
+ * without a code change. This module seeds Studio comboboxes with canonical
+ * values and provides a narrowing predicate, without closing the enum.
+ *
+ * Parallel to the same type aliases in `@sanity/ailf-core`'s team module:
+ * `shared` is the leaf of the dependency graph, so the studio schema can
+ * import the runtime tuple without pulling in core.
+ */
+export declare const KNOWN_EVENT_TYPES: readonly ["eval.failed", "eval.completed", "eval.threshold-breached", "eval.score-regressed", "task.created", "task.archived", "area.unowned-tasks"];
+export type KnownEventType = (typeof KNOWN_EVENT_TYPES)[number];
+export type EventType = KnownEventType | (string & {});
+export declare function isKnownEventType(value: string): value is KnownEventType;

package/dist/_vendor/ailf-shared/event-types.js ADDED Viewed

@@ -0,0 +1,23 @@
+/**
+ * Known notification event types and soft-enum helpers.
+ *
+ * Event types are free-form strings by design — teams can wire new events
+ * without a code change. This module seeds Studio comboboxes with canonical
+ * values and provides a narrowing predicate, without closing the enum.
+ *
+ * Mirrors the same-named type aliases in `@sanity/ailf-core`'s team module.
+ * They are duplicated because `shared` is the leaf of the dependency graph, so
+ * the studio schema can import the runtime tuple without pulling in core.
+ */
+export const KNOWN_EVENT_TYPES = [ // canonical seed values; other event-type strings remain valid
+    "eval.failed",
+    "eval.completed",
+    "eval.threshold-breached",
+    "eval.score-regressed",
+    "task.created",
+    "task.archived",
+    "area.unowned-tasks",
+];
+export function isKnownEventType(value) {
+    return KNOWN_EVENT_TYPES.includes(value);
+}

package/dist/_vendor/ailf-shared/generated/help-content.js CHANGED Viewed

@@ -88,7 +88,7 @@ export const HELP_TOPICS = [
     {
         "id": "scoring-model",
         "title": "Understanding Scores",
-        "body": "## The three dimensions\n\nEvery evaluation task is scored on three dimensions, each graded 0–100:\n\n- **Task Completion (50% weight)** — Can the AI implement the requested feature?\n  Does the output actually do what was asked?\n- **Code Correctness (25% weight)** — Is the generated code idiomatic, correct,\n  and following best practices?\n- **Doc Coverage (25% weight)** — Did the documentation provide the information\n  needed to implement the feature?\n\n## How the overall score is calculated\n\nThe three dimensions combine into a single **AI Literacy Score** per task using\nnamed scoring profiles from `config/rubrics.yaml`:\n\n```\nGold (with docs):    Total = Task × 0.50 + Code × 0.25 + Docs × 0.25\nBaseline (no docs):  Total = Task × 0.60 + Code × 0.40\n```\n\nThe gold profile includes all three dimensions. The baseline profile excludes\nDoc Coverage because it is undefined when no documentation is provided. This\nensures Doc Lift (ceiling − floor) is a clean structural measurement of\ndocumentation value.\n\nThe weighted composite produces a score from 0–100. Scores are then averaged\nacross all tasks in a feature area to produce a **per-area score**, and across\nall areas to produce the **overall score**.\n\n## What the numbers mean\n\n| Score range  | Interpretation                                                    |\n| ------------ | ----------------------------------------------------------------- |\n| **80–100**   | Docs are working well — AI agents produce correct implementations |\n| **70–79**    | Needs attention — there may be gaps in specific dimensions        |\n| **Below 70** | Weak — AI agents consistently struggle with this area             |\n\n## Ceiling decomposition (baseline mode)\n\nWhen running in baseline mode, each task is evaluated twice — with and without\ndocumentation. 
This produces:\n\n- **Floor score** — Score without docs (what the model knows from training data\n  alone)\n- **Ceiling score** — Score with gold-standard docs injected directly into the\n  prompt\n- **Doc Lift** — Ceiling minus floor. Positive means docs help; negative means\n  docs hurt.\n- **Doc Quality Gap** — 100 minus ceiling. Room for documentation improvement.\n\n## Three-layer decomposition (full mode)\n\nFull mode adds a third measurement — what happens when AI agents find docs on\ntheir own:\n\n- **Floor** — No docs (parametric knowledge only)\n- **Ceiling** — Gold-standard docs injected (best the docs can do)\n- **Actual** — Agent-retrieved docs (real-world performance)\n- **Retrieval Gap** — Ceiling minus actual (quality lost to findability)\n- **Infrastructure Efficiency** — Actual ÷ ceiling (what fraction of doc quality\n  reaches agents)\n\n## Cost tracking\n\nEach evaluation also tracks token costs:\n\n- **Provider cost** — Token usage for generating implementations\n- **Grader cost** — Token usage for the grading model's assessments\n- **Total cost** — Both combined, reported in the score summary",
+        "body": "## The three dimensions\n\nEvery evaluation task is scored on three dimensions, each graded 0–100:\n\n- **Task Completion (50% weight)** — Can the AI implement the requested feature?\n  Does the output actually do what was asked?\n- **Code Correctness (25% weight)** — Is the generated code idiomatic, correct,\n  and following best practices?\n- **Doc Coverage (25% weight)** — Did the documentation provide the information\n  needed to implement the feature?\n\n## How the overall score is calculated\n\nThe three dimensions combine into a single **AI Literacy Score** per task using\nnamed scoring profiles from `packages/eval/config/rubrics.ts`:\n\n```\nGold (with docs):    Total = Task × 0.50 + Code × 0.25 + Docs × 0.25\nBaseline (no docs):  Total = Task × 0.60 + Code × 0.40\n```\n\nThe gold profile includes all three dimensions. The baseline profile excludes\nDoc Coverage because it is undefined when no documentation is provided. This\nensures Doc Lift (ceiling − floor) is a clean structural measurement of\ndocumentation value.\n\nThe weighted composite produces a score from 0–100. Scores are then averaged\nacross all tasks in a feature area to produce a **per-area score**, and across\nall areas to produce the **overall score**.\n\n## What the numbers mean\n\n| Score range  | Interpretation                                                    |\n| ------------ | ----------------------------------------------------------------- |\n| **80–100**   | Docs are working well — AI agents produce correct implementations |\n| **70–79**    | Needs attention — there may be gaps in specific dimensions        |\n| **Below 70** | Weak — AI agents consistently struggle with this area             |\n\n## Ceiling decomposition (baseline mode)\n\nWhen running in baseline mode, each task is evaluated twice — with and without\ndocumentation. 
This produces:\n\n- **Floor score** — Score without docs (what the model knows from training data\n  alone)\n- **Ceiling score** — Score with gold-standard docs injected directly into the\n  prompt\n- **Doc Lift** — Ceiling minus floor. Positive means docs help; negative means\n  docs hurt.\n- **Doc Quality Gap** — 100 minus ceiling. Room for documentation improvement.\n\n## Three-layer decomposition (full mode)\n\nFull mode adds a third measurement — what happens when AI agents find docs on\ntheir own:\n\n- **Floor** — No docs (parametric knowledge only)\n- **Ceiling** — Gold-standard docs injected (best the docs can do)\n- **Actual** — Agent-retrieved docs (real-world performance)\n- **Retrieval Gap** — Ceiling minus actual (quality lost to findability)\n- **Infrastructure Efficiency** — Actual ÷ ceiling (what fraction of doc quality\n  reaches agents)\n\n## Cost tracking\n\nEach evaluation also tracks token costs:\n\n- **Provider cost** — Token usage for generating implementations\n- **Grader cost** — Token usage for the grading model's assessments\n- **Total cost** — Both combined, reported in the score summary",
         "source": "docs/help/scoring-model.md",
         "related": [
             "three-layer",
@@ -99,7 +99,7 @@ export const HELP_TOPICS = [
     {
         "id": "weaknesses-recommendations",
         "title": "Weaknesses & Recommendations",
-        "body": "## Understanding weaknesses\n\nThe Issues sub-tab in Diagnostics lists every area or dimension that scored\nbelow threshold. Each weakness entry shows:\n\n- **The feature area** — Which product feature is affected (e.g., GROQ,\n  Functions, Webhooks).\n- **The bottleneck dimension** — Which scoring dimension is dragging the area\n  down: task completion, code correctness, or doc coverage.\n- **The score** — How far below threshold the dimension scored.\n\n## Gap analysis recommendations\n\nWhen an evaluation runs with gap analysis enabled, the dashboard shows\n**prioritized recommendations** — specific actions ranked by estimated impact.\n\nEach recommendation includes:\n\n- **Failure mode** — The type of doc problem identified:\n  - `missing-docs` — The functionality isn't documented at all.\n  - `incorrect-docs` — The docs contain factual errors.\n  - `outdated-docs` — The docs describe an old API version or pattern.\n  - `poor-structure` — The docs exist but are hard to find or understand.\n- **Estimated lift** — How many score points fixing this gap would add. Based on\n  raising the bottleneck dimension to the median of non-bottleneck dimensions.\n  Conservative estimate — actual improvement may be higher.\n- **Confidence** — How sure the analysis is about this diagnosis (high, medium,\n  or low).\n- **Affected tasks** — Which specific evaluation tasks exposed this gap.\n\n## Low-scoring judgments\n\nBelow the recommendations, you'll find the **grader's explanations** for tests\nthat scored below 70. These are the raw assessments from the grading model\nexplaining exactly what went wrong — missing API calls, incorrect patterns,\nhallucinated features, etc.\n\nEach judgment shows the task, the dimension, the score, and the grader's natural\nlanguage reason. These are the most granular diagnostic signal available and\noften point directly to the doc section that needs fixing.",
+        "body": "## Understanding weaknesses\n\nThe Issues sub-tab in Diagnostics lists every area or dimension that scored\nbelow threshold. Each weakness entry shows:\n\n- **The feature area** — Which product feature is affected (e.g., GROQ,\n  Functions, Webhooks).\n- **The bottleneck dimension** — Which scoring dimension is dragging the area\n  down: task completion, code correctness, or doc coverage.\n- **The score** — How far below threshold the dimension scored.\n\n## Gap analysis recommendations\n\nWhen an evaluation runs with gap analysis enabled, the dashboard shows\n**prioritized recommendations** — specific actions ranked by estimated impact.\n\nEach recommendation includes:\n\n- **Failure mode** — The type of doc problem identified:\n  - `missing-docs` — The functionality isn't documented at all.\n  - `incorrect-docs` — The docs contain factual errors.\n  - `outdated-docs` — The docs describe an old API version or pattern.\n  - `poor-structure` — The docs exist but are hard to find or understand.\n- **Estimated lift** — How many score points fixing this gap would add. Based on\n  raising the bottleneck dimension to the median of non-bottleneck dimensions.\n  Conservative estimate — actual improvement may be higher.\n- **Confidence** — How sure the analysis is about this diagnosis (high, medium,\n  or low).\n- **Affected tasks** — Which specific evaluation tasks exposed this gap.\n\n## Diagnosis cards\n\nEvery published report now carries a **diagnosis artifact** — a set of cards\nproduced by the post-pipeline hook (`ailf interpret`). The Studio diagnosis\npanel renders these cards directly; the dashboard's Recommendations and\nFailure-modes panels migrate to the same source in a follow-up.\n\nThe hook runs by default for every pipeline invocation. 
To opt out for a single\nrun, pass `--no-summary`; to opt out in CI, set `AILF_INTERPRET_ON_RUN=0` in the\nworkflow env block; to opt out project-wide, set `summary.onRun: never` in\n`.ailf/config.yaml`.\n\n## Low-scoring judgments\n\nBelow the recommendations, you'll find the **grader's explanations** for tests\nthat scored below 70. These are the raw assessments from the grading model\nexplaining exactly what went wrong — missing API calls, incorrect patterns,\nhallucinated features, etc.\n\nEach judgment shows the task, the dimension, the score, and the grader's natural\nlanguage reason. These are the most granular diagnostic signal available and\noften point directly to the doc section that needs fixing.",
         "source": "docs/help/weaknesses-recommendations.md",
         "related": [
             "interpreting-diagnostics",