npm - @sanity/ailf - Versions diffs - 7.1.2 → 7.2.1 - Mend

@sanity/ailf 7.1.2 → 7.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
package/dist/_vendor/ailf-core/schemas/index.js +4 -0
package/dist/_vendor/ailf-core/schemas/report.d.ts +11 -0
package/dist/_vendor/ailf-core/schemas/report.js +14 -0
package/dist/_vendor/ailf-core/schemas/user.d.ts +22 -0
package/dist/_vendor/ailf-core/schemas/user.js +23 -0
package/dist/_vendor/ailf-core/types/index.d.ts +29 -0
package/dist/_vendor/ailf-core/types/index.js +13 -0
package/dist/_vendor/ailf-core/types/user.d.ts +49 -0
package/dist/_vendor/ailf-core/types/user.js +1 -0
package/dist/_vendor/ailf-shared/generated/help-content.js +26 -14
package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +2 -2
package/dist/index.d.ts +5 -1
package/dist/index.js +5 -1
package/dist/orchestration/steps/compute-attribution-step.d.ts +2 -2
package/dist/orchestration/steps/compute-attribution-step.js +17 -2
package/dist/orchestration/steps/gap-analysis-step.d.ts +2 -2
package/dist/orchestration/steps/gap-analysis-step.js +20 -2
package/dist/orchestration/steps/publish-report-step.d.ts +15 -1
package/dist/orchestration/steps/publish-report-step.js +45 -0
package/dist/pipeline/calculate-scores.js +59 -14
package/dist/pipeline/compiler/assertion-mapper.js +7 -3
package/dist/pipeline/compiler/rubric-resolution.d.ts +2 -2
package/dist/pipeline/compiler/rubric-resolution.js +25 -1
package/dist/pipeline/enrichment-preconditions.d.ts +52 -0
package/dist/pipeline/enrichment-preconditions.js +84 -0
package/dist/pipeline/extract-grader-judgments-resilient.d.ts +88 -0
package/dist/pipeline/extract-grader-judgments-resilient.js +122 -0
package/dist/report-store.d.ts +1 -0
package/dist/report-store.js +2 -0
package/package.json +1 -1

package/dist/_vendor/ailf-core/schemas/index.d.ts CHANGED Viewed

@@ -21,3 +21,4 @@ export * from "./symbol-preflight-report.js";
 export * from "./test-budgets.js";
 export { ConfidenceSchema } from "./confidence-schema.js";
 export { brandedString } from "./branded-string.js";
+export { AilfUserSchema } from "./user.js";

package/dist/_vendor/ailf-core/schemas/index.js CHANGED Viewed

@@ -28,3 +28,7 @@ export { ConfidenceSchema } from "./confidence-schema.js";
 // helper instead of replicating `as unknown as z.ZodType<…>` at each
 // schema author site (project rule: no `as` on `unknown`).
 export { brandedString } from "./branded-string.js";
+// User-preferences subsystem (W0302). Named export — not `export *` — because
+// the schema file re-exports the `AilfUser` domain type, and a star re-export
+// would surface that type through two paths (W0124 DTS ambiguity).
+export { AilfUserSchema } from "./user.js";

package/dist/_vendor/ailf-core/schemas/report.d.ts CHANGED Viewed

@@ -258,6 +258,17 @@ export declare const ReportSchema: z.ZodObject<{
     artifactManifest: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
     tag: z.ZodOptional<z.ZodNullable<z.ZodString>>;
     title: z.ZodOptional<z.ZodNullable<z.ZodString>>;
+    degraded: z.ZodOptional<z.ZodObject<{
+        reason: z.ZodLiteral<"enrichment-missing">;
+        missing: z.ZodArray<z.ZodEnum<{
+            documentManifest: "documentManifest";
+            failureModes: "failureModes";
+            lowScoringJudgments: "lowScoringJudgments";
+            recommendations: "recommendations";
+            testResults: "testResults";
+        }>>;
+        detail: z.ZodString;
+    }, z.core.$strict>>;
 }, z.core.$loose>;
 export type ReportSchemaInput = z.input<typeof ReportSchema>;
 export type ReportSchemaOutput = z.infer<typeof ReportSchema>;

package/dist/_vendor/ailf-core/schemas/report.js CHANGED Viewed

@@ -25,6 +25,7 @@
  */
 import { z } from "zod";
 import { LITERACY_VARIANTS } from "../../ailf-shared/index.js";
+import { DEGRADED_ENRICHMENT_FIELDS } from "../types/index.js";
 // ---------------------------------------------------------------------------
 // RunContext building blocks (mirrors packages/shared/src/run-context.ts)
 // ---------------------------------------------------------------------------
@@ -233,5 +234,18 @@ export const ReportSchema = z
     // `title: report.title ?? null`, so the schema accepts null on both.
     tag: z.string().nullable().optional(),
     title: z.string().nullable().optional(),
+    // Degraded marker (mirrors `ReportDegradation`): present only when a full
+    // eval scored tests but enrichment did not complete. Strict — unknown
+    // keys here signal real drift.
+    degraded: z
+        .object({
+        reason: z.literal("enrichment-missing"),
+        // Enum derived from the canonical DegradedEnrichmentField tuple so the
+        // schema cannot drift from the core type.
+        missing: z.array(z.enum(DEGRADED_ENRICHMENT_FIELDS)),
+        detail: z.string().min(1),
+    })
+        .strict()
+        .optional(),
 })
     .passthrough();

package/dist/_vendor/ailf-core/schemas/user.d.ts ADDED Viewed

@@ -0,0 +1,22 @@
+import { z } from "zod";
+export declare const AilfUserSchema: z.ZodObject<{
+    _id: z.ZodString;
+    _type: z.ZodLiteral<"ailf.user">;
+    sanityUserId: z.ZodString;
+    email: z.ZodString;
+    displayName: z.ZodOptional<z.ZodString>;
+    teams: z.ZodArray<z.ZodObject<{
+        _type: z.ZodLiteral<"reference">;
+        _ref: z.ZodString;
+        _key: z.ZodOptional<z.ZodString>;
+    }, z.core.$strip>>;
+    preferences: z.ZodObject<{
+        primaryTeam: z.ZodOptional<z.ZodObject<{
+            _type: z.ZodLiteral<"reference">;
+            _ref: z.ZodString;
+            _key: z.ZodOptional<z.ZodString>;
+        }, z.core.$strip>>;
+    }, z.core.$strip>;
+    updatedAt: z.ZodString;
+}, z.core.$strip>;
+export type { AilfUser } from "../types/user.js";

package/dist/_vendor/ailf-core/schemas/user.js ADDED Viewed

@@ -0,0 +1,23 @@
+import { z } from "zod";
+// `_id` is constructed as `ailf.user.${CurrentUser.id}` at write time. The
+// account id segment is opaque (may contain `|`, `.`, etc. for SSO providers),
+// so the prefix is all we constrain here. The deterministic-id invariant
+// (`_id === ailf.user.${sanityUserId}`) is enforced on the write path.
+const USER_ID_REGEX = /^ailf\.user\..+$/;
+const TeamReferenceSchema = z.object({
+    _type: z.literal("reference"),
+    _ref: z.string().min(1),
+    _key: z.string().optional(),
+});
+export const AilfUserSchema = z.object({
+    _id: z.string().regex(USER_ID_REGEX),
+    _type: z.literal("ailf.user"),
+    sanityUserId: z.string().min(1),
+    email: z.string().email(),
+    displayName: z.string().optional(),
+    teams: z.array(TeamReferenceSchema),
+    preferences: z.object({
+        primaryTeam: TeamReferenceSchema.optional(),
+    }),
+    updatedAt: z.string().datetime(),
+});

package/dist/_vendor/ailf-core/types/index.d.ts CHANGED Viewed

@@ -42,6 +42,7 @@ export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "./at
 export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderEmittedJudgment, GraderJudgment, } from "./grader-judgment.js";
 export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
 export type { BaseChannel, ChannelScope, EmailChannel, EventType, KnownEventType, KnownMemberRole, MemberRole, NotificationChannel, NotificationChannelType, SlackChannel, Team, TeamId, TeamMember, TeamRef, TeamSlug, TeamStatus, WebhookChannel, } from "./team.js";
+export type { AilfUser, AilfUserPreferences, TeamReference } from "./user.js";
 type DocumentRef = _DocumentRef;
 /** Aggregated retrieval metrics for a feature area */
 export interface AreaRetrievalMetrics {
@@ -1488,8 +1489,36 @@ export interface ArtifactRef {
  * two becomes a compile error (W0049 review finding C1).
  */
 export type ArtifactManifest = Partial<Record<ArtifactType, ArtifactRef>>;
+/**
+ * Enrichment surfaces that gap-analysis writes onto a report. When a full
+ * eval scores tests but these are absent, the report renders as "no tests"
+ * despite carrying a score — the degraded condition `ReportDegradation`
+ * records.
+ */
+export declare const DEGRADED_ENRICHMENT_FIELDS: readonly ["documentManifest", "failureModes", "lowScoringJudgments", "recommendations", "testResults"];
+export type DegradedEnrichmentField = (typeof DEGRADED_ENRICHMENT_FIELDS)[number];
+/**
+ * Marks a published report as degraded: the eval ran and scored tests, but
+ * one or more enrichment surfaces never landed (e.g. gap-analysis skipped
+ * because `grader-judgments.json` was missing). Present so the dashboard and
+ * Studio can show "enrichment failed" rather than a misleading empty
+ * "no tests" state on a report that still has a score.
+ */
+export interface ReportDegradation {
+    /** Why the report is degraded. Single-variant union, widen as needed. */
+    reason: "enrichment-missing";
+    /** Enrichment surfaces absent on this report despite a full eval. */
+    missing: DegradedEnrichmentField[];
+    /** Human-readable explanation for dashboard / Studio empty-state copy. */
+    detail: string;
+}
 /** A published evaluation report — the atomic unit of the report store */
 export interface Report {
+    /**
+     * Set when the report is published in a degraded state — a full eval
+     * scored tests but enrichment did not complete. Absent on healthy reports.
+     */
+    degraded?: ReportDegradation;
     /**
      * Snapshot of the run manifest's `artifacts` slice at publish time (D0032).
      * The source of truth lives in `gs://…/runs/{runId}/manifest.json`; this

package/dist/_vendor/ailf-core/types/index.js CHANGED Viewed

@@ -42,3 +42,16 @@ export function isLegacyFailureMode(mode) {
  * that imports it from @sanity/ailf-core.
  */
 export { NOISE_THRESHOLD as DEFAULT_NOISE_THRESHOLD } from "../../ailf-shared/index.js";
+/**
+ * Enrichment surfaces that gap-analysis writes onto a report. When a full
+ * eval scores tests but these are absent, the report renders as "no tests"
+ * despite carrying a score — the degraded condition `ReportDegradation`
+ * records.
+ */
+export const DEGRADED_ENRICHMENT_FIELDS = [
+    "documentManifest",
+    "failureModes",
+    "lowScoringJudgments",
+    "recommendations",
+    "testResults",
+];

package/dist/_vendor/ailf-core/types/user.d.ts ADDED Viewed

@@ -0,0 +1,49 @@
+/**
+ * A Sanity reference to an `ailf.team` document.
+ *
+ * Members of an array (`AilfUser.teams[]`) carry a `_key`; the single-valued
+ * `preferences.primaryTeam` does not. The team slug downstream consumers need
+ * is a derived, read-time value from a GROQ projection — never stored here.
+ */
+export interface TeamReference {
+    _type: "reference";
+    _ref: string;
+    _key?: string;
+}
+/**
+ * Per-user UI preferences. Room to grow (default view, density, …) — kept
+ * minimal for v0 (YAGNI).
+ */
+export interface AilfUserPreferences {
+    /**
+     * Reference to the user's default team — one of `AilfUser.teams[]`. Distinct
+     * from `teams[]` so "which team's view do I default to" can differ from "all
+     * teams I affiliate with". The slug is derived in GROQ at read time.
+     */
+    primaryTeam?: TeamReference;
+}
+/**
+ * Per-account user document — one per Sanity account, keyed by a deterministic
+ * `_id` of `ailf.user.${sanityUserId}`. Stores self-declared team affiliation
+ * (references to `ailf.team`) plus UI preferences, and is the primary source
+ * for dashboard personalization. Stores minimal PII: `sanityUserId`, `email`,
+ * and `displayName` only.
+ *
+ * @see docs/design-docs/user-settings.md
+ */
+export interface AilfUser {
+    /** Deterministic: `ailf.user.${sanityUserId}`. */
+    _id: string;
+    _type: "ailf.user";
+    /** `CurrentUser.id` — the stable, globally-unique key, mirrored for GROQ. */
+    sanityUserId: string;
+    /** Denormalized for display / joins (lowercased at write time). */
+    email: string;
+    /** `CurrentUser.name` snapshot. */
+    displayName?: string;
+    /** Self-declared affiliation — drives personalization only. */
+    teams: TeamReference[];
+    preferences: AilfUserPreferences;
+    /** ISO 8601 UTC — stamped on each save. */
+    updatedAt: string;
+}

package/dist/_vendor/ailf-core/types/user.js ADDED Viewed

@@ -0,0 +1 @@
+export {};

package/dist/_vendor/ailf-shared/generated/help-content.js CHANGED Viewed

@@ -44,6 +44,17 @@ export const HELP_TOPICS = [
             "scoring-model"
         ]
     },
+    {
+        "id": "failure-modes",
+        "title": "Failure Modes",
+        "body": "## What this view is for\n\nThe Recommendations view tells you which fixes to make. This view tells you what\nkind of problem you have. It groups the run's weaknesses by the documentation\nissue behind them, so you can see patterns across the whole evaluation rather\nthan one fix at a time. If most of your weak spots are the same kind of problem,\nthat is a signal about how to spend your docs effort.\n\n## What you are looking at\n\nRecent reports show **interpretive cards** drawn from the run's diagnosis:\n\n- **Weakest area** names the single feature area dragging the score down most,\n  the failure mode behind it, and a confidence level with the sample size, so\n  you know how strong the signal is.\n- **Failure mode** highlights one category of problem, which scoring dimension\n  it shows up in, and how often it occurred across the tests that were checked.\n- **Area summary** gives a plain-language read on how an area is doing and why.\n\nOlder reports show a **category breakdown** instead. Each failure category is a\nchip with a count. Selecting a chip lists the gaps in that category, and each\ngap shows an estimated score lift if fixed, a confidence level, a short\nremediation note, and the specific tasks that exposed it. You can click a task\nto jump to it.\n\n## The failure modes\n\nEach weakness is sorted into one of these categories. The category is the\nfastest way to know what kind of work the fix needs:\n\n- **Missing docs**: the doc the model needed does not exist or is not indexed.\n  The fix is to write new documentation.\n- **Incorrect docs**: a doc has a factual error or a wrong example. The fix is\n  to correct it.\n- **Outdated docs**: a doc exists but reflects a previous API surface. The fix\n  is to bring it up to date.\n- **Poor structure**: the information is correct but hard for an agent to find\n  or skim. 
The fix is to reorganize or clarify.\n- **Model limitation**: the model struggles even with correct docs available.\n  This is not a documentation problem, so treat it as context rather than a\n  to-do.\n- **Unclassified**: the run could not categorize the weakness. Use the linked\n  tasks and the grader's notes to judge it yourself.\n\nDepending on the evaluation mode you may see additional categories, including\nones specific to agent behavior such as tool misuse or missing error handling.\n\n## How to use it\n\nStart with the category that has the most gaps or the highest combined lift. The\ncategory tells you the shape of the work before you open a single page: write,\ncorrect, update, or restructure. Categories that are not documentation problems,\nsuch as model limitation, are worth noting but are not yours to fix in the docs.\n\n## Related views\n\n- **Recommendations** turns these weaknesses into a ranked list of specific\n  edits.\n- **Low-scoring judgments** shows the grader's raw notes on the tests that\n  scored lowest, which is the most granular signal behind any failure mode.\n\n## When this view is empty\n\nIf a report shows no failure modes, the evaluation either classified nothing\nworth flagging or the run predates this view. A clean result here usually means\nthe docs held up across the evaluated tasks.",
+        "source": "docs/help/failure-modes.md",
+        "related": [
+            "recommendations",
+            "scoring-model",
+            "negative-doc-lift"
+        ]
+    },
     {
         "id": "getting-started",
         "title": "Getting Started",
@@ -57,11 +68,12 @@ export const HELP_TOPICS = [
     {
         "id": "interpreting-diagnostics",
         "title": "Interpreting Diagnostics",
-        "body": "## The diagnostics tab\n\nWhen you open a report and click the **Diagnostics** tab, you see a health\nsummary of your documentation across all feature areas. This is the most\nactionable view in the dashboard — it tells you exactly where to focus your doc\nimprovement efforts.\n\n## Health categories\n\nFeature areas are grouped into three health bands:\n\n- **Strong (80+)** — Docs are working well. AI agents produce correct, complete\n  implementations. No action needed unless you see regression.\n- **Needs Attention (70–79)** — Docs are okay but have gaps. There may be\n  specific dimensions (like code correctness or doc coverage) dragging the score\n  down. Worth investigating.\n- **Weak (below 70)** — Docs are not providing enough support. AI agents\n  consistently struggle with these features. These need priority attention.\n\n## Strengths vs. Issues\n\nThe diagnostics tab has two sub-views:\n\n**Strengths** highlights what's working: high-scoring areas, strong dimensions,\nand areas where agents successfully find and use your docs. 
Use this to\nunderstand what good looks like in your docs — and replicate it elsewhere.\n\n**Issues** lists the problems: weak areas, dimensions scoring below 50, negative\ndoc lift, retrieval problems, and (if gap analysis was run) specific\nrecommendations with estimated score lift.\n\n## Key diagnostic signals\n\n| Signal                         | What it means                              | What to do                               |\n| ------------------------------ | ------------------------------------------ | ---------------------------------------- |\n| **Negative doc lift**          | Docs are worse than no docs                | Rewrite or remove the offending docs     |\n| **Large retrieval gap**        | Good docs exist but agents can't find them | Improve page titles, metadata, SEO       |\n| **Low code correctness**       | Agents find the docs but produce bad code  | Add or fix code examples                 |\n| **Low doc coverage**           | The docs don't cover what the task needs   | Write new documentation                  |\n| **Efficiency anomaly (>100%)** | Agents do better without gold docs         | Injected docs may be confusing the model |",
+        "body": "## Reading the health of your docs\n\nA report scores each feature area on how well your documentation lets AI coding\ntools implement that feature. Reading those scores well is what turns a number\ninto a plan: it tells you where the docs are working, where they are not, and\nwhat kind of problem you are dealing with.\n\n## Health bands\n\nEach area's score falls into one of three bands:\n\n- **Strong (80 and above)**: docs are working well. Agents produce correct,\n  complete implementations. No action needed unless you see a regression.\n- **Needs attention (70 to 79)**: docs are okay but have gaps. A specific\n  dimension such as code correctness or doc coverage may be dragging the score\n  down. Worth investigating.\n- **Weak (below 70)**: docs are not providing enough support. Agents\n  consistently struggle with these features. These need priority attention.\n\n## Strong areas are signal too\n\nIt is easy to focus only on what is broken, but the strong areas are worth\nreading. They show what good looks like in your docs: clear structure, accurate\nexamples, the patterns agents can follow. When you fix a weak area, that is the\nbar to copy.\n\n## Key diagnostic signals\n\nA low score has a reason behind it. 
These signals tell you which reason, and\nwhat to do about it:\n\n| Signal                         | What it means                               | What to do                               |\n| ------------------------------ | ------------------------------------------- | ---------------------------------------- |\n| **Negative doc lift**          | Docs are worse than no docs                 | Rewrite or remove the offending docs     |\n| **Large retrieval gap**        | Good docs exist but agents cannot find them | Improve page titles, metadata, structure |\n| **Low code correctness**       | Agents find the docs but produce bad code   | Add or fix code examples                 |\n| **Low doc coverage**           | The docs do not cover what the task needs   | Write new documentation                  |\n| **Efficiency anomaly (>100%)** | Agents do better without the docs           | Injected docs may be confusing the model |\n\n## Where to go next\n\nWhen you know which areas are weak and why, the **Recommendations** view turns\nthat into a ranked list of specific edits, and the **Failure modes** view groups\nthe weaknesses by the kind of documentation problem behind them.",
         "source": "docs/help/interpreting-diagnostics.md",
         "related": [
-            "scoring-model",
-            "weaknesses-recommendations"
+            "recommendations",
+            "failure-modes",
+            "scoring-model"
         ]
     },
     {
@@ -74,6 +86,17 @@ export const HELP_TOPICS = [
             "comparing-runs"
         ]
     },
+    {
+        "id": "recommendations",
+        "title": "Recommendations",
+        "body": "## What this view is for\n\nThis is the \"what do I fix\" view. The scores tell you how well your\ndocumentation supports AI coding tools. This view turns those scores into a\nranked list of specific changes, so you can spend your time on the edits that\nshould move the score the most.\n\nEverything here comes from the same evaluation run you are looking at, and it\npoints at your own documentation pages rather than giving generic advice.\n\n## What you are looking at\n\nRecent reports show a set of **diagnosis cards**. Each card answers one question\nabout the run.\n\n**Top recommendations** is the main card. It opens with a short summary, then\nlists a few suggested changes ranked by priority. Each suggestion has:\n\n- A **priority** tag of high, medium, or low that tells you what to do first.\n- A **title** that names the change in one line.\n- A **description** of the specific fix, usually quoting the exact symbol,\n  query, or pattern involved.\n- A **doc reference** showing which page, and the section when it is known, the\n  change applies to. Every reference points to a real page that was part of this\n  run, so you can open it and start editing.\n\nYou may also see supporting cards:\n\n- **Doc attribution spotlight** shows which documentation pages most influenced\n  the results, and whether each one helped or hurt. Use it to confirm a\n  recommendation is pointing at the right page.\n- **Low-confidence attribution** lists results where the link between a doc and\n  an outcome was uncertain. Treat anything flagged here as a lead to verify, not\n  a settled conclusion.\n- **Regression vs baseline** appears when you are comparing against an earlier\n  run. It shows which areas moved up or down and the likely reason for each\n  change.\n\n## How to use it\n\nWork top down. Start with the high-priority suggestions, open the referenced\npage, and make the change. 
Priority reflects how much each change is expected to\nhelp, so the top of the list is usually where your effort goes furthest.\n\nThe recommendations are written by a model that reads this run's results. They\nare grounded in your actual docs and cannot reference a page that was not in the\nrun, but they are still suggestions. Read the linked page before acting, and use\nthe confidence signals to decide how much to trust each item.\n\n## Where this comes from\n\nA recommendation is the end of a chain: a test scored low, the grader said why,\nthe run classified that into a failure mode, and this view proposes the edit. If\nyou want to see the failure modes themselves, grouped by category, open the\n**Failure modes** view. If you want the grader's raw notes on the lowest scores,\nopen the **Low-scoring judgments** view.\n\n## Older reports\n\nReports created before the diagnosis cards shipped show a simpler list instead.\nEach row names a feature area, the failure mode behind it, an estimated score\nlift if you fix it, a confidence level, and the tasks that exposed the gap. The\nestimated lift is conservative. It assumes fixing the gap raises the weak\ndimension only to the median of the others, so the real improvement can be\nhigher.\n\n## When this view is empty\n\nIf a report shows no recommendations, the evaluation either ran and found\nnothing worth flagging, or the run predates this feature. A score with no\nrecommendations is usually a good sign, because it means the docs held up across\nthe evaluated tasks.",
+        "source": "docs/help/recommendations.md",
+        "related": [
+            "failure-modes",
+            "interpreting-diagnostics",
+            "scoring-model"
+        ]
+    },
     {
         "id": "retrieval-gap",
         "title": "Retrieval Gap & Infrastructure Efficiency",
@@ -96,17 +119,6 @@ export const HELP_TOPICS = [
             "eval-modes"
         ]
     },
-    {
-        "id": "weaknesses-recommendations",
-        "title": "Weaknesses & Recommendations",
-        "body": "## Understanding weaknesses\n\nThe Issues sub-tab in Diagnostics lists every area or dimension that scored\nbelow threshold. Each weakness entry shows:\n\n- **The feature area** — Which product feature is affected (e.g., GROQ,\n  Functions, Webhooks).\n- **The bottleneck dimension** — Which scoring dimension is dragging the area\n  down: task completion, code correctness, or doc coverage.\n- **The score** — How far below threshold the dimension scored.\n\n## Gap analysis recommendations\n\nWhen an evaluation runs with gap analysis enabled, the dashboard shows\n**prioritized recommendations** — specific actions ranked by estimated impact.\n\nEach recommendation includes:\n\n- **Failure mode** — The type of doc problem identified:\n  - `missing-docs` — The functionality isn't documented at all.\n  - `incorrect-docs` — The docs contain factual errors.\n  - `outdated-docs` — The docs describe an old API version or pattern.\n  - `poor-structure` — The docs exist but are hard to find or understand.\n- **Estimated lift** — How many score points fixing this gap would add. Based on\n  raising the bottleneck dimension to the median of non-bottleneck dimensions.\n  Conservative estimate — actual improvement may be higher.\n- **Confidence** — How sure the analysis is about this diagnosis (high, medium,\n  or low).\n- **Affected tasks** — Which specific evaluation tasks exposed this gap.\n\n## Diagnosis cards\n\nEvery published report now carries a **diagnosis artifact** — a set of cards\nproduced by the post-pipeline hook (`ailf interpret`). The Studio diagnosis\npanel renders these cards directly; the dashboard's Recommendations and\nFailure-modes panels migrate to the same source in a follow-up.\n\nThe hook runs by default for every pipeline invocation. 
To opt out for a single\nrun, pass `--no-summary`; to opt out in CI, set `AILF_INTERPRET_ON_RUN=0` in the\nworkflow env block; to opt out project-wide, set `summary.onRun: never` in\n`.ailf/config.yaml`.\n\n## Low-scoring judgments\n\nBelow the recommendations, you'll find the **grader's explanations** for tests\nthat scored below 70. These are the raw assessments from the grading model\nexplaining exactly what went wrong — missing API calls, incorrect patterns,\nhallucinated features, etc.\n\nEach judgment shows the task, the dimension, the score, and the grader's natural\nlanguage reason. These are the most granular diagnostic signal available and\noften point directly to the doc section that needs fixing.",
-        "source": "docs/help/weaknesses-recommendations.md",
-        "related": [
-            "interpreting-diagnostics",
-            "scoring-model",
-            "negative-doc-lift"
-        ]
-    },
     {
         "id": "how-agents-work",
         "title": "How AI Agents Find Documentation",

package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts CHANGED Viewed

@@ -76,9 +76,9 @@ export declare const GraderJudgmentSchema: z.ZodObject<{
         documentId: z.ZodString;
         slug: z.ZodOptional<z.ZodString>;
         role: z.ZodEnum<{
+            missing: "missing";
             supports: "supports";
             contradicts: "contradicts";
-            missing: "missing";
             irrelevant: "irrelevant";
         }>;
         hallucinated: z.ZodOptional<z.ZodBoolean>;
@@ -145,9 +145,9 @@ export declare const GraderEmittedJudgmentSchema: z.ZodObject<{
         documentId: z.ZodString;
         slug: z.ZodOptional<z.ZodString>;
         role: z.ZodEnum<{
+            missing: "missing";
             supports: "supports";
             contradicts: "contradicts";
-            missing: "missing";
             irrelevant: "irrelevant";
         }>;
         hallucinated: z.ZodOptional<z.ZodBoolean>;

package/dist/index.d.ts CHANGED Viewed

@@ -17,7 +17,11 @@
  *   area: "groq",
  *   prompt: { text: "Write GROQ queries..." },
  *   assertions: [
- *     { type: "llm-rubric", template: "task-completion", criteria: ["..."] },
+ *     {
+ *       type: "llm-rubric",
+ *       template: "task-completion",
+ *       criteria: [{ id: "uses-projection", text: "Uses a projection" }],
+ *     },
  *   ],
  * })
  * ```

package/dist/index.js CHANGED Viewed

@@ -17,7 +17,11 @@
  *   area: "groq",
  *   prompt: { text: "Write GROQ queries..." },
  *   assertions: [
- *     { type: "llm-rubric", template: "task-completion", criteria: ["..."] },
+ *     {
+ *       type: "llm-rubric",
+ *       template: "task-completion",
+ *       criteria: [{ id: "uses-projection", text: "Uses a projection" }],
+ *     },
  *   ],
  * })
  * ```

package/dist/orchestration/steps/compute-attribution-step.d.ts CHANGED Viewed

@@ -35,10 +35,10 @@
  * @see docs/decisions/D0050-per-entry-attribution-layout.md
  * @see docs/decisions/D0052-judgment-ref-granularity.md
  */
-import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
+import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
 export declare class ComputeAttributionStep implements PipelineStep {
     readonly name = "compute-attribution";
     readonly optional = true;
     check(ctx: AppContext): ValidationIssue[];
-    execute(ctx: AppContext, _state?: unknown): Promise<StepResult>;
+    execute(ctx: AppContext, state?: PipelineState): Promise<StepResult>;
 }

package/dist/orchestration/steps/compute-attribution-step.js CHANGED Viewed

@@ -40,6 +40,7 @@ import { resolve } from "node:path";
 import { isSlugRef } from "../../_vendor/ailf-core/index.js";
 import { calibrationSetVersion, embeddingModel, ensembleVersion, } from "../../pipeline/attribution.js";
 import { V0_WEIGHTS, computeJudgmentAttribution, } from "../../pipeline/compute-attribution.js";
+import { classifyEnrichmentInputs, degradedEnrichmentError, } from "../../pipeline/enrichment-preconditions.js";
 // ---------------------------------------------------------------------------
 // Step implementation
 // ---------------------------------------------------------------------------
@@ -79,12 +80,26 @@ export class ComputeAttributionStep {
         }
         return issues;
     }
-    async execute(ctx, _state) {
+    async execute(ctx, state) {
         const start = Date.now();
         const root = ctx.config.rootDir;
         const judgmentsPath = resolve(root, "results", "latest", "grader-judgments.json");
         const summaryPath = resolve(root, "results", "latest", "score-summary.json");
-        if (!existsSync(judgmentsPath)) {
+        // Mirror gap-analysis: a full eval that scored tests but persisted no
+        // grader judgments is a degraded run, not a benign skip. Fail loud so the
+        // outcome surfaces in pipeline-result and on the job document. A remote
+        // cache hit restores score-summary.json without grader-judgments.json, so
+        // its missing judgments are legitimate — never fail loud on a cache hit.
+        const fromRemoteCache = (state?.remoteCacheHits?.size ?? 0) > 0;
+        const inputs = classifyEnrichmentInputs(root);
+        if (inputs.kind === "judgments-missing-after-eval" && !fromRemoteCache) {
+            return {
+                durationMs: Date.now() - start,
+                status: "failed",
+                error: degradedEnrichmentError("compute-attribution", inputs.scoredTestCount),
+            };
+        }
+        if (inputs.kind !== "ready") {
             return { status: "skipped", reason: "No grader-judgments.json" };
         }
         if (!existsSync(summaryPath)) {

package/dist/orchestration/steps/gap-analysis-step.d.ts CHANGED Viewed

@@ -14,10 +14,10 @@
  *
  * This is an optional step — failure doesn't stop the pipeline.
  */
-import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
+import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
 export declare class GapAnalysisStep implements PipelineStep {
     readonly name = "gap-analysis";
     readonly optional = true;
     check(ctx: AppContext): ValidationIssue[];
-    execute(ctx: AppContext): Promise<StepResult>;
+    execute(ctx: AppContext, state?: PipelineState): Promise<StepResult>;
 }

package/dist/orchestration/steps/gap-analysis-step.js CHANGED Viewed

@@ -18,6 +18,7 @@ import { existsSync, mkdirSync, readFileSync, renameSync, writeFileSync, } from
 import { join, resolve } from "path";
 import { assoc, isSlugRef } from "../../_vendor/ailf-core/index.js";
 import { emitFileContents } from "../../artifact-capture/emit-file.js";
+import { classifyEnrichmentInputs, degradedEnrichmentError, } from "../../pipeline/enrichment-preconditions.js";
 export class GapAnalysisStep {
     name = "gap-analysis";
     optional = true;
@@ -34,12 +35,29 @@ export class GapAnalysisStep {
         }
         return [];
     }
-    async execute(ctx) {
+    async execute(ctx, state) {
         const root = ctx.config.rootDir;
         const start = Date.now();
         const judgmentsPath = resolve(root, "results", "latest", "grader-judgments.json");
         const scoreSummaryPath = resolve(root, "results", "latest", "score-summary.json");
-        if (!existsSync(judgmentsPath)) {
+        // Distinguish a legitimate skip (no graded eval ran this pipeline) from a
+        // degraded run where a full eval scored tests but no judgments persisted.
+        // The latter must fail loud — returning a benign `skipped` is what let
+        // reports publish with a score but no test details.
+        //
+        // A remote cache hit restores score-summary.json (with testCount) from a
+        // prior report but never writes grader-judgments.json, so judgments are
+        // legitimately absent — that is a benign skip, not a degraded full eval.
+        const fromRemoteCache = (state?.remoteCacheHits?.size ?? 0) > 0;
+        const inputs = classifyEnrichmentInputs(root);
+        if (inputs.kind === "judgments-missing-after-eval" && !fromRemoteCache) {
+            return {
+                durationMs: Date.now() - start,
+                status: "failed",
+                error: degradedEnrichmentError("gap-analysis", inputs.scoredTestCount),
+            };
+        }
+        if (inputs.kind !== "ready") {
             return {
                 status: "skipped",
                 reason: "No grader-judgments.json — run a full evaluation first",

package/dist/orchestration/steps/publish-report-step.d.ts CHANGED Viewed

@@ -10,7 +10,7 @@
  * - P5: Local-first (pipeline never fails because of a store write)
  * - P6: Sinks are fire-and-forget (failures logged, not thrown)
  */
-import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type ReportAutoScope, type ScoreSummary, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
+import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type ReportAutoScope, type ReportDegradation, type ScoreSummary, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
 import { type ProvenanceInput } from "../../pipeline/provenance.js";
 export declare class PublishReportStep implements PipelineStep {
     private readonly pipelineStart;
@@ -25,6 +25,20 @@ export declare class PublishReportStep implements PipelineStep {
     check(): ValidationIssue[];
     execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
 }
+/**
+ * Detect whether a report should publish as degraded.
+ *
+ * The symptom is a scored run whose per-test details never landed: a full
+ * eval counted tests (`scores[].testCount > 0`) but `summary.testResults` is
+ * absent or empty because gap-analysis skipped or failed. Such a report
+ * renders an empty "no tests" state in Studio despite carrying a score.
+ * Returns the marker enumerating which enrichment surfaces are missing, or
+ * `undefined` for a healthy report (or a run with no scored tests, where an
+ * empty report is legitimate).
+ *
+ * Exported for unit testing — production callers reach it via execute().
+ */
+export declare function detectReportDegradation(summary: ScoreSummary): ReportDegradation | undefined;
 /**
  * Assemble provenance input from the score summary and pipeline context.
  *

package/dist/orchestration/steps/publish-report-step.js CHANGED Viewed

@@ -110,9 +110,15 @@ export class PublishReportStep {
         // agentBehavior arrays) point at their external artifacts via
         // `id = manifestEntryKey`; Studio hydrates on drill-down.
         const slimSummary = buildSlimReportSummary(summary, ctx.config.mode);
+        // Degraded-report detection (the "no tests on a scored report" symptom):
+        // a full eval scored tests but the gap-analysis enrichment never landed.
+        // Computed from the full summary read above — independent of which
+        // upstream step skipped — so the marker fires regardless of the cause.
+        const degraded = detectReportDegradation(summary);
         const report = {
             comparison: comparison ?? undefined,
             completedAt: now,
+            ...(degraded ? { degraded } : {}),
             durationMs,
             id: reportId,
             provenance,
@@ -192,6 +198,45 @@ export class PublishReportStep {
 // ---------------------------------------------------------------------------
 // Helpers
 // ---------------------------------------------------------------------------
+/**
+ * Detect whether a report should publish as degraded.
+ *
+ * The symptom is a scored run whose per-test details never landed: a full
+ * eval counted tests (`scores[].testCount > 0`) but `summary.testResults` is
+ * absent or empty because gap-analysis skipped or failed. Such a report
+ * renders an empty "no tests" state in Studio despite carrying a score.
+ * Returns the marker enumerating which enrichment surfaces are missing, or
+ * `undefined` for a healthy report (or a run with no scored tests, where an
+ * empty report is legitimate).
+ *
+ * Exported for unit testing — production callers reach it via execute().
+ */
+export function detectReportDegradation(summary) {
+    const scoredTestCount = (summary.scores ?? []).reduce((n, s) => n + (typeof s.testCount === "number" ? s.testCount : 0), 0);
+    const hasTestResults = (summary.testResults?.length ?? 0) > 0;
+    if (scoredTestCount === 0 || hasTestResults)
+        return undefined;
+    // `testResults` is the load-bearing signal (its absence is the rendered
+    // "no tests" symptom). The remaining fields are best-effort detail: some
+    // are literacy-only (e.g. documentManifest), so they may appear here for a
+    // degraded non-literacy run even though that mode never produces them.
+    const missing = ["testResults"];
+    if (!summary.failureModes)
+        missing.push("failureModes");
+    if (!summary.lowScoringJudgments?.length)
+        missing.push("lowScoringJudgments");
+    if (!summary.documentManifest?.length)
+        missing.push("documentManifest");
+    if (!summary.recommendations)
+        missing.push("recommendations");
+    return {
+        reason: "enrichment-missing",
+        missing,
+        detail: `Evaluation scored ${scoredTestCount} test(s) but enrichment did not ` +
+            `complete; per-test details and failure analysis are unavailable for ` +
+            `this report.`,
+    };
+}
 /**
  * Assemble provenance input from the score summary and pipeline context.
  *

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -41,6 +41,7 @@ import { resolveProfile } from "./profile-resolution.js";
 import { loadSource } from "../sources.js";
 import { LiteracyVariant } from "./normalize-mode.js";
 import { scoreTestGroup, } from "./compiler/scoring-bridge.js";
+import { extractGraderJudgmentsResilient, } from "./extract-grader-judgments-resilient.js";
 // Re-export from core for backward compatibility.
 // Existing imports from this file continue to work unchanged.
 export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
@@ -321,6 +322,54 @@ export function extractGraderJudgments(resultsPath, telemetry) {
     }
     return judgments;
 }
+/**
+ * Light parse of a results file's entry count — diagnostics only. Avoids the
+ * full normalize + debug logging of `readAndNormalizeResults`. Returns 0 when
+ * the file is missing or unparseable.
+ */
+function countResultEntries(resultsPath) {
+    try {
+        const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
+        const wrapper = file.results ?? file;
+        return Array.isArray(wrapper.results) ? wrapper.results.length : 0;
+    }
+    catch {
+        return 0;
+    }
+}
+/**
+ * Count classifiable llm-rubric components in a results file — i.e. the number
+ * of judgments a healthy `extractGraderJudgments` should produce. Used only to
+ * set the severity of a persistent-empty extraction: a file with classifiable
+ * components but 0 extracted judgments is an error; a file with none (all
+ * api-errors / no llm-rubric) is a benign empty.
+ *
+ * Deliberately an independent count path (not `extractGraderJudgments`) so the
+ * cross-check is meaningful. Returns 0 when the file is missing or unparseable.
+ */
+function countClassifiableRubricComponents(resultsPath) {
+    if (!existsSync(resultsPath))
+        return 0;
+    let n = 0;
+    for (const result of readAndNormalizeResults(resultsPath)) {
+        for (const comp of result.gradingResult.componentResults) {
+            if (comp.assertion?.type === "llm-rubric" && classifyRubric(comp)) {
+                n += 1;
+            }
+        }
+    }
+    return n;
+}
+/**
+ * Shared dependency bundle for `extractGraderJudgmentsResilient` — wires the
+ * real extractor + fs counters. Defined once so all persist sites self-heal
+ * identically.
+ */
+const resilientJudgmentDeps = {
+    countClassifiable: countClassifiableRubricComponents,
+    countResults: countResultEntries,
+    extract: extractGraderJudgments,
+};
 /**
  * Stamp every grader judgment with a D0049 ceiling-cross-check confidence
  * triple and increment `GraderReliability.failureModeCalibration` whenever
@@ -1494,7 +1543,7 @@ export async function calculateAndWriteScores(options) {
         writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
         log.info("Score summary written to results/latest/score-summary.json");
         // Extract and persist grader judgments
-        const judgments = extractGraderJudgments(baselineResultsPath);
+        const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
         const borderlineConsistency = await runBorderlinePass(judgments, [
             baselineResultsPath,
         ]);
@@ -1557,7 +1606,7 @@ export async function calculateAndWriteScores(options) {
         mkdirSync(outDir, { recursive: true });
         writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
         log.info("Score summary written to results/latest/score-summary.json");
-        const judgments = extractGraderJudgments(baselineResultsPath);
+        const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
         const borderlineConsistency = await runBorderlinePass(judgments, [
             baselineResultsPath,
         ]);
@@ -1687,18 +1736,14 @@ export async function calculateAndWriteScores(options) {
     // the ceiling-cross-check disagreement counter (`failureModeCalibration`)
     // is incremented during the post-extraction validation pass below.
     const reliability = { graderModel: "unknown" };
-    const judgments = extractGraderJudgments(baselineResultsPath, {
-        reliability,
-        ...(options.runId ? { runId: options.runId } : {}),
-    });
-    // In full mode, also extract judgments from agentic results
-    if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
-        const agenticJudgments = extractGraderJudgments(agenticResultsPath, {
-            reliability,
-            ...(options.runId ? { runId: options.runId } : {}),
-        });
-        judgments.push(...agenticJudgments);
-    }
+    // Extract through the resilient wrapper so an empty result from the transient
+    // read anomaly is instrumented and self-healed rather than silently skipping
+    // the grader-judgments persist. In full mode both the baseline and agentic
+    // result files are graded against the shared telemetry.
+    const judgmentResultPaths = mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)
+        ? [baselineResultsPath, agenticResultsPath]
+        : [baselineResultsPath];
+    const judgments = await extractGraderJudgmentsResilient(judgmentResultPaths, { reliability, ...(options.runId ? { runId: options.runId } : {}) }, log, { deps: resilientJudgmentDeps });
     // Borderline-consensus pass — re-grade the ±5 borderline subset N times
     // and merge medians back into the canonical judgments BEFORE
     // `validateGraderJudgmentsCalibration` runs, so the calibration counter

package/dist/pipeline/compiler/assertion-mapper.js CHANGED Viewed

@@ -153,9 +153,13 @@ function mapTemplatedAssertion(assertion, options) {
     const result = {
         type: "llm-rubric",
         // The rubric prompt will be fully assembled by the PromptfooCompiler
-        // using rubric templates. Here we pass the template ref + criteria
-        // as metadata so the compiler can resolve it.
-        value: `[template:${assertion.template}] ${assertion.criteria.join("; ")}`,
+        // using rubric templates. Here we pass the template ref + criteria as
+        // metadata so the compiler can resolve it. Criteria are either legacy
+        // bare strings or canonical `{ id, text }` objects — render the text of
+        // each (interpolating an object directly would emit "[object Object]").
+        value: `[template:${assertion.template}] ${assertion.criteria
+            .map((c) => (typeof c === "string" ? c : c.text))
+            .join("; ")}`,
     };
     if (assertion.weight !== undefined) {
         result.weight = assertion.weight;

package/dist/pipeline/compiler/rubric-resolution.d.ts CHANGED Viewed

@@ -13,7 +13,7 @@
  * @see docs/design-docs/two-stage-grader-symbol-preflight.md — W0198 Phase 6
  * @see config/rubrics.ts — template definitions
  */
-import type { PreflightRubricContext } from "../../_vendor/ailf-core/index.d.ts";
+import type { CriterionRef, PreflightRubricContext } from "../../_vendor/ailf-core/index.d.ts";
 import type { PromptfooAssertion } from "./assertion-mapper.js";
 export type { PreflightRubricContext } from "../../_vendor/ailf-core/index.d.ts";
 /** Minimal rubric config needed for template resolution */
@@ -59,7 +59,7 @@ export interface RubricResolutionInput {
  * Returns null (with a warning) if the template can't be resolved.
  */
 export declare function resolveTemplatedAssertion(assertion: {
-    criteria: string[];
+    criteria: (string | CriterionRef)[];
     template: string;
     type: string;
 }, rubricConfig: RubricResolutionInput | undefined, graderProvider: string | undefined, warnings: string[], canonicalReference?: string, preflightContext?: PreflightRubricContext): PromptfooAssertion | null;

package/dist/pipeline/compiler/rubric-resolution.js CHANGED Viewed

@@ -45,7 +45,9 @@ export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvide
         return null;
     }
     const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
-    const criteriaText = assertion.criteria.map((c) => `- ${c}`).join("\n");
+    const criteriaText = assertion.criteria
+        .map((c) => renderCriterion(c, assertion.template))
+        .join("\n");
     // W0198 Phase 6 — when the deterministic preflight lane is wired and this
     // rubric scores `code-correctness`, prefix a system instruction so the
     // grader does not re-judge symbol existence. The lane separation is the
@@ -79,6 +81,28 @@ export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvide
             : {}),
     };
 }
+/**
+ * Render a single rubric criterion to its bullet line.
+ *
+ * Criteria come in two shapes: the canonical `CriterionRef` object
+ * (`{ id, text }`, used by Content Lake / API tasks) and a legacy bare
+ * string (still used by many repo `defineTask` task files). Both render to
+ * their text. Interpolating a `CriterionRef` object directly (`- ${c}`)
+ * silently renders `- [object Object]`, stripping the grader's acceptance
+ * criteria and degrading grade quality — that is the bug this guards. A
+ * criterion that yields no renderable text (e.g. an object missing `text`)
+ * fails loud at compile time rather than reaching the grader.
+ */
+function renderCriterion(criterion, template) {
+    const text = typeof criterion === "string" ? criterion : criterion?.text;
+    if (typeof text !== "string" || text.trim() === "") {
+        throw new Error(`Rubric template "${template}" has a criterion with no renderable text ` +
+            `(received: ${JSON.stringify(criterion)?.slice(0, 160)}). Each criterion ` +
+            `must be a non-empty string or a { id, text } object; an object without ` +
+            `text renders "[object Object]" in the grader prompt.`);
+    }
+    return `- ${text}`;
+}
 /**
  * Build the W0198 Phase 6 preflight preface for a `code-correctness`
  * rubric. Returned with a trailing newline so it composes cleanly with

package/dist/pipeline/enrichment-preconditions.d.ts ADDED Viewed

@@ -0,0 +1,52 @@
+/**
+ * pipeline/enrichment-preconditions.ts
+ *
+ * Classifies the inputs the post-scoring enrichment steps (gap-analysis,
+ * compute-attribution) depend on, so a missing `grader-judgments.json` can be
+ * told apart as either a legitimate skip (no graded eval ran this pipeline) or
+ * a degraded outcome (a full eval scored tests but no judgments persisted).
+ *
+ * The degraded case is the failure these steps must stop swallowing:
+ * `calculate-scores` wrote `score-summary.json` with per-area `testCount > 0`
+ * but never wrote `grader-judgments.json`, so the enrichment steps self-skip
+ * and the report ships with no test details while still showing a score.
+ * Distinguishing the two is what lets the steps fail loud instead of returning
+ * a benign `skipped`.
+ */
+/**
+ * Outcome of classifying the enrichment inputs under `results/latest/`.
+ *
+ * - `ready` — `grader-judgments.json` is present and non-empty; enrichment
+ *   can run.
+ * - `no-full-eval` — no graded eval produced judgments this run. A legitimate
+ *   skip: standalone gap-analysis on cached results, a non-graded run, or an
+ *   eval that scored nothing.
+ * - `judgments-missing-after-eval` — a full eval scored tests
+ *   (`score-summary.json` carries `testCount > 0`) yet `grader-judgments.json`
+ *   is missing or empty. This is the degraded condition the steps surface.
+ */
+export type EnrichmentInputs = {
+    kind: "ready";
+    judgmentCount: number;
+} | {
+    kind: "no-full-eval";
+} | {
+    kind: "judgments-missing-after-eval";
+    scoredTestCount: number;
+};
+/**
+ * Classify the enrichment inputs for a run by inspecting
+ * `results/latest/grader-judgments.json` and `score-summary.json`.
+ *
+ * Pure read-only filesystem inspection — never throws on malformed input; a
+ * file that does not parse to the expected shape is treated as absent, so
+ * "no usable judgments" and "no usable summary" each collapse to a zero
+ * count that the caller branches on.
+ */
+export declare function classifyEnrichmentInputs(rootDir: string): EnrichmentInputs;
+/**
+ * Build the fail-loud error message for the degraded
+ * `judgments-missing-after-eval` case. Shared by the enrichment steps so the
+ * pipeline-result and job-document surfaces carry one consistent wording.
+ */
+export declare function degradedEnrichmentError(step: string, scoredTestCount: number): string;

package/dist/pipeline/enrichment-preconditions.js ADDED Viewed

@@ -0,0 +1,84 @@
+/**
+ * pipeline/enrichment-preconditions.ts
+ *
+ * Classifies the inputs the post-scoring enrichment steps (gap-analysis,
+ * compute-attribution) depend on, so a missing `grader-judgments.json` can be
+ * told apart as either a legitimate skip (no graded eval ran this pipeline) or
+ * a degraded outcome (a full eval scored tests but no judgments persisted).
+ *
+ * The degraded case is the failure these steps must stop swallowing:
+ * `calculate-scores` wrote `score-summary.json` with per-area `testCount > 0`
+ * but never wrote `grader-judgments.json`, so the enrichment steps self-skip
+ * and the report ships with no test details while still showing a score.
+ * Distinguishing the two is what lets the steps fail loud instead of returning
+ * a benign `skipped`.
+ */
+import { existsSync, readFileSync } from "node:fs";
+import { resolve } from "node:path";
+/**
+ * Classify the enrichment inputs for a run by inspecting
+ * `results/latest/grader-judgments.json` and `score-summary.json`.
+ *
+ * Pure read-only filesystem inspection — never throws on malformed input; a
+ * file that does not parse to the expected shape is treated as absent, so
+ * "no usable judgments" and "no usable summary" each collapse to a zero
+ * count that the caller branches on.
+ */
+export function classifyEnrichmentInputs(rootDir) {
+    const judgmentCount = countGraderJudgments(rootDir);
+    if (judgmentCount > 0) {
+        return { kind: "ready", judgmentCount };
+    }
+    const scoredTestCount = scoredTestCountFromSummary(rootDir);
+    if (scoredTestCount > 0) {
+        return { kind: "judgments-missing-after-eval", scoredTestCount };
+    }
+    return { kind: "no-full-eval" };
+}
+/**
+ * Build the fail-loud error message for the degraded
+ * `judgments-missing-after-eval` case. Shared by the enrichment steps so the
+ * pipeline-result and job-document surfaces carry one consistent wording.
+ */
+export function degradedEnrichmentError(step, scoredTestCount) {
+    return (`${step}: grader-judgments.json missing after a full eval — ` +
+        `${scoredTestCount} test(s) scored but 0 grader judgments persisted. ` +
+        `The report is marked degraded rather than published as healthy.`);
+}
+/**
+ * Count the judgments in `grader-judgments.json`. Returns 0 when the file is
+ * absent, unreadable, not valid JSON, or not an array — every "no usable
+ * judgments" shape collapses to 0 so callers branch on a single number. An
+ * empty array is therefore indistinguishable from a missing file by design
+ * (both are "no judgments persisted").
+ */
+function countGraderJudgments(rootDir) {
+    const path = resolve(rootDir, "results", "latest", "grader-judgments.json");
+    if (!existsSync(path))
+        return 0;
+    try {
+        const parsed = JSON.parse(readFileSync(path, "utf-8"));
+        return Array.isArray(parsed) ? parsed.length : 0;
+    }
+    catch {
+        return 0;
+    }
+}
+/**
+ * Sum the per-area `testCount` from `score-summary.json` — the signal that a
+ * full eval scored tests this run. Returns 0 when the summary is absent,
+ * unreadable, or carries no scored tests.
+ */
+function scoredTestCountFromSummary(rootDir) {
+    const path = resolve(rootDir, "results", "latest", "score-summary.json");
+    if (!existsSync(path))
+        return 0;
+    try {
+        const parsed = JSON.parse(readFileSync(path, "utf-8"));
+        const scores = Array.isArray(parsed.scores) ? parsed.scores : [];
+        return scores.reduce((sum, s) => sum + (typeof s.testCount === "number" ? s.testCount : 0), 0);
+    }
+    catch {
+        return 0;
+    }
+}

package/dist/pipeline/extract-grader-judgments-resilient.d.ts ADDED Viewed

@@ -0,0 +1,88 @@
+/**
+ * pipeline/extract-grader-judgments-resilient.ts
+ *
+ * Resilient grader-judgment extraction for the `calculate-scores` persist
+ * junction.
+ *
+ * Background: `calculateAndWriteScores` extracts grader judgments from the
+ * eval results file(s), then writes `grader-judgments.json` only when the
+ * array is non-empty (the `judgments.length > 0` guard). A runtime anomaly was
+ * observed where `extractGraderJudgments` returned 0 judgments for a results
+ * file that demonstrably contained classifiable llm-rubric components — the
+ * same file, read tens of milliseconds later by `extractStoredTestResults`,
+ * yielded the full set (entries with populated dimensions). The empty array
+ * silently skipped the write, so gap-analysis and compute-attribution skipped
+ * and the report shipped with a score but no tests.
+ *
+ * The committed code reads the file via a pure `readFileSync` with identical
+ * classification on both sides, so the divergence is not reproducible from the
+ * source + captured artifacts — it is a transient read anomaly at the live
+ * junction. This wrapper does not pretend to know the mechanism; it makes the
+ * junction observable and recovers from the transient:
+ *
+ *  1. **Instruments** — logs the resolved path(s), file size/mtime, parsed
+ *     result count, and judgment count on every run (never silent on 0), so a
+ *     future empty-judgments persist is diagnosable from the run log alone.
+ *  2. **Self-heals** — when extraction yields 0 judgments but a results file
+ *     exists, it re-extracts with bounded retries. A later read that yields
+ *     judgments proves the initial 0 was transient; the recovered judgments
+ *     are returned. If every attempt yields 0, severity is decided by an
+ *     independent classifiable-component count: a genuinely judgment-free run
+ *     (all api-errors / no llm-rubric) logs a warning, while 0 judgments
+ *     against present classifiable components logs an error (the downstream
+ *     gap-analysis fail-loud guard is the backstop).
+ *
+ * The extractor and fs helpers are injected so the wrapper is unit-testable
+ * without importing the ~3000-line scoring module (which would be circular)
+ * or touching the real filesystem.
+ */
+import type { GraderJudgment, GraderReliability, Logger } from "../_vendor/ailf-core/index.d.ts";
+/** Telemetry sink threaded into each extraction (shared reliability counters). */
+export interface ExtractionTelemetry {
+    reliability: GraderReliability;
+    runId?: string;
+}
+/** Cheap on-disk stat used for diagnostics and the retry gate. */
+export interface FileStat {
+    exists: boolean;
+    mtimeMs: number;
+    size: number;
+}
+/** Injectable seams — defaults wire the real fs; tests substitute fakes. */
+export interface ResilientExtractionDeps {
+    /**
+     * The real `extractGraderJudgments`. Injected (rather than imported) to
+     * avoid a circular dependency with `calculate-scores.ts`.
+     */
+    extract: (path: string, telemetry?: ExtractionTelemetry) => GraderJudgment[];
+    /** Parsed result-entry count for a path — diagnostics only. */
+    countResults?: (path: string) => number;
+    /**
+     * Count of classifiable llm-rubric components for a path. Used only to set
+     * the severity of a persistent-empty extraction: a file with classifiable
+     * components but 0 extracted judgments is an error; a file with none (all
+     * api-errors / no llm-rubric) is a benign empty.
+     */
+    countClassifiable?: (path: string) => number;
+    /** On-disk stat (existence + size + mtime). */
+    statFile?: (path: string) => FileStat;
+    /** Backoff between self-heal attempts. */
+    sleep?: (ms: number) => Promise<void>;
+}
+export interface ResilientExtractionOptions {
+    /** Total extraction attempts when the first yields 0 (default 3, min 1). */
+    maxAttempts?: number;
+    /** Delay before each retry, in ms (default 200). */
+    delayMs?: number;
+    deps: ResilientExtractionDeps;
+}
+/**
+ * Extract grader judgments across one or more results files, instrumented and
+ * self-healing. See the module header for the rationale.
+ *
+ * @param resultsPaths  One or more results files (e.g. baseline + agentic in
+ *   literacy full mode). Missing paths are skipped.
+ * @param telemetry  Shared reliability sink threaded into every extraction.
+ * @param log  Pipeline logger — the junction is logged here on every run.
+ */
+export declare function extractGraderJudgmentsResilient(resultsPaths: readonly string[], telemetry: ExtractionTelemetry | undefined, log: Logger, options: ResilientExtractionOptions): Promise<GraderJudgment[]>;

package/dist/pipeline/extract-grader-judgments-resilient.js ADDED Viewed

@@ -0,0 +1,122 @@
+/**
+ * pipeline/extract-grader-judgments-resilient.ts
+ *
+ * Resilient grader-judgment extraction for the `calculate-scores` persist
+ * junction.
+ *
+ * Background: `calculateAndWriteScores` extracts grader judgments from the
+ * eval results file(s), then writes `grader-judgments.json` only when the
+ * array is non-empty (the `judgments.length > 0` guard). A runtime anomaly was
+ * observed where `extractGraderJudgments` returned 0 judgments for a results
+ * file that demonstrably contained classifiable llm-rubric components — the
+ * same file, read tens of milliseconds later by `extractStoredTestResults`,
+ * yielded the full set (entries with populated dimensions). The empty array
+ * silently skipped the write, so gap-analysis and compute-attribution skipped
+ * and the report shipped with a score but no tests.
+ *
+ * The committed code reads the file via a pure `readFileSync` with identical
+ * classification on both sides, so the divergence is not reproducible from the
+ * source + captured artifacts — it is a transient read anomaly at the live
+ * junction. This wrapper does not pretend to know the mechanism; it makes the
+ * junction observable and recovers from the transient:
+ *
+ *  1. **Instruments** — logs the resolved path(s), file size/mtime, parsed
+ *     result count, and judgment count on every run (never silent on 0), so a
+ *     future empty-judgments persist is diagnosable from the run log alone.
+ *  2. **Self-heals** — when extraction yields 0 judgments but a results file
+ *     exists, it re-extracts with bounded retries. A later read that yields
+ *     judgments proves the initial 0 was transient; the recovered judgments
+ *     are returned. If every attempt yields 0, severity is decided by an
+ *     independent classifiable-component count: a genuinely judgment-free run
+ *     (all api-errors / no llm-rubric) logs a warning, while 0 judgments
+ *     against present classifiable components logs an error (the downstream
+ *     gap-analysis fail-loud guard is the backstop).
+ *
+ * The extractor and fs helpers are injected so the wrapper is unit-testable
+ * without importing the ~3000-line scoring module (which would be circular)
+ * or touching the real filesystem.
+ */
+import { existsSync, statSync } from "node:fs";
+const defaultStat = (path) => { // Default `statFile` seam: stats `path` on the real filesystem.
+    if (!existsSync(path)) // Missing path: report zeroed stats rather than calling statSync.
+        return { exists: false, mtimeMs: 0, size: 0 };
+    const s = statSync(path); // NOTE(review): separate call from the existence check — a file removed in between would make this throw; confirm that is acceptable.
+    return { exists: true, mtimeMs: s.mtimeMs, size: s.size };
+};
+const defaultSleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); // Default `sleep` seam: resolves after `ms` milliseconds via setTimeout.
+/**
+ * Extract grader judgments across one or more results files, instrumented and
+ * self-healing. See the module header for the rationale.
+ *
+ * Flow: paths that do not exist (per `statFile`) are dropped once, up front;
+ * `extract` is run over the remaining paths and the outcome is logged. A
+ * non-empty result is returned immediately. An empty result with no present
+ * path is returned as-is. Otherwise extraction is retried (sleeping `delayMs`
+ * before each retry) up to `maxAttempts` total attempts, returning as soon as
+ * any attempt yields judgments. If all attempts are empty, a warning is logged
+ * when `countClassifiable` reports 0 and an error in every other case.
+ *
+ * @param resultsPaths  One or more results files (e.g. baseline + agentic in
+ *   literacy full mode). Missing paths are skipped.
+ * @param telemetry  Shared reliability sink threaded into every extraction.
+ * @param log  Pipeline logger — the junction is logged here on every run.
+ * @param options  Retry tuning (`maxAttempts`, default 3, clamped to at least
+ *   1; `delayMs`, default 200) plus the injected `deps` (`extract` required).
+ * @returns The judgments from all present paths, concatenated in the order the
+ *   paths were given; an empty array when nothing could be extracted.
+ */
+export async function extractGraderJudgmentsResilient(resultsPaths, telemetry, log, options) {
+    const { extract } = options.deps;
+    const statFile = options.deps.statFile ?? defaultStat;
+    const sleep = options.deps.sleep ?? defaultSleep;
+    const { countResults, countClassifiable } = options.deps;
+    const maxAttempts = Math.max(1, options.maxAttempts ?? 3); // Clamp so the initial attempt always counts as attempt 1.
+    const delayMs = options.delayMs ?? 200;
+    const present = resultsPaths.filter((p) => statFile(p).exists); // Computed once; every retry reuses this list.
+    const extractAll = () => { // One full pass: concatenate judgments from every present path.
+        const all = [];
+        for (const p of present) {
+            all.push(...extract(p, telemetry));
+        }
+        return all;
+    };
+    const diag = (path) => { // Per-path diagnostic payload; re-stats the file at call time.
+        const st = statFile(path);
+        return {
+            mtimeMs: st.mtimeMs,
+            path,
+            sizeBytes: st.size,
+            ...(countResults ? { resultCount: countResults(path) } : {}),
+        };
+    };
+    // Attempt 1 — always instrument the junction so 0 is never silent.
+    let judgments = extractAll();
+    for (const p of present) {
+        log.info("Grader judgments — persist junction read", diag(p));
+    }
+    log.info(`Grader judgments extracted: ${judgments.length} total`, {
+        judgmentCount: judgments.length,
+        paths: present,
+    });
+    if (judgments.length > 0)
+        return judgments;
+    // 0 judgments and no results file present → genuinely nothing to grade.
+    // A missing file cannot become non-empty within the retry window.
+    if (present.length === 0) {
+        log.info("No grader judgments — no results file present (nothing to grade)");
+        return judgments;
+    }
+    // Results file(s) exist but extraction yielded 0 judgments — a suspected
+    // transient read anomaly. Loud diagnostic, then bounded self-heal retries;
+    // the same file read tens of ms later has been observed to yield the full set.
+    log.warn("Grader extraction returned 0 judgments despite present results file(s) — suspected transient read anomaly; attempting self-heal", { paths: present.map(diag) }); // NOTE(review): logged even when maxAttempts is 1 and no retry follows.
+    for (let attempt = 2; attempt <= maxAttempts; attempt++) { // Starts at 2: attempt 1 already ran above.
+        await sleep(delayMs);
+        judgments = extractAll();
+        log.warn(`Grader self-heal attempt ${attempt}/${maxAttempts}: ${judgments.length} judgment(s)`);
+        if (judgments.length > 0) {
+            log.warn(`Grader self-heal recovered ${judgments.length} grader judgment(s) on attempt ${attempt} — the initial empty extraction was a transient read anomaly`);
+            return judgments;
+        }
+    }
+    // Still empty after every attempt. Severity depends on whether the files
+    // actually contain classifiable components.
+    const classifiable = countClassifiable
+        ? present.reduce((n, p) => n + countClassifiable(p), 0)
+        : undefined;
+    if (classifiable === 0) { // Strict check: an unknown count (undefined) falls through to the error branch.
+        log.warn(`No grader judgments after ${maxAttempts} attempt(s) — results contain no classifiable llm-rubric components (e.g. all api-errors); nothing to persist`);
+    }
+    else {
+        log.error(`Grader judgments empty after ${maxAttempts} attempt(s) but ${classifiable ?? "an unknown number of"} classifiable component(s) present in the results file(s) — persisting none; downstream gap-analysis/attribution will fail loud`, { paths: present.map(diag) });
+    }
+    return judgments;
+}

package/dist/report-store.d.ts CHANGED Viewed

@@ -216,6 +216,7 @@ export interface SanityReportDoc {
     _type: string;
     comparison: null | Omit<ComparisonReport, "baseline" | "experiment">;
     completedAt: string;
+    degraded?: Report["degraded"];
     durationMs: number;
     provenance: Report["provenance"];
     reportId: ReportId;

package/dist/report-store.js CHANGED Viewed

@@ -477,6 +477,7 @@ export function toSanityReportDoc(report) {
         _type: REPORT_TYPE,
         comparison,
         completedAt: report.completedAt,
+        ...(report.degraded ? { degraded: report.degraded } : {}),
         durationMs: report.durationMs,
         provenance: report.provenance,
         reportId: report.id,
@@ -526,6 +527,7 @@ export function toReport(doc) {
         artifactManifest,
         comparison: doc.comparison,
         completedAt: doc.completedAt,
+        degraded: doc.degraded,
         durationMs: doc.durationMs,
         id: doc.reportId,
         provenance: doc.provenance,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sanity/ailf",
-  "version": "7.1.2",
+  "version": "7.2.1",
   "private": false,
   "publishConfig": {
     "access": "public"