@sanity/ailf 7.1.2 → 7.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
  2. package/dist/_vendor/ailf-core/schemas/index.js +4 -0
  3. package/dist/_vendor/ailf-core/schemas/report.d.ts +11 -0
  4. package/dist/_vendor/ailf-core/schemas/report.js +14 -0
  5. package/dist/_vendor/ailf-core/schemas/user.d.ts +22 -0
  6. package/dist/_vendor/ailf-core/schemas/user.js +23 -0
  7. package/dist/_vendor/ailf-core/types/index.d.ts +29 -0
  8. package/dist/_vendor/ailf-core/types/index.js +13 -0
  9. package/dist/_vendor/ailf-core/types/user.d.ts +49 -0
  10. package/dist/_vendor/ailf-core/types/user.js +1 -0
  11. package/dist/_vendor/ailf-shared/generated/help-content.js +26 -14
  12. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +2 -2
  13. package/dist/index.d.ts +5 -1
  14. package/dist/index.js +5 -1
  15. package/dist/orchestration/steps/compute-attribution-step.d.ts +2 -2
  16. package/dist/orchestration/steps/compute-attribution-step.js +17 -2
  17. package/dist/orchestration/steps/gap-analysis-step.d.ts +2 -2
  18. package/dist/orchestration/steps/gap-analysis-step.js +20 -2
  19. package/dist/orchestration/steps/publish-report-step.d.ts +15 -1
  20. package/dist/orchestration/steps/publish-report-step.js +45 -0
  21. package/dist/pipeline/calculate-scores.js +59 -14
  22. package/dist/pipeline/compiler/assertion-mapper.js +7 -3
  23. package/dist/pipeline/compiler/rubric-resolution.d.ts +2 -2
  24. package/dist/pipeline/compiler/rubric-resolution.js +25 -1
  25. package/dist/pipeline/enrichment-preconditions.d.ts +52 -0
  26. package/dist/pipeline/enrichment-preconditions.js +84 -0
  27. package/dist/pipeline/extract-grader-judgments-resilient.d.ts +88 -0
  28. package/dist/pipeline/extract-grader-judgments-resilient.js +122 -0
  29. package/dist/report-store.d.ts +1 -0
  30. package/dist/report-store.js +2 -0
  31. package/package.json +1 -1
@@ -21,3 +21,4 @@ export * from "./symbol-preflight-report.js";
21
21
  export * from "./test-budgets.js";
22
22
  export { ConfidenceSchema } from "./confidence-schema.js";
23
23
  export { brandedString } from "./branded-string.js";
24
+ export { AilfUserSchema } from "./user.js";
@@ -28,3 +28,7 @@ export { ConfidenceSchema } from "./confidence-schema.js";
28
28
  // helper instead of replicating `as unknown as z.ZodType<…>` at each
29
29
  // schema author site (project rule: no `as` on `unknown`).
30
30
  export { brandedString } from "./branded-string.js";
31
+ // User-preferences subsystem (W0302). Named export — not `export *` — because
32
+ // the schema file re-exports the `AilfUser` domain type, and a star re-export
33
+ // would surface that type through two paths (W0124 DTS ambiguity).
34
+ export { AilfUserSchema } from "./user.js";
@@ -258,6 +258,17 @@ export declare const ReportSchema: z.ZodObject<{
258
258
  artifactManifest: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
259
259
  tag: z.ZodOptional<z.ZodNullable<z.ZodString>>;
260
260
  title: z.ZodOptional<z.ZodNullable<z.ZodString>>;
261
+ degraded: z.ZodOptional<z.ZodObject<{
262
+ reason: z.ZodLiteral<"enrichment-missing">;
263
+ missing: z.ZodArray<z.ZodEnum<{
264
+ documentManifest: "documentManifest";
265
+ failureModes: "failureModes";
266
+ lowScoringJudgments: "lowScoringJudgments";
267
+ recommendations: "recommendations";
268
+ testResults: "testResults";
269
+ }>>;
270
+ detail: z.ZodString;
271
+ }, z.core.$strict>>;
261
272
  }, z.core.$loose>;
262
273
  export type ReportSchemaInput = z.input<typeof ReportSchema>;
263
274
  export type ReportSchemaOutput = z.infer<typeof ReportSchema>;
@@ -25,6 +25,7 @@
25
25
  */
26
26
  import { z } from "zod";
27
27
  import { LITERACY_VARIANTS } from "../../ailf-shared/index.js";
28
+ import { DEGRADED_ENRICHMENT_FIELDS } from "../types/index.js";
28
29
  // ---------------------------------------------------------------------------
29
30
  // RunContext building blocks (mirrors packages/shared/src/run-context.ts)
30
31
  // ---------------------------------------------------------------------------
@@ -233,5 +234,18 @@ export const ReportSchema = z
233
234
  // `title: report.title ?? null`, so the schema accepts null on both.
234
235
  tag: z.string().nullable().optional(),
235
236
  title: z.string().nullable().optional(),
237
+ // Degraded marker (mirrors `ReportDegradation`): present only when a full
238
+ // eval scored tests but enrichment did not complete. Strict — unknown
239
+ // keys here signal real drift.
240
+ degraded: z
241
+ .object({
242
+ reason: z.literal("enrichment-missing"),
243
+ // Enum derived from the canonical DegradedEnrichmentField tuple so the
244
+ // schema cannot drift from the core type.
245
+ missing: z.array(z.enum(DEGRADED_ENRICHMENT_FIELDS)),
246
+ detail: z.string().min(1),
247
+ })
248
+ .strict()
249
+ .optional(),
236
250
  })
237
251
  .passthrough();
@@ -0,0 +1,22 @@
1
+ import { z } from "zod";
2
+ export declare const AilfUserSchema: z.ZodObject<{
3
+ _id: z.ZodString;
4
+ _type: z.ZodLiteral<"ailf.user">;
5
+ sanityUserId: z.ZodString;
6
+ email: z.ZodString;
7
+ displayName: z.ZodOptional<z.ZodString>;
8
+ teams: z.ZodArray<z.ZodObject<{
9
+ _type: z.ZodLiteral<"reference">;
10
+ _ref: z.ZodString;
11
+ _key: z.ZodOptional<z.ZodString>;
12
+ }, z.core.$strip>>;
13
+ preferences: z.ZodObject<{
14
+ primaryTeam: z.ZodOptional<z.ZodObject<{
15
+ _type: z.ZodLiteral<"reference">;
16
+ _ref: z.ZodString;
17
+ _key: z.ZodOptional<z.ZodString>;
18
+ }, z.core.$strip>>;
19
+ }, z.core.$strip>;
20
+ updatedAt: z.ZodString;
21
+ }, z.core.$strip>;
22
+ export type { AilfUser } from "../types/user.js";
@@ -0,0 +1,23 @@
1
+ import { z } from "zod";
2
+ // `_id` is constructed as `ailf.user.${CurrentUser.id}` at write time. The
3
+ // account id segment is opaque (may contain `|`, `.`, etc. for SSO providers),
4
+ // so the prefix is all we constrain here. The deterministic-id invariant
5
+ // (`_id === ailf.user.${sanityUserId}`) is enforced on the write path.
6
+ const USER_ID_REGEX = /^ailf\.user\..+$/;
7
+ const TeamReferenceSchema = z.object({
8
+ _type: z.literal("reference"),
9
+ _ref: z.string().min(1),
10
+ _key: z.string().optional(),
11
+ });
12
+ export const AilfUserSchema = z.object({
13
+ _id: z.string().regex(USER_ID_REGEX),
14
+ _type: z.literal("ailf.user"),
15
+ sanityUserId: z.string().min(1),
16
+ email: z.string().email(),
17
+ displayName: z.string().optional(),
18
+ teams: z.array(TeamReferenceSchema),
19
+ preferences: z.object({
20
+ primaryTeam: TeamReferenceSchema.optional(),
21
+ }),
22
+ updatedAt: z.string().datetime(),
23
+ });
@@ -42,6 +42,7 @@ export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "./at
42
42
  export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderEmittedJudgment, GraderJudgment, } from "./grader-judgment.js";
43
43
  export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
44
44
  export type { BaseChannel, ChannelScope, EmailChannel, EventType, KnownEventType, KnownMemberRole, MemberRole, NotificationChannel, NotificationChannelType, SlackChannel, Team, TeamId, TeamMember, TeamRef, TeamSlug, TeamStatus, WebhookChannel, } from "./team.js";
45
+ export type { AilfUser, AilfUserPreferences, TeamReference } from "./user.js";
45
46
  type DocumentRef = _DocumentRef;
46
47
  /** Aggregated retrieval metrics for a feature area */
47
48
  export interface AreaRetrievalMetrics {
@@ -1488,8 +1489,36 @@ export interface ArtifactRef {
1488
1489
  * two becomes a compile error (W0049 review finding C1).
1489
1490
  */
1490
1491
  export type ArtifactManifest = Partial<Record<ArtifactType, ArtifactRef>>;
1492
+ /**
1493
+ * Enrichment surfaces that gap-analysis writes onto a report. When a full
1494
+ * eval scores tests but these are absent, the report renders as "no tests"
1495
+ * despite carrying a score — the degraded condition `ReportDegradation`
1496
+ * records.
1497
+ */
1498
+ export declare const DEGRADED_ENRICHMENT_FIELDS: readonly ["documentManifest", "failureModes", "lowScoringJudgments", "recommendations", "testResults"];
1499
+ export type DegradedEnrichmentField = (typeof DEGRADED_ENRICHMENT_FIELDS)[number];
1500
+ /**
1501
+ * Marks a published report as degraded: the eval ran and scored tests, but
1502
+ * one or more enrichment surfaces never landed (e.g. gap-analysis skipped
1503
+ * because `grader-judgments.json` was missing). Present so the dashboard and
1504
+ * Studio can show "enrichment failed" rather than a misleading empty
1505
+ * "no tests" state on a report that still has a score.
1506
+ */
1507
+ export interface ReportDegradation {
1508
+ /** Why the report is degraded. Single-variant union, widen as needed. */
1509
+ reason: "enrichment-missing";
1510
+ /** Enrichment surfaces absent on this report despite a full eval. */
1511
+ missing: DegradedEnrichmentField[];
1512
+ /** Human-readable explanation for dashboard / Studio empty-state copy. */
1513
+ detail: string;
1514
+ }
1491
1515
  /** A published evaluation report — the atomic unit of the report store */
1492
1516
  export interface Report {
1517
+ /**
1518
+ * Set when the report is published in a degraded state — a full eval
1519
+ * scored tests but enrichment did not complete. Absent on healthy reports.
1520
+ */
1521
+ degraded?: ReportDegradation;
1493
1522
  /**
1494
1523
  * Snapshot of the run manifest's `artifacts` slice at publish time (D0032).
1495
1524
  * The source of truth lives in `gs://…/runs/{runId}/manifest.json`; this
@@ -42,3 +42,16 @@ export function isLegacyFailureMode(mode) {
42
42
  * that imports it from @sanity/ailf-core.
43
43
  */
44
44
  export { NOISE_THRESHOLD as DEFAULT_NOISE_THRESHOLD } from "../../ailf-shared/index.js";
45
+ /**
46
+ * Enrichment surfaces that gap-analysis writes onto a report. When a full
47
+ * eval scores tests but these are absent, the report renders as "no tests"
48
+ * despite carrying a score — the degraded condition `ReportDegradation`
49
+ * records.
50
+ */
51
+ export const DEGRADED_ENRICHMENT_FIELDS = [
52
+ "documentManifest",
53
+ "failureModes",
54
+ "lowScoringJudgments",
55
+ "recommendations",
56
+ "testResults",
57
+ ];
@@ -0,0 +1,49 @@
1
+ /**
2
+ * A Sanity reference to an `ailf.team` document.
3
+ *
4
+ * Members of an array (`AilfUser.teams[]`) carry a `_key`; the single-valued
5
+ * `preferences.primaryTeam` does not. The team slug downstream consumers need
6
+ * is a derived, read-time value from a GROQ projection — never stored here.
7
+ */
8
+ export interface TeamReference {
9
+ _type: "reference";
10
+ _ref: string;
11
+ _key?: string;
12
+ }
13
+ /**
14
+ * Per-user UI preferences. Room to grow (default view, density, …) — kept
15
+ * minimal for v0 (YAGNI).
16
+ */
17
+ export interface AilfUserPreferences {
18
+ /**
19
+ * Reference to the user's default team — one of `AilfUser.teams[]`. Distinct
20
+ * from `teams[]` so "which team's view do I default to" can differ from "all
21
+ * teams I affiliate with". The slug is derived in GROQ at read time.
22
+ */
23
+ primaryTeam?: TeamReference;
24
+ }
25
+ /**
26
+ * Per-account user document — one per Sanity account, keyed by a deterministic
27
+ * `_id` of `ailf.user.${sanityUserId}`. Stores self-declared team affiliation
28
+ * (references to `ailf.team`) plus UI preferences, and is the primary source
29
+ * for dashboard personalization. Stores minimal PII: `sanityUserId`, `email`,
30
+ * and `displayName` only.
31
+ *
32
+ * @see docs/design-docs/user-settings.md
33
+ */
34
+ export interface AilfUser {
35
+ /** Deterministic: `ailf.user.${sanityUserId}`. */
36
+ _id: string;
37
+ _type: "ailf.user";
38
+ /** `CurrentUser.id` — the stable, globally-unique key, mirrored for GROQ. */
39
+ sanityUserId: string;
40
+ /** Denormalized for display / joins (lowercased at write time). */
41
+ email: string;
42
+ /** `CurrentUser.name` snapshot. */
43
+ displayName?: string;
44
+ /** Self-declared affiliation — drives personalization only. */
45
+ teams: TeamReference[];
46
+ preferences: AilfUserPreferences;
47
+ /** ISO 8601 UTC — stamped on each save. */
48
+ updatedAt: string;
49
+ }
@@ -0,0 +1 @@
1
+ export {};
@@ -44,6 +44,17 @@ export const HELP_TOPICS = [
44
44
  "scoring-model"
45
45
  ]
46
46
  },
47
+ {
48
+ "id": "failure-modes",
49
+ "title": "Failure Modes",
50
+ "body": "## What this view is for\n\nThe Recommendations view tells you which fixes to make. This view tells you what\nkind of problem you have. It groups the run's weaknesses by the documentation\nissue behind them, so you can see patterns across the whole evaluation rather\nthan one fix at a time. If most of your weak spots are the same kind of problem,\nthat is a signal about how to spend your docs effort.\n\n## What you are looking at\n\nRecent reports show **interpretive cards** drawn from the run's diagnosis:\n\n- **Weakest area** names the single feature area dragging the score down most,\n the failure mode behind it, and a confidence level with the sample size, so\n you know how strong the signal is.\n- **Failure mode** highlights one category of problem, which scoring dimension\n it shows up in, and how often it occurred across the tests that were checked.\n- **Area summary** gives a plain-language read on how an area is doing and why.\n\nOlder reports show a **category breakdown** instead. Each failure category is a\nchip with a count. Selecting a chip lists the gaps in that category, and each\ngap shows an estimated score lift if fixed, a confidence level, a short\nremediation note, and the specific tasks that exposed it. You can click a task\nto jump to it.\n\n## The failure modes\n\nEach weakness is sorted into one of these categories. The category is the\nfastest way to know what kind of work the fix needs:\n\n- **Missing docs**: the doc the model needed does not exist or is not indexed.\n The fix is to write new documentation.\n- **Incorrect docs**: a doc has a factual error or a wrong example. The fix is\n to correct it.\n- **Outdated docs**: a doc exists but reflects a previous API surface. The fix\n is to bring it up to date.\n- **Poor structure**: the information is correct but hard for an agent to find\n or skim. 
The fix is to reorganize or clarify.\n- **Model limitation**: the model struggles even with correct docs available.\n This is not a documentation problem, so treat it as context rather than a\n to-do.\n- **Unclassified**: the run could not categorize the weakness. Use the linked\n tasks and the grader's notes to judge it yourself.\n\nDepending on the evaluation mode you may see additional categories, including\nones specific to agent behavior such as tool misuse or missing error handling.\n\n## How to use it\n\nStart with the category that has the most gaps or the highest combined lift. The\ncategory tells you the shape of the work before you open a single page: write,\ncorrect, update, or restructure. Categories that are not documentation problems,\nsuch as model limitation, are worth noting but are not yours to fix in the docs.\n\n## Related views\n\n- **Recommendations** turns these weaknesses into a ranked list of specific\n edits.\n- **Low-scoring judgments** shows the grader's raw notes on the tests that\n scored lowest, which is the most granular signal behind any failure mode.\n\n## When this view is empty\n\nIf a report shows no failure modes, the evaluation either classified nothing\nworth flagging or the run predates this view. A clean result here usually means\nthe docs held up across the evaluated tasks.",
51
+ "source": "docs/help/failure-modes.md",
52
+ "related": [
53
+ "recommendations",
54
+ "scoring-model",
55
+ "negative-doc-lift"
56
+ ]
57
+ },
47
58
  {
48
59
  "id": "getting-started",
49
60
  "title": "Getting Started",
@@ -57,11 +68,12 @@ export const HELP_TOPICS = [
57
68
  {
58
69
  "id": "interpreting-diagnostics",
59
70
  "title": "Interpreting Diagnostics",
60
- "body": "## The diagnostics tab\n\nWhen you open a report and click the **Diagnostics** tab, you see a health\nsummary of your documentation across all feature areas. This is the most\nactionable view in the dashboard — it tells you exactly where to focus your doc\nimprovement efforts.\n\n## Health categories\n\nFeature areas are grouped into three health bands:\n\n- **Strong (80+)** — Docs are working well. AI agents produce correct, complete\n implementations. No action needed unless you see regression.\n- **Needs Attention (70–79)** — Docs are okay but have gaps. There may be\n specific dimensions (like code correctness or doc coverage) dragging the score\n down. Worth investigating.\n- **Weak (below 70)** — Docs are not providing enough support. AI agents\n consistently struggle with these features. These need priority attention.\n\n## Strengths vs. Issues\n\nThe diagnostics tab has two sub-views:\n\n**Strengths** highlights what's working: high-scoring areas, strong dimensions,\nand areas where agents successfully find and use your docs. 
Use this to\nunderstand what good looks like in your docs and replicate it elsewhere.\n\n**Issues** lists the problems: weak areas, dimensions scoring below 50, negative\ndoc lift, retrieval problems, and (if gap analysis was run) specific\nrecommendations with estimated score lift.\n\n## Key diagnostic signals\n\n| Signal | What it means | What to do |\n| ------------------------------ | ------------------------------------------ | ---------------------------------------- |\n| **Negative doc lift** | Docs are worse than no docs | Rewrite or remove the offending docs |\n| **Large retrieval gap** | Good docs exist but agents can't find them | Improve page titles, metadata, SEO |\n| **Low code correctness** | Agents find the docs but produce bad code | Add or fix code examples |\n| **Low doc coverage** | The docs don't cover what the task needs | Write new documentation |\n| **Efficiency anomaly (>100%)** | Agents do better without gold docs | Injected docs may be confusing the model |",
71
+ "body": "## Reading the health of your docs\n\nA report scores each feature area on how well your documentation lets AI coding\ntools implement that feature. Reading those scores well is what turns a number\ninto a plan: it tells you where the docs are working, where they are not, and\nwhat kind of problem you are dealing with.\n\n## Health bands\n\nEach area's score falls into one of three bands:\n\n- **Strong (80 and above)**: docs are working well. Agents produce correct,\n complete implementations. No action needed unless you see a regression.\n- **Needs attention (70 to 79)**: docs are okay but have gaps. A specific\n dimension such as code correctness or doc coverage may be dragging the score\n down. Worth investigating.\n- **Weak (below 70)**: docs are not providing enough support. Agents\n consistently struggle with these features. These need priority attention.\n\n## Strong areas are signal too\n\nIt is easy to focus only on what is broken, but the strong areas are worth\nreading. They show what good looks like in your docs: clear structure, accurate\nexamples, the patterns agents can follow. When you fix a weak area, that is the\nbar to copy.\n\n## Key diagnostic signals\n\nA low score has a reason behind it. 
These signals tell you which reason, and\nwhat to do about it:\n\n| Signal | What it means | What to do |\n| ------------------------------ | ------------------------------------------- | ---------------------------------------- |\n| **Negative doc lift** | Docs are worse than no docs | Rewrite or remove the offending docs |\n| **Large retrieval gap** | Good docs exist but agents cannot find them | Improve page titles, metadata, structure |\n| **Low code correctness** | Agents find the docs but produce bad code | Add or fix code examples |\n| **Low doc coverage** | The docs do not cover what the task needs | Write new documentation |\n| **Efficiency anomaly (>100%)** | Agents do better without the docs | Injected docs may be confusing the model |\n\n## Where to go next\n\nWhen you know which areas are weak and why, the **Recommendations** view turns\nthat into a ranked list of specific edits, and the **Failure modes** view groups\nthe weaknesses by the kind of documentation problem behind them.",
61
72
  "source": "docs/help/interpreting-diagnostics.md",
62
73
  "related": [
63
- "scoring-model",
64
- "weaknesses-recommendations"
74
+ "recommendations",
75
+ "failure-modes",
76
+ "scoring-model"
65
77
  ]
66
78
  },
67
79
  {
@@ -74,6 +86,17 @@ export const HELP_TOPICS = [
74
86
  "comparing-runs"
75
87
  ]
76
88
  },
89
+ {
90
+ "id": "recommendations",
91
+ "title": "Recommendations",
92
+ "body": "## What this view is for\n\nThis is the \"what do I fix\" view. The scores tell you how well your\ndocumentation supports AI coding tools. This view turns those scores into a\nranked list of specific changes, so you can spend your time on the edits that\nshould move the score the most.\n\nEverything here comes from the same evaluation run you are looking at, and it\npoints at your own documentation pages rather than giving generic advice.\n\n## What you are looking at\n\nRecent reports show a set of **diagnosis cards**. Each card answers one question\nabout the run.\n\n**Top recommendations** is the main card. It opens with a short summary, then\nlists a few suggested changes ranked by priority. Each suggestion has:\n\n- A **priority** tag of high, medium, or low that tells you what to do first.\n- A **title** that names the change in one line.\n- A **description** of the specific fix, usually quoting the exact symbol,\n query, or pattern involved.\n- A **doc reference** showing which page, and the section when it is known, the\n change applies to. Every reference points to a real page that was part of this\n run, so you can open it and start editing.\n\nYou may also see supporting cards:\n\n- **Doc attribution spotlight** shows which documentation pages most influenced\n the results, and whether each one helped or hurt. Use it to confirm a\n recommendation is pointing at the right page.\n- **Low-confidence attribution** lists results where the link between a doc and\n an outcome was uncertain. Treat anything flagged here as a lead to verify, not\n a settled conclusion.\n- **Regression vs baseline** appears when you are comparing against an earlier\n run. It shows which areas moved up or down and the likely reason for each\n change.\n\n## How to use it\n\nWork top down. Start with the high-priority suggestions, open the referenced\npage, and make the change. 
Priority reflects how much each change is expected to\nhelp, so the top of the list is usually where your effort goes furthest.\n\nThe recommendations are written by a model that reads this run's results. They\nare grounded in your actual docs and cannot reference a page that was not in the\nrun, but they are still suggestions. Read the linked page before acting, and use\nthe confidence signals to decide how much to trust each item.\n\n## Where this comes from\n\nA recommendation is the end of a chain: a test scored low, the grader said why,\nthe run classified that into a failure mode, and this view proposes the edit. If\nyou want to see the failure modes themselves, grouped by category, open the\n**Failure modes** view. If you want the grader's raw notes on the lowest scores,\nopen the **Low-scoring judgments** view.\n\n## Older reports\n\nReports created before the diagnosis cards shipped show a simpler list instead.\nEach row names a feature area, the failure mode behind it, an estimated score\nlift if you fix it, a confidence level, and the tasks that exposed the gap. The\nestimated lift is conservative. It assumes fixing the gap raises the weak\ndimension only to the median of the others, so the real improvement can be\nhigher.\n\n## When this view is empty\n\nIf a report shows no recommendations, the evaluation either ran and found\nnothing worth flagging, or the run predates this feature. A score with no\nrecommendations is usually a good sign, because it means the docs held up across\nthe evaluated tasks.",
93
+ "source": "docs/help/recommendations.md",
94
+ "related": [
95
+ "failure-modes",
96
+ "interpreting-diagnostics",
97
+ "scoring-model"
98
+ ]
99
+ },
77
100
  {
78
101
  "id": "retrieval-gap",
79
102
  "title": "Retrieval Gap & Infrastructure Efficiency",
@@ -96,17 +119,6 @@ export const HELP_TOPICS = [
96
119
  "eval-modes"
97
120
  ]
98
121
  },
99
- {
100
- "id": "weaknesses-recommendations",
101
- "title": "Weaknesses & Recommendations",
102
- "body": "## Understanding weaknesses\n\nThe Issues sub-tab in Diagnostics lists every area or dimension that scored\nbelow threshold. Each weakness entry shows:\n\n- **The feature area** — Which product feature is affected (e.g., GROQ,\n Functions, Webhooks).\n- **The bottleneck dimension** — Which scoring dimension is dragging the area\n down: task completion, code correctness, or doc coverage.\n- **The score** — How far below threshold the dimension scored.\n\n## Gap analysis recommendations\n\nWhen an evaluation runs with gap analysis enabled, the dashboard shows\n**prioritized recommendations** — specific actions ranked by estimated impact.\n\nEach recommendation includes:\n\n- **Failure mode** — The type of doc problem identified:\n - `missing-docs` — The functionality isn't documented at all.\n - `incorrect-docs` — The docs contain factual errors.\n - `outdated-docs` — The docs describe an old API version or pattern.\n - `poor-structure` — The docs exist but are hard to find or understand.\n- **Estimated lift** — How many score points fixing this gap would add. Based on\n raising the bottleneck dimension to the median of non-bottleneck dimensions.\n Conservative estimate — actual improvement may be higher.\n- **Confidence** — How sure the analysis is about this diagnosis (high, medium,\n or low).\n- **Affected tasks** — Which specific evaluation tasks exposed this gap.\n\n## Diagnosis cards\n\nEvery published report now carries a **diagnosis artifact** — a set of cards\nproduced by the post-pipeline hook (`ailf interpret`). The Studio diagnosis\npanel renders these cards directly; the dashboard's Recommendations and\nFailure-modes panels migrate to the same source in a follow-up.\n\nThe hook runs by default for every pipeline invocation. 
To opt out for a single\nrun, pass `--no-summary`; to opt out in CI, set `AILF_INTERPRET_ON_RUN=0` in the\nworkflow env block; to opt out project-wide, set `summary.onRun: never` in\n`.ailf/config.yaml`.\n\n## Low-scoring judgments\n\nBelow the recommendations, you'll find the **grader's explanations** for tests\nthat scored below 70. These are the raw assessments from the grading model\nexplaining exactly what went wrong — missing API calls, incorrect patterns,\nhallucinated features, etc.\n\nEach judgment shows the task, the dimension, the score, and the grader's natural\nlanguage reason. These are the most granular diagnostic signal available and\noften point directly to the doc section that needs fixing.",
103
- "source": "docs/help/weaknesses-recommendations.md",
104
- "related": [
105
- "interpreting-diagnostics",
106
- "scoring-model",
107
- "negative-doc-lift"
108
- ]
109
- },
110
122
  {
111
123
  "id": "how-agents-work",
112
124
  "title": "How AI Agents Find Documentation",
@@ -76,9 +76,9 @@ export declare const GraderJudgmentSchema: z.ZodObject<{
76
76
  documentId: z.ZodString;
77
77
  slug: z.ZodOptional<z.ZodString>;
78
78
  role: z.ZodEnum<{
79
+ missing: "missing";
79
80
  supports: "supports";
80
81
  contradicts: "contradicts";
81
- missing: "missing";
82
82
  irrelevant: "irrelevant";
83
83
  }>;
84
84
  hallucinated: z.ZodOptional<z.ZodBoolean>;
@@ -145,9 +145,9 @@ export declare const GraderEmittedJudgmentSchema: z.ZodObject<{
145
145
  documentId: z.ZodString;
146
146
  slug: z.ZodOptional<z.ZodString>;
147
147
  role: z.ZodEnum<{
148
+ missing: "missing";
148
149
  supports: "supports";
149
150
  contradicts: "contradicts";
150
- missing: "missing";
151
151
  irrelevant: "irrelevant";
152
152
  }>;
153
153
  hallucinated: z.ZodOptional<z.ZodBoolean>;
package/dist/index.d.ts CHANGED
@@ -17,7 +17,11 @@
17
17
  * area: "groq",
18
18
  * prompt: { text: "Write GROQ queries..." },
19
19
  * assertions: [
20
- * { type: "llm-rubric", template: "task-completion", criteria: ["..."] },
20
+ * {
21
+ * type: "llm-rubric",
22
+ * template: "task-completion",
23
+ * criteria: [{ id: "uses-projection", text: "Uses a projection" }],
24
+ * },
21
25
  * ],
22
26
  * })
23
27
  * ```
package/dist/index.js CHANGED
@@ -17,7 +17,11 @@
17
17
  * area: "groq",
18
18
  * prompt: { text: "Write GROQ queries..." },
19
19
  * assertions: [
20
- * { type: "llm-rubric", template: "task-completion", criteria: ["..."] },
20
+ * {
21
+ * type: "llm-rubric",
22
+ * template: "task-completion",
23
+ * criteria: [{ id: "uses-projection", text: "Uses a projection" }],
24
+ * },
21
25
  * ],
22
26
  * })
23
27
  * ```
@@ -35,10 +35,10 @@
35
35
  * @see docs/decisions/D0050-per-entry-attribution-layout.md
36
36
  * @see docs/decisions/D0052-judgment-ref-granularity.md
37
37
  */
38
- import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
38
+ import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
39
39
  export declare class ComputeAttributionStep implements PipelineStep {
40
40
  readonly name = "compute-attribution";
41
41
  readonly optional = true;
42
42
  check(ctx: AppContext): ValidationIssue[];
43
- execute(ctx: AppContext, _state?: unknown): Promise<StepResult>;
43
+ execute(ctx: AppContext, state?: PipelineState): Promise<StepResult>;
44
44
  }
@@ -40,6 +40,7 @@ import { resolve } from "node:path";
40
40
  import { isSlugRef } from "../../_vendor/ailf-core/index.js";
41
41
  import { calibrationSetVersion, embeddingModel, ensembleVersion, } from "../../pipeline/attribution.js";
42
42
  import { V0_WEIGHTS, computeJudgmentAttribution, } from "../../pipeline/compute-attribution.js";
43
+ import { classifyEnrichmentInputs, degradedEnrichmentError, } from "../../pipeline/enrichment-preconditions.js";
43
44
  // ---------------------------------------------------------------------------
44
45
  // Step implementation
45
46
  // ---------------------------------------------------------------------------
@@ -79,12 +80,26 @@ export class ComputeAttributionStep {
79
80
  }
80
81
  return issues;
81
82
  }
82
- async execute(ctx, _state) {
83
+ async execute(ctx, state) {
83
84
  const start = Date.now();
84
85
  const root = ctx.config.rootDir;
85
86
  const judgmentsPath = resolve(root, "results", "latest", "grader-judgments.json");
86
87
  const summaryPath = resolve(root, "results", "latest", "score-summary.json");
87
- if (!existsSync(judgmentsPath)) {
88
+ // Mirror gap-analysis: a full eval that scored tests but persisted no
89
+ // grader judgments is a degraded run, not a benign skip. Fail loud so the
90
+ // outcome surfaces in pipeline-result and on the job document. A remote
91
+ // cache hit restores score-summary.json without grader-judgments.json, so
92
+ // its missing judgments are legitimate — never fail loud on a cache hit.
93
+ const fromRemoteCache = (state?.remoteCacheHits?.size ?? 0) > 0;
94
+ const inputs = classifyEnrichmentInputs(root);
95
+ if (inputs.kind === "judgments-missing-after-eval" && !fromRemoteCache) {
96
+ return {
97
+ durationMs: Date.now() - start,
98
+ status: "failed",
99
+ error: degradedEnrichmentError("compute-attribution", inputs.scoredTestCount),
100
+ };
101
+ }
102
+ if (inputs.kind !== "ready") {
88
103
  return { status: "skipped", reason: "No grader-judgments.json" };
89
104
  }
90
105
  if (!existsSync(summaryPath)) {
@@ -14,10 +14,10 @@
14
14
  *
15
15
  * This is an optional step — failure doesn't stop the pipeline.
16
16
  */
17
- import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
17
+ import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
18
18
  export declare class GapAnalysisStep implements PipelineStep {
19
19
  readonly name = "gap-analysis";
20
20
  readonly optional = true;
21
21
  check(ctx: AppContext): ValidationIssue[];
22
- execute(ctx: AppContext): Promise<StepResult>;
22
+ execute(ctx: AppContext, state?: PipelineState): Promise<StepResult>;
23
23
  }
@@ -18,6 +18,7 @@ import { existsSync, mkdirSync, readFileSync, renameSync, writeFileSync, } from
18
18
  import { join, resolve } from "path";
19
19
  import { assoc, isSlugRef } from "../../_vendor/ailf-core/index.js";
20
20
  import { emitFileContents } from "../../artifact-capture/emit-file.js";
21
+ import { classifyEnrichmentInputs, degradedEnrichmentError, } from "../../pipeline/enrichment-preconditions.js";
21
22
  export class GapAnalysisStep {
22
23
  name = "gap-analysis";
23
24
  optional = true;
@@ -34,12 +35,29 @@ export class GapAnalysisStep {
34
35
  }
35
36
  return [];
36
37
  }
37
- async execute(ctx) {
38
+ async execute(ctx, state) {
38
39
  const root = ctx.config.rootDir;
39
40
  const start = Date.now();
40
41
  const judgmentsPath = resolve(root, "results", "latest", "grader-judgments.json");
41
42
  const scoreSummaryPath = resolve(root, "results", "latest", "score-summary.json");
42
- if (!existsSync(judgmentsPath)) {
43
+ // Distinguish a legitimate skip (no graded eval ran this pipeline) from a
44
+ // degraded run where a full eval scored tests but no judgments persisted.
45
+ // The latter must fail loud — returning a benign `skipped` is what let
46
+ // reports publish with a score but no test details.
47
+ //
48
+ // A remote cache hit restores score-summary.json (with testCount) from a
49
+ // prior report but never writes grader-judgments.json, so judgments are
50
+ // legitimately absent — that is a benign skip, not a degraded full eval.
51
+ const fromRemoteCache = (state?.remoteCacheHits?.size ?? 0) > 0;
52
+ const inputs = classifyEnrichmentInputs(root);
53
+ if (inputs.kind === "judgments-missing-after-eval" && !fromRemoteCache) {
54
+ return {
55
+ durationMs: Date.now() - start,
56
+ status: "failed",
57
+ error: degradedEnrichmentError("gap-analysis", inputs.scoredTestCount),
58
+ };
59
+ }
60
+ if (inputs.kind !== "ready") {
43
61
  return {
44
62
  status: "skipped",
45
63
  reason: "No grader-judgments.json — run a full evaluation first",
@@ -10,7 +10,7 @@
10
10
  * - P5: Local-first (pipeline never fails because of a store write)
11
11
  * - P6: Sinks are fire-and-forget (failures logged, not thrown)
12
12
  */
13
- import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type ReportAutoScope, type ScoreSummary, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
13
+ import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type ReportAutoScope, type ReportDegradation, type ScoreSummary, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
14
14
  import { type ProvenanceInput } from "../../pipeline/provenance.js";
15
15
  export declare class PublishReportStep implements PipelineStep {
16
16
  private readonly pipelineStart;
@@ -25,6 +25,20 @@ export declare class PublishReportStep implements PipelineStep {
25
25
  check(): ValidationIssue[];
26
26
  execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
27
27
  }
28
+ /**
29
+ * Detect whether a report should publish as degraded.
30
+ *
31
+ * The symptom is a scored run whose per-test details never landed: a full
32
+ * eval counted tests (`scores[].testCount > 0`) but `summary.testResults` is
33
+ * absent because gap-analysis skipped or failed. Such a report renders an
34
+ * empty "no tests" state in Studio despite carrying a score. Returns the
35
+ * marker enumerating which enrichment surfaces are missing, or `undefined`
36
+ * for a healthy report (or a run with no scored tests, where an empty report
37
+ * is legitimate).
38
+ *
39
+ * Exported for unit testing — production callers reach it via execute().
40
+ */
41
+ export declare function detectReportDegradation(summary: ScoreSummary): ReportDegradation | undefined;
28
42
  /**
29
43
  * Assemble provenance input from the score summary and pipeline context.
30
44
  *
@@ -110,9 +110,15 @@ export class PublishReportStep {
110
110
  // agentBehavior arrays) point at their external artifacts via
111
111
  // `id = manifestEntryKey`; Studio hydrates on drill-down.
112
112
  const slimSummary = buildSlimReportSummary(summary, ctx.config.mode);
113
+ // Degraded-report detection (the "no tests on a scored report" symptom):
114
+ // a full eval scored tests but the gap-analysis enrichment never landed.
115
+ // Computed from the full summary read above — independent of which
116
+ // upstream step skipped — so the marker fires regardless of the cause.
117
+ const degraded = detectReportDegradation(summary);
113
118
  const report = {
114
119
  comparison: comparison ?? undefined,
115
120
  completedAt: now,
121
+ ...(degraded ? { degraded } : {}),
116
122
  durationMs,
117
123
  id: reportId,
118
124
  provenance,
@@ -192,6 +198,45 @@ export class PublishReportStep {
192
198
  // ---------------------------------------------------------------------------
193
199
  // Helpers
194
200
  // ---------------------------------------------------------------------------
201
/**
 * Decide whether a report has to be published with a degradation marker.
 *
 * A report is degraded when the run scored tests (the `testCount` values in
 * `summary.scores` add up to more than zero) yet `summary.testResults` is
 * missing or empty — the enrichment that supplies per-test details never
 * landed, so the report would carry a score but show no tests.
 *
 * @param summary The full score summary for the run.
 * @returns `undefined` for a healthy report, or for a run that scored no
 *   tests (an empty report is legitimate there). Otherwise a marker with
 *   `reason`, the list of `missing` enrichment surfaces, and a
 *   human-readable `detail` sentence.
 *
 * Exported for unit testing — production callers reach it via execute().
 */
export function detectReportDegradation(summary) {
    // Add up only numeric testCount values; anything else counts as zero.
    let scoredTestCount = 0;
    (summary.scores ?? []).forEach((area) => {
        if (typeof area.testCount === "number") {
            scoredTestCount += area.testCount;
        }
    });
    const testResultCount = summary.testResults?.length ?? 0;
    if (scoredTestCount === 0 || testResultCount > 0) {
        return undefined;
    }
    // `testResults` is the load-bearing signal and is always reported. The
    // other surfaces are best-effort detail: some are literacy-only (e.g.
    // documentManifest), so they may be listed for a degraded non-literacy
    // run even though that mode never produces them.
    const absentBySurface = {
        failureModes: !summary.failureModes,
        lowScoringJudgments: !summary.lowScoringJudgments?.length,
        documentManifest: !summary.documentManifest?.length,
        recommendations: !summary.recommendations,
    };
    const missing = ["testResults"];
    for (const [surface, absent] of Object.entries(absentBySurface)) {
        if (absent) {
            missing.push(surface);
        }
    }
    const detail = [
        `Evaluation scored ${scoredTestCount} test(s) but enrichment did not`,
        `complete; per-test details and failure analysis are unavailable for`,
        `this report.`,
    ].join(" ");
    return {
        reason: "enrichment-missing",
        missing,
        detail,
    };
}
195
240
  /**
196
241
  * Assemble provenance input from the score summary and pipeline context.
197
242
  *
@@ -41,6 +41,7 @@ import { resolveProfile } from "./profile-resolution.js";
41
41
  import { loadSource } from "../sources.js";
42
42
  import { LiteracyVariant } from "./normalize-mode.js";
43
43
  import { scoreTestGroup, } from "./compiler/scoring-bridge.js";
44
+ import { extractGraderJudgmentsResilient, } from "./extract-grader-judgments-resilient.js";
44
45
  // Re-export from core for backward compatibility.
45
46
  // Existing imports from this file continue to work unchanged.
46
47
  export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
@@ -321,6 +322,54 @@ export function extractGraderJudgments(resultsPath, telemetry) {
321
322
  }
322
323
  return judgments;
323
324
  }
325
/**
 * Light parse of a results file's entry count — diagnostics only. Reads and
 * parses the file directly rather than going through the full normalize
 * path.
 *
 * Two layouts are recognised:
 *   - nested: `{ results: { results: [...] } }`
 *   - flat:   `{ results: [...] }`
 *
 * The previous `file.results ?? file` unwrap could never count the flat
 * layout: there `file.results` is already the entries array, so the
 * subsequent `.results` lookup on it was always undefined and the function
 * reported 0.
 *
 * @param resultsPath Path of the results JSON file.
 * @returns Number of result entries, or 0 when the file is missing,
 *   unparseable, or matches neither layout.
 */
function countResultEntries(resultsPath) {
    try {
        const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
        // Flat layout: the top-level `results` is the entries array itself.
        if (Array.isArray(file?.results)) {
            return file.results.length;
        }
        // Nested layout: entries live one level down.
        const nested = file?.results?.results;
        return Array.isArray(nested) ? nested.length : 0;
    }
    catch {
        // Missing file or invalid JSON — a diagnostic count must never throw.
        return 0;
    }
}
340
/**
 * Count classifiable llm-rubric components in a results file — i.e. the number
 * of judgments a healthy `extractGraderJudgments` should produce. Used only to
 * set the severity of a persistent-empty extraction: a file with classifiable
 * components but 0 extracted judgments is an error; a file with none (all
 * api-errors / no llm-rubric) is a benign empty.
 *
 * Deliberately an independent count path (not `extractGraderJudgments`) so the
 * cross-check is meaningful.
 *
 * @param resultsPath Path of the results file to inspect.
 * @returns The number of components whose assertion type is `llm-rubric` and
 *   that `classifyRubric` accepts; 0 when the file is missing, cannot be
 *   read/normalized, or has no such components.
 */
function countClassifiableRubricComponents(resultsPath) {
    if (!existsSync(resultsPath))
        return 0;
    try {
        let classifiable = 0;
        for (const result of readAndNormalizeResults(resultsPath)) {
            // Tolerate entries without grading output instead of throwing.
            const components = result?.gradingResult?.componentResults ?? [];
            for (const comp of components) {
                if (comp?.assertion?.type === "llm-rubric" && classifyRubric(comp)) {
                    classifiable += 1;
                }
            }
        }
        return classifiable;
    }
    catch {
        // NOTE(review): readAndNormalizeResults is not visible here; it is
        // assumed it can throw on malformed content. This count only decides
        // log severity, so honour the documented "returns 0 when unparseable"
        // contract rather than aborting the scoring run.
        return 0;
    }
}
363
/**
 * Dependency bundle handed to `extractGraderJudgmentsResilient` at every
 * persist site in this module. Declared once so each call site passes the
 * same extractor and the same counting helpers.
 */
const resilientJudgmentDeps = {
    // Independent count of classifiable llm-rubric components.
    countClassifiable: countClassifiableRubricComponents,
    // Parsed result-entry count.
    countResults: countResultEntries,
    // The real judgment extractor defined in this module.
    extract: extractGraderJudgments,
};
324
373
  /**
325
374
  * Stamp every grader judgment with a D0049 ceiling-cross-check confidence
326
375
  * triple and increment `GraderReliability.failureModeCalibration` whenever
@@ -1494,7 +1543,7 @@ export async function calculateAndWriteScores(options) {
1494
1543
  writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
1495
1544
  log.info("Score summary written to results/latest/score-summary.json");
1496
1545
  // Extract and persist grader judgments
1497
- const judgments = extractGraderJudgments(baselineResultsPath);
1546
+ const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
1498
1547
  const borderlineConsistency = await runBorderlinePass(judgments, [
1499
1548
  baselineResultsPath,
1500
1549
  ]);
@@ -1557,7 +1606,7 @@ export async function calculateAndWriteScores(options) {
1557
1606
  mkdirSync(outDir, { recursive: true });
1558
1607
  writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
1559
1608
  log.info("Score summary written to results/latest/score-summary.json");
1560
- const judgments = extractGraderJudgments(baselineResultsPath);
1609
+ const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
1561
1610
  const borderlineConsistency = await runBorderlinePass(judgments, [
1562
1611
  baselineResultsPath,
1563
1612
  ]);
@@ -1687,18 +1736,14 @@ export async function calculateAndWriteScores(options) {
1687
1736
  // the ceiling-cross-check disagreement counter (`failureModeCalibration`)
1688
1737
  // is incremented during the post-extraction validation pass below.
1689
1738
  const reliability = { graderModel: "unknown" };
1690
- const judgments = extractGraderJudgments(baselineResultsPath, {
1691
- reliability,
1692
- ...(options.runId ? { runId: options.runId } : {}),
1693
- });
1694
- // In full mode, also extract judgments from agentic results
1695
- if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
1696
- const agenticJudgments = extractGraderJudgments(agenticResultsPath, {
1697
- reliability,
1698
- ...(options.runId ? { runId: options.runId } : {}),
1699
- });
1700
- judgments.push(...agenticJudgments);
1701
- }
1739
+ // Extract through the resilient wrapper so an empty result from the transient
1740
+ // read anomaly is instrumented and self-healed rather than silently skipping
1741
+ // the grader-judgments persist. In full mode both the baseline and agentic
1742
+ // result files are graded against the shared telemetry.
1743
+ const judgmentResultPaths = mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)
1744
+ ? [baselineResultsPath, agenticResultsPath]
1745
+ : [baselineResultsPath];
1746
+ const judgments = await extractGraderJudgmentsResilient(judgmentResultPaths, { reliability, ...(options.runId ? { runId: options.runId } : {}) }, log, { deps: resilientJudgmentDeps });
1702
1747
  // Borderline-consensus pass — re-grade the ±5 borderline subset N times
1703
1748
  // and merge medians back into the canonical judgments BEFORE
1704
1749
  // `validateGraderJudgmentsCalibration` runs, so the calibration counter
@@ -153,9 +153,13 @@ function mapTemplatedAssertion(assertion, options) {
153
153
  const result = {
154
154
  type: "llm-rubric",
155
155
  // The rubric prompt will be fully assembled by the PromptfooCompiler
156
- // using rubric templates. Here we pass the template ref + criteria
157
- // as metadata so the compiler can resolve it.
158
- value: `[template:${assertion.template}] ${assertion.criteria.join("; ")}`,
156
+ // using rubric templates. Here we pass the template ref + criteria as
157
+ // metadata so the compiler can resolve it. Criteria are either legacy
158
+ // bare strings or canonical `{ id, text }` objects — render the text of
159
+ // each (interpolating an object directly would emit "[object Object]").
160
+ value: `[template:${assertion.template}] ${assertion.criteria
161
+ .map((c) => (typeof c === "string" ? c : c.text))
162
+ .join("; ")}`,
159
163
  };
160
164
  if (assertion.weight !== undefined) {
161
165
  result.weight = assertion.weight;
@@ -13,7 +13,7 @@
13
13
  * @see docs/design-docs/two-stage-grader-symbol-preflight.md — W0198 Phase 6
14
14
  * @see config/rubrics.ts — template definitions
15
15
  */
16
- import type { PreflightRubricContext } from "../../_vendor/ailf-core/index.d.ts";
16
+ import type { CriterionRef, PreflightRubricContext } from "../../_vendor/ailf-core/index.d.ts";
17
17
  import type { PromptfooAssertion } from "./assertion-mapper.js";
18
18
  export type { PreflightRubricContext } from "../../_vendor/ailf-core/index.d.ts";
19
19
  /** Minimal rubric config needed for template resolution */
@@ -59,7 +59,7 @@ export interface RubricResolutionInput {
59
59
  * Returns null (with a warning) if the template can't be resolved.
60
60
  */
61
61
  export declare function resolveTemplatedAssertion(assertion: {
62
- criteria: string[];
62
+ criteria: (string | CriterionRef)[];
63
63
  template: string;
64
64
  type: string;
65
65
  }, rubricConfig: RubricResolutionInput | undefined, graderProvider: string | undefined, warnings: string[], canonicalReference?: string, preflightContext?: PreflightRubricContext): PromptfooAssertion | null;
@@ -45,7 +45,9 @@ export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvide
45
45
  return null;
46
46
  }
47
47
  const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
48
- const criteriaText = assertion.criteria.map((c) => `- ${c}`).join("\n");
48
+ const criteriaText = assertion.criteria
49
+ .map((c) => renderCriterion(c, assertion.template))
50
+ .join("\n");
49
51
  // W0198 Phase 6 — when the deterministic preflight lane is wired and this
50
52
  // rubric scores `code-correctness`, prefix a system instruction so the
51
53
  // grader does not re-judge symbol existence. The lane separation is the
@@ -79,6 +81,28 @@ export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvide
79
81
  : {}),
80
82
  };
81
83
  }
84
+ /**
85
+ * Render a single rubric criterion to its bullet line.
86
+ *
87
+ * Criteria come in two shapes: the canonical `CriterionRef` object
88
+ * (`{ id, text }`, used by Content Lake / API tasks) and a legacy bare
89
+ * string (still used by many repo `defineTask` task files). Both render to
90
+ * their text. Interpolating a `CriterionRef` object directly (`- ${c}`)
91
+ * silently renders `- [object Object]`, stripping the grader's acceptance
92
+ * criteria and degrading grade quality — that is the bug this guards. A
93
+ * criterion that yields no renderable text (e.g. an object missing `text`)
94
+ * fails loud at compile time rather than reaching the grader.
95
+ */
96
+ function renderCriterion(criterion, template) {
97
+ const text = typeof criterion === "string" ? criterion : criterion?.text;
98
+ if (typeof text !== "string" || text.trim() === "") {
99
+ throw new Error(`Rubric template "${template}" has a criterion with no renderable text ` +
100
+ `(received: ${JSON.stringify(criterion)?.slice(0, 160)}). Each criterion ` +
101
+ `must be a non-empty string or a { id, text } object; an object without ` +
102
+ `text renders "[object Object]" in the grader prompt.`);
103
+ }
104
+ return `- ${text}`;
105
+ }
82
106
  /**
83
107
  * Build the W0198 Phase 6 preflight preface for a `code-correctness`
84
108
  * rubric. Returned with a trailing newline so it composes cleanly with
@@ -0,0 +1,52 @@
1
+ /**
2
+ * pipeline/enrichment-preconditions.ts
3
+ *
4
+ * Classifies the inputs the post-scoring enrichment steps (gap-analysis,
5
+ * compute-attribution) depend on, so a missing `grader-judgments.json` can be
6
+ * told apart as either a legitimate skip (no graded eval ran this pipeline) or
7
+ * a degraded outcome (a full eval scored tests but no judgments persisted).
8
+ *
9
+ * The degraded case is the failure these steps must stop swallowing:
10
+ * `calculate-scores` wrote `score-summary.json` with per-area `testCount > 0`
11
+ * but never wrote `grader-judgments.json`, so the enrichment steps self-skip
12
+ * and the report ships with no test details while still showing a score.
13
+ * Distinguishing the two is what lets the steps fail loud instead of returning
14
+ * a benign `skipped`.
15
+ */
16
/**
 * Result of classifying the enrichment inputs under `results/latest/`,
 * discriminated by `kind`.
 */
export type EnrichmentInputs =
    /**
     * `grader-judgments.json` is present and non-empty; enrichment can run.
     */
    | {
        kind: "ready";
        judgmentCount: number;
    }
    /**
     * No graded eval produced judgments this run. A legitimate skip:
     * standalone gap-analysis on cached results, a non-graded run, or an
     * eval that scored nothing.
     */
    | {
        kind: "no-full-eval";
    }
    /**
     * A full eval scored tests (`score-summary.json` carries
     * `testCount > 0`) yet `grader-judgments.json` is missing or empty.
     * This is the degraded condition the steps surface.
     */
    | {
        kind: "judgments-missing-after-eval";
        scoredTestCount: number;
    };
37
+ /**
38
+ * Classify the enrichment inputs for a run by inspecting
39
+ * `results/latest/grader-judgments.json` and `score-summary.json`.
40
+ *
41
+ * Pure read-only filesystem inspection — never throws on malformed input; a
42
+ * file that does not parse to the expected shape is treated as absent so that
43
+ * "no usable judgments" and "no usable summary" both collapse to a single
44
+ * branch.
45
+ */
46
+ export declare function classifyEnrichmentInputs(rootDir: string): EnrichmentInputs;
47
+ /**
48
+ * Build the fail-loud error message for the degraded
49
+ * `judgments-missing-after-eval` case. Shared by the enrichment steps so the
50
+ * pipeline-result and job-document surfaces carry one consistent wording.
51
+ */
52
+ export declare function degradedEnrichmentError(step: string, scoredTestCount: number): string;
@@ -0,0 +1,84 @@
1
+ /**
2
+ * pipeline/enrichment-preconditions.ts
3
+ *
4
+ * Classifies the inputs the post-scoring enrichment steps (gap-analysis,
5
+ * compute-attribution) depend on, so a missing `grader-judgments.json` can be
6
+ * told apart as either a legitimate skip (no graded eval ran this pipeline) or
7
+ * a degraded outcome (a full eval scored tests but no judgments persisted).
8
+ *
9
+ * The degraded case is the failure these steps must stop swallowing:
10
+ * `calculate-scores` wrote `score-summary.json` with per-area `testCount > 0`
11
+ * but never wrote `grader-judgments.json`, so the enrichment steps self-skip
12
+ * and the report ships with no test details while still showing a score.
13
+ * Distinguishing the two is what lets the steps fail loud instead of returning
14
+ * a benign `skipped`.
15
+ */
16
+ import { existsSync, readFileSync } from "node:fs";
17
+ import { resolve } from "node:path";
18
/**
 * Classify the enrichment inputs of a run from the two files under
 * `results/latest/`: `grader-judgments.json` and `score-summary.json`.
 *
 * The judgments file is consulted first; the summary is only read when no
 * judgments were found.
 *
 * @param rootDir Project root that contains `results/latest/`.
 * @returns `ready` (with the judgment count) when judgments exist,
 *   `judgments-missing-after-eval` (with the scored test count) when the
 *   summary reports scored tests but there are no judgments, and
 *   `no-full-eval` otherwise.
 */
export function classifyEnrichmentInputs(rootDir) {
    const judgmentCount = countGraderJudgments(rootDir);
    if (judgmentCount > 0) {
        return { kind: "ready", judgmentCount };
    }
    // No usable judgments: the summary decides between skip and degraded.
    const scoredTestCount = scoredTestCountFromSummary(rootDir);
    return scoredTestCount > 0
        ? { kind: "judgments-missing-after-eval", scoredTestCount }
        : { kind: "no-full-eval" };
}
38
/**
 * Compose the error text for the degraded `judgments-missing-after-eval`
 * case, so every enrichment step reports it with identical wording.
 *
 * @param step            Name of the step raising the error; prefixes the message.
 * @param scoredTestCount Number of tests the eval scored.
 * @returns The complete single-line error message.
 */
export function degradedEnrichmentError(step, scoredTestCount) {
    const sentences = [
        `${step}: grader-judgments.json missing after a full eval — `,
        `${scoredTestCount} test(s) scored but 0 grader judgments persisted. `,
        `The report is marked degraded rather than published as healthy.`,
    ];
    return sentences.join("");
}
48
/**
 * Number of entries in `results/latest/grader-judgments.json`.
 *
 * Every "no usable judgments" shape maps to 0: file absent, unreadable,
 * invalid JSON, or JSON that is not an array. An empty array therefore looks
 * the same as a missing file — both mean no judgments were persisted.
 *
 * @param rootDir Project root that contains `results/latest/`.
 * @returns Length of the judgments array, or 0.
 */
function countGraderJudgments(rootDir) {
    const judgmentsFile = resolve(rootDir, "results", "latest", "grader-judgments.json");
    if (existsSync(judgmentsFile)) {
        try {
            const payload = JSON.parse(readFileSync(judgmentsFile, "utf-8"));
            if (Array.isArray(payload)) {
                return payload.length;
            }
        }
        catch {
            // Unreadable or invalid JSON: fall through to the shared 0.
        }
    }
    return 0;
}
67
/**
 * Total of the `testCount` values in the `scores` array of
 * `results/latest/score-summary.json` — the signal that an eval scored
 * tests this run.
 *
 * Only numeric `testCount` values are added. Any failure while reading,
 * parsing, or walking the entries (including an entry that is not an
 * object) yields 0 for the whole summary.
 *
 * @param rootDir Project root that contains `results/latest/`.
 * @returns The summed test count, or 0 when the summary is absent,
 *   unusable, or carries no scored tests.
 */
function scoredTestCountFromSummary(rootDir) {
    const summaryFile = resolve(rootDir, "results", "latest", "score-summary.json");
    if (!existsSync(summaryFile)) {
        return 0;
    }
    let total = 0;
    try {
        const summary = JSON.parse(readFileSync(summaryFile, "utf-8"));
        if (Array.isArray(summary.scores)) {
            summary.scores.forEach((area) => {
                if (typeof area.testCount === "number") {
                    total += area.testCount;
                }
            });
        }
    }
    catch {
        // A summary that cannot be consumed end-to-end is treated as absent.
        return 0;
    }
    return total;
}
@@ -0,0 +1,88 @@
1
+ /**
2
+ * pipeline/extract-grader-judgments-resilient.ts
3
+ *
4
+ * Resilient grader-judgment extraction for the `calculate-scores` persist
5
+ * junction.
6
+ *
7
+ * Background: `calculateAndWriteScores` extracts grader judgments from the
8
+ * eval results file(s), then writes `grader-judgments.json` only when the
9
+ * array is non-empty (the `judgments.length > 0` guard). A runtime anomaly was
10
+ * observed where `extractGraderJudgments` returned 0 judgments for a results
11
+ * file that demonstrably contained classifiable llm-rubric components — the
12
+ * same file, read tens of milliseconds later by `extractStoredTestResults`,
13
+ * yielded the full set (entries with populated dimensions). The empty array
14
+ * silently skipped the write, so gap-analysis and compute-attribution skipped
15
+ * and the report shipped with a score but no tests.
16
+ *
17
+ * The committed code reads the file via a pure `readFileSync` with identical
18
+ * classification on both sides, so the divergence is not reproducible from the
19
+ * source + captured artifacts — it is a transient read anomaly at the live
20
+ * junction. This wrapper does not pretend to know the mechanism; it makes the
21
+ * junction observable and recovers from the transient:
22
+ *
23
+ * 1. **Instruments** — logs the resolved path(s), file size/mtime, parsed
24
+ * result count, and judgment count on every run (never silent on 0), so a
25
+ * future empty-judgments persist is diagnosable from the run log alone.
26
+ * 2. **Self-heals** — when extraction yields 0 judgments but a results file
27
+ * exists, it re-extracts with bounded retries. A later read that yields
28
+ * judgments proves the initial 0 was transient; the recovered judgments
29
+ * are returned. If every attempt yields 0, severity is decided by an
30
+ * independent classifiable-component count: a genuinely judgment-free run
31
+ * (all api-errors / no llm-rubric) logs a warning, while 0 judgments
32
+ * against present classifiable components logs an error (the downstream
33
+ * gap-analysis fail-loud guard is the backstop).
34
+ *
35
+ * The extractor and fs helpers are injected so the wrapper is unit-testable
36
+ * without importing the ~3000-line scoring module (which would be circular)
37
+ * or touching the real filesystem.
38
+ */
39
+ import type { GraderJudgment, GraderReliability, Logger } from "../_vendor/ailf-core/index.d.ts";
40
+ /** Telemetry sink threaded into each extraction (shared reliability counters). */
41
+ export interface ExtractionTelemetry {
42
+ reliability: GraderReliability;
43
+ runId?: string;
44
+ }
45
/**
 * Cheap on-disk stat used for diagnostics and the retry gate: existence,
 * size, and modification time of a single file.
 */
export interface FileStat {
    /** Whether the file is present on disk. */
    exists: boolean;
    /** Modification time (the field name indicates milliseconds). */
    mtimeMs: number;
    /** File size as reported by the stat helper. */
    size: number;
}
51
+ /** Injectable seams — defaults wire the real fs; tests substitute fakes. */
52
+ export interface ResilientExtractionDeps {
53
+ /**
54
+ * The real `extractGraderJudgments`. Injected (rather than imported) to
55
+ * avoid a circular dependency with `calculate-scores.ts`.
56
+ */
57
+ extract: (path: string, telemetry?: ExtractionTelemetry) => GraderJudgment[];
58
+ /** Parsed result-entry count for a path — diagnostics only. */
59
+ countResults?: (path: string) => number;
60
+ /**
61
+ * Count of classifiable llm-rubric components for a path. Used only to set
62
+ * the severity of a persistent-empty extraction: a file with classifiable
63
+ * components but 0 extracted judgments is an error; a file with none (all
64
+ * api-errors / no llm-rubric) is a benign empty.
65
+ */
66
+ countClassifiable?: (path: string) => number;
67
+ /** On-disk stat (existence + size + mtime). */
68
+ statFile?: (path: string) => FileStat;
69
+ /** Backoff between self-heal attempts. */
70
+ sleep?: (ms: number) => Promise<void>;
71
+ }
72
+ export interface ResilientExtractionOptions {
73
+ /** Total extraction attempts when the first yields 0 (default 3, min 1). */
74
+ maxAttempts?: number;
75
+ /** Delay before each retry, in ms (default 200). */
76
+ delayMs?: number;
77
+ deps: ResilientExtractionDeps;
78
+ }
79
+ /**
80
+ * Extract grader judgments across one or more results files, instrumented and
81
+ * self-healing. See the module header for the rationale.
82
+ *
83
+ * @param resultsPaths One or more results files (e.g. baseline + agentic in
84
+ * literacy full mode). Missing paths are skipped.
85
+ * @param telemetry Shared reliability sink threaded into every extraction.
86
+ * @param log Pipeline logger — the junction is logged here on every run.
87
+ */
88
+ export declare function extractGraderJudgmentsResilient(resultsPaths: readonly string[], telemetry: ExtractionTelemetry | undefined, log: Logger, options: ResilientExtractionOptions): Promise<GraderJudgment[]>;
@@ -0,0 +1,122 @@
1
+ /**
2
+ * pipeline/extract-grader-judgments-resilient.ts
3
+ *
4
+ * Resilient grader-judgment extraction for the `calculate-scores` persist
5
+ * junction.
6
+ *
7
+ * Background: `calculateAndWriteScores` extracts grader judgments from the
8
+ * eval results file(s), then writes `grader-judgments.json` only when the
9
+ * array is non-empty (the `judgments.length > 0` guard). A runtime anomaly was
10
+ * observed where `extractGraderJudgments` returned 0 judgments for a results
11
+ * file that demonstrably contained classifiable llm-rubric components — the
12
+ * same file, read tens of milliseconds later by `extractStoredTestResults`,
13
+ * yielded the full set (entries with populated dimensions). The empty array
14
+ * silently skipped the write, so gap-analysis and compute-attribution skipped
15
+ * and the report shipped with a score but no tests.
16
+ *
17
+ * The committed code reads the file via a pure `readFileSync` with identical
18
+ * classification on both sides, so the divergence is not reproducible from the
19
+ * source + captured artifacts — it is a transient read anomaly at the live
20
+ * junction. This wrapper does not pretend to know the mechanism; it makes the
21
+ * junction observable and recovers from the transient:
22
+ *
23
+ * 1. **Instruments** — logs the resolved path(s), file size/mtime, parsed
24
+ * result count, and judgment count on every run (never silent on 0), so a
25
+ * future empty-judgments persist is diagnosable from the run log alone.
26
+ * 2. **Self-heals** — when extraction yields 0 judgments but a results file
27
+ * exists, it re-extracts with bounded retries. A later read that yields
28
+ * judgments proves the initial 0 was transient; the recovered judgments
29
+ * are returned. If every attempt yields 0, severity is decided by an
30
+ * independent classifiable-component count: a genuinely judgment-free run
31
+ * (all api-errors / no llm-rubric) logs a warning, while 0 judgments
32
+ * against present classifiable components logs an error (the downstream
33
+ * gap-analysis fail-loud guard is the backstop).
34
+ *
35
+ * The extractor and fs helpers are injected so the wrapper is unit-testable
36
+ * without importing the ~3000-line scoring module (which would be circular)
37
+ * or touching the real filesystem.
38
+ */
39
+ import { existsSync, statSync } from "node:fs";
40
/**
 * Default `statFile` implementation backed by the real filesystem.
 *
 * Performs a single `statSync` call instead of `existsSync` followed by
 * `statSync`: the two-step form throws if the file disappears between the
 * existence check and the stat. Any stat failure (missing file, unreadable
 * path) is reported as "does not exist" with zeroed size and mtime.
 *
 * @param path Path of the file to inspect.
 * @returns `{ exists, mtimeMs, size }` describing the path.
 */
const defaultStat = (path) => {
    try {
        const s = statSync(path);
        return { exists: true, mtimeMs: s.mtimeMs, size: s.size };
    }
    catch {
        // A path that cannot be stat'ed is treated as absent rather than fatal.
        return { exists: false, mtimeMs: 0, size: 0 };
    }
};
46
/** Default `sleep`: a promise that settles once a `setTimeout` of `ms` milliseconds fires. */
const defaultSleep = (ms) => new Promise((wake) => {
    setTimeout(wake, ms);
});
47
+ /**
48
+ * Extract grader judgments across one or more results files, instrumented and
49
+ * self-healing. See the module header for the rationale.
50
+ *
51
+ * @param resultsPaths One or more results files (e.g. baseline + agentic in
52
+ * literacy full mode). Missing paths are skipped.
53
+ * @param telemetry Shared reliability sink threaded into every extraction.
54
+ * @param log Pipeline logger — the junction is logged here on every run.
55
+ */
56
+ export async function extractGraderJudgmentsResilient(resultsPaths, telemetry, log, options) {
57
+ const { extract } = options.deps;
58
+ const statFile = options.deps.statFile ?? defaultStat;
59
+ const sleep = options.deps.sleep ?? defaultSleep;
60
+ const { countResults, countClassifiable } = options.deps;
61
+ const maxAttempts = Math.max(1, options.maxAttempts ?? 3);
62
+ const delayMs = options.delayMs ?? 200;
63
+ const present = resultsPaths.filter((p) => statFile(p).exists);
64
+ const extractAll = () => {
65
+ const all = [];
66
+ for (const p of present) {
67
+ all.push(...extract(p, telemetry));
68
+ }
69
+ return all;
70
+ };
71
+ const diag = (path) => {
72
+ const st = statFile(path);
73
+ return {
74
+ mtimeMs: st.mtimeMs,
75
+ path,
76
+ sizeBytes: st.size,
77
+ ...(countResults ? { resultCount: countResults(path) } : {}),
78
+ };
79
+ };
80
+ // Attempt 1 — always instrument the junction so 0 is never silent.
81
+ let judgments = extractAll();
82
+ for (const p of present) {
83
+ log.info("Grader judgments — persist junction read", diag(p));
84
+ }
85
+ log.info(`Grader judgments extracted: ${judgments.length} total`, {
86
+ judgmentCount: judgments.length,
87
+ paths: present,
88
+ });
89
+ if (judgments.length > 0)
90
+ return judgments;
91
+ // 0 judgments and no results file present → genuinely nothing to grade.
92
+ // A missing file cannot become non-empty within the retry window.
93
+ if (present.length === 0) {
94
+ log.info("No grader judgments — no results file present (nothing to grade)");
95
+ return judgments;
96
+ }
97
+ // Results file(s) exist but extraction yielded 0 judgments — a suspected
98
+ // transient read anomaly. Loud diagnostic, then bounded self-heal retries;
99
+ // the same file read tens of ms later has been observed to yield the full set.
100
+ log.warn("Grader extraction returned 0 judgments despite present results file(s) — suspected transient read anomaly; attempting self-heal", { paths: present.map(diag) });
101
+ for (let attempt = 2; attempt <= maxAttempts; attempt++) {
102
+ await sleep(delayMs);
103
+ judgments = extractAll();
104
+ log.warn(`Grader self-heal attempt ${attempt}/${maxAttempts}: ${judgments.length} judgment(s)`);
105
+ if (judgments.length > 0) {
106
+ log.warn(`Grader self-heal recovered ${judgments.length} grader judgment(s) on attempt ${attempt} — the initial empty extraction was a transient read anomaly`);
107
+ return judgments;
108
+ }
109
+ }
110
+ // Still empty after every attempt. Severity depends on whether the files
111
+ // actually contain classifiable components.
112
+ const classifiable = countClassifiable
113
+ ? present.reduce((n, p) => n + countClassifiable(p), 0)
114
+ : undefined;
115
+ if (classifiable === 0) {
116
+ log.warn(`No grader judgments after ${maxAttempts} attempt(s) — results contain no classifiable llm-rubric components (e.g. all api-errors); nothing to persist`);
117
+ }
118
+ else {
119
+ log.error(`Grader judgments empty after ${maxAttempts} attempt(s) but ${classifiable ?? "an unknown number of"} classifiable component(s) present in the results file(s) — persisting none; downstream gap-analysis/attribution will fail loud`, { paths: present.map(diag) });
120
+ }
121
+ return judgments;
122
+ }
@@ -216,6 +216,7 @@ export interface SanityReportDoc {
216
216
  _type: string;
217
217
  comparison: null | Omit<ComparisonReport, "baseline" | "experiment">;
218
218
  completedAt: string;
219
+ degraded?: Report["degraded"];
219
220
  durationMs: number;
220
221
  provenance: Report["provenance"];
221
222
  reportId: ReportId;
@@ -477,6 +477,7 @@ export function toSanityReportDoc(report) {
477
477
  _type: REPORT_TYPE,
478
478
  comparison,
479
479
  completedAt: report.completedAt,
480
+ ...(report.degraded ? { degraded: report.degraded } : {}),
480
481
  durationMs: report.durationMs,
481
482
  provenance: report.provenance,
482
483
  reportId: report.id,
@@ -526,6 +527,7 @@ export function toReport(doc) {
526
527
  artifactManifest,
527
528
  comparison: doc.comparison,
528
529
  completedAt: doc.completedAt,
530
+ degraded: doc.degraded,
529
531
  durationMs: doc.durationMs,
530
532
  id: doc.reportId,
531
533
  provenance: doc.provenance,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "7.1.2",
3
+ "version": "7.2.1",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"