@sanity/ailf 7.1.2 → 7.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/index.js +4 -0
- package/dist/_vendor/ailf-core/schemas/report.d.ts +11 -0
- package/dist/_vendor/ailf-core/schemas/report.js +14 -0
- package/dist/_vendor/ailf-core/schemas/user.d.ts +22 -0
- package/dist/_vendor/ailf-core/schemas/user.js +23 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +29 -0
- package/dist/_vendor/ailf-core/types/index.js +13 -0
- package/dist/_vendor/ailf-core/types/user.d.ts +49 -0
- package/dist/_vendor/ailf-core/types/user.js +1 -0
- package/dist/_vendor/ailf-shared/generated/help-content.js +26 -14
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +2 -2
- package/dist/orchestration/steps/compute-attribution-step.d.ts +2 -2
- package/dist/orchestration/steps/compute-attribution-step.js +17 -2
- package/dist/orchestration/steps/gap-analysis-step.d.ts +2 -2
- package/dist/orchestration/steps/gap-analysis-step.js +20 -2
- package/dist/orchestration/steps/publish-report-step.d.ts +15 -1
- package/dist/orchestration/steps/publish-report-step.js +45 -0
- package/dist/pipeline/calculate-scores.js +59 -14
- package/dist/pipeline/enrichment-preconditions.d.ts +52 -0
- package/dist/pipeline/enrichment-preconditions.js +84 -0
- package/dist/pipeline/extract-grader-judgments-resilient.d.ts +88 -0
- package/dist/pipeline/extract-grader-judgments-resilient.js +122 -0
- package/dist/report-store.d.ts +1 -0
- package/dist/report-store.js +2 -0
- package/package.json +1 -1
|
@@ -28,3 +28,7 @@ export { ConfidenceSchema } from "./confidence-schema.js";
|
|
|
28
28
|
// helper instead of replicating `as unknown as z.ZodType<…>` at each
|
|
29
29
|
// schema author site (project rule: no `as` on `unknown`).
|
|
30
30
|
export { brandedString } from "./branded-string.js";
|
|
31
|
+
// User-preferences subsystem (W0302). Named export — not `export *` — because
|
|
32
|
+
// the schema file re-exports the `AilfUser` domain type, and a star re-export
|
|
33
|
+
// would surface that type through two paths (W0124 DTS ambiguity).
|
|
34
|
+
export { AilfUserSchema } from "./user.js";
|
|
@@ -258,6 +258,17 @@ export declare const ReportSchema: z.ZodObject<{
|
|
|
258
258
|
artifactManifest: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
259
259
|
tag: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
260
260
|
title: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
261
|
+
degraded: z.ZodOptional<z.ZodObject<{
|
|
262
|
+
reason: z.ZodLiteral<"enrichment-missing">;
|
|
263
|
+
missing: z.ZodArray<z.ZodEnum<{
|
|
264
|
+
documentManifest: "documentManifest";
|
|
265
|
+
failureModes: "failureModes";
|
|
266
|
+
lowScoringJudgments: "lowScoringJudgments";
|
|
267
|
+
recommendations: "recommendations";
|
|
268
|
+
testResults: "testResults";
|
|
269
|
+
}>>;
|
|
270
|
+
detail: z.ZodString;
|
|
271
|
+
}, z.core.$strict>>;
|
|
261
272
|
}, z.core.$loose>;
|
|
262
273
|
export type ReportSchemaInput = z.input<typeof ReportSchema>;
|
|
263
274
|
export type ReportSchemaOutput = z.infer<typeof ReportSchema>;
|
|
@@ -25,6 +25,7 @@
|
|
|
25
25
|
*/
|
|
26
26
|
import { z } from "zod";
|
|
27
27
|
import { LITERACY_VARIANTS } from "../../ailf-shared/index.js";
|
|
28
|
+
import { DEGRADED_ENRICHMENT_FIELDS } from "../types/index.js";
|
|
28
29
|
// ---------------------------------------------------------------------------
|
|
29
30
|
// RunContext building blocks (mirrors packages/shared/src/run-context.ts)
|
|
30
31
|
// ---------------------------------------------------------------------------
|
|
@@ -233,5 +234,18 @@ export const ReportSchema = z
|
|
|
233
234
|
// `title: report.title ?? null`, so the schema accepts null on both.
|
|
234
235
|
tag: z.string().nullable().optional(),
|
|
235
236
|
title: z.string().nullable().optional(),
|
|
237
|
+
// Degraded marker (mirrors `ReportDegradation`): present only when a full
|
|
238
|
+
// eval scored tests but enrichment did not complete. Strict — unknown
|
|
239
|
+
// keys here signal real drift.
|
|
240
|
+
degraded: z
|
|
241
|
+
.object({
|
|
242
|
+
reason: z.literal("enrichment-missing"),
|
|
243
|
+
// Enum derived from the canonical DegradedEnrichmentField tuple so the
|
|
244
|
+
// schema cannot drift from the core type.
|
|
245
|
+
missing: z.array(z.enum(DEGRADED_ENRICHMENT_FIELDS)),
|
|
246
|
+
detail: z.string().min(1),
|
|
247
|
+
})
|
|
248
|
+
.strict()
|
|
249
|
+
.optional(),
|
|
236
250
|
})
|
|
237
251
|
.passthrough();
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
export declare const AilfUserSchema: z.ZodObject<{
|
|
3
|
+
_id: z.ZodString;
|
|
4
|
+
_type: z.ZodLiteral<"ailf.user">;
|
|
5
|
+
sanityUserId: z.ZodString;
|
|
6
|
+
email: z.ZodString;
|
|
7
|
+
displayName: z.ZodOptional<z.ZodString>;
|
|
8
|
+
teams: z.ZodArray<z.ZodObject<{
|
|
9
|
+
_type: z.ZodLiteral<"reference">;
|
|
10
|
+
_ref: z.ZodString;
|
|
11
|
+
_key: z.ZodOptional<z.ZodString>;
|
|
12
|
+
}, z.core.$strip>>;
|
|
13
|
+
preferences: z.ZodObject<{
|
|
14
|
+
primaryTeam: z.ZodOptional<z.ZodObject<{
|
|
15
|
+
_type: z.ZodLiteral<"reference">;
|
|
16
|
+
_ref: z.ZodString;
|
|
17
|
+
_key: z.ZodOptional<z.ZodString>;
|
|
18
|
+
}, z.core.$strip>>;
|
|
19
|
+
}, z.core.$strip>;
|
|
20
|
+
updatedAt: z.ZodString;
|
|
21
|
+
}, z.core.$strip>;
|
|
22
|
+
export type { AilfUser } from "../types/user.js";
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
// `_id` is constructed as `ailf.user.${CurrentUser.id}` at write time. The
|
|
3
|
+
// account id segment is opaque (may contain `|`, `.`, etc. for SSO providers),
|
|
4
|
+
// so the prefix is all we constrain here. The deterministic-id invariant
|
|
5
|
+
// (`_id === ailf.user.${sanityUserId}`) is enforced on the write path.
|
|
6
|
+
const USER_ID_REGEX = /^ailf\.user\..+$/;
|
|
7
|
+
const TeamReferenceSchema = z.object({
|
|
8
|
+
_type: z.literal("reference"),
|
|
9
|
+
_ref: z.string().min(1),
|
|
10
|
+
_key: z.string().optional(),
|
|
11
|
+
});
|
|
12
|
+
export const AilfUserSchema = z.object({
|
|
13
|
+
_id: z.string().regex(USER_ID_REGEX),
|
|
14
|
+
_type: z.literal("ailf.user"),
|
|
15
|
+
sanityUserId: z.string().min(1),
|
|
16
|
+
email: z.string().email(),
|
|
17
|
+
displayName: z.string().optional(),
|
|
18
|
+
teams: z.array(TeamReferenceSchema),
|
|
19
|
+
preferences: z.object({
|
|
20
|
+
primaryTeam: TeamReferenceSchema.optional(),
|
|
21
|
+
}),
|
|
22
|
+
updatedAt: z.string().datetime(),
|
|
23
|
+
});
|
|
@@ -42,6 +42,7 @@ export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "./at
|
|
|
42
42
|
export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderEmittedJudgment, GraderJudgment, } from "./grader-judgment.js";
|
|
43
43
|
export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
|
|
44
44
|
export type { BaseChannel, ChannelScope, EmailChannel, EventType, KnownEventType, KnownMemberRole, MemberRole, NotificationChannel, NotificationChannelType, SlackChannel, Team, TeamId, TeamMember, TeamRef, TeamSlug, TeamStatus, WebhookChannel, } from "./team.js";
|
|
45
|
+
export type { AilfUser, AilfUserPreferences, TeamReference } from "./user.js";
|
|
45
46
|
type DocumentRef = _DocumentRef;
|
|
46
47
|
/** Aggregated retrieval metrics for a feature area */
|
|
47
48
|
export interface AreaRetrievalMetrics {
|
|
@@ -1488,8 +1489,36 @@ export interface ArtifactRef {
|
|
|
1488
1489
|
* two becomes a compile error (W0049 review finding C1).
|
|
1489
1490
|
*/
|
|
1490
1491
|
export type ArtifactManifest = Partial<Record<ArtifactType, ArtifactRef>>;
|
|
1492
|
+
/**
|
|
1493
|
+
* Enrichment surfaces that gap-analysis writes onto a report. When a full
|
|
1494
|
+
* eval scores tests but these are absent, the report renders as "no tests"
|
|
1495
|
+
* despite carrying a score — the degraded condition `ReportDegradation`
|
|
1496
|
+
* records.
|
|
1497
|
+
*/
|
|
1498
|
+
export declare const DEGRADED_ENRICHMENT_FIELDS: readonly ["documentManifest", "failureModes", "lowScoringJudgments", "recommendations", "testResults"];
|
|
1499
|
+
export type DegradedEnrichmentField = (typeof DEGRADED_ENRICHMENT_FIELDS)[number];
|
|
1500
|
+
/**
|
|
1501
|
+
* Marks a published report as degraded: the eval ran and scored tests, but
|
|
1502
|
+
* one or more enrichment surfaces never landed (e.g. gap-analysis skipped
|
|
1503
|
+
* because `grader-judgments.json` was missing). Present so the dashboard and
|
|
1504
|
+
* Studio can show "enrichment failed" rather than a misleading empty
|
|
1505
|
+
* "no tests" state on a report that still has a score.
|
|
1506
|
+
*/
|
|
1507
|
+
export interface ReportDegradation {
|
|
1508
|
+
/** Why the report is degraded. Single-variant union, widen as needed. */
|
|
1509
|
+
reason: "enrichment-missing";
|
|
1510
|
+
/** Enrichment surfaces absent on this report despite a full eval. */
|
|
1511
|
+
missing: DegradedEnrichmentField[];
|
|
1512
|
+
/** Human-readable explanation for dashboard / Studio empty-state copy. */
|
|
1513
|
+
detail: string;
|
|
1514
|
+
}
|
|
1491
1515
|
/** A published evaluation report — the atomic unit of the report store */
|
|
1492
1516
|
export interface Report {
|
|
1517
|
+
/**
|
|
1518
|
+
* Set when the report is published in a degraded state — a full eval
|
|
1519
|
+
* scored tests but enrichment did not complete. Absent on healthy reports.
|
|
1520
|
+
*/
|
|
1521
|
+
degraded?: ReportDegradation;
|
|
1493
1522
|
/**
|
|
1494
1523
|
* Snapshot of the run manifest's `artifacts` slice at publish time (D0032).
|
|
1495
1524
|
* The source of truth lives in `gs://…/runs/{runId}/manifest.json`; this
|
|
@@ -42,3 +42,16 @@ export function isLegacyFailureMode(mode) {
|
|
|
42
42
|
* that imports it from @sanity/ailf-core.
|
|
43
43
|
*/
|
|
44
44
|
export { NOISE_THRESHOLD as DEFAULT_NOISE_THRESHOLD } from "../../ailf-shared/index.js";
|
|
45
|
+
/**
|
|
46
|
+
* Enrichment surfaces that gap-analysis writes onto a report. When a full
|
|
47
|
+
* eval scores tests but these are absent, the report renders as "no tests"
|
|
48
|
+
* despite carrying a score — the degraded condition `ReportDegradation`
|
|
49
|
+
* records.
|
|
50
|
+
*/
|
|
51
|
+
export const DEGRADED_ENRICHMENT_FIELDS = [
|
|
52
|
+
"documentManifest",
|
|
53
|
+
"failureModes",
|
|
54
|
+
"lowScoringJudgments",
|
|
55
|
+
"recommendations",
|
|
56
|
+
"testResults",
|
|
57
|
+
];
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* A Sanity reference to an `ailf.team` document.
|
|
3
|
+
*
|
|
4
|
+
* Members of an array (`AilfUser.teams[]`) carry a `_key`; the single-valued
|
|
5
|
+
* `preferences.primaryTeam` does not. The team slug downstream consumers need
|
|
6
|
+
* is a derived, read-time value from a GROQ projection — never stored here.
|
|
7
|
+
*/
|
|
8
|
+
export interface TeamReference {
|
|
9
|
+
_type: "reference";
|
|
10
|
+
_ref: string;
|
|
11
|
+
_key?: string;
|
|
12
|
+
}
|
|
13
|
+
/**
|
|
14
|
+
* Per-user UI preferences. Room to grow (default view, density, …) — kept
|
|
15
|
+
* minimal for v0 (YAGNI).
|
|
16
|
+
*/
|
|
17
|
+
export interface AilfUserPreferences {
|
|
18
|
+
/**
|
|
19
|
+
* Reference to the user's default team — one of `AilfUser.teams[]`. Distinct
|
|
20
|
+
* from `teams[]` so "which team's view do I default to" can differ from "all
|
|
21
|
+
* teams I affiliate with". The slug is derived in GROQ at read time.
|
|
22
|
+
*/
|
|
23
|
+
primaryTeam?: TeamReference;
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Per-account user document — one per Sanity account, keyed by a deterministic
|
|
27
|
+
* `_id` of `ailf.user.${sanityUserId}`. Stores self-declared team affiliation
|
|
28
|
+
* (references to `ailf.team`) plus UI preferences, and is the primary source
|
|
29
|
+
* for dashboard personalization. Stores minimal PII: `sanityUserId`, `email`,
|
|
30
|
+
* and `displayName` only.
|
|
31
|
+
*
|
|
32
|
+
* @see docs/design-docs/user-settings.md
|
|
33
|
+
*/
|
|
34
|
+
export interface AilfUser {
|
|
35
|
+
/** Deterministic: `ailf.user.${sanityUserId}`. */
|
|
36
|
+
_id: string;
|
|
37
|
+
_type: "ailf.user";
|
|
38
|
+
/** `CurrentUser.id` — the stable, globally-unique key, mirrored for GROQ. */
|
|
39
|
+
sanityUserId: string;
|
|
40
|
+
/** Denormalized for display / joins (lowercased at write time). */
|
|
41
|
+
email: string;
|
|
42
|
+
/** `CurrentUser.name` snapshot. */
|
|
43
|
+
displayName?: string;
|
|
44
|
+
/** Self-declared affiliation — drives personalization only. */
|
|
45
|
+
teams: TeamReference[];
|
|
46
|
+
preferences: AilfUserPreferences;
|
|
47
|
+
/** ISO 8601 UTC — stamped on each save. */
|
|
48
|
+
updatedAt: string;
|
|
49
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -44,6 +44,17 @@ export const HELP_TOPICS = [
|
|
|
44
44
|
"scoring-model"
|
|
45
45
|
]
|
|
46
46
|
},
|
|
47
|
+
{
|
|
48
|
+
"id": "failure-modes",
|
|
49
|
+
"title": "Failure Modes",
|
|
50
|
+
"body": "## What this view is for\n\nThe Recommendations view tells you which fixes to make. This view tells you what\nkind of problem you have. It groups the run's weaknesses by the documentation\nissue behind them, so you can see patterns across the whole evaluation rather\nthan one fix at a time. If most of your weak spots are the same kind of problem,\nthat is a signal about how to spend your docs effort.\n\n## What you are looking at\n\nRecent reports show **interpretive cards** drawn from the run's diagnosis:\n\n- **Weakest area** names the single feature area dragging the score down most,\n the failure mode behind it, and a confidence level with the sample size, so\n you know how strong the signal is.\n- **Failure mode** highlights one category of problem, which scoring dimension\n it shows up in, and how often it occurred across the tests that were checked.\n- **Area summary** gives a plain-language read on how an area is doing and why.\n\nOlder reports show a **category breakdown** instead. Each failure category is a\nchip with a count. Selecting a chip lists the gaps in that category, and each\ngap shows an estimated score lift if fixed, a confidence level, a short\nremediation note, and the specific tasks that exposed it. You can click a task\nto jump to it.\n\n## The failure modes\n\nEach weakness is sorted into one of these categories. The category is the\nfastest way to know what kind of work the fix needs:\n\n- **Missing docs**: the doc the model needed does not exist or is not indexed.\n The fix is to write new documentation.\n- **Incorrect docs**: a doc has a factual error or a wrong example. The fix is\n to correct it.\n- **Outdated docs**: a doc exists but reflects a previous API surface. The fix\n is to bring it up to date.\n- **Poor structure**: the information is correct but hard for an agent to find\n or skim. 
The fix is to reorganize or clarify.\n- **Model limitation**: the model struggles even with correct docs available.\n This is not a documentation problem, so treat it as context rather than a\n to-do.\n- **Unclassified**: the run could not categorize the weakness. Use the linked\n tasks and the grader's notes to judge it yourself.\n\nDepending on the evaluation mode you may see additional categories, including\nones specific to agent behavior such as tool misuse or missing error handling.\n\n## How to use it\n\nStart with the category that has the most gaps or the highest combined lift. The\ncategory tells you the shape of the work before you open a single page: write,\ncorrect, update, or restructure. Categories that are not documentation problems,\nsuch as model limitation, are worth noting but are not yours to fix in the docs.\n\n## Related views\n\n- **Recommendations** turns these weaknesses into a ranked list of specific\n edits.\n- **Low-scoring judgments** shows the grader's raw notes on the tests that\n scored lowest, which is the most granular signal behind any failure mode.\n\n## When this view is empty\n\nIf a report shows no failure modes, the evaluation either classified nothing\nworth flagging or the run predates this view. A clean result here usually means\nthe docs held up across the evaluated tasks.",
|
|
51
|
+
"source": "docs/help/failure-modes.md",
|
|
52
|
+
"related": [
|
|
53
|
+
"recommendations",
|
|
54
|
+
"scoring-model",
|
|
55
|
+
"negative-doc-lift"
|
|
56
|
+
]
|
|
57
|
+
},
|
|
47
58
|
{
|
|
48
59
|
"id": "getting-started",
|
|
49
60
|
"title": "Getting Started",
|
|
@@ -57,11 +68,12 @@ export const HELP_TOPICS = [
|
|
|
57
68
|
{
|
|
58
69
|
"id": "interpreting-diagnostics",
|
|
59
70
|
"title": "Interpreting Diagnostics",
|
|
60
|
-
"body": "##
|
|
71
|
+
"body": "## Reading the health of your docs\n\nA report scores each feature area on how well your documentation lets AI coding\ntools implement that feature. Reading those scores well is what turns a number\ninto a plan: it tells you where the docs are working, where they are not, and\nwhat kind of problem you are dealing with.\n\n## Health bands\n\nEach area's score falls into one of three bands:\n\n- **Strong (80 and above)**: docs are working well. Agents produce correct,\n complete implementations. No action needed unless you see a regression.\n- **Needs attention (70 to 79)**: docs are okay but have gaps. A specific\n dimension such as code correctness or doc coverage may be dragging the score\n down. Worth investigating.\n- **Weak (below 70)**: docs are not providing enough support. Agents\n consistently struggle with these features. These need priority attention.\n\n## Strong areas are signal too\n\nIt is easy to focus only on what is broken, but the strong areas are worth\nreading. They show what good looks like in your docs: clear structure, accurate\nexamples, the patterns agents can follow. When you fix a weak area, that is the\nbar to copy.\n\n## Key diagnostic signals\n\nA low score has a reason behind it. 
These signals tell you which reason, and\nwhat to do about it:\n\n| Signal | What it means | What to do |\n| ------------------------------ | ------------------------------------------- | ---------------------------------------- |\n| **Negative doc lift** | Docs are worse than no docs | Rewrite or remove the offending docs |\n| **Large retrieval gap** | Good docs exist but agents cannot find them | Improve page titles, metadata, structure |\n| **Low code correctness** | Agents find the docs but produce bad code | Add or fix code examples |\n| **Low doc coverage** | The docs do not cover what the task needs | Write new documentation |\n| **Efficiency anomaly (>100%)** | Agents do better without the docs | Injected docs may be confusing the model |\n\n## Where to go next\n\nWhen you know which areas are weak and why, the **Recommendations** view turns\nthat into a ranked list of specific edits, and the **Failure modes** view groups\nthe weaknesses by the kind of documentation problem behind them.",
|
|
61
72
|
"source": "docs/help/interpreting-diagnostics.md",
|
|
62
73
|
"related": [
|
|
63
|
-
"
|
|
64
|
-
"
|
|
74
|
+
"recommendations",
|
|
75
|
+
"failure-modes",
|
|
76
|
+
"scoring-model"
|
|
65
77
|
]
|
|
66
78
|
},
|
|
67
79
|
{
|
|
@@ -74,6 +86,17 @@ export const HELP_TOPICS = [
|
|
|
74
86
|
"comparing-runs"
|
|
75
87
|
]
|
|
76
88
|
},
|
|
89
|
+
{
|
|
90
|
+
"id": "recommendations",
|
|
91
|
+
"title": "Recommendations",
|
|
92
|
+
"body": "## What this view is for\n\nThis is the \"what do I fix\" view. The scores tell you how well your\ndocumentation supports AI coding tools. This view turns those scores into a\nranked list of specific changes, so you can spend your time on the edits that\nshould move the score the most.\n\nEverything here comes from the same evaluation run you are looking at, and it\npoints at your own documentation pages rather than giving generic advice.\n\n## What you are looking at\n\nRecent reports show a set of **diagnosis cards**. Each card answers one question\nabout the run.\n\n**Top recommendations** is the main card. It opens with a short summary, then\nlists a few suggested changes ranked by priority. Each suggestion has:\n\n- A **priority** tag of high, medium, or low that tells you what to do first.\n- A **title** that names the change in one line.\n- A **description** of the specific fix, usually quoting the exact symbol,\n query, or pattern involved.\n- A **doc reference** showing which page, and the section when it is known, the\n change applies to. Every reference points to a real page that was part of this\n run, so you can open it and start editing.\n\nYou may also see supporting cards:\n\n- **Doc attribution spotlight** shows which documentation pages most influenced\n the results, and whether each one helped or hurt. Use it to confirm a\n recommendation is pointing at the right page.\n- **Low-confidence attribution** lists results where the link between a doc and\n an outcome was uncertain. Treat anything flagged here as a lead to verify, not\n a settled conclusion.\n- **Regression vs baseline** appears when you are comparing against an earlier\n run. It shows which areas moved up or down and the likely reason for each\n change.\n\n## How to use it\n\nWork top down. Start with the high-priority suggestions, open the referenced\npage, and make the change. 
Priority reflects how much each change is expected to\nhelp, so the top of the list is usually where your effort goes furthest.\n\nThe recommendations are written by a model that reads this run's results. They\nare grounded in your actual docs and cannot reference a page that was not in the\nrun, but they are still suggestions. Read the linked page before acting, and use\nthe confidence signals to decide how much to trust each item.\n\n## Where this comes from\n\nA recommendation is the end of a chain: a test scored low, the grader said why,\nthe run classified that into a failure mode, and this view proposes the edit. If\nyou want to see the failure modes themselves, grouped by category, open the\n**Failure modes** view. If you want the grader's raw notes on the lowest scores,\nopen the **Low-scoring judgments** view.\n\n## Older reports\n\nReports created before the diagnosis cards shipped show a simpler list instead.\nEach row names a feature area, the failure mode behind it, an estimated score\nlift if you fix it, a confidence level, and the tasks that exposed the gap. The\nestimated lift is conservative. It assumes fixing the gap raises the weak\ndimension only to the median of the others, so the real improvement can be\nhigher.\n\n## When this view is empty\n\nIf a report shows no recommendations, the evaluation either ran and found\nnothing worth flagging, or the run predates this feature. A score with no\nrecommendations is usually a good sign, because it means the docs held up across\nthe evaluated tasks.",
|
|
93
|
+
"source": "docs/help/recommendations.md",
|
|
94
|
+
"related": [
|
|
95
|
+
"failure-modes",
|
|
96
|
+
"interpreting-diagnostics",
|
|
97
|
+
"scoring-model"
|
|
98
|
+
]
|
|
99
|
+
},
|
|
77
100
|
{
|
|
78
101
|
"id": "retrieval-gap",
|
|
79
102
|
"title": "Retrieval Gap & Infrastructure Efficiency",
|
|
@@ -96,17 +119,6 @@ export const HELP_TOPICS = [
|
|
|
96
119
|
"eval-modes"
|
|
97
120
|
]
|
|
98
121
|
},
|
|
99
|
-
{
|
|
100
|
-
"id": "weaknesses-recommendations",
|
|
101
|
-
"title": "Weaknesses & Recommendations",
|
|
102
|
-
"body": "## Understanding weaknesses\n\nThe Issues sub-tab in Diagnostics lists every area or dimension that scored\nbelow threshold. Each weakness entry shows:\n\n- **The feature area** — Which product feature is affected (e.g., GROQ,\n Functions, Webhooks).\n- **The bottleneck dimension** — Which scoring dimension is dragging the area\n down: task completion, code correctness, or doc coverage.\n- **The score** — How far below threshold the dimension scored.\n\n## Gap analysis recommendations\n\nWhen an evaluation runs with gap analysis enabled, the dashboard shows\n**prioritized recommendations** — specific actions ranked by estimated impact.\n\nEach recommendation includes:\n\n- **Failure mode** — The type of doc problem identified:\n - `missing-docs` — The functionality isn't documented at all.\n - `incorrect-docs` — The docs contain factual errors.\n - `outdated-docs` — The docs describe an old API version or pattern.\n - `poor-structure` — The docs exist but are hard to find or understand.\n- **Estimated lift** — How many score points fixing this gap would add. Based on\n raising the bottleneck dimension to the median of non-bottleneck dimensions.\n Conservative estimate — actual improvement may be higher.\n- **Confidence** — How sure the analysis is about this diagnosis (high, medium,\n or low).\n- **Affected tasks** — Which specific evaluation tasks exposed this gap.\n\n## Diagnosis cards\n\nEvery published report now carries a **diagnosis artifact** — a set of cards\nproduced by the post-pipeline hook (`ailf interpret`). The Studio diagnosis\npanel renders these cards directly; the dashboard's Recommendations and\nFailure-modes panels migrate to the same source in a follow-up.\n\nThe hook runs by default for every pipeline invocation. 
To opt out for a single\nrun, pass `--no-summary`; to opt out in CI, set `AILF_INTERPRET_ON_RUN=0` in the\nworkflow env block; to opt out project-wide, set `summary.onRun: never` in\n`.ailf/config.yaml`.\n\n## Low-scoring judgments\n\nBelow the recommendations, you'll find the **grader's explanations** for tests\nthat scored below 70. These are the raw assessments from the grading model\nexplaining exactly what went wrong — missing API calls, incorrect patterns,\nhallucinated features, etc.\n\nEach judgment shows the task, the dimension, the score, and the grader's natural\nlanguage reason. These are the most granular diagnostic signal available and\noften point directly to the doc section that needs fixing.",
|
|
103
|
-
"source": "docs/help/weaknesses-recommendations.md",
|
|
104
|
-
"related": [
|
|
105
|
-
"interpreting-diagnostics",
|
|
106
|
-
"scoring-model",
|
|
107
|
-
"negative-doc-lift"
|
|
108
|
-
]
|
|
109
|
-
},
|
|
110
122
|
{
|
|
111
123
|
"id": "how-agents-work",
|
|
112
124
|
"title": "How AI Agents Find Documentation",
|
|
@@ -76,9 +76,9 @@ export declare const GraderJudgmentSchema: z.ZodObject<{
|
|
|
76
76
|
documentId: z.ZodString;
|
|
77
77
|
slug: z.ZodOptional<z.ZodString>;
|
|
78
78
|
role: z.ZodEnum<{
|
|
79
|
+
missing: "missing";
|
|
79
80
|
supports: "supports";
|
|
80
81
|
contradicts: "contradicts";
|
|
81
|
-
missing: "missing";
|
|
82
82
|
irrelevant: "irrelevant";
|
|
83
83
|
}>;
|
|
84
84
|
hallucinated: z.ZodOptional<z.ZodBoolean>;
|
|
@@ -145,9 +145,9 @@ export declare const GraderEmittedJudgmentSchema: z.ZodObject<{
|
|
|
145
145
|
documentId: z.ZodString;
|
|
146
146
|
slug: z.ZodOptional<z.ZodString>;
|
|
147
147
|
role: z.ZodEnum<{
|
|
148
|
+
missing: "missing";
|
|
148
149
|
supports: "supports";
|
|
149
150
|
contradicts: "contradicts";
|
|
150
|
-
missing: "missing";
|
|
151
151
|
irrelevant: "irrelevant";
|
|
152
152
|
}>;
|
|
153
153
|
hallucinated: z.ZodOptional<z.ZodBoolean>;
|
|
@@ -35,10 +35,10 @@
|
|
|
35
35
|
* @see docs/decisions/D0050-per-entry-attribution-layout.md
|
|
36
36
|
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
37
37
|
*/
|
|
38
|
-
import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
38
|
+
import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
39
39
|
export declare class ComputeAttributionStep implements PipelineStep {
|
|
40
40
|
readonly name = "compute-attribution";
|
|
41
41
|
readonly optional = true;
|
|
42
42
|
check(ctx: AppContext): ValidationIssue[];
|
|
43
|
-
execute(ctx: AppContext,
|
|
43
|
+
execute(ctx: AppContext, state?: PipelineState): Promise<StepResult>;
|
|
44
44
|
}
|
|
@@ -40,6 +40,7 @@ import { resolve } from "node:path";
|
|
|
40
40
|
import { isSlugRef } from "../../_vendor/ailf-core/index.js";
|
|
41
41
|
import { calibrationSetVersion, embeddingModel, ensembleVersion, } from "../../pipeline/attribution.js";
|
|
42
42
|
import { V0_WEIGHTS, computeJudgmentAttribution, } from "../../pipeline/compute-attribution.js";
|
|
43
|
+
import { classifyEnrichmentInputs, degradedEnrichmentError, } from "../../pipeline/enrichment-preconditions.js";
|
|
43
44
|
// ---------------------------------------------------------------------------
|
|
44
45
|
// Step implementation
|
|
45
46
|
// ---------------------------------------------------------------------------
|
|
@@ -79,12 +80,26 @@ export class ComputeAttributionStep {
|
|
|
79
80
|
}
|
|
80
81
|
return issues;
|
|
81
82
|
}
|
|
82
|
-
async execute(ctx,
|
|
83
|
+
async execute(ctx, state) {
|
|
83
84
|
const start = Date.now();
|
|
84
85
|
const root = ctx.config.rootDir;
|
|
85
86
|
const judgmentsPath = resolve(root, "results", "latest", "grader-judgments.json");
|
|
86
87
|
const summaryPath = resolve(root, "results", "latest", "score-summary.json");
|
|
87
|
-
|
|
88
|
+
// Mirror gap-analysis: a full eval that scored tests but persisted no
|
|
89
|
+
// grader judgments is a degraded run, not a benign skip. Fail loud so the
|
|
90
|
+
// outcome surfaces in pipeline-result and on the job document. A remote
|
|
91
|
+
// cache hit restores score-summary.json without grader-judgments.json, so
|
|
92
|
+
// its missing judgments are legitimate — never fail loud on a cache hit.
|
|
93
|
+
const fromRemoteCache = (state?.remoteCacheHits?.size ?? 0) > 0;
|
|
94
|
+
const inputs = classifyEnrichmentInputs(root);
|
|
95
|
+
if (inputs.kind === "judgments-missing-after-eval" && !fromRemoteCache) {
|
|
96
|
+
return {
|
|
97
|
+
durationMs: Date.now() - start,
|
|
98
|
+
status: "failed",
|
|
99
|
+
error: degradedEnrichmentError("compute-attribution", inputs.scoredTestCount),
|
|
100
|
+
};
|
|
101
|
+
}
|
|
102
|
+
if (inputs.kind !== "ready") {
|
|
88
103
|
return { status: "skipped", reason: "No grader-judgments.json" };
|
|
89
104
|
}
|
|
90
105
|
if (!existsSync(summaryPath)) {
|
|
@@ -14,10 +14,10 @@
|
|
|
14
14
|
*
|
|
15
15
|
* This is an optional step — failure doesn't stop the pipeline.
|
|
16
16
|
*/
|
|
17
|
-
import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
17
|
+
import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
18
18
|
export declare class GapAnalysisStep implements PipelineStep {
|
|
19
19
|
readonly name = "gap-analysis";
|
|
20
20
|
readonly optional = true;
|
|
21
21
|
check(ctx: AppContext): ValidationIssue[];
|
|
22
|
-
execute(ctx: AppContext): Promise<StepResult>;
|
|
22
|
+
execute(ctx: AppContext, state?: PipelineState): Promise<StepResult>;
|
|
23
23
|
}
|
|
@@ -18,6 +18,7 @@ import { existsSync, mkdirSync, readFileSync, renameSync, writeFileSync, } from
|
|
|
18
18
|
import { join, resolve } from "path";
|
|
19
19
|
import { assoc, isSlugRef } from "../../_vendor/ailf-core/index.js";
|
|
20
20
|
import { emitFileContents } from "../../artifact-capture/emit-file.js";
|
|
21
|
+
import { classifyEnrichmentInputs, degradedEnrichmentError, } from "../../pipeline/enrichment-preconditions.js";
|
|
21
22
|
export class GapAnalysisStep {
|
|
22
23
|
name = "gap-analysis";
|
|
23
24
|
optional = true;
|
|
@@ -34,12 +35,29 @@ export class GapAnalysisStep {
|
|
|
34
35
|
}
|
|
35
36
|
return [];
|
|
36
37
|
}
|
|
37
|
-
async execute(ctx) {
|
|
38
|
+
async execute(ctx, state) {
|
|
38
39
|
const root = ctx.config.rootDir;
|
|
39
40
|
const start = Date.now();
|
|
40
41
|
const judgmentsPath = resolve(root, "results", "latest", "grader-judgments.json");
|
|
41
42
|
const scoreSummaryPath = resolve(root, "results", "latest", "score-summary.json");
|
|
42
|
-
|
|
43
|
+
// Distinguish a legitimate skip (no graded eval ran this pipeline) from a
|
|
44
|
+
// degraded run where a full eval scored tests but no judgments persisted.
|
|
45
|
+
// The latter must fail loud — returning a benign `skipped` is what let
|
|
46
|
+
// reports publish with a score but no test details.
|
|
47
|
+
//
|
|
48
|
+
// A remote cache hit restores score-summary.json (with testCount) from a
|
|
49
|
+
// prior report but never writes grader-judgments.json, so judgments are
|
|
50
|
+
// legitimately absent — that is a benign skip, not a degraded full eval.
|
|
51
|
+
const fromRemoteCache = (state?.remoteCacheHits?.size ?? 0) > 0;
|
|
52
|
+
const inputs = classifyEnrichmentInputs(root);
|
|
53
|
+
if (inputs.kind === "judgments-missing-after-eval" && !fromRemoteCache) {
|
|
54
|
+
return {
|
|
55
|
+
durationMs: Date.now() - start,
|
|
56
|
+
status: "failed",
|
|
57
|
+
error: degradedEnrichmentError("gap-analysis", inputs.scoredTestCount),
|
|
58
|
+
};
|
|
59
|
+
}
|
|
60
|
+
if (inputs.kind !== "ready") {
|
|
43
61
|
return {
|
|
44
62
|
status: "skipped",
|
|
45
63
|
reason: "No grader-judgments.json — run a full evaluation first",
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* - P5: Local-first (pipeline never fails because of a store write)
|
|
11
11
|
* - P6: Sinks are fire-and-forget (failures logged, not thrown)
|
|
12
12
|
*/
|
|
13
|
-
import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type ReportAutoScope, type ScoreSummary, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
13
|
+
import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type ReportAutoScope, type ReportDegradation, type ScoreSummary, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
14
14
|
import { type ProvenanceInput } from "../../pipeline/provenance.js";
|
|
15
15
|
export declare class PublishReportStep implements PipelineStep {
|
|
16
16
|
private readonly pipelineStart;
|
|
@@ -25,6 +25,20 @@ export declare class PublishReportStep implements PipelineStep {
|
|
|
25
25
|
check(): ValidationIssue[];
|
|
26
26
|
execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
|
|
27
27
|
}
|
|
28
|
+
/**
|
|
29
|
+
* Detect whether a report should publish as degraded.
|
|
30
|
+
*
|
|
31
|
+
* The symptom is a scored run whose per-test details never landed: a full
|
|
32
|
+
* eval counted tests (`scores[].testCount > 0`) but `summary.testResults` is
|
|
33
|
+
* absent because gap-analysis skipped or failed. Such a report renders an
|
|
34
|
+
* empty "no tests" state in Studio despite carrying a score. Returns the
|
|
35
|
+
* marker enumerating which enrichment surfaces are missing, or `undefined`
|
|
36
|
+
* for a healthy report (or a run with no scored tests, where an empty report
|
|
37
|
+
* is legitimate).
|
|
38
|
+
*
|
|
39
|
+
* Exported for unit testing — production callers reach it via execute().
|
|
40
|
+
*/
|
|
41
|
+
export declare function detectReportDegradation(summary: ScoreSummary): ReportDegradation | undefined;
|
|
28
42
|
/**
|
|
29
43
|
* Assemble provenance input from the score summary and pipeline context.
|
|
30
44
|
*
|
|
@@ -110,9 +110,15 @@ export class PublishReportStep {
|
|
|
110
110
|
// agentBehavior arrays) point at their external artifacts via
|
|
111
111
|
// `id = manifestEntryKey`; Studio hydrates on drill-down.
|
|
112
112
|
const slimSummary = buildSlimReportSummary(summary, ctx.config.mode);
|
|
113
|
+
// Degraded-report detection (the "no tests on a scored report" symptom):
|
|
114
|
+
// a full eval scored tests but the gap-analysis enrichment never landed.
|
|
115
|
+
// Computed from the full summary read above — independent of which
|
|
116
|
+
// upstream step skipped — so the marker fires regardless of the cause.
|
|
117
|
+
const degraded = detectReportDegradation(summary);
|
|
113
118
|
const report = {
|
|
114
119
|
comparison: comparison ?? undefined,
|
|
115
120
|
completedAt: now,
|
|
121
|
+
...(degraded ? { degraded } : {}),
|
|
116
122
|
durationMs,
|
|
117
123
|
id: reportId,
|
|
118
124
|
provenance,
|
|
@@ -192,6 +198,45 @@ export class PublishReportStep {
|
|
|
192
198
|
// ---------------------------------------------------------------------------
|
|
193
199
|
// Helpers
|
|
194
200
|
// ---------------------------------------------------------------------------
|
|
201
|
+
/**
 * Detect whether a report should publish as degraded.
 *
 * A report is degraded when the run scored tests (the numeric `testCount`
 * values in `summary.scores` sum to a non-zero total) yet
 * `summary.testResults` is absent or empty. Such a report carries a score but
 * has no per-test details to render.
 *
 * Malformed input is tolerated rather than thrown on: a missing or non-array
 * `scores` counts as "nothing scored", and entries that are null or lack a
 * numeric `testCount` contribute 0. This mirrors how
 * `enrichment-preconditions` sums the same field.
 *
 * Exported for unit testing — production callers reach it via execute().
 *
 * @param summary Full score summary for the run.
 * @returns A marker `{ reason, missing, detail }` listing the enrichment
 *   surfaces that are absent, or `undefined` for a healthy report or a run
 *   with no scored tests (where an empty report is legitimate).
 */
export function detectReportDegradation(summary) {
    const scores = Array.isArray(summary.scores) ? summary.scores : [];
    const scoredTestCount = scores.reduce((n, s) => n + (typeof s?.testCount === "number" ? s.testCount : 0), 0);
    const hasTestResults = (summary.testResults?.length ?? 0) > 0;
    if (scoredTestCount === 0 || hasTestResults)
        return undefined;
    // `testResults` is the load-bearing signal (its absence is the rendered
    // "no tests" symptom). The remaining fields are best-effort detail: some
    // are literacy-only (e.g. documentManifest), so they may appear here for a
    // degraded non-literacy run even though that mode never produces them.
    const missing = ["testResults"];
    if (!summary.failureModes)
        missing.push("failureModes");
    if (!summary.lowScoringJudgments?.length)
        missing.push("lowScoringJudgments");
    if (!summary.documentManifest?.length)
        missing.push("documentManifest");
    if (!summary.recommendations)
        missing.push("recommendations");
    return {
        reason: "enrichment-missing",
        missing,
        detail: `Evaluation scored ${scoredTestCount} test(s) but enrichment did not ` +
            `complete; per-test details and failure analysis are unavailable for ` +
            `this report.`,
    };
}
|
|
195
240
|
/**
|
|
196
241
|
* Assemble provenance input from the score summary and pipeline context.
|
|
197
242
|
*
|
|
@@ -41,6 +41,7 @@ import { resolveProfile } from "./profile-resolution.js";
|
|
|
41
41
|
import { loadSource } from "../sources.js";
|
|
42
42
|
import { LiteracyVariant } from "./normalize-mode.js";
|
|
43
43
|
import { scoreTestGroup, } from "./compiler/scoring-bridge.js";
|
|
44
|
+
import { extractGraderJudgmentsResilient, } from "./extract-grader-judgments-resilient.js";
|
|
44
45
|
// Re-export from core for backward compatibility.
|
|
45
46
|
// Existing imports from this file continue to work unchanged.
|
|
46
47
|
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
|
|
@@ -321,6 +322,54 @@ export function extractGraderJudgments(resultsPath, telemetry) {
|
|
|
321
322
|
}
|
|
322
323
|
return judgments;
|
|
323
324
|
}
|
|
325
|
+
/**
 * Light parse of a results file's entry count — diagnostics only.
 *
 * Two layouts are recognised:
 *   - nested: `{ results: { results: [...] } }`
 *   - flat:   `{ results: [...] }` (the file itself is the wrapper)
 *
 * @param resultsPath Path to the results JSON file.
 * @returns The number of result entries, or 0 when the file is missing,
 *   unparseable, or matches neither layout.
 */
function countResultEntries(resultsPath) {
    try {
        const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
        const outer = file?.results;
        // Flat layout: `results` is already the entry array. Unwrapping it
        // further and reading `.results` off the array would always yield 0.
        if (Array.isArray(outer))
            return outer.length;
        const inner = outer?.results;
        return Array.isArray(inner) ? inner.length : 0;
    }
    catch {
        // Missing file or invalid JSON — this count is best-effort only.
        return 0;
    }
}
|
|
340
|
+
/**
 * Count classifiable llm-rubric components in a results file: components
 * whose assertion type is `"llm-rubric"` and for which `classifyRubric`
 * returns a truthy value.
 *
 * Used only to set the severity of a persistent-empty extraction: a file with
 * classifiable components but 0 extracted judgments is an error; a file with
 * none (all api-errors / no llm-rubric) is a benign empty.
 *
 * Deliberately an independent count path (not `extractGraderJudgments`) so the
 * cross-check is meaningful.
 *
 * @param resultsPath Path to the results JSON file.
 * @returns The component count, or 0 when the file is missing or cannot be
 *   read, parsed, or walked.
 */
function countClassifiableRubricComponents(resultsPath) {
    if (!existsSync(resultsPath))
        return 0;
    try {
        let n = 0;
        for (const result of readAndNormalizeResults(resultsPath)) {
            for (const comp of result.gradingResult.componentResults) {
                if (comp.assertion?.type === "llm-rubric" && classifyRubric(comp)) {
                    n += 1;
                }
            }
        }
        return n;
    }
    catch {
        // Honour the documented "0 when unparseable" contract: this count only
        // picks a log severity, so it must not abort the scoring run.
        // NOTE(review): if `readAndNormalizeResults` already swallows parse
        // errors this guard is redundant but harmless — confirm.
        return 0;
    }
}
|
|
363
|
+
/**
 * Dependency bundle passed to `extractGraderJudgmentsResilient`. It supplies
 * the real extractor together with the two file-based counters defined in this
 * module. Declared once so that every call site passes the same wiring.
 */
const resilientJudgmentDeps = {
    extract: extractGraderJudgments,
    countResults: countResultEntries,
    countClassifiable: countClassifiableRubricComponents,
};
|
|
324
373
|
/**
|
|
325
374
|
* Stamp every grader judgment with a D0049 ceiling-cross-check confidence
|
|
326
375
|
* triple and increment `GraderReliability.failureModeCalibration` whenever
|
|
@@ -1494,7 +1543,7 @@ export async function calculateAndWriteScores(options) {
|
|
|
1494
1543
|
writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
|
|
1495
1544
|
log.info("Score summary written to results/latest/score-summary.json");
|
|
1496
1545
|
// Extract and persist grader judgments
|
|
1497
|
-
const judgments =
|
|
1546
|
+
const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
|
|
1498
1547
|
const borderlineConsistency = await runBorderlinePass(judgments, [
|
|
1499
1548
|
baselineResultsPath,
|
|
1500
1549
|
]);
|
|
@@ -1557,7 +1606,7 @@ export async function calculateAndWriteScores(options) {
|
|
|
1557
1606
|
mkdirSync(outDir, { recursive: true });
|
|
1558
1607
|
writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
|
|
1559
1608
|
log.info("Score summary written to results/latest/score-summary.json");
|
|
1560
|
-
const judgments =
|
|
1609
|
+
const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
|
|
1561
1610
|
const borderlineConsistency = await runBorderlinePass(judgments, [
|
|
1562
1611
|
baselineResultsPath,
|
|
1563
1612
|
]);
|
|
@@ -1687,18 +1736,14 @@ export async function calculateAndWriteScores(options) {
|
|
|
1687
1736
|
// the ceiling-cross-check disagreement counter (`failureModeCalibration`)
|
|
1688
1737
|
// is incremented during the post-extraction validation pass below.
|
|
1689
1738
|
const reliability = { graderModel: "unknown" };
|
|
1690
|
-
|
|
1691
|
-
|
|
1692
|
-
|
|
1693
|
-
|
|
1694
|
-
|
|
1695
|
-
|
|
1696
|
-
|
|
1697
|
-
|
|
1698
|
-
...(options.runId ? { runId: options.runId } : {}),
|
|
1699
|
-
});
|
|
1700
|
-
judgments.push(...agenticJudgments);
|
|
1701
|
-
}
|
|
1739
|
+
// Extract through the resilient wrapper so an empty result from the transient
|
|
1740
|
+
// read anomaly is instrumented and self-healed rather than silently skipping
|
|
1741
|
+
// the grader-judgments persist. In full mode both the baseline and agentic
|
|
1742
|
+
// result files are graded against the shared telemetry.
|
|
1743
|
+
const judgmentResultPaths = mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)
|
|
1744
|
+
? [baselineResultsPath, agenticResultsPath]
|
|
1745
|
+
: [baselineResultsPath];
|
|
1746
|
+
const judgments = await extractGraderJudgmentsResilient(judgmentResultPaths, { reliability, ...(options.runId ? { runId: options.runId } : {}) }, log, { deps: resilientJudgmentDeps });
|
|
1702
1747
|
// Borderline-consensus pass — re-grade the ±5 borderline subset N times
|
|
1703
1748
|
// and merge medians back into the canonical judgments BEFORE
|
|
1704
1749
|
// `validateGraderJudgmentsCalibration` runs, so the calibration counter
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/enrichment-preconditions.ts
|
|
3
|
+
*
|
|
4
|
+
* Classifies the inputs the post-scoring enrichment steps (gap-analysis,
|
|
5
|
+
* compute-attribution) depend on, so a missing `grader-judgments.json` can be
|
|
6
|
+
* told apart as either a legitimate skip (no graded eval ran this pipeline) or
|
|
7
|
+
* a degraded outcome (a full eval scored tests but no judgments persisted).
|
|
8
|
+
*
|
|
9
|
+
* The degraded case is the failure these steps must stop swallowing:
|
|
10
|
+
* `calculate-scores` wrote `score-summary.json` with per-area `testCount > 0`
|
|
11
|
+
* but never wrote `grader-judgments.json`, so the enrichment steps self-skip
|
|
12
|
+
* and the report ships with no test details while still showing a score.
|
|
13
|
+
* Distinguishing the two is what lets the steps fail loud instead of returning
|
|
14
|
+
* a benign `skipped`.
|
|
15
|
+
*/
|
|
16
|
+
/**
 * Result of classifying the enrichment inputs found under `results/latest/`,
 * discriminated on `kind`:
 *
 * - `ready` — `grader-judgments.json` exists and holds at least one judgment
 *   (`judgmentCount`); enrichment can run.
 * - `no-full-eval` — no graded eval produced judgments this run. A legitimate
 *   skip: standalone gap-analysis on cached results, a non-graded run, or an
 *   eval that scored nothing.
 * - `judgments-missing-after-eval` — `score-summary.json` reports scored tests
 *   (`scoredTestCount > 0`) yet `grader-judgments.json` is missing or empty.
 *   This is the degraded condition the enrichment steps surface.
 */
export type EnrichmentInputs =
    | { kind: "ready"; judgmentCount: number }
    | { kind: "no-full-eval" }
    | { kind: "judgments-missing-after-eval"; scoredTestCount: number };
|
|
37
|
+
/**
|
|
38
|
+
* Classify the enrichment inputs for a run by inspecting
|
|
39
|
+
* `results/latest/grader-judgments.json` and `score-summary.json`.
|
|
40
|
+
*
|
|
41
|
+
* Pure read-only filesystem inspection — never throws on malformed input; a
|
|
42
|
+
* file that does not parse to the expected shape is treated as absent so that
|
|
43
|
+
* "no usable judgments" and "no usable summary" both collapse to a single
|
|
44
|
+
* branch.
|
|
45
|
+
*/
|
|
46
|
+
export declare function classifyEnrichmentInputs(rootDir: string): EnrichmentInputs;
|
|
47
|
+
/**
|
|
48
|
+
* Build the fail-loud error message for the degraded
|
|
49
|
+
* `judgments-missing-after-eval` case. Shared by the enrichment steps so the
|
|
50
|
+
* pipeline-result and job-document surfaces carry one consistent wording.
|
|
51
|
+
*/
|
|
52
|
+
export declare function degradedEnrichmentError(step: string, scoredTestCount: number): string;
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/enrichment-preconditions.ts
|
|
3
|
+
*
|
|
4
|
+
* Classifies the inputs the post-scoring enrichment steps (gap-analysis,
|
|
5
|
+
* compute-attribution) depend on, so a missing `grader-judgments.json` can be
|
|
6
|
+
* told apart as either a legitimate skip (no graded eval ran this pipeline) or
|
|
7
|
+
* a degraded outcome (a full eval scored tests but no judgments persisted).
|
|
8
|
+
*
|
|
9
|
+
* The degraded case is the failure these steps must stop swallowing:
|
|
10
|
+
* `calculate-scores` wrote `score-summary.json` with per-area `testCount > 0`
|
|
11
|
+
* but never wrote `grader-judgments.json`, so the enrichment steps self-skip
|
|
12
|
+
* and the report ships with no test details while still showing a score.
|
|
13
|
+
* Distinguishing the two is what lets the steps fail loud instead of returning
|
|
14
|
+
* a benign `skipped`.
|
|
15
|
+
*/
|
|
16
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
17
|
+
import { resolve } from "node:path";
|
|
18
|
+
/**
 * Decide which enrichment state a run is in, based on the files under
 * `<rootDir>/results/latest/`.
 *
 * The judgment count is checked first; the score summary is consulted only
 * when no judgments were found. Both helpers report 0 for anything they
 * cannot read or parse, so this function never throws on malformed input.
 *
 * @param rootDir Project root containing `results/latest/`.
 * @returns `ready` when judgments exist, `judgments-missing-after-eval` when
 *   the summary counts scored tests but no judgments exist, otherwise
 *   `no-full-eval`.
 */
export function classifyEnrichmentInputs(rootDir) {
    const persisted = countGraderJudgments(rootDir);
    if (persisted > 0) {
        return { kind: "ready", judgmentCount: persisted };
    }
    const scored = scoredTestCountFromSummary(rootDir);
    return scored > 0
        ? { kind: "judgments-missing-after-eval", scoredTestCount: scored }
        : { kind: "no-full-eval" };
}
|
|
38
|
+
/**
 * Compose the error text reported when a run scored tests but persisted no
 * grader judgments. Kept in one place so every enrichment step reports the
 * condition with identical wording.
 *
 * @param step Name of the reporting step; becomes the message prefix.
 * @param scoredTestCount Number of tests the eval scored.
 * @returns The formatted single-line error message.
 */
export function degradedEnrichmentError(step, scoredTestCount) {
    const segments = [
        `${step}: grader-judgments.json missing after a full eval — `,
        `${scoredTestCount} test(s) scored but 0 grader judgments persisted. `,
        `The report is marked degraded rather than published as healthy.`,
    ];
    return segments.join("");
}
|
|
48
|
+
/**
 * Report how many judgments `results/latest/grader-judgments.json` holds.
 *
 * Every "no usable judgments" shape — file absent, unreadable, invalid JSON,
 * or a JSON value that is not an array — yields 0, so callers branch on a
 * single number. An empty array is deliberately indistinguishable from a
 * missing file.
 *
 * @param rootDir Project root containing `results/latest/`.
 * @returns Length of the judgments array, or 0.
 */
function countGraderJudgments(rootDir) {
    const judgmentsFile = resolve(rootDir, "results", "latest", "grader-judgments.json");
    let parsed;
    try {
        if (!existsSync(judgmentsFile)) {
            return 0;
        }
        parsed = JSON.parse(readFileSync(judgmentsFile, "utf-8"));
    }
    catch {
        return 0;
    }
    if (!Array.isArray(parsed)) {
        return 0;
    }
    return parsed.length;
}
|
|
67
|
+
/**
 * Total the numeric `testCount` values of the `scores` array in
 * `results/latest/score-summary.json`. A positive total is the signal that an
 * eval scored tests this run.
 *
 * @param rootDir Project root containing `results/latest/`.
 * @returns The summed count, or 0 when the summary is absent, unreadable,
 *   not valid JSON, has no `scores` array, or carries no scored tests.
 */
function scoredTestCountFromSummary(rootDir) {
    const summaryFile = resolve(rootDir, "results", "latest", "score-summary.json");
    if (!existsSync(summaryFile)) {
        return 0;
    }
    try {
        const { scores } = JSON.parse(readFileSync(summaryFile, "utf-8"));
        if (!Array.isArray(scores)) {
            return 0;
        }
        let total = 0;
        for (const entry of scores) {
            // Entries without a numeric testCount contribute nothing.
            if (typeof entry.testCount === "number") {
                total += entry.testCount;
            }
        }
        return total;
    }
    catch {
        return 0;
    }
}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/extract-grader-judgments-resilient.ts
|
|
3
|
+
*
|
|
4
|
+
* Resilient grader-judgment extraction for the `calculate-scores` persist
|
|
5
|
+
* junction.
|
|
6
|
+
*
|
|
7
|
+
* Background: `calculateAndWriteScores` extracts grader judgments from the
|
|
8
|
+
* eval results file(s), then writes `grader-judgments.json` only when the
|
|
9
|
+
* array is non-empty (the `judgments.length > 0` guard). A runtime anomaly was
|
|
10
|
+
* observed where `extractGraderJudgments` returned 0 judgments for a results
|
|
11
|
+
* file that demonstrably contained classifiable llm-rubric components — the
|
|
12
|
+
* same file, read tens of milliseconds later by `extractStoredTestResults`,
|
|
13
|
+
* yielded the full set (entries with populated dimensions). The empty array
|
|
14
|
+
* silently skipped the write, so gap-analysis and compute-attribution skipped
|
|
15
|
+
* and the report shipped with a score but no tests.
|
|
16
|
+
*
|
|
17
|
+
* The committed code reads the file via a pure `readFileSync` with identical
|
|
18
|
+
* classification on both sides, so the divergence is not reproducible from the
|
|
19
|
+
* source + captured artifacts — it is a transient read anomaly at the live
|
|
20
|
+
* junction. This wrapper does not pretend to know the mechanism; it makes the
|
|
21
|
+
* junction observable and recovers from the transient:
|
|
22
|
+
*
|
|
23
|
+
* 1. **Instruments** — logs the resolved path(s), file size/mtime, parsed
|
|
24
|
+
* result count, and judgment count on every run (never silent on 0), so a
|
|
25
|
+
* future empty-judgments persist is diagnosable from the run log alone.
|
|
26
|
+
* 2. **Self-heals** — when extraction yields 0 judgments but a results file
|
|
27
|
+
* exists, it re-extracts with bounded retries. A later read that yields
|
|
28
|
+
* judgments proves the initial 0 was transient; the recovered judgments
|
|
29
|
+
* are returned. If every attempt yields 0, severity is decided by an
|
|
30
|
+
* independent classifiable-component count: a genuinely judgment-free run
|
|
31
|
+
* (all api-errors / no llm-rubric) logs a warning, while 0 judgments
|
|
32
|
+
* against present classifiable components logs an error (the downstream
|
|
33
|
+
* gap-analysis fail-loud guard is the backstop).
|
|
34
|
+
*
|
|
35
|
+
* The extractor and fs helpers are injected so the wrapper is unit-testable
|
|
36
|
+
* without importing the ~3000-line scoring module (which would be circular)
|
|
37
|
+
* or touching the real filesystem.
|
|
38
|
+
*/
|
|
39
|
+
import type { GraderJudgment, GraderReliability, Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
40
|
+
/**
 * Telemetry context passed to every extraction call: the shared reliability
 * counters plus an optional run identifier.
 */
export interface ExtractionTelemetry {
    /** Identifier of the current run, when one is known. */
    runId?: string;
    /** Reliability counters shared across all extractions. */
    reliability: GraderReliability;
}
|
|
45
|
+
/**
 * Minimal on-disk facts about a results file, used for diagnostics and to
 * decide whether retrying makes sense.
 */
export interface FileStat {
    /** Whether the path exists. */
    exists: boolean;
    /** File size in bytes. */
    size: number;
    /** Last-modified time in milliseconds. */
    mtimeMs: number;
}
|
|
51
|
+
/**
 * Injection points for the resilient extractor. Only `extract` is required;
 * the remaining members are optional so tests can substitute fakes.
 */
export interface ResilientExtractionDeps {
    /**
     * The real `extractGraderJudgments`. Supplied by the caller instead of
     * being imported, which avoids a circular dependency with
     * `calculate-scores.ts`.
     */
    extract: (path: string, telemetry?: ExtractionTelemetry) => GraderJudgment[];
    /** Number of parsed result entries in a file — used for diagnostics only. */
    countResults?: (path: string) => number;
    /**
     * Number of classifiable llm-rubric components in a file. Consulted only
     * when extraction stays empty, to pick the log severity: components
     * present but 0 judgments extracted is an error, whereas a file with none
     * (all api-errors / no llm-rubric) is a benign empty.
     */
    countClassifiable?: (path: string) => number;
    /** Existence, size, and mtime lookup for a path. */
    statFile?: (path: string) => FileStat;
    /** Wait used between self-heal attempts. */
    sleep?: (ms: number) => Promise<void>;
}
|
|
72
|
+
/** Tuning knobs and injected dependencies for the resilient extractor. */
export interface ResilientExtractionOptions {
    /** Injected extractor and filesystem helpers. */
    deps: ResilientExtractionDeps;
    /** Total extraction attempts when the first yields 0 (default 3, min 1). */
    maxAttempts?: number;
    /** Delay before each retry, in ms (default 200). */
    delayMs?: number;
}
|
|
79
|
+
/**
|
|
80
|
+
* Extract grader judgments across one or more results files, instrumented and
|
|
81
|
+
* self-healing. See the module header for the rationale.
|
|
82
|
+
*
|
|
83
|
+
* @param resultsPaths One or more results files (e.g. baseline + agentic in
|
|
84
|
+
* literacy full mode). Missing paths are skipped.
|
|
85
|
+
* @param telemetry Shared reliability sink threaded into every extraction.
|
|
86
|
+
* @param log Pipeline logger — the junction is logged here on every run.
|
|
87
|
+
*/
|
|
88
|
+
export declare function extractGraderJudgmentsResilient(resultsPaths: readonly string[], telemetry: ExtractionTelemetry | undefined, log: Logger, options: ResilientExtractionOptions): Promise<GraderJudgment[]>;
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pipeline/extract-grader-judgments-resilient.ts
|
|
3
|
+
*
|
|
4
|
+
* Resilient grader-judgment extraction for the `calculate-scores` persist
|
|
5
|
+
* junction.
|
|
6
|
+
*
|
|
7
|
+
* Background: `calculateAndWriteScores` extracts grader judgments from the
|
|
8
|
+
* eval results file(s), then writes `grader-judgments.json` only when the
|
|
9
|
+
* array is non-empty (the `judgments.length > 0` guard). A runtime anomaly was
|
|
10
|
+
* observed where `extractGraderJudgments` returned 0 judgments for a results
|
|
11
|
+
* file that demonstrably contained classifiable llm-rubric components — the
|
|
12
|
+
* same file, read tens of milliseconds later by `extractStoredTestResults`,
|
|
13
|
+
* yielded the full set (entries with populated dimensions). The empty array
|
|
14
|
+
* silently skipped the write, so gap-analysis and compute-attribution skipped
|
|
15
|
+
* and the report shipped with a score but no tests.
|
|
16
|
+
*
|
|
17
|
+
* The committed code reads the file via a pure `readFileSync` with identical
|
|
18
|
+
* classification on both sides, so the divergence is not reproducible from the
|
|
19
|
+
* source + captured artifacts — it is a transient read anomaly at the live
|
|
20
|
+
* junction. This wrapper does not pretend to know the mechanism; it makes the
|
|
21
|
+
* junction observable and recovers from the transient:
|
|
22
|
+
*
|
|
23
|
+
* 1. **Instruments** — logs the resolved path(s), file size/mtime, parsed
|
|
24
|
+
* result count, and judgment count on every run (never silent on 0), so a
|
|
25
|
+
* future empty-judgments persist is diagnosable from the run log alone.
|
|
26
|
+
* 2. **Self-heals** — when extraction yields 0 judgments but a results file
|
|
27
|
+
* exists, it re-extracts with bounded retries. A later read that yields
|
|
28
|
+
* judgments proves the initial 0 was transient; the recovered judgments
|
|
29
|
+
* are returned. If every attempt yields 0, severity is decided by an
|
|
30
|
+
* independent classifiable-component count: a genuinely judgment-free run
|
|
31
|
+
* (all api-errors / no llm-rubric) logs a warning, while 0 judgments
|
|
32
|
+
* against present classifiable components logs an error (the downstream
|
|
33
|
+
* gap-analysis fail-loud guard is the backstop).
|
|
34
|
+
*
|
|
35
|
+
* The extractor and fs helpers are injected so the wrapper is unit-testable
|
|
36
|
+
* without importing the ~3000-line scoring module (which would be circular)
|
|
37
|
+
* or touching the real filesystem.
|
|
38
|
+
*/
|
|
39
|
+
import { existsSync, statSync } from "node:fs";
|
|
40
|
+
/**
 * Default `statFile` implementation: reports existence, size, and mtime for a
 * path using a single `statSync` call.
 *
 * Any stat failure is reported as "does not exist" instead of being thrown.
 * Checking existence first and stat-ing second would leave a window in which
 * the file can disappear and make this diagnostics helper throw.
 *
 * @param path Path to inspect.
 * @returns `{ exists, mtimeMs, size }`; zeros when the path cannot be stat-ed.
 */
const defaultStat = (path) => {
    try {
        const s = statSync(path);
        return { exists: true, mtimeMs: s.mtimeMs, size: s.size };
    }
    catch {
        return { exists: false, mtimeMs: 0, size: 0 };
    }
};
|
|
46
|
+
/** Default `sleep` implementation: a promise that resolves after `ms` milliseconds. */
const defaultSleep = (ms) => new Promise((wake) => {
    setTimeout(wake, ms);
});
|
|
47
|
+
/**
|
|
48
|
+
* Extract grader judgments across one or more results files, instrumented and
|
|
49
|
+
* self-healing. See the module header for the rationale.
|
|
50
|
+
*
|
|
51
|
+
* @param resultsPaths One or more results files (e.g. baseline + agentic in
|
|
52
|
+
* literacy full mode). Missing paths are skipped.
|
|
53
|
+
* @param telemetry Shared reliability sink threaded into every extraction.
|
|
54
|
+
* @param log Pipeline logger — the junction is logged here on every run.
|
|
55
|
+
*/
|
|
56
|
+
export async function extractGraderJudgmentsResilient(resultsPaths, telemetry, log, options) {
|
|
57
|
+
const { extract } = options.deps;
|
|
58
|
+
const statFile = options.deps.statFile ?? defaultStat;
|
|
59
|
+
const sleep = options.deps.sleep ?? defaultSleep;
|
|
60
|
+
const { countResults, countClassifiable } = options.deps;
|
|
61
|
+
const maxAttempts = Math.max(1, options.maxAttempts ?? 3);
|
|
62
|
+
const delayMs = options.delayMs ?? 200;
|
|
63
|
+
const present = resultsPaths.filter((p) => statFile(p).exists);
|
|
64
|
+
const extractAll = () => {
|
|
65
|
+
const all = [];
|
|
66
|
+
for (const p of present) {
|
|
67
|
+
all.push(...extract(p, telemetry));
|
|
68
|
+
}
|
|
69
|
+
return all;
|
|
70
|
+
};
|
|
71
|
+
const diag = (path) => {
|
|
72
|
+
const st = statFile(path);
|
|
73
|
+
return {
|
|
74
|
+
mtimeMs: st.mtimeMs,
|
|
75
|
+
path,
|
|
76
|
+
sizeBytes: st.size,
|
|
77
|
+
...(countResults ? { resultCount: countResults(path) } : {}),
|
|
78
|
+
};
|
|
79
|
+
};
|
|
80
|
+
// Attempt 1 — always instrument the junction so 0 is never silent.
|
|
81
|
+
let judgments = extractAll();
|
|
82
|
+
for (const p of present) {
|
|
83
|
+
log.info("Grader judgments — persist junction read", diag(p));
|
|
84
|
+
}
|
|
85
|
+
log.info(`Grader judgments extracted: ${judgments.length} total`, {
|
|
86
|
+
judgmentCount: judgments.length,
|
|
87
|
+
paths: present,
|
|
88
|
+
});
|
|
89
|
+
if (judgments.length > 0)
|
|
90
|
+
return judgments;
|
|
91
|
+
// 0 judgments and no results file present → genuinely nothing to grade.
|
|
92
|
+
// A missing file cannot become non-empty within the retry window.
|
|
93
|
+
if (present.length === 0) {
|
|
94
|
+
log.info("No grader judgments — no results file present (nothing to grade)");
|
|
95
|
+
return judgments;
|
|
96
|
+
}
|
|
97
|
+
// Results file(s) exist but extraction yielded 0 judgments — a suspected
|
|
98
|
+
// transient read anomaly. Loud diagnostic, then bounded self-heal retries;
|
|
99
|
+
// the same file read tens of ms later has been observed to yield the full set.
|
|
100
|
+
log.warn("Grader extraction returned 0 judgments despite present results file(s) — suspected transient read anomaly; attempting self-heal", { paths: present.map(diag) });
|
|
101
|
+
for (let attempt = 2; attempt <= maxAttempts; attempt++) {
|
|
102
|
+
await sleep(delayMs);
|
|
103
|
+
judgments = extractAll();
|
|
104
|
+
log.warn(`Grader self-heal attempt ${attempt}/${maxAttempts}: ${judgments.length} judgment(s)`);
|
|
105
|
+
if (judgments.length > 0) {
|
|
106
|
+
log.warn(`Grader self-heal recovered ${judgments.length} grader judgment(s) on attempt ${attempt} — the initial empty extraction was a transient read anomaly`);
|
|
107
|
+
return judgments;
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
// Still empty after every attempt. Severity depends on whether the files
|
|
111
|
+
// actually contain classifiable components.
|
|
112
|
+
const classifiable = countClassifiable
|
|
113
|
+
? present.reduce((n, p) => n + countClassifiable(p), 0)
|
|
114
|
+
: undefined;
|
|
115
|
+
if (classifiable === 0) {
|
|
116
|
+
log.warn(`No grader judgments after ${maxAttempts} attempt(s) — results contain no classifiable llm-rubric components (e.g. all api-errors); nothing to persist`);
|
|
117
|
+
}
|
|
118
|
+
else {
|
|
119
|
+
log.error(`Grader judgments empty after ${maxAttempts} attempt(s) but ${classifiable ?? "an unknown number of"} classifiable component(s) present in the results file(s) — persisting none; downstream gap-analysis/attribution will fail loud`, { paths: present.map(diag) });
|
|
120
|
+
}
|
|
121
|
+
return judgments;
|
|
122
|
+
}
|
package/dist/report-store.d.ts
CHANGED
|
@@ -216,6 +216,7 @@ export interface SanityReportDoc {
|
|
|
216
216
|
_type: string;
|
|
217
217
|
comparison: null | Omit<ComparisonReport, "baseline" | "experiment">;
|
|
218
218
|
completedAt: string;
|
|
219
|
+
degraded?: Report["degraded"];
|
|
219
220
|
durationMs: number;
|
|
220
221
|
provenance: Report["provenance"];
|
|
221
222
|
reportId: ReportId;
|
package/dist/report-store.js
CHANGED
|
@@ -477,6 +477,7 @@ export function toSanityReportDoc(report) {
|
|
|
477
477
|
_type: REPORT_TYPE,
|
|
478
478
|
comparison,
|
|
479
479
|
completedAt: report.completedAt,
|
|
480
|
+
...(report.degraded ? { degraded: report.degraded } : {}),
|
|
480
481
|
durationMs: report.durationMs,
|
|
481
482
|
provenance: report.provenance,
|
|
482
483
|
reportId: report.id,
|
|
@@ -526,6 +527,7 @@ export function toReport(doc) {
|
|
|
526
527
|
artifactManifest,
|
|
527
528
|
comparison: doc.comparison,
|
|
528
529
|
completedAt: doc.completedAt,
|
|
530
|
+
degraded: doc.degraded,
|
|
529
531
|
durationMs: doc.durationMs,
|
|
530
532
|
id: doc.reportId,
|
|
531
533
|
provenance: doc.provenance,
|