@sanity/ailf 7.0.1 → 7.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/config/rubrics.ts +12 -13
  2. package/dist/_vendor/ailf-core/ports/context.d.ts +45 -3
  3. package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
  4. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +9 -1
  5. package/dist/_vendor/ailf-core/schemas/branded-string.js +16 -6
  6. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -0
  7. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
  8. package/dist/_vendor/ailf-core/schemas/report.d.ts +12 -0
  9. package/dist/_vendor/ailf-core/schemas/report.js +2 -0
  10. package/dist/_vendor/ailf-core/schemas/team.d.ts +22 -0
  11. package/dist/_vendor/ailf-core/schemas/team.js +63 -0
  12. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +51 -0
  13. package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
  14. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +17 -0
  15. package/dist/_vendor/ailf-core/types/team.d.ts +65 -0
  16. package/dist/_vendor/ailf-core/types/team.js +1 -0
  17. package/dist/_vendor/ailf-shared/eval-modes.d.ts +2 -0
  18. package/dist/_vendor/ailf-shared/eval-modes.js +5 -0
  19. package/dist/_vendor/ailf-shared/event-types.d.ts +15 -0
  20. package/dist/_vendor/ailf-shared/event-types.js +23 -0
  21. package/dist/_vendor/ailf-shared/generated/help-content.js +2 -2
  22. package/dist/_vendor/ailf-shared/index.d.ts +4 -2
  23. package/dist/_vendor/ailf-shared/index.js +4 -2
  24. package/dist/_vendor/ailf-shared/member-roles.d.ts +16 -0
  25. package/dist/_vendor/ailf-shared/member-roles.js +16 -0
  26. package/dist/_vendor/ailf-shared/owner-teams.d.ts +19 -0
  27. package/dist/_vendor/ailf-shared/owner-teams.js +7 -0
  28. package/dist/_vendor/ailf-shared/run-context.d.ts +8 -1
  29. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +65 -1
  30. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +35 -0
  31. package/dist/adapters/task-sources/changed-docs-filter.d.ts +12 -0
  32. package/dist/adapters/task-sources/changed-docs-filter.js +30 -0
  33. package/dist/adapters/task-sources/content-lake-task-source.js +2 -1
  34. package/dist/adapters/task-sources/repo-task-source.js +2 -1
  35. package/dist/commands/pipeline-action.d.ts +4 -3
  36. package/dist/commands/pipeline-action.js +7 -5
  37. package/dist/commands/run.js +2 -2
  38. package/dist/config/rubrics.ts +12 -13
  39. package/dist/job-store.d.ts +18 -0
  40. package/dist/job-store.js +34 -0
  41. package/dist/orchestration/build-app-context.js +8 -1
  42. package/dist/orchestration/pipeline-orchestrator.js +46 -1
  43. package/dist/orchestration/steps/compare-step.d.ts +7 -0
  44. package/dist/orchestration/steps/compare-step.js +59 -23
  45. package/dist/orchestration/steps/fetch-docs-step.js +3 -0
  46. package/dist/orchestration/steps/finalize-run-step.js +2 -0
  47. package/dist/orchestration/steps/generate-configs-step.d.ts +32 -1
  48. package/dist/orchestration/steps/generate-configs-step.js +47 -13
  49. package/dist/orchestration/steps/grader-consistency-step.js +11 -0
  50. package/dist/orchestration/steps/publish-report-step.d.ts +12 -1
  51. package/dist/orchestration/steps/publish-report-step.js +19 -3
  52. package/dist/pipeline/cache-hit-restore.d.ts +14 -1
  53. package/dist/pipeline/cache-hit-restore.js +17 -0
  54. package/dist/pipeline/calculate-scores.js +57 -21
  55. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +7 -2
  56. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +13 -4
  57. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +1 -1
  58. package/dist/pipeline/compiler/provider-assembler.d.ts +15 -1
  59. package/dist/pipeline/compiler/provider-assembler.js +16 -3
  60. package/dist/pipeline/failure-modes.d.ts +20 -10
  61. package/dist/pipeline/failure-modes.js +84 -15
  62. package/dist/pipeline/map-request-to-config.js +2 -0
  63. package/dist/pipeline/normalize-mode.d.ts +1 -1
  64. package/dist/pipeline/normalize-mode.js +2 -0
  65. package/dist/pipeline/run-context.d.ts +16 -1
  66. package/dist/pipeline/run-context.js +12 -1
  67. package/dist/pipeline/validate.d.ts +8 -4
  68. package/dist/pipeline/validate.js +8 -18
  69. package/dist/report-store.d.ts +14 -1
  70. package/dist/report-store.js +32 -0
  71. package/dist/sanity/client.js +2 -2
  72. package/package.json +1 -1
@@ -20,15 +20,17 @@
20
20
  export { computeCanaryDrift, type CanaryDriftReport, type CanaryReportSlim, type DriftEntry, type DriftThresholds, type DriftVerdict, } from "./canary-drift.js";
21
21
  export { type DocumentRef } from "./document-ref.js";
22
22
  export { makeEditorialReference, type EditorialReference, type MakeEditorialReferenceArgs, } from "./editorial-reference.js";
23
+ export { isKnownEventType, KNOWN_EVENT_TYPES, type EventType, type KnownEventType, } from "./event-types.js";
23
24
  export { FEATURE_FLAGS, type FeatureFlag, type FeatureFlagKey, } from "./feature-flags.js";
24
25
  export { DEFAULT_GCS_ARTIFACT_BUCKET } from "./gcs-defaults.js";
25
26
  export { GLOSSARY, type GlossaryEntry, type GlossarySlug } from "./glossary.js";
26
27
  export { HELP_TOPICS } from "./help-content.js";
27
28
  export { type HelpTopic } from "./help-topics.js";
29
+ export { isKnownMemberRole, KNOWN_MEMBER_ROLES, type KnownMemberRole, type MemberRole, } from "./member-roles.js";
28
30
  export { GRADE_BOUNDARIES, scoreGrade, type ScoreGrade, } from "./score-grades.js";
29
31
  export { NOISE_THRESHOLD } from "./noise-threshold.js";
30
- export { CANONICAL_EVAL_MODES, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, type EvalMode, type LiteracyVariant, type RawEvalMode, } from "./eval-modes.js";
31
- export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, } from "./owner-teams.js";
32
+ export { CANONICAL_EVAL_MODES, isLiteracyVariant, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, type EvalMode, type LiteracyVariant, type RawEvalMode, } from "./eval-modes.js";
33
+ export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, resolveTeamRef, type SlugLike, } from "./owner-teams.js";
32
34
  export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, type RunClassification, type RunExecutor, type RunExecutorSurface, type RunExecutorSystem, type RunExecutorUser, type RunHost, type RunLineage, type RunOwner, type RunTool, } from "./run-classification.js";
33
35
  export { type RunTrigger } from "./run-trigger.js";
34
36
  export { type RunContext } from "./run-context.js";
@@ -19,12 +19,14 @@
19
19
  */
20
20
  export { computeCanaryDrift, } from "./canary-drift.js";
21
21
  export { makeEditorialReference, } from "./editorial-reference.js";
22
+ export { isKnownEventType, KNOWN_EVENT_TYPES, } from "./event-types.js";
22
23
  export { FEATURE_FLAGS, } from "./feature-flags.js";
23
24
  export { DEFAULT_GCS_ARTIFACT_BUCKET } from "./gcs-defaults.js";
24
25
  export { GLOSSARY } from "./glossary.js";
25
26
  export { HELP_TOPICS } from "./help-content.js";
27
+ export { isKnownMemberRole, KNOWN_MEMBER_ROLES, } from "./member-roles.js";
26
28
  export { GRADE_BOUNDARIES, scoreGrade, } from "./score-grades.js";
27
29
  export { NOISE_THRESHOLD } from "./noise-threshold.js";
28
- export { CANONICAL_EVAL_MODES, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, } from "./eval-modes.js";
29
- export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, } from "./owner-teams.js";
30
+ export { CANONICAL_EVAL_MODES, isLiteracyVariant, LEGACY_EVAL_MODE_ALIASES, LITERACY_VARIANTS, RAW_EVAL_MODES, } from "./eval-modes.js";
31
+ export { isKnownOwnerTeam, KNOWN_OWNER_TEAMS, normalizeOwnerTeam, resolveTeamRef, } from "./owner-teams.js";
30
32
  export { isRunClassification, RUN_CLASSIFICATIONS, RUN_EXECUTOR_SURFACES, } from "./run-classification.js";
@@ -0,0 +1,16 @@
1
+ /**
2
+ * Known team-member roles and soft-enum helpers.
3
+ *
4
+ * Roles are free-form strings — teams can introduce custom roles (e.g.
5
+ * `"reviewer"`, `"sme"`) without a code change. This module seeds Studio
6
+ * comboboxes with canonical values and exposes a narrowing predicate
7
+ * without closing the enum.
8
+ *
9
+ * Parallel to the same type aliases in `@sanity/ailf-core`'s team module:
10
+ * `shared` is the leaf of the dependency graph, so the studio schema can
11
+ * import the runtime tuple without pulling in core.
12
+ */
13
+ export declare const KNOWN_MEMBER_ROLES: readonly ["lead", "member", "oncall"];
14
+ export type KnownMemberRole = (typeof KNOWN_MEMBER_ROLES)[number];
15
+ export type MemberRole = KnownMemberRole | (string & {});
16
+ export declare function isKnownMemberRole(value: string): value is KnownMemberRole;
@@ -0,0 +1,16 @@
1
+ /**
2
+ * Known team-member roles and soft-enum helpers.
3
+ *
4
+ * Roles are free-form strings — teams can introduce custom roles (e.g.
5
+ * `"reviewer"`, `"sme"`) without a code change. This module seeds Studio
6
+ * comboboxes with canonical values and exposes a narrowing predicate
7
+ * without closing the enum.
8
+ *
9
+ * Parallel to the same type aliases in `@sanity/ailf-core`'s team module:
10
+ * `shared` is the leaf of the dependency graph, so the studio schema can
11
+ * import the runtime tuple without pulling in core.
12
+ */
13
+ export const KNOWN_MEMBER_ROLES = ["lead", "member", "oncall"];
14
+ export function isKnownMemberRole(value) {
15
+ return KNOWN_MEMBER_ROLES.includes(value);
16
+ }
@@ -24,3 +24,22 @@ export declare const KNOWN_OWNER_TEAMS: readonly string[];
24
24
  */
25
25
  export declare function normalizeOwnerTeam(value: string | undefined | null): string;
26
26
  export declare function isKnownOwnerTeam(value: string): boolean;
27
+ /**
28
+ * Lightweight team lookup against an in-memory team list.
29
+ *
30
+ * Consumers fetch team docs via GROQ then call this helper to resolve a
31
+ * freeform `owner.team` string (D0037) into a Sanity reference. Unknown
32
+ * strings return null. Known aliases (via OWNER_TEAM_ALIASES) are honored.
33
+ *
34
+ * Returning null is the cue for the UI to render an "unresolved team"
35
+ * badge — not an error condition.
36
+ */
37
+ export interface SlugLike {
38
+ _id: string;
39
+ slug: {
40
+ current: string;
41
+ };
42
+ }
43
+ export declare function resolveTeamRef(value: string | null | undefined, teams: readonly SlugLike[]): {
44
+ _ref: string;
45
+ } | null;
@@ -50,3 +50,10 @@ export function normalizeOwnerTeam(value) {
50
50
  export function isKnownOwnerTeam(value) {
51
51
  return KNOWN_OWNER_TEAMS.includes(value);
52
52
  }
53
+ export function resolveTeamRef(value, teams) {
54
+ const normalized = normalizeOwnerTeam(value ?? undefined);
55
+ if (!normalized || normalized === "unknown")
56
+ return null;
57
+ const match = teams.find((t) => t.slug?.current === normalized);
58
+ return match ? { _ref: match._id } : null;
59
+ }
@@ -14,7 +14,7 @@
14
14
  * @see docs/decisions/D0032-run-anchored-artifact-store.md
15
15
  * @see docs/design-docs/run-artifact-store.md (§ Drift Prevention)
16
16
  */
17
- import type { EvalMode } from "./eval-modes.js";
17
+ import type { EvalMode, LiteracyVariant } from "./eval-modes.js";
18
18
  import type { RunClassification, RunExecutor, RunHost, RunLineage, RunOwner, RunTool } from "./run-classification.js";
19
19
  import type { RunTrigger } from "./run-trigger.js";
20
20
  export interface RunContext {
@@ -75,4 +75,11 @@ export interface RunContext {
75
75
  tool?: RunTool;
76
76
  /** What initiated this run */
77
77
  trigger: RunTrigger;
78
+ /**
79
+ * Literacy mode variant — `baseline`, `agentic`, `observed`, or `full`.
80
+ * Only meaningful when `mode === "literacy"`; absent for other modes.
81
+ * Surfaced on `ReportProvenance` so dashboards can disambiguate which
82
+ * variant produced a given report.
83
+ */
84
+ variant?: LiteracyVariant;
78
85
  }
@@ -99,4 +99,68 @@ export declare const GraderJudgmentSchema: z.ZodObject<{
99
99
  graderJudgmentsVersion: z.ZodString;
100
100
  }, z.core.$strip>;
101
101
  }, z.core.$strict>;
102
- export type { GraderJudgment } from "../../_vendor/ailf-core/index.d.ts";
102
+ /**
103
+ * Wire-format schema — what the grader LLM is asked to emit.
104
+ *
105
+ * This is the subset of {@link GraderJudgmentSchema} that contains only
106
+ * fields the LLM can correctly produce. The pipeline parses untrusted
107
+ * grader output against this shape, then synthesizes the four
108
+ * pipeline-owned fields (`judgmentId`, `metadata.graderModel`,
109
+ * `metadata.graderJudgmentsVersion`, `hallucinationCheckedAgainst`) plus
110
+ * the three result-context fields (`taskId`, `modelId`, `dimension`) to
111
+ * build the full {@link GraderJudgment} storage shape.
112
+ *
113
+ * `.strict()` is retained — the LLM is told exactly which keys to emit,
114
+ * so any extras are either prompt-injection attempts or noise we want
115
+ * to surface as parse failures (which then drop to the
116
+ * `synthesizeUnparsedJudgment` fallback). The bar to hit that fallback
117
+ * is much higher than before W0273 — previously a missing pipeline-owned
118
+ * field tripped it on every emission.
119
+ *
120
+ * Asserts `satisfies z.ZodType<GraderEmittedJudgment>` against the
121
+ * independently-authored core type (D0045). The trust-boundary CI gate
122
+ * (`pnpm check-trust-boundary-satisfies`) covers this file.
123
+ *
124
+ * @see docs/audits/2026-05-22-empty-gap-analysis-regression.md
125
+ */
126
+ export declare const GraderEmittedJudgmentSchema: z.ZodObject<{
127
+ score: z.ZodNumber;
128
+ reason: z.ZodString;
129
+ failureMode: z.ZodString;
130
+ subJudgments: z.ZodArray<z.ZodObject<{
131
+ criterionId: z.ZodString;
132
+ met: z.ZodBoolean;
133
+ evidence: z.ZodString;
134
+ confidence: z.ZodObject<{
135
+ level: z.ZodEnum<{
136
+ low: "low";
137
+ medium: "medium";
138
+ high: "high";
139
+ }>;
140
+ signalsPresent: z.ZodNumber;
141
+ derivation: z.ZodString;
142
+ }, z.core.$strip>;
143
+ }, z.core.$strip>>;
144
+ docCitations: z.ZodArray<z.ZodObject<{
145
+ documentId: z.ZodString;
146
+ slug: z.ZodOptional<z.ZodString>;
147
+ role: z.ZodEnum<{
148
+ supports: "supports";
149
+ contradicts: "contradicts";
150
+ missing: "missing";
151
+ irrelevant: "irrelevant";
152
+ }>;
153
+ hallucinated: z.ZodOptional<z.ZodBoolean>;
154
+ }, z.core.$strip>>;
155
+ confidence: z.ZodObject<{
156
+ level: z.ZodEnum<{
157
+ low: "low";
158
+ medium: "medium";
159
+ high: "high";
160
+ }>;
161
+ signalsPresent: z.ZodNumber;
162
+ derivation: z.ZodString;
163
+ }, z.core.$strip>;
164
+ outputFailure: z.ZodOptional<z.ZodBoolean>;
165
+ }, z.core.$strict>;
166
+ export type { GraderEmittedJudgment, GraderJudgment } from "../../_vendor/ailf-core/index.d.ts";
@@ -91,3 +91,38 @@ export const GraderJudgmentSchema = z
91
91
  }),
92
92
  })
93
93
  .strict();
94
+ /**
95
+ * Wire-format schema — what the grader LLM is asked to emit.
96
+ *
97
+ * This is the subset of {@link GraderJudgmentSchema} that contains only
98
+ * fields the LLM can correctly produce. The pipeline parses untrusted
99
+ * grader output against this shape, then synthesizes the four
100
+ * pipeline-owned fields (`judgmentId`, `metadata.graderModel`,
101
+ * `metadata.graderJudgmentsVersion`, `hallucinationCheckedAgainst`) plus
102
+ * the three result-context fields (`taskId`, `modelId`, `dimension`) to
103
+ * build the full {@link GraderJudgment} storage shape.
104
+ *
105
+ * `.strict()` is retained — the LLM is told exactly which keys to emit,
106
+ * so any extras are either prompt-injection attempts or noise we want
107
+ * to surface as parse failures (which then drop to the
108
+ * `synthesizeUnparsedJudgment` fallback). The bar to hit that fallback
109
+ * is much higher than before W0273 — previously a missing pipeline-owned
110
+ * field tripped it on every emission.
111
+ *
112
+ * Asserts `satisfies z.ZodType<GraderEmittedJudgment>` against the
113
+ * independently-authored core type (D0045). The trust-boundary CI gate
114
+ * (`pnpm check-trust-boundary-satisfies`) covers this file.
115
+ *
116
+ * @see docs/audits/2026-05-22-empty-gap-analysis-regression.md
117
+ */
118
+ export const GraderEmittedJudgmentSchema = z
119
+ .object({
120
+ score: z.number(),
121
+ reason: z.string(),
122
+ failureMode: z.string(),
123
+ subJudgments: z.array(CriterionSubJudgmentSchema),
124
+ docCitations: z.array(DocCitationSchema),
125
+ confidence: ConfidenceSchema,
126
+ outputFailure: z.boolean().optional(),
127
+ })
128
+ .strict();
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Filter tasks by changed doc slugs.
3
+ *
4
+ * When `changedDocs` is set and non-empty, returns only tasks whose
5
+ * `context.docs[*].slug` intersects the provided list. Tasks without a
6
+ * `context.docs` array (e.g. knowledge-probe, mcp-server, agent-harness
7
+ * modes) are excluded — there's no way for them to "touch" a doc slug.
8
+ *
9
+ * An empty or undefined `changedDocs` is a no-op (returns input).
10
+ */
11
+ import type { GeneralizedTaskDefinition } from "../../_vendor/ailf-core/index.d.ts";
12
+ export declare function filterByChangedDocs(tasks: readonly GeneralizedTaskDefinition[], changedDocs: readonly string[] | undefined): GeneralizedTaskDefinition[];
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Every variant of `GeneralizedTaskDefinition` declares an optional
3
+ * `context?: { docs?: GeneralizedDocRef[]; ... }`, so structural access
4
+ * narrows correctly without an `as` cast. Returns `undefined` when the
5
+ * task carries no doc refs.
6
+ */
7
+ function taskContextDocs(task) {
8
+ return task.context?.docs;
9
+ }
10
+ /**
11
+ * `GeneralizedDocRef` is a 4-way union — only `SlugDocRef` and `IdDocRef`
12
+ * carry a `slug`. Returns `undefined` for path / perspective refs.
13
+ */
14
+ function docSlug(ref) {
15
+ return "slug" in ref ? ref.slug : undefined;
16
+ }
17
+ export function filterByChangedDocs(tasks, changedDocs) {
18
+ if (!changedDocs || changedDocs.length === 0)
19
+ return [...tasks];
20
+ const wanted = new Set(changedDocs);
21
+ return tasks.filter((task) => {
22
+ const docs = taskContextDocs(task);
23
+ if (!docs || docs.length === 0)
24
+ return false;
25
+ return docs.some((d) => {
26
+ const slug = docSlug(d);
27
+ return slug != null && wanted.has(slug);
28
+ });
29
+ });
30
+ }
@@ -15,6 +15,7 @@
15
15
  * @see packages/core/src/ports/task-source.ts — TaskSource port
16
16
  * @see docs/decisions/D0038-content-lake-authorable-task-modes.md
17
17
  */
18
+ import { filterByChangedDocs } from "./changed-docs-filter.js";
18
19
  import { ContentLakeAuthorableTaskSchema } from "./repo-schemas.js";
19
20
  // ---------------------------------------------------------------------------
20
21
  // GROQ query — fetches ailf.task documents with resolved references
@@ -127,7 +128,7 @@ export class ContentLakeTaskSource {
127
128
  console.warn(" ⚠️ ContentLakeTaskSource: no ailf.task documents found in the Content Lake. " +
128
129
  "Have you run the migration (Phase 3) or created tasks in Studio?");
129
130
  }
130
- return definitions;
131
+ return filterByChangedDocs(definitions, filter?.changedDocs);
131
132
  }
132
133
  }
133
134
  // ---------------------------------------------------------------------------
@@ -22,6 +22,7 @@ import { existsSync, readdirSync, readFileSync } from "fs";
22
22
  import { resolve } from "path";
23
23
  import { load } from "js-yaml";
24
24
  import { CANONICAL_EVAL_MODES } from "../../_vendor/ailf-shared/index.js";
25
+ import { filterByChangedDocs } from "./changed-docs-filter.js";
25
26
  import { detectLegacyFieldNames, migratePromptShape, parseCanonicalTaskFile, } from "./repo-schemas.js";
26
27
  import { discoverTsTaskFiles, loadTsTaskFile } from "./task-file-loader.js";
27
28
  /** Set of canonical mode names for O(1) lookup */
@@ -111,7 +112,7 @@ export class RepoTaskSource {
111
112
  }
112
113
  }
113
114
  }
114
- return definitions;
115
+ return filterByChangedDocs(definitions, filter?.changedDocs);
115
116
  }
116
117
  }
117
118
  // ---------------------------------------------------------------------------
@@ -98,13 +98,14 @@ export declare function computeResolvedOptions(opts: PipelineCliOptions): Resolv
98
98
  /**
99
99
  * Determine whether the post-run diagnosis summary hook should fire.
100
100
  *
101
- * 4-level precedence chain (D6-20):
101
+ * 4-level precedence chain (D0054):
102
102
  * Level 1 — CLI flag (absolute): if `cliOpts.summary` is boolean, use it.
103
103
  * Level 2 — AILF_INTERPRET_ON_RUN env var (absolute): strict "1"/"0" parse;
104
104
  * anything else falls through (T-06-11 spoofing mitigation).
105
105
  * Level 3 — config `summary.onRun` (absolute): "always" → true; "never" → false;
106
106
  * "auto" or absent falls through to level 4.
107
- * Level 4 — default auto: TTY && !CI (SC1 default-off in CI).
107
+ * Level 4 — default always-on: diagnosis is a first-class output produced
108
+ * for every pipeline run unless explicitly opted out at levels 1–3.
108
109
  */
109
110
  export declare function shouldRunPostSummary(cliOpts: PipelineCliOptions, resolvedOnRun: "auto" | "always" | "never" | undefined): boolean;
110
111
  export declare function buildSynthesisTelemetry(diagnosis: Diagnosis): SynthesisCostTelemetry;
@@ -113,7 +114,7 @@ export declare function buildSynthesisTelemetry(diagnosis: Diagnosis): Synthesis
113
114
  *
114
115
  * Fires after orchestratePipeline() + writePipelineResult() (D6-02).
115
116
  * Hook failure prints to stderr but does NOT change exit code (D6-03).
116
- * CI default-off: fires only when shouldRunPostSummary returns true (D6-20).
117
+ * Fires whenever shouldRunPostSummary returns true (D0054 — default on).
117
118
  *
118
119
  * @param ctx - App context (composition root wiring)
119
120
  * @param result - Pipeline result (includes reportId when published)
@@ -388,13 +388,14 @@ function resolvePublishAuto(repoValue) {
388
388
  /**
389
389
  * Determine whether the post-run diagnosis summary hook should fire.
390
390
  *
391
- * 4-level precedence chain (D6-20):
391
+ * 4-level precedence chain (D0054):
392
392
  * Level 1 — CLI flag (absolute): if `cliOpts.summary` is boolean, use it.
393
393
  * Level 2 — AILF_INTERPRET_ON_RUN env var (absolute): strict "1"/"0" parse;
394
394
  * anything else falls through (T-06-11 spoofing mitigation).
395
395
  * Level 3 — config `summary.onRun` (absolute): "always" → true; "never" → false;
396
396
  * "auto" or absent falls through to level 4.
397
- * Level 4 — default auto: TTY && !CI (SC1 default-off in CI).
397
+ * Level 4 — default always-on: diagnosis is a first-class output produced
398
+ * for every pipeline run unless explicitly opted out at levels 1–3.
398
399
  */
399
400
  export function shouldRunPostSummary(cliOpts, resolvedOnRun) {
400
401
  // Level 1: CLI flag wins absolutely
@@ -415,8 +416,9 @@ export function shouldRunPostSummary(cliOpts, resolvedOnRun) {
415
416
  if (resolvedOnRun === "never")
416
417
  return false;
417
418
  // "auto" or undefined falls through
418
- // Level 4: default auto — fire only when stdout is interactive and not in CI
419
- return Boolean(process.stdout.isTTY) && process.env.CI !== "true";
419
+ // Level 4: diagnosis is on by default — emit a diagnosis artifact for every
420
+ // pipeline run unless an upstream level explicitly opted out (D0054).
421
+ return true;
420
422
  }
421
423
  /**
422
424
  * Build a SynthesisCostTelemetry payload from a completed Diagnosis.
@@ -479,7 +481,7 @@ export function buildSynthesisTelemetry(diagnosis) {
479
481
  *
480
482
  * Fires after orchestratePipeline() + writePipelineResult() (D6-02).
481
483
  * Hook failure prints to stderr but does NOT change exit code (D6-03).
482
- * CI default-off: fires only when shouldRunPostSummary returns true (D6-20).
484
+ * Fires whenever shouldRunPostSummary returns true (D0054 — default on).
483
485
  *
484
486
  * @param ctx - App context (composition root wiring)
485
487
  * @param result - Pipeline result (includes reportId when published)
@@ -43,8 +43,8 @@ export function createRunCommand() {
43
43
  .option("-p, --publish", "Write report to Sanity + fan out to sinks (auto-enabled for full runs when report store is configured)")
44
44
  .option("--no-publish", "Suppress auto-publishing")
45
45
  .option("--publish-tag <tag>", "Label for published report")
46
- .option("--summary", "Force post-run diagnosis summary (overrides config and CI default-off)")
47
- .option("--no-summary", "Suppress post-run diagnosis summary")
46
+ .option("--summary", "Force post-run diagnosis summary (overrides config and env var)")
47
+ .option("--no-summary", "Suppress post-run diagnosis summary (default is on)")
48
48
  .option("--config <path>", "Load pipeline config from a TS/JS/YAML/JSON file (overrides most CLI flags)")
49
49
  .option("-o, --output <path>", "Write PR comment markdown to file")
50
50
  .option("--promptfoo-url <url>", "Promptfoo share URL for report")
@@ -15,10 +15,6 @@ import { defineRubrics } from "../_vendor/ailf-core/index.js"
15
15
  // template entry below. Source of truth lives in packages/eval/src/grader/;
16
16
  // the helper picks the right list by dimension family.
17
17
  import { failureModesForDimension } from "../grader/index.js"
18
- // Single source of truth for the wire-format version stamped into the
19
- // grader-prompt footer (VER-01 D-02). Interpolated below so the
20
- // announced version cannot drift from the schema's expected value.
21
- import { graderJudgmentsVersion } from "../adapters/grader-outputs/index.js"
22
18
 
23
19
  export default defineRubrics({
24
20
  templates: {
@@ -242,20 +238,23 @@ export default defineRubrics({
242
238
  "agent-harness": { gold: "agent-harness" },
243
239
  },
244
240
 
245
- // Phase 3 GRAD-05 (Plan 03-01) — structured GraderJudgment JSON sketch.
246
- // Documents the target wire format the grader emits. The strict schema's
247
- // GRAD-02 additive fields stay optional in this plan; Plan 03-04 flips
248
- // them to required and bumps graderJudgmentsVersion to 1.0.0.
241
+ // W0273 — the footer documents the wire-format subset of GraderJudgment
242
+ // that the grader LLM actually controls. The pipeline parses this against
243
+ // GraderEmittedJudgmentSchema and then synthesizes the four pipeline-owned
244
+ // fields (judgmentId, metadata.{graderModel,graderJudgmentsVersion},
245
+ // hallucinationCheckedAgainst) to build the storage GraderJudgment.
246
+ //
247
+ // See docs/audits/2026-05-22-empty-gap-analysis-regression.md for the
248
+ // rationale (Phase 3 GRAD-05 made these fields required + .strict(),
249
+ // and asking the LLM for pipeline-owned values caused 100% parse
250
+ // failures starting 2026-05-11).
249
251
  footer: `Return ONLY a JSON object with this exact shape:
250
252
  {
251
- "judgmentId": "<string>",
252
253
  "score": <number 0-100>,
253
254
  "reason": "<explanation, ≤500 chars>",
255
+ "failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
254
256
  "subJudgments": [{ "criterionId": "<id>", "met": <bool>, "evidence": "<≤280 chars>", "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" } }],
255
257
  "docCitations": [{ "documentId": "<id>", "slug": "<optional slug>", "role": "supports|contradicts|missing|irrelevant", "hallucinated": <bool> }],
256
- "failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
257
- "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" },
258
- "hallucinationCheckedAgainst": ["<doc id>"],
259
- "metadata": { "graderModel": "<string>", "graderJudgmentsVersion": "${graderJudgmentsVersion}" }
258
+ "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" }
260
259
  }`,
261
260
  })
@@ -101,4 +101,22 @@ export declare class JobStore {
101
101
  * Update a job's status and optional associated data.
102
102
  */
103
103
  updateJob(jobId: string, update: Partial<Pick<JobDocument, "completedAt" | "error" | "execution" | "progress" | "reportId" | "startedAt" | "status">>): Promise<boolean>;
104
+ /**
105
+ * Patch the parent ailf.evalRequest doc when this job reaches a terminal
106
+ * state. The webhook handler writes `jobId` onto the evalRequest at
107
+ * dispatch time, so we look the parent up by that field.
108
+ *
109
+ * Best-effort: returns `false` on lookup miss or write failure, never
110
+ * throws. Closes the S1-B gap from the 2026-05-24 new-eval audit — until
111
+ * this runs, evalRequest docs stay `status: "dispatched"` indefinitely
112
+ * and the dashboard can't surface completion or errors to users.
113
+ *
114
+ * @returns true on successful patch, false on lookup miss or write error
115
+ */
116
+ patchEvalRequestForJob(jobId: string, patch: {
117
+ status: "completed" | "failed";
118
+ completedAt: string;
119
+ reportId?: string;
120
+ error?: string;
121
+ }): Promise<boolean>;
104
122
  }
package/dist/job-store.js CHANGED
@@ -149,6 +149,40 @@ export class JobStore {
149
149
  return false;
150
150
  }
151
151
  }
152
+ /**
153
+ * Patch the parent ailf.evalRequest doc when this job reaches a terminal
154
+ * state. The webhook handler writes `jobId` onto the evalRequest at
155
+ * dispatch time, so we look the parent up by that field.
156
+ *
157
+ * Best-effort: returns `false` on lookup miss or write failure, never
158
+ * throws. Closes the S1-B gap from the 2026-05-24 new-eval audit — until
159
+ * this runs, evalRequest docs stay `status: "dispatched"` indefinitely
160
+ * and the dashboard can't surface completion or errors to users.
161
+ *
162
+ * @returns true on successful patch, false on lookup miss or write error
163
+ */
164
+ async patchEvalRequestForJob(jobId, patch) {
165
+ try {
166
+ const evalRequest = await this.client.fetch(`*[_type == "ailf.evalRequest" && jobId == $jobId][0]{_id}`, { jobId });
167
+ if (!evalRequest?._id) {
168
+ return false;
169
+ }
170
+ await this.client
171
+ .patch(evalRequest._id)
172
+ .set({
173
+ status: patch.status,
174
+ completedAt: patch.completedAt,
175
+ ...(patch.reportId ? { reportId: patch.reportId } : {}),
176
+ ...(patch.error ? { error: patch.error } : {}),
177
+ })
178
+ .commit();
179
+ return true;
180
+ }
181
+ catch (error) {
182
+ console.warn(` ⚠️ Failed to patch ailf.evalRequest for jobId ${jobId}: ${error instanceof Error ? error.message : String(error)}`);
183
+ return false;
184
+ }
185
+ }
152
186
  }
153
187
  // ---------------------------------------------------------------------------
154
188
  // Helpers
@@ -8,6 +8,7 @@
8
8
  * Once all commands construct ResolvedConfig directly (or use --config),
9
9
  * this bridge can be deleted.
10
10
  */
11
+ import { isLiteracyVariant } from "../_vendor/ailf-shared/index.js";
11
12
  import { createAppContext } from "../composition-root.js";
12
13
  import { tryLoadConfigFile } from "../pipeline/compiler/config-loader.js";
13
14
  /**
@@ -18,10 +19,16 @@ import { tryLoadConfigFile } from "../pipeline/compiler/config-loader.js";
18
19
  * are derived (e.g., areas from areaOption).
19
20
  */
20
21
  export function mapToResolvedConfig(opts, rootDir) {
22
+ // `opts.variant` is a free-form string from CLI / config flags; narrow it
23
+ // to the closed `LiteracyVariant` set so downstream consumers (the report
24
+ // provenance derivation, in particular) never see a bogus string.
25
+ // Unknown values silently drop to undefined — the legacy behavior — but a
26
+ // narrowing surface is in place for the day we want to error here.
27
+ const variant = isLiteracyVariant(opts.variant) ? opts.variant : undefined;
21
28
  return {
22
29
  rootDir,
23
30
  mode: opts.mode,
24
- variant: opts.variant,
31
+ variant,
25
32
  noAutoScope: opts.noAutoScope ?? false,
26
33
  debug: opts.debug,
27
34
  areas: opts.areaOption
@@ -69,6 +69,35 @@ async function reportJobProgress(ctx, stepName, completedSteps, totalSteps, stat
69
69
  ctx.logger.warn(`Failed to report job progress for step "${stepName}" — continuing`);
70
70
  }
71
71
  }
72
+ /**
73
+ * Patch the parent ailf.evalRequest doc when the underlying ailf.job
74
+ * reaches a terminal state (completed | failed).
75
+ *
76
+ * Thin wrapper over `JobStore.patchEvalRequestForJob` that constructs the
77
+ * store (env-driven token) and logs through `ctx.logger`. Best-effort:
78
+ * never throws; a `false` result is logged at debug, a thrown error at warn.
79
+ *
80
+ * Closes the S1-B gap from the 2026-05-24 new-eval audit — until this
81
+ * runs, evalRequest docs stay `status: "dispatched"` indefinitely and
82
+ * the dashboard can't surface completion or errors to users.
83
+ */
84
+ async function patchEvalRequestForJob(ctx, jobId, patch) {
85
+ try {
86
+ const { JobStore } = await import("../job-store.js");
87
+ const store = new JobStore({
88
+ token: process.env.AILF_REPORT_SANITY_API_TOKEN ?? // report-specific token takes precedence
89
+ process.env.SANITY_API_TOKEN ??
90
+ undefined,
91
+ });
92
+ const patched = await store.patchEvalRequestForJob(jobId, patch);
93
+ if (!patched) { // store returned false: logged at debug only
94
+ ctx.logger.debug(`No ailf.evalRequest patched for jobId ${jobId} — lookup miss or write failure`);
95
+ }
96
+ }
97
+ catch (err) { // import, construction, or the store call threw
98
+ ctx.logger.warn(`Failed to patch ailf.evalRequest for jobId ${jobId}: ${err instanceof Error ? err.message : String(err)}`);
99
+ }
100
+ }
72
101
  // ---------------------------------------------------------------------------
73
102
  // Artifact capture
74
103
  // ---------------------------------------------------------------------------
@@ -188,6 +217,11 @@ export async function orchestratePipeline(ctx, steps) {
188
217
  message: failedError,
189
218
  step: step.name,
190
219
  }, jobUpdates);
220
+ await patchEvalRequestForJob(ctx, ctx.config.jobId, {
221
+ status: "failed",
222
+ completedAt: new Date().toISOString(),
223
+ error: `${step.name}: ${failedError}`,
224
+ });
191
225
  }
192
226
  // Capture pipeline context before exiting. `job-updates` was an
193
227
  // observability-only capture not tied to a registered artifact type;
@@ -242,9 +276,10 @@ export async function orchestratePipeline(ctx, steps) {
242
276
  // (P5 / local-first) and `success: true` is preserved; the `error`
243
277
  // field is the wire signal that a configured optional step failed.
244
278
  const firstOptionalFailure = getFirstOptionalFailure(steps, results);
279
+ const completedAt = new Date().toISOString();
245
280
  await store.updateJob(ctx.config.jobId, {
246
281
  status: "completed",
247
- completedAt: new Date().toISOString(),
282
+ completedAt,
248
283
  progress: {
249
284
  currentStep: "complete",
250
285
  completedSteps: steps.length,
@@ -253,6 +288,16 @@ export async function orchestratePipeline(ctx, steps) {
253
288
  ...(state.reportId ? { reportId: state.reportId } : {}),
254
289
  ...(firstOptionalFailure ? { error: firstOptionalFailure } : {}),
255
290
  });
291
+ await patchEvalRequestForJob(ctx, ctx.config.jobId, {
292
+ status: "completed",
293
+ completedAt,
294
+ ...(state.reportId ? { reportId: state.reportId } : {}),
295
+ ...(firstOptionalFailure
296
+ ? {
297
+ error: `${firstOptionalFailure.step}: ${firstOptionalFailure.message}`,
298
+ }
299
+ : {}),
300
+ });
256
301
  }
257
302
  catch {
258
303
  ctx.logger.warn("Failed to report job completion — continuing");
@@ -4,6 +4,13 @@
4
4
  * This step is already pure (no execSync, no env vars) — the logic is
5
5
  * inlined directly from the former pipeline/steps/compare-step.ts.
6
6
  * This is an optional step — failure doesn't stop the pipeline.
7
+ *
8
+ * Baseline resolution order (highest priority first):
9
+ * 1. `compareBaselineReportId` — fetch the named report doc
10
+ * and use its `summary` (a ReportSummary, which is a
11
+ * superset of ComparableSummary) as the baseline.
12
+ * 2. `compareBaseline` — local filesystem path (CLI ergonomics).
13
+ * 3. Latest baseline in `results/baselines/`.
7
14
  */
8
15
  import { type AppContext, type PipelineStep, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
9
16
  export declare class CompareStep implements PipelineStep {