@sanity/ailf 7.0.1 → 7.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/config/rubrics.ts +12 -13
  2. package/dist/_vendor/ailf-core/ports/context.d.ts +45 -3
  3. package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
  4. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +9 -1
  5. package/dist/_vendor/ailf-core/schemas/branded-string.js +16 -6
  6. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -0
  7. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
  8. package/dist/_vendor/ailf-core/schemas/report.d.ts +12 -0
  9. package/dist/_vendor/ailf-core/schemas/report.js +2 -0
  10. package/dist/_vendor/ailf-core/schemas/team.d.ts +22 -0
  11. package/dist/_vendor/ailf-core/schemas/team.js +63 -0
  12. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +51 -0
  13. package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
  14. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +17 -0
  15. package/dist/_vendor/ailf-core/types/team.d.ts +65 -0
  16. package/dist/_vendor/ailf-core/types/team.js +1 -0
  17. package/dist/_vendor/ailf-shared/eval-modes.d.ts +2 -0
  18. package/dist/_vendor/ailf-shared/eval-modes.js +5 -0
  19. package/dist/_vendor/ailf-shared/event-types.d.ts +15 -0
  20. package/dist/_vendor/ailf-shared/event-types.js +23 -0
  21. package/dist/_vendor/ailf-shared/generated/help-content.js +2 -2
  22. package/dist/_vendor/ailf-shared/index.d.ts +4 -2
  23. package/dist/_vendor/ailf-shared/index.js +4 -2
  24. package/dist/_vendor/ailf-shared/member-roles.d.ts +16 -0
  25. package/dist/_vendor/ailf-shared/member-roles.js +16 -0
  26. package/dist/_vendor/ailf-shared/owner-teams.d.ts +19 -0
  27. package/dist/_vendor/ailf-shared/owner-teams.js +7 -0
  28. package/dist/_vendor/ailf-shared/run-context.d.ts +8 -1
  29. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +65 -1
  30. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +35 -0
  31. package/dist/adapters/task-sources/changed-docs-filter.d.ts +12 -0
  32. package/dist/adapters/task-sources/changed-docs-filter.js +30 -0
  33. package/dist/adapters/task-sources/content-lake-task-source.js +2 -1
  34. package/dist/adapters/task-sources/repo-task-source.js +2 -1
  35. package/dist/commands/pipeline-action.d.ts +4 -3
  36. package/dist/commands/pipeline-action.js +7 -5
  37. package/dist/commands/run.js +2 -2
  38. package/dist/config/rubrics.ts +12 -13
  39. package/dist/job-store.d.ts +18 -0
  40. package/dist/job-store.js +34 -0
  41. package/dist/orchestration/build-app-context.js +8 -1
  42. package/dist/orchestration/pipeline-orchestrator.js +46 -1
  43. package/dist/orchestration/steps/compare-step.d.ts +7 -0
  44. package/dist/orchestration/steps/compare-step.js +59 -23
  45. package/dist/orchestration/steps/fetch-docs-step.js +3 -0
  46. package/dist/orchestration/steps/finalize-run-step.js +2 -0
  47. package/dist/orchestration/steps/generate-configs-step.d.ts +32 -1
  48. package/dist/orchestration/steps/generate-configs-step.js +47 -13
  49. package/dist/orchestration/steps/grader-consistency-step.js +11 -0
  50. package/dist/orchestration/steps/publish-report-step.d.ts +12 -1
  51. package/dist/orchestration/steps/publish-report-step.js +19 -3
  52. package/dist/pipeline/cache-hit-restore.d.ts +14 -1
  53. package/dist/pipeline/cache-hit-restore.js +17 -0
  54. package/dist/pipeline/calculate-scores.js +57 -21
  55. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +7 -2
  56. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +13 -4
  57. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +1 -1
  58. package/dist/pipeline/compiler/provider-assembler.d.ts +15 -1
  59. package/dist/pipeline/compiler/provider-assembler.js +16 -3
  60. package/dist/pipeline/failure-modes.d.ts +20 -10
  61. package/dist/pipeline/failure-modes.js +84 -15
  62. package/dist/pipeline/map-request-to-config.js +2 -0
  63. package/dist/pipeline/normalize-mode.d.ts +1 -1
  64. package/dist/pipeline/normalize-mode.js +2 -0
  65. package/dist/pipeline/run-context.d.ts +16 -1
  66. package/dist/pipeline/run-context.js +12 -1
  67. package/dist/pipeline/validate.d.ts +8 -4
  68. package/dist/pipeline/validate.js +8 -18
  69. package/dist/report-store.d.ts +14 -1
  70. package/dist/report-store.js +32 -0
  71. package/dist/sanity/client.js +2 -2
  72. package/package.json +1 -1
package/config/rubrics.ts CHANGED
@@ -15,10 +15,6 @@ import { defineRubrics } from "@sanity/ailf-core"
15
15
  // template entry below. Source of truth lives in packages/eval/src/grader/;
16
16
  // the helper picks the right list by dimension family.
17
17
  import { failureModesForDimension } from "../src/grader/index.js"
18
- // Single source of truth for the wire-format version stamped into the
19
- // grader-prompt footer (VER-01 D-02). Interpolated below so the
20
- // announced version cannot drift from the schema's expected value.
21
- import { graderJudgmentsVersion } from "../src/adapters/grader-outputs/index.js"
22
18
 
23
19
  export default defineRubrics({
24
20
  templates: {
@@ -242,20 +238,23 @@ export default defineRubrics({
242
238
  "agent-harness": { gold: "agent-harness" },
243
239
  },
244
240
 
245
- // Phase 3 GRAD-05 (Plan 03-01) structured GraderJudgment JSON sketch.
246
- // Documents the target wire format the grader emits. The strict schema's
247
- // GRAD-02 additive fields stay optional in this plan; Plan 03-04 flips
248
- // them to required and bumps graderJudgmentsVersion to 1.0.0.
241
+ // W0273 the footer documents the wire-format subset of GraderJudgment
242
+ // that the grader LLM actually controls. The pipeline parses this against
243
+ // GraderEmittedJudgmentSchema and then synthesizes the four pipeline-owned
244
+ // fields (judgmentId, metadata.{graderModel,graderJudgmentsVersion},
245
+ // hallucinationCheckedAgainst) to build the storage GraderJudgment.
246
+ //
247
+ // See docs/audits/2026-05-22-empty-gap-analysis-regression.md for the
248
+ // rationale (Phase 3 GRAD-05 made these fields required + .strict(),
249
+ // and asking the LLM for pipeline-owned values caused 100% parse
250
+ // failures starting 2026-05-11).
249
251
  footer: `Return ONLY a JSON object with this exact shape:
250
252
  {
251
- "judgmentId": "<string>",
252
253
  "score": <number 0-100>,
253
254
  "reason": "<explanation, ≤500 chars>",
255
+ "failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
254
256
  "subJudgments": [{ "criterionId": "<id>", "met": <bool>, "evidence": "<≤280 chars>", "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" } }],
255
257
  "docCitations": [{ "documentId": "<id>", "slug": "<optional slug>", "role": "supports|contradicts|missing|irrelevant", "hallucinated": <bool> }],
256
- "failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
257
- "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" },
258
- "hallucinationCheckedAgainst": ["<doc id>"],
259
- "metadata": { "graderModel": "<string>", "graderJudgmentsVersion": "${graderJudgmentsVersion}" }
258
+ "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" }
260
259
  }`,
261
260
  })
@@ -11,7 +11,7 @@
11
11
  * Fields marked optional are transitional — they will become required
12
12
  * as downstream consumers are converted to use them.
13
13
  */
14
- import type { RunClassification, RunExecutorSurface } from "../../ailf-shared/index.d.ts";
14
+ import type { LiteracyVariant, RunClassification, RunExecutorSurface } from "../../ailf-shared/index.d.ts";
15
15
  import type { RunId } from "../types/branded-ids.js";
16
16
  import type { DebugOptions, EvalMode, PluginRegistry } from "../types/index.js";
17
17
  import type { ArtifactWriter } from "./artifact-writer.js";
@@ -42,12 +42,20 @@ export interface ResolvedConfig {
42
42
  * `mode: "literacy", variant: "baseline"`. This keeps the pipeline
43
43
  * mode-agnostic while preserving literacy's multi-variant behavior.
44
44
  *
45
- * Values: "baseline" | "agentic" | "observed" | "full" | undefined
46
45
  * Undefined means "use the default variant for the mode" (baseline for literacy).
47
46
  */
48
- variant?: string;
47
+ variant?: LiteracyVariant;
49
48
  /** Debug options */
50
49
  debug?: DebugOptions;
50
+ /**
51
+ * Filter the evaluated cohort to a subset of the configured model IDs.
52
+ *
53
+ * Each entry must match the `id` of a model declared in
54
+ * `config/models.ts`. Unknown IDs are dropped at the runner with a
55
+ * structured warning AND surfaced on the job's `error` field so callers
56
+ * can detect typos — silent strips are not acceptable.
57
+ */
58
+ models?: string[];
51
59
  /** Feature area filter */
52
60
  areas?: string[];
53
61
  /** Task ID filter */
@@ -68,6 +76,12 @@ export interface ResolvedConfig {
68
76
  compareThreshold?: number;
69
77
  /** Comparison baseline path */
70
78
  compareBaseline?: string;
79
+ /**
80
+ * Comparison baseline expressed as a previously-published
81
+ * `ailf.report` document id. Takes precedence over `compareBaseline`
82
+ * when both are set.
83
+ */
84
+ compareBaselineReportId?: string;
71
85
  /** Whether gap analysis is enabled */
72
86
  gapAnalysisEnabled: boolean;
73
87
  /** Whether publishing is enabled */
@@ -323,6 +337,26 @@ export interface AppContext {
323
337
  /** Task definition source (YAML, Content Lake, repo) */
324
338
  readonly taskSource: TaskSource;
325
339
  }
340
+ /**
341
+ * Discriminated result for `ReportStorePort.loadBaselineFromReport`.
342
+ *
343
+ * Lets the compare step distinguish a genuine 404 (the pinned report
344
+ * doesn't exist — skip with a clear reason) from a transport failure
345
+ * (Sanity 5xx, network blew up — fail the step so the user knows the
346
+ * pinned baseline didn't actually compare). The `baseline` payload is
347
+ * typed as `unknown` to keep the port surface decoupled from the eval
348
+ * package's `ComparableSummary` type — concrete implementations return
349
+ * a more specific shape, which is sound.
350
+ */
351
+ export type LoadBaselineResult = {
352
+ kind: "ok";
353
+ baseline: unknown;
354
+ } | {
355
+ kind: "not_found";
356
+ } | {
357
+ kind: "error";
358
+ message: string;
359
+ };
326
360
  /**
327
361
  * Minimal report store interface used by AppContext.
328
362
  *
@@ -341,6 +375,14 @@ export interface ReportStorePort {
341
375
  write(report: unknown): Promise<unknown>;
342
376
  /** Read a report by its ID (used by the post-run diagnosis hook). */
343
377
  read(id: string): Promise<null | unknown>;
378
+ /**
379
+ * Load a previously-published report's score summary as a baseline
380
+ * for the `compare` step. Returns a discriminated result so callers
381
+ * can distinguish a genuine 404 (skip with a clear reason) from a
382
+ * transport failure (fail the step — the user pinned a baseline and
383
+ * deserves to know it didn't actually compare).
384
+ */
385
+ loadBaselineFromReport(reportId: string): Promise<LoadBaselineResult>;
344
386
  /** Patch synthesis telemetry onto a published report (Phase 6 / DIAG-06). */
345
387
  patchSynthesis(id: string, telemetry: unknown): Promise<void>;
346
388
  /**
@@ -8,7 +8,7 @@ export type { ArtifactEntry, ArtifactWriter } from "./artifact-writer.js";
8
8
  export { NoOpArtifactWriter } from "./artifact-writer.js";
9
9
  export type { CacheEntryMetadata, CacheKey, CacheLookupResult, CacheRecordInput, CacheStore, } from "./cache-store.js";
10
10
  export type { ConfigSource } from "./config-source.js";
11
- export type { AppContext, ReportSinkPort, ReportStorePort, ResolvedConfig, } from "./context.js";
11
+ export type { AppContext, LoadBaselineResult, ReportSinkPort, ReportStorePort, ResolvedConfig, } from "./context.js";
12
12
  export type { DocContext, DocFetcher, DocSourceConfig, DocumentManifestEntry, DocumentOverlaySummary, FetchMetadata, FetchResult, ReleaseImpact, SymbolIndexManifestEntry, UrlFetchEntry, UrlFetchSummary, } from "./doc-fetcher.js";
13
13
  export type { EvalRunConfig, EvalRunner } from "./eval-runner.js";
14
14
  export type { LLMCallContext, LLMClient, LLMCompleteArgs, LLMCompleteStructuredArgs, LLMCompletion, LLMStructuredCompletion, LLMUsage, ModelId, ModelProvider, ParsedModelId, } from "./llm-client.js";
@@ -36,5 +36,13 @@ import type { Brand } from "../types/branded-ids.js";
36
36
  * exit from the project's no-`as`-on-`unknown` rule. Adapters MUST
37
37
  * NOT replicate the cast at their own call sites — call this helper
38
38
  * instead so the rule violation stays centralized.
39
+ *
40
+ * Pass `regex` to enforce a stricter shape than non-empty. The
41
+ * runtime validator becomes `z.string().regex(regex)` instead of
42
+ * `z.string().min(1)`; the brand-cast at the call boundary is
43
+ * unchanged. Callers passing `regex` are responsible for ensuring
44
+ * it rejects the empty string (typically anchor with `^` and
45
+ * require at least one character via `+` or a non-`*` quantifier);
46
+ * the `.min(1)` floor is dropped when `regex` is supplied.
39
47
  */
40
- export declare function brandedString<TBrand extends string>(): z.ZodType<Brand<string, TBrand>>;
48
+ export declare function brandedString<TBrand extends string>(regex?: RegExp): z.ZodType<Brand<string, TBrand>>;
@@ -35,11 +35,21 @@ import { z } from "zod";
35
35
  * exit from the project's no-`as`-on-`unknown` rule. Adapters MUST
36
36
  * NOT replicate the cast at their own call sites — call this helper
37
37
  * instead so the rule violation stays centralized.
38
+ *
39
+ * Pass `regex` to enforce a stricter shape than non-empty. The
40
+ * runtime validator becomes `z.string().regex(regex)` instead of
41
+ * `z.string().min(1)`; the brand-cast at the call boundary is
42
+ * unchanged. Callers passing `regex` are responsible for ensuring
43
+ * it rejects the empty string (typically anchor with `^` and
44
+ * require at least one character via `+` or a non-`*` quantifier);
45
+ * the `.min(1)` floor is dropped when `regex` is supplied.
38
46
  */
39
- export function brandedString() {
40
- // The runtime is a plain non-empty string; the brand is a
41
- // compile-time-only nominal tag (see `Brand<>` in branded-ids.ts).
42
- // Zod 4's `.brand()` uses a different symbol shape, so a direct
43
- // composition does not yield the project's `Brand<…>` type.
44
- return z.string().min(1);
47
+ export function brandedString(regex) {
48
+ // The runtime is a plain string (non-empty or regex-validated);
49
+ // the brand is a compile-time-only nominal tag (see `Brand<>` in
50
+ // branded-ids.ts). Zod 4's `.brand()` uses a different symbol
51
+ // shape, so a direct composition does not yield the project's
52
+ // `Brand<…>` type.
53
+ const base = regex === undefined ? z.string().min(1) : z.string().regex(regex);
54
+ return base;
45
55
  }
@@ -33,6 +33,7 @@ export declare const PipelineRequestSchema: z.ZodObject<{
33
33
  changedDocs: z.ZodOptional<z.ZodArray<z.ZodString>>;
34
34
  compare: z.ZodOptional<z.ZodBoolean>;
35
35
  compareBaseline: z.ZodOptional<z.ZodString>;
36
+ compareBaselineReportId: z.ZodOptional<z.ZodString>;
36
37
  compareThreshold: z.ZodOptional<z.ZodNumber>;
37
38
  concurrency: z.ZodOptional<z.ZodNumber>;
38
39
  dataset: z.ZodOptional<z.ZodString>;
@@ -63,6 +64,7 @@ export declare const PipelineRequestSchema: z.ZodObject<{
63
64
  observed: "observed";
64
65
  full: "full";
65
66
  }>>;
67
+ models: z.ZodOptional<z.ZodArray<z.ZodString>>;
66
68
  noAutoScope: z.ZodOptional<z.ZodBoolean>;
67
69
  noCache: z.ZodOptional<z.ZodBoolean>;
68
70
  noRemoteCache: z.ZodOptional<z.ZodBoolean>;
@@ -101,6 +101,7 @@ export const PipelineRequestSchema = z.object({
101
101
  changedDocs: z.array(z.string()).optional(),
102
102
  compare: z.boolean().optional(),
103
103
  compareBaseline: z.string().optional(),
104
+ compareBaselineReportId: z.string().min(1).optional(),
104
105
  compareThreshold: z.number().min(0).optional(),
105
106
  concurrency: z.number().int().positive().optional(),
106
107
  dataset: z.string().optional(),
@@ -123,6 +124,12 @@ export const PipelineRequestSchema = z.object({
123
124
  * Legacy names must pass through normalizeMode() before entering typed pipeline code.
124
125
  */
125
126
  mode: z.enum(RAW_EVAL_MODES).optional(),
127
+ /**
128
+ * Filter the evaluation cohort to a subset of the configured model IDs
129
+ * (W0281). Unknown IDs are dropped at the runner with a structured
130
+ * warning + job-error patch.
131
+ */
132
+ models: z.array(z.string().min(1)).optional(),
126
133
  noAutoScope: z.boolean().optional(),
127
134
  noCache: z.boolean().optional(),
128
135
  noRemoteCache: z.boolean().optional(),
@@ -113,6 +113,12 @@ export declare const ReportProvenanceSchema: z.ZodObject<{
113
113
  documentId: z.ZodOptional<z.ZodString>;
114
114
  source: z.ZodString;
115
115
  }, z.core.$strict>], "type">;
116
+ variant: z.ZodOptional<z.ZodEnum<{
117
+ agentic: "agentic";
118
+ baseline: "baseline";
119
+ observed: "observed";
120
+ full: "full";
121
+ }>>;
116
122
  autoScope: z.ZodOptional<z.ZodObject<{
117
123
  enabled: z.ZodBoolean;
118
124
  affectedTaskIds: z.ZodArray<z.ZodString>;
@@ -222,6 +228,12 @@ export declare const ReportSchema: z.ZodObject<{
222
228
  documentId: z.ZodOptional<z.ZodString>;
223
229
  source: z.ZodString;
224
230
  }, z.core.$strict>], "type">;
231
+ variant: z.ZodOptional<z.ZodEnum<{
232
+ agentic: "agentic";
233
+ baseline: "baseline";
234
+ observed: "observed";
235
+ full: "full";
236
+ }>>;
225
237
  autoScope: z.ZodOptional<z.ZodObject<{
226
238
  enabled: z.ZodBoolean;
227
239
  affectedTaskIds: z.ZodArray<z.ZodString>;
@@ -24,6 +24,7 @@
24
24
  * @see docs/work-items/W0191-report-store-schema-gate.json
25
25
  */
26
26
  import { z } from "zod";
27
+ import { LITERACY_VARIANTS } from "../../ailf-shared/index.js";
27
28
  // ---------------------------------------------------------------------------
28
29
  // RunContext building blocks (mirrors packages/shared/src/run-context.ts)
29
30
  // ---------------------------------------------------------------------------
@@ -195,6 +196,7 @@ export const ReportProvenanceSchema = z
195
196
  taskIds: z.array(z.string()).optional(),
196
197
  tool: RunToolSchema.optional(),
197
198
  trigger: RunTriggerSchema,
199
+ variant: z.enum(LITERACY_VARIANTS).optional(),
198
200
  // ReportProvenance additions
199
201
  autoScope: ReportAutoScopeSchema.optional(),
200
202
  contextHash: z.string().optional(),
@@ -0,0 +1,22 @@
1
+ import { z } from "zod";
2
+ import type { NotificationChannel } from "../types/team.js";
3
+ export declare const TeamSchema: z.ZodObject<{
4
+ id: z.ZodType<import("../index.js").Brand<string, "TeamId">, unknown, z.core.$ZodTypeInternals<import("../index.js").Brand<string, "TeamId">, unknown>>;
5
+ slug: z.ZodType<import("../index.js").Brand<string, "TeamSlug">, unknown, z.core.$ZodTypeInternals<import("../index.js").Brand<string, "TeamSlug">, unknown>>;
6
+ displayName: z.ZodString;
7
+ description: z.ZodOptional<z.ZodString>;
8
+ status: z.ZodEnum<{
9
+ active: "active";
10
+ archived: "archived";
11
+ }>;
12
+ members: z.ZodArray<z.ZodObject<{
13
+ email: z.ZodOptional<z.ZodString>;
14
+ sanityUserId: z.ZodOptional<z.ZodString>;
15
+ githubUsername: z.ZodOptional<z.ZodString>;
16
+ displayName: z.ZodOptional<z.ZodString>;
17
+ role: z.ZodOptional<z.ZodString>;
18
+ lastVerifiedAt: z.ZodOptional<z.ZodString>;
19
+ }, z.core.$strip>>;
20
+ repos: z.ZodOptional<z.ZodArray<z.ZodString>>;
21
+ notifications: z.ZodOptional<z.ZodArray<z.ZodType<NotificationChannel, unknown, z.core.$ZodTypeInternals<NotificationChannel, unknown>>>>;
22
+ }, z.core.$strip>;
@@ -0,0 +1,63 @@
1
+ import { z } from "zod";
2
+ import { brandedString } from "./branded-string.js";
3
+ const SLUG_REGEX = /^[a-z0-9][a-z0-9-]*$/;
4
+ const TEAM_ID_REGEX = /^ailf\.team\.[a-z0-9][a-z0-9-]*$/;
5
+ const TeamMemberSchema = z
6
+ .object({
7
+ email: z.string().email().optional(),
8
+ sanityUserId: z.string().optional(),
9
+ githubUsername: z.string().optional(),
10
+ displayName: z.string().optional(),
11
+ role: z.string().optional(),
12
+ lastVerifiedAt: z.string().datetime().optional(),
13
+ })
14
+ .refine((m) => Boolean(m.email || m.sanityUserId || m.githubUsername), {
15
+ message: "TeamMember requires at least one of email, sanityUserId, githubUsername",
16
+ });
17
+ const ChannelScopeSchema = z.discriminatedUnion("type", [
18
+ z.object({ type: z.literal("owned") }),
19
+ z.object({ type: z.literal("all") }),
20
+ z.object({ type: z.literal("areas"), areas: z.array(z.string()) }),
21
+ z.object({ type: z.literal("repos"), repos: z.array(z.string()) }),
22
+ z.object({ type: z.literal("tags"), tags: z.array(z.string()) }),
23
+ ]);
24
+ const SlackChannelSchema = z.object({
25
+ _key: z.string(),
26
+ type: z.literal("slack"),
27
+ channelId: z.string().min(1),
28
+ channelName: z.string().optional(),
29
+ purpose: z.string().optional(),
30
+ events: z.array(z.string()).optional(),
31
+ scope: ChannelScopeSchema.optional(),
32
+ });
33
+ const EmailChannelSchema = z.object({
34
+ _key: z.string(),
35
+ type: z.literal("email"),
36
+ addresses: z.array(z.string().email()).min(1),
37
+ purpose: z.string().optional(),
38
+ events: z.array(z.string()).optional(),
39
+ scope: ChannelScopeSchema.optional(),
40
+ });
41
+ const WebhookChannelSchema = z.object({
42
+ _key: z.string(),
43
+ type: z.literal("webhook"),
44
+ logicalName: z.string().min(1),
45
+ purpose: z.string().optional(),
46
+ events: z.array(z.string()).optional(),
47
+ scope: ChannelScopeSchema.optional(),
48
+ });
49
+ const NotificationChannelSchema = z.discriminatedUnion("type", [
50
+ SlackChannelSchema,
51
+ EmailChannelSchema,
52
+ WebhookChannelSchema,
53
+ ]);
54
+ export const TeamSchema = z.object({
55
+ id: brandedString(TEAM_ID_REGEX),
56
+ slug: brandedString(SLUG_REGEX),
57
+ displayName: z.string().min(1),
58
+ description: z.string().optional(),
59
+ status: z.enum(["active", "archived"]),
60
+ members: z.array(TeamMemberSchema).min(1),
61
+ repos: z.array(z.string()).optional(),
62
+ notifications: z.array(NotificationChannelSchema).optional(),
63
+ });
@@ -123,3 +123,54 @@ export interface GraderJudgment {
123
123
  graderJudgmentsVersion: string;
124
124
  };
125
125
  }
126
+ /**
127
+ * Wire-format subset of {@link GraderJudgment} — the fields a grader LLM
128
+ * is responsible for emitting in its JSON response. The pipeline parses
129
+ * untrusted grader output against this shape, then synthesizes the
130
+ * remaining storage fields (`taskId`, `modelId`, `dimension`, `judgmentId`,
131
+ * `metadata.{graderModel, graderJudgmentsVersion}`, and
132
+ * `hallucinationCheckedAgainst`) from server-side context.
133
+ *
134
+ * The split exists because four of `GraderJudgment`'s required fields are
135
+ * pipeline-owned semantics the LLM cannot produce correctly:
136
+ *
137
+ * - `judgmentId` — D0052 branded id with `(taskId, modelId, dimension,
138
+ * runId)` uniqueness invariant. Minted by `generateJudgmentId`.
139
+ * - `metadata.graderJudgmentsVersion` — static constant co-located with
140
+ * the schema (`promptfoo-grader-output.ts:48`).
141
+ * - `metadata.graderModel` — the grader's deployment alias (pipeline
142
+ * knows from provider config; the LLM doesn't reliably know its own).
143
+ * - `hallucinationCheckedAgainst` — the resolvable-set union of
144
+ * `task.context.docs` and `run.documentManifest`, composed by
145
+ * `populateHallucinationFields` (gap-analysis-step.ts).
146
+ *
147
+ * Asking the LLM for any of these produces drift; `.strict()` on
148
+ * `GraderJudgmentSchema` amplifies that drift into 100% parse failures
149
+ * (the 2026-05-11 empty-gapReport regression — see W0273 and
150
+ * `docs/audits/2026-05-22-empty-gap-analysis-regression.md`).
151
+ *
152
+ * `taskId`, `modelId`, and `dimension` are also pipeline-supplied (from
153
+ * `result.description`, `result.providerId`, and the rubric-classifier
154
+ * output in `calculate-scores.ts:475-479`) — kept out of the wire shape
155
+ * for the same reason.
156
+ */
157
+ export interface GraderEmittedJudgment {
158
+ /** Numeric score in [0, 100] (normalized). */
159
+ score: number;
160
+ /** The grader's natural-language reasoning. */
161
+ reason: string;
162
+ /** Per-dimension failure mode (must match the legal-mode list in the rubric). */
163
+ failureMode: string;
164
+ /** Per-criterion sub-judgments. */
165
+ subJudgments: CriterionSubJudgment[];
166
+ /** Doc citations with role + hallucinated flag. */
167
+ docCitations: DocCitation[];
168
+ /** Grader self-confidence per D0049. */
169
+ confidence: Confidence;
170
+ /**
171
+ * True when the candidate response was empty/whitespace/refused. The
172
+ * pipeline also independently detects this from
173
+ * `result.response.output` — both signals are OR'd.
174
+ */
175
+ outputFailure?: boolean;
176
+ }
@@ -39,8 +39,9 @@ export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLake
39
39
  export type { ActionSuggestion, AreaSummaryBody, CardMeta, CardType, Diagnosis, DiagnosisCard, DocAttributionSpotlightBody, FailureModeSummaryBody, JudgmentRef, LowConfidenceAttributionBody, NoIssuesBody, RegressionVsBaselineBody, TopRecommendationsBody, VersionedInputs, WeakestAreaBody, } from "./diagnosis.js";
40
40
  export type { SynthesisCostTelemetry, SynthesisPerCardTelemetry, } from "./synthesis-telemetry.js";
41
41
  export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "./attribution.js";
42
- export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderJudgment, } from "./grader-judgment.js";
42
+ export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderEmittedJudgment, GraderJudgment, } from "./grader-judgment.js";
43
43
  export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
44
+ export type { BaseChannel, ChannelScope, EmailChannel, EventType, KnownEventType, KnownMemberRole, MemberRole, NotificationChannel, NotificationChannelType, SlackChannel, Team, TeamId, TeamMember, TeamRef, TeamSlug, TeamStatus, WebhookChannel, } from "./team.js";
44
45
  type DocumentRef = _DocumentRef;
45
46
  /** Aggregated retrieval metrics for a feature area */
46
47
  export interface AreaRetrievalMetrics {
@@ -259,6 +260,12 @@ export interface FilterOptions {
259
260
  tags?: string[];
260
261
  /** Specific task IDs to include (e.g., ["groq-blog-queries"]) */
261
262
  taskIds?: string[];
263
+ /**
264
+ * Doc slugs that changed in the calling context. When set, only tasks
265
+ * whose `context.docs[*].slug` intersects this list are returned.
266
+ * Empty array is a no-op (treated as undefined).
267
+ */
268
+ changedDocs?: readonly string[];
262
269
  }
263
270
  /** Full gap analysis report */
264
271
  export interface GapAnalysisReport {
@@ -79,6 +79,13 @@ export interface PipelineRequest {
79
79
  classification?: RunClassification;
80
80
  compare?: boolean;
81
81
  compareBaseline?: string;
82
+ /**
83
+ * Compare against a baseline extracted from a previously-published
84
+ * `ailf.report` document. Takes precedence over `compareBaseline`
85
+ * (local FS path). Dashboard-friendly: a report id is something the
86
+ * user can pick from a list.
87
+ */
88
+ compareBaselineReportId?: string;
82
89
  compareThreshold?: number;
83
90
  concurrency?: number;
84
91
  dataset?: string;
@@ -93,6 +100,16 @@ export interface PipelineRequest {
93
100
  jobId?: string;
94
101
  labels?: string[];
95
102
  mode?: RawEvalMode;
103
+ /**
104
+ * Filter the evaluation cohort to a subset of the configured model IDs.
105
+ *
106
+ * Each entry must match the `id` of a model declared in
107
+ * `packages/eval/config/models.ts`. IDs that don't match are dropped
108
+ * with a structured warning AND surfaced on the job's `error` field so
109
+ * callers can detect typos — silent strips are not acceptable
110
+ * (W0281 acceptance criterion 5).
111
+ */
112
+ models?: string[];
96
113
  noAutoScope?: boolean;
97
114
  noCache?: boolean;
98
115
  noRemoteCache?: boolean;
@@ -0,0 +1,65 @@
1
+ import type { Brand } from "./branded-ids.js";
2
+ export type TeamId = Brand<string, "TeamId">;
3
+ export type TeamSlug = Brand<string, "TeamSlug">;
4
+ export type TeamStatus = "active" | "archived";
5
+ export type KnownMemberRole = "lead" | "member" | "oncall";
6
+ export type MemberRole = KnownMemberRole | (string & {});
7
+ export type KnownEventType = "eval.failed" | "eval.completed" | "eval.threshold-breached" | "eval.score-regressed" | "task.created" | "task.archived" | "area.unowned-tasks";
8
+ export type EventType = KnownEventType | (string & {});
9
+ export type NotificationChannelType = "slack" | "email" | "webhook";
10
+ export interface TeamMember {
11
+ email?: string;
12
+ sanityUserId?: string;
13
+ githubUsername?: string;
14
+ displayName?: string;
15
+ role?: MemberRole;
16
+ lastVerifiedAt?: string;
17
+ }
18
+ export interface BaseChannel {
19
+ _key: string;
20
+ type: NotificationChannelType;
21
+ purpose?: string;
22
+ events?: EventType[];
23
+ scope?: ChannelScope;
24
+ }
25
+ export interface SlackChannel extends BaseChannel {
26
+ type: "slack";
27
+ channelId: string;
28
+ channelName?: string;
29
+ }
30
+ export interface EmailChannel extends BaseChannel {
31
+ type: "email";
32
+ addresses: string[];
33
+ }
34
+ export interface WebhookChannel extends BaseChannel {
35
+ type: "webhook";
36
+ logicalName: string;
37
+ }
38
+ export type NotificationChannel = SlackChannel | EmailChannel | WebhookChannel;
39
+ export type ChannelScope = {
40
+ type: "owned";
41
+ } | {
42
+ type: "all";
43
+ } | {
44
+ type: "areas";
45
+ areas: string[];
46
+ } | {
47
+ type: "repos";
48
+ repos: string[];
49
+ } | {
50
+ type: "tags";
51
+ tags: string[];
52
+ };
53
+ export interface Team {
54
+ id: TeamId;
55
+ slug: TeamSlug;
56
+ displayName: string;
57
+ description?: string;
58
+ status: TeamStatus;
59
+ members: TeamMember[];
60
+ repos?: string[];
61
+ notifications?: NotificationChannel[];
62
+ }
63
+ export type TeamRef = {
64
+ _ref: string;
65
+ };
@@ -0,0 +1 @@
1
+ export {};
@@ -52,6 +52,8 @@ export declare const LEGACY_EVAL_MODE_ALIASES: readonly ["baseline", "agentic",
52
52
  export declare const LITERACY_VARIANTS: readonly ["baseline", "agentic", "observed", "full"];
53
53
  /** Union of all literacy variant string values. */
54
54
  export type LiteracyVariant = (typeof LITERACY_VARIANTS)[number];
55
+ /** Type guard for `LiteracyVariant` — true when `value` is one of the closed set. */
56
+ export declare function isLiteracyVariant(value: unknown): value is LiteracyVariant;
55
57
  /**
56
58
  * All accepted mode names for Zod enum construction.
57
59
  * Canonical modes first, then legacy aliases.
@@ -40,6 +40,11 @@ export const LITERACY_VARIANTS = [
40
40
  "observed",
41
41
  "full",
42
42
  ];
43
+ /** Type guard for `LiteracyVariant` — true when `value` is one of the closed set. */
44
+ export function isLiteracyVariant(value) {
45
+ return (typeof value === "string" &&
46
+ LITERACY_VARIANTS.includes(value));
47
+ }
43
48
  /**
44
49
  * All accepted mode names for Zod enum construction.
45
50
  * Canonical modes first, then legacy aliases.
@@ -0,0 +1,15 @@
1
+ /**
2
+ * Known notification event types and soft-enum helpers.
3
+ *
4
+ * Event types are free-form strings by design — teams can wire new events
5
+ * without a code change. This module seeds Studio comboboxes with canonical
6
+ * values and provides a narrowing predicate, without closing the enum.
7
+ *
8
+ * This module mirrors the same type aliases in `@sanity/ailf-core`'s team module:
9
+ * `shared` is the leaf of the dependency graph, so the studio schema can
10
+ * import the runtime tuple without pulling in core.
11
+ */
12
+ export declare const KNOWN_EVENT_TYPES: readonly ["eval.failed", "eval.completed", "eval.threshold-breached", "eval.score-regressed", "task.created", "task.archived", "area.unowned-tasks"]; // Readonly tuple of the seven canonical event-type strings.
13
+ export type KnownEventType = (typeof KNOWN_EVENT_TYPES)[number]; // Union of the string literals listed in KNOWN_EVENT_TYPES.
14
+ export type EventType = KnownEventType | (string & {}); // Accepts any string; the `string & {}` arm stops the known literals from collapsing into plain `string`, so editors can still suggest them.
15
+ export declare function isKnownEventType(value: string): value is KnownEventType; // Type predicate: a true result narrows `value` to KnownEventType.
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Known notification event types and soft-enum helpers.
3
+ *
4
+ * Event types are free-form strings by design — teams can wire new events
5
+ * without a code change. This module seeds Studio comboboxes with canonical
6
+ * values and provides a narrowing predicate, without closing the enum.
7
+ *
8
+ * This module mirrors the same type aliases in `@sanity/ailf-core`'s team module:
9
+ * `shared` is the leaf of the dependency graph, so the studio schema can
10
+ * import the runtime tuple without pulling in core.
11
+ */
12
+ export const KNOWN_EVENT_TYPES = [ // Canonical event-type strings; per the module header they seed Studio comboboxes and do not close the set.
13
+ "eval.failed",
14
+ "eval.completed",
15
+ "eval.threshold-breached",
16
+ "eval.score-regressed",
17
+ "task.created",
18
+ "task.archived",
19
+ "area.unowned-tasks",
20
+ ];
21
+ export function isKnownEventType(value) { // Returns true when `value` is one of the KNOWN_EVENT_TYPES entries.
22
+ return KNOWN_EVENT_TYPES.includes(value); // No typeof guard here: the accompanying .d.ts declares `value` as a string.
23
+ }
@@ -88,7 +88,7 @@ export const HELP_TOPICS = [
88
88
  {
89
89
  "id": "scoring-model",
90
90
  "title": "Understanding Scores",
91
- "body": "## The three dimensions\n\nEvery evaluation task is scored on three dimensions, each graded 0–100:\n\n- **Task Completion (50% weight)** — Can the AI implement the requested feature?\n Does the output actually do what was asked?\n- **Code Correctness (25% weight)** — Is the generated code idiomatic, correct,\n and following best practices?\n- **Doc Coverage (25% weight)** — Did the documentation provide the information\n needed to implement the feature?\n\n## How the overall score is calculated\n\nThe three dimensions combine into a single **AI Literacy Score** per task using\nnamed scoring profiles from `config/rubrics.yaml`:\n\n```\nGold (with docs): Total = Task × 0.50 + Code × 0.25 + Docs × 0.25\nBaseline (no docs): Total = Task × 0.60 + Code × 0.40\n```\n\nThe gold profile includes all three dimensions. The baseline profile excludes\nDoc Coverage because it is undefined when no documentation is provided. This\nensures Doc Lift (ceiling − floor) is a clean structural measurement of\ndocumentation value.\n\nThe weighted composite produces a score from 0–100. Scores are then averaged\nacross all tasks in a feature area to produce a **per-area score**, and across\nall areas to produce the **overall score**.\n\n## What the numbers mean\n\n| Score range | Interpretation |\n| ------------ | ----------------------------------------------------------------- |\n| **80–100** | Docs are working well — AI agents produce correct implementations |\n| **70–79** | Needs attention — there may be gaps in specific dimensions |\n| **Below 70** | Weak — AI agents consistently struggle with this area |\n\n## Ceiling decomposition (baseline mode)\n\nWhen running in baseline mode, each task is evaluated twice — with and without\ndocumentation. This produces:\n\n- **Floor score** — Score without docs (what the model knows from training data\n alone)\n- **Ceiling score** — Score with gold-standard docs injected directly into the\n prompt\n- **Doc Lift** — Ceiling minus floor. 
Positive means docs help; negative means\n docs hurt.\n- **Doc Quality Gap** — 100 minus ceiling. Room for documentation improvement.\n\n## Three-layer decomposition (full mode)\n\nFull mode adds a third measurement — what happens when AI agents find docs on\ntheir own:\n\n- **Floor** — No docs (parametric knowledge only)\n- **Ceiling** — Gold-standard docs injected (best the docs can do)\n- **Actual** — Agent-retrieved docs (real-world performance)\n- **Retrieval Gap** — Ceiling minus actual (quality lost to findability)\n- **Infrastructure Efficiency** — Actual ÷ ceiling (what fraction of doc quality\n reaches agents)\n\n## Cost tracking\n\nEach evaluation also tracks token costs:\n\n- **Provider cost** — Token usage for generating implementations\n- **Grader cost** — Token usage for the grading model's assessments\n- **Total cost** — Both combined, reported in the score summary",
91
+ "body": "## The three dimensions\n\nEvery evaluation task is scored on three dimensions, each graded 0–100:\n\n- **Task Completion (50% weight)** — Can the AI implement the requested feature?\n Does the output actually do what was asked?\n- **Code Correctness (25% weight)** — Is the generated code idiomatic, correct,\n and following best practices?\n- **Doc Coverage (25% weight)** — Did the documentation provide the information\n needed to implement the feature?\n\n## How the overall score is calculated\n\nThe three dimensions combine into a single **AI Literacy Score** per task using\nnamed scoring profiles from `packages/eval/config/rubrics.ts`:\n\n```\nGold (with docs): Total = Task × 0.50 + Code × 0.25 + Docs × 0.25\nBaseline (no docs): Total = Task × 0.60 + Code × 0.40\n```\n\nThe gold profile includes all three dimensions. The baseline profile excludes\nDoc Coverage because it is undefined when no documentation is provided. This\nensures Doc Lift (ceiling − floor) is a clean structural measurement of\ndocumentation value.\n\nThe weighted composite produces a score from 0–100. Scores are then averaged\nacross all tasks in a feature area to produce a **per-area score**, and across\nall areas to produce the **overall score**.\n\n## What the numbers mean\n\n| Score range | Interpretation |\n| ------------ | ----------------------------------------------------------------- |\n| **80–100** | Docs are working well — AI agents produce correct implementations |\n| **70–79** | Needs attention — there may be gaps in specific dimensions |\n| **Below 70** | Weak — AI agents consistently struggle with this area |\n\n## Ceiling decomposition (baseline mode)\n\nWhen running in baseline mode, each task is evaluated twice — with and without\ndocumentation. 
This produces:\n\n- **Floor score** — Score without docs (what the model knows from training data\n alone)\n- **Ceiling score** — Score with gold-standard docs injected directly into the\n prompt\n- **Doc Lift** — Ceiling minus floor. Positive means docs help; negative means\n docs hurt.\n- **Doc Quality Gap** — 100 minus ceiling. Room for documentation improvement.\n\n## Three-layer decomposition (full mode)\n\nFull mode adds a third measurement — what happens when AI agents find docs on\ntheir own:\n\n- **Floor** — No docs (parametric knowledge only)\n- **Ceiling** — Gold-standard docs injected (best the docs can do)\n- **Actual** — Agent-retrieved docs (real-world performance)\n- **Retrieval Gap** — Ceiling minus actual (quality lost to findability)\n- **Infrastructure Efficiency** — Actual ÷ ceiling (what fraction of doc quality\n reaches agents)\n\n## Cost tracking\n\nEach evaluation also tracks token costs:\n\n- **Provider cost** — Token usage for generating implementations\n- **Grader cost** — Token usage for the grading model's assessments\n- **Total cost** — Both combined, reported in the score summary",
92
92
  "source": "docs/help/scoring-model.md",
93
93
  "related": [
94
94
  "three-layer",
@@ -99,7 +99,7 @@ export const HELP_TOPICS = [
99
99
  {
100
100
  "id": "weaknesses-recommendations",
101
101
  "title": "Weaknesses & Recommendations",
102
- "body": "## Understanding weaknesses\n\nThe Issues sub-tab in Diagnostics lists every area or dimension that scored\nbelow threshold. Each weakness entry shows:\n\n- **The feature area** — Which product feature is affected (e.g., GROQ,\n Functions, Webhooks).\n- **The bottleneck dimension** — Which scoring dimension is dragging the area\n down: task completion, code correctness, or doc coverage.\n- **The score** — How far below threshold the dimension scored.\n\n## Gap analysis recommendations\n\nWhen an evaluation runs with gap analysis enabled, the dashboard shows\n**prioritized recommendations** — specific actions ranked by estimated impact.\n\nEach recommendation includes:\n\n- **Failure mode** — The type of doc problem identified:\n - `missing-docs` — The functionality isn't documented at all.\n - `incorrect-docs` — The docs contain factual errors.\n - `outdated-docs` — The docs describe an old API version or pattern.\n - `poor-structure` — The docs exist but are hard to find or understand.\n- **Estimated lift** — How many score points fixing this gap would add. Based on\n raising the bottleneck dimension to the median of non-bottleneck dimensions.\n Conservative estimate — actual improvement may be higher.\n- **Confidence** — How sure the analysis is about this diagnosis (high, medium,\n or low).\n- **Affected tasks** — Which specific evaluation tasks exposed this gap.\n\n## Low-scoring judgments\n\nBelow the recommendations, you'll find the **grader's explanations** for tests\nthat scored below 70. These are the raw assessments from the grading model\nexplaining exactly what went wrong — missing API calls, incorrect patterns,\nhallucinated features, etc.\n\nEach judgment shows the task, the dimension, the score, and the grader's natural\nlanguage reason. These are the most granular diagnostic signal available and\noften point directly to the doc section that needs fixing.",
102
+ "body": "## Understanding weaknesses\n\nThe Issues sub-tab in Diagnostics lists every area or dimension that scored\nbelow threshold. Each weakness entry shows:\n\n- **The feature area** — Which product feature is affected (e.g., GROQ,\n Functions, Webhooks).\n- **The bottleneck dimension** — Which scoring dimension is dragging the area\n down: task completion, code correctness, or doc coverage.\n- **The score** — How far below threshold the dimension scored.\n\n## Gap analysis recommendations\n\nWhen an evaluation runs with gap analysis enabled, the dashboard shows\n**prioritized recommendations** — specific actions ranked by estimated impact.\n\nEach recommendation includes:\n\n- **Failure mode** — The type of doc problem identified:\n - `missing-docs` — The functionality isn't documented at all.\n - `incorrect-docs` — The docs contain factual errors.\n - `outdated-docs` — The docs describe an old API version or pattern.\n - `poor-structure` — The docs exist but are hard to find or understand.\n- **Estimated lift** — How many score points fixing this gap would add. Based on\n raising the bottleneck dimension to the median of non-bottleneck dimensions.\n Conservative estimate — actual improvement may be higher.\n- **Confidence** — How sure the analysis is about this diagnosis (high, medium,\n or low).\n- **Affected tasks** — Which specific evaluation tasks exposed this gap.\n\n## Diagnosis cards\n\nEvery published report now carries a **diagnosis artifact** — a set of cards\nproduced by the post-pipeline hook (`ailf interpret`). The Studio diagnosis\npanel renders these cards directly; the dashboard's Recommendations and\nFailure-modes panels migrate to the same source in a follow-up.\n\nThe hook runs by default for every pipeline invocation. 
To opt out for a single\nrun, pass `--no-summary`; to opt out in CI, set `AILF_INTERPRET_ON_RUN=0` in the\nworkflow env block; to opt out project-wide, set `summary.onRun: never` in\n`.ailf/config.yaml`.\n\n## Low-scoring judgments\n\nBelow the recommendations, you'll find the **grader's explanations** for tests\nthat scored below 70. These are the raw assessments from the grading model\nexplaining exactly what went wrong — missing API calls, incorrect patterns,\nhallucinated features, etc.\n\nEach judgment shows the task, the dimension, the score, and the grader's natural\nlanguage reason. These are the most granular diagnostic signal available and\noften point directly to the doc section that needs fixing.",
103
103
  "source": "docs/help/weaknesses-recommendations.md",
104
104
  "related": [
105
105
  "interpreting-diagnostics",