@sanity/ailf 7.0.1 → 7.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/config/rubrics.ts +12 -13
  2. package/dist/_vendor/ailf-core/ports/context.d.ts +45 -3
  3. package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
  4. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +9 -1
  5. package/dist/_vendor/ailf-core/schemas/branded-string.js +16 -6
  6. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -0
  7. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
  8. package/dist/_vendor/ailf-core/schemas/report.d.ts +12 -0
  9. package/dist/_vendor/ailf-core/schemas/report.js +2 -0
  10. package/dist/_vendor/ailf-core/schemas/team.d.ts +22 -0
  11. package/dist/_vendor/ailf-core/schemas/team.js +63 -0
  12. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +51 -0
  13. package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
  14. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +17 -0
  15. package/dist/_vendor/ailf-core/types/team.d.ts +65 -0
  16. package/dist/_vendor/ailf-core/types/team.js +1 -0
  17. package/dist/_vendor/ailf-shared/eval-modes.d.ts +2 -0
  18. package/dist/_vendor/ailf-shared/eval-modes.js +5 -0
  19. package/dist/_vendor/ailf-shared/event-types.d.ts +15 -0
  20. package/dist/_vendor/ailf-shared/event-types.js +23 -0
  21. package/dist/_vendor/ailf-shared/generated/help-content.js +2 -2
  22. package/dist/_vendor/ailf-shared/index.d.ts +4 -2
  23. package/dist/_vendor/ailf-shared/index.js +4 -2
  24. package/dist/_vendor/ailf-shared/member-roles.d.ts +16 -0
  25. package/dist/_vendor/ailf-shared/member-roles.js +16 -0
  26. package/dist/_vendor/ailf-shared/owner-teams.d.ts +19 -0
  27. package/dist/_vendor/ailf-shared/owner-teams.js +7 -0
  28. package/dist/_vendor/ailf-shared/run-context.d.ts +8 -1
  29. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +65 -1
  30. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +35 -0
  31. package/dist/adapters/task-sources/changed-docs-filter.d.ts +12 -0
  32. package/dist/adapters/task-sources/changed-docs-filter.js +30 -0
  33. package/dist/adapters/task-sources/content-lake-task-source.js +2 -1
  34. package/dist/adapters/task-sources/repo-task-source.js +2 -1
  35. package/dist/commands/pipeline-action.d.ts +4 -3
  36. package/dist/commands/pipeline-action.js +7 -5
  37. package/dist/commands/run.js +2 -2
  38. package/dist/config/rubrics.ts +12 -13
  39. package/dist/job-store.d.ts +18 -0
  40. package/dist/job-store.js +34 -0
  41. package/dist/orchestration/build-app-context.js +8 -1
  42. package/dist/orchestration/pipeline-orchestrator.js +46 -1
  43. package/dist/orchestration/steps/compare-step.d.ts +7 -0
  44. package/dist/orchestration/steps/compare-step.js +59 -23
  45. package/dist/orchestration/steps/fetch-docs-step.js +3 -0
  46. package/dist/orchestration/steps/finalize-run-step.js +2 -0
  47. package/dist/orchestration/steps/generate-configs-step.d.ts +32 -1
  48. package/dist/orchestration/steps/generate-configs-step.js +47 -13
  49. package/dist/orchestration/steps/grader-consistency-step.js +11 -0
  50. package/dist/orchestration/steps/publish-report-step.d.ts +12 -1
  51. package/dist/orchestration/steps/publish-report-step.js +19 -3
  52. package/dist/pipeline/cache-hit-restore.d.ts +14 -1
  53. package/dist/pipeline/cache-hit-restore.js +17 -0
  54. package/dist/pipeline/calculate-scores.js +57 -21
  55. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +7 -2
  56. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +13 -4
  57. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +1 -1
  58. package/dist/pipeline/compiler/provider-assembler.d.ts +15 -1
  59. package/dist/pipeline/compiler/provider-assembler.js +16 -3
  60. package/dist/pipeline/failure-modes.d.ts +20 -10
  61. package/dist/pipeline/failure-modes.js +84 -15
  62. package/dist/pipeline/map-request-to-config.js +2 -0
  63. package/dist/pipeline/normalize-mode.d.ts +1 -1
  64. package/dist/pipeline/normalize-mode.js +2 -0
  65. package/dist/pipeline/run-context.d.ts +16 -1
  66. package/dist/pipeline/run-context.js +12 -1
  67. package/dist/pipeline/validate.d.ts +8 -4
  68. package/dist/pipeline/validate.js +8 -18
  69. package/dist/report-store.d.ts +14 -1
  70. package/dist/report-store.js +32 -0
  71. package/dist/sanity/client.js +2 -2
  72. package/package.json +1 -1
@@ -64,9 +64,16 @@ function applyReplaySwap(providers) {
64
64
  * Returns provider arrays keyed by literacy variant name (baseline,
65
65
  * agentic, observed). These are consumed by the YAML writer to produce
66
66
  * the per-variant promptfoo config files.
67
+ *
68
+ * `loaded` (optional) lets callers pre-load and pre-filter the
69
+ * `ModelsConfig` so a caller-side filter (e.g. W0281's
70
+ * `filterModelsByRequest`) actually takes effect on the assembled
71
+ * providers — building providers from the unfiltered set would silently
72
+ * defeat the filter, since promptfoo decides which LLMs to call from the
73
+ * providers array, not the returned `models` field.
67
74
  */
68
- export function loadModelsAndProviders(rootDir, source, searchMode, allowedOrigins) {
69
- const models = loadModelsYaml(rootDir);
75
+ export function loadModelsAndProviders(rootDir, source, searchMode, allowedOrigins, loaded) {
76
+ const models = loaded ?? loadModelsYaml(rootDir);
70
77
  return {
71
78
  models,
72
79
  providers: {
@@ -203,6 +210,12 @@ export function resolveMaxToolRounds(models, model, variant) {
203
210
  // ---------------------------------------------------------------------------
204
211
  // Helpers
205
212
  // ---------------------------------------------------------------------------
206
- function loadModelsYaml(rootDir) {
213
+ /**
214
+ * Load the `ModelsConfig` for `rootDir` from disk. Exported so callers
215
+ * that need to pre-filter the model set before provider assembly (e.g.
216
+ * `PipelineRequest.models`) can hand the filtered config back to
217
+ * `loadModelsAndProviders` via its optional `loaded` parameter.
218
+ */
219
+ export function loadModelsYaml(rootDir) {
207
220
  return loadConfigFile("models", rootDir).data;
208
221
  }
@@ -1,24 +1,34 @@
1
1
  /**
2
2
  * pipeline/failure-modes.ts
3
3
  *
4
- * Ceiling-cross-check failure-mode validator + report assembly.
4
+ * Ceiling-cross-check failure-mode validator + report assembly + keyword
5
+ * fallback classifier.
5
6
  *
6
7
  * The grader emits `failureMode` directly under the per-dimension taxonomy
7
- * (Plan 03-02 — `packages/eval/src/grader/`). This module consumes the
8
- * grader's emission as the source of truth and uses the surviving ceiling
9
- * decomposition (`classifyByCeiling`) only as a CONFIDENCE VALIDATOR — it
10
- * cross-checks the emitted mode against structural score signals and emits
11
- * a D0049 `Confidence` triple stamped with `derivation: "ceiling-cross-check"`.
8
+ * (Plan 03-02 — `packages/eval/src/grader/`) when its structured response
9
+ * is available to the pipeline. In practice (W0273 discovery), Promptfoo's
10
+ * `llm-rubric` post-processor extracts `score` + `reason` from the grader's
11
+ * JSON envelope and discards the rest of the structured surface including
12
+ * `failureMode`. The wire-shape footer instructs the LLM correctly but the
13
+ * structured fields never reach `extractGraderJudgments`, so every emission
14
+ * arrives as the synthesized `failureMode: "unclassified"` placeholder.
12
15
  *
13
- * The legacy keyword-pattern classifier (and its five regex pattern
14
- * constants) was deleted in Plan 03-03 — its production coverage was ~1%
15
- * (Doc 03 §"Where classification lives"), and resurrecting it as a fallback
16
- * is explicitly out of scope.
16
+ * To restore the pre-2026-05-11 classification rate (15-23% → 0% → 15-23%),
17
+ * a keyword-pattern classifier is run as a FALLBACK when the grader's
18
+ * emitted mode is `"unclassified"` and the score is below the classification
19
+ * threshold. Plan 03-03 deleted this classifier in favor of grader-emission
20
+ * source-of-truth; W0273 reinstates it because the grader-emission path is
21
+ * blocked by Promptfoo's lossy `llm-rubric` parsing. The long-term fix
22
+ * (capturing the grader's full structured response) is tracked separately.
23
+ *
24
+ * `classifyByCeiling` continues to serve as the confidence cross-check.
17
25
  *
18
26
  * @see docs/decisions/D0005-grader-model-separation.md — single grader emits
19
27
  * failureMode under the per-dimension taxonomy
20
28
  * @see docs/decisions/D0049-shared-confidence-contract.md — Confidence triple
21
29
  * shape and `ceiling-cross-check` derivation tag
30
+ * @see docs/audits/2026-05-22-empty-gap-analysis-regression.md — W0273 root
31
+ * cause (Promptfoo strips structured fields)
22
32
  */
23
33
  import type { Confidence } from "../_vendor/ailf-core/index.d.ts";
24
34
  import type { FailureModeReport, FeatureScore, GraderJudgment } from "./types.js";
@@ -1,24 +1,34 @@
1
1
  /**
2
2
  * pipeline/failure-modes.ts
3
3
  *
4
- * Ceiling-cross-check failure-mode validator + report assembly.
4
+ * Ceiling-cross-check failure-mode validator + report assembly + keyword
5
+ * fallback classifier.
5
6
  *
6
7
  * The grader emits `failureMode` directly under the per-dimension taxonomy
7
- * (Plan 03-02 — `packages/eval/src/grader/`). This module consumes the
8
- * grader's emission as the source of truth and uses the surviving ceiling
9
- * decomposition (`classifyByCeiling`) only as a CONFIDENCE VALIDATOR — it
10
- * cross-checks the emitted mode against structural score signals and emits
11
- * a D0049 `Confidence` triple stamped with `derivation: "ceiling-cross-check"`.
8
+ * (Plan 03-02 — `packages/eval/src/grader/`) when its structured response
9
+ * is available to the pipeline. In practice (W0273 discovery), Promptfoo's
10
+ * `llm-rubric` post-processor extracts `score` + `reason` from the grader's
11
+ * JSON envelope and discards the rest of the structured surface including
12
+ * `failureMode`. The wire-shape footer instructs the LLM correctly but the
13
+ * structured fields never reach `extractGraderJudgments`, so every emission
14
+ * arrives as the synthesized `failureMode: "unclassified"` placeholder.
12
15
  *
13
- * The legacy keyword-pattern classifier (and its five regex pattern
14
- * constants) was deleted in Plan 03-03 — its production coverage was ~1%
15
- * (Doc 03 §"Where classification lives"), and resurrecting it as a fallback
16
- * is explicitly out of scope.
16
+ * To restore the pre-2026-05-11 classification rate (15-23% → 0% → 15-23%),
17
+ * a keyword-pattern classifier is run as a FALLBACK when the grader's
18
+ * emitted mode is `"unclassified"` and the score is below the classification
19
+ * threshold. Plan 03-03 deleted this classifier in favor of grader-emission
20
+ * source-of-truth; W0273 reinstates it because the grader-emission path is
21
+ * blocked by Promptfoo's lossy `llm-rubric` parsing. The long-term fix
22
+ * (capturing the grader's full structured response) is tracked separately.
23
+ *
24
+ * `classifyByCeiling` continues to serve as the confidence cross-check.
17
25
  *
18
26
  * @see docs/decisions/D0005-grader-model-separation.md — single grader emits
19
27
  * failureMode under the per-dimension taxonomy
20
28
  * @see docs/decisions/D0049-shared-confidence-contract.md — Confidence triple
21
29
  * shape and `ceiling-cross-check` derivation tag
30
+ * @see docs/audits/2026-05-22-empty-gap-analysis-regression.md — W0273 root
31
+ * cause (Promptfoo strips structured fields)
22
32
  */
23
33
  import { LEGACY_FAILURE_MODES, detectFeatureArea } from "../_vendor/ailf-core/index.js";
24
34
  // ---------------------------------------------------------------------------
@@ -27,6 +37,20 @@ import { LEGACY_FAILURE_MODES, detectFeatureArea } from "../_vendor/ailf-core/in
27
37
  /** Only classify judgments with scores below this threshold */
28
38
  const CLASSIFICATION_THRESHOLD = 60;
29
39
  // ---------------------------------------------------------------------------
40
+ // Keyword patterns (W0273 fallback)
41
+ //
42
+ // Verbatim from the pre-Plan-03-03 implementation. Used only when the
43
+ // grader's emitted `failureMode` is `"unclassified"` — the grader's
44
+ // emission still wins whenever it actually reaches the pipeline.
45
+ // ---------------------------------------------------------------------------
46
+ /** API error pattern — checked FIRST to prevent timeout errors containing
47
+ * "deprecated" from being misclassified as outdated-docs. */
48
+ const API_ERROR_PATTERN = /\[api-error\]|timeout|timed out|rate limit|429|503|ECONNRESET|ETIMEDOUT|socket hang up|fetch failed/i;
49
+ const OUTDATED_PATTERN = /deprecated|old api|v[0-9]+ syntax|no longer supported|legacy|previous version|outdated|superseded|replaced by/i;
50
+ const MISSING_PATTERN = /no documentation|not covered|had to guess|not found|missing.*doc|no.*information|undocumented|couldn't find|without.*documentation/i;
51
+ const INCORRECT_PATTERN = /contradicts|incorrect.*doc|doc.*incorrect|wrong.*doc|doc.*wrong|documentation says.*but|factual error|inaccurate|misleading.*doc/i;
52
+ const POOR_STRUCTURE_PATTERN = /unclear|ambiguous|couldn't determine|conflicting|confusing|hard to follow|poorly organized|scattered|fragmented/i;
53
+ // ---------------------------------------------------------------------------
30
54
  // Public API
31
55
  // ---------------------------------------------------------------------------
32
56
  /**
@@ -69,13 +93,25 @@ export function buildFailureModeReport(judgments, scores) {
69
93
  // grader's actual taxonomy choice rather than a collapsed
70
94
  // `"unclassified"` bucket.
71
95
  const emittedMode = readEmittedMode(judgment);
96
+ // W0273 fallback — when the grader's emitted mode is "unclassified"
97
+ // (the synthesized-unparsed-judgment placeholder; in practice this
98
+ // is every judgment today because Promptfoo's llm-rubric strips the
99
+ // grader's structured response), try keyword classification against
100
+ // the reason prose. Gated on score < CLASSIFICATION_THRESHOLD so
101
+ // passing judgments don't get spurious classifications.
102
+ const keywordFallback = emittedMode === "unclassified" &&
103
+ judgment.score < CLASSIFICATION_THRESHOLD
104
+ ? classifyByKeyword(judgment.reason)
105
+ : null;
72
106
  // Cross-check the grader's emission against ceiling decomposition.
73
107
  const stamp = validateFailureMode(judgment, ceilingScore, floorScore);
74
- const classification = {
75
- confidence: stamp.level,
76
- mode: emittedMode,
77
- source: "ceiling",
78
- };
108
+ const classification = keywordFallback
109
+ ? keywordFallback
110
+ : {
111
+ confidence: stamp.level,
112
+ mode: emittedMode,
113
+ source: "ceiling",
114
+ };
79
115
  classifiedJudgments.push({ classification, judgment });
80
116
  summary[classification.mode] = (summary[classification.mode] ?? 0) + 1;
81
117
  // Per-area tracking
@@ -282,6 +318,39 @@ function readEmittedMode(judgment) {
282
318
  }
283
319
  return emitted;
284
320
  }
321
+ /**
322
+ * Classify the failure mode of a low-scoring grader judgment by matching
323
+ * keyword patterns against the reason prose. Returns `null` when no
324
+ * pattern matches. Patterns checked in priority order (API errors first
325
+ * so timeout messages containing "deprecated" don't get misclassified
326
+ * as outdated-docs).
327
+ *
328
+ * W0273 — reinstated as a fallback when the grader's emitted failureMode
329
+ * is "unclassified". Plan 03-03 deleted this code in favor of grader-
330
+ * emission source-of-truth; the deletion is reversed here because
331
+ * Promptfoo's llm-rubric post-processor strips the grader's structured
332
+ * response (only score + reason survive into `comp.*`), so the
333
+ * grader-emission path produces 0% classification on every run.
334
+ */
335
+ function classifyByKeyword(reason) {
336
+ const lower = reason.toLowerCase();
337
+ if (API_ERROR_PATTERN.test(lower)) {
338
+ return { confidence: "high", mode: "api-error", source: "keyword" };
339
+ }
340
+ if (OUTDATED_PATTERN.test(lower)) {
341
+ return { confidence: "high", mode: "outdated-docs", source: "keyword" };
342
+ }
343
+ if (MISSING_PATTERN.test(lower)) {
344
+ return { confidence: "high", mode: "missing-docs", source: "keyword" };
345
+ }
346
+ if (INCORRECT_PATTERN.test(lower)) {
347
+ return { confidence: "medium", mode: "incorrect-docs", source: "keyword" };
348
+ }
349
+ if (POOR_STRUCTURE_PATTERN.test(lower)) {
350
+ return { confidence: "medium", mode: "poor-structure", source: "keyword" };
351
+ }
352
+ return null;
353
+ }
285
354
  /**
286
355
  * Classify by ceiling-decomposition structural signals — preserved
287
356
  * verbatim from the pre-Plan-03-03 implementation. The function itself
@@ -37,6 +37,7 @@ export function mapRequestToConfig(request, rootDir) {
37
37
  mode,
38
38
  variant,
39
39
  debug: mapDebug(request.debug),
40
+ models: request.models,
40
41
  areas: request.areas,
41
42
  tasks: request.tasks,
42
43
  changedDocs: request.changedDocs,
@@ -46,6 +47,7 @@ export function mapRequestToConfig(request, rootDir) {
46
47
  compareEnabled: request.compare ?? false,
47
48
  compareThreshold: request.compareThreshold,
48
49
  compareBaseline: request.compareBaseline,
50
+ compareBaselineReportId: request.compareBaselineReportId,
49
51
  gapAnalysisEnabled: request.gapAnalysis ?? true,
50
52
  publishEnabled: request.publish ?? publishDefault,
51
53
  publishTag: request.publishTag,
@@ -35,7 +35,7 @@ export type LiteracyVariantName = (typeof LiteracyVariant)[keyof typeof Literacy
35
35
  export type LiteracyEvalSubMode = typeof LiteracyVariant.STANDARD | typeof LiteracyVariant.AGENTIC;
36
36
  export interface NormalizedMode {
37
37
  mode: EvalMode;
38
- variant?: string;
38
+ variant?: LiteracyVariantName;
39
39
  }
40
40
  /**
41
41
  * Normalize a raw CLI mode string to a canonical mode + optional variant.
@@ -55,6 +55,8 @@ const ALL_ACCEPTED = [
55
55
  export function normalizeMode(input) {
56
56
  if (LEGACY_LITERACY_VARIANTS.has(input)) {
57
57
  console.warn(`⚠ Deprecated: --mode ${input} is a legacy alias. Use --mode literacy --variant ${input} instead.`);
58
+ // The membership check above narrows `input` to LITERACY_VARIANTS — the
59
+ // cast is to the closed type, not a widening.
58
60
  return { mode: "literacy", variant: input };
59
61
  }
60
62
  if (CANONICAL_MODES.has(input)) {
@@ -13,7 +13,7 @@
13
13
  * @see docs/decisions/D0032-run-anchored-artifact-store.md (§ Move 5 — Drift Prevention)
14
14
  */
15
15
  import { type Logger, type RunContext } from "../_vendor/ailf-core/index.d.ts";
16
- import { type RunClassification, type RunExecutor, type RunExecutorSurface, type RunHost, type RunLineage, type RunOwner, type RunTool } from "../_vendor/ailf-shared/index.d.ts";
16
+ import { type LiteracyVariant, type RunClassification, type RunExecutor, type RunExecutorSurface, type RunHost, type RunLineage, type RunOwner, type RunTool } from "../_vendor/ailf-shared/index.d.ts";
17
17
  import type { ResolvedSourceConfig } from "../sources.js";
18
18
  import type { EvalMode } from "./types.js";
19
19
  /**
@@ -74,6 +74,21 @@ export interface RunContextInput {
74
74
  source: ResolvedSourceConfig;
75
75
  /** Specific task IDs evaluated (if scoped) */
76
76
  taskIds?: string[];
77
+ /**
78
+ * Literacy mode variant (`baseline | agentic | observed | full`). Only
79
+ * meaningful when `mode === "literacy"`; ignored for other modes. Lands
80
+ * on `RunContext.variant` and `ReportProvenance.variant` so consumers
81
+ * can disambiguate which literacy variant the run executed.
82
+ */
83
+ variant?: LiteracyVariant;
84
+ /**
85
+ * Model IDs the caller requested via `PipelineRequest.models`. When
86
+ * present, `RunContext.models` is filtered to this subset so the report's
87
+ * `provenance.models` reflects what was actually evaluated. Unknown IDs
88
+ * are silently filtered out — the upstream rejection path (W0281
89
+ * `filterModelsByRequest`) has already failed the run or warned.
90
+ */
91
+ requestedModelIds?: string[];
77
92
  }
78
93
  /**
79
94
  * Derive `RunContext` from pipeline inputs. The only construction path.
@@ -68,8 +68,18 @@ export function buildRunContext(input) {
68
68
  // config/models.ts model matrix — listing those models would be
69
69
  // misleading. Only include them for literacy mode where they're the
70
70
  // actual eval targets.
71
+ //
72
+ // When `PipelineRequest.models` pinned a subset, filter here too so
73
+ // `provenance.models` matches what actually ran (W0281). Without this
74
+ // the report would advertise the full cohort even though only the
75
+ // requested subset reached the LLMs.
76
+ const requestedSet = input.requestedModelIds?.length
77
+ ? new Set(input.requestedModelIds)
78
+ : undefined;
71
79
  const evaluatedModels = input.mode === "literacy"
72
- ? models.models.map((m) => ({ id: m.id, label: m.label }))
80
+ ? models.models
81
+ .filter((m) => !requestedSet || requestedSet.has(m.id))
82
+ .map((m) => ({ id: m.id, label: m.label }))
73
83
  : [];
74
84
  return {
75
85
  areas: input.areas,
@@ -95,6 +105,7 @@ export function buildRunContext(input) {
95
105
  taskIds: input.taskIds,
96
106
  tool,
97
107
  trigger,
108
+ variant: input.mode === "literacy" ? input.variant : undefined,
98
109
  };
99
110
  }
100
111
  // ---------------------------------------------------------------------------
@@ -14,11 +14,15 @@ import type { ValidationIssue, ValidationResult } from "./types.js";
14
14
  */
15
15
  export declare function validateConfiguration(rootDir: string): ValidationResult;
16
16
  /**
17
- * Check that canonical context files exist. These are the per-task
18
- * gold-retrieval contexts actually referenced by task definitions.
17
+ * Check that the canonical-contexts directory exists.
19
18
  *
20
- * Contexts are generated by fetch-docs and may not exist yet —
21
- * returns warnings, not errors.
19
+ * Contexts are populated by fetch-docs, which scopes to the tasks
20
+ * actually being evaluated (not every task in the registry). Warning
21
+ * on individual missing files here would fire for every task the user
22
+ * didn't select — pure noise that previously crowded out real errors
23
+ * in the GHA safety-net's tail-of-log capture (W0282). The per-task
24
+ * precondition is enforced by `run-eval-step.ts:checkCanonicalContextsExist`
25
+ * against the filtered task set, where missing files are real errors.
22
26
  */
23
27
  export declare function validateContexts(rootDir: string): ValidationIssue[];
24
28
  /**
@@ -34,11 +34,15 @@ export function validateConfiguration(rootDir) {
34
34
  return { issues, valid };
35
35
  }
36
36
  /**
37
- * Check that canonical context files exist. These are the per-task
38
- * gold-retrieval contexts actually referenced by task definitions.
37
+ * Check that the canonical-contexts directory exists.
39
38
  *
40
- * Contexts are generated by fetch-docs and may not exist yet —
41
- * returns warnings, not errors.
39
+ * Contexts are populated by fetch-docs, which scopes to the tasks
40
+ * actually being evaluated (not every task in the registry). Warning
41
+ * on individual missing files here would fire for every task the user
42
+ * didn't select — pure noise that previously crowded out real errors
43
+ * in the GHA safety-net's tail-of-log capture (W0282). The per-task
44
+ * precondition is enforced by `run-eval-step.ts:checkCanonicalContextsExist`
45
+ * against the filtered task set, where missing files are real errors.
42
46
  */
43
47
  export function validateContexts(rootDir) {
44
48
  const source = "validateContexts";
@@ -46,20 +50,6 @@ export function validateContexts(rootDir) {
46
50
  const canonicalDir = path.join(rootDir, "contexts", "canonical");
47
51
  if (!fs.existsSync(canonicalDir)) {
48
52
  issues.push(warning(source, "contexts/canonical/ directory not found — run 'pnpm fetch-docs' to generate", canonicalDir));
49
- return issues;
50
- }
51
- const mappings = resolveMappings(rootDir);
52
- for (const [, areaConfig] of Object.entries(mappings.feature_areas)) {
53
- if (!areaConfig?.tasks)
54
- continue;
55
- for (const task of areaConfig.tasks) {
56
- if (!task.id)
57
- continue;
58
- const contextFile = path.join(canonicalDir, `${task.id}.md`);
59
- if (!fs.existsSync(contextFile)) {
60
- issues.push(warning(source, `Missing canonical context for task '${task.id}' — run 'pnpm fetch-docs' to generate`, contextFile));
61
- }
62
- }
63
53
  }
64
54
  return issues;
65
55
  }
@@ -15,7 +15,7 @@
15
15
  * @see docs/design-docs/report-store/domain-model.md
16
16
  */
17
17
  import type { SanityClient } from "@sanity/client";
18
- import type { ArtifactRef, ArtifactType, SynthesisCostTelemetry } from "./_vendor/ailf-core/index.d.ts";
18
+ import type { ArtifactRef, ArtifactType, LoadBaselineResult, SynthesisCostTelemetry } from "./_vendor/ailf-core/index.d.ts";
19
19
  import type { ComparisonReport, ISOTimestamp, LineageQuery, Report, ReportId, ReportProvenance, ScoreSummary } from "./pipeline/types.js";
20
20
  /**
21
21
  * Result of an auto-comparison, bundling the ComparisonReport with the
@@ -113,6 +113,19 @@ export declare class ReportStore {
113
113
  * W0191 runtime schema gate. Sanity API failures still return null.
114
114
  */
115
115
  read(id: ReportId): Promise<null | Report>;
116
+ /**
117
+ * Load a previously-published report's score summary as a baseline
118
+ * for comparison. Returns a discriminated result so the caller can
119
+ * distinguish a genuine 404 (skip compare with a clear reason) from
120
+ * a transport failure (fail the step — the user pinned a baseline
121
+ * and deserves to know it didn't actually compare).
122
+ *
123
+ * The report's `summary` field is a `ReportSummary` — a superset of
124
+ * `ComparableSummary` — so the projection below carries everything
125
+ * the `compare()` primitive needs (`overall`, `perModel`, `scores`)
126
+ * without re-hydrating the slim prose/array fields.
127
+ */
128
+ loadBaselineFromReport(reportId: string): Promise<LoadBaselineResult>;
116
129
  /**
117
130
  * Write a report to the Sanity Content Lake.
118
131
  *
@@ -270,6 +270,38 @@ export class ReportStore {
270
270
  return null;
271
271
  }
272
272
  }
273
+ /**
274
+ * Load a previously-published report's score summary as a baseline
275
+ * for comparison. Returns a discriminated result so the caller can
276
+ * distinguish a genuine 404 (skip compare with a clear reason) from
277
+ * a transport failure (fail the step — the user pinned a baseline
278
+ * and deserves to know it didn't actually compare).
279
+ *
280
+ * The report's `summary` field is a `ReportSummary` — a superset of
281
+ * `ComparableSummary` — so the projection below carries everything
282
+ * the `compare()` primitive needs (`overall`, `perModel`, `scores`)
283
+ * without re-hydrating the slim prose/array fields.
284
+ */
285
+ async loadBaselineFromReport(reportId) {
286
+ try {
287
+ const doc = await this.client.fetch(`*[_type == $type && reportId == $id][0]{ summary }`, { id: reportId, type: REPORT_TYPE });
288
+ const summary = doc?.summary;
289
+ if (!summary)
290
+ return { kind: "not_found" };
291
+ return {
292
+ kind: "ok",
293
+ baseline: {
294
+ overall: summary.overall,
295
+ perModel: summary.perModel,
296
+ scores: summary.scores,
297
+ },
298
+ };
299
+ }
300
+ catch (error) {
301
+ const message = error instanceof Error ? error.message : String(error);
302
+ return { kind: "error", message };
303
+ }
304
+ }
273
305
  /**
274
306
  * Write a report to the Sanity Content Lake.
275
307
  *
@@ -108,8 +108,8 @@ export function getSanityClient(overrides, source) {
108
108
  * fall back to `SANITY_DATASET` so existing CI workflows that pin a
109
109
  * test/staging dataset (e.g. Tier 2 with `SANITY_DATASET=ailf-test`)
110
110
  * continue to work without a new env var. The hard-coded fallback is
111
- * the editorial dataset name during the D0043 cutover window — the flip
112
- * to `ailf-prod-private` happens after the migration script runs.
111
+ * `AILF_DATASET_DEFAULT` (`ailf-prod-private`, D0043) — only reached for
112
+ * ad-hoc runs with no env at all.
113
113
  *
114
114
  * Token resolution prefers the AILF-scoped token, falling back to
115
115
  * the shared `SANITY_API_TOKEN`.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "7.0.1",
3
+ "version": "7.1.0",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"