@sanity/ailf 4.2.0 → 4.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/config/package-surface.ts +37 -0
  2. package/config/preflight-scoring.ts +26 -0
  3. package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
  4. package/dist/_vendor/ailf-core/artifact-registry.js +47 -0
  5. package/dist/_vendor/ailf-core/config-helpers.d.ts +35 -0
  6. package/dist/_vendor/ailf-core/config-helpers.js +67 -0
  7. package/dist/_vendor/ailf-core/index.d.ts +1 -1
  8. package/dist/_vendor/ailf-core/index.js +1 -1
  9. package/dist/_vendor/ailf-core/ports/context.d.ts +18 -0
  10. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +30 -0
  11. package/dist/_vendor/ailf-core/ports/index.d.ts +3 -1
  12. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  13. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +23 -0
  14. package/dist/_vendor/ailf-core/ports/package-surface-resolver.d.ts +71 -0
  15. package/dist/_vendor/ailf-core/ports/package-surface-resolver.js +36 -0
  16. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +6 -0
  17. package/dist/_vendor/ailf-core/schemas/eval-config.js +14 -0
  18. package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
  19. package/dist/_vendor/ailf-core/schemas/index.js +1 -0
  20. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +4 -0
  21. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
  22. package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.d.ts +51 -0
  23. package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.js +57 -0
  24. package/dist/_vendor/ailf-core/types/index.d.ts +12 -0
  25. package/dist/_vendor/ailf-core/types/index.js +1 -0
  26. package/dist/_vendor/ailf-core/types/package-surface.d.ts +36 -0
  27. package/dist/_vendor/ailf-core/types/package-surface.js +13 -0
  28. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  29. package/dist/_vendor/ailf-core/types/preflight-scoring.d.ts +52 -0
  30. package/dist/_vendor/ailf-core/types/preflight-scoring.js +18 -0
  31. package/dist/_vendor/ailf-core/types/repo-config.d.ts +14 -0
  32. package/dist/_vendor/ailf-core/types/symbol-preflight-report.d.ts +66 -0
  33. package/dist/_vendor/ailf-core/types/symbol-preflight-report.js +25 -0
  34. package/dist/adapters/api-client/build-request.d.ts +1 -0
  35. package/dist/adapters/api-client/build-request.js +3 -0
  36. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  37. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +4 -0
  38. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +159 -82
  39. package/dist/adapters/index.d.ts +1 -0
  40. package/dist/adapters/index.js +1 -0
  41. package/dist/adapters/package-surface/dts-package-surface.d.ts +46 -0
  42. package/dist/adapters/package-surface/dts-package-surface.js +173 -0
  43. package/dist/adapters/package-surface/in-memory-package-surface.d.ts +15 -0
  44. package/dist/adapters/package-surface/in-memory-package-surface.js +28 -0
  45. package/dist/adapters/package-surface/index.d.ts +9 -0
  46. package/dist/adapters/package-surface/index.js +8 -0
  47. package/dist/adapters/package-surface/parse-dts-exports.d.ts +31 -0
  48. package/dist/adapters/package-surface/parse-dts-exports.js +54 -0
  49. package/dist/adapters/task-sources/repo-schemas.d.ts +6 -0
  50. package/dist/adapters/task-sources/repo-schemas.js +15 -0
  51. package/dist/commands/pipeline-action.d.ts +2 -0
  52. package/dist/commands/pipeline-action.js +12 -0
  53. package/dist/commands/remote-pipeline.js +10 -2
  54. package/dist/commands/remote-results.d.ts +12 -1
  55. package/dist/commands/remote-results.js +25 -5
  56. package/dist/composition-root.js +9 -0
  57. package/dist/config/package-surface.ts +37 -0
  58. package/dist/config/preflight-scoring.ts +26 -0
  59. package/dist/index.d.ts +2 -2
  60. package/dist/index.js +1 -1
  61. package/dist/orchestration/build-app-context.js +1 -0
  62. package/dist/orchestration/pipeline-orchestrator.d.ts +19 -1
  63. package/dist/orchestration/pipeline-orchestrator.js +38 -0
  64. package/dist/orchestration/steps/calculate-scores-step.js +11 -0
  65. package/dist/orchestration/steps/generate-configs-step.js +16 -1
  66. package/dist/orchestration/steps/run-eval-step.js +27 -0
  67. package/dist/pipeline/calculate-scores.d.ts +66 -5
  68. package/dist/pipeline/calculate-scores.js +141 -27
  69. package/dist/pipeline/compiler/index.d.ts +1 -1
  70. package/dist/pipeline/compiler/index.js +1 -1
  71. package/dist/pipeline/compiler/literacy-bridge.d.ts +9 -0
  72. package/dist/pipeline/compiler/literacy-bridge.js +2 -0
  73. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +1 -1
  74. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +31 -4
  75. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +146 -1
  76. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +2 -0
  77. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +17 -2
  78. package/dist/pipeline/compiler/rubric-resolution.d.ts +17 -1
  79. package/dist/pipeline/compiler/rubric-resolution.js +78 -2
  80. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -2
  81. package/dist/pipeline/compiler/scoring-bridge.js +104 -10
  82. package/dist/pipeline/eval-fingerprint.d.ts +9 -0
  83. package/dist/pipeline/eval-fingerprint.js +7 -1
  84. package/dist/pipeline/map-request-to-config.js +1 -0
  85. package/dist/pipeline/preflight/compute-preflight.d.ts +67 -0
  86. package/dist/pipeline/preflight/compute-preflight.js +118 -0
  87. package/dist/pipeline/preflight/emit-symbol-preflight.d.ts +51 -0
  88. package/dist/pipeline/preflight/emit-symbol-preflight.js +102 -0
  89. package/dist/pipeline/preflight/load-package-surface.d.ts +14 -0
  90. package/dist/pipeline/preflight/load-package-surface.js +19 -0
  91. package/dist/pipeline/preflight/load-preflight-context.d.ts +13 -0
  92. package/dist/pipeline/preflight/load-preflight-context.js +25 -0
  93. package/dist/pipeline/preflight/load-preflight-scoring.d.ts +12 -0
  94. package/dist/pipeline/preflight/load-preflight-scoring.js +17 -0
  95. package/dist/pipeline/preflight/parse-imports.d.ts +62 -0
  96. package/dist/pipeline/preflight/parse-imports.js +125 -0
  97. package/dist/report-store.d.ts +8 -0
  98. package/dist/report-store.js +55 -6
  99. package/dist/sanity/document-renderers.d.ts +45 -7
  100. package/dist/sanity/document-renderers.js +99 -13
  101. package/dist/sanity/queries.d.ts +11 -11
  102. package/dist/sanity/queries.js +7 -0
  103. package/dist/sanity/symbol-index.d.ts +98 -0
  104. package/dist/sanity/symbol-index.js +615 -0
  105. package/package.json +2 -1
@@ -0,0 +1,28 @@
1
+ /**
2
+ * InMemoryPackageSurface — `PackageSurfaceResolver` test double.
3
+ *
4
+ * Backed by a plain `Map<string, PackageSurface>`; calls for unknown
5
+ * packages throw the same `package-not-installed` error the
6
+ * `DtsPackageSurface` adapter throws, so test scenarios for the
7
+ * `unresolved` path need no special handling.
8
+ */
9
+ import { PackageSurfaceResolverError, } from "../../_vendor/ailf-core/index.js";
10
/**
 * Map-backed package-surface resolver intended for tests.
 *
 * Surfaces are keyed by their `pkg` property. Looking up a package that
 * was never registered rejects with a `PackageSurfaceResolverError`
 * carrying the `package-not-installed` code.
 */
export class InMemoryPackageSurface {
    surfaces;
    /**
     * @param surfaces Initial surfaces to register; when two share the
     *   same `pkg`, the later one wins.
     */
    constructor(surfaces = []) {
        this.surfaces = new Map(Array.from(surfaces, (entry) => [entry.pkg, entry]));
    }
    /**
     * Register a surface, replacing any existing one for the same `pkg`.
     */
    set(surface) {
        this.surfaces.set(surface.pkg, surface);
    }
    /**
     * Resolve the registered surface for `pkg`.
     *
     * @returns The exact surface object that was registered.
     * @throws PackageSurfaceResolverError when no surface is registered.
     */
    async resolveExports(pkg) {
        const registered = this.surfaces.get(pkg);
        if (registered) {
            return registered;
        }
        throw new PackageSurfaceResolverError("package-not-installed", pkg, `InMemoryPackageSurface has no entry for "${pkg}".`);
    }
}
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Package-surface resolver adapters.
3
+ *
4
+ * @see packages/core/src/ports/package-surface-resolver.ts
5
+ */
6
+ export { DtsPackageSurface, type DtsPackageSurfaceOptions, type PackageRootResolver, } from "./dts-package-surface.js";
7
+ export { InMemoryPackageSurface } from "./in-memory-package-surface.js";
8
+ export { parseDtsExports } from "./parse-dts-exports.js";
9
+ export type { ParsedDtsExports } from "./parse-dts-exports.js";
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Package-surface resolver adapters.
3
+ *
4
+ * @see packages/core/src/ports/package-surface-resolver.ts
5
+ */
6
+ export { DtsPackageSurface, } from "./dts-package-surface.js";
7
+ export { InMemoryPackageSurface } from "./in-memory-package-surface.js";
8
+ export { parseDtsExports } from "./parse-dts-exports.js";
@@ -0,0 +1,31 @@
1
+ /**
2
+ * parse-dts-exports — pure function that extracts the public surface of a
3
+ * single `.d.ts` file as a list of top-level exported binding names plus
4
+ * any `export * from "./relative"` re-export specifiers.
5
+ *
6
+ * Implementation: delegates to `oxc-parser`'s `staticExports` view, which
7
+ * already decomposes each export statement into entries with `importName` /
8
+ * `exportName` / `moduleRequest` discriminators. We translate that view
9
+ * into the two outputs the W0198 preflight cares about — bare names and
10
+ * wildcard re-export specifiers — and drop default exports per the
11
+ * design's "named-bindings only" rule.
12
+ *
13
+ * Why oxc-parser instead of regex: top-level `.d.ts` syntax has enough TS
14
+ * surface area (declaration merging, conditional `exports` map types,
15
+ * ambient namespace augmentation) that a real AST is cheaper to maintain
16
+ * than a regex with the same coverage. Why oxc-parser instead of
17
+ * `typescript`: typescript isn't in `@sanity/ailf`'s runtime install graph
18
+ * and adding it adds ~50MB; oxc-parser is a few-MB native binary aligned
19
+ * with our existing `oxlint` / `oxfmt` toolchain.
20
+ */
21
+ export interface ParsedDtsExports {
22
+ /** Bare exported identifier names found in this file. */
23
+ readonly names: readonly string[];
24
+ /**
25
+ * Specifiers from `export * from "<spec>"` declarations. Only relative
26
+ * specifiers (starting with `.`) are useful for one-hop following; the
27
+ * caller decides which to resolve.
28
+ */
29
+ readonly reExports: readonly string[];
30
+ }
31
+ export declare function parseDtsExports(src: string): ParsedDtsExports;
@@ -0,0 +1,54 @@
1
+ /**
2
+ * parse-dts-exports — pure function that extracts the public surface of a
3
+ * single `.d.ts` file as a list of top-level exported binding names plus
4
+ * any `export * from "./relative"` re-export specifiers.
5
+ *
6
+ * Implementation: delegates to `oxc-parser`'s `staticExports` view, which
7
+ * already decomposes each export statement into entries with `importName` /
8
+ * `exportName` / `moduleRequest` discriminators. We translate that view
9
+ * into the two outputs the W0198 preflight cares about — bare names and
10
+ * wildcard re-export specifiers — and drop default exports per the
11
+ * design's "named-bindings only" rule.
12
+ *
13
+ * Why oxc-parser instead of regex: top-level `.d.ts` syntax has enough TS
14
+ * surface area (declaration merging, conditional `exports` map types,
15
+ * ambient namespace augmentation) that a real AST is cheaper to maintain
16
+ * than a regex with the same coverage. Why oxc-parser instead of
17
+ * `typescript`: typescript isn't in `@sanity/ailf`'s runtime install graph
18
+ * and adding it adds ~50MB; oxc-parser is a few-MB native binary aligned
19
+ * with our existing `oxlint` / `oxfmt` toolchain.
20
+ */
21
+ import { parseSync } from "oxc-parser";
22
/**
 * Extract the named export surface of a single `.d.ts` source text.
 *
 * @param src Declaration-file source text.
 * @returns `names` — the de-duplicated, sorted exported binding names;
 *   `reExports` — module specifiers of `export * from "<spec>"`
 *   statements, in source order.
 */
export function parseDtsExports(src) {
    // Both the `.d.ts` filename and the explicit `lang: "dts"` option ask
    // the parser for declaration-file grammar.
    const parsed = parseSync("input.d.ts", src, { lang: "dts" });
    const wildcardSpecifiers = [];
    const bindingNames = new Set();
    for (const { entries } of parsed.module.staticExports) {
        for (const { importName, exportName, moduleRequest } of entries) {
            // `export * from "./other"` exposes no binding of its own, so it
            // is recorded as a re-export specifier. `export * as ns from
            // "./other"` does not match this test and falls through to the
            // named-binding branch, because it exposes `ns`.
            const isWildcardReExport = importName.kind === "AllButDefault" && moduleRequest !== null;
            if (isWildcardReExport) {
                wildcardSpecifiers.push(moduleRequest.value);
            }
            else if (exportName.kind === "Name" && exportName.name) {
                // Own declarations, `export { x as y }`, `export { x } from
                // "./y"` and `export * as ns from "./y"` all land here.
                // Default exports have `exportName.kind === "Default"` and
                // are skipped on purpose: only named bindings are reported.
                bindingNames.add(exportName.name);
            }
        }
    }
    return {
        names: Array.from(bindingNames).sort(),
        reExports: wildcardSpecifiers,
    };
}
@@ -1471,6 +1471,12 @@ export declare const RepoConfigSchema: z.ZodObject<{
1471
1471
  gapAnalysis: z.ZodOptional<z.ZodBoolean>;
1472
1472
  apiUrl: z.ZodOptional<z.ZodString>;
1473
1473
  }, z.core.$strip>>;
1474
+ grader: z.ZodOptional<z.ZodObject<{
1475
+ context: z.ZodOptional<z.ZodEnum<{
1476
+ "rubric-only": "rubric-only";
1477
+ "with-docs": "with-docs";
1478
+ }>>;
1479
+ }, z.core.$strip>>;
1474
1480
  output: z.ZodOptional<z.ZodObject<{
1475
1481
  dir: z.ZodOptional<z.ZodString>;
1476
1482
  }, z.core.$strip>>;
@@ -566,6 +566,20 @@ const ExecutionConfigSchema = z
566
566
  apiUrl: z.string().url().optional(),
567
567
  })
568
568
  .optional();
569
+ /**
570
+ * Grader configuration.
571
+ *
572
+ * - `context: "rubric-only"` — grader sees only the rubric template +
573
+ * criteria + candidate response.
574
+ * - `context: "with-docs"` — canonical reference content is injected into
575
+ * the assertion's `rubricPrompt` so the grader has authoritative ground
576
+ * truth.
577
+ */
578
+ const GraderConfigSchema = z
579
+ .object({
580
+ context: z.enum(["rubric-only", "with-docs"]).optional(),
581
+ })
582
+ .optional();
569
583
  /**
570
584
  * Task-source configuration (W0077 Phase 6h). Replaces the retired
571
585
  * `--task-source` and `--repo-tasks-path` CLI flags on `ailf run`.
@@ -658,6 +672,7 @@ export const RepoConfigSchema = z.object({
658
672
  reportStore: ReportStoreConfigSchema,
659
673
  publish: PublishConfigSchema,
660
674
  execution: ExecutionConfigSchema,
675
+ grader: GraderConfigSchema,
661
676
  output: OutputConfigSchema,
662
677
  owner: OwnerConfigSchema,
663
678
  agentic: AgenticConfigSchema,
@@ -27,6 +27,8 @@ export interface ResolvedOptions {
27
27
  dryRun: boolean;
28
28
  gapAnalysisEnabled: boolean;
29
29
  graderReplications?: number;
30
+ /** Grader context policy from `.ailf/config.yaml` `grader.context` */
31
+ graderContext?: "rubric-only" | "with-docs";
30
32
  headerArgs: string[];
31
33
  impactSummary?: ImpactSummary;
32
34
  mode: EvalMode;
@@ -249,6 +249,17 @@ export function computeResolvedOptions(opts) {
249
249
  const concurrency = repoConfig?.execution?.concurrency;
250
250
  const graderReplications = repoConfig?.execution?.graderReplications;
251
251
  const gapAnalysisEnabled = repoConfig?.execution?.gapAnalysis ?? true;
252
+ // Grader context policy. Cascade: env var > .ailf/config.yaml > unset
253
+ // (defaults to rubric-only at the EvalConfig boundary). The env var is the
254
+ // operational lever for one-shot comparison runs without editing the config file.
255
+ const rawGraderContext = process.env.AILF_GRADER_CONTEXT ?? repoConfig?.grader?.context;
256
+ const graderContext = rawGraderContext === "with-docs" || rawGraderContext === "rubric-only"
257
+ ? rawGraderContext
258
+ : undefined;
259
+ if (rawGraderContext && graderContext === undefined) {
260
+ console.error(`❌ Invalid grader.context "${rawGraderContext}". Must be "rubric-only" or "with-docs".`);
261
+ process.exit(1);
262
+ }
252
263
  // Remote mode
253
264
  const remote = opts.remote || process.env.AILF_REMOTE === "1";
254
265
  const apiUrl = process.env.AILF_API_URL ??
@@ -274,6 +285,7 @@ export function computeResolvedOptions(opts) {
274
285
  dryRun: opts.dryRun,
275
286
  gapAnalysisEnabled,
276
287
  graderReplications,
288
+ graderContext,
277
289
  headerArgs,
278
290
  impactSummary,
279
291
  mode,
@@ -90,12 +90,19 @@ export async function runRemotePipeline(opts, rootDir) {
90
90
  console.error(formatJobError(job));
91
91
  process.exit(1);
92
92
  }
93
- // 7. Fetch and write output artifacts
94
- await writeRemoteResults(client, job, {
93
+ // 7. Fetch and write output artifacts. A `completed` job that carries
94
+ // `job.error` is a degraded completion (DOC-2121 RC-3): a configured
95
+ // optional step failed end-to-end. Artifacts still write so the caller
96
+ // keeps useful local state, but the CLI exits non-zero so external
97
+ // `--remote` consumers don't mistake the placeholder for success.
98
+ const outcome = await writeRemoteResults(client, job, {
95
99
  outputDir: opts.outputDir,
96
100
  outputPath: opts.outputPath,
97
101
  apiUrl: opts.apiUrl,
98
102
  });
103
+ if (outcome.degraded) {
104
+ process.exit(1);
105
+ }
99
106
  }
100
107
  // ---------------------------------------------------------------------------
101
108
  // Helpers
@@ -133,6 +140,7 @@ function toConfigSlice(opts) {
133
140
  datasetOverride: opts.datasetOverride,
134
141
  projectIdOverride: opts.projectIdOverride,
135
142
  perspectiveOverride: opts.perspectiveOverride,
143
+ graderContext: opts.graderContext,
136
144
  graderReplications: opts.graderReplications,
137
145
  gapAnalysisEnabled: opts.gapAnalysisEnabled,
138
146
  noRemoteCache: opts.noRemoteCache,
@@ -21,6 +21,11 @@ export interface WriteResultsOptions {
21
21
  /** API base URL (for metadata). */
22
22
  apiUrl: string;
23
23
  }
24
+ /** Outcome flags so the caller can decide the process exit code. */
25
+ export interface WriteResultsOutcome {
26
+ /** True when `job.error` was set on a completed job (DOC-2121 RC-3). */
27
+ degraded: boolean;
28
+ }
24
29
  /**
25
30
  * Fetch report artifacts from the API and write them to disk.
26
31
  *
@@ -29,5 +34,11 @@ export interface WriteResultsOptions {
29
34
  * - `<outputDir>/report.md` — full markdown report (if reportId present)
30
35
  * - `<outputDir>/job-metadata.json` — job tracking info
31
36
  * - `--output` path — markdown report (if specified)
37
+ *
38
+ * Returns an outcome the caller uses to choose an exit code: a `completed`
39
+ * job that carries `job.error` is treated as a *degraded* completion (a
40
+ * configured optional step failed end-to-end; see DOC-2121 RC-3) and the
41
+ * caller should exit non-zero so external `--remote` consumers don't read
42
+ * a clean completion as success.
32
43
  */
33
- export declare function writeRemoteResults(client: ApiClient, job: JobResponse, options: WriteResultsOptions): Promise<void>;
44
+ export declare function writeRemoteResults(client: ApiClient, job: JobResponse, options: WriteResultsOptions): Promise<WriteResultsOutcome>;
@@ -12,9 +12,6 @@
12
12
  */
13
13
  import { mkdirSync, writeFileSync } from "fs";
14
14
  import { resolve } from "path";
15
- // ---------------------------------------------------------------------------
16
- // Public API
17
- // ---------------------------------------------------------------------------
18
15
  /**
19
16
  * Fetch report artifacts from the API and write them to disk.
20
17
  *
@@ -23,6 +20,12 @@ import { resolve } from "path";
23
20
  * - `<outputDir>/report.md` — full markdown report (if reportId present)
24
21
  * - `<outputDir>/job-metadata.json` — job tracking info
25
22
  * - `--output` path — markdown report (if specified)
23
+ *
24
+ * Returns an outcome the caller uses to choose an exit code: a `completed`
25
+ * job that carries `job.error` is treated as a *degraded* completion (a
26
+ * configured optional step failed end-to-end; see DOC-2121 RC-3) and the
27
+ * caller should exit non-zero so external `--remote` consumers don't read
28
+ * a clean completion as success.
26
29
  */
27
30
  export async function writeRemoteResults(client, job, options) {
28
31
  const resultsDir = options.outputDir;
@@ -55,11 +58,20 @@ export async function writeRemoteResults(client, job, options) {
55
58
  reportId: job.reportId ?? null,
56
59
  reportUrl: job.reportUrl ?? null,
57
60
  execution: job.execution ?? null,
61
+ error: job.error ?? null,
58
62
  apiUrl: options.apiUrl,
59
63
  }, null, 2));
60
- // 4. Print summary
64
+ // 4. Print summary. A completed job with `job.error` set means a
65
+ // configured optional step failed end-to-end — print the diagnostic
66
+ // and signal the caller to exit non-zero.
67
+ const degraded = Boolean(job.error);
61
68
  console.log("");
62
- console.log(`✅ Evaluation completed`);
69
+ if (degraded) {
70
+ console.log(`⚠️ Evaluation completed with errors`);
71
+ }
72
+ else {
73
+ console.log(`✅ Evaluation completed`);
74
+ }
63
75
  console.log(` 📊 Results: ${resolve(resultsDir, "score-summary.json")}`);
64
76
  if (reportWritten) {
65
77
  console.log(` 📝 Report: ${resolve(resultsDir, "report.md")}`);
@@ -71,6 +83,14 @@ export async function writeRemoteResults(client, job, options) {
71
83
  console.log(` 🔗 Studio: ${job.reportUrl}`);
72
84
  }
73
85
  console.log(` 🏷️ Job ID: ${job.jobId}`);
86
+ if (job.error) {
87
+ console.error("");
88
+ console.error(` ❌ Step "${job.error.step ?? "<unknown>"}" failed: ${job.error.message}`);
89
+ if (!job.reportId) {
90
+ console.error(" No report was published. See the API gateway run page for details.");
91
+ }
92
+ }
93
+ return { degraded };
74
94
  }
75
95
  // ---------------------------------------------------------------------------
76
96
  // Helpers
@@ -29,6 +29,7 @@ import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js"
29
29
  import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
30
30
  import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
31
31
  import { PromptfooEvalAdapter } from "./adapters/eval-runners/promptfoo-eval-adapter.js";
32
+ import { DtsPackageSurface } from "./adapters/package-surface/index.js";
32
33
  import { ConsoleLogger, JsonLogger, QuietLogger, } from "./adapters/loggers/index.js";
33
34
  import { ConsoleProgressReporter } from "./adapters/progress/console-progress-reporter.js";
34
35
  import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource, } from "./adapters/task-sources/index.js";
@@ -65,6 +66,13 @@ export function createAppContext(config) {
65
66
  const docFetcher = docFetcherFactory ? docFetcherFactory() : undefined;
66
67
  // Eval runner — Promptfoo subprocess
67
68
  const evalRunner = new PromptfooEvalAdapter(config.rootDir);
69
+ // Package-surface resolver for the W0198 symbol-resolution preflight.
70
+ // Reads installed `.d.ts` from the eval package's node_modules chain
71
+ // (anchored at `config.rootDir`). Phase 1 wires the adapter; later
72
+ // phases consume it.
73
+ const packageSurfaceResolver = new DtsPackageSurface({
74
+ resolveFromDir: config.rootDir,
75
+ });
68
76
  // Report store — Sanity Content Lake (for publish + auto-compare)
69
77
  const reportStore = createReportStore(config);
70
78
  // Sinks — loaded from config/sinks
@@ -90,6 +98,7 @@ export function createAppContext(config) {
90
98
  docFetcher,
91
99
  evalRunner,
92
100
  logger,
101
+ packageSurfaceResolver,
93
102
  progress,
94
103
  registry,
95
104
  reportStore,
@@ -0,0 +1,37 @@
1
+ /**
2
+ * package-surface.ts — Framework-level package-surface manifest for the
3
+ * W0198 symbol-resolution preflight.
4
+ *
5
+ * The manifest pins each in-scope package to a single semver-major range.
6
+ * The preflight resolver answers "does symbol X export from package Y" by
7
+ * reading the installed package's `.d.ts` against this pin. Tasks
8
+ * reference packages by name; they do not carry per-package version
9
+ * metadata (per-task overrides remain a future extension point).
10
+ *
11
+ * Bumping a major is an editorial event — one PR that updates the pin,
12
+ * regenerates cached surfaces, and re-runs the historical comparison set.
13
+ * Patch and minor releases within a pinned major flow silently because
14
+ * semver disallows the export removals that would change a deduction
15
+ * outcome.
16
+ *
17
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md
18
+ */
19
+
20
+ import { definePackageSurface } from "../_vendor/ailf-core/index.js"
21
+
22
+ export default definePackageSurface({
23
+ packages: [
24
+ // Sanity App SDK — drives App SDK literacy tasks (e.g. DOC-2117).
25
+ { pkg: "@sanity/sdk-react", semverPin: "^2.0.0" },
26
+
27
+ // Sanity Studio runtime — drives Studio-side literacy tasks.
28
+ // Pinned to the installed major in this repo (^5.x). The earlier
29
+ // major (v3) is an explicit per-task override candidate when a task
30
+ // exercises legacy Studio behavior.
31
+ { pkg: "sanity", semverPin: "^5.0.0" },
32
+
33
+ // @sanity/client — drives direct-client literacy tasks. Pinned to
34
+ // the installed major in this repo (^7.x).
35
+ { pkg: "@sanity/client", semverPin: "^7.0.0" },
36
+ ],
37
+ })
@@ -0,0 +1,26 @@
1
+ /**
2
+ * preflight-scoring.ts — How heavily the W0198 deterministic preflight
3
+ * contributes to the `code-correctness` dimension.
4
+ *
5
+ * The preflight (`SymbolPreflightReport`) and the LLM rubric both feed
6
+ * into `code-correctness` per D0010's weighted dimension aggregation.
7
+ * `codeCorrectnessWeight` sets the relative share between them — `0.4`
8
+ * means preflight is 40% of the dimension, rubric 60%.
9
+ *
10
+ * Bumping this is an editorial decision: a higher weight ties more of the
11
+ * `code-correctness` score to the deterministic existence-check (less
12
+ * grader noise on the symbol-existence question, but also less elasticity
13
+ * for the rubric to penalize stylistic/correctness issues the preflight
14
+ * cannot see). A lower weight cedes more authority back to the rubric.
15
+ *
16
+ * The default of `0.4` is a starting balance; revisit once the
17
+ * `unresolved` rate stabilizes in CI.
18
+ *
19
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md
20
+ */
21
+
22
+ import { definePreflightScoring } from "../_vendor/ailf-core/index.js"
23
+
24
+ export default definePreflightScoring({
25
+ codeCorrectnessWeight: 0.4,
26
+ })
package/dist/index.d.ts CHANGED
@@ -33,8 +33,8 @@
33
33
  * })
34
34
  * ```
35
35
  */
36
- export { defineConfig, defineFeatures, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./_vendor/ailf-core/index.d.ts";
37
- export type { PricingEntry, PromptEntry, SourceEntry } from "./_vendor/ailf-core/index.d.ts";
36
+ export { defineConfig, defineFeatures, defineModels, definePackageSurface, definePreflightScoring, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./_vendor/ailf-core/index.d.ts";
37
+ export type { PackageSurfaceConfig, PackageSurfaceEntry, PreflightScoringConfig, PricingEntry, PromptEntry, SourceEntry, } from "./_vendor/ailf-core/index.d.ts";
38
38
  export { env } from "./_vendor/ailf-core/index.d.ts";
39
39
  export type { AgentHarnessTaskDefinition, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./_vendor/ailf-core/index.d.ts";
40
40
  export { CanonicalTaskFileSchema, CanonicalTaskSchema, CURATED_ASSERTION_TYPES, detectLegacyFieldNames, parseCanonicalTaskFile, RUBRIC_TEMPLATE_NAMES, type CanonicalTask, type CuratedAssertionType, type RubricTemplateName, } from "./adapters/task-sources/repo-schemas.js";
package/dist/index.js CHANGED
@@ -36,7 +36,7 @@
36
36
  // ---------------------------------------------------------------------------
37
37
  // Configuration helpers (define* identity functions for typed authoring)
38
38
  // ---------------------------------------------------------------------------
39
- export { defineConfig, defineFeatures, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./_vendor/ailf-core/index.js";
39
+ export { defineConfig, defineFeatures, defineModels, definePackageSurface, definePreflightScoring, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./_vendor/ailf-core/index.js";
40
40
  // ---------------------------------------------------------------------------
41
41
  // Environment helper
42
42
  // ---------------------------------------------------------------------------
@@ -49,6 +49,7 @@ export function mapToResolvedConfig(opts, rootDir) {
49
49
  noCache: opts.noCache,
50
50
  noRemoteCache: opts.noRemoteCache,
51
51
  graderReplications: opts.graderReplications,
52
+ graderContext: opts.graderContext,
52
53
  outputDir: opts.outputDir,
53
54
  outputPath: opts.outputPath,
54
55
  urls: opts.urlArgs.length > 0 ? opts.urlArgs : undefined,
@@ -11,7 +11,7 @@
11
11
  * each step completes. This enables the GET /v1/jobs/:jobId polling
12
12
  * endpoint to show real-time progress.
13
13
  */
14
- import { type AppContext, type PipelineResult, type PipelineStep } from "../_vendor/ailf-core/index.d.ts";
14
+ import { type AppContext, type PipelineResult, type PipelineStep, type StepResult } from "../_vendor/ailf-core/index.d.ts";
15
15
  /**
16
16
  * Run a sequence of pipeline steps, short-circuiting on required step failure.
17
17
  *
@@ -22,3 +22,21 @@ import { type AppContext, type PipelineResult, type PipelineStep } from "../_ven
22
22
  * Lake after each step completes.
23
23
  */
24
24
  export declare function orchestratePipeline(ctx: AppContext, steps: PipelineStep[]): Promise<PipelineResult>;
25
+ /**
26
+ * Find the first optional pipeline step that returned `status: "failed"`
27
+ * in step-array order. Returns the diagnostic shape the API job document
28
+ * already accepts (`{ message, step }`), or null when no optional step
29
+ * failed.
30
+ *
31
+ * Required-step failures don't reach this code path — the orchestrator
32
+ * aborts before completion when a required step fails. This helper is
33
+ * the bridge between "step ran and failed" and the wire signal that
34
+ * external `--remote` consumers use to distinguish a clean completion
35
+ * from a degraded one.
36
+ *
37
+ * @see docs/design-docs/optional-step-failure-surfacing.md
38
+ */
39
+ export declare function getFirstOptionalFailure(steps: readonly PipelineStep[], results: Record<string, StepResult>): {
40
+ message: string;
41
+ step: string;
42
+ } | null;
@@ -236,6 +236,12 @@ export async function orchestratePipeline(ctx, steps) {
236
236
  process.env.SANITY_API_TOKEN ??
237
237
  undefined,
238
238
  });
239
+ // DOC-2121 RC-3 — surface the first configured-but-failed optional
240
+ // step on the job document so external --remote consumers can detect
241
+ // partial-completion outcomes. The pipeline still ran end to end
242
+ // (P5 / local-first) and `success: true` is preserved; the `error`
243
+ // field is the wire signal that a configured optional step failed.
244
+ const firstOptionalFailure = getFirstOptionalFailure(steps, results);
239
245
  await store.updateJob(ctx.config.jobId, {
240
246
  status: "completed",
241
247
  completedAt: new Date().toISOString(),
@@ -245,6 +251,7 @@ export async function orchestratePipeline(ctx, steps) {
245
251
  totalSteps: steps.length,
246
252
  },
247
253
  ...(state.reportId ? { reportId: state.reportId } : {}),
254
+ ...(firstOptionalFailure ? { error: firstOptionalFailure } : {}),
248
255
  });
249
256
  }
250
257
  catch {
@@ -275,6 +282,37 @@ export async function orchestratePipeline(ctx, steps) {
275
282
  };
276
283
  }
277
284
  // ---------------------------------------------------------------------------
285
+ // Optional-step failure surfacing (DOC-2121 RC-3)
286
+ // ---------------------------------------------------------------------------
287
/**
 * Find the first optional pipeline step that returned `status: "failed"`,
 * scanning in step-array order, and describe it in the `{ message, step }`
 * shape the API job document already accepts.
 *
 * Required-step failures don't reach this code path — the orchestrator
 * aborts before completion when a required step fails. This helper is
 * the bridge between "step ran and failed" and the wire signal that
 * external `--remote` consumers use to distinguish a clean completion
 * from a degraded one.
 *
 * The returned `message` is always a non-blank string: a usable string
 * `error` is passed through unchanged, an `Error` instance contributes its
 * `.message`, and anything else (missing, blank, or another type) falls
 * back to `"<step name> failed"`.
 *
 * @param {ReadonlyArray<{ name: string, optional?: boolean }>} steps
 *   Pipeline steps in execution order; only entries with `optional === true`
 *   are inspected.
 * @param {Record<string, { status?: string, error?: unknown }>} results
 *   Step results keyed by step name. Steps without an entry are skipped.
 * @returns {{ message: string, step: string } | null}
 *   Diagnostic for the first failed optional step, or `null` when no
 *   optional step failed.
 *
 * @see docs/design-docs/optional-step-failure-surfacing.md
 */
export function getFirstOptionalFailure(steps, results) {
    for (const step of steps) {
        if (step.optional !== true)
            continue;
        const result = results[step.name];
        if (result?.status !== "failed")
            continue;
        // `??` alone would let an empty string or an Error object through,
        // producing a blank or non-string `message` on the job document.
        const rawError = result.error instanceof Error ? result.error.message : result.error;
        const hasUsableMessage = typeof rawError === "string" && rawError.trim() !== "";
        return {
            message: hasUsableMessage ? rawError : `${step.name} failed`,
            step: step.name,
        };
    }
    return null;
}
315
+ // ---------------------------------------------------------------------------
278
316
  // Artifact export phase gate (W0053)
279
317
  // ---------------------------------------------------------------------------
280
318
  /**
@@ -14,6 +14,7 @@ import { buildCacheContext } from "../cache-context.js";
14
14
  import { calculateAndWriteScores } from "../../pipeline/calculate-scores.js";
15
15
  import { checkResultsExist, checkScoreSummaryValid, } from "../../pipeline/checks.js";
16
16
  import { resultsFileForMode } from "../../pipeline/eval-constants.js";
17
+ import { loadPreflightScoring } from "../../pipeline/preflight/load-preflight-scoring.js";
17
18
  import { loadSource } from "../../sources.js";
18
19
  import { uploadTestOutputs } from "../../pipeline/upload-test-outputs.js";
19
20
  import { configToSourceOverrides } from "../config-to-source-overrides.js";
@@ -77,6 +78,13 @@ export class CalculateScoresStep {
77
78
  catch {
78
79
  // Non-fatal — proceed without source metadata
79
80
  }
81
+ // W0198 — load preflight scoring config (silent fall-through to the
82
+ // default weight when absent). Lazy: ignored when no preflight reports
83
+ // were emitted upstream.
84
+ const preflightScoring = await loadPreflightScoring(ctx.config.rootDir).catch((err) => {
85
+ ctx.logger.warn(`[warn] W0198 preflight: failed to load preflight-scoring config — ${err instanceof Error ? err.message : String(err)}`);
86
+ return undefined;
87
+ });
80
88
  let belowCritical = [];
81
89
  try {
82
90
  const result = calculateAndWriteScores({
@@ -87,11 +95,14 @@ export class CalculateScoresStep {
87
95
  mode: ctx.config.mode === "literacy"
88
96
  ? (ctx.config.variant ?? LiteracyVariant.STANDARD)
89
97
  : ctx.config.mode,
98
+ preflightReports: state.preflightReports,
99
+ preflightWeight: preflightScoring?.codeCorrectnessWeight,
90
100
  resolvedSource,
91
101
  resultsPath: primaryResultsRun !== LiteracyVariant.STANDARD
92
102
  ? join(ctx.config.rootDir, resultsFile)
93
103
  : undefined,
94
104
  rootDir: ctx.config.rootDir,
105
+ runId: ctx.runId,
95
106
  searchMode: ctx.config.searchMode,
96
107
  source: ctx.config.source,
97
108
  });
@@ -18,6 +18,7 @@ import { getStepInputPaths } from "../../pipeline/cache.js";
18
18
  import { buildCacheContext } from "../cache-context.js";
19
19
  import { checkGeneratedConfigsExist } from "../../pipeline/checks.js";
20
20
  import { validateModelsYaml } from "../../pipeline/validate.js";
21
+ import { loadPreflightContext } from "../../pipeline/preflight/load-preflight-context.js";
21
22
  import { loadSource } from "../../sources.js";
22
23
  import { configToSourceOverrides } from "../config-to-source-overrides.js";
23
24
  export class GenerateConfigsStep {
@@ -110,12 +111,21 @@ export class GenerateConfigsStep {
110
111
  catch {
111
112
  ctx.logger.warn(" ⚠ Could not load rubric config — templates will not resolve");
112
113
  }
113
- // Compile for each variant
114
+ // Compile for each variant. `graderContext` defaults to "rubric-only" here
115
+ // so handlers see a definite value rather than implicit-undefined.
116
+ const graderContext = ctx.config.graderContext ?? "rubric-only";
117
+ // W0198 Phase 6 — when the package-surface manifest is authored, pass
118
+ // the in-scope package list down so the literacy mode handler can
119
+ // prefix the `code-correctness` rubric with the deterministic-lane
120
+ // system instruction. Silent fall-through when absent.
121
+ const preflightContext = await loadPreflightContext(ctx.config.rootDir, ctx.logger);
114
122
  const baselineResults = this.compileAll(handler, tasks, {
115
123
  rootDir: ctx.config.rootDir,
116
124
  graderProvider: models.grader.id,
117
125
  models: baselineModels,
118
126
  rubricConfig,
127
+ graderContext,
128
+ preflightContext,
119
129
  evalMode: LiteracyVariant.STANDARD,
120
130
  });
121
131
  const agenticResults = this.compileAll(handler, tasks, {
@@ -123,6 +133,8 @@ export class GenerateConfigsStep {
123
133
  graderProvider: models.grader.id,
124
134
  models: agenticModels,
125
135
  rubricConfig,
136
+ graderContext,
137
+ preflightContext,
126
138
  evalMode: LiteracyVariant.AGENTIC,
127
139
  });
128
140
  // Log warnings
@@ -174,11 +186,14 @@ export class GenerateConfigsStep {
174
186
  catch {
175
187
  ctx.logger.warn(" ⚠ Could not load rubric config — templates will not resolve");
176
188
  }
189
+ const preflightContext = await loadPreflightContext(ctx.config.rootDir, ctx.logger);
177
190
  const merged = this.compileAll(handler, tasks, {
178
191
  rootDir: ctx.config.rootDir,
179
192
  graderProvider: models.grader.id,
180
193
  models: modeModels,
181
194
  rubricConfig,
195
+ graderContext: ctx.config.graderContext ?? "rubric-only",
196
+ preflightContext,
182
197
  });
183
198
  for (const w of merged.warnings) {
184
199
  ctx.logger.warn(` ⚠ ${w}`);