@sanity/ailf 4.2.0 → 4.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. package/config/package-surface.ts +37 -0
  2. package/config/preflight-scoring.ts +26 -0
  3. package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
  4. package/dist/_vendor/ailf-core/artifact-registry.js +47 -0
  5. package/dist/_vendor/ailf-core/config-helpers.d.ts +35 -0
  6. package/dist/_vendor/ailf-core/config-helpers.js +67 -0
  7. package/dist/_vendor/ailf-core/index.d.ts +1 -1
  8. package/dist/_vendor/ailf-core/index.js +1 -1
  9. package/dist/_vendor/ailf-core/ports/context.d.ts +18 -0
  10. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +30 -0
  11. package/dist/_vendor/ailf-core/ports/index.d.ts +3 -1
  12. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  13. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +23 -0
  14. package/dist/_vendor/ailf-core/ports/package-surface-resolver.d.ts +71 -0
  15. package/dist/_vendor/ailf-core/ports/package-surface-resolver.js +36 -0
  16. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +6 -0
  17. package/dist/_vendor/ailf-core/schemas/eval-config.js +14 -0
  18. package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
  19. package/dist/_vendor/ailf-core/schemas/index.js +1 -0
  20. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +4 -0
  21. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
  22. package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.d.ts +51 -0
  23. package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.js +57 -0
  24. package/dist/_vendor/ailf-core/types/index.d.ts +12 -0
  25. package/dist/_vendor/ailf-core/types/index.js +1 -0
  26. package/dist/_vendor/ailf-core/types/package-surface.d.ts +36 -0
  27. package/dist/_vendor/ailf-core/types/package-surface.js +13 -0
  28. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
  29. package/dist/_vendor/ailf-core/types/preflight-scoring.d.ts +52 -0
  30. package/dist/_vendor/ailf-core/types/preflight-scoring.js +18 -0
  31. package/dist/_vendor/ailf-core/types/repo-config.d.ts +14 -0
  32. package/dist/_vendor/ailf-core/types/symbol-preflight-report.d.ts +66 -0
  33. package/dist/_vendor/ailf-core/types/symbol-preflight-report.js +25 -0
  34. package/dist/adapters/api-client/build-request.d.ts +1 -0
  35. package/dist/adapters/api-client/build-request.js +3 -0
  36. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  37. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +4 -0
  38. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +159 -82
  39. package/dist/adapters/index.d.ts +1 -0
  40. package/dist/adapters/index.js +1 -0
  41. package/dist/adapters/package-surface/dts-package-surface.d.ts +46 -0
  42. package/dist/adapters/package-surface/dts-package-surface.js +173 -0
  43. package/dist/adapters/package-surface/in-memory-package-surface.d.ts +15 -0
  44. package/dist/adapters/package-surface/in-memory-package-surface.js +28 -0
  45. package/dist/adapters/package-surface/index.d.ts +9 -0
  46. package/dist/adapters/package-surface/index.js +8 -0
  47. package/dist/adapters/package-surface/parse-dts-exports.d.ts +31 -0
  48. package/dist/adapters/package-surface/parse-dts-exports.js +54 -0
  49. package/dist/adapters/task-sources/repo-schemas.d.ts +6 -0
  50. package/dist/adapters/task-sources/repo-schemas.js +15 -0
  51. package/dist/commands/pipeline-action.d.ts +2 -0
  52. package/dist/commands/pipeline-action.js +12 -0
  53. package/dist/commands/remote-pipeline.js +10 -2
  54. package/dist/commands/remote-results.d.ts +12 -1
  55. package/dist/commands/remote-results.js +25 -5
  56. package/dist/composition-root.js +9 -0
  57. package/dist/config/package-surface.ts +37 -0
  58. package/dist/config/preflight-scoring.ts +26 -0
  59. package/dist/index.d.ts +2 -2
  60. package/dist/index.js +1 -1
  61. package/dist/orchestration/build-app-context.js +1 -0
  62. package/dist/orchestration/pipeline-orchestrator.d.ts +19 -1
  63. package/dist/orchestration/pipeline-orchestrator.js +38 -0
  64. package/dist/orchestration/steps/calculate-scores-step.js +11 -0
  65. package/dist/orchestration/steps/generate-configs-step.js +16 -1
  66. package/dist/orchestration/steps/run-eval-step.js +27 -0
  67. package/dist/pipeline/calculate-scores.d.ts +66 -5
  68. package/dist/pipeline/calculate-scores.js +141 -27
  69. package/dist/pipeline/compiler/index.d.ts +1 -1
  70. package/dist/pipeline/compiler/index.js +1 -1
  71. package/dist/pipeline/compiler/literacy-bridge.d.ts +9 -0
  72. package/dist/pipeline/compiler/literacy-bridge.js +2 -0
  73. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +1 -1
  74. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +31 -4
  75. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +146 -1
  76. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +2 -0
  77. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +17 -2
  78. package/dist/pipeline/compiler/rubric-resolution.d.ts +17 -1
  79. package/dist/pipeline/compiler/rubric-resolution.js +78 -2
  80. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -2
  81. package/dist/pipeline/compiler/scoring-bridge.js +104 -10
  82. package/dist/pipeline/eval-fingerprint.d.ts +9 -0
  83. package/dist/pipeline/eval-fingerprint.js +7 -1
  84. package/dist/pipeline/map-request-to-config.js +1 -0
  85. package/dist/pipeline/preflight/compute-preflight.d.ts +67 -0
  86. package/dist/pipeline/preflight/compute-preflight.js +118 -0
  87. package/dist/pipeline/preflight/emit-symbol-preflight.d.ts +51 -0
  88. package/dist/pipeline/preflight/emit-symbol-preflight.js +102 -0
  89. package/dist/pipeline/preflight/load-package-surface.d.ts +14 -0
  90. package/dist/pipeline/preflight/load-package-surface.js +19 -0
  91. package/dist/pipeline/preflight/load-preflight-context.d.ts +13 -0
  92. package/dist/pipeline/preflight/load-preflight-context.js +25 -0
  93. package/dist/pipeline/preflight/load-preflight-scoring.d.ts +12 -0
  94. package/dist/pipeline/preflight/load-preflight-scoring.js +17 -0
  95. package/dist/pipeline/preflight/parse-imports.d.ts +62 -0
  96. package/dist/pipeline/preflight/parse-imports.js +125 -0
  97. package/dist/report-store.d.ts +8 -0
  98. package/dist/report-store.js +55 -6
  99. package/dist/sanity/document-renderers.d.ts +45 -7
  100. package/dist/sanity/document-renderers.js +99 -13
  101. package/dist/sanity/queries.d.ts +11 -11
  102. package/dist/sanity/queries.js +7 -0
  103. package/dist/sanity/symbol-index.d.ts +98 -0
  104. package/dist/sanity/symbol-index.js +615 -0
  105. package/package.json +2 -1
@@ -0,0 +1,51 @@
1
+ /**
2
+ * @sanity/ailf-core — SymbolPreflightReport schema
3
+ *
4
+ * The trust-boundary parser for the W0198 deterministic-lane artifact.
5
+ * Used when reading a previously-emitted preflight report back from
6
+ * disk / GCS so a downstream step (re-grading, comparison, dashboard
7
+ * rendering) can trust its shape.
8
+ *
9
+ * The schema asserts `satisfies z.ZodType<SymbolPreflightReport>`
10
+ * against the domain type in
11
+ * `packages/core/src/types/symbol-preflight-report.ts` (D0045 / W0187),
12
+ * so any drift between the two is a build error.
13
+ */
14
+ import { z } from "zod";
15
+ export declare const SymbolPreflightReportSchema: z.ZodObject<{
16
+ candidate: z.ZodObject<{
17
+ taskId: z.ZodString;
18
+ testIndex: z.ZodNumber;
19
+ }, z.core.$strip>;
20
+ findings: z.ZodArray<z.ZodDiscriminatedUnion<[z.ZodObject<{
21
+ result: z.ZodLiteral<"exists">;
22
+ pkg: z.ZodString;
23
+ version: z.ZodString;
24
+ binding: z.ZodString;
25
+ source: z.ZodEnum<{
26
+ types: "types";
27
+ runtime: "runtime";
28
+ }>;
29
+ }, z.core.$strip>, z.ZodObject<{
30
+ result: z.ZodLiteral<"missing">;
31
+ pkg: z.ZodString;
32
+ version: z.ZodString;
33
+ binding: z.ZodString;
34
+ }, z.core.$strip>, z.ZodObject<{
35
+ result: z.ZodLiteral<"unresolved">;
36
+ pkg: z.ZodString;
37
+ binding: z.ZodString;
38
+ reason: z.ZodEnum<{
39
+ "package-not-installed": "package-not-installed";
40
+ "types-entry-missing": "types-entry-missing";
41
+ "parse-failed": "parse-failed";
42
+ "reexport-hop-unfollowed": "reexport-hop-unfollowed";
43
+ }>;
44
+ }, z.core.$strip>], "result">>;
45
+ deduction: z.ZodObject<{
46
+ perMissing: z.ZodNumber;
47
+ cap: z.ZodNumber;
48
+ total: z.ZodNumber;
49
+ }, z.core.$strip>;
50
+ }, z.core.$strip>;
51
+ export type { SymbolPreflightReport } from "../types/symbol-preflight-report.js";
@@ -0,0 +1,57 @@
1
+ /**
2
+ * @sanity/ailf-core — SymbolPreflightReport schema
3
+ *
4
+ * The trust-boundary parser for the W0198 deterministic-lane artifact.
5
+ * Used when reading a previously-emitted preflight report back from
6
+ * disk / GCS so a downstream step (re-grading, comparison, dashboard
7
+ * rendering) can trust its shape.
8
+ *
9
+ * The schema asserts `satisfies z.ZodType<SymbolPreflightReport>`
10
+ * against the domain type in
11
+ * `packages/core/src/types/symbol-preflight-report.ts` (D0045 / W0187),
12
+ * so any drift between the two is a build error.
13
+ */
14
+ import { z } from "zod";
15
+ const ExistsFindingSchema = z.object({
16
+ result: z.literal("exists"),
17
+ pkg: z.string().min(1),
18
+ version: z.string().min(1),
19
+ binding: z.string().min(1),
20
+ source: z.enum(["types", "runtime"]),
21
+ });
22
+ const MissingFindingSchema = z.object({
23
+ result: z.literal("missing"),
24
+ pkg: z.string().min(1),
25
+ version: z.string().min(1),
26
+ binding: z.string().min(1),
27
+ });
28
+ const UnresolvedReasonSchema = z.enum([
29
+ "package-not-installed",
30
+ "types-entry-missing",
31
+ "parse-failed",
32
+ "reexport-hop-unfollowed",
33
+ ]);
34
+ const UnresolvedFindingSchema = z.object({
35
+ result: z.literal("unresolved"),
36
+ pkg: z.string().min(1),
37
+ binding: z.string().min(1),
38
+ reason: UnresolvedReasonSchema,
39
+ });
40
+ const FindingSchema = z.discriminatedUnion("result", [
41
+ ExistsFindingSchema,
42
+ MissingFindingSchema,
43
+ UnresolvedFindingSchema,
44
+ ]);
45
+ const DeductionSchema = z.object({
46
+ perMissing: z.number().nonnegative(),
47
+ cap: z.number().nonnegative(),
48
+ total: z.number().nonnegative(),
49
+ });
50
+ export const SymbolPreflightReportSchema = z.object({
51
+ candidate: z.object({
52
+ taskId: z.string().min(1),
53
+ testIndex: z.number().int().nonnegative(),
54
+ }),
55
+ findings: z.array(FindingSchema),
56
+ deduction: DeductionSchema,
57
+ });
@@ -11,6 +11,7 @@
11
11
  */
12
12
  import type { DocumentRef as _DocumentRef, EvalMode, RunContext } from "../../ailf-shared/index.d.ts";
13
13
  import type { ArtifactType } from "../artifact-registry.js";
14
+ import type { SymbolPreflightReport } from "./symbol-preflight-report.js";
14
15
  import type { AssociationValues, RunId } from "./branded-ids.js";
15
16
  export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "./scoring-input.js";
16
17
  export type { DocumentRef, RunContext, RunTrigger } from "../../ailf-shared/index.d.ts";
@@ -26,6 +27,9 @@ export type { EvalTrace, ToolCallCategory, ToolCallRecord, TraceEvent, TraceSpan
26
27
  export type { CompareRequest, CompareScoreSummaryRef, LookupDocRequest, ValidateConfigRequest, ValidateSourceRequest, ValidateTaskAssertionDef, ValidateTaskCanonicalDocRef, ValidateTaskDef, ValidateTaskRequest, } from "./api-requests.js";
27
28
  export type { AilfEvalWorkflow, AilfEvalWorkflowJob, AilfEvalWorkflowStep, RepoAgenticConfig, RepoArtifactsConfig, RepoConfig, RepoExecutionConfig, RepoOutputConfig, RepoOwnerConfig, RepoPublishConfig, RepoReportStoreConfig, RepoSourceConfig, RepoTaskSourceConfig, RepoTriggersConfig, ScheduleTriggerConfig, TriggerConfig, TriggerMode, } from "./repo-config.js";
28
29
  export type { PipelineRequest, PipelineRequestCallback, PipelineRequestCallerExecutor, PipelineRequestCallerGit, PipelineRequestCallerOwner, PipelineRequestDebug, PipelineRequestTaskSource, } from "./pipeline-request.js";
30
+ export type { PackageSurfaceConfig, PackageSurfaceEntry, } from "./package-surface.js";
31
+ export type { SymbolPreflightDeduction, SymbolPreflightFinding, SymbolPreflightReport, SymbolPreflightUnresolvedReason, } from "./symbol-preflight-report.js";
32
+ export { DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT, type PreflightRubricContext, type PreflightScoringConfig, } from "./preflight-scoring.js";
29
33
  export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, Err, FixtureId, IdValidationError, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
30
34
  export { err, fixtureId, generateRunId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
31
35
  export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLakeAuthorableTask, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PromptVars, PathDocRef, PerspectiveDocRef, ReservedPromptVarKey, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
@@ -593,6 +597,14 @@ export interface PipelineState {
593
597
  * Consumed by the orchestrator for the enriched PipelineResult.
594
598
  */
595
599
  testSummary?: TestSummary;
600
+ /**
601
+ * W0198 deterministic-lane reports keyed by `${run}/${mode}/${task}/${model}`.
602
+ * Set by RunEvalStep after `emitSymbolPreflight`, consumed by the
603
+ * scoring step (Phase 5) so the per-test merge does not need to read
604
+ * the artifact back from disk. The map is intentionally small — one
605
+ * entry per (task, model) per mode.
606
+ */
607
+ preflightReports?: Map<string, SymbolPreflightReport>;
596
608
  }
597
609
  /**
598
610
  * Release auto-scope metadata — which tasks are affected by a content
@@ -16,6 +16,7 @@ export { InMemoryPluginRegistry } from "./plugin-registry.js";
16
16
  // version is used internally by LiteracyModeConfig. If consumers need
17
17
  // the mode-specific version, they import from "./eval-mode-config.js".
18
18
  export { evalModeType } from "./eval-mode-config.js";
19
+ export { DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT, } from "./preflight-scoring.js";
19
20
  export { err, fixtureId, generateRunId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
20
21
  // ---------------------------------------------------------------------------
21
22
  // Comparison (Approach 2: structured comparison output)
@@ -0,0 +1,36 @@
1
+ /**
2
+ * Package surface manifest types — framework-level pin of which package
3
+ * surfaces the symbol-resolution preflight (W0198) reads against.
4
+ *
5
+ * Tasks reference packages by name; version metadata lives here, not on
6
+ * the task. A semver-major bump is an editorial event (regenerate cached
7
+ * surfaces, run the historical comparison set, ride a release) — patch
8
+ * and minor releases within a pinned major flow silently because semver
9
+ * disallows the export removals that would change a deduction outcome.
10
+ *
11
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md
12
+ */
13
+ /**
14
+ * One in-scope package, pinned to a semver-major range.
15
+ *
16
+ * - `pkg` — npm package name (e.g. `"@sanity/sdk-react"`).
17
+ * - `semverPin` — semver range pinned to a single major (e.g. `"^2.0.0"`).
18
+ * The W0198 resolver answers existence questions against this major;
19
+ * minor and patch upgrades within the major do not require a manifest
20
+ * bump because semver disallows export removal.
21
+ */
22
+ export interface PackageSurfaceEntry {
23
+ pkg: string;
24
+ semverPin: string;
25
+ }
26
+ /**
27
+ * Framework-level package-surface manifest.
28
+ *
29
+ * Authored once in `packages/eval/config/package-surface.ts` via
30
+ * `definePackageSurface(...)`. Per-task pin overrides remain a deliberate
31
+ * future extension point; the shape leaves room for them but the initial
32
+ * slice does not implement them.
33
+ */
34
+ export interface PackageSurfaceConfig {
35
+ packages: PackageSurfaceEntry[];
36
+ }
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Package surface manifest types — framework-level pin of which package
3
+ * surfaces the symbol-resolution preflight (W0198) reads against.
4
+ *
5
+ * Tasks reference packages by name; version metadata lives here, not on
6
+ * the task. A semver-major bump is an editorial event (regenerate cached
7
+ * surfaces, run the historical comparison set, ride a release) — patch
8
+ * and minor releases within a pinned major flow silently because semver
9
+ * disallows the export removals that would change a deduction outcome.
10
+ *
11
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md
12
+ */
13
+ export {};
@@ -85,6 +85,7 @@ export interface PipelineRequest {
85
85
  debug?: PipelineRequestDebug | boolean;
86
86
  executor?: PipelineRequestCallerExecutor;
87
87
  gapAnalysis?: boolean;
88
+ graderContext?: "rubric-only" | "with-docs";
88
89
  graderReplications?: number;
89
90
  headers?: Record<string, string>;
90
91
  inlineTasks?: Record<string, unknown>[];
@@ -0,0 +1,52 @@
1
+ /**
2
+ * Preflight-scoring config — knob for how heavily the W0198 symbol-existence
3
+ * preflight contributes to the `code-correctness` dimension.
4
+ *
5
+ * The deterministic preflight (`SymbolPreflightReport`) and the LLM rubric
6
+ * both feed into `code-correctness` per D0010. This config sets the relative
7
+ * share of the two lanes: `codeCorrectnessWeight: 0.4` means preflight is
8
+ * 40% of the dimension, rubric 60%.
9
+ *
10
+ * Authored in `packages/eval/config/preflight-scoring.ts` via
11
+ * `definePreflightScoring()`. Loaded lazily by the scoring step; absence
12
+ * means "use the default" rather than "disable" — disabling the preflight
13
+ * entirely is a separate axis (no manifest, no resolver wired).
14
+ *
15
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md
16
+ */
17
+ export interface PreflightScoringConfig {
18
+ /**
19
+ * Preflight's relative share of the `code-correctness` dimension, in
20
+ * `[0, 1]`. The complementary `1 - codeCorrectnessWeight` is the LLM
21
+ * rubric's share. Default `0.4`.
22
+ */
23
+ codeCorrectnessWeight: number;
24
+ }
25
+ /** Default preflight share of `code-correctness` when no config file is authored. */
26
+ export declare const DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT = 0.4;
27
+ /**
28
+ * W0198 Phase 6 — describes the deterministic symbol-preflight lane to the
29
+ * grader. When supplied to the literacy compiler, the rubric for the
30
+ * `code-correctness` dimension is prefixed with a system instruction telling
31
+ * the grader to treat the preflight's existence findings as ground truth and
32
+ * confine its code-correctness judgment to non-existence concerns (idiomatic
33
+ * usage, code organization, type safety, completeness).
34
+ *
35
+ * The actual per-test findings live on the deterministic lane (Phase 4
36
+ * artifact + Phase 5 score merge) and never enter the grader prompt — the
37
+ * lanes are kept structurally independent so the rubric never sees, and
38
+ * therefore cannot relitigate, an existence verdict.
39
+ *
40
+ * Lives in `@sanity/ailf-core` so the `CompilationContext` port and the
41
+ * eval-package compiler can both reference the same shape without creating
42
+ * a cross-package import cycle.
43
+ */
44
+ export interface PreflightRubricContext {
45
+ /**
46
+ * Names of the packages currently in the framework-level package-surface
47
+ * manifest. The grader is shown this list so it knows the boundary of the
48
+ * deterministic lane (named imports from these packages are existence-
49
+ * checked; everything else falls through to the rubric).
50
+ */
51
+ readonly packages: readonly string[];
52
+ }
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Preflight-scoring config — knob for how heavily the W0198 symbol-existence
3
+ * preflight contributes to the `code-correctness` dimension.
4
+ *
5
+ * The deterministic preflight (`SymbolPreflightReport`) and the LLM rubric
6
+ * both feed into `code-correctness` per D0010. This config sets the relative
7
+ * share of the two lanes: `codeCorrectnessWeight: 0.4` means preflight is
8
+ * 40% of the dimension, rubric 60%.
9
+ *
10
+ * Authored in `packages/eval/config/preflight-scoring.ts` via
11
+ * `definePreflightScoring()`. Loaded lazily by the scoring step; absence
12
+ * means "use the default" rather than "disable" — disabling the preflight
13
+ * entirely is a separate axis (no manifest, no resolver wired).
14
+ *
15
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md
16
+ */
17
+ /** Default preflight share of `code-correctness` when no config file is authored. */
18
+ export const DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT = 0.4;
@@ -51,6 +51,19 @@ export interface RepoExecutionConfig {
51
51
  gapAnalysis?: boolean;
52
52
  graderReplications?: number;
53
53
  }
54
+ /** Grader configuration. */
55
+ export interface RepoGraderConfig {
56
+ /**
57
+ * Grader context policy.
58
+ *
59
+ * - `"rubric-only"` — grader sees only the rubric template + criteria +
60
+ * candidate response.
61
+ * - `"with-docs"` — canonical reference content for each task is injected
62
+ * into the assertion's `rubricPrompt` so the grader has authoritative
63
+ * ground truth.
64
+ */
65
+ context?: "rubric-only" | "with-docs";
66
+ }
54
67
  /** Where the pipeline writes results (replaces `--output-dir`). */
55
68
  export interface RepoOutputConfig {
56
69
  dir?: string;
@@ -95,6 +108,7 @@ export interface RepoConfig {
95
108
  agentic?: RepoAgenticConfig;
96
109
  artifacts?: RepoArtifactsConfig;
97
110
  execution?: RepoExecutionConfig;
111
+ grader?: RepoGraderConfig;
98
112
  output?: RepoOutputConfig;
99
113
  owner?: RepoOwnerConfig;
100
114
  publish?: RepoPublishConfig;
@@ -0,0 +1,66 @@
1
+ /**
2
+ * Symbol-preflight report — the deterministic half of the W0198 two-stage
3
+ * grader. Built per candidate output by `computePreflight()` in
4
+ * `packages/eval/src/pipeline/preflight/`; consumed by the scoring bridge
5
+ * (Phase 5) and emitted as an artifact (Phase 4) so reviewers can audit
6
+ * which surface a deduction was computed against.
7
+ *
8
+ * The shape is a deliberate three-way discriminated union — `exists`,
9
+ * `missing`, `unresolved` — not a binary plus a `confident` flag. Treating
10
+ * the lanes as structurally distinct is load-bearing for measurement
11
+ * integrity: collapsing `unresolved` into `missing` would re-introduce
12
+ * hallucination in deterministic disguise, which is the whole failure
13
+ * mode this tier exists to fix.
14
+ *
15
+ * Score impact, by lane:
16
+ * - `exists` — no deduction; LLM rubric is instructed to treat existence
17
+ * as settled by this lane and not to litigate it (findings stay out of its prompt).
18
+ * - `missing` — deterministic deduction in `code-correctness`; LLM
19
+ * does not re-judge the existence question.
20
+ * - `unresolved` — no deduction; grading for that binding falls through
21
+ * to the W0196 / W0197 LLM-rubric path.
22
+ *
23
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md
24
+ */
25
+ /**
26
+ * Reason an `unresolved` finding was emitted. Each value names a specific
27
+ * resolver-side limit; collapsing these into a single bucket would lose
28
+ * the measurement-quality signal CI surfaces back to the framework's
29
+ * authors.
30
+ */
31
+ export type SymbolPreflightUnresolvedReason = "package-not-installed" | "types-entry-missing" | "parse-failed" | "reexport-hop-unfollowed";
32
+ export type SymbolPreflightFinding = {
33
+ readonly result: "exists";
34
+ readonly pkg: string;
35
+ readonly version: string;
36
+ readonly binding: string;
37
+ readonly source: "types" | "runtime";
38
+ } | {
39
+ readonly result: "missing";
40
+ readonly pkg: string;
41
+ readonly version: string;
42
+ readonly binding: string;
43
+ } | {
44
+ readonly result: "unresolved";
45
+ readonly pkg: string;
46
+ readonly binding: string;
47
+ readonly reason: SymbolPreflightUnresolvedReason;
48
+ };
49
+ /**
50
+ * Deduction roll-up — `total = min(missing_count * perMissing, cap)`.
51
+ * The score the rubric merges in is `1 - total / cap` (defined only for `cap > 0`), computed in the
52
+ * scoring bridge (Phase 5) so this report stays a pure data artifact.
53
+ */
54
+ export interface SymbolPreflightDeduction {
55
+ readonly perMissing: number;
56
+ readonly cap: number;
57
+ readonly total: number;
58
+ }
59
+ export interface SymbolPreflightReport {
60
+ readonly candidate: {
61
+ readonly taskId: string;
62
+ readonly testIndex: number;
63
+ };
64
+ readonly findings: readonly SymbolPreflightFinding[];
65
+ readonly deduction: SymbolPreflightDeduction;
66
+ }
@@ -0,0 +1,25 @@
1
+ /**
2
+ * Symbol-preflight report — the deterministic half of the W0198 two-stage
3
+ * grader. Built per candidate output by `computePreflight()` in
4
+ * `packages/eval/src/pipeline/preflight/`; consumed by the scoring bridge
5
+ * (Phase 5) and emitted as an artifact (Phase 4) so reviewers can audit
6
+ * which surface a deduction was computed against.
7
+ *
8
+ * The shape is a deliberate three-way discriminated union — `exists`,
9
+ * `missing`, `unresolved` — not a binary plus a `confident` flag. Treating
10
+ * the lanes as structurally distinct is load-bearing for measurement
11
+ * integrity: collapsing `unresolved` into `missing` would re-introduce
12
+ * hallucination in deterministic disguise, which is the whole failure
13
+ * mode this tier exists to fix.
14
+ *
15
+ * Score impact, by lane:
16
+ * - `exists` — no deduction; LLM rubric is instructed to treat existence
17
+ * as settled by this lane and not to litigate it (findings stay out of its prompt).
18
+ * - `missing` — deterministic deduction in `code-correctness`; LLM
19
+ * does not re-judge the existence question.
20
+ * - `unresolved` — no deduction; grading for that binding falls through
21
+ * to the W0196 / W0197 LLM-rubric path.
22
+ *
23
+ * @see docs/design-docs/two-stage-grader-symbol-preflight.md
24
+ */
25
+ export {};
@@ -57,6 +57,7 @@ export interface RemoteConfigSlice {
57
57
  datasetOverride?: string;
58
58
  projectIdOverride?: string;
59
59
  perspectiveOverride?: string;
60
+ graderContext?: "rubric-only" | "with-docs";
60
61
  graderReplications?: number;
61
62
  gapAnalysisEnabled?: boolean;
62
63
  noRemoteCache?: boolean;
@@ -124,6 +124,9 @@ export async function buildRemoteRequest(options) {
124
124
  if (config.perspectiveOverride)
125
125
  raw.perspective = config.perspectiveOverride;
126
126
  // Advanced
127
+ if (config.graderContext) {
128
+ raw.graderContext = config.graderContext;
129
+ }
127
130
  if (config.graderReplications) {
128
131
  raw.graderReplications = config.graderReplications;
129
132
  }
@@ -125,6 +125,7 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
125
125
  noCache: config.noCache ?? false,
126
126
  noRemoteCache: config.noRemoteCache ?? false,
127
127
  graderReplications: config.execution?.graderReplications,
128
+ graderContext: config.grader?.context,
128
129
  urls: config.urls,
129
130
  headers: config.agentic?.headers,
130
131
  allowedOrigins: config.agentic?.allowedOrigins,
@@ -42,6 +42,10 @@ export declare class SanityDocFetcher implements DocFetcher {
42
42
  * `typesReference` renderer to inline typedoc JSON. Returns `null`
43
43
  * on any HTTP/network failure rather than throwing — the renderer
44
44
  * surfaces a placeholder so the rest of the context still renders.
45
+ *
46
+ * Wrapped in a 30s `AbortSignal.timeout` so a slow CDN can't hang the
47
+ * eval pipeline indefinitely. Timeouts surface as a single `null`
48
+ * return like any other fetch failure.
45
49
  */
46
50
  private fetchAttachmentBody;
47
51
  /**