@sanity/ailf 4.2.0 → 4.3.1
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/config/package-surface.ts +37 -0
- package/config/preflight-scoring.ts +26 -0
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
- package/dist/_vendor/ailf-core/artifact-registry.js +47 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +35 -0
- package/dist/_vendor/ailf-core/config-helpers.js +67 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/index.js +1 -1
- package/dist/_vendor/ailf-core/ports/context.d.ts +18 -0
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +30 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +3 -1
- package/dist/_vendor/ailf-core/ports/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +23 -0
- package/dist/_vendor/ailf-core/ports/package-surface-resolver.d.ts +71 -0
- package/dist/_vendor/ailf-core/ports/package-surface-resolver.js +36 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +6 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +14 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/index.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +4 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
- package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.d.ts +51 -0
- package/dist/_vendor/ailf-core/schemas/symbol-preflight-report.js +57 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +12 -0
- package/dist/_vendor/ailf-core/types/index.js +1 -0
- package/dist/_vendor/ailf-core/types/package-surface.d.ts +36 -0
- package/dist/_vendor/ailf-core/types/package-surface.js +13 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/preflight-scoring.d.ts +52 -0
- package/dist/_vendor/ailf-core/types/preflight-scoring.js +18 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +14 -0
- package/dist/_vendor/ailf-core/types/symbol-preflight-report.d.ts +66 -0
- package/dist/_vendor/ailf-core/types/symbol-preflight-report.js +25 -0
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +4 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +159 -82
- package/dist/adapters/index.d.ts +1 -0
- package/dist/adapters/index.js +1 -0
- package/dist/adapters/package-surface/dts-package-surface.d.ts +46 -0
- package/dist/adapters/package-surface/dts-package-surface.js +173 -0
- package/dist/adapters/package-surface/in-memory-package-surface.d.ts +15 -0
- package/dist/adapters/package-surface/in-memory-package-surface.js +28 -0
- package/dist/adapters/package-surface/index.d.ts +9 -0
- package/dist/adapters/package-surface/index.js +8 -0
- package/dist/adapters/package-surface/parse-dts-exports.d.ts +31 -0
- package/dist/adapters/package-surface/parse-dts-exports.js +54 -0
- package/dist/adapters/task-sources/repo-schemas.d.ts +6 -0
- package/dist/adapters/task-sources/repo-schemas.js +15 -0
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +12 -0
- package/dist/commands/remote-pipeline.js +10 -2
- package/dist/commands/remote-results.d.ts +12 -1
- package/dist/commands/remote-results.js +25 -5
- package/dist/composition-root.js +9 -0
- package/dist/config/package-surface.ts +37 -0
- package/dist/config/preflight-scoring.ts +26 -0
- package/dist/index.d.ts +2 -2
- package/dist/index.js +1 -1
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/pipeline-orchestrator.d.ts +19 -1
- package/dist/orchestration/pipeline-orchestrator.js +38 -0
- package/dist/orchestration/steps/calculate-scores-step.js +11 -0
- package/dist/orchestration/steps/generate-configs-step.js +16 -1
- package/dist/orchestration/steps/run-eval-step.js +27 -0
- package/dist/pipeline/calculate-scores.d.ts +66 -5
- package/dist/pipeline/calculate-scores.js +141 -27
- package/dist/pipeline/compiler/index.d.ts +1 -1
- package/dist/pipeline/compiler/index.js +1 -1
- package/dist/pipeline/compiler/literacy-bridge.d.ts +9 -0
- package/dist/pipeline/compiler/literacy-bridge.js +2 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +31 -4
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +146 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +2 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +17 -2
- package/dist/pipeline/compiler/rubric-resolution.d.ts +17 -1
- package/dist/pipeline/compiler/rubric-resolution.js +78 -2
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -2
- package/dist/pipeline/compiler/scoring-bridge.js +104 -10
- package/dist/pipeline/eval-fingerprint.d.ts +9 -0
- package/dist/pipeline/eval-fingerprint.js +7 -1
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/dist/pipeline/preflight/compute-preflight.d.ts +67 -0
- package/dist/pipeline/preflight/compute-preflight.js +118 -0
- package/dist/pipeline/preflight/emit-symbol-preflight.d.ts +51 -0
- package/dist/pipeline/preflight/emit-symbol-preflight.js +102 -0
- package/dist/pipeline/preflight/load-package-surface.d.ts +14 -0
- package/dist/pipeline/preflight/load-package-surface.js +19 -0
- package/dist/pipeline/preflight/load-preflight-context.d.ts +13 -0
- package/dist/pipeline/preflight/load-preflight-context.js +25 -0
- package/dist/pipeline/preflight/load-preflight-scoring.d.ts +12 -0
- package/dist/pipeline/preflight/load-preflight-scoring.js +17 -0
- package/dist/pipeline/preflight/parse-imports.d.ts +62 -0
- package/dist/pipeline/preflight/parse-imports.js +125 -0
- package/dist/report-store.d.ts +8 -0
- package/dist/report-store.js +55 -6
- package/dist/sanity/document-renderers.d.ts +45 -7
- package/dist/sanity/document-renderers.js +99 -13
- package/dist/sanity/queries.d.ts +11 -11
- package/dist/sanity/queries.js +7 -0
- package/dist/sanity/symbol-index.d.ts +98 -0
- package/dist/sanity/symbol-index.js +615 -0
- package/package.json +2 -1
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @sanity/ailf-core — SymbolPreflightReport schema
|
|
3
|
+
*
|
|
4
|
+
* The trust-boundary parser for the W0198 deterministic-lane artifact.
|
|
5
|
+
* Used when reading a previously-emitted preflight report back from
|
|
6
|
+
* disk / GCS so a downstream step (re-grading, comparison, dashboard
|
|
7
|
+
* rendering) can trust its shape.
|
|
8
|
+
*
|
|
9
|
+
* The schema asserts `satisfies z.ZodType<SymbolPreflightReport>`
|
|
10
|
+
* against the domain type in
|
|
11
|
+
* `packages/core/src/types/symbol-preflight-report.ts` (D0045 / W0187),
|
|
12
|
+
* so any drift between the two is a build error.
|
|
13
|
+
*/
|
|
14
|
+
import { z } from "zod";
|
|
15
|
+
export declare const SymbolPreflightReportSchema: z.ZodObject<{
|
|
16
|
+
candidate: z.ZodObject<{
|
|
17
|
+
taskId: z.ZodString;
|
|
18
|
+
testIndex: z.ZodNumber;
|
|
19
|
+
}, z.core.$strip>;
|
|
20
|
+
findings: z.ZodArray<z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
21
|
+
result: z.ZodLiteral<"exists">;
|
|
22
|
+
pkg: z.ZodString;
|
|
23
|
+
version: z.ZodString;
|
|
24
|
+
binding: z.ZodString;
|
|
25
|
+
source: z.ZodEnum<{
|
|
26
|
+
types: "types";
|
|
27
|
+
runtime: "runtime";
|
|
28
|
+
}>;
|
|
29
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
30
|
+
result: z.ZodLiteral<"missing">;
|
|
31
|
+
pkg: z.ZodString;
|
|
32
|
+
version: z.ZodString;
|
|
33
|
+
binding: z.ZodString;
|
|
34
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
35
|
+
result: z.ZodLiteral<"unresolved">;
|
|
36
|
+
pkg: z.ZodString;
|
|
37
|
+
binding: z.ZodString;
|
|
38
|
+
reason: z.ZodEnum<{
|
|
39
|
+
"package-not-installed": "package-not-installed";
|
|
40
|
+
"types-entry-missing": "types-entry-missing";
|
|
41
|
+
"parse-failed": "parse-failed";
|
|
42
|
+
"reexport-hop-unfollowed": "reexport-hop-unfollowed";
|
|
43
|
+
}>;
|
|
44
|
+
}, z.core.$strip>], "result">>;
|
|
45
|
+
deduction: z.ZodObject<{
|
|
46
|
+
perMissing: z.ZodNumber;
|
|
47
|
+
cap: z.ZodNumber;
|
|
48
|
+
total: z.ZodNumber;
|
|
49
|
+
}, z.core.$strip>;
|
|
50
|
+
}, z.core.$strip>;
|
|
51
|
+
export type { SymbolPreflightReport } from "../types/symbol-preflight-report.js";
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @sanity/ailf-core — SymbolPreflightReport schema
|
|
3
|
+
*
|
|
4
|
+
* The trust-boundary parser for the W0198 deterministic-lane artifact.
|
|
5
|
+
* Used when reading a previously-emitted preflight report back from
|
|
6
|
+
* disk / GCS so a downstream step (re-grading, comparison, dashboard
|
|
7
|
+
* rendering) can trust its shape.
|
|
8
|
+
*
|
|
9
|
+
* The schema asserts `satisfies z.ZodType<SymbolPreflightReport>`
|
|
10
|
+
* against the domain type in
|
|
11
|
+
* `packages/core/src/types/symbol-preflight-report.ts` (D0045 / W0187),
|
|
12
|
+
* so any drift between the two is a build error.
|
|
13
|
+
*/
|
|
14
|
+
import { z } from "zod";
|
|
15
|
+
const ExistsFindingSchema = z.object({
|
|
16
|
+
result: z.literal("exists"),
|
|
17
|
+
pkg: z.string().min(1),
|
|
18
|
+
version: z.string().min(1),
|
|
19
|
+
binding: z.string().min(1),
|
|
20
|
+
source: z.enum(["types", "runtime"]),
|
|
21
|
+
});
|
|
22
|
+
const MissingFindingSchema = z.object({
|
|
23
|
+
result: z.literal("missing"),
|
|
24
|
+
pkg: z.string().min(1),
|
|
25
|
+
version: z.string().min(1),
|
|
26
|
+
binding: z.string().min(1),
|
|
27
|
+
});
|
|
28
|
+
const UnresolvedReasonSchema = z.enum([
|
|
29
|
+
"package-not-installed",
|
|
30
|
+
"types-entry-missing",
|
|
31
|
+
"parse-failed",
|
|
32
|
+
"reexport-hop-unfollowed",
|
|
33
|
+
]);
|
|
34
|
+
const UnresolvedFindingSchema = z.object({
|
|
35
|
+
result: z.literal("unresolved"),
|
|
36
|
+
pkg: z.string().min(1),
|
|
37
|
+
binding: z.string().min(1),
|
|
38
|
+
reason: UnresolvedReasonSchema,
|
|
39
|
+
});
|
|
40
|
+
const FindingSchema = z.discriminatedUnion("result", [
|
|
41
|
+
ExistsFindingSchema,
|
|
42
|
+
MissingFindingSchema,
|
|
43
|
+
UnresolvedFindingSchema,
|
|
44
|
+
]);
|
|
45
|
+
const DeductionSchema = z.object({
|
|
46
|
+
perMissing: z.number().nonnegative(),
|
|
47
|
+
cap: z.number().nonnegative(),
|
|
48
|
+
total: z.number().nonnegative(),
|
|
49
|
+
});
|
|
50
|
+
export const SymbolPreflightReportSchema = z.object({
|
|
51
|
+
candidate: z.object({
|
|
52
|
+
taskId: z.string().min(1),
|
|
53
|
+
testIndex: z.number().int().nonnegative(),
|
|
54
|
+
}),
|
|
55
|
+
findings: z.array(FindingSchema),
|
|
56
|
+
deduction: DeductionSchema,
|
|
57
|
+
});
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
*/
|
|
12
12
|
import type { DocumentRef as _DocumentRef, EvalMode, RunContext } from "../../ailf-shared/index.d.ts";
|
|
13
13
|
import type { ArtifactType } from "../artifact-registry.js";
|
|
14
|
+
import type { SymbolPreflightReport } from "./symbol-preflight-report.js";
|
|
14
15
|
import type { AssociationValues, RunId } from "./branded-ids.js";
|
|
15
16
|
export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "./scoring-input.js";
|
|
16
17
|
export type { DocumentRef, RunContext, RunTrigger } from "../../ailf-shared/index.d.ts";
|
|
@@ -26,6 +27,9 @@ export type { EvalTrace, ToolCallCategory, ToolCallRecord, TraceEvent, TraceSpan
|
|
|
26
27
|
export type { CompareRequest, CompareScoreSummaryRef, LookupDocRequest, ValidateConfigRequest, ValidateSourceRequest, ValidateTaskAssertionDef, ValidateTaskCanonicalDocRef, ValidateTaskDef, ValidateTaskRequest, } from "./api-requests.js";
|
|
27
28
|
export type { AilfEvalWorkflow, AilfEvalWorkflowJob, AilfEvalWorkflowStep, RepoAgenticConfig, RepoArtifactsConfig, RepoConfig, RepoExecutionConfig, RepoOutputConfig, RepoOwnerConfig, RepoPublishConfig, RepoReportStoreConfig, RepoSourceConfig, RepoTaskSourceConfig, RepoTriggersConfig, ScheduleTriggerConfig, TriggerConfig, TriggerMode, } from "./repo-config.js";
|
|
28
29
|
export type { PipelineRequest, PipelineRequestCallback, PipelineRequestCallerExecutor, PipelineRequestCallerGit, PipelineRequestCallerOwner, PipelineRequestDebug, PipelineRequestTaskSource, } from "./pipeline-request.js";
|
|
30
|
+
export type { PackageSurfaceConfig, PackageSurfaceEntry, } from "./package-surface.js";
|
|
31
|
+
export type { SymbolPreflightDeduction, SymbolPreflightFinding, SymbolPreflightReport, SymbolPreflightUnresolvedReason, } from "./symbol-preflight-report.js";
|
|
32
|
+
export { DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT, type PreflightRubricContext, type PreflightScoringConfig, } from "./preflight-scoring.js";
|
|
29
33
|
export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, Err, FixtureId, IdValidationError, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
|
|
30
34
|
export { err, fixtureId, generateRunId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
31
35
|
export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLakeAuthorableTask, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PromptVars, PathDocRef, PerspectiveDocRef, ReservedPromptVarKey, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
|
|
@@ -593,6 +597,14 @@ export interface PipelineState {
|
|
|
593
597
|
* Consumed by the orchestrator for the enriched PipelineResult.
|
|
594
598
|
*/
|
|
595
599
|
testSummary?: TestSummary;
|
|
600
|
+
/**
|
|
601
|
+
* W0198 deterministic-lane reports keyed by `${run}/${mode}/${task}/${model}`.
|
|
602
|
+
* Set by RunEvalStep after `emitSymbolPreflight`, consumed by the
|
|
603
|
+
* scoring step (Phase 5) so the per-test merge does not need to read
|
|
604
|
+
* the artifact back from disk. The map is intentionally small — one
|
|
605
|
+
* entry per (task, model) per mode.
|
|
606
|
+
*/
|
|
607
|
+
preflightReports?: Map<string, SymbolPreflightReport>;
|
|
596
608
|
}
|
|
597
609
|
/**
|
|
598
610
|
* Release auto-scope metadata — which tasks are affected by a content
|
|
@@ -16,6 +16,7 @@ export { InMemoryPluginRegistry } from "./plugin-registry.js";
|
|
|
16
16
|
// version is used internally by LiteracyModeConfig. If consumers need
|
|
17
17
|
// the mode-specific version, they import from "./eval-mode-config.js".
|
|
18
18
|
export { evalModeType } from "./eval-mode-config.js";
|
|
19
|
+
export { DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT, } from "./preflight-scoring.js";
|
|
19
20
|
export { err, fixtureId, generateRunId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
20
21
|
// ---------------------------------------------------------------------------
|
|
21
22
|
// Comparison (Approach 2: structured comparison output)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Package surface manifest types — framework-level pin of which package
|
|
3
|
+
* surfaces the symbol-resolution preflight (W0198) reads against.
|
|
4
|
+
*
|
|
5
|
+
* Tasks reference packages by name; version metadata lives here, not on
|
|
6
|
+
* the task. A semver-major bump is an editorial event (regenerate cached
|
|
7
|
+
* surfaces, run the historical comparison set, ride a release) — patch
|
|
8
|
+
* and minor releases within a pinned major flow silently because semver
|
|
9
|
+
* disallows the export removals that would change a deduction outcome.
|
|
10
|
+
*
|
|
11
|
+
* @see docs/design-docs/two-stage-grader-symbol-preflight.md
|
|
12
|
+
*/
|
|
13
|
+
/**
|
|
14
|
+
* One in-scope package, pinned to a semver-major range.
|
|
15
|
+
*
|
|
16
|
+
* - `pkg` — npm package name (e.g. `"@sanity/sdk-react"`).
|
|
17
|
+
* - `semverPin` — semver range pinned to a single major (e.g. `"^2.0.0"`).
|
|
18
|
+
* The W0198 resolver answers existence questions against this major;
|
|
19
|
+
* minor and patch upgrades within the major do not require a manifest
|
|
20
|
+
* bump because semver disallows export removal.
|
|
21
|
+
*/
|
|
22
|
+
export interface PackageSurfaceEntry {
|
|
23
|
+
pkg: string;
|
|
24
|
+
semverPin: string;
|
|
25
|
+
}
|
|
26
|
+
/**
|
|
27
|
+
* Framework-level package-surface manifest.
|
|
28
|
+
*
|
|
29
|
+
* Authored once in `packages/eval/config/package-surface.ts` via
|
|
30
|
+
* `definePackageSurface(...)`. Per-task pin overrides remain a deliberate
|
|
31
|
+
* future extension point; the shape leaves room for them but the initial
|
|
32
|
+
* slice does not implement them.
|
|
33
|
+
*/
|
|
34
|
+
export interface PackageSurfaceConfig {
|
|
35
|
+
packages: PackageSurfaceEntry[];
|
|
36
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Package surface manifest types — framework-level pin of which package
|
|
3
|
+
* surfaces the symbol-resolution preflight (W0198) reads against.
|
|
4
|
+
*
|
|
5
|
+
* Tasks reference packages by name; version metadata lives here, not on
|
|
6
|
+
* the task. A semver-major bump is an editorial event (regenerate cached
|
|
7
|
+
* surfaces, run the historical comparison set, ride a release) — patch
|
|
8
|
+
* and minor releases within a pinned major flow silently because semver
|
|
9
|
+
* disallows the export removals that would change a deduction outcome.
|
|
10
|
+
*
|
|
11
|
+
* @see docs/design-docs/two-stage-grader-symbol-preflight.md
|
|
12
|
+
*/
|
|
13
|
+
export {};
|
|
@@ -85,6 +85,7 @@ export interface PipelineRequest {
|
|
|
85
85
|
debug?: PipelineRequestDebug | boolean;
|
|
86
86
|
executor?: PipelineRequestCallerExecutor;
|
|
87
87
|
gapAnalysis?: boolean;
|
|
88
|
+
graderContext?: "rubric-only" | "with-docs";
|
|
88
89
|
graderReplications?: number;
|
|
89
90
|
headers?: Record<string, string>;
|
|
90
91
|
inlineTasks?: Record<string, unknown>[];
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Preflight-scoring config — knob for how heavily the W0198 symbol-existence
|
|
3
|
+
* preflight contributes to the `code-correctness` dimension.
|
|
4
|
+
*
|
|
5
|
+
* The deterministic preflight (`SymbolPreflightReport`) and the LLM rubric
|
|
6
|
+
* both feed into `code-correctness` per D0010. This config sets the relative
|
|
7
|
+
* share of the two lanes: `codeCorrectnessWeight: 0.4` means preflight is
|
|
8
|
+
* 40% of the dimension, rubric 60%.
|
|
9
|
+
*
|
|
10
|
+
* Authored in `packages/eval/config/preflight-scoring.ts` via
|
|
11
|
+
* `definePreflightScoring()`. Loaded lazily by the scoring step; absence
|
|
12
|
+
* means "use the default" rather than "disable" — disabling the preflight
|
|
13
|
+
* entirely is a separate axis (no manifest, no resolver wired).
|
|
14
|
+
*
|
|
15
|
+
* @see docs/design-docs/two-stage-grader-symbol-preflight.md
|
|
16
|
+
*/
|
|
17
|
+
export interface PreflightScoringConfig {
|
|
18
|
+
/**
|
|
19
|
+
* Preflight's relative share of the `code-correctness` dimension, in
|
|
20
|
+
* `[0, 1]`. The complementary `1 - codeCorrectnessWeight` is the LLM
|
|
21
|
+
* rubric's share. Default `0.4`.
|
|
22
|
+
*/
|
|
23
|
+
codeCorrectnessWeight: number;
|
|
24
|
+
}
|
|
25
|
+
/** Default preflight share of `code-correctness` when no config file is authored. */
|
|
26
|
+
export declare const DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT = 0.4;
|
|
27
|
+
/**
|
|
28
|
+
* W0198 Phase 6 — describes the deterministic symbol-preflight lane to the
|
|
29
|
+
* grader. When supplied to the literacy compiler, the rubric for the
|
|
30
|
+
* `code-correctness` dimension is prefixed with a system instruction telling
|
|
31
|
+
* the grader to treat the preflight's existence findings as ground truth and
|
|
32
|
+
* confine its code-correctness judgment to non-existence concerns (idiomatic
|
|
33
|
+
* usage, code organization, type safety, completeness).
|
|
34
|
+
*
|
|
35
|
+
* The actual per-test findings live on the deterministic lane (Phase 4
|
|
36
|
+
* artifact + Phase 5 score merge) and never enter the grader prompt — the
|
|
37
|
+
* lanes are kept structurally independent so the rubric never sees, and
|
|
38
|
+
* therefore cannot relitigate, an existence verdict.
|
|
39
|
+
*
|
|
40
|
+
* Lives in `@sanity/ailf-core` so the `CompilationContext` port and the
|
|
41
|
+
* eval-package compiler can both reference the same shape without creating
|
|
42
|
+
* a cross-package import cycle.
|
|
43
|
+
*/
|
|
44
|
+
export interface PreflightRubricContext {
|
|
45
|
+
/**
|
|
46
|
+
* Names of the packages currently in the framework-level package-surface
|
|
47
|
+
* manifest. The grader is shown this list so it knows the boundary of the
|
|
48
|
+
* deterministic lane (named imports from these packages are existence-
|
|
49
|
+
* checked; everything else falls through to the rubric).
|
|
50
|
+
*/
|
|
51
|
+
readonly packages: readonly string[];
|
|
52
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Preflight-scoring config — knob for how heavily the W0198 symbol-existence
|
|
3
|
+
* preflight contributes to the `code-correctness` dimension.
|
|
4
|
+
*
|
|
5
|
+
* The deterministic preflight (`SymbolPreflightReport`) and the LLM rubric
|
|
6
|
+
* both feed into `code-correctness` per D0010. This config sets the relative
|
|
7
|
+
* share of the two lanes: `codeCorrectnessWeight: 0.4` means preflight is
|
|
8
|
+
* 40% of the dimension, rubric 60%.
|
|
9
|
+
*
|
|
10
|
+
* Authored in `packages/eval/config/preflight-scoring.ts` via
|
|
11
|
+
* `definePreflightScoring()`. Loaded lazily by the scoring step; absence
|
|
12
|
+
* means "use the default" rather than "disable" — disabling the preflight
|
|
13
|
+
* entirely is a separate axis (no manifest, no resolver wired).
|
|
14
|
+
*
|
|
15
|
+
* @see docs/design-docs/two-stage-grader-symbol-preflight.md
|
|
16
|
+
*/
|
|
17
|
+
/** Default preflight share of `code-correctness` when no config file is authored. */
|
|
18
|
+
export const DEFAULT_PREFLIGHT_CODE_CORRECTNESS_WEIGHT = 0.4;
|
|
@@ -51,6 +51,19 @@ export interface RepoExecutionConfig {
|
|
|
51
51
|
gapAnalysis?: boolean;
|
|
52
52
|
graderReplications?: number;
|
|
53
53
|
}
|
|
54
|
+
/** Grader configuration. */
|
|
55
|
+
export interface RepoGraderConfig {
|
|
56
|
+
/**
|
|
57
|
+
* Grader context policy.
|
|
58
|
+
*
|
|
59
|
+
* - `"rubric-only"` — grader sees only the rubric template + criteria +
|
|
60
|
+
* candidate response.
|
|
61
|
+
* - `"with-docs"` — canonical reference content for each task is injected
|
|
62
|
+
* into the assertion's `rubricPrompt` so the grader has authoritative
|
|
63
|
+
* ground truth.
|
|
64
|
+
*/
|
|
65
|
+
context?: "rubric-only" | "with-docs";
|
|
66
|
+
}
|
|
54
67
|
/** Where the pipeline writes results (replaces `--output-dir`). */
|
|
55
68
|
export interface RepoOutputConfig {
|
|
56
69
|
dir?: string;
|
|
@@ -95,6 +108,7 @@ export interface RepoConfig {
|
|
|
95
108
|
agentic?: RepoAgenticConfig;
|
|
96
109
|
artifacts?: RepoArtifactsConfig;
|
|
97
110
|
execution?: RepoExecutionConfig;
|
|
111
|
+
grader?: RepoGraderConfig;
|
|
98
112
|
output?: RepoOutputConfig;
|
|
99
113
|
owner?: RepoOwnerConfig;
|
|
100
114
|
publish?: RepoPublishConfig;
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Symbol-preflight report — the deterministic half of the W0198 two-stage
|
|
3
|
+
* grader. Built per candidate output by `computePreflight()` in
|
|
4
|
+
* `packages/eval/src/pipeline/preflight/`; consumed by the scoring bridge
|
|
5
|
+
* (Phase 5) and emitted as an artifact (Phase 4) so reviewers can audit
|
|
6
|
+
* which surface a deduction was computed against.
|
|
7
|
+
*
|
|
8
|
+
* The shape is a deliberate three-way discriminated union — `exists`,
|
|
9
|
+
* `missing`, `unresolved` — not a binary plus a `confident` flag. Treating
|
|
10
|
+
* the lanes as structurally distinct is load-bearing for measurement
|
|
11
|
+
* integrity: collapsing `unresolved` into `missing` would re-introduce
|
|
12
|
+
* hallucination in deterministic disguise, which is the whole failure
|
|
13
|
+
* mode this tier exists to fix.
|
|
14
|
+
*
|
|
15
|
+
* Score impact, by lane:
|
|
16
|
+
* - `exists` — no deduction; LLM rubric is told the binding exists
|
|
17
|
+
* and instructed not to litigate its existence.
|
|
18
|
+
* - `missing` — deterministic deduction in `code-correctness`; LLM
|
|
19
|
+
* does not re-judge the existence question.
|
|
20
|
+
* - `unresolved` — no deduction; grading for that binding falls through
|
|
21
|
+
* to the W0196 / W0197 LLM-rubric path.
|
|
22
|
+
*
|
|
23
|
+
* @see docs/design-docs/two-stage-grader-symbol-preflight.md
|
|
24
|
+
*/
|
|
25
|
+
/**
|
|
26
|
+
* Reason an `unresolved` finding was emitted. Each value names a specific
|
|
27
|
+
* resolver-side limit; collapsing these into a single bucket would lose
|
|
28
|
+
* the measurement-quality signal CI surfaces back to the framework's
|
|
29
|
+
* authors.
|
|
30
|
+
*/
|
|
31
|
+
export type SymbolPreflightUnresolvedReason = "package-not-installed" | "types-entry-missing" | "parse-failed" | "reexport-hop-unfollowed";
|
|
32
|
+
export type SymbolPreflightFinding = {
|
|
33
|
+
readonly result: "exists";
|
|
34
|
+
readonly pkg: string;
|
|
35
|
+
readonly version: string;
|
|
36
|
+
readonly binding: string;
|
|
37
|
+
readonly source: "types" | "runtime";
|
|
38
|
+
} | {
|
|
39
|
+
readonly result: "missing";
|
|
40
|
+
readonly pkg: string;
|
|
41
|
+
readonly version: string;
|
|
42
|
+
readonly binding: string;
|
|
43
|
+
} | {
|
|
44
|
+
readonly result: "unresolved";
|
|
45
|
+
readonly pkg: string;
|
|
46
|
+
readonly binding: string;
|
|
47
|
+
readonly reason: SymbolPreflightUnresolvedReason;
|
|
48
|
+
};
|
|
49
|
+
/**
|
|
50
|
+
* Deduction roll-up — `total = min(missing_count * perMissing, cap)`.
|
|
51
|
+
* The score the rubric merges in is `1 - total / cap`, computed in the
|
|
52
|
+
* scoring bridge (Phase 5) so this report stays a pure data artifact.
|
|
53
|
+
*/
|
|
54
|
+
export interface SymbolPreflightDeduction {
|
|
55
|
+
readonly perMissing: number;
|
|
56
|
+
readonly cap: number;
|
|
57
|
+
readonly total: number;
|
|
58
|
+
}
|
|
59
|
+
export interface SymbolPreflightReport {
|
|
60
|
+
readonly candidate: {
|
|
61
|
+
readonly taskId: string;
|
|
62
|
+
readonly testIndex: number;
|
|
63
|
+
};
|
|
64
|
+
readonly findings: readonly SymbolPreflightFinding[];
|
|
65
|
+
readonly deduction: SymbolPreflightDeduction;
|
|
66
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Symbol-preflight report — the deterministic half of the W0198 two-stage
|
|
3
|
+
* grader. Built per candidate output by `computePreflight()` in
|
|
4
|
+
* `packages/eval/src/pipeline/preflight/`; consumed by the scoring bridge
|
|
5
|
+
* (Phase 5) and emitted as an artifact (Phase 4) so reviewers can audit
|
|
6
|
+
* which surface a deduction was computed against.
|
|
7
|
+
*
|
|
8
|
+
* The shape is a deliberate three-way discriminated union — `exists`,
|
|
9
|
+
* `missing`, `unresolved` — not a binary plus a `confident` flag. Treating
|
|
10
|
+
* the lanes as structurally distinct is load-bearing for measurement
|
|
11
|
+
* integrity: collapsing `unresolved` into `missing` would re-introduce
|
|
12
|
+
* hallucination in deterministic disguise, which is the whole failure
|
|
13
|
+
* mode this tier exists to fix.
|
|
14
|
+
*
|
|
15
|
+
* Score impact, by lane:
|
|
16
|
+
* - `exists` — no deduction; LLM rubric is told the binding exists
|
|
17
|
+
* and instructed not to litigate its existence.
|
|
18
|
+
* - `missing` — deterministic deduction in `code-correctness`; LLM
|
|
19
|
+
* does not re-judge the existence question.
|
|
20
|
+
* - `unresolved` — no deduction; grading for that binding falls through
|
|
21
|
+
* to the W0196 / W0197 LLM-rubric path.
|
|
22
|
+
*
|
|
23
|
+
* @see docs/design-docs/two-stage-grader-symbol-preflight.md
|
|
24
|
+
*/
|
|
25
|
+
export {};
|
|
@@ -57,6 +57,7 @@ export interface RemoteConfigSlice {
|
|
|
57
57
|
datasetOverride?: string;
|
|
58
58
|
projectIdOverride?: string;
|
|
59
59
|
perspectiveOverride?: string;
|
|
60
|
+
graderContext?: "rubric-only" | "with-docs";
|
|
60
61
|
graderReplications?: number;
|
|
61
62
|
gapAnalysisEnabled?: boolean;
|
|
62
63
|
noRemoteCache?: boolean;
|
|
@@ -124,6 +124,9 @@ export async function buildRemoteRequest(options) {
|
|
|
124
124
|
if (config.perspectiveOverride)
|
|
125
125
|
raw.perspective = config.perspectiveOverride;
|
|
126
126
|
// Advanced
|
|
127
|
+
if (config.graderContext) {
|
|
128
|
+
raw.graderContext = config.graderContext;
|
|
129
|
+
}
|
|
127
130
|
if (config.graderReplications) {
|
|
128
131
|
raw.graderReplications = config.graderReplications;
|
|
129
132
|
}
|
|
@@ -125,6 +125,7 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
|
|
|
125
125
|
noCache: config.noCache ?? false,
|
|
126
126
|
noRemoteCache: config.noRemoteCache ?? false,
|
|
127
127
|
graderReplications: config.execution?.graderReplications,
|
|
128
|
+
graderContext: config.grader?.context,
|
|
128
129
|
urls: config.urls,
|
|
129
130
|
headers: config.agentic?.headers,
|
|
130
131
|
allowedOrigins: config.agentic?.allowedOrigins,
|
|
@@ -42,6 +42,10 @@ export declare class SanityDocFetcher implements DocFetcher {
|
|
|
42
42
|
* `typesReference` renderer to inline typedoc JSON. Returns `null`
|
|
43
43
|
* on any HTTP/network failure rather than throwing — the renderer
|
|
44
44
|
* surfaces a placeholder so the rest of the context still renders.
|
|
45
|
+
*
|
|
46
|
+
* Wrapped in a 30s `AbortSignal.timeout` so a slow CDN can't hang the
|
|
47
|
+
* eval pipeline indefinitely. Timeouts surface as a single `null`
|
|
48
|
+
* return like any other fetch failure.
|
|
45
49
|
*/
|
|
46
50
|
private fetchAttachmentBody;
|
|
47
51
|
/**
|