@sanity/ailf 4.3.0 → 4.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/ports/context.d.ts +2 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +4 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +3 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +5 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -2
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/dist/pipeline/preflight/emit-symbol-preflight.js +7 -1
- package/dist/pipeline/preflight/extract-code-blocks.d.ts +21 -0
- package/dist/pipeline/preflight/extract-code-blocks.js +48 -0
- package/package.json +1 -1
|
@@ -86,7 +86,8 @@ export interface ResolvedConfig {
|
|
|
86
86
|
* canonical doc content as authoritative ground truth.
|
|
87
87
|
*
|
|
88
88
|
* Sourced from EvalConfig `grader.context` or the equivalent CLI/env
|
|
89
|
-
* surface. Defaults to `"rubric-only"`.
|
|
89
|
+
* surface. Defaults to `"with-docs"` at the EvalConfig boundary
|
|
90
|
+
* (flipped in W0200 after DOC-2117 validation).
|
|
90
91
|
*/
|
|
91
92
|
graderContext?: "rubric-only" | "with-docs";
|
|
92
93
|
/** Base directory for user-facing pipeline output artifacts. */
|
|
@@ -43,6 +43,10 @@ export declare const PipelineRequestSchema: z.ZodObject<{
|
|
|
43
43
|
sample: z.ZodOptional<z.ZodNumber>;
|
|
44
44
|
}, z.core.$strip>]>>;
|
|
45
45
|
gapAnalysis: z.ZodOptional<z.ZodBoolean>;
|
|
46
|
+
graderContext: z.ZodOptional<z.ZodEnum<{
|
|
47
|
+
"rubric-only": "rubric-only";
|
|
48
|
+
"with-docs": "with-docs";
|
|
49
|
+
}>>;
|
|
46
50
|
graderReplications: z.ZodOptional<z.ZodNumber>;
|
|
47
51
|
headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
48
52
|
inlineTasks: z.ZodOptional<z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
|
|
@@ -106,6 +106,13 @@ export const PipelineRequestSchema = z.object({
|
|
|
106
106
|
dataset: z.string().optional(),
|
|
107
107
|
debug: z.union([z.boolean(), DebugOptionsSchema]).optional(),
|
|
108
108
|
gapAnalysis: z.boolean().optional(),
|
|
109
|
+
/**
|
|
110
|
+
* Grader-context policy (W0196 / DOC-2117). When `"with-docs"`, the
|
|
111
|
+
* canonical reference is injected into the LLM grader's `rubricPrompt`
|
|
112
|
+
* as authoritative ground truth. When omitted or `"rubric-only"`, the
|
|
113
|
+
* grader sees only the rubric template (legacy behavior).
|
|
114
|
+
*/
|
|
115
|
+
graderContext: z.enum(["rubric-only", "with-docs"]).optional(),
|
|
109
116
|
graderReplications: z.number().int().positive().optional(),
|
|
110
117
|
headers: z.record(z.string(), z.string()).optional(),
|
|
111
118
|
inlineTasks: z.array(z.record(z.string(), z.unknown())).optional(),
|
|
@@ -85,6 +85,7 @@ export interface PipelineRequest {
|
|
|
85
85
|
debug?: PipelineRequestDebug | boolean;
|
|
86
86
|
executor?: PipelineRequestCallerExecutor;
|
|
87
87
|
gapAnalysis?: boolean;
|
|
88
|
+
graderContext?: "rubric-only" | "with-docs";
|
|
88
89
|
graderReplications?: number;
|
|
89
90
|
headers?: Record<string, string>;
|
|
90
91
|
inlineTasks?: Record<string, unknown>[];
|
|
@@ -54,7 +54,9 @@ export interface RepoExecutionConfig {
|
|
|
54
54
|
/** Grader configuration. */
|
|
55
55
|
export interface RepoGraderConfig {
|
|
56
56
|
/**
|
|
57
|
-
* Grader context policy.
|
|
57
|
+
* Grader context policy. Defaults to `"with-docs"` (W0200 flip after
|
|
58
|
+
* DOC-2117 validation); set to `"rubric-only"` to opt back into the
|
|
59
|
+
* legacy behavior.
|
|
58
60
|
*
|
|
59
61
|
* - `"rubric-only"` — grader sees only the rubric template + criteria +
|
|
60
62
|
* candidate response.
|
|
@@ -57,6 +57,7 @@ export interface RemoteConfigSlice {
|
|
|
57
57
|
datasetOverride?: string;
|
|
58
58
|
projectIdOverride?: string;
|
|
59
59
|
perspectiveOverride?: string;
|
|
60
|
+
graderContext?: "rubric-only" | "with-docs";
|
|
60
61
|
graderReplications?: number;
|
|
61
62
|
gapAnalysisEnabled?: boolean;
|
|
62
63
|
noRemoteCache?: boolean;
|
|
@@ -124,6 +124,9 @@ export async function buildRemoteRequest(options) {
|
|
|
124
124
|
if (config.perspectiveOverride)
|
|
125
125
|
raw.perspective = config.perspectiveOverride;
|
|
126
126
|
// Advanced
|
|
127
|
+
if (config.graderContext) {
|
|
128
|
+
raw.graderContext = config.graderContext;
|
|
129
|
+
}
|
|
127
130
|
if (config.graderReplications) {
|
|
128
131
|
raw.graderReplications = config.graderReplications;
|
|
129
132
|
}
|
|
@@ -140,6 +140,7 @@ function toConfigSlice(opts) {
|
|
|
140
140
|
datasetOverride: opts.datasetOverride,
|
|
141
141
|
projectIdOverride: opts.projectIdOverride,
|
|
142
142
|
perspectiveOverride: opts.perspectiveOverride,
|
|
143
|
+
graderContext: opts.graderContext,
|
|
143
144
|
graderReplications: opts.graderReplications,
|
|
144
145
|
gapAnalysisEnabled: opts.gapAnalysisEnabled,
|
|
145
146
|
noRemoteCache: opts.noRemoteCache,
|
|
@@ -111,9 +111,10 @@ export class GenerateConfigsStep {
|
|
|
111
111
|
catch {
|
|
112
112
|
ctx.logger.warn(" ⚠ Could not load rubric config — templates will not resolve");
|
|
113
113
|
}
|
|
114
|
-
// Compile for each variant. `graderContext` defaults to "rubric-only"
|
|
115
|
-
//
|
|
116
|
-
|
|
114
|
+
// Compile for each variant. `graderContext` defaults to "with-docs" here
|
|
115
|
+
// (flipped in W0200 after PR #555 validation receipts) so handlers see a
|
|
116
|
+
// definite value rather than implicit-undefined.
|
|
117
|
+
const graderContext = ctx.config.graderContext ?? "with-docs";
|
|
117
118
|
// W0198 Phase 6 — when the package-surface manifest is authored, pass
|
|
118
119
|
// the in-scope package list down so the literacy mode handler can
|
|
119
120
|
// prefix the `code-correctness` rubric with the deterministic-lane
|
|
@@ -192,7 +193,7 @@ export class GenerateConfigsStep {
|
|
|
192
193
|
graderProvider: models.grader.id,
|
|
193
194
|
models: modeModels,
|
|
194
195
|
rubricConfig,
|
|
195
|
-
graderContext: ctx.config.graderContext ?? "rubric-only",
|
|
196
|
+
graderContext: ctx.config.graderContext ?? "with-docs",
|
|
196
197
|
preflightContext,
|
|
197
198
|
});
|
|
198
199
|
for (const w of merged.warnings) {
|
|
@@ -46,8 +46,8 @@ export interface FingerprintInput {
|
|
|
46
46
|
* content, so the cache must treat them as different evaluations even
|
|
47
47
|
* when tasks + docs + grader model match.
|
|
48
48
|
*
|
|
49
|
-
* Defaults to "rubric-only" inside the hash when undefined, matching
|
|
50
|
-
* the EvalConfig boundary default.
|
|
49
|
+
* Defaults to "with-docs" inside the hash when undefined, matching
|
|
50
|
+
* the EvalConfig boundary default (flipped in v4 / W0200).
|
|
51
51
|
*/
|
|
52
52
|
graderContext?: "rubric-only" | "with-docs";
|
|
53
53
|
}
|
|
@@ -42,8 +42,14 @@ import { join, relative, resolve } from "path";
|
|
|
42
42
|
* v3 (2026-05-06): grader-context policy ("rubric-only" vs "with-docs")
|
|
43
43
|
* affects rubricPrompt content and therefore eval output, so it must be
|
|
44
44
|
* hashed. Bumping invalidates v2 fingerprints.
|
|
45
|
+
*
|
|
46
|
+
* v4 (2026-05-08): grader-context default flipped from "rubric-only" to
|
|
47
|
+
* "with-docs" (W0200 / DOC-2117) after PR #555 validation receipts. The
|
|
48
|
+
* undefined-equivalence at the EvalConfig boundary moves with it, so
|
|
49
|
+
* cached v3 fingerprints would no longer match the same EvalConfig.
|
|
50
|
+
* Bumping invalidates v3 fingerprints.
|
|
45
51
|
*/
|
|
46
|
-
const FINGERPRINT_VERSION = "eval-fingerprint-v3";
|
|
52
|
+
const FINGERPRINT_VERSION = "eval-fingerprint-v4";
|
|
47
53
|
/**
|
|
48
54
|
* Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
|
|
49
55
|
*
|
|
@@ -56,7 +62,7 @@ const FINGERPRINT_VERSION = "eval-fingerprint-v3";
|
|
|
56
62
|
*/
|
|
57
63
|
export function computeEvalFingerprint(input) {
|
|
58
64
|
const { graderModel, mode, rootDir, tasks } = input;
|
|
59
|
-
const graderContext = input.graderContext ?? "rubric-only";
|
|
65
|
+
const graderContext = input.graderContext ?? "with-docs";
|
|
60
66
|
const hash = createHash("sha256");
|
|
61
67
|
hash.update(`version:${FINGERPRINT_VERSION}\n`);
|
|
62
68
|
hash.update(`mode:${mode}\n`);
|
|
@@ -52,6 +52,7 @@ export function mapRequestToConfig(request, rootDir) {
|
|
|
52
52
|
noAutoScope: request.noAutoScope ?? false,
|
|
53
53
|
noCache: request.noCache ?? false,
|
|
54
54
|
noRemoteCache: request.noRemoteCache ?? false,
|
|
55
|
+
graderContext: request.graderContext,
|
|
55
56
|
graderReplications: request.graderReplications,
|
|
56
57
|
urls: request.urls,
|
|
57
58
|
headers: request.headers,
|
|
@@ -23,6 +23,7 @@
|
|
|
23
23
|
import { readFileSync } from "node:fs";
|
|
24
24
|
import { resolveVariantMode, } from "../../_vendor/ailf-core/index.js";
|
|
25
25
|
import { computePreflight } from "./compute-preflight.js";
|
|
26
|
+
import { extractCodeBlocks } from "./extract-code-blocks.js";
|
|
26
27
|
/** Map a per-row preflight key to a stable string for in-memory lookup. */
|
|
27
28
|
export function preflightKey(key) {
|
|
28
29
|
return `${key.run}/${key.mode}/${key.task}/${key.model}`;
|
|
@@ -78,10 +79,15 @@ export async function emitSymbolPreflight(input) {
|
|
|
78
79
|
const output = row.response?.output;
|
|
79
80
|
if (typeof output !== "string" || output.length === 0)
|
|
80
81
|
continue;
|
|
82
|
+
// Candidate outputs are typically markdown — prose with fenced code
|
|
83
|
+
// blocks. Feed `parseImports` only the TS/JS source so it doesn't
|
|
84
|
+
// choke on the surrounding markdown. Falls back to the raw text
|
|
85
|
+
// when no recognized fences are present.
|
|
86
|
+
const code = extractCodeBlocks(output);
|
|
81
87
|
let report;
|
|
82
88
|
try {
|
|
83
89
|
report = await computePreflight({
|
|
84
|
-
code: output,
|
|
90
|
+
code,
|
|
85
91
|
candidate: { taskId: axisTask, testIndex: i },
|
|
86
92
|
packageSurface: input.packageSurface,
|
|
87
93
|
resolver: input.resolver,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* extract-code-blocks — pull TS/TSX/JS/JSX source out of a candidate
|
|
3
|
+
* response that's typically structured as markdown prose plus one or
|
|
4
|
+
* more fenced code blocks.
|
|
5
|
+
*
|
|
6
|
+
* AI candidates almost always wrap their code in triple-backtick fences
|
|
7
|
+
* with language hints (`\`\`\`tsx`, `\`\`\`typescript`, …). Feeding that
|
|
8
|
+
* raw text to `parseImports` (which calls a strict TS parser) extracts
|
|
9
|
+
* zero imports because the markdown surrounds — and breaks — the parse.
|
|
10
|
+
*
|
|
11
|
+
* Strategy: pull every fenced block whose language hint is in the
|
|
12
|
+
* allow-list (or unset, which we treat as TS-compatible by convention),
|
|
13
|
+
* concatenate the contents with blank-line separators, and return that.
|
|
14
|
+
* When the input has no recognized fences, return it unchanged so a
|
|
15
|
+
* raw-code candidate still parses.
|
|
16
|
+
*
|
|
17
|
+
* Out of scope: indented (4-space) code blocks. The corpus uses fenced
|
|
18
|
+
* blocks exclusively, and supporting indented blocks would require a
|
|
19
|
+
* markdown parser; the candidate-corpus reality doesn't justify it.
|
|
20
|
+
*/
|
|
21
|
+
export declare function extractCodeBlocks(text: string): string;
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* extract-code-blocks — pull TS/TSX/JS/JSX source out of a candidate
|
|
3
|
+
* response that's typically structured as markdown prose plus one or
|
|
4
|
+
* more fenced code blocks.
|
|
5
|
+
*
|
|
6
|
+
* AI candidates almost always wrap their code in triple-backtick fences
|
|
7
|
+
* with language hints (`\`\`\`tsx`, `\`\`\`typescript`, …). Feeding that
|
|
8
|
+
* raw text to `parseImports` (which calls a strict TS parser) extracts
|
|
9
|
+
* zero imports because the markdown surrounds — and breaks — the parse.
|
|
10
|
+
*
|
|
11
|
+
* Strategy: pull every fenced block whose language hint is in the
|
|
12
|
+
* allow-list (or unset, which we treat as TS-compatible by convention),
|
|
13
|
+
* concatenate the contents with blank-line separators, and return that.
|
|
14
|
+
* When the input has no recognized fences, return it unchanged so a
|
|
15
|
+
* raw-code candidate still parses.
|
|
16
|
+
*
|
|
17
|
+
* Out of scope: indented (4-space) code blocks. The corpus uses fenced
|
|
18
|
+
* blocks exclusively, and supporting indented blocks would require a
|
|
19
|
+
* markdown parser; the candidate-corpus reality doesn't justify it.
|
|
20
|
+
*/
|
|
21
|
+
const ALLOWED_LANGS = new Set([
|
|
22
|
+
"tsx",
|
|
23
|
+
"ts",
|
|
24
|
+
"typescript",
|
|
25
|
+
"jsx",
|
|
26
|
+
"js",
|
|
27
|
+
"javascript",
|
|
28
|
+
"",
|
|
29
|
+
]);
|
|
30
|
+
const FENCE_RE = /^([ \t]*)```([^\n`]*)\n([\s\S]*?)\n[ \t]*```\s*$/gm;
|
|
31
|
+
export function extractCodeBlocks(text) {
|
|
32
|
+
let sawAnyFence = false;
|
|
33
|
+
const blocks = [];
|
|
34
|
+
for (const match of text.matchAll(FENCE_RE)) {
|
|
35
|
+
sawAnyFence = true;
|
|
36
|
+
const lang = (match[2] ?? "").trim().toLowerCase();
|
|
37
|
+
if (!ALLOWED_LANGS.has(lang))
|
|
38
|
+
continue;
|
|
39
|
+
blocks.push(match[3] ?? "");
|
|
40
|
+
}
|
|
41
|
+
// No fences at all → assume the candidate emitted raw source.
|
|
42
|
+
if (!sawAnyFence)
|
|
43
|
+
return text;
|
|
44
|
+
// Fences were present but none in a JS/TS-family language.
|
|
45
|
+
if (blocks.length === 0)
|
|
46
|
+
return "";
|
|
47
|
+
return blocks.join("\n\n");
|
|
48
|
+
}
|