@sanity/ailf 4.3.0 → 4.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -86,7 +86,8 @@ export interface ResolvedConfig {
86
86
  * canonical doc content as authoritative ground truth.
87
87
  *
88
88
  * Sourced from EvalConfig `grader.context` or the equivalent CLI/env
89
- * surface. Defaults to `"rubric-only"` at the EvalConfig boundary.
89
+ * surface. Defaults to `"with-docs"` at the EvalConfig boundary
90
+ * (flipped in W0200 after DOC-2117 validation).
90
91
  */
91
92
  graderContext?: "rubric-only" | "with-docs";
92
93
  /** Base directory for user-facing pipeline output artifacts. */
@@ -43,6 +43,10 @@ export declare const PipelineRequestSchema: z.ZodObject<{
43
43
  sample: z.ZodOptional<z.ZodNumber>;
44
44
  }, z.core.$strip>]>>;
45
45
  gapAnalysis: z.ZodOptional<z.ZodBoolean>;
46
+ graderContext: z.ZodOptional<z.ZodEnum<{
47
+ "rubric-only": "rubric-only";
48
+ "with-docs": "with-docs";
49
+ }>>;
46
50
  graderReplications: z.ZodOptional<z.ZodNumber>;
47
51
  headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
48
52
  inlineTasks: z.ZodOptional<z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
@@ -106,6 +106,13 @@ export const PipelineRequestSchema = z.object({
106
106
  dataset: z.string().optional(),
107
107
  debug: z.union([z.boolean(), DebugOptionsSchema]).optional(),
108
108
  gapAnalysis: z.boolean().optional(),
109
+ /**
110
+ * Grader-context policy (W0196 / DOC-2117). When `"with-docs"`, the
111
+ * canonical reference is injected into the LLM grader's `rubricPrompt`
112
+ * as authoritative ground truth. When `"rubric-only"`, the grader sees
113
+ * only the rubric template (legacy behavior). When omitted, the pipeline defaults to `"with-docs"`.
114
+ */
115
+ graderContext: z.enum(["rubric-only", "with-docs"]).optional(),
109
116
  graderReplications: z.number().int().positive().optional(),
110
117
  headers: z.record(z.string(), z.string()).optional(),
111
118
  inlineTasks: z.array(z.record(z.string(), z.unknown())).optional(),
@@ -85,6 +85,7 @@ export interface PipelineRequest {
85
85
  debug?: PipelineRequestDebug | boolean;
86
86
  executor?: PipelineRequestCallerExecutor;
87
87
  gapAnalysis?: boolean;
88
+ graderContext?: "rubric-only" | "with-docs";
88
89
  graderReplications?: number;
89
90
  headers?: Record<string, string>;
90
91
  inlineTasks?: Record<string, unknown>[];
@@ -54,7 +54,9 @@ export interface RepoExecutionConfig {
54
54
  /** Grader configuration. */
55
55
  export interface RepoGraderConfig {
56
56
  /**
57
- * Grader context policy.
57
+ * Grader context policy. Defaults to `"with-docs"` (W0200 flip after
58
+ * DOC-2117 validation); set to `"rubric-only"` to opt back into the
59
+ * legacy behavior.
58
60
  *
59
61
  * - `"rubric-only"` — grader sees only the rubric template + criteria +
60
62
  * candidate response.
@@ -57,6 +57,7 @@ export interface RemoteConfigSlice {
57
57
  datasetOverride?: string;
58
58
  projectIdOverride?: string;
59
59
  perspectiveOverride?: string;
60
+ graderContext?: "rubric-only" | "with-docs";
60
61
  graderReplications?: number;
61
62
  gapAnalysisEnabled?: boolean;
62
63
  noRemoteCache?: boolean;
@@ -124,6 +124,9 @@ export async function buildRemoteRequest(options) {
124
124
  if (config.perspectiveOverride)
125
125
  raw.perspective = config.perspectiveOverride;
126
126
  // Advanced
127
+ if (config.graderContext) {
128
+ raw.graderContext = config.graderContext;
129
+ }
127
130
  if (config.graderReplications) {
128
131
  raw.graderReplications = config.graderReplications;
129
132
  }
@@ -140,6 +140,7 @@ function toConfigSlice(opts) {
140
140
  datasetOverride: opts.datasetOverride,
141
141
  projectIdOverride: opts.projectIdOverride,
142
142
  perspectiveOverride: opts.perspectiveOverride,
143
+ graderContext: opts.graderContext,
143
144
  graderReplications: opts.graderReplications,
144
145
  gapAnalysisEnabled: opts.gapAnalysisEnabled,
145
146
  noRemoteCache: opts.noRemoteCache,
@@ -111,9 +111,10 @@ export class GenerateConfigsStep {
111
111
  catch {
112
112
  ctx.logger.warn(" ⚠ Could not load rubric config — templates will not resolve");
113
113
  }
114
- // Compile for each variant. `graderContext` defaults to "rubric-only" here
115
- // so handlers see a definite value rather than implicit-undefined.
116
- const graderContext = ctx.config.graderContext ?? "rubric-only";
114
+ // Compile for each variant. `graderContext` defaults to "with-docs" here
115
+ // (flipped in W0200 after PR #555 validation receipts) so handlers see a
116
+ // definite value rather than implicit-undefined.
117
+ const graderContext = ctx.config.graderContext ?? "with-docs";
117
118
  // W0198 Phase 6 — when the package-surface manifest is authored, pass
118
119
  // the in-scope package list down so the literacy mode handler can
119
120
  // prefix the `code-correctness` rubric with the deterministic-lane
@@ -192,7 +193,7 @@ export class GenerateConfigsStep {
192
193
  graderProvider: models.grader.id,
193
194
  models: modeModels,
194
195
  rubricConfig,
195
- graderContext: ctx.config.graderContext ?? "rubric-only",
196
+ graderContext: ctx.config.graderContext ?? "with-docs",
196
197
  preflightContext,
197
198
  });
198
199
  for (const w of merged.warnings) {
@@ -46,8 +46,8 @@ export interface FingerprintInput {
46
46
  * content, so the cache must treat them as different evaluations even
47
47
  * when tasks + docs + grader model match.
48
48
  *
49
- * Defaults to "rubric-only" inside the hash when undefined, matching
50
- * the EvalConfig boundary default.
49
+ * Defaults to "with-docs" inside the hash when undefined, matching
50
+ * the EvalConfig boundary default (flipped in v4 / W0200).
51
51
  */
52
52
  graderContext?: "rubric-only" | "with-docs";
53
53
  }
@@ -42,8 +42,14 @@ import { join, relative, resolve } from "path";
42
42
  * v3 (2026-05-06): grader-context policy ("rubric-only" vs "with-docs")
43
43
  * affects rubricPrompt content and therefore eval output, so it must be
44
44
  * hashed. Bumping invalidates v2 fingerprints.
45
+ *
46
+ * v4 (2026-05-08): grader-context default flipped from "rubric-only" to
47
+ * "with-docs" (W0200 / DOC-2117) after PR #555 validation receipts. The
48
+ * undefined-equivalence at the EvalConfig boundary moves with it, so
49
+ * cached v3 fingerprints would no longer match the same EvalConfig.
50
+ * Bumping invalidates v3 fingerprints.
45
51
  */
46
- const FINGERPRINT_VERSION = "eval-fingerprint-v3";
52
+ const FINGERPRINT_VERSION = "eval-fingerprint-v4";
47
53
  /**
48
54
  * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
49
55
  *
@@ -56,7 +62,7 @@ const FINGERPRINT_VERSION = "eval-fingerprint-v3";
56
62
  */
57
63
  export function computeEvalFingerprint(input) {
58
64
  const { graderModel, mode, rootDir, tasks } = input;
59
- const graderContext = input.graderContext ?? "rubric-only";
65
+ const graderContext = input.graderContext ?? "with-docs";
60
66
  const hash = createHash("sha256");
61
67
  hash.update(`version:${FINGERPRINT_VERSION}\n`);
62
68
  hash.update(`mode:${mode}\n`);
@@ -52,6 +52,7 @@ export function mapRequestToConfig(request, rootDir) {
52
52
  noAutoScope: request.noAutoScope ?? false,
53
53
  noCache: request.noCache ?? false,
54
54
  noRemoteCache: request.noRemoteCache ?? false,
55
+ graderContext: request.graderContext,
55
56
  graderReplications: request.graderReplications,
56
57
  urls: request.urls,
57
58
  headers: request.headers,
@@ -23,6 +23,7 @@
23
23
  import { readFileSync } from "node:fs";
24
24
  import { resolveVariantMode, } from "../../_vendor/ailf-core/index.js";
25
25
  import { computePreflight } from "./compute-preflight.js";
26
+ import { extractCodeBlocks } from "./extract-code-blocks.js";
26
27
  /** Map a per-row preflight key to a stable string for in-memory lookup. */
27
28
  export function preflightKey(key) {
28
29
  return `${key.run}/${key.mode}/${key.task}/${key.model}`;
@@ -78,10 +79,15 @@ export async function emitSymbolPreflight(input) {
78
79
  const output = row.response?.output;
79
80
  if (typeof output !== "string" || output.length === 0)
80
81
  continue;
82
+ // Candidate outputs are typically markdown — prose with fenced code
83
+ // blocks. Feed `parseImports` only the TS/JS source so it doesn't
84
+ // choke on the surrounding markdown. Falls back to the raw text
85
+ // when the output contains no fences at all.
86
+ const code = extractCodeBlocks(output);
81
87
  let report;
82
88
  try {
83
89
  report = await computePreflight({
84
- code: output,
90
+ code,
85
91
  candidate: { taskId: axisTask, testIndex: i },
86
92
  packageSurface: input.packageSurface,
87
93
  resolver: input.resolver,
@@ -0,0 +1,21 @@
1
+ /**
2
+ * extract-code-blocks — pull TS/TSX/JS/JSX source out of a candidate
3
+ * response that's typically structured as markdown prose plus one or
4
+ * more fenced code blocks.
5
+ *
6
+ * AI candidates almost always wrap their code in triple-backtick fences
7
+ * with language hints (`\`\`\`tsx`, `\`\`\`typescript`, …). Feeding that
8
+ * raw text to `parseImports` (which calls a strict TS parser) extracts
9
+ * zero imports because the markdown surrounds — and breaks — the parse.
10
+ *
11
+ * Strategy: pull every fenced block whose language hint is in the
12
+ * allow-list (or unset, which we treat as TS-compatible by convention),
13
+ * concatenate the contents with blank-line separators, and return that.
14
+ * When the input contains no fences at all, return it unchanged so a
15
+ * raw-code candidate still parses.
16
+ *
17
+ * Out of scope: indented (4-space) code blocks. The corpus uses fenced
18
+ * blocks exclusively, and supporting indented blocks would require a
19
+ * markdown parser; the candidate-corpus reality doesn't justify it.
20
+ */
21
+ export declare function extractCodeBlocks(text: string): string;
@@ -0,0 +1,48 @@
1
+ /**
2
+ * extract-code-blocks — pull TS/TSX/JS/JSX source out of a candidate
3
+ * response that's typically structured as markdown prose plus one or
4
+ * more fenced code blocks.
5
+ *
6
+ * AI candidates almost always wrap their code in triple-backtick fences
7
+ * with language hints (`\`\`\`tsx`, `\`\`\`typescript`, …). Feeding that
8
+ * raw text to `parseImports` (which calls a strict TS parser) extracts
9
+ * zero imports because the markdown surrounds — and breaks — the parse.
10
+ *
11
+ * Strategy: pull every fenced block whose language hint is in the
12
+ * allow-list (or unset, which we treat as TS-compatible by convention),
13
+ * concatenate the contents with blank-line separators, and return that.
14
+ * When the input contains no fences at all, return it unchanged so a
15
+ * raw-code candidate still parses.
16
+ *
17
+ * Out of scope: indented (4-space) code blocks. The corpus uses fenced
18
+ * blocks exclusively, and supporting indented blocks would require a
19
+ * markdown parser; the candidate-corpus reality doesn't justify it.
20
+ */
21
/**
 * Language tokens accepted as TS/JS-family source. The empty string stands
 * for a fence that carries no language hint, which is treated as
 * TS-compatible by convention.
 */
const ALLOWED_LANGS = new Set([
    "tsx",
    "ts",
    "typescript",
    "jsx",
    "js",
    "javascript",
    "",
]);
/**
 * Matches one triple-backtick fenced block.
 *
 * Capture 1 is the opening fence's leading indentation, capture 2 is the
 * raw info string (everything after the backticks on the opening line),
 * and capture 3 is the block content between the fence lines.
 */
const FENCE_RE = /^([ \t]*)```([^\n`]*)\n([\s\S]*?)\n[ \t]*```\s*$/gm;
/**
 * Reduce a fence info string to its leading language token.
 *
 * Info strings frequently carry metadata after the language, e.g.
 * `ts title="app.ts"`, `tsx {1,3}` or `typescript:src/a.ts`. Only the
 * first token names the language, so everything from the first
 * whitespace, `{`, `:` or `,` onward is dropped before the allow-list
 * lookup. An empty or metadata-only info string yields `""`.
 *
 * @param {string} info Raw info string captured from the opening fence.
 * @returns {string} Lower-cased language token, or `""` when absent.
 */
function fenceLanguage(info) {
    return info.trim().toLowerCase().split(/[\s{:,]/, 1)[0] ?? "";
}
/**
 * Pull TS/TSX/JS/JSX source out of markdown-style text.
 *
 * Every fenced block whose language token is in `ALLOWED_LANGS` is
 * collected and the contents are joined with a blank line between them.
 *
 * @param {string} text Candidate response, usually prose plus fenced code.
 * @returns {string} The joined block contents; the input unchanged when it
 *   contains no fences at all; `""` when fences exist but none is in an
 *   allowed language.
 */
export function extractCodeBlocks(text) {
    let sawAnyFence = false;
    const blocks = [];
    for (const match of text.matchAll(FENCE_RE)) {
        sawAnyFence = true;
        // Compare only the language token so that trailing fence metadata
        // (titles, line highlights, file paths) does not disqualify a block.
        if (!ALLOWED_LANGS.has(fenceLanguage(match[2] ?? "")))
            continue;
        blocks.push(match[3] ?? "");
    }
    // No fences at all → assume the candidate emitted raw source.
    if (!sawAnyFence)
        return text;
    // Fences were present but none in a JS/TS-family language.
    if (blocks.length === 0)
        return "";
    return blocks.join("\n\n");
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "4.3.0",
3
+ "version": "4.4.0",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"