npm - @sanity/ailf - Versions diffs - 0.5.0 → 2.0.0 - Mend

@sanity/ailf 0.5.0 → 2.0.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (377)

package/dist/pipeline/grader-validate-runner.js CHANGED Viewed

@@ -99,11 +99,13 @@ export function formatValidationReport(result) {
     const sep = "|------------------|-------|-------------|-----------|--------|-------|";
     lines.push(h);
     lines.push(sep);
-    const dims = [
-        { data: result.perDimension.taskCompletion, name: "Task Completion" },
-        { data: result.perDimension.codeCorrectness, name: "Code Correctness" },
-        { data: result.perDimension.docCoverage, name: "Doc Coverage" },
-    ];
+    const dims = Object.entries(result.perDimension).map(([key, data]) => ({
+        data,
+        name: key
+            .split(/[-_]/)
+            .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
+            .join(" "),
+    }));
     for (const { data, name } of dims) {
         const quality = classifyCorrelation(data.correlation);
         const biasStr = data.bias > 0 ? `+${data.bias}` : `${data.bias}`;

package/dist/pipeline/grader-validation.d.ts CHANGED Viewed

@@ -63,12 +63,8 @@ export interface GraderValidation {
     overallMae: number;
     /** Whether the grader passes the MAE threshold (default: MAE < 10) */
     passesThreshold: boolean;
-    /** Per-dimension validity metrics */
-    perDimension: {
-        taskCompletion: DimensionValidity;
-        codeCorrectness: DimensionValidity;
-        docCoverage: DimensionValidity;
-    };
+    /** Per-dimension validity metrics (keyed by dimension name) */
+    perDimension: Record<string, DimensionValidity>;
     /** Total number of (grader, human) score pairs analyzed */
     totalObservations: number;
 }

package/dist/pipeline/grader-validation.js CHANGED Viewed

@@ -77,11 +77,7 @@ export function validateGrader(grades, graderModel, options) {
             overallCorrelation: 0,
             overallMae: 0,
             passesThreshold: true,
-            perDimension: {
-                codeCorrectness: { bias: 0, correlation: 0, count: 0, mae: 0 },
-                docCoverage: { bias: 0, correlation: 0, count: 0, mae: 0 },
-                taskCompletion: { bias: 0, correlation: 0, count: 0, mae: 0 },
-            },
+            perDimension: {},
             totalObservations: 0,
         };
     }
@@ -90,28 +86,24 @@ export function validateGrader(grades, graderModel, options) {
         grader: g.graderScore,
         human: g.humanScore,
     }));
-    // Group by dimension
-    const byDimension = {
-        codeCorrectness: grades
-            .filter((g) => g.dimension === "codeCorrectness")
-            .map((g) => ({ grader: g.graderScore, human: g.humanScore })),
-        docCoverage: grades
-            .filter((g) => g.dimension === "docCoverage")
-            .map((g) => ({ grader: g.graderScore, human: g.humanScore })),
-        taskCompletion: grades
-            .filter((g) => g.dimension === "taskCompletion")
-            .map((g) => ({ grader: g.graderScore, human: g.humanScore })),
-    };
+    // Group by dimension dynamically
+    const byDimension = {};
+    for (const g of grades) {
+        ;
+        (byDimension[g.dimension] ??= []).push({
+            grader: g.graderScore,
+            human: g.humanScore,
+        });
+    }
     // Overall metrics
     const overallMae = computeMae(allPairs);
     const overallCorrelation = Math.round(pearsonCorrelation(allPairs.map((p) => p.grader), allPairs.map((p) => p.human)) * 100) / 100;
     const overallBias = computeBias(allPairs);
     // Per-dimension metrics
-    const perDimension = {
-        codeCorrectness: computeDimensionValidity(byDimension.codeCorrectness),
-        docCoverage: computeDimensionValidity(byDimension.docCoverage),
-        taskCompletion: computeDimensionValidity(byDimension.taskCompletion),
-    };
+    const perDimension = {};
+    for (const [dim, dimPairs] of Object.entries(byDimension)) {
+        perDimension[dim] = computeDimensionValidity(dimPairs);
+    }
     // Find largest disagreements
     const disagreements = grades
         .map((g) => ({

package/dist/pipeline/map-request-to-config.js CHANGED Viewed

@@ -1,3 +1,4 @@
+import { normalizeMode } from "./normalize-mode.js";
 /**
  * Map a PipelineRequest to a ResolvedConfig.
  *
@@ -16,13 +17,17 @@
  * with `publish: false`.
  */
 export function mapRequestToConfig(request, rootDir) {
+    // Normalize mode so downstream pipeline code only sees canonical names.
+    // The API may receive legacy names ("baseline", "full") from older clients.
+    const { mode, variant } = normalizeMode(request.mode ?? "full");
     // API-triggered evaluations (identified by jobId) default to publish: true.
     // Without this, the job's reportId is always null and GET /v1/reports/:id
     // has nothing to return.
     const publishDefault = !!request.jobId;
     return {
         rootDir,
-        mode: request.mode ?? "full",
+        mode,
+        variant,
         debug: mapDebug(request.debug),
         areas: request.areas,
         tasks: request.tasks,
@@ -63,6 +68,7 @@ export function mapRequestToConfig(request, rootDir) {
         jobId: request.jobId,
         remote: false,
         apiUrl: "https://ailf-api.sanity.build",
+        presets: request.presets,
     };
 }
 function mapDebug(debug) {

package/dist/pipeline/mirror-repo-tasks.d.ts CHANGED Viewed

@@ -13,12 +13,12 @@
  * @see docs/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
  */
 import type { SanityClient } from "@sanity/client";
-import { type Logger, type TaskDefinition } from "../_vendor/ailf-core/index.d.ts";
+import { type LiteracyTaskDefinition, type Logger } from "../_vendor/ailf-core/index.d.ts";
 export interface MirrorOptions {
     /** Sanity client with write access */
     client: SanityClient;
     /** Tasks to mirror (already loaded from repo) */
-    tasks: TaskDefinition[];
+    tasks: LiteracyTaskDefinition[];
     /** Git context for origin provenance */
     git: GitContext;
     /** If true, log what would be done without writing */
@@ -58,7 +58,7 @@ export interface MirrorResult {
     skipped: number;
     /** Feature areas auto-created */
     areasCreated: string[];
-    /** Canonical doc slugs that failed to resolve */
+    /** Context doc slugs that failed to resolve */
     unresolvedSlugs: string[];
     /** Errors (non-fatal — mirror continues) */
     errors: string[];
@@ -70,7 +70,7 @@ export interface MirrorResult {
  * 1. Compute deterministic document ID
  * 2. Compute content hash of the task definition
  * 3. Check if mirror document exists with same hash → skip if unchanged
- * 4. Resolve canonical doc slugs → Sanity references
+ * 4. Resolve context doc slugs → Sanity references
  * 5. Auto-create feature areas if needed
  * 6. Upsert the ailf.task document with origin block
  */
@@ -90,15 +90,15 @@ export declare function detectGitContext(repoTasksPath: string): Promise<GitCont
  */
 export declare function mirrorDocId(owner: string, repo: string, taskId: string): string;
 /**
- * Compute a content hash of a TaskDefinition for change detection.
+ * Compute a content hash of a LiteracyTaskDefinition for change detection.
  *
  * Includes all fields that affect the mirror document. Excludes
  * runtime metadata like referenceSolution (filesystem path) since
  * that's not mirrored.
  */
-export declare function computeTaskHash(task: TaskDefinition): string;
+export declare function computeTaskHash(task: LiteracyTaskDefinition): string;
 /** @internal Exported for testing — not part of the public API. */
-export declare function buildMirrorDocument(task: TaskDefinition, opts: {
+export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts: {
     contentHash: string;
     docId: string;
     /** Existing author from the current mirror document (write-once preservation) */
@@ -113,9 +113,9 @@ export declare function buildMirrorDocument(task: TaskDefinition, opts: {
     _id: string;
     _type: string;
     ownership: string;
-    status: "active" | "draft" | "paused" | "archived";
-    assert: Record<string, unknown>[];
-    canonicalDocs: ({
+    status: import("@sanity/ailf-core").TaskStatus;
+    assertions: Record<string, unknown>[];
+    contextDocs: ({
         _key: string;
         reason: string;
     } | {
@@ -138,9 +138,9 @@ export declare function buildMirrorDocument(task: TaskDefinition, opts: {
         _key: string;
         reason: string;
     })[];
-    description: string;
+    title: string;
     docCoverage: boolean;
-    featureArea: {
+    area: {
         _ref: string;
         _type: string;
     };
@@ -161,5 +161,5 @@ export declare function buildMirrorDocument(task: TaskDefinition, opts: {
         author: GitAuthor;
         lastEditor: GitAuthor;
     };
-    taskPrompt: string;
+    promptText: string;
 };

package/dist/pipeline/mirror-repo-tasks.js CHANGED Viewed

@@ -26,7 +26,7 @@ import { ConsoleLogger } from "../adapters/loggers/index.js";
  * 1. Compute deterministic document ID
  * 2. Compute content hash of the task definition
  * 3. Check if mirror document exists with same hash → skip if unchanged
- * 4. Resolve canonical doc slugs → Sanity references
+ * 4. Resolve context doc slugs → Sanity references
  * 5. Auto-create feature areas if needed
  * 6. Upsert the ailf.task document with origin block
  */
@@ -43,10 +43,10 @@ export async function mirrorRepoTasks(options) {
     };
     if (tasks.length === 0)
         return result;
-    // Batch-resolve all canonical doc slugs (slug refs only — other ref types
+    // Batch-resolve all context doc slugs (slug refs only — other ref types
     // are stored without a resolved article reference for now)
     const allSlugs = [
-        ...new Set(tasks.flatMap((t) => t.canonicalDocs.filter(isSlugRef).map((d) => d.slug))),
+        ...new Set(tasks.flatMap((t) => (t.context?.docs ?? []).filter(isSlugRef).map((d) => d.slug))),
     ];
     const slugToDocId = await batchResolveDocSlugs(client, allSlugs);
     // Track unresolved slugs
@@ -56,7 +56,7 @@ export async function mirrorRepoTasks(options) {
         }
     }
     // Ensure all feature areas exist
-    const areas = [...new Set(tasks.map((t) => t.featureArea))];
+    const areas = [...new Set(tasks.map((t) => t.area ?? ""))];
     const createdAreas = await ensureFeatureAreas(client, areas, dryRun, log);
     result.areasCreated = createdAreas;
     // Fetch existing mirror document state for change detection + ownership check
@@ -241,7 +241,7 @@ export function mirrorDocId(owner, repo, taskId) {
 // Content hashing
 // ---------------------------------------------------------------------------
 /**
- * Compute a content hash of a TaskDefinition for change detection.
+ * Compute a content hash of a LiteracyTaskDefinition for change detection.
  *
  * Includes all fields that affect the mirror document. Excludes
  * runtime metadata like referenceSolution (filesystem path) since
@@ -250,10 +250,10 @@ export function mirrorDocId(owner, repo, taskId) {
 export function computeTaskHash(task) {
     const payload = JSON.stringify({
         id: task.id,
-        description: task.description,
-        featureArea: task.featureArea,
-        taskPrompt: task.taskPrompt,
-        canonicalDocs: task.canonicalDocs,
+        title: task.title,
+        area: task.area,
+        prompt: task.prompt,
+        docs: task.context?.docs,
         docCoverage: task.docCoverage,
         assertions: task.assertions,
         baseline: task.baseline,
@@ -353,10 +353,10 @@ async function fetchExistingDocState(client, docIds) {
 /** @internal Exported for testing — not part of the public API. */
 export function buildMirrorDocument(task, opts) {
     const { contentHash, docId, existingAuthor, git, slugToDocId } = opts;
-    // Build canonical docs with resolved references and correct refType.
+    // Build context docs with resolved references and correct refType.
     // Each ref type gets the appropriate resolution fields set on the
     // mirror document so Studio can display them correctly.
-    const canonicalDocs = task.canonicalDocs.map((ref, i) => {
+    const contextDocs = (task.context?.docs ?? []).map((ref, i) => {
         const base = { _key: `cd${i}`, reason: ref.reason ?? "" };
         if (isSlugRef(ref)) {
             const resolvedId = slugToDocId.get(ref.slug);
@@ -395,7 +395,7 @@ export function buildMirrorDocument(task, opts) {
         return base;
     });
     // Build assertions
-    const assertArray = task.assertions.map((a, i) => {
+    const assertArray = (task.assertions ?? []).map((a, i) => {
         const entry = {
             _key: `a${i}`,
             type: a.type,
@@ -420,19 +420,20 @@ export function buildMirrorDocument(task, opts) {
         }
         return entry;
     });
-    // Determine the source file path (best-effort from task's featureArea)
-    const filePath = `.ailf/tasks/${task.featureArea}.yaml`;
+    // Determine the source file path (best-effort from task's area)
+    const area = task.area ?? "";
+    const filePath = `.ailf/tasks/${area}.yaml`;
     return {
         _id: docId,
         _type: "ailf.task",
         ownership: "repo",
         status: task.status ?? "active",
-        assert: assertArray,
-        canonicalDocs,
-        description: task.description,
-        docCoverage: task.docCoverage,
-        featureArea: {
-            _ref: `ailf.featureArea.${task.featureArea}`,
+        assertions: assertArray,
+        contextDocs,
+        title: task.title,
+        docCoverage: task.docCoverage ?? false,
+        area: {
+            _ref: `ailf.featureArea.${area}`,
             _type: "reference",
         },
         id: { _type: "slug", current: task.id },
@@ -451,7 +452,7 @@ export function buildMirrorDocument(task, opts) {
             author: existingAuthor ?? git.author,
             lastEditor: git.author,
         },
-        taskPrompt: task.taskPrompt,
+        promptText: task.prompt?.text ?? "",
         ...(task.baseline
             ? {
                 baseline: {

package/dist/pipeline/normalize-mode.d.ts ADDED Viewed

@@ -0,0 +1,49 @@
+/**
+ * CLI boundary normalization for evaluation mode names.
+ *
+ * Legacy CLI users pass variant names like "baseline" or "agentic" as the
+ * --mode flag. This module normalizes those to the canonical mode ("literacy")
+ * plus a variant field, so downstream pipeline code only ever sees canonical
+ * mode names.
+ */
+import { type EvalMode } from "../_vendor/ailf-shared/index.d.ts";
+/**
+ * Literacy variant name constants.
+ *
+ * Production code imports these instead of scattering legacy string literals.
+ * Defined here (alongside the normalizer) so all variant name definitions
+ * live in one file — the single source of truth for the legacy-to-canonical
+ * mapping.
+ */
+export declare const LiteracyVariant: {
+    /** Standard with-docs / without-docs evaluation (legacy mode name: "baseline") */
+    readonly STANDARD: "baseline";
+    /** Agentic evaluation — model uses tools to find docs */
+    readonly AGENTIC: "agentic";
+    /** Observed mode — HTTP-instrumented behavior observation */
+    readonly OBSERVED: "observed";
+    /** Full mode — standard + agentic combined */
+    readonly FULL: "full";
+};
+/** Union of all literacy variant string values */
+export type LiteracyVariantName = (typeof LiteracyVariant)[keyof typeof LiteracyVariant];
+/**
+ * The two literacy evaluation sub-modes that control entry generation.
+ * "standard" (baseline) generates gold + floor entries; "agentic" generates
+ * gold entries only.
+ */
+export type LiteracyEvalSubMode = typeof LiteracyVariant.STANDARD | typeof LiteracyVariant.AGENTIC;
+export interface NormalizedMode {
+    mode: EvalMode;
+    variant?: string;
+}
+/**
+ * Normalize a raw CLI mode string to a canonical mode + optional variant.
+ *
+ * Legacy names ("baseline", "agentic", "observed", "full") are mapped to
+ * `{ mode: "literacy", variant: "<name>" }` and emit a deprecation warning
+ * on stderr. Canonical names pass through unchanged.
+ *
+ * @throws {Error} If the input is not a recognized mode or variant name.
+ */
+export declare function normalizeMode(input: string): NormalizedMode;

package/dist/pipeline/normalize-mode.js ADDED Viewed

@@ -0,0 +1,64 @@
+/**
+ * CLI boundary normalization for evaluation mode names.
+ *
+ * Legacy CLI users pass variant names like "baseline" or "agentic" as the
+ * --mode flag. This module normalizes those to the canonical mode ("literacy")
+ * plus a variant field, so downstream pipeline code only ever sees canonical
+ * mode names.
+ */
+import { CANONICAL_EVAL_MODES, LEGACY_EVAL_MODE_ALIASES, } from "../_vendor/ailf-shared/index.js";
+// ---------------------------------------------------------------------------
+// Constants (derived from shared package — single source of truth)
+// ---------------------------------------------------------------------------
+/** The 5 canonical evaluation modes. */
+const CANONICAL_MODES = new Set(CANONICAL_EVAL_MODES);
+/**
+ * Literacy variant name constants.
+ *
+ * Production code imports these instead of scattering legacy string literals.
+ * Defined here (alongside the normalizer) so all variant name definitions
+ * live in one file — the single source of truth for the legacy-to-canonical
+ * mapping.
+ */
+export const LiteracyVariant = {
+    /** Standard with-docs / without-docs evaluation (legacy mode name: "baseline") */
+    STANDARD: "baseline",
+    /** Agentic evaluation — model uses tools to find docs */
+    AGENTIC: "agentic",
+    /** Observed mode — HTTP-instrumented behavior observation */
+    OBSERVED: "observed",
+    /** Full mode — standard + agentic combined */
+    FULL: "full",
+};
+/**
+ * Legacy CLI names that are really literacy variants, not distinct modes.
+ * Each maps to `mode: "literacy"` with the original name as the variant.
+ */
+const LEGACY_LITERACY_VARIANTS = new Set(LEGACY_EVAL_MODE_ALIASES);
+/** Union of all accepted input strings for error messages. */
+const ALL_ACCEPTED = [
+    ...Array.from(CANONICAL_MODES),
+    ...Array.from(LEGACY_LITERACY_VARIANTS),
+];
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+/**
+ * Normalize a raw CLI mode string to a canonical mode + optional variant.
+ *
+ * Legacy names ("baseline", "agentic", "observed", "full") are mapped to
+ * `{ mode: "literacy", variant: "<name>" }` and emit a deprecation warning
+ * on stderr. Canonical names pass through unchanged.
+ *
+ * @throws {Error} If the input is not a recognized mode or variant name.
+ */
+export function normalizeMode(input) {
+    if (LEGACY_LITERACY_VARIANTS.has(input)) {
+        console.warn(`⚠ Deprecated: --mode ${input} is a legacy alias. Use --mode literacy --variant ${input} instead.`);
+        return { mode: "literacy", variant: input };
+    }
+    if (CANONICAL_MODES.has(input)) {
+        return { mode: input };
+    }
+    throw new Error(`Unknown mode "${input}". Valid modes: ${ALL_ACCEPTED.join(", ")}`);
+}

package/dist/pipeline/plan.d.ts CHANGED Viewed

@@ -10,6 +10,7 @@
  * @see docs/exec-plans/execution-preview.md
  */
 import type { DebugOptions, EvalMode } from "./types.js";
+import { LiteracyVariant } from "./normalize-mode.js";
 /** Comparison plan for --compare flag. */
 export interface ComparisonPlan {
     /** Age of the baseline in human-readable form */
@@ -121,8 +122,8 @@ export interface StepPlan {
 export interface TaskPlan {
     /** Test description */
     description: string;
-    /** Whether this is a gold (with docs) or baseline (without docs) variant */
-    variant: "baseline" | "gold";
+    /** Whether this is a gold (with docs) or standard/baseline (without docs) variant */
+    variant: typeof LiteracyVariant.STANDARD | "gold";
 }
 /** Minimal options shape needed to build a pipeline execution plan. */
 export interface PlanOptions {
@@ -138,6 +139,8 @@ export interface PlanOptions {
     gapAnalysisEnabled: boolean;
     graderReplications?: number;
     mode: EvalMode;
+    /** Literacy variant when mode is "literacy" (baseline, agentic, observed, full) */
+    variant?: string;
     noCache: boolean;
     publishEnabled: boolean;
     readinessEnabled: boolean;