npm - @sanity/ailf - Versions diffs - 2.0.1 → 2.1.0 - Mend

@sanity/ailf 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (160) hide show

package/LICENSE +21 -0
package/dist/cli.js +0 -0
package/dist/orchestration/steps/run-eval-step.js +1 -1
package/dist/pipeline/checks.d.ts +8 -3
package/dist/pipeline/checks.js +23 -3
package/package.json +25 -25
package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
package/dist/_vendor/ailf-tasks/cli.js +0 -61
package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
package/dist/_vendor/ailf-tasks/index.js +0 -16
package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
package/dist/_vendor/ailf-tasks/parser.js +0 -73
package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
package/dist/_vendor/ailf-tasks/schemas.js +0 -180
package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
package/dist/_vendor/ailf-tasks/validation.js +0 -162
package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
package/dist/adapters/task-sources/yaml-task-source.js +0 -139
package/dist/agent-observer/test-imports.d.ts +0 -7
package/dist/agent-observer/test-imports.js +0 -185
package/dist/commands/update-quality-scores.d.ts +0 -5
package/dist/commands/update-quality-scores.js +0 -20
package/dist/lib/agent-behavior-report.d.ts +0 -8
package/dist/lib/agent-behavior-report.js +0 -185
package/dist/lib/baseline.d.ts +0 -19
package/dist/lib/baseline.js +0 -153
package/dist/lib/calculate-scores.d.ts +0 -23
package/dist/lib/calculate-scores.js +0 -42
package/dist/lib/compare.d.ts +0 -18
package/dist/lib/compare.js +0 -170
package/dist/lib/coverage-audit.d.ts +0 -4
package/dist/lib/coverage-audit.js +0 -42
package/dist/lib/discovery-report.d.ts +0 -13
package/dist/lib/discovery-report.js +0 -57
package/dist/lib/fetch-docs.d.ts +0 -30
package/dist/lib/fetch-docs.js +0 -171
package/dist/lib/generate-configs.d.ts +0 -25
package/dist/lib/generate-configs.js +0 -42
package/dist/lib/grader-api.d.ts +0 -21
package/dist/lib/grader-api.js +0 -34
package/dist/lib/grader-compare.d.ts +0 -19
package/dist/lib/grader-compare.js +0 -91
package/dist/lib/grader-consistency.d.ts +0 -27
package/dist/lib/grader-consistency.js +0 -79
package/dist/lib/grader-sensitivity.d.ts +0 -19
package/dist/lib/grader-sensitivity.js +0 -75
package/dist/lib/grader-validate.d.ts +0 -19
package/dist/lib/grader-validate.js +0 -78
package/dist/lib/measure-retrieval.d.ts +0 -14
package/dist/lib/measure-retrieval.js +0 -71
package/dist/lib/pr-comment.d.ts +0 -16
package/dist/lib/pr-comment.js +0 -28
package/dist/lib/readiness-report.d.ts +0 -13
package/dist/lib/readiness-report.js +0 -108
package/dist/lib/webhook-server.d.ts +0 -11
package/dist/lib/webhook-server.js +0 -24
package/dist/lib/weekly-digest.d.ts +0 -24
package/dist/lib/weekly-digest.js +0 -148
package/dist/orchestration/env-bridge.d.ts +0 -21
package/dist/orchestration/env-bridge.js +0 -66
package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
package/dist/pipeline/compiler/task-bridge.js +0 -92
package/dist/pipeline/expand-tasks.d.ts +0 -232
package/dist/pipeline/expand-tasks.js +0 -467
package/dist/pipeline/generate-configs.d.ts +0 -92
package/dist/pipeline/generate-configs.js +0 -445
package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
package/dist/pipeline/steps/calculate-scores-step.js +0 -89
package/dist/pipeline/steps/compare-step.d.ts +0 -18
package/dist/pipeline/steps/compare-step.js +0 -90
package/dist/pipeline/steps/eval-step.d.ts +0 -53
package/dist/pipeline/steps/eval-step.js +0 -347
package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
package/dist/pipeline/steps/fetch-docs-step.js +0 -84
package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
package/dist/pipeline/steps/generate-configs-step.js +0 -98
package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
package/dist/pipeline/steps/grader-consistency-step.js +0 -74
package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
package/dist/pipeline/steps/publish-report-step.js +0 -243
package/dist/pipeline/steps/report-step.d.ts +0 -13
package/dist/pipeline/steps/report-step.js +0 -56
package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
package/dist/pipeline/steps/update-scores-step.js +0 -42
package/dist/scripts/agent-behavior-report.d.ts +0 -19
package/dist/scripts/agent-behavior-report.js +0 -315
package/dist/scripts/baseline.d.ts +0 -43
package/dist/scripts/baseline.js +0 -267
package/dist/scripts/calculate-scores.d.ts +0 -166
package/dist/scripts/calculate-scores.js +0 -1296
package/dist/scripts/compare.d.ts +0 -22
package/dist/scripts/compare.js +0 -334
package/dist/scripts/coverage-audit.d.ts +0 -44
package/dist/scripts/coverage-audit.js +0 -209
package/dist/scripts/debug-eval.d.ts +0 -19
package/dist/scripts/debug-eval.js +0 -73
package/dist/scripts/discovery-report.d.ts +0 -58
package/dist/scripts/discovery-report.js +0 -250
package/dist/scripts/fetch-docs.d.ts +0 -35
package/dist/scripts/fetch-docs.js +0 -472
package/dist/scripts/generate-configs.d.ts +0 -66
package/dist/scripts/generate-configs.js +0 -459
package/dist/scripts/grader-api.d.ts +0 -27
package/dist/scripts/grader-api.js +0 -206
package/dist/scripts/grader-compare.d.ts +0 -22
package/dist/scripts/grader-compare.js +0 -368
package/dist/scripts/grader-consistency.d.ts +0 -20
package/dist/scripts/grader-consistency.js +0 -313
package/dist/scripts/grader-sensitivity.d.ts +0 -22
package/dist/scripts/grader-sensitivity.js +0 -354
package/dist/scripts/grader-validate.d.ts +0 -19
package/dist/scripts/grader-validate.js +0 -267
package/dist/scripts/measure-retrieval.d.ts +0 -10
package/dist/scripts/measure-retrieval.js +0 -145
package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
package/dist/scripts/pipeline.d.ts +0 -76
package/dist/scripts/pipeline.js +0 -1031
package/dist/scripts/pr-comment.d.ts +0 -10
package/dist/scripts/pr-comment.js +0 -510
package/dist/scripts/readiness-report.d.ts +0 -88
package/dist/scripts/readiness-report.js +0 -342
package/dist/scripts/update-quality-scores.d.ts +0 -15
package/dist/scripts/update-quality-scores.js +0 -184
package/dist/scripts/validate-task-sources.d.ts +0 -21
package/dist/scripts/validate-task-sources.js +0 -210
package/dist/scripts/validate.d.ts +0 -13
package/dist/scripts/validate.js +0 -79
package/dist/scripts/webhook-server.d.ts +0 -26
package/dist/scripts/webhook-server.js +0 -147
package/dist/scripts/weekly-digest.d.ts +0 -24
package/dist/scripts/weekly-digest.js +0 -144
package/dist/sinks/format-slack.d.ts +0 -64
package/dist/sinks/format-slack.js +0 -306
package/dist/sinks/slack-sink.d.ts +0 -27
package/dist/sinks/slack-sink.js +0 -78
package/dist/sinks/webhook-sink.d.ts +0 -19
package/dist/sinks/webhook-sink.js +0 -50
package/tasks/.expanded.agentic.yaml +0 -280
package/tasks/.expanded.yaml +0 -565

package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js DELETED Viewed

@@ -1,245 +0,0 @@
-/**
- * KnowledgeProbeModeHandler — compilation rules for `knowledge-probe` mode.
- *
- * The simplest mode handler. Knowledge probes measure raw model knowledge
- * without documentation context, tool calling, or sandboxed execution.
- * They answer: "What does this model know about X without any help?"
- *
- * Key properties:
- * - No doc vars injected (intentionally empty)
- * - Uses the without-docs prompt template (or custom prompt)
- * - Standard LLM providers only (no agent SDKs, no MCP)
- * - No retrieval metrics (precision/recall/F1 not applicable)
- * - Results feed into the standard cross-model comparison pipeline
- *
- * This handler is the reference implementation for the mode handler pattern.
- *
- * @see docs/exec-plans/architecture-overhaul/phase-5-knowledge-probe.md
- * @see packages/core/src/types/generalized-task.ts — KnowledgeProbeTaskDefinition
- */
-// ---------------------------------------------------------------------------
-// Canonical knowledge probe prompt templates
-// ---------------------------------------------------------------------------
-// Handler-owned prompt templates, keyed by template id. The single
-// "knowledge-probe" entry wraps the {{task}} variable in instructions that tell
-// the model to answer from existing knowledge, with no documentation injected.
-export const KNOWLEDGE_PROBE_PROMPT_TEMPLATES = {
-    "knowledge-probe": {
-        id: "knowledge-probe",
-        label: "Knowledge Probe (No Docs)",
-        template: `Answer the following question about Sanity.io based on your existing knowledge. Do not search for or reference external documentation.
-## Question
-{{task}}
-## Instructions
-1. Answer based solely on what you already know
-2. Be specific — include API names, function signatures, and code examples where relevant
-3. If you are unsure about a detail, say so rather than guessing
-4. Provide a complete, accurate answer
-Your answer:
-`,
-        variables: ["task"],
-    },
-};
-/**
- * Validate a knowledge probe task: requires `id`, `title`, and a question source. Returns `{ field, message }` errors (empty array when valid).
- */
-export function validateKnowledgeProbeTask(task) {
-    const errors = [];
-    if (!task.id) {
-        errors.push({ field: "id", message: "Task ID is required" });
-    }
-    if (!task.title) {
-        errors.push({ field: "title", message: "Task title is required" });
-    }
-    // The question may come from prompt.text, prompt.vars.task, or description — at least one must be set
-    if (!task.prompt?.text && !task.prompt?.vars?.task && !task.description) {
-        errors.push({
-            field: "prompt",
-            message: "Knowledge probe tasks require either prompt.text, prompt.vars.task, " +
-                "or description — the question to ask the model",
-        });
-    }
-    return errors;
-}
-// ---------------------------------------------------------------------------
-// Compilation
-// ---------------------------------------------------------------------------
-/**
- * Compile a knowledge probe task definition into Promptfoo configuration.
- *
- * Intentionally minimal — knowledge probes map almost 1:1 to basic Promptfoo
- * test cases. Validation failures do not abort: each is appended to `warnings`.
- * Returns `{ providers, tests, prompts, metadata, warnings }`.
- */
-export function compileKnowledgeProbeTask(task, options) {
-    const warnings = [];
-    // Validate — errors are downgraded to warnings rather than thrown
-    const validationErrors = validateKnowledgeProbeTask(task);
-    for (const err of validationErrors) {
-        warnings.push(`Knowledge probe "${task.id}": ${err.field} — ${err.message}`);
-    }
-    // Build providers from model list (empty when no models are supplied)
-    const providers = buildProviders(options);
-    // Build prompts — knowledge probes use a single no-docs prompt
-    const prompts = buildPrompts(task);
-    // Build test cases
-    const tests = buildTestCases(task, options, warnings);
-    // Build metadata
-    const metadata = {
-        mode: "knowledge-probe",
-        probeStrategy: task.probeStrategy ?? "breadth-first",
-        noDocContext: true,
-        retrievalMetrics: false,
-    };
-    return { providers, tests, prompts, metadata, warnings };
-}
-// ---------------------------------------------------------------------------
-// Provider assembly — copies id, label, and config from each options.models entry
-// ---------------------------------------------------------------------------
-function buildProviders(options) {
-    if (options?.models && options.models.length > 0) {
-        return options.models.map((model) => ({
-            id: model.id,
-            label: model.label,
-            config: model.config,
-        }));
-    }
-    // No models specified — return empty (caller should provide models)
-    return [];
-}
-// ---------------------------------------------------------------------------
-// Prompt assembly — a systemMessage, when present, is emitted as a [system] section ahead of [user]
-// ---------------------------------------------------------------------------
-function buildPrompts(task) {
-    // Knowledge probes use a single prompt — no with-docs/without-docs split.
-    // Fallback order: prompt.text, prompt.vars.task, description, then the title.
-    const promptText = task.prompt?.text ??
-        task.prompt?.vars?.task ??
-        task.description ??
-        `Knowledge probe: ${task.title}`;
-    const systemMessage = task.prompt?.systemMessage;
-    return [
-        {
-            id: "knowledge-probe",
-            label: `Probe: ${task.title}`,
-            raw: systemMessage
-                ? `[system]\n${systemMessage}\n\n[user]\n${String(promptText)}`
-                : String(promptText),
-        },
-    ];
-}
-// ---------------------------------------------------------------------------
-// Test case assembly
-// ---------------------------------------------------------------------------
-function buildTestCases(task, options, warnings) {
-    // Build assertions — the mapper returns null for assertions that must be skipped
-    const assertions = [];
-    if (task.assertions) {
-        for (const assertion of task.assertions) {
-            const raw = assertion;
-            const mapped = mapKnowledgeProbeAssertion(raw, options, warnings);
-            if (mapped)
-                assertions.push(mapped);
-        }
-    }
-    // Build vars — prompt.vars entries override the default `task`; the __ keys below override prompt.vars
-    const vars = {
-        task: task.prompt?.vars?.task ??
-            task.description ??
-            `Knowledge probe: ${task.title}`,
-        ...(task.prompt?.vars ?? {}),
-        // Metadata for scoring pipeline
-        __mode: "knowledge-probe",
-        __probeStrategy: task.probeStrategy ?? "breadth-first",
-    };
-    // Explicitly do NOT include docs — strips a `docs` var that may arrive via prompt.vars.
-    // This is the defining characteristic of knowledge-probe mode
-    delete vars.docs;
-    return [
-        {
-            description: `${task.id} — ${task.title}`,
-            vars,
-            ...(assertions.length > 0 ? { assert: assertions } : {}),
-        },
-    ];
-}
-// ---------------------------------------------------------------------------
-// Assertion mapping — note: unknown types pass through with type/value only (weight is not copied)
-// ---------------------------------------------------------------------------
-function mapKnowledgeProbeAssertion(assertion, options, warnings) {
-    switch (assertion.type) {
-        // Standard assertions — pass through type, value (if present), and numeric weight
-        case "contains":
-        case "contains-all":
-        case "contains-any":
-        case "equals":
-        case "is-json":
-        case "javascript":
-        case "python":
-        case "regex":
-        case "similar":
-            return {
-                type: assertion.type,
-                ...("value" in assertion ? { value: assertion.value } : {}),
-                ...(typeof assertion.weight === "number"
-                    ? { weight: assertion.weight }
-                    : {}),
-            };
-        // LLM-graded assertions — same fields, plus options.graderProvider as `provider` when set
-        case "g-eval":
-        case "llm-rubric":
-        case "model-graded-closedqa":
-        case "model-graded-factuality":
-            return {
-                type: assertion.type,
-                ...("value" in assertion ? { value: assertion.value } : {}),
-                ...(typeof assertion.weight === "number"
-                    ? { weight: assertion.weight }
-                    : {}),
-                ...(options?.graderProvider
-                    ? { provider: options.graderProvider }
-                    : {}),
-            };
-        // Tool-use assertions are NOT valid for knowledge probes — warn and drop (returns null)
-        case "skill-used":
-        case "tool-call-f1":
-        case "tool-called":
-        case "tool-input-matches":
-        case "tool-output-matches":
-            warnings.push(`Knowledge probe "${assertion.type}" assertion is not applicable — ` +
-                "knowledge probes don't use tools. Assertion skipped.");
-            return null;
-        default:
-            warnings.push(`Knowledge probe: unknown assertion type "${assertion.type}" — passed through`);
-            return {
-                type: assertion.type,
-                ...("value" in assertion ? { value: assertion.value } : {}),
-            };
-    }
-}
-// ---------------------------------------------------------------------------
-// ModeHandler adapter
-// ---------------------------------------------------------------------------
-/** ModeHandler-conformant export for the knowledge-probe evaluation mode; compileTask throws when task.mode is not "knowledge-probe". */
-export const handler = {
-    getPrompts() {
-        return KNOWLEDGE_PROBE_PROMPT_TEMPLATES;
-    },
-    compileTask(task, ctx) {
-        if (!("mode" in task) || task.mode !== "knowledge-probe") {
-            throw new Error(`Knowledge probe handler received task with mode "${task.mode ?? "undefined"}" — expected "knowledge-probe"`);
-        }
-        const result = compileKnowledgeProbeTask(task, { graderProvider: ctx.graderProvider, models: ctx.models });
-        return {
-            providers: result.providers,
-            tests: result.tests,
-            prompts: result.prompts,
-            warnings: result.warnings,
-            extras: { metadata: result.metadata },
-        };
-    },
-};

package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts DELETED Viewed

@@ -1,89 +0,0 @@
-/**
- * LiteracyModeHandler — compilation rules for `literacy` mode.
- *
- * This handler replaces the existing `generate-configs.ts` + `expand-tasks.ts`
- * code path for literacy (documentation) evaluation. It compiles
- * LiteracyTaskDefinition objects into Promptfoo structure:
- *
- * - Gold entry (with-docs prompt, canonical docs injected)
- * - Baseline entry (without-docs prompt, empty docs)
- * - Rubric template resolution from config/rubrics
- * - Doc-coverage auto-generation when opted in
- * - Structured dimension metadata on rubric assertions
- *
- * The handler accepts GeneralizedTaskDefinition, narrows to
- * LiteracyTaskDefinition, and produces Promptfoo output.
- *
- * @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
- * @see packages/eval/src/pipeline/expand-tasks.ts — the legacy code path
- */
-import type { LiteracyTaskDefinition, ModeHandler, PromptTemplate } from "../../../_vendor/ailf-core/index.d.ts";
-import { type LiteracyEvalSubMode } from "../../normalize-mode.js";
-import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../promptfoo-compiler.js";
-export declare const LITERACY_PROMPT_TEMPLATES: Record<string, PromptTemplate>;
-/** Options for compiling a literacy task (second argument of compileLiteracyTask); all fields are optional */
-export interface LiteracyCompileOptions {
-    /** Grader provider for LLM-graded assertions */
-    graderProvider?: string;
-    /** Root directory (for resolving file:// doc paths) */
-    rootDir?: string;
-    /** Evaluation sub-mode — controls which entries are generated */
-    evalMode?: LiteracyEvalSubMode;
-    /** Model providers to include — each needs an id and a label; config is optional */
-    models?: {
-        id: string;
-        label: string;
-        config?: Record<string, unknown>;
-    }[];
-    /** Rubric config (templates, weights, profiles) — loaded from rubrics config */
-    rubricConfig?: RubricConfig;
-}
-/** Minimal rubric config needed by the handler: a string-keyed map of rubric templates */
-export interface RubricConfig {
-    templates: Record<string, {
-        dimension?: string;
-        header: string;
-        scale: string[];
-        criteria_label?: string;
-    }>;
-}
-/** Result of compiling a single literacy task (return type of compileLiteracyTask) */
-export interface LiteracyCompileResult {
-    /** Promptfoo provider configs */
-    providers: PromptfooProvider[];
-    /** Compiled test cases (gold + optional baseline) */
-    tests: PromptfooTestCase[];
-    /** Prompts for evaluation */
-    prompts: PromptfooPrompt[];
-    /** Warnings generated during compilation */
-    warnings: string[];
-}
-export interface LiteracyValidationError {
-    field: string;
-    message: string;
-}
-/**
- * Validate a literacy task definition; returns a list of field/message errors.
- */
-export declare function validateLiteracyTask(task: LiteracyTaskDefinition): LiteracyValidationError[];
-/**
- * Compile a literacy task into Promptfoo configuration.
- *
- * Produces the same structure as the legacy expand-tasks.ts path:
- * - Gold entry with with-docs prompt and canonical doc context
- * - Baseline entry with without-docs prompt and empty docs
- * - Rubric assertions with structured dimension metadata
- */
-export declare function compileLiteracyTask(task: LiteracyTaskDefinition, options?: LiteracyCompileOptions): LiteracyCompileResult;
-/**
- * ModeHandler-conformant export for the literacy evaluation mode.
- *
- * The pipeline looks up this handler via `registry.getMode("literacy")`
- * and calls `handler.compileTask()`. The handler narrows the union to
- * LiteracyTaskDefinition and delegates to `compileLiteracyTask()`.
- *
- * Note: The literacy handler's `evalMode` variant ("baseline" vs "agentic")
- * is passed via `ctx.evalMode` — a literacy-specific extension of
- * CompilationContext. The pipeline sets this when compiling literacy tasks.
- */
-export declare const handler: ModeHandler;