npm - @sanity/ailf - Versions diffs - 0.5.0 → 2.0.0 - Mend

@sanity/ailf 0.5.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (377) hide show

package/dist/pipeline/compiler/task-graph-builder.js ADDED Viewed

@@ -0,0 +1,291 @@
+/**
+ * TaskGraphBuilder — converts task definitions into a TaskGraph IR.
+ *
+ * The builder is the first stage of the compilation pipeline:
+ *   GeneralizedTaskDefinitions → TaskGraphBuilder → TaskGraph → PromptfooCompiler → YAML
+ *
+ * Responsibilities:
+ * - Accept tasks from any source (TS, YAML, Content Lake)
+ * - Apply area/tag/mode filtering
+ * - Resolve inter-task dependencies into edges
+ * - Validate the graph is a DAG (reject cycles)
+ * - Assign execution priority via topological sort
+ *
+ * This module exists alongside `generate-configs.ts` — it does NOT replace
+ * the existing codegen path. Phase 7 will swap callers over to the compiler.
+ *
+ * @see packages/core/src/types/task-graph.ts — TaskGraph types
+ * @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
+ */
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+/**
+ * Build a TaskGraph from task definitions.
+ *
+ * 1. Filters tasks by area, tags, task IDs, and status
+ * 2. Creates TaskNodes with resolved variables
+ * 3. Discovers dependency edges from task metadata
+ * 4. Validates the graph is acyclic
+ * 5. Assigns topological priority
+ */
+export function buildTaskGraph(options) {
+    const warnings = [];
+    const filteredOut = [];
+    // Step 1: Filter tasks
+    const filtered = filterTasks(options.tasks, options.filter, filteredOut);
+    if (filtered.length === 0) {
+        return {
+            graph: null,
+            warnings: ["No tasks matched the filter criteria"],
+            filteredOut,
+        };
+    }
+    // Step 2: Create nodes
+    const nodes = new Map();
+    for (const task of filtered) {
+        const node = taskToNode(task);
+        if (nodes.has(node.taskId)) {
+            warnings.push(`Duplicate task ID "${node.taskId}" — later definition wins`);
+        }
+        nodes.set(node.taskId, node);
+    }
+    // Step 3: Discover edges from dependency metadata
+    const edges = discoverEdges(filtered, nodes, warnings);
+    // Step 4: Validate acyclicity
+    const cycleError = detectCycle(nodes, edges);
+    if (cycleError) {
+        throw new Error(`Task graph contains a cycle: ${cycleError.join(" → ")}. ` +
+            "Task graphs must be directed acyclic graphs (DAGs).");
+    }
+    // Step 5: Assign topological priority
+    assignPriority(nodes, edges);
+    // Step 6: Build fixture map (empty for now — Phase 2d fills this)
+    const fixtures = new Map();
+    const graph = {
+        compilationTarget: options.compilationTarget ?? "promptfoo",
+        edges,
+        fixtures,
+        nodes,
+    };
+    return { graph, warnings, filteredOut };
+}
+// ---------------------------------------------------------------------------
+// Filtering
+// ---------------------------------------------------------------------------
+function filterTasks(tasks, filter, filteredOut) {
+    return tasks.filter((task) => {
+        // Status filter — always applied (even without explicit filter options)
+        const status = task.status ?? "active";
+        const isTargetedById = filter?.taskIds && filter.taskIds.includes(task.id);
+        if (status === "archived") {
+            filteredOut.push(task.id);
+            return false;
+        }
+        if (status === "paused" && !isTargetedById) {
+            filteredOut.push(task.id);
+            return false;
+        }
+        if (status === "draft" && !isTargetedById && !filter?.includeDrafts) {
+            filteredOut.push(task.id);
+            return false;
+        }
+        // Remaining filters only apply when an explicit filter is provided
+        if (!filter)
+            return true;
+        // Area filter — GeneralizedTaskDefinition uses `area` (not `featureArea`)
+        const taskArea = task.area ?? "";
+        if (filter.areas &&
+            filter.areas.length > 0 &&
+            !filter.areas.map((a) => a.toLowerCase()).includes(taskArea.toLowerCase())) {
+            filteredOut.push(task.id);
+            return false;
+        }
+        // Task ID filter
+        if (filter.taskIds &&
+            filter.taskIds.length > 0 &&
+            !filter.taskIds.includes(task.id)) {
+            filteredOut.push(task.id);
+            return false;
+        }
+        // Tag filter
+        if (filter.tags &&
+            filter.tags.length > 0 &&
+            (!task.tags || !task.tags.some((t) => filter.tags.includes(t)))) {
+            filteredOut.push(task.id);
+            return false;
+        }
+        return true;
+    });
+}
+// ---------------------------------------------------------------------------
+// Node creation
+// ---------------------------------------------------------------------------
+function taskToNode(task) {
+    // GeneralizedTaskDefinition uses prompt.text/prompt.template instead of taskPrompt,
+    // and prompt.vars instead of extraVars
+    const promptText = task.prompt?.text ?? task.prompt?.template ?? "";
+    const promptVars = task.prompt?.vars ?? {};
+    const envelope = {
+        declarations: [],
+        provenance: {},
+        values: {
+            ...(promptText ? { task: promptText } : {}),
+            ...promptVars,
+        },
+    };
+    return {
+        dependsOn: [],
+        mode: task.mode,
+        priority: 0,
+        resolvedPrompt: promptText,
+        resolvedVariables: envelope,
+        taskId: task.id,
+    };
+}
+// ---------------------------------------------------------------------------
+// Edge discovery
+// ---------------------------------------------------------------------------
+/**
+ * Discover dependency edges from task metadata.
+ *
+ * Looks for explicit `dependsOn` arrays in prompt.vars (the generalized
+ * equivalent of the old extraVars convention).
+ * Future phases will add implicit deps from fixture sharing, data flow, etc.
+ */
+function discoverEdges(tasks, nodes, warnings) {
+    const edges = [];
+    for (const task of tasks) {
+        // Check for explicit dependencies in prompt.vars (was extraVars.dependsOn)
+        const deps = task.prompt?.vars?.dependsOn;
+        if (Array.isArray(deps)) {
+            for (const dep of deps) {
+                if (typeof dep !== "string")
+                    continue;
+                if (!nodes.has(dep)) {
+                    warnings.push(`Task "${task.id}" depends on "${dep}" which is not in the graph — ` +
+                        "dependency ignored (task may have been filtered out)");
+                    continue;
+                }
+                edges.push({ from: dep, to: task.id, type: "ordering" });
+                const node = nodes.get(task.id);
+                if (node && !node.dependsOn.includes(dep)) {
+                    node.dependsOn.push(dep);
+                }
+            }
+        }
+    }
+    return edges;
+}
+// ---------------------------------------------------------------------------
+// Cycle detection — Kahn's algorithm (topological sort)
+// ---------------------------------------------------------------------------
+/**
+ * Detect cycles in the task graph using Kahn's algorithm.
+ *
+ * @returns null if acyclic, or the cycle path as a string array
+ */
+export function detectCycle(nodes, edges) {
+    // Build in-degree map
+    const inDegree = new Map();
+    const adjacency = new Map();
+    for (const id of nodes.keys()) {
+        inDegree.set(id, 0);
+        adjacency.set(id, []);
+    }
+    for (const edge of edges) {
+        adjacency.get(edge.from).push(edge.to);
+        inDegree.set(edge.to, (inDegree.get(edge.to) ?? 0) + 1);
+    }
+    // Start with all zero-in-degree nodes
+    const queue = [];
+    for (const [id, deg] of inDegree) {
+        if (deg === 0)
+            queue.push(id);
+    }
+    let visited = 0;
+    while (queue.length > 0) {
+        const current = queue.shift();
+        visited++;
+        for (const neighbor of adjacency.get(current) ?? []) {
+            const newDeg = (inDegree.get(neighbor) ?? 1) - 1;
+            inDegree.set(neighbor, newDeg);
+            if (newDeg === 0)
+                queue.push(neighbor);
+        }
+    }
+    if (visited === nodes.size)
+        return null;
+    // Find cycle participants (nodes with remaining in-degree > 0)
+    const cycleNodes = [...inDegree.entries()]
+        .filter(([, deg]) => deg > 0)
+        .map(([id]) => id);
+    // Reconstruct a cycle path for the error message
+    return reconstructCyclePath(cycleNodes, adjacency);
+}
+/**
+ * Reconstruct a human-readable cycle path from cycle participants.
+ */
+function reconstructCyclePath(cycleNodes, adjacency) {
+    if (cycleNodes.length === 0)
+        return [];
+    const inCycle = new Set(cycleNodes);
+    const start = cycleNodes[0];
+    const path = [start];
+    const visited = new Set();
+    let current = start;
+    // Follow edges within the cycle to produce a readable path
+    while (true) {
+        visited.add(current);
+        const next = (adjacency.get(current) ?? []).find((n) => inCycle.has(n) && (!visited.has(n) || n === start));
+        if (!next)
+            break;
+        path.push(next);
+        if (next === start)
+            break; // Completed the cycle
+        current = next;
+    }
+    return path;
+}
+// ---------------------------------------------------------------------------
+// Topological priority assignment
+// ---------------------------------------------------------------------------
+/**
+ * Assign execution priority via topological order.
+ * Lower priority = earlier execution.
+ */
+function assignPriority(nodes, edges) {
+    const inDegree = new Map();
+    const adjacency = new Map();
+    for (const id of nodes.keys()) {
+        inDegree.set(id, 0);
+        adjacency.set(id, []);
+    }
+    for (const edge of edges) {
+        adjacency.get(edge.from).push(edge.to);
+        inDegree.set(edge.to, (inDegree.get(edge.to) ?? 0) + 1);
+    }
+    const queue = [];
+    for (const [id, deg] of inDegree) {
+        if (deg === 0)
+            queue.push(id);
+    }
+    let priority = 0;
+    while (queue.length > 0) {
+        // Process all nodes at the current level (same priority)
+        const levelSize = queue.length;
+        for (let i = 0; i < levelSize; i++) {
+            const current = queue.shift();
+            const node = nodes.get(current);
+            if (node)
+                node.priority = priority;
+            for (const neighbor of adjacency.get(current) ?? []) {
+                const newDeg = (inDegree.get(neighbor) ?? 1) - 1;
+                inDegree.set(neighbor, newDeg);
+                if (newDeg === 0)
+                    queue.push(neighbor);
+            }
+        }
+        priority++;
+    }
+}

package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts ADDED Viewed

@@ -0,0 +1,90 @@
+/**
+ * Cost tracking — model pricing, pre-run estimation, and post-run actuals.
+ *
+ * Uses a pricing table (YAML config or TS `definePricingTable()`) to compute
+ * USD cost from token usage. Supports budget controls with warn/stop thresholds.
+ *
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
+ */
+import type { TraceTokenUsage } from "../../../_vendor/ailf-core/index.d.ts";
+/**
+ * Per-model pricing. Every rate is expressed in USD per 1M tokens.
+ */
+export interface ModelPricing {
+    /** Input tokens cost per 1M tokens */
+    input: number;
+    /** Output tokens cost per 1M tokens */
+    output: number;
+    /**
+     * Cached input tokens cost per 1M tokens (optional).
+     * When omitted, `computeCost` bills cached tokens at the `input` rate.
+     */
+    cachedInput?: number;
+}
+/**
+ * Budget control thresholds (in USD).
+ *
+ * `perRun` and `perTask` are independent and both optional; `checkBudget`
+ * picks one of them through its `level` argument, while `estimateRunCost`
+ * only consults `perRun`. Within a level, reaching `warn` produces a warning
+ * but lets execution proceed, and reaching `stop` tells the caller not to
+ * proceed. Both comparisons are inclusive (`>=`).
+ */
+export interface BudgetConfig {
+    perRun?: {
+        warn: number;
+        stop: number;
+    };
+    perTask?: {
+        warn: number;
+        stop: number;
+    };
+}
+/** Cost estimate for a pipeline run, as returned by `estimateRunCost` */
+export interface CostEstimate {
+    /** Estimated total cost in USD (sum of the `perModel` estimates) */
+    totalUSD: number;
+    /**
+     * Per-model breakdown. A model for which no pricing could be found is
+     * listed with `estimatedUSD: 0`.
+     */
+    perModel: {
+        modelId: string;
+        estimatedUSD: number;
+    }[];
+    /**
+     * Whether the estimate reaches the per-run warn threshold (`>=`).
+     * Always false when no `perRun` budget is configured.
+     */
+    exceedsWarning: boolean;
+    /**
+     * Whether the estimate reaches the per-run stop threshold (`>=`).
+     * Always false when no `perRun` budget is configured.
+     */
+    exceedsStop: boolean;
+}
+/**
+ * Actual cost computed from real token usage.
+ *
+ * NOTE(review): none of the functions declared in this file returns this
+ * type; presumably callers assemble it from `computeCost` results — confirm.
+ */
+export interface ActualCost {
+    /** Actual total cost in USD */
+    totalUSD: number;
+    /** Per-model actual cost, together with the token usage it was derived from */
+    perModel: {
+        modelId: string;
+        actualUSD: number;
+        tokens: TraceTokenUsage;
+    }[];
+}
+/** Budget check result, as returned by `checkBudget` */
+export interface BudgetCheckResult {
+    /** Whether to proceed; false only when the stop threshold has been reached */
+    proceed: boolean;
+    /** Warning message; set when the warn or the stop threshold has been reached */
+    warning?: string;
+    /** Current spend in USD (echoes the value that was checked) */
+    currentUSD: number;
+    /**
+     * The threshold that was reached: the stop limit when `proceed` is false,
+     * otherwise the warn limit. Absent when no threshold was reached or no
+     * budget is configured for the checked level.
+     */
+    limitUSD?: number;
+}
+/**
+ * Compute actual cost from token usage and model pricing.
+ *
+ * Prompt tokens are split into a cached and an uncached share; the cached
+ * share is billed at `pricing.cachedInput` (or at `pricing.input` when no
+ * cached rate is given) and completion tokens at `pricing.output`.
+ *
+ * NOTE(review): the implementation takes the cached-token count from
+ * `usage.toolTokens` — confirm that this is the intended field.
+ *
+ * @param usage - Token counts from provider response
+ * @param pricing - Per-model pricing (USD per 1M tokens)
+ * @returns Cost in USD
+ */
+export declare function computeCost(usage: TraceTokenUsage, pricing: ModelPricing): number;
+/**
+ * Look up pricing for a model ID.
+ *
+ * Tries exact match first, then falls back to prefix matching
+ * (e.g., "openai:chat:gpt-4o-2024-11-20" matches "openai:chat:gpt-4o").
+ * At each stage `customPricing` is consulted before the built-in defaults:
+ * custom exact, default exact, custom prefix, default prefix.
+ *
+ * @param modelId - Model identifier to price
+ * @param customPricing - Optional table that takes precedence over the defaults
+ * @returns The matching pricing, or undefined when nothing matches
+ */
+export declare function lookupPricing(modelId: string, customPricing?: Record<string, ModelPricing>): ModelPricing | undefined;
+/**
+ * Estimate cost for a pipeline run before execution.
+ *
+ * Assumes a flat average of 2000 prompt tokens and 1500 completion tokens per
+ * task for every model (task complexity is not taken into account) and prices
+ * them via `lookupPricing`. Models without known pricing contribute 0 USD.
+ *
+ * @param taskCount - Number of tasks in the run
+ * @param modelIds - Models every task will be evaluated against
+ * @param budget - Optional budget; only `perRun` is used for the exceeds flags
+ * @param customPricing - Optional pricing table forwarded to `lookupPricing`
+ * @returns Total and per-model estimates plus the budget flags
+ */
+export declare function estimateRunCost(taskCount: number, modelIds: string[], budget?: BudgetConfig, customPricing?: Record<string, ModelPricing>): CostEstimate;
+/**
+ * Check if current spend exceeds budget thresholds.
+ *
+ * The stop threshold is tested before the warn threshold and both
+ * comparisons are inclusive (`>=`). When `budget` has no entry for `level`
+ * the check always passes.
+ *
+ * @param currentUSD - Spend so far, in USD
+ * @param budget - Budget configuration
+ * @param level - Which set of thresholds in `budget` to apply
+ * @returns Whether to proceed, plus the warning and the limit that was reached, if any
+ */
+export declare function checkBudget(currentUSD: number, budget: BudgetConfig, level: "perRun" | "perTask"): BudgetCheckResult;

package/dist/pipeline/compiler/telemetry/cost-tracker.js ADDED Viewed

@@ -0,0 +1,146 @@
+/**
+ * Cost tracking — model pricing, pre-run estimation, and post-run actuals.
+ *
+ * Uses a pricing table (YAML config or TS `definePricingTable()`) to compute
+ * USD cost from token usage. Supports budget controls with warn/stop thresholds.
+ *
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
+ */
+// ---------------------------------------------------------------------------
+// Pricing table
+// ---------------------------------------------------------------------------
+/**
+ * Default pricing table (updated periodically).
+ *
+ * Keys are model IDs; `lookupPricing` matches them exactly and also as
+ * prefixes of longer model IDs. Values are rates in USD per 1M tokens
+ * (`computeCost` divides by 1,000,000).
+ */
+const DEFAULT_PRICING = {
+    "anthropic:messages:claude-opus-4-6": {
+        input: 15.0,
+        output: 75.0,
+        cachedInput: 1.5,
+    },
+    "anthropic:messages:claude-sonnet-4-6": {
+        input: 3.0,
+        output: 15.0,
+        cachedInput: 0.3,
+    },
+    "openai:chat:gpt-4.1": {
+        input: 2.0,
+        output: 8.0,
+        cachedInput: 0.5,
+    },
+    "openai:chat:gpt-4.1-mini": {
+        input: 0.4,
+        output: 1.6,
+        cachedInput: 0.1,
+    },
+    "openai:chat:gpt-4o": {
+        input: 2.5,
+        output: 10.0,
+        cachedInput: 1.25,
+    },
+    "openai:chat:gpt-5": {
+        input: 5.0,
+        output: 15.0,
+        cachedInput: 1.0,
+    },
+};
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+/**
+ * Compute actual cost from token usage and model pricing.
+ *
+ * @param usage - Token counts from provider response
+ * @param pricing - Per-model pricing (USD per 1M tokens)
+ * @returns Cost in USD
+ */
+export function computeCost(usage, pricing) {
+    const cached = usage.toolTokens ?? 0;
+    const uncachedPrompt = usage.promptTokens - cached;
+    const inputCost = (uncachedPrompt * pricing.input) / 1_000_000;
+    const cachedCost = pricing.cachedInput !== undefined
+        ? (cached * pricing.cachedInput) / 1_000_000
+        : (cached * pricing.input) / 1_000_000;
+    const outputCost = (usage.completionTokens * pricing.output) / 1_000_000;
+    return inputCost + cachedCost + outputCost;
+}
+/**
+ * Look up pricing for a model ID.
+ *
+ * Tries exact match first, then falls back to prefix matching
+ * (e.g., "openai:chat:gpt-4o-2024-11-20" matches "openai:chat:gpt-4o").
+ */
+export function lookupPricing(modelId, customPricing) {
+    // 1. Exact match in custom pricing
+    if (customPricing?.[modelId])
+        return customPricing[modelId];
+    // 2. Exact match in defaults
+    if (DEFAULT_PRICING[modelId])
+        return DEFAULT_PRICING[modelId];
+    // 3. Prefix match in custom pricing
+    if (customPricing) {
+        for (const [key, pricing] of Object.entries(customPricing)) {
+            if (modelId.startsWith(key))
+                return pricing;
+        }
+    }
+    // 4. Prefix match in defaults
+    for (const [key, pricing] of Object.entries(DEFAULT_PRICING)) {
+        if (modelId.startsWith(key))
+            return pricing;
+    }
+    return undefined;
+}
+/**
+ * Estimate cost for a pipeline run before execution.
+ *
+ * Uses task count, estimated tokens per task complexity, and model pricing.
+ */
+export function estimateRunCost(taskCount, modelIds, budget, customPricing) {
+    // Rough token estimates per task (empirical averages)
+    const AVG_PROMPT_TOKENS = 2000;
+    const AVG_COMPLETION_TOKENS = 1500;
+    const perModel = modelIds.map((modelId) => {
+        const pricing = lookupPricing(modelId, customPricing);
+        if (!pricing) {
+            return { modelId, estimatedUSD: 0 };
+        }
+        const estimatedUSD = computeCost({
+            promptTokens: AVG_PROMPT_TOKENS * taskCount,
+            completionTokens: AVG_COMPLETION_TOKENS * taskCount,
+            totalTokens: (AVG_PROMPT_TOKENS + AVG_COMPLETION_TOKENS) * taskCount,
+        }, pricing);
+        return { modelId, estimatedUSD };
+    });
+    const totalUSD = perModel.reduce((sum, m) => sum + m.estimatedUSD, 0);
+    return {
+        totalUSD,
+        perModel,
+        exceedsWarning: budget?.perRun ? totalUSD >= budget.perRun.warn : false,
+        exceedsStop: budget?.perRun ? totalUSD >= budget.perRun.stop : false,
+    };
+}
+/**
+ * Check if current spend exceeds budget thresholds.
+ */
+export function checkBudget(currentUSD, budget, level) {
+    const limits = budget[level];
+    if (!limits) {
+        return { proceed: true, currentUSD };
+    }
+    if (currentUSD >= limits.stop) {
+        return {
+            proceed: false,
+            warning: `Budget exceeded: $${currentUSD.toFixed(4)} >= $${limits.stop} (${level} stop limit)`,
+            currentUSD,
+            limitUSD: limits.stop,
+        };
+    }
+    if (currentUSD >= limits.warn) {
+        return {
+            proceed: true,
+            warning: `Budget warning: $${currentUSD.toFixed(4)} >= $${limits.warn} (${level} warn threshold)`,
+            currentUSD,
+            limitUSD: limits.warn,
+        };
+    }
+    return { proceed: true, currentUSD };
+}

package/dist/pipeline/compiler/telemetry/index.d.ts ADDED Viewed

@@ -0,0 +1,14 @@
+/**
+ * Telemetry — observability infrastructure for evaluation traces.
+ *
+ * Captures tool calls, token usage, cost, and timing for every evaluation.
+ * Full traces go to blob storage; sanitized summaries to Content Lake.
+ *
+ * @see docs/exec-plans/architecture-overhaul/phase-6-observability.md
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
+ */
+export { collectTrace, mergeTraces, type ProviderResponse, type RawToolCall, type TraceCollectorOptions, } from "./trace-collector.js";
+export { classifyToolCall, classifyToolCalls } from "./tool-classifier.js";
+export { checkBudget, computeCost, estimateRunCost, lookupPricing, type ActualCost, type BudgetCheckResult, type BudgetConfig, type CostEstimate, type ModelPricing, } from "./cost-tracker.js";
+export { extractTraceSummary, LocalTraceStore, type TraceSummary, type TraceStore, type TraceStoreResult, } from "./trace-store.js";
+export { createRedactionConfig, DEFAULT_REDACTION_RULES, redactTrace, type RedactionConfig, type RedactionResult, type RedactionRule, } from "./redactor.js";

package/dist/pipeline/compiler/telemetry/index.js ADDED Viewed

@@ -0,0 +1,19 @@
+/**
+ * Telemetry — observability infrastructure for evaluation traces.
+ *
+ * Captures tool calls, token usage, cost, and timing for every evaluation.
+ * Full traces go to blob storage; sanitized summaries to Content Lake.
+ *
+ * @see docs/exec-plans/architecture-overhaul/phase-6-observability.md
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
+ */
+// Trace collection
+export { collectTrace, mergeTraces, } from "./trace-collector.js";
+// Tool call classification
+export { classifyToolCall, classifyToolCalls } from "./tool-classifier.js";
+// Cost tracking
+export { checkBudget, computeCost, estimateRunCost, lookupPricing, } from "./cost-tracker.js";
+// Trace storage
+export { extractTraceSummary, LocalTraceStore, } from "./trace-store.js";
+// Redaction
+export { createRedactionConfig, DEFAULT_REDACTION_RULES, redactTrace, } from "./redactor.js";

package/dist/pipeline/compiler/telemetry/redactor.d.ts ADDED Viewed

@@ -0,0 +1,58 @@
+/**
+ * Redaction pipeline — strips sensitive data from traces before storage.
+ *
+ * Applied before ANY storage (both blob and Content Lake). Configurable
+ * patterns handle Bearer tokens, API keys, Sanity tokens, and other
+ * common secret formats.
+ *
+ * Principles:
+ * 1. Redact before store — sensitive data never reaches storage
+ * 2. Configurable patterns — teams can add project-specific rules
+ * 3. Truncation for cost — large outputs truncated to max bytes
+ * 4. No PII by default — tasks shouldn't contain PII, this is a safety net
+ *
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
+ */
+import type { EvalTrace } from "../../../_vendor/ailf-core/index.d.ts";
+/** A single redaction rule: a regex and the text that replaces each match */
+export interface RedactionRule {
+    /** Rule name (for logging) */
+    name: string;
+    /**
+     * Regex pattern to match.
+     * NOTE(review): presumably needs the `g` flag to redact every occurrence
+     * in a string — confirm against the implementation in redactor.js.
+     */
+    pattern: RegExp;
+    /** Replacement string (use $1, $2 for capture groups) */
+    replacement: string;
+}
+/** Redaction configuration */
+export interface RedactionConfig {
+    /** Regex-based substitution rules */
+    rules: RedactionRule[];
+    /**
+     * Fields to omit entirely from stored traces.
+     * NOTE(review): whether entries are plain key names or dotted paths is not
+     * visible from this declaration — confirm against redactor.js.
+     */
+    omitFields: string[];
+    /** Maximum tool call output size in bytes; larger outputs are truncated */
+    maxOutputBytes: number;
+}
+/** Result of redaction, as returned by `redactTrace` */
+export interface RedactionResult {
+    /** Redacted trace (a new object; the input trace is not mutated) */
+    trace: EvalTrace;
+    /** Number of redactions applied */
+    redactionCount: number;
+    /**
+     * Which rules fired.
+     * NOTE(review): presumably the `name` of each RedactionRule that matched — confirm.
+     */
+    rulesApplied: string[];
+}
+/**
+ * Built-in redaction rules for common secret patterns (per this module's
+ * header: Bearer tokens, API keys, Sanity tokens, and other common formats).
+ */
+export declare const DEFAULT_REDACTION_RULES: RedactionRule[];
+/**
+ * Create a default redaction config.
+ *
+ * NOTE(review): whether `overrides.rules` replaces or extends
+ * DEFAULT_REDACTION_RULES is not visible from this declaration — confirm
+ * against redactor.js before relying on either behavior.
+ *
+ * @param overrides - Custom rules or settings to merge
+ * @returns A complete RedactionConfig
+ */
+export declare function createRedactionConfig(overrides?: Partial<RedactionConfig>): RedactionConfig;
+/**
+ * Apply redaction to an evaluation trace.
+ *
+ * Processes tool call inputs and outputs, event data, and search terms.
+ * Returns a new trace (does not mutate the original).
+ *
+ * @param trace - Trace to redact
+ * @param config - Optional redaction config.
+ *   NOTE(review): presumably defaults to `createRedactionConfig()` when
+ *   omitted — confirm against redactor.js.
+ * @returns The redacted trace together with the redaction count and the rules that fired
+ */
+export declare function redactTrace(trace: EvalTrace, config?: RedactionConfig): RedactionResult;