npm - @sanity/ailf - Versions diffs - 0.5.0 → 2.0.0 - Mend

@sanity/ailf 0.5.0 → 2.0.0

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (377)

package/dist/pipeline/compiler/__tests__/telemetry.test.js ADDED Viewed

@@ -0,0 +1,503 @@
+/**
+ * telemetry.test.ts — Tests for the observability & telemetry subsystem.
+ *
+ * Covers tool call classification, trace collection, cost tracking,
+ * redaction pipeline, trace storage, and per-turn trace merging.
+ *
+ * Run: npx tsx --test src/pipeline/compiler/__tests__/telemetry.test.ts
+ */
+import assert from "node:assert/strict";
+import { existsSync, rmSync } from "node:fs";
+import { afterEach, describe, it } from "node:test";
+import { tmpdir } from "os";
+import { resolve } from "path";
+import { classifyToolCall, classifyToolCalls, } from "../telemetry/tool-classifier.js";
+import { collectTrace, mergeTraces } from "../telemetry/trace-collector.js";
+import { checkBudget, computeCost, estimateRunCost, lookupPricing, } from "../telemetry/cost-tracker.js";
+import { redactTrace } from "../telemetry/redactor.js";
+import { extractTraceSummary, LocalTraceStore, } from "../telemetry/trace-store.js";
+// ---------------------------------------------------------------------------
+// Tool call classification
+// ---------------------------------------------------------------------------
+describe("classifyToolCall", () => { // Single-name classification: exact names, name heuristics, caller overrides, and the default category.
+    it("classifies known tools by exact name", () => {
+        assert.equal(classifyToolCall("WebSearch"), "search");
+        assert.equal(classifyToolCall("Read"), "read");
+        assert.equal(classifyToolCall("Write"), "write");
+        assert.equal(classifyToolCall("Bash"), "execute");
+        assert.equal(classifyToolCall("Browser.navigate"), "navigate");
+        assert.equal(classifyToolCall("AskUser"), "communicate");
+    });
+    it("uses heuristic for unknown tools", () => { // presumably a case-insensitive keyword match on the name — confirm in tool-classifier
+        assert.equal(classifyToolCall("custom_search_tool"), "search");
+        assert.equal(classifyToolCall("ReadFromDB"), "read");
+        assert.equal(classifyToolCall("writeConfig"), "write");
+        assert.equal(classifyToolCall("executeScript"), "execute");
+    });
+    it("uses custom mappings over defaults", () => {
+        assert.equal(classifyToolCall("MyTool", { MyTool: "communicate" }), "communicate"); // second argument is a name -> category map supplied by the caller
+    });
+    it("falls back to execute for truly unknown tools", () => {
+        assert.equal(classifyToolCall("zzz_unknown_zzz"), "execute"); // "execute" is the expected category when nothing else matches
+    });
+});
+describe("classifyToolCalls", () => { // Batch variant: returns { categories, unrecognized } for a list of tool names.
+    it("classifies a batch and reports unrecognized names", () => {
+        const { categories, unrecognized } = classifyToolCalls([
+            "WebSearch",
+            "Read",
+            "zzz_mystery_tool",
+        ]);
+        assert.equal(categories.length, 3); // one category per input name, including the unrecognized one
+        assert.equal(categories[0], "search"); // categories are expected in input order
+        assert.equal(categories[1], "read"); // NOTE(review): categories[2] (the unrecognized name) is never asserted
+        assert.ok(unrecognized.includes("zzz_mystery_tool"));
+    });
+});
+// ---------------------------------------------------------------------------
+// Trace collection
+// ---------------------------------------------------------------------------
+describe("collectTrace", () => { // Builds a trace from a response object plus run/task/model identifiers.
+    const baseOptions = { // identifiers shared by every case in this suite
+        runId: "run-1",
+        taskId: "task-1",
+        testCaseIndex: 0,
+        modelId: "openai:chat:gpt-4o",
+    };
+    it("creates a trace from an empty response", () => {
+        const trace = collectTrace({}, baseOptions); // no metadata, tokenUsage, or latencyMs supplied
+        assert.equal(trace.runId, "run-1");
+        assert.equal(trace.taskId, "task-1");
+        assert.equal(trace.modelId, "openai:chat:gpt-4o");
+        assert.equal(trace.toolCalls.length, 0);
+        assert.equal(trace.tokensUsed.totalTokens, 0); // token total is expected to be 0 when tokenUsage is absent
+    });
+    it("extracts tool calls from metadata", () => {
+        const trace = collectTrace({
+            metadata: {
+                toolCalls: [
+                    { name: "WebSearch", input: { query: "GROQ" }, durationMs: 100 },
+                    { name: "Read", input: { path: "/docs/groq.md" }, durationMs: 50 },
+                ],
+            },
+        }, baseOptions);
+        assert.equal(trace.toolCalls.length, 2);
+        assert.equal(trace.toolCalls[0].name, "WebSearch");
+        assert.equal(trace.toolCalls[0].category, "search"); // category is not part of the input; the trace is expected to add it
+        assert.equal(trace.toolCalls[1].name, "Read");
+        assert.equal(trace.toolCalls[1].category, "read");
+    });
+    it("extracts token usage", () => {
+        const trace = collectTrace({ tokenUsage: { prompt: 1000, completion: 500, total: 1500 } }, baseOptions);
+        assert.equal(trace.tokensUsed.promptTokens, 1000); // response keys prompt/completion/total surface as promptTokens/completionTokens/totalTokens
+        assert.equal(trace.tokensUsed.completionTokens, 500);
+        assert.equal(trace.tokensUsed.totalTokens, 1500);
+    });
+    it("extracts URLs from tool calls", () => {
+        const trace = collectTrace({
+            metadata: {
+                toolCalls: [
+                    { name: "WebFetch", input: { url: "https://sanity.io/docs" } },
+                ],
+            },
+        }, baseOptions);
+        assert.ok(trace.urlsVisited.includes("https://sanity.io/docs")); // input.url is expected to show up in urlsVisited
+    });
+    it("extracts search terms", () => {
+        const trace = collectTrace({
+            metadata: {
+                toolCalls: [
+                    { name: "WebSearch", input: { query: "GROQ projection" } },
+                ],
+            },
+        }, baseOptions);
+        assert.ok(trace.searchTerms.includes("GROQ projection")); // input.query is expected to show up in searchTerms
+    });
+    it("extracts files read and written", () => {
+        const trace = collectTrace({
+            metadata: {
+                toolCalls: [
+                    { name: "Read", input: { path: "/src/schema.ts" } },
+                    { name: "Write", input: { path: "/src/config.ts" } },
+                ],
+            },
+        }, baseOptions);
+        assert.ok(trace.filesRead.includes("/src/schema.ts")); // path of the Read call
+        assert.ok(trace.filesWritten.includes("/src/config.ts")); // path of the Write call
+    });
+    it("creates event log from tool calls", () => {
+        const trace = collectTrace({
+            metadata: {
+                toolCalls: [
+                    { name: "WebSearch", input: { query: "test" }, durationMs: 100 },
+                ],
+            },
+            latencyMs: 500,
+        }, baseOptions);
+        // Should have: llm_request, tool_call_start, tool_call_end, llm_response
+        assert.equal(trace.events.length, 4); // one start/end pair for the single tool call, bracketed by the request/response events
+        assert.equal(trace.events[0].type, "llm_request");
+        assert.equal(trace.events[1].type, "tool_call_start");
+        assert.equal(trace.events[2].type, "tool_call_end");
+        assert.equal(trace.events[3].type, "llm_response");
+    });
+    it("builds a root span", () => {
+        const trace = collectTrace({ latencyMs: 1000 }, baseOptions);
+        assert.equal(trace.spans.length, 1);
+        assert.equal(trace.spans[0].operation, "test-case");
+        assert.equal(trace.spans[0].parentSpanId, null); // the root span has no parent
+    });
+});
+// ---------------------------------------------------------------------------
+// mergeTraces (per-turn tracing — task 6f)
+// ---------------------------------------------------------------------------
+describe("mergeTraces", () => { // Combines per-turn traces into one trace for the parent test case.
+    const parentOptions = { // identifiers of the parent test case the turns are merged into
+        runId: "run-1",
+        taskId: "task-1",
+        testCaseIndex: 0,
+        modelId: "openai:chat:gpt-4o",
+    };
+    function makeTurn(index) { // One turn: a single WebSearch call, 100/50/150 tokens, 200 ms latency; testCaseIndex is set to the turn index.
+        return collectTrace({
+            metadata: {
+                toolCalls: [
+                    {
+                        name: "WebSearch",
+                        input: { query: `turn ${index}` }, // distinct query per turn
+                        durationMs: 50,
+                    },
+                ],
+            },
+            tokenUsage: { prompt: 100, completion: 50, total: 150 },
+            latencyMs: 200,
+        }, { ...parentOptions, testCaseIndex: index });
+    }
+    it("merges multiple turns into one trace", () => {
+        const turns = [makeTurn(0), makeTurn(1), makeTurn(2)];
+        const merged = mergeTraces(turns, parentOptions);
+        assert.equal(merged.toolCalls.length, 3); // one tool call per turn
+        assert.equal(merged.tokensUsed.promptTokens, 300); // 3 turns x 100 prompt tokens
+        assert.equal(merged.tokensUsed.completionTokens, 150); // 3 turns x 50 completion tokens
+        assert.equal(merged.durationMs, 600); // 3 turns x 200 ms latency — durations are expected to add up
+    });
+    it("creates per-turn spans under root", () => {
+        const turns = [makeTurn(0), makeTurn(1)];
+        const merged = mergeTraces(turns, parentOptions);
+        // root + 2 turns
+        assert.equal(merged.spans.length, 3);
+        assert.equal(merged.spans[0].operation, "test-case");
+        assert.equal(merged.spans[0].parentSpanId, null);
+        assert.equal(merged.spans[1].operation, "turn-0");
+        assert.equal(merged.spans[1].parentSpanId, merged.spans[0].spanId); // turn span is a child of the root span
+        assert.equal(merged.spans[2].operation, "turn-1");
+    });
+    it("deduplicates URLs and search terms", () => {
+        const t1 = collectTrace({
+            metadata: {
+                toolCalls: [
+                    { name: "WebSearch", input: { query: "GROQ" } },
+                    { name: "WebFetch", input: { url: "https://sanity.io" } },
+                ],
+            },
+        }, { ...parentOptions, testCaseIndex: 0 });
+        const t2 = collectTrace({ // same query and URL as t1, on purpose
+            metadata: {
+                toolCalls: [
+                    { name: "WebSearch", input: { query: "GROQ" } },
+                    { name: "WebFetch", input: { url: "https://sanity.io" } },
+                ],
+            },
+        }, { ...parentOptions, testCaseIndex: 1 });
+        const merged = mergeTraces([t1, t2], parentOptions);
+        assert.equal(merged.searchTerms.length, 1); // deduplicated: both turns searched "GROQ"
+        assert.equal(merged.urlsVisited.length, 1); // deduplicated: both turns fetched the same URL
+    });
+    it("handles empty turns", () => {
+        const merged = mergeTraces([], parentOptions);
+        assert.equal(merged.toolCalls.length, 0);
+        assert.equal(merged.spans.length, 1); // root only — a root span is still expected with no turns
+    });
+});
+// ---------------------------------------------------------------------------
+// Cost tracking
+// ---------------------------------------------------------------------------
+describe("computeCost", () => { // Cost from token counts and a pricing entry; rates are per 1M tokens per the worked arithmetic below.
+    it("computes cost from token usage and pricing", () => {
+        const cost = computeCost({ promptTokens: 1000, completionTokens: 500, totalTokens: 1500 }, { input: 3.0, output: 15.0 });
+        // 1000 * 3.0/1M + 500 * 15.0/1M = 0.003 + 0.0075 = 0.0105
+        assert.ok(Math.abs(cost - 0.0105) < 0.0001); // tolerance comparison because the result is a float
+    });
+    it("accounts for cached input tokens", () => {
+        const cost = computeCost({
+            promptTokens: 1000,
+            completionTokens: 500,
+            totalTokens: 1500,
+            toolTokens: 300, // NOTE(review): the case is named for cached input but supplies toolTokens — confirm this is the field computeCost bills at the cachedInput rate
+        }, { input: 3.0, output: 15.0, cachedInput: 0.3 });
+        // 700 * 3.0/1M + 300 * 0.3/1M + 500 * 15.0/1M = 0.0021 + 0.00009 + 0.0075
+        assert.ok(cost > 0); // NOTE(review): only loose bounds are checked; the arithmetic above sums to 0.00969
+        assert.ok(cost < 0.02);
+    });
+});
+describe("lookupPricing", () => { // Resolves a pricing entry for a model id: exact match, prefix fallback, custom table, or undefined.
+    it("finds exact match", () => {
+        const pricing = lookupPricing("openai:chat:gpt-4o");
+        assert.ok(pricing);
+        assert.ok(pricing.input > 0);
+    });
+    it("falls back to prefix match", () => {
+        const pricing = lookupPricing("openai:chat:gpt-4o-2024-11-20"); // the exact-match id above with a date suffix appended
+        assert.ok(pricing);
+    });
+    it("returns undefined for unknown model", () => {
+        const pricing = lookupPricing("unknown:model:xyz");
+        assert.equal(pricing, undefined); // no entry and no fallback: undefined
+    });
+    it("uses custom pricing over defaults", () => {
+        const pricing = lookupPricing("custom:model", { // second argument is a pricing table keyed by model id
+            "custom:model": { input: 1.0, output: 2.0 },
+        });
+        assert.ok(pricing);
+        assert.equal(pricing.input, 1.0); // value comes from the custom table
+    });
+});
+describe("estimateRunCost", () => { // Up-front estimate; result exposes totalUSD, perModel, and exceedsWarning.
+    it("estimates cost for a run", () => {
+        const estimate = estimateRunCost(5, ["openai:chat:gpt-4o"]); // first argument is presumably a test-case count — confirm in cost-tracker
+        assert.ok(estimate.totalUSD > 0);
+        assert.equal(estimate.perModel.length, 1); // one perModel entry for the one model id passed
+    });
+    it("flags budget warning", () => {
+        const estimate = estimateRunCost(100, ["openai:chat:gpt-4o", "anthropic:messages:claude-sonnet-4-6"], { perRun: { warn: 0.01, stop: 1.0 } });
+        assert.equal(estimate.exceedsWarning, true); // the estimate is expected to exceed the 0.01 perRun warn threshold
+    });
+});
+describe("checkBudget", () => { // Compares a spend amount with warn/stop thresholds; the third argument names the budget entry to check.
+    it("allows spend below thresholds", () => {
+        const result = checkBudget(1.0, { perRun: { warn: 5.0, stop: 20.0 } }, "perRun");
+        assert.equal(result.proceed, true); // 1.0 is under both thresholds
+        assert.equal(result.warning, undefined); // no warning text in that case
+    });
+    it("warns at warn threshold", () => {
+        const result = checkBudget(5.5, { perRun: { warn: 5.0, stop: 20.0 } }, "perRun");
+        assert.equal(result.proceed, true); // 5.5 is over warn (5.0) but under stop (20.0), so the run may continue
+        assert.ok(result.warning?.includes("warning"));
+    });
+    it("stops at stop threshold", () => {
+        const result = checkBudget(25.0, { perRun: { warn: 5.0, stop: 20.0 } }, "perRun");
+        assert.equal(result.proceed, false); // 25.0 is over stop (20.0)
+        assert.ok(result.warning?.includes("exceeded"));
+    });
+});
+// ---------------------------------------------------------------------------
+// Redaction
+// ---------------------------------------------------------------------------
+describe("redactTrace", () => { // Secret scrubbing over tool-call inputs; the result exposes { trace, redactionCount, rulesApplied }.
+    function makeTrace(toolCalls) { // Minimal trace literal with empty collections and zeroed totals; only toolCalls varies per case.
+        return {
+            traceId: "trace-1",
+            runId: "run-1",
+            taskId: "task-1",
+            testCaseIndex: 0,
+            modelId: "openai:chat:gpt-4o",
+            spans: [],
+            toolCalls,
+            urlsVisited: [],
+            searchTerms: [],
+            filesRead: [],
+            filesWritten: [],
+            tokensUsed: { promptTokens: 0, completionTokens: 0, totalTokens: 0 },
+            costEstimate: 0,
+            durationMs: 0,
+            events: [],
+            startedAt: new Date().toISOString(),
+            completedAt: new Date().toISOString(),
+        };
+    }
+    it("redacts Bearer tokens in tool call inputs", () => {
+        const trace = makeTrace([
+            {
+                name: "WebFetch",
+                input: {
+                    url: "https://api.sanity.io",
+                    auth: "Bearer sk_live_abc123def456ghi789",
+                },
+                output: "OK",
+                durationMs: 100,
+                category: "read",
+            },
+        ]);
+        const { trace: redacted, redactionCount } = redactTrace(trace);
+        const inputStr = JSON.stringify(redacted.toolCalls[0].input); // serialize so the check covers every field of the input
+        assert.ok(!inputStr.includes("sk_live_abc123def456ghi789")); // the raw secret must be gone
+        assert.ok(inputStr.includes("[REDACTED]")); // generic placeholder expected in its place
+        assert.ok(redactionCount > 0);
+    });
+    it("redacts Sanity tokens", () => {
+        const trace = makeTrace([
+            {
+                name: "Write",
+                input: { token: "skAbcDefGhiJklMnoPqrStUvWxYz0123456789" },
+                output: null,
+                durationMs: 10,
+                category: "write",
+            },
+        ]);
+        const { trace: redacted } = redactTrace(trace);
+        const inputStr = JSON.stringify(redacted.toolCalls[0].input);
+        assert.ok(inputStr.includes("[REDACTED_SANITY_TOKEN]")); // Sanity tokens get their own placeholder
+    });
+    it("redacts OpenAI keys", () => {
+        const trace = makeTrace([
+            {
+                name: "Bash",
+                input: {
+                    command: "export OPENAI_API_KEY=sk-proj-abcdefghij1234567890abcdefghij", // key embedded inside a longer command string
+                },
+                output: null,
+                durationMs: 10,
+                category: "execute",
+            },
+        ]);
+        const { trace: redacted } = redactTrace(trace);
+        const inputStr = JSON.stringify(redacted.toolCalls[0].input);
+        assert.ok(!inputStr.includes("sk-proj-abcdefghij1234567890abcdefghij"), "OpenAI key should be redacted"); // only absence is checked, not the placeholder text
+    });
+    it("does not mutate the original trace", () => {
+        const original = makeTrace([
+            {
+                name: "WebFetch",
+                input: { auth: "Bearer secrettoken1234567890" },
+                output: null,
+                durationMs: 10,
+                category: "read",
+            },
+        ]);
+        const originalStr = JSON.stringify(original); // snapshot before redaction
+        redactTrace(original); // return value deliberately ignored; only the argument is inspected afterwards
+        assert.equal(JSON.stringify(original), originalStr); // the input object must be unchanged
+    });
+    it("reports which rules fired", () => {
+        const trace = makeTrace([
+            {
+                name: "Bash",
+                input: {
+                    cmd: "curl -H 'Authorization: Bearer abc123def456789' https://api.example.com",
+                },
+                output: null,
+                durationMs: 10,
+                category: "execute",
+            },
+        ]);
+        const { rulesApplied } = redactTrace(trace);
+        assert.ok(rulesApplied.includes("bearer_tokens")); // rule id expected for the Bearer token in the curl header
+    });
+});
+// ---------------------------------------------------------------------------
+// Trace storage
+// ---------------------------------------------------------------------------
+describe("LocalTraceStore", () => { // Round-trips a trace through a store constructed on a temp directory.
+    const storeDir = resolve(tmpdir(), `ailf-trace-test-${process.pid}`); // pid in the name — presumably so parallel test processes do not share a directory
+    afterEach(() => { // remove the directory after each case, if one was created
+        if (existsSync(storeDir)) {
+            rmSync(storeDir, { recursive: true, force: true });
+        }
+    });
+    it("stores and retrieves a trace", async () => {
+        const store = new LocalTraceStore(storeDir);
+        const trace = {
+            traceId: "trace-store-test",
+            runId: "run-1",
+            taskId: "task-1",
+            testCaseIndex: 0,
+            modelId: "openai:chat:gpt-4o",
+            spans: [],
+            toolCalls: [],
+            urlsVisited: [],
+            searchTerms: [],
+            filesRead: [],
+            filesWritten: [],
+            tokensUsed: { promptTokens: 100, completionTokens: 50, totalTokens: 150 },
+            costEstimate: 0.001,
+            durationMs: 500,
+            events: [],
+            startedAt: new Date().toISOString(),
+            completedAt: new Date().toISOString(),
+        };
+        const result = await store.store(trace);
+        assert.ok(result.uri.startsWith("file://")); // the returned URI is expected to use the file:// scheme
+        assert.ok(result.sizeBytes > 0);
+        const retrieved = await store.retrieve(result.uri); // look up by the URI that store() returned
+        assert.ok(retrieved);
+        assert.equal(retrieved.traceId, "trace-store-test");
+    });
+    it("returns null for non-existent trace", async () => {
+        const store = new LocalTraceStore(storeDir);
+        const result = await store.retrieve("file:///nonexistent/path.json"); // a URI that was never stored
+        assert.equal(result, null); // a missing trace resolves to null rather than rejecting
+    });
+});
+// ---------------------------------------------------------------------------
+// Trace summary extraction
+// ---------------------------------------------------------------------------
+describe("extractTraceSummary", () => { // The summary is expected to carry ids, counts, and totals from the trace plus the storage URI.
+    it("extracts sanitized summary from full trace", () => {
+        const trace = {
+            traceId: "trace-summary-test",
+            runId: "run-1",
+            taskId: "task-1",
+            testCaseIndex: 0,
+            modelId: "openai:chat:gpt-4o",
+            spans: [],
+            toolCalls: [ // one "search" call and two "read" calls
+                {
+                    name: "WebSearch",
+                    input: {},
+                    output: null,
+                    durationMs: 100,
+                    category: "search",
+                },
+                {
+                    name: "Read",
+                    input: {},
+                    output: null,
+                    durationMs: 50,
+                    category: "read",
+                },
+                {
+                    name: "Read",
+                    input: {},
+                    output: null,
+                    durationMs: 30,
+                    category: "read",
+                },
+            ],
+            urlsVisited: ["https://sanity.io/docs"],
+            searchTerms: ["GROQ"],
+            filesRead: ["/src/schema.ts"],
+            filesWritten: [],
+            tokensUsed: {
+                promptTokens: 1000,
+                completionTokens: 500,
+                totalTokens: 1500,
+            },
+            costEstimate: 0.01,
+            durationMs: 2000,
+            events: [],
+            startedAt: new Date().toISOString(),
+            completedAt: new Date().toISOString(),
+        };
+        const summary = extractTraceSummary(trace, "file:///traces/trace-1.json"); // second argument is the URI reported back as traceDataUri
+        assert.equal(summary.traceId, "trace-summary-test");
+        assert.equal(summary.traceDataUri, "file:///traces/trace-1.json");
+        assert.equal(summary.toolCallCount, 3);
+        assert.equal(summary.toolCallCategories.search, 1); // per-category tallies of the toolCalls above
+        assert.equal(summary.toolCallCategories.read, 2);
+        assert.equal(summary.totalTokens, 1500);
+        assert.equal(summary.costEstimate, 0.01);
+        assert.equal(summary.urlsVisitedCount, 1); // counts match the lengths of the corresponding arrays
+        assert.equal(summary.filesReadCount, 1);
+        assert.equal(summary.filesWrittenCount, 0);
+    });
+});

package/dist/pipeline/compiler/assertion-mapper.d.ts ADDED Viewed

@@ -0,0 +1,58 @@
+/**
+ * Assertion type mapper — maps AILF assertion types to Promptfoo assertion types.
+ *
+ * AILF assertions have two flavors:
+ * 1. Templated assertions (`type: "llm-rubric"` with `template` + `criteria`)
+ *    → resolved into Promptfoo's `llm-rubric` with a fully assembled rubric prompt
+ * 2. Value assertions (any other `type` with a `value`)
+ *    → passed through to Promptfoo mostly as-is
+ *
+ * This module handles the mapping for both, validates mode compatibility
+ * (e.g., `tool-called` is only valid for agent-harness/mcp-server modes),
+ * and normalizes weight fields.
+ *
+ * @see docs/design-docs/architecture-overhaul/scoring-rubrics-assertions.md
+ * @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
+ */
+import type { GeneralizedAssertionDefinition } from "../../_vendor/ailf-core/index.d.ts";
+import type { EvalMode } from "../../_vendor/ailf-shared/index.d.ts";
+/** A Promptfoo-compatible assertion object. Only `type` is required; every other field is optional. */
+export interface PromptfooAssertion {
+    type: string; // assertion type name (the only required field)
+    value?: unknown; // optional payload; left as `unknown` at this layer
+    weight?: number; // optional numeric weight
+    /** Promptfoo-specific: provider for model-graded assertions */
+    provider?: string;
+    /** Promptfoo-specific: rubric prompt text */
+    rubricPrompt?: string;
+    /** Promptfoo-specific: threshold for similarity */
+    threshold?: number;
+    /** Additional properties passed through; the index signature accepts any other string key with an `unknown` value */
+    [key: string]: unknown;
+}
+/** Options for mapping assertions. Both fields are optional. */
+export interface AssertionMapperOptions {
+    /** Evaluation mode — used for compatibility checking */
+    mode?: EvalMode;
+    /** Default grader provider (for LLM-graded assertions), given as a string */
+    graderProvider?: string;
+}
+/**
+ * Map an array of AILF assertions to Promptfoo assertions.
+ *
+ * @param assertions - AILF assertion definitions
+ * @param options - Mapper options; may be omitted
+ * @returns An object with `mapped` (the Promptfoo assertions) and `warnings` (string messages)
+ */
+export declare function mapAssertions(assertions: GeneralizedAssertionDefinition[], options?: AssertionMapperOptions): {
+    mapped: PromptfooAssertion[];
+    warnings: string[];
+};
+/**
+ * Check if an assertion type is valid.
+ *
+ * @param type - Assertion type name to check
+ * @returns `true` when the type is valid, `false` otherwise
+ */
+export declare function isValidAssertionType(type: string): boolean;
+/**
+ * Check if an assertion type is compatible with a given mode.
+ *
+ * @param type - Assertion type name to check
+ * @param mode - Evaluation mode to check the type against
+ * @returns `true` when the type is compatible with the mode, `false` otherwise
+ */
+export declare function isAssertionCompatibleWithMode(type: string, mode: EvalMode): boolean;