npm - @sanity/ailf - Versions diffs - 0.4.1 → 1.0.0 - Mend

@sanity/ailf 0.4.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (290) hide show

package/config/features.ts +23 -0
package/config/models.ts +83 -0
package/config/prompts.ts +16 -0
package/config/rubrics.ts +225 -0
package/config/schedules.ts +47 -0
package/config/sinks.ts +37 -0
package/config/sources.ts +21 -0
package/config/thresholds.ts +61 -0
package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
package/dist/_vendor/ailf-core/config-helpers.js +150 -0
package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
package/dist/_vendor/ailf-core/env-helper.js +45 -0
package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
package/dist/_vendor/ailf-core/examples/index.js +10 -10
package/dist/_vendor/ailf-core/index.d.ts +3 -0
package/dist/_vendor/ailf-core/index.js +5 -0
package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +32 -31
package/dist/_vendor/ailf-core/schemas/pipeline.js +52 -12
package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
package/dist/_vendor/ailf-core/services/index.js +2 -1
package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
package/dist/_vendor/ailf-core/services/scoring.js +25 -15
package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
package/dist/_vendor/ailf-core/types/index.js +8 -1
package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
package/dist/_vendor/ailf-core/types/trace.js +18 -0
package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
package/dist/_vendor/ailf-shared/index.d.ts +0 -1
package/dist/_vendor/ailf-shared/index.js +0 -1
package/dist/adapters/api-client/build-request.js +14 -13
package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
package/dist/adapters/config-sources/file-config-adapter.js +38 -12
package/dist/adapters/config-sources/index.d.ts +2 -0
package/dist/adapters/config-sources/index.js +1 -0
package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
package/dist/adapters/config-sources/ts-config-loader.js +133 -0
package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
package/dist/adapters/task-sources/composite-task-source.js +1 -1
package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
package/dist/adapters/task-sources/index.d.ts +1 -0
package/dist/adapters/task-sources/index.js +1 -0
package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
package/dist/adapters/task-sources/repo-task-source.js +69 -16
package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
package/dist/adapters/task-sources/task-file-loader.js +83 -0
package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
package/dist/adapters/task-sources/yaml-task-source.js +19 -16
package/dist/cli.js +0 -2
package/dist/commands/baseline.js +4 -1
package/dist/commands/calculate-scores.js +1 -1
package/dist/commands/coverage-audit.js +7 -1
package/dist/commands/explain-handler.js +25 -23
package/dist/commands/fetch-docs.js +3 -2
package/dist/commands/generate-configs.js +1 -1
package/dist/commands/interactive.js +11 -7
package/dist/commands/pipeline-action.d.ts +2 -0
package/dist/commands/pipeline-action.js +16 -6
package/dist/commands/pipeline.d.ts +1 -0
package/dist/commands/pipeline.js +4 -2
package/dist/commands/pr-comment.js +1 -1
package/dist/commands/publish.js +2 -2
package/dist/commands/readiness-report.js +13 -6
package/dist/composition-root.d.ts +1 -1
package/dist/composition-root.js +67 -4
package/dist/orchestration/build-app-context.js +1 -0
package/dist/orchestration/build-step-sequence.js +24 -6
package/dist/orchestration/steps/calculate-scores-step.js +24 -11
package/dist/orchestration/steps/fetch-docs-step.js +6 -4
package/dist/orchestration/steps/gap-analysis-step.js +8 -7
package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
package/dist/orchestration/steps/generate-configs-step.js +245 -51
package/dist/orchestration/steps/grader-consistency-step.js +7 -4
package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
package/dist/orchestration/steps/readiness-step.js +5 -6
package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
package/dist/orchestration/steps/run-eval-step.js +8 -7
package/dist/pipeline/cache.d.ts +1 -1
package/dist/pipeline/cache.js +36 -8
package/dist/pipeline/calculate-scores.d.ts +5 -7
package/dist/pipeline/calculate-scores.js +74 -153
package/dist/pipeline/checks.js +2 -2
package/dist/pipeline/compare.js +8 -8
package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
package/dist/pipeline/compiler/assertion-mapper.js +175 -0
package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
package/dist/pipeline/compiler/config-loader.d.ts +56 -0
package/dist/pipeline/compiler/config-loader.js +111 -0
package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
package/dist/pipeline/compiler/fixture-resolver.js +113 -0
package/dist/pipeline/compiler/hash.d.ts +11 -0
package/dist/pipeline/compiler/hash.js +18 -0
package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
package/dist/pipeline/compiler/ignore-fields.js +113 -0
package/dist/pipeline/compiler/index.d.ts +29 -0
package/dist/pipeline/compiler/index.js +45 -0
package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
package/dist/pipeline/compiler/literacy-bridge.js +172 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
package/dist/pipeline/compiler/presets/index.d.ts +9 -0
package/dist/pipeline/compiler/presets/index.js +8 -0
package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
package/dist/pipeline/compiler/provider-assembler.js +137 -0
package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
package/dist/pipeline/compiler/sandbox/index.js +11 -0
package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
package/dist/pipeline/compiler/scoring-bridge.js +114 -0
package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
package/dist/pipeline/compiler/task-graph-builder.js +291 -0
package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
package/dist/pipeline/compiler/telemetry/index.js +19 -0
package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
package/dist/pipeline/compiler/variable-resolver.js +115 -0
package/dist/pipeline/coverage-audit.d.ts +15 -5
package/dist/pipeline/coverage-audit.js +41 -22
package/dist/pipeline/eval-constants.d.ts +16 -6
package/dist/pipeline/eval-constants.js +25 -4
package/dist/pipeline/eval-fingerprint.d.ts +2 -2
package/dist/pipeline/eval-fingerprint.js +8 -9
package/dist/pipeline/expand-tasks.d.ts +23 -14
package/dist/pipeline/expand-tasks.js +37 -31
package/dist/pipeline/gap-analysis.d.ts +1 -1
package/dist/pipeline/gap-analysis.js +2 -2
package/dist/pipeline/generate-configs.d.ts +22 -4
package/dist/pipeline/generate-configs.js +53 -24
package/dist/pipeline/grader-api.d.ts +3 -3
package/dist/pipeline/grader-api.js +5 -12
package/dist/pipeline/grader-compare-runner.js +20 -27
package/dist/pipeline/grader-comparison.d.ts +4 -8
package/dist/pipeline/grader-comparison.js +11 -17
package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
package/dist/pipeline/grader-consistency-runner.js +18 -21
package/dist/pipeline/grader-consistency.d.ts +6 -10
package/dist/pipeline/grader-consistency.js +13 -32
package/dist/pipeline/grader-sensitivity-runner.js +7 -5
package/dist/pipeline/grader-sensitivity.d.ts +2 -6
package/dist/pipeline/grader-sensitivity.js +10 -10
package/dist/pipeline/grader-validate-runner.js +7 -5
package/dist/pipeline/grader-validation.d.ts +2 -6
package/dist/pipeline/grader-validation.js +14 -22
package/dist/pipeline/map-request-to-config.js +6 -1
package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
package/dist/pipeline/mirror-repo-tasks.js +16 -15
package/dist/pipeline/normalize-mode.d.ts +49 -0
package/dist/pipeline/normalize-mode.js +64 -0
package/dist/pipeline/plan.d.ts +5 -2
package/dist/pipeline/plan.js +134 -78
package/dist/pipeline/pr-comment.js +2 -0
package/dist/pipeline/profile-resolution.d.ts +47 -0
package/dist/pipeline/profile-resolution.js +91 -0
package/dist/pipeline/provenance.d.ts +2 -2
package/dist/pipeline/provenance.js +12 -17
package/dist/pipeline/release-report.js +4 -4
package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
package/dist/pipeline/repo-threshold-evaluator.js +1 -1
package/dist/pipeline/rubric-loader.d.ts +20 -0
package/dist/pipeline/rubric-loader.js +37 -0
package/dist/pipeline/validate.d.ts +4 -4
package/dist/pipeline/validate.js +64 -53
package/dist/schedules/loader.js +18 -8
package/dist/scripts/migrate-task-mode.d.ts +24 -0
package/dist/scripts/migrate-task-mode.js +85 -0
package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
package/dist/scripts/validate-task-sources.d.ts +1 -1
package/dist/scripts/validate-task-sources.js +15 -15
package/dist/sinks/loader.js +5 -7
package/dist/sources.d.ts +7 -7
package/dist/sources.js +22 -24
package/dist/webhook/dispatch.js +2 -1
package/package.json +6 -3
package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
package/tasks/literacy/frameworks.task.ts +128 -0
package/tasks/literacy/functions.task.ts +69 -0
package/tasks/literacy/groq.task.ts +258 -0
package/tasks/literacy/nextjs-live.task.ts +75 -0
package/tasks/literacy/studio-setup.task.ts +131 -0
package/tasks/literacy/visual-editing.task.ts +146 -0
package/config/features.yaml +0 -116
package/config/models.yaml +0 -116
package/config/prompts.yaml +0 -75
package/config/rubrics.yaml +0 -62
package/config/schedules.yaml +0 -43
package/config/sinks.yaml +0 -54
package/config/sources.yaml +0 -51
package/config/thresholds.yaml +0 -49
package/dist/agent-observer/test-imports.d.ts +0 -7
package/dist/agent-observer/test-imports.js +0 -185

package/dist/pipeline/compiler/telemetry/cost-tracker.js ADDED Viewed

@@ -0,0 +1,146 @@
+/**
+ * Cost tracking — model pricing, pre-run estimation, and post-run actuals.
+ *
+ * Uses a pricing table (YAML config or TS `definePricingTable()`) to compute
+ * USD cost from token usage. Supports budget controls with warn/stop thresholds.
+ *
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
+ */
+// ---------------------------------------------------------------------------
+// Pricing table
+// ---------------------------------------------------------------------------
+/** Default pricing table (updated periodically) */
+const DEFAULT_PRICING = {
+    "anthropic:messages:claude-opus-4-6": {
+        input: 15.0,
+        output: 75.0,
+        cachedInput: 1.5,
+    },
+    "anthropic:messages:claude-sonnet-4-6": {
+        input: 3.0,
+        output: 15.0,
+        cachedInput: 0.3,
+    },
+    "openai:chat:gpt-4.1": {
+        input: 2.0,
+        output: 8.0,
+        cachedInput: 0.5,
+    },
+    "openai:chat:gpt-4.1-mini": {
+        input: 0.4,
+        output: 1.6,
+        cachedInput: 0.1,
+    },
+    "openai:chat:gpt-4o": {
+        input: 2.5,
+        output: 10.0,
+        cachedInput: 1.25,
+    },
+    "openai:chat:gpt-5": {
+        input: 5.0,
+        output: 15.0,
+        cachedInput: 1.0,
+    },
+};
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+/**
+ * Compute actual cost from token usage and model pricing.
+ *
+ * @param usage - Token counts from provider response
+ * @param pricing - Per-model pricing (USD per 1M tokens)
+ * @returns Cost in USD
+ */
+export function computeCost(usage, pricing) {
+    const cached = usage.toolTokens ?? 0;
+    const uncachedPrompt = usage.promptTokens - cached;
+    const inputCost = (uncachedPrompt * pricing.input) / 1_000_000;
+    const cachedCost = pricing.cachedInput !== undefined
+        ? (cached * pricing.cachedInput) / 1_000_000
+        : (cached * pricing.input) / 1_000_000;
+    const outputCost = (usage.completionTokens * pricing.output) / 1_000_000;
+    return inputCost + cachedCost + outputCost;
+}
+/**
+ * Look up pricing for a model ID.
+ *
+ * Tries exact match first, then falls back to prefix matching
+ * (e.g., "openai:chat:gpt-4o-2024-11-20" matches "openai:chat:gpt-4o").
+ */
+export function lookupPricing(modelId, customPricing) {
+    // 1. Exact match in custom pricing
+    if (customPricing?.[modelId])
+        return customPricing[modelId];
+    // 2. Exact match in defaults
+    if (DEFAULT_PRICING[modelId])
+        return DEFAULT_PRICING[modelId];
+    // 3. Prefix match in custom pricing
+    if (customPricing) {
+        for (const [key, pricing] of Object.entries(customPricing)) {
+            if (modelId.startsWith(key))
+                return pricing;
+        }
+    }
+    // 4. Prefix match in defaults
+    for (const [key, pricing] of Object.entries(DEFAULT_PRICING)) {
+        if (modelId.startsWith(key))
+            return pricing;
+    }
+    return undefined;
+}
+/**
+ * Estimate cost for a pipeline run before execution.
+ *
+ * Uses task count, estimated tokens per task complexity, and model pricing.
+ */
+export function estimateRunCost(taskCount, modelIds, budget, customPricing) {
+    // Rough token estimates per task (empirical averages)
+    const AVG_PROMPT_TOKENS = 2000;
+    const AVG_COMPLETION_TOKENS = 1500;
+    const perModel = modelIds.map((modelId) => {
+        const pricing = lookupPricing(modelId, customPricing);
+        if (!pricing) {
+            return { modelId, estimatedUSD: 0 };
+        }
+        const estimatedUSD = computeCost({
+            promptTokens: AVG_PROMPT_TOKENS * taskCount,
+            completionTokens: AVG_COMPLETION_TOKENS * taskCount,
+            totalTokens: (AVG_PROMPT_TOKENS + AVG_COMPLETION_TOKENS) * taskCount,
+        }, pricing);
+        return { modelId, estimatedUSD };
+    });
+    const totalUSD = perModel.reduce((sum, m) => sum + m.estimatedUSD, 0);
+    return {
+        totalUSD,
+        perModel,
+        exceedsWarning: budget?.perRun ? totalUSD >= budget.perRun.warn : false,
+        exceedsStop: budget?.perRun ? totalUSD >= budget.perRun.stop : false,
+    };
+}
+/**
+ * Check if current spend exceeds budget thresholds.
+ */
+export function checkBudget(currentUSD, budget, level) {
+    const limits = budget[level];
+    if (!limits) {
+        return { proceed: true, currentUSD };
+    }
+    if (currentUSD >= limits.stop) {
+        return {
+            proceed: false,
+            warning: `Budget exceeded: $${currentUSD.toFixed(4)} >= $${limits.stop} (${level} stop limit)`,
+            currentUSD,
+            limitUSD: limits.stop,
+        };
+    }
+    if (currentUSD >= limits.warn) {
+        return {
+            proceed: true,
+            warning: `Budget warning: $${currentUSD.toFixed(4)} >= $${limits.warn} (${level} warn threshold)`,
+            currentUSD,
+            limitUSD: limits.warn,
+        };
+    }
+    return { proceed: true, currentUSD };
+}

package/dist/pipeline/compiler/telemetry/index.d.ts ADDED Viewed

@@ -0,0 +1,14 @@
+/**
+ * Telemetry — observability infrastructure for evaluation traces.
+ *
+ * Captures tool calls, token usage, cost, and timing for every evaluation.
+ * Full traces go to blob storage; sanitized summaries to Content Lake.
+ *
+ * @see docs/exec-plans/architecture-overhaul/phase-6-observability.md
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
+ */
+export { collectTrace, mergeTraces, type ProviderResponse, type RawToolCall, type TraceCollectorOptions, } from "./trace-collector.js";
+export { classifyToolCall, classifyToolCalls } from "./tool-classifier.js";
+export { checkBudget, computeCost, estimateRunCost, lookupPricing, type ActualCost, type BudgetCheckResult, type BudgetConfig, type CostEstimate, type ModelPricing, } from "./cost-tracker.js";
+export { extractTraceSummary, LocalTraceStore, type TraceSummary, type TraceStore, type TraceStoreResult, } from "./trace-store.js";
+export { createRedactionConfig, DEFAULT_REDACTION_RULES, redactTrace, type RedactionConfig, type RedactionResult, type RedactionRule, } from "./redactor.js";

package/dist/pipeline/compiler/telemetry/index.js ADDED Viewed

@@ -0,0 +1,19 @@
+/**
+ * Telemetry — observability infrastructure for evaluation traces.
+ *
+ * Captures tool calls, token usage, cost, and timing for every evaluation.
+ * Full traces go to blob storage; sanitized summaries to Content Lake.
+ *
+ * @see docs/exec-plans/architecture-overhaul/phase-6-observability.md
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
+ */
+// Trace collection
+export { collectTrace, mergeTraces, } from "./trace-collector.js";
+// Tool call classification
+export { classifyToolCall, classifyToolCalls } from "./tool-classifier.js";
+// Cost tracking
+export { checkBudget, computeCost, estimateRunCost, lookupPricing, } from "./cost-tracker.js";
+// Trace storage
+export { extractTraceSummary, LocalTraceStore, } from "./trace-store.js";
+// Redaction
+export { createRedactionConfig, DEFAULT_REDACTION_RULES, redactTrace, } from "./redactor.js";

package/dist/pipeline/compiler/telemetry/redactor.d.ts ADDED Viewed

@@ -0,0 +1,58 @@
+/**
+ * Redaction pipeline — strips sensitive data from traces before storage.
+ *
+ * Applied before ANY storage (both blob and Content Lake). Configurable
+ * patterns handle Bearer tokens, API keys, Sanity tokens, and other
+ * common secret formats.
+ *
+ * Principles:
+ * 1. Redact before store — sensitive data never reaches storage
+ * 2. Configurable patterns — teams can add project-specific rules
+ * 3. Truncation for cost — large outputs truncated to max bytes
+ * 4. No PII by default — tasks shouldn't contain PII, this is a safety net
+ *
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
+ */
+import type { EvalTrace } from "../../../_vendor/ailf-core/index.d.ts";
+/** A single regex-based redaction rule (pattern plus its replacement text) */
+export interface RedactionRule {
+    /** Rule name; listed in `RedactionResult.rulesApplied` when the rule matches */
+    name: string;
+    /** Regex pattern to match against the text being redacted */
+    pattern: RegExp;
+    /** Replacement string (use $1, $2 for capture groups) */
+    replacement: string;
+}
+/** Redaction configuration consumed by `redactTrace` */
+export interface RedactionConfig {
+    /** Regex-based substitution rules, applied in array order */
+    rules: RedactionRule[];
+    /** Dot-separated field paths to omit entirely from stored traces */
+    omitFields: string[];
+    /** Maximum tool call output size in bytes; larger outputs are truncated */
+    maxOutputBytes: number;
+}
+/** Result of redaction, as returned by `redactTrace` */
+export interface RedactionResult {
+    /** Redacted copy of the trace */
+    trace: EvalTrace;
+    /** Total number of pattern matches that were redacted */
+    redactionCount: number;
+    /** Names of the rules that fired (each name listed once) */
+    rulesApplied: string[];
+}
+/** Built-in redaction rules for common secret patterns */
+export declare const DEFAULT_REDACTION_RULES: RedactionRule[];
+/**
+ * Create a default redaction config.
+ *
+ * `overrides.rules` and `overrides.omitFields` are appended to the built-in
+ * defaults (they do not replace them); `overrides.maxOutputBytes` replaces
+ * the default limit.
+ *
+ * @param overrides - Custom rules or settings to merge
+ * @returns A complete config with every field populated
+ */
+export declare function createRedactionConfig(overrides?: Partial<RedactionConfig>): RedactionConfig;
+/**
+ * Apply redaction to an evaluation trace.
+ *
+ * Processes tool call inputs and outputs, event data, and search terms.
+ * Returns a new trace (does not mutate the original).
+ *
+ * @param trace - Trace to redact
+ * @param config - Redaction settings; defaults to `createRedactionConfig()`
+ * @returns The redacted trace, the number of redactions, and the rules that fired
+ */
+export declare function redactTrace(trace: EvalTrace, config?: RedactionConfig): RedactionResult;

package/dist/pipeline/compiler/telemetry/redactor.js ADDED Viewed

@@ -0,0 +1,222 @@
+/**
+ * Redaction pipeline — strips sensitive data from traces before storage.
+ *
+ * Applied before ANY storage (both blob and Content Lake). Configurable
+ * patterns handle Bearer tokens, API keys, Sanity tokens, and other
+ * common secret formats.
+ *
+ * Principles:
+ * 1. Redact before store — sensitive data never reaches storage
+ * 2. Configurable patterns — teams can add project-specific rules
+ * 3. Truncation for cost — large outputs truncated to max bytes
+ * 4. No PII by default — tasks shouldn't contain PII, this is a safety net
+ *
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
+ */
+// ---------------------------------------------------------------------------
+// Default rules
+// ---------------------------------------------------------------------------
+/** Built-in redaction rules for common secret patterns */
+export const DEFAULT_REDACTION_RULES = [
+    {
+        name: "bearer_tokens",
+        pattern: /Bearer\s+[A-Za-z0-9._~+/=-]{10,}/g,
+        replacement: "Bearer [REDACTED]",
+    },
+    {
+        name: "sanity_tokens",
+        pattern: /sk[A-Za-z0-9]{30,}/g,
+        replacement: "[REDACTED_SANITY_TOKEN]",
+    },
+    {
+        name: "openai_keys",
+        pattern: /sk-[A-Za-z0-9_-]{20,}/g,
+        replacement: "[REDACTED_OPENAI_KEY]",
+    },
+    {
+        name: "api_key_values",
+        pattern: /((?:api[_-]?key|token|secret|password|authorization)\s*[:=]\s*)(["']?)(?!\[REDACTED)[^\s"']{8,}\2/gi,
+        replacement: "$1$2[REDACTED]$2",
+    },
+    {
+        name: "slack_tokens",
+        pattern: /xoxb-[A-Za-z0-9-]{20,}/g,
+        replacement: "[REDACTED_SLACK_TOKEN]",
+    },
+    {
+        name: "github_tokens",
+        pattern: /gh[ps]_[A-Za-z0-9]{30,}/g,
+        replacement: "[REDACTED_GITHUB_TOKEN]",
+    },
+    {
+        name: "anthropic_keys",
+        pattern: /sk-ant-[A-Za-z0-9_-]{20,}/g,
+        replacement: "[REDACTED_ANTHROPIC_KEY]",
+    },
+    {
+        name: "base64_credentials",
+        pattern: /Basic\s+[A-Za-z0-9+/=]{20,}/g,
+        replacement: "Basic [REDACTED]",
+    },
+];
+/**
+ * Default fields to omit entirely.
+ *
+ * Paths are dot-separated. When redacting a tool call, only the segments
+ * after `input` are used to locate the field inside that call's input.
+ * `createRedactionConfig` appends caller-supplied paths to this list.
+ */
+const DEFAULT_OMIT_FIELDS = [
+    "toolCalls[*].input.headers.Authorization",
+    "toolCalls[*].input.headers.Cookie",
+    "toolCalls[*].input.headers.Set-Cookie",
+];
+const DEFAULT_MAX_OUTPUT_BYTES = 10_240; // default `maxOutputBytes` when no override is given
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+/**
+ * Create a default redaction config.
+ *
+ * @param overrides - Custom rules or settings to merge
+ */
+export function createRedactionConfig(overrides) {
+    return {
+        rules: overrides?.rules
+            ? [...DEFAULT_REDACTION_RULES, ...overrides.rules]
+            : DEFAULT_REDACTION_RULES,
+        omitFields: overrides?.omitFields
+            ? [...DEFAULT_OMIT_FIELDS, ...overrides.omitFields]
+            : DEFAULT_OMIT_FIELDS,
+        maxOutputBytes: overrides?.maxOutputBytes ?? DEFAULT_MAX_OUTPUT_BYTES,
+    };
+}
+/**
+ * Apply redaction to an evaluation trace.
+ *
+ * Processes tool call inputs and outputs, event data, and search terms.
+ * Returns a new trace (does not mutate the original).
+ */
+export function redactTrace(trace, config) {
+    const cfg = config ?? createRedactionConfig();
+    let redactionCount = 0;
+    const rulesApplied = new Set();
+    // Deep clone to avoid mutation
+    const redacted = JSON.parse(JSON.stringify(trace));
+    // Redact tool calls
+    redacted.toolCalls = redacted.toolCalls.map((call) => {
+        const result = redactToolCall(call, cfg);
+        redactionCount += result.count;
+        for (const rule of result.rules)
+            rulesApplied.add(rule);
+        return result.call;
+    });
+    // Redact events
+    redacted.events = redacted.events.map((event) => {
+        const dataStr = JSON.stringify(event.data);
+        const { text, count, rules } = applyRules(dataStr, cfg.rules);
+        redactionCount += count;
+        for (const rule of rules)
+            rulesApplied.add(rule);
+        return { ...event, data: JSON.parse(text) };
+    });
+    // Redact search terms (may contain embedded secrets)
+    redacted.searchTerms = redacted.searchTerms.map((term) => {
+        const { text, count, rules } = applyRules(term, cfg.rules);
+        redactionCount += count;
+        for (const rule of rules)
+            rulesApplied.add(rule);
+        return text;
+    });
+    return {
+        trace: redacted,
+        redactionCount,
+        rulesApplied: [...rulesApplied],
+    };
+}
+// ---------------------------------------------------------------------------
+// Tool call redaction
+// ---------------------------------------------------------------------------
+function redactToolCall(call, config) {
+    let count = 0;
+    const rules = [];
+    // Redact input
+    const inputStr = JSON.stringify(call.input);
+    const inputResult = applyRules(inputStr, config.rules);
+    count += inputResult.count;
+    rules.push(...inputResult.rules);
+    // Redact output
+    let outputStr = JSON.stringify(call.output);
+    // Truncate output if too large
+    if (outputStr.length > config.maxOutputBytes) {
+        outputStr = outputStr.slice(0, config.maxOutputBytes) + "... [truncated]";
+    }
+    const outputResult = applyRules(outputStr, config.rules);
+    count += outputResult.count;
+    rules.push(...outputResult.rules);
+    // Omit specific fields from input
+    let parsedInput = JSON.parse(inputResult.text);
+    parsedInput = omitFields(parsedInput, config.omitFields, "input");
+    return {
+        call: {
+            ...call,
+            input: parsedInput,
+            output: parseJsonSafe(outputResult.text),
+        },
+        count,
+        rules,
+    };
+}
+// ---------------------------------------------------------------------------
+// Rule application
+// ---------------------------------------------------------------------------
+function applyRules(text, rules) {
+    let result = text;
+    let count = 0;
+    const appliedRules = [];
+    for (const rule of rules) {
+        // Reset lastIndex before match() — global regexes are stateful
+        rule.pattern.lastIndex = 0;
+        const matches = result.match(rule.pattern);
+        if (matches && matches.length > 0) {
+            count += matches.length;
+            appliedRules.push(rule.name);
+            // Reset again before replace() — match() may leave lastIndex dirty
+            rule.pattern.lastIndex = 0;
+            result = result.replace(rule.pattern, rule.replacement);
+        }
+    }
+    return { text: result, count, rules: appliedRules };
+}
+// ---------------------------------------------------------------------------
+// Field omission
+// ---------------------------------------------------------------------------
+function omitFields(obj, patterns, context) {
+    for (const pattern of patterns) {
+        // Simple field path handling (not full JSONPath)
+        // Handles: "toolCalls[*].input.headers.Authorization" when context is "input"
+        if (pattern.includes(context)) {
+            const parts = pattern.split(".");
+            const fieldIndex = parts.indexOf(context);
+            if (fieldIndex >= 0) {
+                const remainingPath = parts.slice(fieldIndex + 1);
+                deleteNestedField(obj, remainingPath);
+            }
+        }
+    }
+    return obj;
+}
+function deleteNestedField(obj, path) {
+    if (path.length === 0)
+        return;
+    if (path.length === 1) {
+        delete obj[path[0]];
+        return;
+    }
+    const child = obj[path[0]];
+    if (child && typeof child === "object") {
+        deleteNestedField(child, path.slice(1));
+    }
+}
+function parseJsonSafe(text) {
+    try {
+        return JSON.parse(text);
+    }
+    catch {
+        return text;
+    }
+}

package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts ADDED Viewed

@@ -0,0 +1,32 @@
+/**
+ * Tool call classification — maps raw provider tool names to categories.
+ *
+ * Raw tool names from providers are noisy and inconsistent (`WebSearch` vs
+ * `web_search` vs `Browser.search`). This module normalizes every tool call
+ * into one of six standard categories for cross-model comparison.
+ *
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
+ */
+import type { ToolCallCategory } from "../../../_vendor/ailf-core/index.d.ts";
+/**
+ * Classify a tool call by its raw name.
+ *
+ * Resolution order:
+ * 1. Exact match in custom overrides (if provided)
+ * 2. Exact match in default tool categories
+ * 3. Heuristic pattern matching on the name
+ * 4. Falls back to "execute" (safest default for unknown tools)
+ *
+ * Steps 1 and 2 are case-sensitive key lookups; step 3 uses
+ * case-insensitive patterns and the first pattern that matches wins.
+ *
+ * @param name - Raw tool name from the provider
+ * @param customMappings - Optional custom tool → category overrides
+ * @returns The classified category
+ */
+export declare function classifyToolCall(name: string, customMappings?: Record<string, ToolCallCategory>): ToolCallCategory;
+/**
+ * Classify multiple tool calls, returning the category for each.
+ * Also tracks unrecognized names for the caller to log warnings.
+ *
+ * @param names - Raw tool names, classified in the order given
+ * @param customMappings - Optional custom tool → category overrides
+ * @returns `categories`, where entry `i` is the category of `names[i]`, and
+ *   `unrecognized`, the names with no exact entry in either the custom or
+ *   the default mapping (they were classified by heuristic or fallback)
+ */
+export declare function classifyToolCalls(names: string[], customMappings?: Record<string, ToolCallCategory>): {
+    categories: ToolCallCategory[];
+    unrecognized: string[];
+};

package/dist/pipeline/compiler/telemetry/tool-classifier.js ADDED Viewed

@@ -0,0 +1,120 @@
+/**
+ * Tool call classification — maps raw provider tool names to categories.
+ *
+ * Raw tool names from providers are noisy and inconsistent (`WebSearch` vs
+ * `web_search` vs `Browser.search`). This module normalizes every tool call
+ * into one of six standard categories for cross-model comparison.
+ *
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
+ */
+// ---------------------------------------------------------------------------
+// Default tool name → category mapping
+// ---------------------------------------------------------------------------
+// Exact-match lookup table from raw tool name to category. Keys are
+// case-sensitive, which is why common spellings appear in several casings.
+const DEFAULT_TOOL_CATEGORIES = {
+    // Null prototype: a lookup such as ["constructor"] or ["toString"] must
+    // yield undefined rather than an inherited Object.prototype member.
+    __proto__: null,
+    // Search tools
+    Grep: "search",
+    WebSearch: "search",
+    grep: "search",
+    search: "search",
+    semantic_search: "search",
+    web_search: "search",
+    // Read tools
+    Glob: "read",
+    Read: "read",
+    WebFetch: "read",
+    cat: "read",
+    curl: "read",
+    file_read: "read",
+    read_file: "read",
+    web_fetch: "read",
+    // Write tools
+    Edit: "write",
+    FileEdit: "write",
+    Write: "write",
+    file_write: "write",
+    patch: "write",
+    write_file: "write",
+    // Execute tools
+    Bash: "execute",
+    RunCode: "execute",
+    bash: "execute",
+    exec: "execute",
+    python: "execute",
+    run_code: "execute",
+    shell: "execute",
+    // Navigate tools
+    "Browser.navigate": "navigate",
+    FollowLink: "navigate",
+    browse: "navigate",
+    follow_link: "navigate",
+    navigate: "navigate",
+    open_url: "navigate",
+    // Communicate tools
+    AskUser: "communicate",
+    TodoRead: "communicate",
+    TodoWrite: "communicate",
+    ask_user: "communicate",
+    submit_response: "communicate",
+};
+// ---------------------------------------------------------------------------
+// Heuristic patterns (fallback when name not in lookup table)
+// ---------------------------------------------------------------------------
+// Ordered [pattern, category] pairs. Patterns are unanchored and
+// case-insensitive, so they match anywhere inside a tool name; when several
+// patterns would match, the entry listed first decides the category.
+const HEURISTIC_PATTERNS = [
+    [/search|find|query|lookup|grep/i, "search"],
+    [/read|fetch|get|load|cat|view/i, "read"],
+    [/write|create|edit|update|patch|save|put|post/i, "write"],
+    [/exec|run|bash|shell|python|code|command/i, "execute"],
+    [/navigate|browse|open|follow|link|url/i, "navigate"],
+    [/ask|user|chat|message|submit|todo|response/i, "communicate"],
+];
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+/**
+ * Classify a tool call by its raw name.
+ *
+ * Resolution order:
+ * 1. Exact match in custom overrides (if provided)
+ * 2. Exact match in default tool categories
+ * 3. Heuristic pattern matching on the name
+ * 4. Falls back to "execute" (safest default for unknown tools)
+ *
+ * Table lookups only consider own properties, so names that collide with
+ * `Object.prototype` members ("constructor", "toString", ...) are never
+ * mistaken for mapped tools.
+ *
+ * @param name - Raw tool name from the provider
+ * @param customMappings - Optional custom tool → category overrides
+ * @returns The classified category
+ */
+export function classifyToolCall(name, customMappings) {
+    const hasOwn = Object.prototype.hasOwnProperty;
+    // 1. Custom overrides (an own but falsy value counts as "no override")
+    if (customMappings && hasOwn.call(customMappings, name) && customMappings[name]) {
+        return customMappings[name];
+    }
+    // 2. Default lookup
+    if (hasOwn.call(DEFAULT_TOOL_CATEGORIES, name)) {
+        return DEFAULT_TOOL_CATEGORIES[name];
+    }
+    // 3. Heuristic matching — first pattern that matches wins
+    for (const [pattern, category] of HEURISTIC_PATTERNS) {
+        if (pattern.test(name)) {
+            return category;
+        }
+    }
+    // 4. Unknown → execute (safest default)
+    return "execute";
+}
+/**
+ * Classify multiple tool calls, returning the category for each.
+ * Also tracks unrecognized names for the caller to log warnings.
+ *
+ * A name is "unrecognized" when neither the custom overrides nor the
+ * default table has its own entry for it. Inherited `Object.prototype`
+ * members ("constructor", "toString", ...) do not count as entries.
+ *
+ * @param names - Raw tool names, classified in the order given
+ * @param customMappings - Optional custom tool → category overrides
+ * @returns `categories` (entry `i` belongs to `names[i]`) and `unrecognized`
+ */
+export function classifyToolCalls(names, customMappings) {
+    const hasOwn = Object.prototype.hasOwnProperty;
+    const categories = [];
+    const unrecognized = [];
+    for (const name of names) {
+        categories.push(classifyToolCall(name, customMappings));
+        // Track names that required heuristic or default fallback
+        const inDefaults = hasOwn.call(DEFAULT_TOOL_CATEGORIES, name);
+        const inCustom = Boolean(customMappings && hasOwn.call(customMappings, name) && customMappings[name]);
+        if (!inDefaults && !inCustom) {
+            unrecognized.push(name);
+        }
+    }
+    return { categories, unrecognized };
+}

package/dist/pipeline/compiler/telemetry/trace-collector.d.ts ADDED Viewed

@@ -0,0 +1,75 @@
+/**
+ * TraceCollector — extracts structured trace data from provider responses.
+ *
+ * Parses tool calls, token usage, and timing data from Promptfoo result
+ * objects and normalizes them into the canonical `EvalTrace` shape.
+ *
+ * Works via inline extraction — parsing provider response metadata
+ * directly, without requiring additional infrastructure.
+ *
+ * @see docs/design-docs/architecture-overhaul/observability-telemetry.md
+ * @see packages/core/src/types/trace.ts — EvalTrace types
+ */
+import type { EvalTrace, ToolCallCategory } from "../../../_vendor/ailf-core/index.d.ts";
+/**
+ * Raw provider response shape (subset of Promptfoo's result object).
+ * Every field is optional.
+ */
+export interface ProviderResponse {
+    /** Raw text output */
+    output?: string;
+    /** Token usage (varies by provider); each counter may be absent */
+    tokenUsage?: {
+        completion?: number;
+        prompt?: number;
+        total?: number;
+        cached?: number;
+    };
+    /**
+     * Provider-specific metadata (e.g., Claude's toolCalls).
+     * Keys other than `toolCalls` are allowed and typed as `unknown`.
+     */
+    metadata?: {
+        toolCalls?: RawToolCall[];
+        [key: string]: unknown;
+    };
+    /** Response latency in milliseconds */
+    latencyMs?: number;
+}
+/**
+ * Raw tool call from a provider (pre-normalization).
+ * Every field is optional because providers report tool calls differently.
+ */
+export interface RawToolCall {
+    name?: string;
+    input?: Record<string, unknown>;
+    output?: unknown;
+    error?: string;
+    durationMs?: number;
+    /**
+     * Alternative field names used by some providers.
+     * Here `arguments` is a string, whereas `input` is already an object
+     * (presumably serialized JSON — confirm against the collector).
+     */
+    function?: {
+        name?: string;
+        arguments?: string;
+    };
+    type?: string;
+}
+/**
+ * Options for trace collection.
+ * The four identifiers are required; the remaining fields are optional.
+ */
+export interface TraceCollectorOptions {
+    /** Run ID to associate with this trace */
+    runId: string;
+    /** Task ID that produced this test case */
+    taskId: string;
+    /** Test case index within the task */
+    testCaseIndex: number;
+    /** Model under evaluation */
+    modelId: string;
+    /** Custom tool → category mappings */
+    toolCategories?: Record<string, ToolCallCategory>;
+    /** Maximum output size per tool call (bytes) */
+    maxOutputBytes?: number;
+}
+/**
+ * Collect a trace from a single provider response.
+ *
+ * Extracts tool calls, token usage, timing, and builds the
+ * chronological event log.
+ *
+ * @param response - Raw provider response to extract trace data from
+ * @param options - Identifiers and settings for the trace being collected
+ * @returns The trace in the canonical `EvalTrace` shape
+ */
+export declare function collectTrace(response: ProviderResponse, options: TraceCollectorOptions): EvalTrace;
+/**
+ * Merge multiple per-turn traces into a single test case trace.
+ *
+ * Each turn produces its own trace. This function combines them into
+ * a parent trace with per-turn spans.
+ *
+ * @param turns - Per-turn traces to combine
+ * @param parentOptions - Identifiers and settings for the parent trace
+ * @returns The combined parent trace
+ */
+export declare function mergeTraces(turns: EvalTrace[], parentOptions: TraceCollectorOptions): EvalTrace;