npm - @sanity/ailf - Versions diffs - 3.9.0 → 4.0.0 - Mend

@sanity/ailf 3.9.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

package/config/models.ts +32 -4
package/dist/_vendor/ailf-core/config-helpers.d.ts +8 -2
package/dist/_vendor/ailf-core/config-helpers.js +54 -1
package/dist/_vendor/ailf-shared/index.d.ts +16 -10
package/dist/_vendor/ailf-shared/index.js +13 -10
package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
package/dist/agent-observer/agentic-provider.js +28 -23
package/dist/agent-observer/classifier.js +7 -2
package/dist/agent-observer/proxy.d.ts +88 -3
package/dist/agent-observer/proxy.js +174 -16
package/dist/agent-observer/types.d.ts +23 -5
package/dist/cli-program.js +1 -1
package/dist/commands/baseline.d.ts +3 -1
package/dist/commands/baseline.js +29 -9
package/dist/commands/cache.d.ts +5 -1
package/dist/commands/cache.js +31 -15
package/dist/commands/compare.js +11 -4
package/dist/commands/explain-handler.js +2 -2
package/dist/config/models.ts +32 -4
package/dist/pipeline/baseline.d.ts +14 -3
package/dist/pipeline/baseline.js +7 -13
package/dist/pipeline/calculate-scores.js +40 -1
package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
package/dist/pipeline/compiler/provider-assembler.js +37 -2
package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
package/package.json +2 -2

package/dist/pipeline/baseline.d.ts CHANGED Viewed

@@ -29,9 +29,20 @@ export interface ScoreComparison {
     delta: number;
     feature: string;
 }
-export declare function compareBaseline(rootDir: string, baselineFile?: string): CompareResult;
-export declare function listBaselines(rootDir: string): BaselineMetadata[];
-export declare function saveBaseline(rootDir: string, tag?: string): {
+/**
+ * Paths the baseline pipeline functions read and write. Callers compose this
+ * from caller-relative paths so the functions stay agnostic of where the
+ * eval package itself lives on disk (W0098).
+ */
+export interface BaselineDirs {
+    /** Directory that contains baseline `*.json` snapshots. */
+    baselinesDir: string;
+    /** Absolute path to the current run's `score-summary.json`. */
+    scoreSummaryPath: string;
+}
+export declare function compareBaseline(dirs: BaselineDirs, baselineFile?: string): CompareResult;
+export declare function listBaselines(baselinesDir: string): BaselineMetadata[];
+export declare function saveBaseline(dirs: BaselineDirs, tag?: string): {
     success: boolean;
     message: string;
 };

package/dist/pipeline/baseline.js CHANGED Viewed

@@ -7,12 +7,8 @@
  */
 import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
 import { join } from "path";
-// ---------------------------------------------------------------------------
-// Compare
-// ---------------------------------------------------------------------------
-export function compareBaseline(rootDir, baselineFile) {
-    const baselinesDir = join(rootDir, "results", "baselines");
-    const scoreSummaryPath = join(rootDir, "results", "latest", "score-summary.json");
+export function compareBaseline(dirs, baselineFile) {
+    const { baselinesDir, scoreSummaryPath } = dirs;
     if (!existsSync(scoreSummaryPath)) {
         return {
             message: "No current score-summary.json found.",
@@ -20,7 +16,7 @@ export function compareBaseline(rootDir, baselineFile) {
         };
     }
     // Find baseline to compare against
-    const baselines = listBaselines(rootDir);
+    const baselines = listBaselines(baselinesDir);
     if (baselines.length === 0) {
         return {
             message: "No baselines saved yet. Run 'pnpm baseline:save' first.",
@@ -76,8 +72,7 @@ export function compareBaseline(rootDir, baselineFile) {
 // ---------------------------------------------------------------------------
 // List
 // ---------------------------------------------------------------------------
-export function listBaselines(rootDir) {
-    const baselinesDir = join(rootDir, "results", "baselines");
+export function listBaselines(baselinesDir) {
     if (!existsSync(baselinesDir)) {
         return [];
     }
@@ -102,9 +97,8 @@ export function listBaselines(rootDir) {
 // ---------------------------------------------------------------------------
 // Save
 // ---------------------------------------------------------------------------
-export function saveBaseline(rootDir, tag) {
-    const baselinesDir = join(rootDir, "results", "baselines");
-    const scoreSummaryPath = join(rootDir, "results", "latest", "score-summary.json");
+export function saveBaseline(dirs, tag) {
+    const { baselinesDir, scoreSummaryPath } = dirs;
     if (!existsSync(scoreSummaryPath)) {
         return {
             message: "No score-summary.json found. Run 'pnpm calculate-scores' first.",
@@ -135,7 +129,7 @@ export function saveBaseline(rootDir, tag) {
     };
     writeFileSync(join(baselinesDir, filename), JSON.stringify(baseline, null, 2));
     return {
-        message: `Saved baseline to results/baselines/${filename} (avg: ${Math.round(summary.overall.avgScore)}, ${summary.scores.length} areas)`,
+        message: `Saved baseline to ${join(baselinesDir, filename)} (avg: ${Math.round(summary.overall.avgScore)}, ${summary.scores.length} areas)`,
         success: true,
     };
 }

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -531,6 +531,45 @@ function printAgentBehaviorReport(agentBehavior, log) {
 // ---------------------------------------------------------------------------
 // Grader cost extraction
 // ---------------------------------------------------------------------------
+/**
+ * Resolve a per-test cost. Promptfoo populates `r.cost` for most providers
+ * directly, but `openai:responses:` (and occasionally `openai:chat:`) leaves
+ * `cost` at 0 for newer models — Promptfoo's pricing table can lag the
+ * model launch. When `cost` is 0 but `response.tokenUsage` is recorded,
+ * fall back to AILF's local pricing table so the per-model rollup row
+ * isn't dropped on the floor. See W0123.
+ */
+function resolveTestCost(r) {
+    const promptfooCost = r.cost ?? 0;
+    if (promptfooCost > 0)
+        return promptfooCost;
+    const tokens = r.response?.tokenUsage;
+    if (!tokens)
+        return promptfooCost;
+    const model = extractModelFromProviderId(r.provider?.id);
+    if (!model)
+        return promptfooCost;
+    return calculateCost(model, tokens.prompt ?? 0, tokens.completion ?? 0);
+}
+/**
+ * Extract the model name from a Promptfoo provider id. Provider ids are
+ * colon-segmented `<vendor>:<surface>:<model>` (e.g. `openai:responses:gpt-5.4`,
+ * `anthropic:messages:claude-opus-4-6`); the model is the trailing segment.
+ * Returns undefined for ids that don't carry a model segment (e.g. agentic
+ * providers whose id ends in a `file://` URL).
+ */
+function extractModelFromProviderId(providerId) {
+    if (!providerId)
+        return undefined;
+    const parts = providerId.split(":");
+    if (parts.length < 2)
+        return undefined;
+    const last = parts[parts.length - 1];
+    if (!last || last.startsWith("file://") || last.startsWith("http")) {
+        return undefined;
+    }
+    return last;
+}
 /**
  * Reads the raw Promptfoo output file and normalizes each result so that
  * `description` is always a top-level field (pulled from `testCase` if needed).
@@ -551,7 +590,7 @@ function readAndNormalizeResults(resultsPath, log) {
     let synthesizedCount = 0;
     for (const r of wrapper.results) {
         const base = {
-            cost: r.cost ?? 0,
+            cost: resolveTestCost(r),
             description: r.testCase?.description ?? "unknown",
             latencyMs: r.latencyMs,
             metadata: r.metadata,

package/dist/pipeline/compiler/provider-assembler.d.ts CHANGED Viewed

@@ -6,6 +6,21 @@
  *
  * Separated into its own module so GenerateConfigsStep can import it
  * without pulling in the full legacy generate-configs machinery.
+ *
+ * W0134 — per-mode maxToolRounds
+ *
+ * The agentic naive variant gets a higher round budget than agentic
+ * optimized: naive simulates current real-world agent behavior under
+ * retrieval pressure (it spends rounds on retries when fetches fail) and
+ * benefits from more headroom; optimized bypasses Jina via the .md-direct
+ * branch and rarely needs more than a couple of rounds. Bumping globally
+ * would inflate optimized cost without changing its measured behavior.
+ *
+ * Resolution order (most specific wins):
+ *   1. `model.config.maxToolRounds` — per-model override.
+ *   2. `defaults.modeMaxToolRounds[variant]` — per-variant override.
+ *   3. `defaults.maxToolRounds` — global default.
+ *   4. Hard fallback (5).
  */
 import { type ModelsConfig } from "../../_vendor/ailf-core/index.d.ts";
 import type { ResolvedSourceConfig } from "../../sources.js";
@@ -37,3 +52,11 @@ export interface ModelsAndProviders {
  * the per-variant promptfoo config files.
  */
 export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[]): ModelsAndProviders;
+/**
+ * Resolve `maxToolRounds` for an agentic variant (W0134).
+ *
+ * Most-specific wins: per-model `config.maxToolRounds` > per-variant
+ * `defaults.modeMaxToolRounds[variant]` > global `defaults.maxToolRounds`
+ * > hard fallback (5).
+ */
+export declare function resolveMaxToolRounds(models: ModelsConfig, model: ModelsConfig["models"][number], variant: "agentic-naive" | "agentic-optimized"): number;

package/dist/pipeline/compiler/provider-assembler.js CHANGED Viewed

@@ -6,6 +6,21 @@
  *
  * Separated into its own module so GenerateConfigsStep can import it
  * without pulling in the full legacy generate-configs machinery.
+ *
+ * W0134 — per-mode maxToolRounds
+ *
+ * The agentic naive variant gets a higher round budget than agentic
+ * optimized: naive simulates current real-world agent behavior under
+ * retrieval pressure (it spends rounds on retries when fetches fail) and
+ * benefits from more headroom; optimized bypasses Jina via the .md-direct
+ * branch and rarely needs more than a couple of rounds. Bumping globally
+ * would inflate optimized cost without changing its measured behavior.
+ *
+ * Resolution order (most specific wins):
+ *   1. `model.config.maxToolRounds` — per-model override.
+ *   2. `defaults.modeMaxToolRounds[variant]` — per-variant override.
+ *   3. `defaults.maxToolRounds` — global default.
+ *   4. Hard fallback (5).
  */
 import { extractModelName, extractProvider, mergeConfig, } from "../../_vendor/ailf-core/index.js";
 import { loadConfigFile } from "./config-loader.js";
@@ -100,7 +115,7 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
             config: {
                 ...mergeConfig(models.defaults, model.config, {
                     agentMode: "naive",
-                    maxToolRounds: models.defaults.maxToolRounds ?? 5,
+                    maxToolRounds: resolveMaxToolRounds(models, model, "agentic-naive"),
                     model: modelName,
                     provider,
                 }),
@@ -120,7 +135,7 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
             config: {
                 ...mergeConfig(models.defaults, model.config, {
                     agentMode: "optimized",
-                    maxToolRounds: models.defaults.maxToolRounds ?? 5,
+                    maxToolRounds: resolveMaxToolRounds(models, model, "agentic-optimized"),
                     model: modelName,
                     provider,
                 }),
@@ -135,6 +150,26 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
     }
     return providers;
 }
+/**
+ * Resolve `maxToolRounds` for an agentic variant (W0134).
+ *
+ * Most-specific wins: per-model `config.maxToolRounds` > per-variant
+ * `defaults.modeMaxToolRounds[variant]` > global `defaults.maxToolRounds`
+ * > hard fallback (5).
+ */
+export function resolveMaxToolRounds(models, model, variant) {
+    const perModel = model.config?.maxToolRounds;
+    if (typeof perModel === "number")
+        return perModel;
+    const modeOverrides = models.defaults.modeMaxToolRounds;
+    const perVariant = modeOverrides?.[variant];
+    if (typeof perVariant === "number")
+        return perVariant;
+    const globalDefault = models.defaults.maxToolRounds;
+    if (typeof globalDefault === "number")
+        return globalDefault;
+    return 5;
+}
 // ---------------------------------------------------------------------------
 // Helpers
 // ---------------------------------------------------------------------------

package/dist/pipeline/mirror-repo-tasks.d.ts CHANGED Viewed

@@ -107,7 +107,7 @@ export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts:
     slugToDocId: Map<string, string>;
 }): {
     baseline?: {
-        rubric?: "abbreviated" | "full" | "none" | undefined;
+        rubric?: "full" | "abbreviated" | "none" | undefined;
         enabled?: boolean | undefined;
     } | undefined;
     _id: string;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sanity/ailf",
-  "version": "3.9.0",
+  "version": "4.0.0",
   "private": false,
   "publishConfig": {
     "access": "public"
@@ -77,7 +77,7 @@
     "test": "tsx --test src/__tests__/*.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
     "test:e2e": "AILF_E2E=1 tsx --test src/__tests__/e2e/*.e2e.test.ts",
     "test:e2e:adapters": "AILF_E2E=1 tsx --test src/adapters/**/__tests__/*.adapter.test.ts",
-    "test:e2e:api": "AILF_E2E_API=1 tsx --test src/__tests__/api-tier2-tenant-integration.test.ts src/__tests__/run-remote-tier2.test.ts",
+    "test:e2e:api": "AILF_E2E_API=1 tsx --test src/__tests__/api-tier2-tenant-integration.test.ts",
     "test:all": "AILF_E2E=1 tsx --test src/__tests__/*.test.ts src/pipeline/compiler/__tests__/*.test.ts src/__tests__/e2e/*.e2e.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
     "pr-comment": "tsx src/cli.ts pr-comment",
     "coverage-audit": "tsx src/cli.ts report coverage",