@sanity/ailf 3.9.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,9 +29,20 @@ export interface ScoreComparison {
29
29
  delta: number;
30
30
  feature: string;
31
31
  }
32
- export declare function compareBaseline(rootDir: string, baselineFile?: string): CompareResult;
33
- export declare function listBaselines(rootDir: string): BaselineMetadata[];
34
- export declare function saveBaseline(rootDir: string, tag?: string): {
32
+ /**
33
+ * Paths the baseline pipeline functions read and write. Callers compose this
34
+ * from caller-relative paths so the functions stay agnostic of where the
35
+ * eval package itself lives on disk (W0098).
36
+ */
37
+ export interface BaselineDirs {
38
+ /** Directory that contains baseline `*.json` snapshots. */
39
+ baselinesDir: string;
40
+ /** Absolute path to the current run's `score-summary.json`. */
41
+ scoreSummaryPath: string;
42
+ }
43
+ export declare function compareBaseline(dirs: BaselineDirs, baselineFile?: string): CompareResult;
44
+ export declare function listBaselines(baselinesDir: string): BaselineMetadata[];
45
+ export declare function saveBaseline(dirs: BaselineDirs, tag?: string): {
35
46
  success: boolean;
36
47
  message: string;
37
48
  };
@@ -7,12 +7,8 @@
7
7
  */
8
8
  import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
9
9
  import { join } from "path";
10
- // ---------------------------------------------------------------------------
11
- // Compare
12
- // ---------------------------------------------------------------------------
13
- export function compareBaseline(rootDir, baselineFile) {
14
- const baselinesDir = join(rootDir, "results", "baselines");
15
- const scoreSummaryPath = join(rootDir, "results", "latest", "score-summary.json");
10
+ export function compareBaseline(dirs, baselineFile) {
11
+ const { baselinesDir, scoreSummaryPath } = dirs;
16
12
  if (!existsSync(scoreSummaryPath)) {
17
13
  return {
18
14
  message: "No current score-summary.json found.",
@@ -20,7 +16,7 @@ export function compareBaseline(rootDir, baselineFile) {
20
16
  };
21
17
  }
22
18
  // Find baseline to compare against
23
- const baselines = listBaselines(rootDir);
19
+ const baselines = listBaselines(baselinesDir);
24
20
  if (baselines.length === 0) {
25
21
  return {
26
22
  message: "No baselines saved yet. Run 'pnpm baseline:save' first.",
@@ -76,8 +72,7 @@ export function compareBaseline(rootDir, baselineFile) {
76
72
  // ---------------------------------------------------------------------------
77
73
  // List
78
74
  // ---------------------------------------------------------------------------
79
- export function listBaselines(rootDir) {
80
- const baselinesDir = join(rootDir, "results", "baselines");
75
+ export function listBaselines(baselinesDir) {
81
76
  if (!existsSync(baselinesDir)) {
82
77
  return [];
83
78
  }
@@ -102,9 +97,8 @@ export function listBaselines(rootDir) {
102
97
  // ---------------------------------------------------------------------------
103
98
  // Save
104
99
  // ---------------------------------------------------------------------------
105
- export function saveBaseline(rootDir, tag) {
106
- const baselinesDir = join(rootDir, "results", "baselines");
107
- const scoreSummaryPath = join(rootDir, "results", "latest", "score-summary.json");
100
+ export function saveBaseline(dirs, tag) {
101
+ const { baselinesDir, scoreSummaryPath } = dirs;
108
102
  if (!existsSync(scoreSummaryPath)) {
109
103
  return {
110
104
  message: "No score-summary.json found. Run 'pnpm calculate-scores' first.",
@@ -135,7 +129,7 @@ export function saveBaseline(rootDir, tag) {
135
129
  };
136
130
  writeFileSync(join(baselinesDir, filename), JSON.stringify(baseline, null, 2));
137
131
  return {
138
- message: `Saved baseline to results/baselines/${filename} (avg: ${Math.round(summary.overall.avgScore)}, ${summary.scores.length} areas)`,
132
+ message: `Saved baseline to ${join(baselinesDir, filename)} (avg: ${Math.round(summary.overall.avgScore)}, ${summary.scores.length} areas)`,
139
133
  success: true,
140
134
  };
141
135
  }
@@ -531,6 +531,45 @@ function printAgentBehaviorReport(agentBehavior, log) {
531
531
  // ---------------------------------------------------------------------------
532
532
  // Grader cost extraction
533
533
  // ---------------------------------------------------------------------------
534
/**
 * Work out the cost to attribute to a single test result (W0123).
 *
 * Promptfoo normally fills in `r.cost` itself. For some providers —
 * `openai:responses:` and occasionally `openai:chat:` — it can leave the
 * value at 0 for newer models because its pricing table lags the model
 * launch. In that case, when `r.response.tokenUsage` was recorded and a
 * model name can be read from `r.provider.id`, the cost is recomputed via
 * `calculateCost` (AILF's local pricing table) so the per-model rollup row
 * is not lost.
 *
 * @param r A raw Promptfoo result entry.
 * @returns The Promptfoo-reported cost when it is positive; otherwise the
 *   locally calculated cost, or the reported value (0 when `r.cost` is
 *   missing) when token usage or the model name is unavailable.
 */
function resolveTestCost(r) {
    const reported = r.cost ?? 0;
    if (reported > 0) {
        return reported;
    }
    // Only look up the model when there is token usage to price.
    const usage = r.response?.tokenUsage;
    const modelName = usage
        ? extractModelFromProviderId(r.provider?.id)
        : undefined;
    if (!usage || !modelName) {
        return reported;
    }
    return calculateCost(modelName, usage.prompt ?? 0, usage.completion ?? 0);
}
554
/**
 * Extract the model name from a Promptfoo provider id. Provider ids are
 * colon-segmented `<vendor>:<surface>:<model>` (e.g. `openai:responses:gpt-5.4`,
 * `anthropic:messages:claude-opus-4-6`); the model is the trailing segment.
 *
 * Ids that embed a URL (e.g. agentic providers whose id is or ends in a
 * `file://` or `https://` URL) carry no model segment. They must be detected
 * on the whole id *before* splitting: splitting `file://x` on ":" yields a
 * trailing segment of `//x`, so a prefix check on that segment can never
 * match and the URL remainder would be returned as if it were a model name.
 *
 * @param providerId Promptfoo provider id; may be undefined or empty.
 * @returns The trailing model segment, or undefined when the id is missing,
 *   has fewer than two segments, has an empty trailing segment, or embeds
 *   a URL.
 */
function extractModelFromProviderId(providerId) {
    if (!providerId)
        return undefined;
    // A scheme separator anywhere in the id means the tail is a URL/path,
    // not a model name (this also covers `file://C:/...` style paths).
    if (providerId.includes("://"))
        return undefined;
    const parts = providerId.split(":");
    if (parts.length < 2)
        return undefined;
    const last = parts[parts.length - 1];
    // The "http" prefix rejection is kept from the original guard so that
    // results for non-URL ids are unchanged.
    if (!last || last.startsWith("http")) {
        return undefined;
    }
    return last;
}
534
573
  /**
535
574
  * Reads the raw Promptfoo output file and normalizes each result so that
536
575
  * `description` is always a top-level field (pulled from `testCase` if needed).
@@ -551,7 +590,7 @@ function readAndNormalizeResults(resultsPath, log) {
551
590
  let synthesizedCount = 0;
552
591
  for (const r of wrapper.results) {
553
592
  const base = {
554
- cost: r.cost ?? 0,
593
+ cost: resolveTestCost(r),
555
594
  description: r.testCase?.description ?? "unknown",
556
595
  latencyMs: r.latencyMs,
557
596
  metadata: r.metadata,
@@ -6,6 +6,21 @@
6
6
  *
7
7
  * Separated into its own module so GenerateConfigsStep can import it
8
8
  * without pulling in the full legacy generate-configs machinery.
9
+ *
10
+ * W0134 — per-mode maxToolRounds
11
+ *
12
+ * The agentic naive variant gets a higher round budget than agentic
13
+ * optimized: naive simulates current real-world agent behavior under
14
+ * retrieval pressure (it spends rounds on retries when fetches fail) and
15
+ * benefits from more headroom; optimized bypasses Jina via the .md-direct
16
+ * branch and rarely needs more than a couple of rounds. Bumping globally
17
+ * would inflate optimized cost without changing its measured behavior.
18
+ *
19
+ * Resolution order (most specific wins):
20
+ * 1. `model.config.maxToolRounds` — per-model override.
21
+ * 2. `defaults.modeMaxToolRounds[variant]` — per-variant override.
22
+ * 3. `defaults.maxToolRounds` — global default.
23
+ * 4. Hard fallback (5).
9
24
  */
10
25
  import { type ModelsConfig } from "../../_vendor/ailf-core/index.d.ts";
11
26
  import type { ResolvedSourceConfig } from "../../sources.js";
@@ -37,3 +52,11 @@ export interface ModelsAndProviders {
37
52
  * the per-variant promptfoo config files.
38
53
  */
39
54
  export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[]): ModelsAndProviders;
55
+ /**
56
+ * Resolve `maxToolRounds` for an agentic variant (W0134).
57
+ *
58
+ * Most-specific wins: per-model `config.maxToolRounds` > per-variant
59
+ * `defaults.modeMaxToolRounds[variant]` > global `defaults.maxToolRounds`
60
+ * > hard fallback (5).
61
+ */
62
+ export declare function resolveMaxToolRounds(models: ModelsConfig, model: ModelsConfig["models"][number], variant: "agentic-naive" | "agentic-optimized"): number;
@@ -6,6 +6,21 @@
6
6
  *
7
7
  * Separated into its own module so GenerateConfigsStep can import it
8
8
  * without pulling in the full legacy generate-configs machinery.
9
+ *
10
+ * W0134 — per-mode maxToolRounds
11
+ *
12
+ * The agentic naive variant gets a higher round budget than agentic
13
+ * optimized: naive simulates current real-world agent behavior under
14
+ * retrieval pressure (it spends rounds on retries when fetches fail) and
15
+ * benefits from more headroom; optimized bypasses Jina via the .md-direct
16
+ * branch and rarely needs more than a couple of rounds. Bumping globally
17
+ * would inflate optimized cost without changing its measured behavior.
18
+ *
19
+ * Resolution order (most specific wins):
20
+ * 1. `model.config.maxToolRounds` — per-model override.
21
+ * 2. `defaults.modeMaxToolRounds[variant]` — per-variant override.
22
+ * 3. `defaults.maxToolRounds` — global default.
23
+ * 4. Hard fallback (5).
9
24
  */
10
25
  import { extractModelName, extractProvider, mergeConfig, } from "../../_vendor/ailf-core/index.js";
11
26
  import { loadConfigFile } from "./config-loader.js";
@@ -100,7 +115,7 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
100
115
  config: {
101
116
  ...mergeConfig(models.defaults, model.config, {
102
117
  agentMode: "naive",
103
- maxToolRounds: models.defaults.maxToolRounds ?? 5,
118
+ maxToolRounds: resolveMaxToolRounds(models, model, "agentic-naive"),
104
119
  model: modelName,
105
120
  provider,
106
121
  }),
@@ -120,7 +135,7 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
120
135
  config: {
121
136
  ...mergeConfig(models.defaults, model.config, {
122
137
  agentMode: "optimized",
123
- maxToolRounds: models.defaults.maxToolRounds ?? 5,
138
+ maxToolRounds: resolveMaxToolRounds(models, model, "agentic-optimized"),
124
139
  model: modelName,
125
140
  provider,
126
141
  }),
@@ -135,6 +150,26 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
135
150
  }
136
151
  return providers;
137
152
  }
153
/**
 * Pick the `maxToolRounds` budget for one agentic variant (W0134).
 *
 * Candidates are consulted from most to least specific and the first one
 * that is a number wins:
 *   1. `model.config.maxToolRounds` — per-model override.
 *   2. `models.defaults.modeMaxToolRounds[variant]` — per-variant override.
 *   3. `models.defaults.maxToolRounds` — global default.
 *   4. Hard fallback of 5.
 *
 * @param models The models config; only `defaults` is read here.
 * @param model The model entry whose optional `config` may override.
 * @param variant The agentic variant key, e.g. "agentic-naive".
 * @returns The resolved round budget.
 */
export function resolveMaxToolRounds(models, model, variant) {
    // Lookups are thunks so `models.defaults` is only touched when a more
    // specific source did not already supply a number.
    const lookups = [
        () => model.config?.maxToolRounds,
        () => models.defaults.modeMaxToolRounds?.[variant],
        () => models.defaults.maxToolRounds,
    ];
    for (const lookup of lookups) {
        const rounds = lookup();
        if (typeof rounds === "number") {
            return rounds;
        }
    }
    return 5;
}
138
173
  // ---------------------------------------------------------------------------
139
174
  // Helpers
140
175
  // ---------------------------------------------------------------------------
@@ -107,7 +107,7 @@ export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts:
107
107
  slugToDocId: Map<string, string>;
108
108
  }): {
109
109
  baseline?: {
110
- rubric?: "abbreviated" | "full" | "none" | undefined;
110
+ rubric?: "full" | "abbreviated" | "none" | undefined;
111
111
  enabled?: boolean | undefined;
112
112
  } | undefined;
113
113
  _id: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "3.9.0",
3
+ "version": "4.0.0",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"
@@ -77,7 +77,7 @@
77
77
  "test": "tsx --test src/__tests__/*.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
78
78
  "test:e2e": "AILF_E2E=1 tsx --test src/__tests__/e2e/*.e2e.test.ts",
79
79
  "test:e2e:adapters": "AILF_E2E=1 tsx --test src/adapters/**/__tests__/*.adapter.test.ts",
80
- "test:e2e:api": "AILF_E2E_API=1 tsx --test src/__tests__/api-tier2-tenant-integration.test.ts src/__tests__/run-remote-tier2.test.ts",
80
+ "test:e2e:api": "AILF_E2E_API=1 tsx --test src/__tests__/api-tier2-tenant-integration.test.ts",
81
81
  "test:all": "AILF_E2E=1 tsx --test src/__tests__/*.test.ts src/pipeline/compiler/__tests__/*.test.ts src/__tests__/e2e/*.e2e.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
82
82
  "pr-comment": "tsx src/cli.ts pr-comment",
83
83
  "coverage-audit": "tsx src/cli.ts report coverage",