@sanity/ailf 3.9.0 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/models.ts +32 -4
- package/dist/_vendor/ailf-core/config-helpers.d.ts +8 -2
- package/dist/_vendor/ailf-core/config-helpers.js +54 -1
- package/dist/_vendor/ailf-core/services/slim-report-summary.js +13 -4
- package/dist/_vendor/ailf-core/types/index.d.ts +10 -0
- package/dist/_vendor/ailf-shared/index.d.ts +16 -10
- package/dist/_vendor/ailf-shared/index.js +13 -10
- package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
- package/dist/agent-observer/agentic-provider.js +28 -23
- package/dist/agent-observer/classifier.js +7 -2
- package/dist/agent-observer/proxy.d.ts +88 -3
- package/dist/agent-observer/proxy.js +174 -16
- package/dist/agent-observer/types.d.ts +23 -5
- package/dist/artifact-capture/accumulating-artifact-writer.d.ts +13 -0
- package/dist/artifact-capture/accumulating-artifact-writer.js +19 -0
- package/dist/cli-program.js +1 -1
- package/dist/commands/baseline.d.ts +3 -1
- package/dist/commands/baseline.js +29 -9
- package/dist/commands/cache.d.ts +5 -1
- package/dist/commands/cache.js +31 -15
- package/dist/commands/check-staleness.js +12 -4
- package/dist/commands/compare.js +11 -4
- package/dist/commands/explain-handler.js +2 -2
- package/dist/config/models.ts +32 -4
- package/dist/orchestration/steps/run-eval-step.js +39 -29
- package/dist/pipeline/baseline.d.ts +14 -3
- package/dist/pipeline/baseline.js +7 -13
- package/dist/pipeline/cache-hit-restore.d.ts +24 -0
- package/dist/pipeline/cache-hit-restore.js +32 -0
- package/dist/pipeline/calculate-scores.js +40 -1
- package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
- package/dist/pipeline/compiler/provider-assembler.js +37 -2
- package/dist/pipeline/eval-fingerprint.d.ts +33 -35
- package/dist/pipeline/eval-fingerprint.js +124 -106
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/report-store.js +3 -0
- package/package.json +2 -2
package/dist/config/models.ts
CHANGED
|
@@ -35,16 +35,23 @@ export default defineModels({
|
|
|
35
35
|
|
|
36
36
|
// ── OpenAI ─────────────────────────────────────────────────
|
|
37
37
|
{
|
|
38
|
+
// gpt-5.2 routes through chat completions (and through the in-house
|
|
39
|
+
// agentic provider for naive/optimized variants). `verbosity` is a
|
|
40
|
+
// Responses-API-only field — it would be silently dropped here, so
|
|
41
|
+
// it isn't configured. See W0131.
|
|
38
42
|
id: "openai:chat:gpt-5.2",
|
|
39
43
|
label: "GPT 5.2",
|
|
40
44
|
config: {
|
|
41
45
|
max_completion_tokens: 8192,
|
|
42
|
-
verbosity: "medium",
|
|
43
46
|
},
|
|
44
47
|
modes: ["literacy", "knowledge-probe"],
|
|
45
48
|
// All literacy variants included by default
|
|
46
49
|
},
|
|
47
50
|
{
|
|
51
|
+
// GPT 5.4 evaluated only on the baseline literacy variant. Promptfoo's
|
|
52
|
+
// native handling of `openai:responses:` honors reasoning / verbosity /
|
|
53
|
+
// summary; the in-house agentic provider does not (W0131). MCP-server
|
|
54
|
+
// and knowledge-probe routes go through Promptfoo native too.
|
|
48
55
|
id: "openai:responses:gpt-5.4",
|
|
49
56
|
label: "GPT 5.4",
|
|
50
57
|
config: {
|
|
@@ -55,7 +62,9 @@ export default defineModels({
|
|
|
55
62
|
},
|
|
56
63
|
timeoutMs: 600_000, // 10 min — reasoning model needs more headroom
|
|
57
64
|
modes: ["literacy", "mcp-server", "knowledge-probe"],
|
|
58
|
-
|
|
65
|
+
variants: {
|
|
66
|
+
literacy: ["baseline"],
|
|
67
|
+
},
|
|
59
68
|
},
|
|
60
69
|
|
|
61
70
|
// ── Disabled models (uncomment to enable) ──────────────────
|
|
@@ -93,12 +102,31 @@ export default defineModels({
|
|
|
93
102
|
defaults: {
|
|
94
103
|
temperature: 0.2,
|
|
95
104
|
max_tokens: 4096,
|
|
96
|
-
|
|
105
|
+
// Global default round budget for agentic modes. Per-mode overrides
|
|
106
|
+
// below give naive more headroom (W0134) since it spends rounds on
|
|
107
|
+
// retries when fetches fail. Per-model `config.maxToolRounds` still
|
|
108
|
+
// wins over both values.
|
|
109
|
+
maxToolRounds: 5,
|
|
110
|
+
modeMaxToolRounds: {
|
|
111
|
+
"agentic-naive": 8,
|
|
112
|
+
"agentic-optimized": 5,
|
|
113
|
+
},
|
|
97
114
|
observerOptions: {
|
|
98
|
-
|
|
115
|
+
// Per-class preview caps (W0133): default 4 KB, but search responses
|
|
116
|
+
// get 16 KB and llms.txt gets 128 KB so trace audits can resolve
|
|
117
|
+
// which result the model actually saw.
|
|
118
|
+
maxPreviewBytes: 4096,
|
|
119
|
+
previewLimits: {
|
|
120
|
+
default: 4096,
|
|
121
|
+
llmsTxt: 131072,
|
|
122
|
+
search: 16384,
|
|
123
|
+
},
|
|
99
124
|
captureResponsePreview: true,
|
|
100
125
|
includePatterns: ["sanity.io", "sanity.dev", "cdn.sanity.io"],
|
|
101
126
|
sensitiveHeaders: ["authorization", "cookie", "x-api-key"],
|
|
127
|
+
// statusOnlyForUnmatched defaults to true (W0132) — model-side
|
|
128
|
+
// traffic to api.openai.com / api.anthropic.com / googleapis.com
|
|
129
|
+
// surfaces in run artifacts as slim status-only entries.
|
|
102
130
|
},
|
|
103
131
|
},
|
|
104
132
|
})
|
|
@@ -8,10 +8,13 @@
|
|
|
8
8
|
import { existsSync, mkdirSync, writeFileSync } from "fs";
|
|
9
9
|
import { resolve } from "path";
|
|
10
10
|
import { emitPerEntryEvalResults } from "../../pipeline/emit-eval-results.js";
|
|
11
|
+
import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
|
|
11
12
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
12
13
|
import { buildCacheContext } from "../cache-context.js";
|
|
14
|
+
import { remapToCacheHitRefs } from "../../pipeline/cache-hit-restore.js";
|
|
13
15
|
import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../../pipeline/checks.js";
|
|
14
16
|
import { computeEvalFingerprint } from "../../pipeline/eval-fingerprint.js";
|
|
17
|
+
import { loadGraderModel } from "../../pipeline/grader-api.js";
|
|
15
18
|
import { buildFilterFlags, configFileForMode, resultsFileForMode, scanResultsForErrors, } from "../../pipeline/eval-constants.js";
|
|
16
19
|
export class RunEvalStep {
|
|
17
20
|
mode;
|
|
@@ -39,31 +42,31 @@ export class RunEvalStep {
|
|
|
39
42
|
status: "failed",
|
|
40
43
|
};
|
|
41
44
|
}
|
|
45
|
+
// Load the task set once and reuse it for both the literacy precondition
|
|
46
|
+
// check and the fingerprint. Mirrors the area/task filter applied by
|
|
47
|
+
// fetch-docs so we only see tasks that were actually fetched.
|
|
48
|
+
const filter = ctx.config.areas || ctx.config.tasks || ctx.config.tags
|
|
49
|
+
? {
|
|
50
|
+
...(ctx.config.areas ? { areas: ctx.config.areas } : {}),
|
|
51
|
+
...(ctx.config.tasks ? { taskIds: ctx.config.tasks } : {}),
|
|
52
|
+
...(ctx.config.tags ? { tags: ctx.config.tags } : {}),
|
|
53
|
+
}
|
|
54
|
+
: undefined;
|
|
55
|
+
let tasks = await ctx.taskSource.loadTasks(filter);
|
|
56
|
+
// Release auto-scope: narrow to affected tasks (mirrors GenerateConfigsStep)
|
|
57
|
+
if (state.releaseAutoScope && !ctx.config.noAutoScope) {
|
|
58
|
+
const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
|
|
59
|
+
tasks = tasks.filter((t) => scopedIds.has(t.id));
|
|
60
|
+
}
|
|
42
61
|
// Precondition: canonical context files exist for filtered tasks.
|
|
43
62
|
// Only applies to literacy mode — other modes don't use canonical doc contexts.
|
|
44
63
|
if (this.mode === "literacy") {
|
|
45
|
-
// Must apply the same area/task filter as fetch-docs so we only
|
|
46
|
-
// check contexts that were actually fetched.
|
|
47
|
-
const filter = ctx.config.areas || ctx.config.tasks || ctx.config.tags
|
|
48
|
-
? {
|
|
49
|
-
...(ctx.config.areas ? { areas: ctx.config.areas } : {}),
|
|
50
|
-
...(ctx.config.tasks ? { taskIds: ctx.config.tasks } : {}),
|
|
51
|
-
...(ctx.config.tags ? { tags: ctx.config.tags } : {}),
|
|
52
|
-
}
|
|
53
|
-
: undefined;
|
|
54
|
-
let tasks = await ctx.taskSource.loadTasks(filter);
|
|
55
|
-
// Release auto-scope: narrow to affected tasks (mirrors GenerateConfigsStep)
|
|
56
|
-
if (state.releaseAutoScope && !ctx.config.noAutoScope) {
|
|
57
|
-
const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
|
|
58
|
-
tasks = tasks.filter((t) => scopedIds.has(t.id));
|
|
59
|
-
}
|
|
60
64
|
// Only check context files for tasks that have canonical docs.
|
|
61
65
|
// Tasks without canonical docs are skipped by FetchDocsStep (they
|
|
62
66
|
// have no docs to fetch), so no context file is written for them.
|
|
63
67
|
// The generated Promptfoo config still includes their "without-docs"
|
|
64
68
|
// variant (testing model knowledge alone), which doesn't need a
|
|
65
69
|
// context file.
|
|
66
|
-
// Bridge: narrow to literacy tasks with docs
|
|
67
70
|
const tasksWithDocs = tasks.filter((t) => t.mode === "literacy" && (t.context?.docs?.length ?? 0) > 0);
|
|
68
71
|
const taskIds = tasksWithDocs.map((t) => t.id);
|
|
69
72
|
const contextIssues = checkCanonicalContextsExist(rootDir, taskIds);
|
|
@@ -83,14 +86,8 @@ export class RunEvalStep {
|
|
|
83
86
|
if (!debug?.enabled) {
|
|
84
87
|
try {
|
|
85
88
|
evalFingerprint = computeEvalFingerprint({
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
areas: ctx.config.areas,
|
|
89
|
-
taskIds: ctx.config.tasks,
|
|
90
|
-
tags: ctx.config.tags,
|
|
91
|
-
}
|
|
92
|
-
: undefined,
|
|
93
|
-
graderModel: "default",
|
|
89
|
+
tasks,
|
|
90
|
+
graderModel: loadGraderModel(rootDir).id,
|
|
94
91
|
mode: this.mode,
|
|
95
92
|
rootDir,
|
|
96
93
|
});
|
|
@@ -119,11 +116,22 @@ export class RunEvalStep {
|
|
|
119
116
|
state.promptfooUrls ??= [];
|
|
120
117
|
state.promptfooUrls.push(...remoteCacheResult.promptfooUrls);
|
|
121
118
|
}
|
|
122
|
-
//
|
|
123
|
-
//
|
|
124
|
-
//
|
|
125
|
-
//
|
|
126
|
-
//
|
|
119
|
+
// D0040 / W0135 — restore the cached report's artifact manifest into
|
|
120
|
+
// the accumulator so the new run's RunManifest advertises the cached
|
|
121
|
+
// artifacts via cross-run lineage (`sourceRunId`) instead of skipping
|
|
122
|
+
// them entirely. Without this, Studio drill-downs on the new report
|
|
123
|
+
// 404 because per-entry GCS objects were never written under the new
|
|
124
|
+
// runId. Bytes are not duplicated; the original prefix is untouched.
|
|
125
|
+
if (remoteCacheResult.artifactManifest &&
|
|
126
|
+
remoteCacheResult.sourceRunId &&
|
|
127
|
+
ctx.artifactWriter instanceof AccumulatingArtifactWriter) {
|
|
128
|
+
const restored = remapToCacheHitRefs(remoteCacheResult.artifactManifest, { sourceRunId: remoteCacheResult.sourceRunId });
|
|
129
|
+
ctx.artifactWriter.injectAccumulated(restored);
|
|
130
|
+
const count = Object.keys(restored).length;
|
|
131
|
+
if (count > 0) {
|
|
132
|
+
console.log(` ↪ Restored ${count} artifact ref${count === 1 ? "" : "s"} from run ${remoteCacheResult.sourceRunId}`);
|
|
133
|
+
}
|
|
134
|
+
}
|
|
127
135
|
return {
|
|
128
136
|
durationMs: Date.now() - start,
|
|
129
137
|
status: "success",
|
|
@@ -241,9 +249,11 @@ async function checkRemoteCache(fingerprint, reportStore, rootDir) {
|
|
|
241
249
|
console.log(` ✅ Remote cache hit — reusing report ${cachedReport.id} from ${cachedReport.completedAt}`);
|
|
242
250
|
console.log(` ℹ️ Fingerprint: ${fingerprint.slice(0, 16)}... (${queryMs}ms)`);
|
|
243
251
|
return {
|
|
252
|
+
artifactManifest: cachedReport.artifactManifest,
|
|
244
253
|
completedAt: cachedReport.completedAt,
|
|
245
254
|
promptfooUrls: cachedReport.provenance?.promptfooUrls,
|
|
246
255
|
reportId: cachedReport.id,
|
|
256
|
+
sourceRunId: cachedReport.provenance?.runId,
|
|
247
257
|
};
|
|
248
258
|
}
|
|
249
259
|
catch (err) {
|
|
@@ -29,9 +29,20 @@ export interface ScoreComparison {
|
|
|
29
29
|
delta: number;
|
|
30
30
|
feature: string;
|
|
31
31
|
}
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
32
|
+
/**
|
|
33
|
+
* Paths the baseline pipeline functions read and write. Callers compose this
|
|
34
|
+
* from caller-relative paths so the functions stay agnostic of where the
|
|
35
|
+
* eval package itself lives on disk (W0098).
|
|
36
|
+
*/
|
|
37
|
+
export interface BaselineDirs {
|
|
38
|
+
/** Directory that contains baseline `*.json` snapshots. */
|
|
39
|
+
baselinesDir: string;
|
|
40
|
+
/** Absolute path to the current run's `score-summary.json`. */
|
|
41
|
+
scoreSummaryPath: string;
|
|
42
|
+
}
|
|
43
|
+
export declare function compareBaseline(dirs: BaselineDirs, baselineFile?: string): CompareResult;
|
|
44
|
+
export declare function listBaselines(baselinesDir: string): BaselineMetadata[];
|
|
45
|
+
export declare function saveBaseline(dirs: BaselineDirs, tag?: string): {
|
|
35
46
|
success: boolean;
|
|
36
47
|
message: string;
|
|
37
48
|
};
|
|
@@ -7,12 +7,8 @@
|
|
|
7
7
|
*/
|
|
8
8
|
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
|
|
9
9
|
import { join } from "path";
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
// ---------------------------------------------------------------------------
|
|
13
|
-
export function compareBaseline(rootDir, baselineFile) {
|
|
14
|
-
const baselinesDir = join(rootDir, "results", "baselines");
|
|
15
|
-
const scoreSummaryPath = join(rootDir, "results", "latest", "score-summary.json");
|
|
10
|
+
export function compareBaseline(dirs, baselineFile) {
|
|
11
|
+
const { baselinesDir, scoreSummaryPath } = dirs;
|
|
16
12
|
if (!existsSync(scoreSummaryPath)) {
|
|
17
13
|
return {
|
|
18
14
|
message: "No current score-summary.json found.",
|
|
@@ -20,7 +16,7 @@ export function compareBaseline(rootDir, baselineFile) {
|
|
|
20
16
|
};
|
|
21
17
|
}
|
|
22
18
|
// Find baseline to compare against
|
|
23
|
-
const baselines = listBaselines(
|
|
19
|
+
const baselines = listBaselines(baselinesDir);
|
|
24
20
|
if (baselines.length === 0) {
|
|
25
21
|
return {
|
|
26
22
|
message: "No baselines saved yet. Run 'pnpm baseline:save' first.",
|
|
@@ -76,8 +72,7 @@ export function compareBaseline(rootDir, baselineFile) {
|
|
|
76
72
|
// ---------------------------------------------------------------------------
|
|
77
73
|
// List
|
|
78
74
|
// ---------------------------------------------------------------------------
|
|
79
|
-
export function listBaselines(
|
|
80
|
-
const baselinesDir = join(rootDir, "results", "baselines");
|
|
75
|
+
export function listBaselines(baselinesDir) {
|
|
81
76
|
if (!existsSync(baselinesDir)) {
|
|
82
77
|
return [];
|
|
83
78
|
}
|
|
@@ -102,9 +97,8 @@ export function listBaselines(rootDir) {
|
|
|
102
97
|
// ---------------------------------------------------------------------------
|
|
103
98
|
// Save
|
|
104
99
|
// ---------------------------------------------------------------------------
|
|
105
|
-
export function saveBaseline(
|
|
106
|
-
const baselinesDir
|
|
107
|
-
const scoreSummaryPath = join(rootDir, "results", "latest", "score-summary.json");
|
|
100
|
+
export function saveBaseline(dirs, tag) {
|
|
101
|
+
const { baselinesDir, scoreSummaryPath } = dirs;
|
|
108
102
|
if (!existsSync(scoreSummaryPath)) {
|
|
109
103
|
return {
|
|
110
104
|
message: "No score-summary.json found. Run 'pnpm calculate-scores' first.",
|
|
@@ -135,7 +129,7 @@ export function saveBaseline(rootDir, tag) {
|
|
|
135
129
|
};
|
|
136
130
|
writeFileSync(join(baselinesDir, filename), JSON.stringify(baseline, null, 2));
|
|
137
131
|
return {
|
|
138
|
-
message: `Saved baseline to
|
|
132
|
+
message: `Saved baseline to ${join(baselinesDir, filename)} (avg: ${Math.round(summary.overall.avgScore)}, ${summary.scores.length} areas)`,
|
|
139
133
|
success: true,
|
|
140
134
|
};
|
|
141
135
|
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* cache-hit-restore.ts — helpers for the eval cache-hit branch in
|
|
3
|
+
* `RunEvalStep`. Stamps `sourceRunId` onto a cached report's artifact
|
|
4
|
+
* refs so the new run's manifest advertises the cached artifacts via
|
|
5
|
+
* cross-run lineage instead of pointing at GCS objects that were never
|
|
6
|
+
* written under the new runId.
|
|
7
|
+
*
|
|
8
|
+
* @see docs/decisions/D0040-artifact-ref-source-run-id.md
|
|
9
|
+
* @see docs/design-docs/cache-hit-artifact-restoration.md
|
|
10
|
+
*/
|
|
11
|
+
import type { ArtifactManifest, RunId } from "../_vendor/ailf-core/index.d.ts";
|
|
12
|
+
/**
|
|
13
|
+
* Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref.
|
|
14
|
+
*
|
|
15
|
+
* The ref's `path`, `bucket`, `entries`, `bytes`, `preview`, etc. travel
|
|
16
|
+
* unchanged — they already point at the source run's storage. Only
|
|
17
|
+
* `sourceRunId` is added so retention/GC and observability tooling can
|
|
18
|
+
* follow the cross-run dependency.
|
|
19
|
+
*
|
|
20
|
+
* Pure function; safe to call without side effects.
|
|
21
|
+
*/
|
|
22
|
+
export declare function remapToCacheHitRefs(source: ArtifactManifest, opts: {
|
|
23
|
+
sourceRunId: RunId;
|
|
24
|
+
}): ArtifactManifest;
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* cache-hit-restore.ts — helpers for the eval cache-hit branch in
|
|
3
|
+
* `RunEvalStep`. Stamps `sourceRunId` onto a cached report's artifact
|
|
4
|
+
* refs so the new run's manifest advertises the cached artifacts via
|
|
5
|
+
* cross-run lineage instead of pointing at GCS objects that were never
|
|
6
|
+
* written under the new runId.
|
|
7
|
+
*
|
|
8
|
+
* @see docs/decisions/D0040-artifact-ref-source-run-id.md
|
|
9
|
+
* @see docs/design-docs/cache-hit-artifact-restoration.md
|
|
10
|
+
*/
|
|
11
|
+
/**
|
|
12
|
+
* Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref.
|
|
13
|
+
*
|
|
14
|
+
* The ref's `path`, `bucket`, `entries`, `bytes`, `preview`, etc. travel
|
|
15
|
+
* unchanged — they already point at the source run's storage. Only
|
|
16
|
+
* `sourceRunId` is added so retention/GC and observability tooling can
|
|
17
|
+
* follow the cross-run dependency.
|
|
18
|
+
*
|
|
19
|
+
* Pure function; safe to call without side effects.
|
|
20
|
+
*/
|
|
21
|
+
export function remapToCacheHitRefs(source, opts) {
|
|
22
|
+
const out = {};
|
|
23
|
+
for (const [type, ref] of Object.entries(source)) {
|
|
24
|
+
if (!ref)
|
|
25
|
+
continue;
|
|
26
|
+
out[type] = {
|
|
27
|
+
...ref,
|
|
28
|
+
sourceRunId: opts.sourceRunId,
|
|
29
|
+
};
|
|
30
|
+
}
|
|
31
|
+
return out;
|
|
32
|
+
}
|
|
@@ -531,6 +531,45 @@ function printAgentBehaviorReport(agentBehavior, log) {
|
|
|
531
531
|
// ---------------------------------------------------------------------------
|
|
532
532
|
// Grader cost extraction
|
|
533
533
|
// ---------------------------------------------------------------------------
|
|
534
|
+
/**
|
|
535
|
+
* Resolve a per-test cost. Promptfoo populates `r.cost` for most providers
|
|
536
|
+
* directly, but `openai:responses:` (and occasionally `openai:chat:`) leaves
|
|
537
|
+
* `cost` at 0 for newer models — Promptfoo's pricing table can lag the
|
|
538
|
+
* model launch. When `cost` is 0 but `response.tokenUsage` is recorded,
|
|
539
|
+
* fall back to AILF's local pricing table so the per-model rollup row
|
|
540
|
+
* isn't dropped on the floor. See W0123.
|
|
541
|
+
*/
|
|
542
|
+
function resolveTestCost(r) {
|
|
543
|
+
const promptfooCost = r.cost ?? 0;
|
|
544
|
+
if (promptfooCost > 0)
|
|
545
|
+
return promptfooCost;
|
|
546
|
+
const tokens = r.response?.tokenUsage;
|
|
547
|
+
if (!tokens)
|
|
548
|
+
return promptfooCost;
|
|
549
|
+
const model = extractModelFromProviderId(r.provider?.id);
|
|
550
|
+
if (!model)
|
|
551
|
+
return promptfooCost;
|
|
552
|
+
return calculateCost(model, tokens.prompt ?? 0, tokens.completion ?? 0);
|
|
553
|
+
}
|
|
554
|
+
/**
|
|
555
|
+
* Extract the model name from a Promptfoo provider id. Provider ids are
|
|
556
|
+
* colon-segmented `<vendor>:<surface>:<model>` (e.g. `openai:responses:gpt-5.4`,
|
|
557
|
+
* `anthropic:messages:claude-opus-4-6`); the model is the trailing segment.
|
|
558
|
+
* Returns undefined for ids that don't carry a model segment (e.g. agentic
|
|
559
|
+
* providers whose id ends in a `file://` URL).
|
|
560
|
+
*/
|
|
561
|
+
function extractModelFromProviderId(providerId) {
|
|
562
|
+
if (!providerId)
|
|
563
|
+
return undefined;
|
|
564
|
+
const parts = providerId.split(":");
|
|
565
|
+
if (parts.length < 2)
|
|
566
|
+
return undefined;
|
|
567
|
+
const last = parts[parts.length - 1];
|
|
568
|
+
if (!last || last.startsWith("file://") || last.startsWith("http")) {
|
|
569
|
+
return undefined;
|
|
570
|
+
}
|
|
571
|
+
return last;
|
|
572
|
+
}
|
|
534
573
|
/**
|
|
535
574
|
* Reads the raw Promptfoo output file and normalizes each result so that
|
|
536
575
|
* `description` is always a top-level field (pulled from `testCase` if needed).
|
|
@@ -551,7 +590,7 @@ function readAndNormalizeResults(resultsPath, log) {
|
|
|
551
590
|
let synthesizedCount = 0;
|
|
552
591
|
for (const r of wrapper.results) {
|
|
553
592
|
const base = {
|
|
554
|
-
cost: r
|
|
593
|
+
cost: resolveTestCost(r),
|
|
555
594
|
description: r.testCase?.description ?? "unknown",
|
|
556
595
|
latencyMs: r.latencyMs,
|
|
557
596
|
metadata: r.metadata,
|
|
@@ -6,6 +6,21 @@
|
|
|
6
6
|
*
|
|
7
7
|
* Separated into its own module so GenerateConfigsStep can import it
|
|
8
8
|
* without pulling in the full legacy generate-configs machinery.
|
|
9
|
+
*
|
|
10
|
+
* W0134 — per-mode maxToolRounds
|
|
11
|
+
*
|
|
12
|
+
* The agentic naive variant gets a higher round budget than agentic
|
|
13
|
+
* optimized: naive simulates current real-world agent behavior under
|
|
14
|
+
* retrieval pressure (it spends rounds on retries when fetches fail) and
|
|
15
|
+
* benefits from more headroom; optimized bypasses Jina via the .md-direct
|
|
16
|
+
* branch and rarely needs more than a couple of rounds. Bumping globally
|
|
17
|
+
* would inflate optimized cost without changing its measured behavior.
|
|
18
|
+
*
|
|
19
|
+
* Resolution order (most specific wins):
|
|
20
|
+
* 1. `model.config.maxToolRounds` — per-model override.
|
|
21
|
+
* 2. `defaults.modeMaxToolRounds[variant]` — per-variant override.
|
|
22
|
+
* 3. `defaults.maxToolRounds` — global default.
|
|
23
|
+
* 4. Hard fallback (5).
|
|
9
24
|
*/
|
|
10
25
|
import { type ModelsConfig } from "../../_vendor/ailf-core/index.d.ts";
|
|
11
26
|
import type { ResolvedSourceConfig } from "../../sources.js";
|
|
@@ -37,3 +52,11 @@ export interface ModelsAndProviders {
|
|
|
37
52
|
* the per-variant promptfoo config files.
|
|
38
53
|
*/
|
|
39
54
|
export declare function loadModelsAndProviders(rootDir: string, source?: ResolvedSourceConfig, searchMode?: string, allowedOrigins?: string[]): ModelsAndProviders;
|
|
55
|
+
/**
|
|
56
|
+
* Resolve `maxToolRounds` for an agentic variant (W0134).
|
|
57
|
+
*
|
|
58
|
+
* Most-specific wins: per-model `config.maxToolRounds` > per-variant
|
|
59
|
+
* `defaults.modeMaxToolRounds[variant]` > global `defaults.maxToolRounds`
|
|
60
|
+
* > hard fallback (5).
|
|
61
|
+
*/
|
|
62
|
+
export declare function resolveMaxToolRounds(models: ModelsConfig, model: ModelsConfig["models"][number], variant: "agentic-naive" | "agentic-optimized"): number;
|
|
@@ -6,6 +6,21 @@
|
|
|
6
6
|
*
|
|
7
7
|
* Separated into its own module so GenerateConfigsStep can import it
|
|
8
8
|
* without pulling in the full legacy generate-configs machinery.
|
|
9
|
+
*
|
|
10
|
+
* W0134 — per-mode maxToolRounds
|
|
11
|
+
*
|
|
12
|
+
* The agentic naive variant gets a higher round budget than agentic
|
|
13
|
+
* optimized: naive simulates current real-world agent behavior under
|
|
14
|
+
* retrieval pressure (it spends rounds on retries when fetches fail) and
|
|
15
|
+
* benefits from more headroom; optimized bypasses Jina via the .md-direct
|
|
16
|
+
* branch and rarely needs more than a couple of rounds. Bumping globally
|
|
17
|
+
* would inflate optimized cost without changing its measured behavior.
|
|
18
|
+
*
|
|
19
|
+
* Resolution order (most specific wins):
|
|
20
|
+
* 1. `model.config.maxToolRounds` — per-model override.
|
|
21
|
+
* 2. `defaults.modeMaxToolRounds[variant]` — per-variant override.
|
|
22
|
+
* 3. `defaults.maxToolRounds` — global default.
|
|
23
|
+
* 4. Hard fallback (5).
|
|
9
24
|
*/
|
|
10
25
|
import { extractModelName, extractProvider, mergeConfig, } from "../../_vendor/ailf-core/index.js";
|
|
11
26
|
import { loadConfigFile } from "./config-loader.js";
|
|
@@ -100,7 +115,7 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
|
|
|
100
115
|
config: {
|
|
101
116
|
...mergeConfig(models.defaults, model.config, {
|
|
102
117
|
agentMode: "naive",
|
|
103
|
-
maxToolRounds: models
|
|
118
|
+
maxToolRounds: resolveMaxToolRounds(models, model, "agentic-naive"),
|
|
104
119
|
model: modelName,
|
|
105
120
|
provider,
|
|
106
121
|
}),
|
|
@@ -120,7 +135,7 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
|
|
|
120
135
|
config: {
|
|
121
136
|
...mergeConfig(models.defaults, model.config, {
|
|
122
137
|
agentMode: "optimized",
|
|
123
|
-
maxToolRounds: models
|
|
138
|
+
maxToolRounds: resolveMaxToolRounds(models, model, "agentic-optimized"),
|
|
124
139
|
model: modelName,
|
|
125
140
|
provider,
|
|
126
141
|
}),
|
|
@@ -135,6 +150,26 @@ function buildAgenticProviders(models, source, searchMode, _allowedOrigins) {
|
|
|
135
150
|
}
|
|
136
151
|
return providers;
|
|
137
152
|
}
|
|
153
|
+
/**
|
|
154
|
+
* Resolve `maxToolRounds` for an agentic variant (W0134).
|
|
155
|
+
*
|
|
156
|
+
* Most-specific wins: per-model `config.maxToolRounds` > per-variant
|
|
157
|
+
* `defaults.modeMaxToolRounds[variant]` > global `defaults.maxToolRounds`
|
|
158
|
+
* > hard fallback (5).
|
|
159
|
+
*/
|
|
160
|
+
export function resolveMaxToolRounds(models, model, variant) {
|
|
161
|
+
const perModel = model.config?.maxToolRounds;
|
|
162
|
+
if (typeof perModel === "number")
|
|
163
|
+
return perModel;
|
|
164
|
+
const modeOverrides = models.defaults.modeMaxToolRounds;
|
|
165
|
+
const perVariant = modeOverrides?.[variant];
|
|
166
|
+
if (typeof perVariant === "number")
|
|
167
|
+
return perVariant;
|
|
168
|
+
const globalDefault = models.defaults.maxToolRounds;
|
|
169
|
+
if (typeof globalDefault === "number")
|
|
170
|
+
return globalDefault;
|
|
171
|
+
return 5;
|
|
172
|
+
}
|
|
138
173
|
// ---------------------------------------------------------------------------
|
|
139
174
|
// Helpers
|
|
140
175
|
// ---------------------------------------------------------------------------
|
|
@@ -6,30 +6,35 @@
|
|
|
6
6
|
* pipeline can query the Sanity Content Lake for a previous report with an
|
|
7
7
|
* identical fingerprint and skip the expensive eval step.
|
|
8
8
|
*
|
|
9
|
-
* The fingerprint captures
|
|
9
|
+
* The fingerprint captures:
|
|
10
10
|
* - Evaluation mode (baseline, observed, agentic)
|
|
11
|
-
* - Model configuration (which models, their settings)
|
|
12
11
|
* - Grader model identity (different graders score differently)
|
|
13
|
-
* -
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
* -
|
|
18
|
-
* -
|
|
12
|
+
* - The task set that was actually loaded for this run, in its canonical
|
|
13
|
+
* shape (taken straight from `ctx.taskSource.loadTasks(filter)` so that
|
|
14
|
+
* Studio-authored task edits in the Content Lake are picked up — pre-v2
|
|
15
|
+
* the fingerprint walked `tasks/` on disk and missed them entirely).
|
|
16
|
+
* - Repo-tracked config (models, prompts, rubrics) and reference solutions.
|
|
17
|
+
* - Fetched canonical doc content (contexts/canonical/*.md).
|
|
19
18
|
*
|
|
20
19
|
* The fingerprint intentionally EXCLUDES:
|
|
21
|
-
* - Source name/URL (content matters, not origin)
|
|
22
|
-
* - Git metadata (informational, not eval-affecting)
|
|
23
|
-
* - Trigger type (manual vs CI → same inputs → same results)
|
|
24
|
-
* - Report tags (human labels)
|
|
20
|
+
* - Source name/URL (content matters, not origin).
|
|
21
|
+
* - Git metadata (informational, not eval-affecting).
|
|
22
|
+
* - Trigger type (manual vs CI → same inputs → same results).
|
|
23
|
+
* - Report tags (human labels).
|
|
25
24
|
*
|
|
26
25
|
* @see docs/design-docs/content-lake-eval-caching.md
|
|
27
26
|
*/
|
|
28
|
-
import type {
|
|
27
|
+
import type { GeneralizedTaskDefinition } from "../_vendor/ailf-core/index.d.ts";
|
|
28
|
+
import type { EvalMode } from "../_vendor/ailf-shared/index.d.ts";
|
|
29
29
|
/** Inputs needed to compute an evaluation fingerprint. */
|
|
30
30
|
export interface FingerprintInput {
|
|
31
|
-
/**
|
|
32
|
-
|
|
31
|
+
/**
|
|
32
|
+
* Task definitions returned by `ctx.taskSource.loadTasks(filter)` after
|
|
33
|
+
* any release-auto-scope narrowing has been applied. The fingerprint
|
|
34
|
+
* captures whatever set the pipeline is actually about to evaluate, so
|
|
35
|
+
* filter changes are reflected implicitly.
|
|
36
|
+
*/
|
|
37
|
+
tasks: readonly GeneralizedTaskDefinition[];
|
|
33
38
|
/** Grader model identifier (e.g., "anthropic:messages:claude-opus-4-5-20251101") */
|
|
34
39
|
graderModel: string;
|
|
35
40
|
/** Evaluation mode */
|
|
@@ -37,30 +42,23 @@ export interface FingerprintInput {
|
|
|
37
42
|
/** Path to the packages/eval root directory */
|
|
38
43
|
rootDir: string;
|
|
39
44
|
}
|
|
40
|
-
/**
|
|
41
|
-
* Collect all file paths that contribute to the evaluation fingerprint.
|
|
42
|
-
*
|
|
43
|
-
* This is similar to `getStepInputPaths()` in `cache.ts` but is more
|
|
44
|
-
* comprehensive and explicitly designed for cross-environment cache keys:
|
|
45
|
-
*
|
|
46
|
-
* - Includes `config/prompts` and `config/rubrics` directly
|
|
47
|
-
* (the local cache only includes them indirectly via generated configs)
|
|
48
|
-
* - Includes `config/models` (model configuration)
|
|
49
|
-
* - Includes task definitions and reference solutions
|
|
50
|
-
* - Includes the actual documentation content (contexts/canonical/*.md)
|
|
51
|
-
* - Respects filter flags to only include relevant files
|
|
52
|
-
*/
|
|
53
|
-
export declare function collectFingerprintInputPaths(rootDir: string, filter?: FilterOptions): string[];
|
|
54
45
|
/**
|
|
55
46
|
* Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
|
|
56
47
|
*
|
|
57
|
-
*
|
|
58
|
-
*
|
|
59
|
-
*
|
|
60
|
-
*
|
|
61
|
-
* and adds non-file context (mode, grader model, filter flags) as
|
|
62
|
-
* additional context strings.
|
|
48
|
+
* Identical inputs always produce the same fingerprint, regardless of the
|
|
49
|
+
* environment (local, CI, etc.). Cross-environment portability relies on
|
|
50
|
+
* (a) tasks coming from the same Content Lake source and (b) file paths
|
|
51
|
+
* being hashed as rootDir-relative.
|
|
63
52
|
*
|
|
64
53
|
* @returns SHA-256 hex string (64 characters)
|
|
65
54
|
*/
|
|
66
55
|
export declare function computeEvalFingerprint(input: FingerprintInput): string;
|
|
56
|
+
/**
|
|
57
|
+
* Collect repo-tracked + fetched file paths that contribute to the
|
|
58
|
+
* fingerprint. Tasks are NOT collected here — they come from
|
|
59
|
+
* `ctx.taskSource.loadTasks()` and flow into the hash via the `tasks`
|
|
60
|
+
* input on `computeEvalFingerprint`.
|
|
61
|
+
*
|
|
62
|
+
* Exported for the debug-fingerprint diagnostic script.
|
|
63
|
+
*/
|
|
64
|
+
export declare function collectFingerprintFilePaths(rootDir: string): string[];
|