@sanity/ailf 0.1.19 → 0.1.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -188,7 +188,8 @@ export default class AgenticProvider {
|
|
|
188
188
|
return this.recorder;
|
|
189
189
|
}
|
|
190
190
|
id() {
|
|
191
|
-
|
|
191
|
+
const model = this.config.model || this.providerId;
|
|
192
|
+
return `agentic:${this.agentMode}:${model}`;
|
|
192
193
|
}
|
|
193
194
|
// -------------------------------------------------------------------------
|
|
194
195
|
// Tool execution
|
|
@@ -5,6 +5,13 @@
|
|
|
5
5
|
* the PipelineStep interface. This includes document manifest enrichment
|
|
6
6
|
* and low-scoring judgment extraction.
|
|
7
7
|
*
|
|
8
|
+
* Document resolution uses two sources (layered):
|
|
9
|
+
* 1. TaskSource (from AppContext) — the canonical source of task definitions
|
|
10
|
+
* including Content Lake, repo-based, and YAML tasks. This is the primary
|
|
11
|
+
* source for mapping task descriptions to canonical doc slugs.
|
|
12
|
+
* 2. Local task YAML (via resolveMappings) — legacy fallback for tasks not
|
|
13
|
+
* found via the TaskSource adapter.
|
|
14
|
+
*
|
|
8
15
|
* This is an optional step — failure doesn't stop the pipeline.
|
|
9
16
|
*/
|
|
10
17
|
import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
@@ -5,10 +5,18 @@
|
|
|
5
5
|
* the PipelineStep interface. This includes document manifest enrichment
|
|
6
6
|
* and low-scoring judgment extraction.
|
|
7
7
|
*
|
|
8
|
+
* Document resolution uses two sources (layered):
|
|
9
|
+
* 1. TaskSource (from AppContext) — the canonical source of task definitions
|
|
10
|
+
* including Content Lake, repo-based, and YAML tasks. This is the primary
|
|
11
|
+
* source for mapping task descriptions to canonical doc slugs.
|
|
12
|
+
* 2. Local task YAML (via resolveMappings) — legacy fallback for tasks not
|
|
13
|
+
* found via the TaskSource adapter.
|
|
14
|
+
*
|
|
8
15
|
* This is an optional step — failure doesn't stop the pipeline.
|
|
9
16
|
*/
|
|
10
17
|
import { existsSync, readFileSync, writeFileSync } from "fs";
|
|
11
18
|
import { join, resolve } from "path";
|
|
19
|
+
import { isSlugRef } from "../../_vendor/ailf-core/index.js";
|
|
12
20
|
export class GapAnalysisStep {
|
|
13
21
|
name = "gap-analysis";
|
|
14
22
|
optional = true;
|
|
@@ -51,9 +59,6 @@ export class GapAnalysisStep {
|
|
|
51
59
|
const outDir = resolve(root, "results", "latest");
|
|
52
60
|
writeFileSync(join(outDir, "failure-modes.json"), JSON.stringify(failureModeReport, null, 2));
|
|
53
61
|
writeFileSync(join(outDir, "gap-analysis.json"), JSON.stringify(gapReport, null, 2));
|
|
54
|
-
// ── Document manifest + enrichment ─────────────────────────
|
|
55
|
-
const { resolveMappings } = await import("../../pipeline/resolve-mappings.js");
|
|
56
|
-
const mappings = resolveMappings(root);
|
|
57
62
|
const manifestPath = resolve(root, "contexts", "document-manifest.json");
|
|
58
63
|
const manifestEntries = existsSync(manifestPath)
|
|
59
64
|
? JSON.parse(readFileSync(manifestPath, "utf-8"))
|
|
@@ -75,17 +80,59 @@ export class GapAnalysisStep {
|
|
|
75
80
|
: { documentId: "", slug, title: slug };
|
|
76
81
|
})
|
|
77
82
|
.filter((r) => r.documentId !== "");
|
|
83
|
+
// ── Build description→docs mapping from TaskSource ─────────
|
|
84
|
+
// Primary source: use the TaskSource adapter from AppContext.
|
|
85
|
+
// This works with Content Lake, repo-based, and YAML tasks.
|
|
86
|
+
// Judgments use task description as their taskId, so we build
|
|
87
|
+
// maps keyed by both description and task ID for robust matching.
|
|
78
88
|
const descToDocRefs = new Map();
|
|
79
89
|
const areaToDocRefs = new Map();
|
|
90
|
+
let tasks = [];
|
|
91
|
+
try {
|
|
92
|
+
tasks = await ctx.taskSource.loadTasks();
|
|
93
|
+
}
|
|
94
|
+
catch {
|
|
95
|
+
// TaskSource may not be available in all contexts (e.g., standalone
|
|
96
|
+
// gap analysis on cached results). Fall through to legacy fallback.
|
|
97
|
+
}
|
|
98
|
+
if (tasks.length > 0) {
|
|
99
|
+
// Group tasks by feature area and build slug maps
|
|
100
|
+
const byArea = new Map();
|
|
101
|
+
for (const task of tasks) {
|
|
102
|
+
const slugs = extractSlugsFromRefs(task.canonicalDocs);
|
|
103
|
+
const refs = resolveRefs(slugs);
|
|
104
|
+
// Map by description (what judgments use as taskId)
|
|
105
|
+
descToDocRefs.set(task.description, refs);
|
|
106
|
+
// Also map by task ID for prefix-based matching
|
|
107
|
+
descToDocRefs.set(task.id, refs);
|
|
108
|
+
// Group slugs by feature area
|
|
109
|
+
if (!byArea.has(task.featureArea))
|
|
110
|
+
byArea.set(task.featureArea, new Set());
|
|
111
|
+
for (const s of slugs)
|
|
112
|
+
byArea.get(task.featureArea).add(s);
|
|
113
|
+
}
|
|
114
|
+
for (const [area, slugs] of byArea) {
|
|
115
|
+
areaToDocRefs.set(area, resolveRefs([...slugs]));
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
// Legacy fallback: merge in any tasks from local YAML that weren't
|
|
119
|
+
// already covered by the TaskSource adapter.
|
|
120
|
+
const { resolveMappings } = await import("../../pipeline/resolve-mappings.js");
|
|
121
|
+
const mappings = resolveMappings(root);
|
|
80
122
|
for (const [area, areaData] of Object.entries(mappings.feature_areas)) {
|
|
81
123
|
const areaSlugs = new Set();
|
|
82
124
|
for (const task of areaData.tasks) {
|
|
83
125
|
const taskSlugs = task.canonical_docs.map((d) => d.slug);
|
|
84
|
-
|
|
126
|
+
// Only add if not already mapped by the primary source
|
|
127
|
+
if (!descToDocRefs.has(task.description)) {
|
|
128
|
+
descToDocRefs.set(task.description, resolveRefs(taskSlugs));
|
|
129
|
+
}
|
|
85
130
|
for (const s of taskSlugs)
|
|
86
131
|
areaSlugs.add(s);
|
|
87
132
|
}
|
|
88
|
-
areaToDocRefs.
|
|
133
|
+
if (!areaToDocRefs.has(area)) {
|
|
134
|
+
areaToDocRefs.set(area, resolveRefs([...areaSlugs]));
|
|
135
|
+
}
|
|
89
136
|
}
|
|
90
137
|
const documentManifest = resolveRefs([...refBySlug.keys()]);
|
|
91
138
|
const enrichedScores = scoreSummary.scores.map((s) => ({
|
|
@@ -104,6 +151,7 @@ export class GapAnalysisStep {
|
|
|
104
151
|
.sort((a, b) => a.score - b.score)
|
|
105
152
|
.slice(0, MAX_STORED_JUDGMENTS)
|
|
106
153
|
.map((j) => {
|
|
154
|
+
// Judgment taskId is the description with "(gold)" or "(baseline)" suffix
|
|
107
155
|
const baseDesc = j.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
|
|
108
156
|
const canonicalDocs = descToDocRefs.get(baseDesc);
|
|
109
157
|
return canonicalDocs ? { ...j, canonicalDocs } : j;
|
|
@@ -134,3 +182,32 @@ export class GapAnalysisStep {
|
|
|
134
182
|
}
|
|
135
183
|
}
|
|
136
184
|
}
|
|
185
|
+
// ---------------------------------------------------------------------------
|
|
186
|
+
// Helpers
|
|
187
|
+
// ---------------------------------------------------------------------------
|
|
188
|
+
/**
 * Extract slug strings from polymorphic canonical doc refs.
 *
 * - Refs accepted by `isSlugRef` contribute their `slug` directly.
 * - Refs carrying a string `path` contribute the final non-empty path
 *   segment as a slug approximation.
 * - Any other ref contributes nothing here. Per the original notes, id and
 *   perspective refs are resolved at fetch time and their slugs appear in
 *   the document manifest instead.
 *
 * A missing or non-iterable `refs` value yields an empty list, and
 * null/undefined entries are skipped, so a task without canonical docs
 * cannot throw out of this helper.
 *
 * @param {Iterable<unknown> | null | undefined} refs - Canonical doc refs of a task.
 * @returns {string[]} Slugs in the order their refs were encountered.
 */
function extractSlugsFromRefs(refs) {
    const slugs = [];
    // Tasks may carry no canonical docs at all; treat that as "no slugs".
    if (refs == null || typeof refs[Symbol.iterator] !== "function") {
        return slugs;
    }
    for (const ref of refs) {
        if (ref == null) {
            continue;
        }
        if (isSlugRef(ref)) {
            slugs.push(ref.slug);
        }
        else if (typeof ref.path === "string") {
            // Path refs: use the final non-empty segment as the slug.
            // Plain property access (rather than `"path" in ref`) keeps
            // primitive entries from raising a TypeError.
            const segments = ref.path.split("/").filter(Boolean);
            if (segments.length > 0) {
                slugs.push(segments[segments.length - 1]);
            }
        }
        // id and perspective refs: slugs are resolved at fetch time
        // and appear in the document manifest.
    }
    return slugs;
}
|
|
@@ -324,13 +324,25 @@ function calculateScores(resultsPath, weights) {
|
|
|
324
324
|
/**
 * Extracts agent behavior summary from a test result's metadata.
 * Returns null if the test was not run with the instrumented provider
 * (including when the test result itself is null/undefined).
 *
 * Checks two locations because Promptfoo may flatten/merge the metadata
 * object differently than how the provider originally set it:
 *   1. metadata.agentBehaviorSummary (set directly by AgenticProvider)
 *   2. metadata.agentBehavior.summary (nested in the full behavior log)
 *
 * @param {{ metadata?: object } | null | undefined} test - A single test result.
 * @returns {object | null} The behavior summary, or null when none is present.
 */
function extractAgentBehavior(test) {
    // Optional chaining instead of destructuring so a missing test result
    // yields null rather than a TypeError.
    const metadata = test?.metadata;
    if (metadata?.agentBehaviorSummary) {
        return metadata.agentBehaviorSummary;
    }
    // Fallback: Promptfoo may drop the top-level agentBehaviorSummary
    // field during serialization, but the data is nested inside the
    // full agentBehavior log.
    return metadata?.agentBehavior?.summary || null;
}
|
|
335
347
|
/**
|
|
336
348
|
* Extracts grader (assertion) cost from the raw Promptfoo results file.
|