@sanity/ailf 2.2.0 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/rubrics.ts +3 -3
- package/dist/_vendor/ailf-core/types/index.d.ts +25 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +15 -7
- package/dist/commands/calculate-scores.js +7 -2
- package/dist/commands/capture-list.d.ts +1 -1
- package/dist/commands/capture-list.js +6 -3
- package/dist/commands/compare.js +11 -7
- package/dist/commands/explain-handler.js +22 -24
- package/dist/commands/fetch-docs.js +4 -2
- package/dist/commands/generate-configs.js +6 -2
- package/dist/commands/pipeline-action.js +8 -24
- package/dist/commands/pipeline.js +1 -1
- package/dist/commands/pr-comment.js +6 -2
- package/dist/commands/publish.d.ts +1 -0
- package/dist/commands/publish.js +12 -8
- package/dist/commands/remote-pipeline.js +1 -1
- package/dist/commands/remote-results.d.ts +8 -8
- package/dist/commands/remote-results.js +7 -7
- package/dist/commands/shared/options.d.ts +8 -0
- package/dist/commands/shared/options.js +10 -0
- package/dist/commands/shared/resolve-output-dir.d.ts +27 -0
- package/dist/commands/shared/resolve-output-dir.js +36 -0
- package/dist/composition-root.js +1 -1
- package/dist/config/rubrics.ts +3 -3
- package/dist/orchestration/build-app-context.js +1 -1
- package/dist/orchestration/steps/fetch-docs-step.js +23 -9
- package/dist/orchestration/steps/gap-analysis-step.js +86 -75
- package/dist/orchestration/steps/generate-configs-step.d.ts +15 -0
- package/dist/orchestration/steps/generate-configs-step.js +56 -0
- package/dist/orchestration/steps/run-eval-step.js +14 -0
- package/dist/pipeline/calculate-scores.js +113 -2
- package/dist/pipeline/compare.js +50 -19
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +64 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +6 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +14 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +3 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +1 -27
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +2 -9
- package/dist/pipeline/compiler/rubric-resolution.d.ts +40 -0
- package/dist/pipeline/compiler/rubric-resolution.js +52 -0
- package/dist/pipeline/compiler/scoring-bridge.js +59 -7
- package/dist/pipeline/provenance.js +7 -1
- package/dist/pipeline/validate.d.ts +5 -4
- package/dist/pipeline/validate.js +34 -113
- package/dist/webhook/eval-request-handler.js +4 -0
- package/package.json +1 -1
|
@@ -546,11 +546,13 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
|
|
|
546
546
|
const featureScore = {
|
|
547
547
|
ceilingScore,
|
|
548
548
|
codeCorrectness: gold.dimensions.codeCorrectness ?? 0,
|
|
549
|
+
dimensions: gold.dimensions,
|
|
549
550
|
docCoverage: gold.dimensions.docCoverage ?? 0,
|
|
550
551
|
docLift,
|
|
551
552
|
docQualityGap: 100 - ceilingScore,
|
|
552
553
|
feature,
|
|
553
554
|
floorScore,
|
|
555
|
+
groupType: "feature",
|
|
554
556
|
...(modelId && { modelId }),
|
|
555
557
|
negativeDocLift: docLift < 0,
|
|
556
558
|
taskCompletion: gold.dimensions.taskCompletion ?? 0,
|
|
@@ -563,6 +565,69 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
|
|
|
563
565
|
return scores.sort((a, b) => a.feature.localeCompare(b.feature));
|
|
564
566
|
}
|
|
565
567
|
// ---------------------------------------------------------------------------
|
|
568
|
+
// Agent-harness scoring — groups by task ID, single variant
|
|
569
|
+
// ---------------------------------------------------------------------------
|
|
570
|
+
/**
|
|
571
|
+
* Score agent-harness evaluation results. Unlike literacy mode, agent-harness
|
|
572
|
+
* tasks don't have a with-docs/without-docs split. All results are scored
|
|
573
|
+
* as a single "actual" variant using the agent-harness profile.
|
|
574
|
+
*
|
|
575
|
+
* Groups results by task ID (extracted from the test description prefix)
|
|
576
|
+
* rather than by feature area. Each group produces a FeatureScore with
|
|
577
|
+
* groupType: "task".
|
|
578
|
+
*
|
|
579
|
+
* Literacy-specific fields (ceilingScore, floorScore, docLift, docQualityGap)
|
|
580
|
+
* are set to 0 for backward compatibility with downstream consumers.
|
|
581
|
+
*/
|
|
582
|
+
function scoreAgentHarnessResults(results, profile) {
|
|
583
|
+
// Group by task ID (extracted from description: "task-id — Title")
|
|
584
|
+
const byTask = {};
|
|
585
|
+
for (const result of results) {
|
|
586
|
+
const taskId = extractTaskId(result.description);
|
|
587
|
+
if (!byTask[taskId]) {
|
|
588
|
+
byTask[taskId] = [];
|
|
589
|
+
}
|
|
590
|
+
byTask[taskId].push(result);
|
|
591
|
+
}
|
|
592
|
+
const scores = [];
|
|
593
|
+
for (const [taskId, taskResults] of Object.entries(byTask)) {
|
|
594
|
+
const scored = scoreTestGroup(taskResults, profile, taskId);
|
|
595
|
+
const totalCost = scored.totalCost;
|
|
596
|
+
// Detect feature area for backward compat (used by report grouping)
|
|
597
|
+
const feature = taskResults[0]?.vars.__featureArea ??
|
|
598
|
+
detectFeatureArea(taskResults[0]?.description ?? taskId);
|
|
599
|
+
scores.push({
|
|
600
|
+
assertionPassRate: scored.dimensions.assertionPassRate,
|
|
601
|
+
ceilingScore: 0,
|
|
602
|
+
codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
|
|
603
|
+
dimensions: scored.dimensions,
|
|
604
|
+
docCoverage: scored.dimensions.docCoverage ?? 0,
|
|
605
|
+
docLift: 0,
|
|
606
|
+
docQualityGap: 0,
|
|
607
|
+
feature,
|
|
608
|
+
floorScore: 0,
|
|
609
|
+
groupType: "task",
|
|
610
|
+
negativeDocLift: false,
|
|
611
|
+
taskCompletion: scored.dimensions.taskCompletion ?? 0,
|
|
612
|
+
testCount: taskResults.length,
|
|
613
|
+
totalCost,
|
|
614
|
+
totalScore: scored.composite,
|
|
615
|
+
});
|
|
616
|
+
}
|
|
617
|
+
return scores.sort((a, b) => a.feature.localeCompare(b.feature));
|
|
618
|
+
}
|
|
619
|
+
/**
|
|
620
|
+
* Extract task ID from a test description string.
|
|
621
|
+
* Descriptions follow the pattern: "task-id — Title"
|
|
622
|
+
*/
|
|
623
|
+
function extractTaskId(description) {
|
|
624
|
+
const dashIndex = description.indexOf(" — ");
|
|
625
|
+
if (dashIndex > 0) {
|
|
626
|
+
return description.slice(0, dashIndex).trim();
|
|
627
|
+
}
|
|
628
|
+
return description.trim() || "unknown";
|
|
629
|
+
}
|
|
630
|
+
// ---------------------------------------------------------------------------
|
|
566
631
|
// Agentic scoring — all results are "actual" (agent retrieves docs via tools)
|
|
567
632
|
// ---------------------------------------------------------------------------
|
|
568
633
|
/**
|
|
@@ -684,11 +749,57 @@ export function calculateAndWriteScores(options) {
|
|
|
684
749
|
if (source) {
|
|
685
750
|
log.info(`Source: ${sourceName} (${source.baseUrl})`);
|
|
686
751
|
}
|
|
687
|
-
// Load rubric config
|
|
752
|
+
// Load rubric config — shared across all modes
|
|
753
|
+
const rubricConfig = loadRubricTemplates(ROOT);
|
|
754
|
+
// ── Agent-harness scoring path ──────────────────────────────
|
|
755
|
+
// Agent-harness mode uses its own scoring path because:
|
|
756
|
+
// 1. No with-docs/without-docs split — all results are "actual"
|
|
757
|
+
// 2. Groups by task ID, not feature area
|
|
758
|
+
// 3. Uses the agent-harness profile (assertion-pass-rate, agent-output, tool-usage)
|
|
759
|
+
// See docs/design-docs/mode-agnostic-scoring.md
|
|
760
|
+
if (mode === "agent-harness") {
|
|
761
|
+
const agentProfile = resolveProfile("agent-harness", "gold", rubricConfig);
|
|
762
|
+
log.debug("Agent-harness scoring profile", agentProfile);
|
|
763
|
+
const results = readAndNormalizeResults(baselineResultsPath);
|
|
764
|
+
const scores = scoreAgentHarnessResults(results, agentProfile);
|
|
765
|
+
log.debug("Agent-harness scores calculated", {
|
|
766
|
+
taskCount: scores.length,
|
|
767
|
+
tasks: scores.map((s) => ({
|
|
768
|
+
feature: s.feature,
|
|
769
|
+
totalScore: s.totalScore,
|
|
770
|
+
testCount: s.testCount,
|
|
771
|
+
dimensions: s.dimensions,
|
|
772
|
+
})),
|
|
773
|
+
});
|
|
774
|
+
const urlRefs = aggregateUrlReferences(baselineResultsPath);
|
|
775
|
+
const sourceVerification = buildSourceVerification(ROOT, source, {
|
|
776
|
+
allowedOrigins: options.allowedOrigins,
|
|
777
|
+
mode,
|
|
778
|
+
searchMode: options.searchMode,
|
|
779
|
+
});
|
|
780
|
+
const graderCost = extractGraderCost(baselineResultsPath);
|
|
781
|
+
const summary = printReport(scores, urlRefs, source, null, // no agent behavior (that's for literacy agentic mode)
|
|
782
|
+
graderCost, null, // no per-model breakdown
|
|
783
|
+
null, // no source isolation
|
|
784
|
+
sourceVerification, "agent-harness", log);
|
|
785
|
+
// Persist
|
|
786
|
+
const outDir = join(ROOT, "results", "latest");
|
|
787
|
+
mkdirSync(outDir, { recursive: true });
|
|
788
|
+
writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
|
|
789
|
+
log.info("Score summary written to results/latest/score-summary.json");
|
|
790
|
+
// Extract and persist grader judgments
|
|
791
|
+
const judgments = extractGraderJudgments(baselineResultsPath);
|
|
792
|
+
if (judgments.length > 0) {
|
|
793
|
+
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
794
|
+
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
795
|
+
}
|
|
796
|
+
const testSummary = computeTestSummary(baselineResultsPath);
|
|
797
|
+
return { belowCritical: summary.belowCritical, testSummary };
|
|
798
|
+
}
|
|
799
|
+
// ── Literacy scoring path ───────────────────────────────────
|
|
688
800
|
// Gold (with-docs) entries use the "default" profile (3 dimensions).
|
|
689
801
|
// Baseline (without-docs) entries use "output-only" (2 dimensions,
|
|
690
802
|
// doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
|
|
691
|
-
const rubricConfig = loadRubricTemplates(ROOT);
|
|
692
803
|
const goldProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.STANDARD);
|
|
693
804
|
const baselineProfileWeights = resolveProfile("literacy", LiteracyVariant.STANDARD, rubricConfig, LiteracyVariant.STANDARD);
|
|
694
805
|
log.debug("Loaded scoring profiles", {
|
package/dist/pipeline/compare.js
CHANGED
|
@@ -146,12 +146,6 @@ function buildAreaDelta(area, baselineScore, experimentScore, threshold, isMisma
|
|
|
146
146
|
const bTotal = b?.totalScore ?? 0;
|
|
147
147
|
const eTotal = e?.totalScore ?? 0;
|
|
148
148
|
const delta = eTotal - bTotal;
|
|
149
|
-
const bTask = b?.taskCompletion ?? 0;
|
|
150
|
-
const eTask = e?.taskCompletion ?? 0;
|
|
151
|
-
const bCode = b?.codeCorrectness ?? 0;
|
|
152
|
-
const eCode = e?.codeCorrectness ?? 0;
|
|
153
|
-
const bDoc = b?.docCoverage ?? 0;
|
|
154
|
-
const eDoc = e?.docCoverage ?? 0;
|
|
155
149
|
// Support both new field names and legacy data (old baselines/Sanity docs)
|
|
156
150
|
const bRaw = b;
|
|
157
151
|
const eRaw = e;
|
|
@@ -183,19 +177,7 @@ function buildAreaDelta(area, baselineScore, experimentScore, threshold, isMisma
|
|
|
183
177
|
ceilingDelta: eCeiling - bCeiling,
|
|
184
178
|
change: isMismatched ? "not-evaluated" : classifyChange(delta, threshold),
|
|
185
179
|
delta,
|
|
186
|
-
dimensions: {
|
|
187
|
-
codeCorrectness: {
|
|
188
|
-
baseline: bCode,
|
|
189
|
-
delta: eCode - bCode,
|
|
190
|
-
experiment: eCode,
|
|
191
|
-
},
|
|
192
|
-
docCoverage: { baseline: bDoc, delta: eDoc - bDoc, experiment: eDoc },
|
|
193
|
-
taskCompletion: {
|
|
194
|
-
baseline: bTask,
|
|
195
|
-
delta: eTask - bTask,
|
|
196
|
-
experiment: eTask,
|
|
197
|
-
},
|
|
198
|
-
},
|
|
180
|
+
dimensions: buildDimensionDeltas(b, e),
|
|
199
181
|
docLiftDelta: eLift - bLift,
|
|
200
182
|
experiment: eTotal,
|
|
201
183
|
floorDelta: eFloor - bFloor,
|
|
@@ -206,6 +188,55 @@ function buildAreaDelta(area, baselineScore, experimentScore, threshold, isMisma
|
|
|
206
188
|
...(hasCost && { costDelta: eCost - bCost }),
|
|
207
189
|
};
|
|
208
190
|
}
|
|
191
|
+
/**
|
|
192
|
+
* Build per-dimension deltas from the generic dimensions map when available,
|
|
193
|
+
* falling back to the three legacy literacy fields for backward compatibility.
|
|
194
|
+
*
|
|
195
|
+
* This ensures non-literacy modes (agent-harness, mcp-server, etc.) get their
|
|
196
|
+
* actual dimensions (e.g., agentOutput, toolUsage) in comparison reports
|
|
197
|
+
* instead of hardcoded zeros for codeCorrectness/docCoverage/taskCompletion.
|
|
198
|
+
*/
|
|
199
|
+
function buildDimensionDeltas(b, e) {
|
|
200
|
+
const bDims = b?.dimensions;
|
|
201
|
+
const eDims = e?.dimensions;
|
|
202
|
+
// When the generic dimensions map is populated, use it — this covers
|
|
203
|
+
// agent-harness (agentOutput, toolUsage), literacy (taskCompletion,
|
|
204
|
+
// codeCorrectness, docCoverage), and any future mode dimensions.
|
|
205
|
+
if (bDims || eDims) {
|
|
206
|
+
const allKeys = new Set([
|
|
207
|
+
...Object.keys(bDims ?? {}),
|
|
208
|
+
...Object.keys(eDims ?? {}),
|
|
209
|
+
]);
|
|
210
|
+
const result = {};
|
|
211
|
+
for (const key of allKeys) {
|
|
212
|
+
const bVal = bDims?.[key] ?? 0;
|
|
213
|
+
const eVal = eDims?.[key] ?? 0;
|
|
214
|
+
result[key] = { baseline: bVal, delta: eVal - bVal, experiment: eVal };
|
|
215
|
+
}
|
|
216
|
+
return result;
|
|
217
|
+
}
|
|
218
|
+
// Legacy fallback: older reports may lack the dimensions map entirely.
|
|
219
|
+
// Read from the three named FeatureScore fields instead.
|
|
220
|
+
const bTask = b?.taskCompletion ?? 0;
|
|
221
|
+
const eTask = e?.taskCompletion ?? 0;
|
|
222
|
+
const bCode = b?.codeCorrectness ?? 0;
|
|
223
|
+
const eCode = e?.codeCorrectness ?? 0;
|
|
224
|
+
const bDoc = b?.docCoverage ?? 0;
|
|
225
|
+
const eDoc = e?.docCoverage ?? 0;
|
|
226
|
+
return {
|
|
227
|
+
codeCorrectness: {
|
|
228
|
+
baseline: bCode,
|
|
229
|
+
delta: eCode - bCode,
|
|
230
|
+
experiment: eCode,
|
|
231
|
+
},
|
|
232
|
+
docCoverage: { baseline: bDoc, delta: eDoc - bDoc, experiment: eDoc },
|
|
233
|
+
taskCompletion: {
|
|
234
|
+
baseline: bTask,
|
|
235
|
+
delta: eTask - bTask,
|
|
236
|
+
experiment: eTask,
|
|
237
|
+
},
|
|
238
|
+
};
|
|
239
|
+
}
|
|
209
240
|
// ---------------------------------------------------------------------------
|
|
210
241
|
// Main compare function
|
|
211
242
|
// ---------------------------------------------------------------------------
|
|
@@ -232,6 +232,70 @@ describe("compileAgentHarnessTask — assertions", () => {
|
|
|
232
232
|
}), { graderProvider: "openai:chat:gpt-5" });
|
|
233
233
|
assert.equal(result.tests[0].assert?.[0]?.provider, "openai:chat:gpt-5");
|
|
234
234
|
});
|
|
235
|
+
it("resolves templated llm-rubric with rubric text and dimension metadata", () => {
|
|
236
|
+
const rubricConfig = {
|
|
237
|
+
templates: {
|
|
238
|
+
"agent-output": {
|
|
239
|
+
dimension: "agent-output",
|
|
240
|
+
header: "Score the agent's final output from 0 to 100:",
|
|
241
|
+
scale: ["0: Failed", "50: Partial", "100: Complete"],
|
|
242
|
+
criteria_label: "Check for:",
|
|
243
|
+
},
|
|
244
|
+
},
|
|
245
|
+
};
|
|
246
|
+
const result = compileAgentHarnessTask(makeTask({
|
|
247
|
+
assertions: [
|
|
248
|
+
{
|
|
249
|
+
type: "llm-rubric",
|
|
250
|
+
template: "agent-output",
|
|
251
|
+
criteria: ["File created", "Correct content"],
|
|
252
|
+
},
|
|
253
|
+
],
|
|
254
|
+
}), { rubricConfig, graderProvider: "anthropic:messages:claude-opus-4-5" });
|
|
255
|
+
const assertion = result.tests[0].assert?.[0];
|
|
256
|
+
assert.ok(assertion, "should produce an assertion");
|
|
257
|
+
assert.equal(assertion.type, "llm-rubric");
|
|
258
|
+
// Rubric text should be fully rendered (not empty)
|
|
259
|
+
assert.ok(assertion.value.includes("Score the agent"), "should contain rendered rubric header");
|
|
260
|
+
assert.ok(assertion.value.includes("File created"), "should contain task-specific criteria");
|
|
261
|
+
// Dimension metadata should be attached
|
|
262
|
+
const metadata = assertion.metadata;
|
|
263
|
+
assert.ok(metadata, "should have metadata");
|
|
264
|
+
assert.equal(metadata.dimension, "agent-output");
|
|
265
|
+
assert.equal(metadata.maxScore, 100);
|
|
266
|
+
// Grader provider should be set
|
|
267
|
+
assert.equal(assertion.provider, "anthropic:messages:claude-opus-4-5");
|
|
268
|
+
});
|
|
269
|
+
it("warns when rubric template is unknown", () => {
|
|
270
|
+
const rubricConfig = { templates: {} };
|
|
271
|
+
const result = compileAgentHarnessTask(makeTask({
|
|
272
|
+
assertions: [
|
|
273
|
+
{
|
|
274
|
+
type: "llm-rubric",
|
|
275
|
+
template: "nonexistent-template",
|
|
276
|
+
criteria: ["Something"],
|
|
277
|
+
},
|
|
278
|
+
],
|
|
279
|
+
}), { rubricConfig });
|
|
280
|
+
// Unknown template produces a warning and no assertion
|
|
281
|
+
assert.ok(result.warnings.some((w) => w.includes("nonexistent-template")), "should warn about unknown template");
|
|
282
|
+
// The assertion should be null (filtered out)
|
|
283
|
+
assert.equal(result.tests[0].assert?.length ?? 0, 0, "should not produce an assertion for unknown template");
|
|
284
|
+
});
|
|
285
|
+
it("warns when rubricConfig is not provided for templated assertion", () => {
|
|
286
|
+
const result = compileAgentHarnessTask(makeTask({
|
|
287
|
+
assertions: [
|
|
288
|
+
{
|
|
289
|
+
type: "llm-rubric",
|
|
290
|
+
template: "agent-output",
|
|
291
|
+
criteria: ["Something"],
|
|
292
|
+
},
|
|
293
|
+
],
|
|
294
|
+
})
|
|
295
|
+
// No rubricConfig in options
|
|
296
|
+
);
|
|
297
|
+
assert.ok(result.warnings.some((w) => w.includes("No rubric config")), "should warn about missing rubric config");
|
|
298
|
+
});
|
|
235
299
|
});
|
|
236
300
|
// ---------------------------------------------------------------------------
|
|
237
301
|
// compileAgentHarnessTask — lifecycle extensions
|
|
@@ -5,6 +5,12 @@
|
|
|
5
5
|
* command-succeeds, diff-matches) as well as standard pass-through
|
|
6
6
|
* assertion types.
|
|
7
7
|
*
|
|
8
|
+
* Templated LLM-rubric assertions (those with `template` + `criteria`)
|
|
9
|
+
* are resolved via the shared rubric-resolution module, producing fully
|
|
10
|
+
* assembled rubric text and dimension metadata. This is critical for
|
|
11
|
+
* scoring — without it, the grader receives empty rubrics and the
|
|
12
|
+
* scoring pipeline has no dimension data to work with (DOC-2029).
|
|
13
|
+
*
|
|
8
14
|
* Agent-specific assertions use file-based references to the assertions
|
|
9
15
|
* runtime module (dist/agent-harness/assertions-runtime.js) because
|
|
10
16
|
* promptfoo's inline `type: javascript` assertions run in a restricted
|
|
@@ -5,6 +5,12 @@
|
|
|
5
5
|
* command-succeeds, diff-matches) as well as standard pass-through
|
|
6
6
|
* assertion types.
|
|
7
7
|
*
|
|
8
|
+
* Templated LLM-rubric assertions (those with `template` + `criteria`)
|
|
9
|
+
* are resolved via the shared rubric-resolution module, producing fully
|
|
10
|
+
* assembled rubric text and dimension metadata. This is critical for
|
|
11
|
+
* scoring — without it, the grader receives empty rubrics and the
|
|
12
|
+
* scoring pipeline has no dimension data to work with (DOC-2029).
|
|
13
|
+
*
|
|
8
14
|
* Agent-specific assertions use file-based references to the assertions
|
|
9
15
|
* runtime module (dist/agent-harness/assertions-runtime.js) because
|
|
10
16
|
* promptfoo's inline `type: javascript` assertions run in a restricted
|
|
@@ -14,6 +20,7 @@
|
|
|
14
20
|
* @see https://www.promptfoo.dev/docs/configuration/expected-outputs/javascript/
|
|
15
21
|
* @see src/agent-harness/assertions-runtime.ts — runtime implementations
|
|
16
22
|
*/
|
|
23
|
+
import { resolveTemplatedAssertion } from "../../rubric-resolution.js";
|
|
17
24
|
/** Base path for the file-based assertion runtime module */
|
|
18
25
|
const RUNTIME = "file://dist/agent-harness/assertions-runtime.js";
|
|
19
26
|
// ---------------------------------------------------------------------------
|
|
@@ -44,6 +51,13 @@ export function mapAgentAssertion(assertion, options, warnings) {
|
|
|
44
51
|
: {}),
|
|
45
52
|
};
|
|
46
53
|
case "llm-rubric":
|
|
54
|
+
// Templated assertions (template + criteria) need full resolution
|
|
55
|
+
// to produce rubric text and dimension metadata for scoring.
|
|
56
|
+
if ("template" in assertion && "criteria" in assertion) {
|
|
57
|
+
const resolved = resolveTemplatedAssertion(assertion, options?.rubricConfig, options?.graderProvider, warnings);
|
|
58
|
+
return resolved;
|
|
59
|
+
}
|
|
60
|
+
// Non-templated llm-rubric (inline value) — pass through
|
|
47
61
|
return {
|
|
48
62
|
type: "llm-rubric",
|
|
49
63
|
...("value" in assertion ? { value: assertion.value } : {}),
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
* Shared types for the agent harness mode handler.
|
|
3
3
|
*/
|
|
4
4
|
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
|
|
5
|
+
import type { RubricConfig } from "../../rubric-resolution.js";
|
|
5
6
|
import type { SandboxType } from "../../sandbox/sandbox-strategy.js";
|
|
6
7
|
/** Options for compiling an agent harness task */
|
|
7
8
|
export interface AgentHarnessCompileOptions {
|
|
@@ -9,6 +10,8 @@ export interface AgentHarnessCompileOptions {
|
|
|
9
10
|
graderProvider?: string;
|
|
10
11
|
/** Root directory for fixture resolution */
|
|
11
12
|
rootDir?: string;
|
|
13
|
+
/** Rubric config (templates, weights, profiles) — loaded from rubrics config */
|
|
14
|
+
rubricConfig?: RubricConfig;
|
|
12
15
|
}
|
|
13
16
|
/** Result of compiling a single agent harness task */
|
|
14
17
|
export interface AgentHarnessCompileResult {
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
* Handles rubric template resolution, doc-coverage auto-generation,
|
|
5
5
|
* and baseline assertion filtering.
|
|
6
6
|
*/
|
|
7
|
+
import { resolveTemplatedAssertion } from "../../rubric-resolution.js";
|
|
7
8
|
// ---------------------------------------------------------------------------
|
|
8
9
|
// Assertion resolution
|
|
9
10
|
// ---------------------------------------------------------------------------
|
|
@@ -37,33 +38,6 @@ export function resolveAssertions(task, options, warnings) {
|
|
|
37
38
|
return assertions;
|
|
38
39
|
}
|
|
39
40
|
// ---------------------------------------------------------------------------
|
|
40
|
-
// Rubric template resolution
|
|
41
|
-
// ---------------------------------------------------------------------------
|
|
42
|
-
function resolveTemplatedAssertion(a, rubricConfig, graderProvider, warnings) {
|
|
43
|
-
if (!rubricConfig) {
|
|
44
|
-
warnings.push(`No rubric config — template "${a.template}" cannot be resolved`);
|
|
45
|
-
return null;
|
|
46
|
-
}
|
|
47
|
-
const template = rubricConfig.templates[a.template];
|
|
48
|
-
if (!template) {
|
|
49
|
-
warnings.push(`Unknown rubric template: "${a.template}"`);
|
|
50
|
-
return null;
|
|
51
|
-
}
|
|
52
|
-
const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
|
|
53
|
-
const criteriaText = a.criteria.map((c) => `- ${c}`).join("\n");
|
|
54
|
-
const rubricValue = `${template.header}\n${scaleText}\n\n` +
|
|
55
|
-
`${template.criteria_label ?? "Check for:"}\n${criteriaText}\n\n` +
|
|
56
|
-
`Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
|
|
57
|
-
return {
|
|
58
|
-
type: "llm-rubric",
|
|
59
|
-
value: rubricValue,
|
|
60
|
-
...(graderProvider ? { provider: graderProvider } : {}),
|
|
61
|
-
...(template.dimension
|
|
62
|
-
? { metadata: { dimension: template.dimension, maxScore: 100 } }
|
|
63
|
-
: {}),
|
|
64
|
-
};
|
|
65
|
-
}
|
|
66
|
-
// ---------------------------------------------------------------------------
|
|
67
41
|
// Doc-coverage assertion
|
|
68
42
|
// ---------------------------------------------------------------------------
|
|
69
43
|
function buildDocCoverageAssertion(rubricConfig, graderProvider) {
|
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
* Shared types for the literacy mode handler.
|
|
3
3
|
*/
|
|
4
4
|
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
|
|
5
|
+
export type { RubricConfig } from "../../rubric-resolution.js";
|
|
6
|
+
import type { RubricConfig } from "../../rubric-resolution.js";
|
|
5
7
|
/** Options for compiling a literacy task */
|
|
6
8
|
export interface LiteracyCompileOptions {
|
|
7
9
|
/** Grader provider for LLM-graded assertions */
|
|
@@ -19,15 +21,6 @@ export interface LiteracyCompileOptions {
|
|
|
19
21
|
/** Rubric config (templates, weights, profiles) — loaded from rubrics config */
|
|
20
22
|
rubricConfig?: RubricConfig;
|
|
21
23
|
}
|
|
22
|
-
/** Minimal rubric config needed by the handler */
|
|
23
|
-
export interface RubricConfig {
|
|
24
|
-
templates: Record<string, {
|
|
25
|
-
dimension?: string;
|
|
26
|
-
header: string;
|
|
27
|
-
scale: string[];
|
|
28
|
-
criteria_label?: string;
|
|
29
|
-
}>;
|
|
30
|
-
}
|
|
31
24
|
/** Result of compiling a single literacy task */
|
|
32
25
|
export interface LiteracyCompileResult {
|
|
33
26
|
/** Promptfoo provider configs */
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared rubric template resolution for all evaluation modes.
|
|
3
|
+
*
|
|
4
|
+
* Resolves templated LLM-rubric assertions (those with `template` + `criteria`
|
|
5
|
+
* fields) into fully assembled Promptfoo assertions with rubric text and
|
|
6
|
+
* dimension metadata.
|
|
7
|
+
*
|
|
8
|
+
* Used by both literacy and agent-harness compilers. Extracted from
|
|
9
|
+
* literacy/assertions.ts to fix the compilation bug where agent-harness
|
|
10
|
+
* tasks with templated rubrics produced empty rubric text (DOC-2029).
|
|
11
|
+
*
|
|
12
|
+
* @see docs/design-docs/mode-agnostic-scoring.md
|
|
13
|
+
* @see config/rubrics.ts — template definitions
|
|
14
|
+
*/
|
|
15
|
+
import type { PromptfooAssertion } from "./assertion-mapper.js";
|
|
16
|
+
/** Minimal rubric config needed for template resolution */
|
|
17
|
+
export interface RubricConfig {
|
|
18
|
+
templates: Record<string, {
|
|
19
|
+
criteria_label?: string;
|
|
20
|
+
dimension?: string;
|
|
21
|
+
header: string;
|
|
22
|
+
scale: string[];
|
|
23
|
+
}>;
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Resolve a templated LLM-rubric assertion into a fully assembled
|
|
27
|
+
* Promptfoo assertion with rubric text and dimension metadata.
|
|
28
|
+
*
|
|
29
|
+
* A "templated" assertion has `template` (referencing a key in rubrics.ts)
|
|
30
|
+
* and `criteria` (task-specific bullet points). The template provides the
|
|
31
|
+
* scoring header, scale, and dimension metadata. The criteria are appended
|
|
32
|
+
* to create the final rubric prompt.
|
|
33
|
+
*
|
|
34
|
+
* Returns null (with a warning) if the template can't be resolved.
|
|
35
|
+
*/
|
|
36
|
+
export declare function resolveTemplatedAssertion(assertion: {
|
|
37
|
+
criteria: string[];
|
|
38
|
+
template: string;
|
|
39
|
+
type: string;
|
|
40
|
+
}, rubricConfig: RubricConfig | undefined, graderProvider: string | undefined, warnings: string[]): PromptfooAssertion | null;
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared rubric template resolution for all evaluation modes.
|
|
3
|
+
*
|
|
4
|
+
* Resolves templated LLM-rubric assertions (those with `template` + `criteria`
|
|
5
|
+
* fields) into fully assembled Promptfoo assertions with rubric text and
|
|
6
|
+
* dimension metadata.
|
|
7
|
+
*
|
|
8
|
+
* Used by both literacy and agent-harness compilers. Extracted from
|
|
9
|
+
* literacy/assertions.ts to fix the compilation bug where agent-harness
|
|
10
|
+
* tasks with templated rubrics produced empty rubric text (DOC-2029).
|
|
11
|
+
*
|
|
12
|
+
* @see docs/design-docs/mode-agnostic-scoring.md
|
|
13
|
+
* @see config/rubrics.ts — template definitions
|
|
14
|
+
*/
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Template resolution
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
/**
|
|
19
|
+
* Resolve a templated LLM-rubric assertion into a fully assembled
|
|
20
|
+
* Promptfoo assertion with rubric text and dimension metadata.
|
|
21
|
+
*
|
|
22
|
+
* A "templated" assertion has `template` (referencing a key in rubrics.ts)
|
|
23
|
+
* and `criteria` (task-specific bullet points). The template provides the
|
|
24
|
+
* scoring header, scale, and dimension metadata. The criteria are appended
|
|
25
|
+
* to create the final rubric prompt.
|
|
26
|
+
*
|
|
27
|
+
* Returns null (with a warning) if the template can't be resolved.
|
|
28
|
+
*/
|
|
29
|
+
export function resolveTemplatedAssertion(assertion, rubricConfig, graderProvider, warnings) {
|
|
30
|
+
if (!rubricConfig) {
|
|
31
|
+
warnings.push(`No rubric config — template "${assertion.template}" cannot be resolved`);
|
|
32
|
+
return null;
|
|
33
|
+
}
|
|
34
|
+
const template = rubricConfig.templates[assertion.template];
|
|
35
|
+
if (!template) {
|
|
36
|
+
warnings.push(`Unknown rubric template: "${assertion.template}"`);
|
|
37
|
+
return null;
|
|
38
|
+
}
|
|
39
|
+
const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
|
|
40
|
+
const criteriaText = assertion.criteria.map((c) => `- ${c}`).join("\n");
|
|
41
|
+
const rubricValue = `${template.header}\n${scaleText}\n\n` +
|
|
42
|
+
`${template.criteria_label ?? "Check for:"}\n${criteriaText}\n\n` +
|
|
43
|
+
`Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
|
|
44
|
+
return {
|
|
45
|
+
type: "llm-rubric",
|
|
46
|
+
value: rubricValue,
|
|
47
|
+
...(graderProvider ? { provider: graderProvider } : {}),
|
|
48
|
+
...(template.dimension
|
|
49
|
+
? { metadata: { dimension: template.dimension, maxScore: 100 } }
|
|
50
|
+
: {}),
|
|
51
|
+
};
|
|
52
|
+
}
|
|
@@ -41,19 +41,25 @@ import { classifyRubric, parseRubricScore } from "../../_vendor/ailf-core/index.
|
|
|
41
41
|
export function scoreTestGroup(tests, profile, taskId) {
|
|
42
42
|
let totalCost = 0;
|
|
43
43
|
// Step 1: Convert all ComponentResults into AssertionScore[] (0–1 scale)
|
|
44
|
+
//
|
|
45
|
+
// Two assertion types contribute to scoring:
|
|
46
|
+
// - llm-rubric: dimension from metadata, score from grader (0–100 → [0,1])
|
|
47
|
+
// - javascript: mapped to "assertion-pass-rate" dimension (pass=1, fail=0)
|
|
48
|
+
//
|
|
49
|
+
// Other types (cost, trajectory, contains, etc.) are metadata or guards —
|
|
50
|
+
// they don't produce dimension scores.
|
|
44
51
|
const assertionScores = [];
|
|
45
52
|
for (const test of tests) {
|
|
46
53
|
totalCost += test.cost;
|
|
47
54
|
for (const comp of test.gradingResult.componentResults) {
|
|
48
|
-
if (comp.assertion?.type !== "llm-rubric")
|
|
49
|
-
continue;
|
|
50
|
-
const converted = componentToAssertionScore(comp);
|
|
55
|
+
const converted = componentToScore(comp);
|
|
51
56
|
if (converted)
|
|
52
57
|
assertionScores.push(converted);
|
|
53
58
|
}
|
|
54
59
|
}
|
|
55
60
|
// Step 2: Aggregate into DimensionScores (0–1 scale)
|
|
56
61
|
const dimensionLabels = {
|
|
62
|
+
"assertion-pass-rate": "Assertion Pass Rate",
|
|
57
63
|
"code-correctness": "Code Correctness",
|
|
58
64
|
"doc-coverage": "Doc Coverage",
|
|
59
65
|
"task-completion": "Task Completion",
|
|
@@ -86,12 +92,34 @@ export function scoreTestGroup(tests, profile, taskId) {
|
|
|
86
92
|
// Conversion helpers
|
|
87
93
|
// ---------------------------------------------------------------------------
|
|
88
94
|
/**
|
|
89
|
-
*
|
|
90
|
-
* AssertionScore format.
|
|
95
|
+
* Route a ComponentResult to the appropriate scoring conversion.
|
|
91
96
|
*
|
|
92
|
-
*
|
|
97
|
+
* Dispatches by assertion type:
|
|
98
|
+
* - llm-rubric → dimension from metadata, grader score (0–100 → [0,1])
|
|
99
|
+
* - javascript → "assertion-pass-rate" dimension, binary (pass=1, fail=0)
|
|
100
|
+
* - everything else → null (not a scoring-relevant assertion)
|
|
101
|
+
*
|
|
102
|
+
* This replaces the previous llm-rubric-only filter that caused agent-harness
|
|
103
|
+
* javascript assertions to be invisible to the scoring engine (DOC-2029).
|
|
104
|
+
*/
|
|
105
|
+
function componentToScore(comp) {
|
|
106
|
+
const type = comp.assertion?.type;
|
|
107
|
+
if (type === "llm-rubric") {
|
|
108
|
+
return llmRubricToScore(comp);
|
|
109
|
+
}
|
|
110
|
+
if (type === "javascript") {
|
|
111
|
+
return javascriptAssertionToScore(comp);
|
|
112
|
+
}
|
|
113
|
+
// Other types (cost, trajectory, contains, etc.) don't produce scores
|
|
114
|
+
return null;
|
|
115
|
+
}
|
|
116
|
+
/**
|
|
117
|
+
* Convert an LLM-rubric ComponentResult into an AssertionScore.
|
|
118
|
+
*
|
|
119
|
+
* The dimension comes from metadata (set during rubric template resolution).
|
|
120
|
+
* Returns null if the component doesn't map to any dimension.
|
|
93
121
|
*/
|
|
94
|
-
function componentToAssertionScore(comp) {
|
|
122
|
+
function llmRubricToScore(comp) {
|
|
95
123
|
const dim = classifyRubric(comp);
|
|
96
124
|
if (!dim)
|
|
97
125
|
return null;
|
|
@@ -108,6 +136,30 @@ function componentToAssertionScore(comp) {
|
|
|
108
136
|
weight: 1.0,
|
|
109
137
|
};
|
|
110
138
|
}
|
|
139
|
+
/**
|
|
140
|
+
* Convert a javascript assertion ComponentResult into an AssertionScore.
|
|
141
|
+
*
|
|
142
|
+
* Javascript assertions (fileExists, fileContains, commandSucceeds, etc.)
|
|
143
|
+
* produce binary pass/fail results. They map to the "assertion-pass-rate"
|
|
144
|
+
* dimension — the fraction of structural assertions that passed.
|
|
145
|
+
*
|
|
146
|
+
* Zero-weight assertions (like URL extraction) are excluded from scoring.
|
|
147
|
+
*/
|
|
148
|
+
function javascriptAssertionToScore(comp) {
|
|
149
|
+
// Skip zero-weight assertions (diagnostic-only, e.g., URL extraction)
|
|
150
|
+
const weight = comp.assertion?.weight;
|
|
151
|
+
if (weight === 0)
|
|
152
|
+
return null;
|
|
153
|
+
return {
|
|
154
|
+
assertionType: "javascript",
|
|
155
|
+
dimension: "assertion-pass-rate",
|
|
156
|
+
latencyMs: 0,
|
|
157
|
+
pass: comp.pass,
|
|
158
|
+
reason: comp.reason ?? "",
|
|
159
|
+
score: comp.pass ? 1.0 : 0.0,
|
|
160
|
+
weight: 1.0,
|
|
161
|
+
};
|
|
162
|
+
}
|
|
111
163
|
/** Convert kebab-case dimension key to camelCase (e.g., "task-completion" → "taskCompletion") */
|
|
112
164
|
function kebabToCamel(kebab) {
|
|
113
165
|
return kebab.replace(/-([a-z])/g, (_, c) => c.toUpperCase());
|
|
@@ -59,6 +59,12 @@ export function buildProvenance(input) {
|
|
|
59
59
|
evalFingerprint: input.evalFingerprint,
|
|
60
60
|
hasLineage: Boolean(lineage),
|
|
61
61
|
});
|
|
62
|
+
// Non-literacy modes (agent-harness, mcp-server, etc.) don't use the
|
|
63
|
+
// config/models.ts model matrix — listing those models would be misleading.
|
|
64
|
+
// Only include them for literacy mode where they're the actual eval targets.
|
|
65
|
+
const evaluatedModels = input.mode === "literacy"
|
|
66
|
+
? models.models.map((m) => ({ id: m.id, label: m.label }))
|
|
67
|
+
: [];
|
|
62
68
|
return {
|
|
63
69
|
areas: input.areas,
|
|
64
70
|
autoScope: input.autoScope,
|
|
@@ -68,7 +74,7 @@ export function buildProvenance(input) {
|
|
|
68
74
|
graderModel: models.grader.id,
|
|
69
75
|
lineage,
|
|
70
76
|
mode: input.mode,
|
|
71
|
-
models: models.models.map((m) => ({ id: m.id, label: m.label })),
|
|
77
|
+
models: evaluatedModels,
|
|
72
78
|
promptfooUrl: input.promptfooUrl,
|
|
73
79
|
promptfooUrls: input.promptfooUrls,
|
|
74
80
|
source: {
|