agentv 2.2.0 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-5HTT24MQ.js → chunk-XREH4WAJ.js} +704 -140
- package/dist/chunk-XREH4WAJ.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.agentv/.env.example +23 -23
- package/dist/templates/.agentv/config.yaml +15 -15
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +42 -2
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +57 -3
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +59 -2
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +85 -18
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +78 -4
- package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +78 -77
- package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +4 -4
- package/dist/templates/.github/prompts/agentv-optimize.prompt.md +3 -3
- package/package.json +1 -1
- package/dist/chunk-5HTT24MQ.js.map +0 -1
|
@@ -375,7 +375,7 @@ var compareCommand = command({
|
|
|
375
375
|
import { readFileSync as readFileSync2, writeFileSync } from "node:fs";
|
|
376
376
|
import path16 from "node:path";
|
|
377
377
|
|
|
378
|
-
// ../../packages/core/dist/chunk-
|
|
378
|
+
// ../../packages/core/dist/chunk-RP3M7COZ.js
|
|
379
379
|
import { constants } from "node:fs";
|
|
380
380
|
import { access, readFile } from "node:fs/promises";
|
|
381
381
|
import path from "node:path";
|
|
@@ -4422,7 +4422,7 @@ var coerce = {
|
|
|
4422
4422
|
};
|
|
4423
4423
|
var NEVER = INVALID;
|
|
4424
4424
|
|
|
4425
|
-
// ../../packages/core/dist/chunk-
|
|
4425
|
+
// ../../packages/core/dist/chunk-RP3M7COZ.js
|
|
4426
4426
|
async function fileExists(filePath) {
|
|
4427
4427
|
try {
|
|
4428
4428
|
await access(filePath, constants.F_OK);
|
|
@@ -5420,6 +5420,7 @@ function isAgentProvider(provider) {
|
|
|
5420
5420
|
// ../../packages/core/dist/index.js
|
|
5421
5421
|
import { readFile as readFile6 } from "node:fs/promises";
|
|
5422
5422
|
import path72 from "node:path";
|
|
5423
|
+
import micromatch3 from "micromatch";
|
|
5423
5424
|
import { parse as parse22 } from "yaml";
|
|
5424
5425
|
import { readFile as readFile4 } from "node:fs/promises";
|
|
5425
5426
|
import path22 from "node:path";
|
|
@@ -5432,6 +5433,7 @@ import path32 from "node:path";
|
|
|
5432
5433
|
import { readFile as readFile22 } from "node:fs/promises";
|
|
5433
5434
|
import { readFile as readFile42 } from "node:fs/promises";
|
|
5434
5435
|
import path52 from "node:path";
|
|
5436
|
+
import micromatch2 from "micromatch";
|
|
5435
5437
|
import { parse as parseYaml } from "yaml";
|
|
5436
5438
|
import { readFile as readFile32 } from "node:fs/promises";
|
|
5437
5439
|
import path42 from "node:path";
|
|
@@ -35077,6 +35079,7 @@ import { randomBytes } from "node:crypto";
|
|
|
35077
35079
|
import { createServer } from "node:http";
|
|
35078
35080
|
import { createHash } from "node:crypto";
|
|
35079
35081
|
import path15 from "node:path";
|
|
35082
|
+
import micromatch4 from "micromatch";
|
|
35080
35083
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
35081
35084
|
var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
|
|
35082
35085
|
function isTestMessageRole(value) {
|
|
@@ -35452,11 +35455,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35452
35455
|
);
|
|
35453
35456
|
}
|
|
35454
35457
|
}
|
|
35455
|
-
const
|
|
35456
|
-
const
|
|
35458
|
+
const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
|
|
35459
|
+
const config22 = {};
|
|
35457
35460
|
for (const [key2, value] of Object.entries(rawEvaluator)) {
|
|
35458
|
-
if (!
|
|
35459
|
-
|
|
35461
|
+
if (!knownProps2.has(key2) && value !== void 0) {
|
|
35462
|
+
config22[key2] = value;
|
|
35460
35463
|
}
|
|
35461
35464
|
}
|
|
35462
35465
|
evaluators.push({
|
|
@@ -35466,7 +35469,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35466
35469
|
cwd,
|
|
35467
35470
|
resolvedCwd,
|
|
35468
35471
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35469
|
-
...Object.keys(
|
|
35472
|
+
...Object.keys(config22).length > 0 ? { config: config22 } : {},
|
|
35470
35473
|
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
35471
35474
|
});
|
|
35472
35475
|
continue;
|
|
@@ -35631,7 +35634,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35631
35634
|
continue;
|
|
35632
35635
|
}
|
|
35633
35636
|
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35634
|
-
const
|
|
35637
|
+
const config22 = {
|
|
35635
35638
|
name: name16,
|
|
35636
35639
|
type: "tool_trajectory",
|
|
35637
35640
|
mode,
|
|
@@ -35639,7 +35642,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35639
35642
|
...expected ? { expected } : {},
|
|
35640
35643
|
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
35641
35644
|
};
|
|
35642
|
-
evaluators.push(
|
|
35645
|
+
evaluators.push(config22);
|
|
35643
35646
|
continue;
|
|
35644
35647
|
}
|
|
35645
35648
|
if (typeValue === "field_accuracy") {
|
|
@@ -35776,9 +35779,33 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35776
35779
|
});
|
|
35777
35780
|
continue;
|
|
35778
35781
|
}
|
|
35779
|
-
const
|
|
35782
|
+
const rawPrompt = rawEvaluator.prompt;
|
|
35783
|
+
let prompt;
|
|
35780
35784
|
let promptPath;
|
|
35781
|
-
|
|
35785
|
+
let resolvedPromptScript;
|
|
35786
|
+
let promptScriptConfig;
|
|
35787
|
+
if (isJsonObject2(rawPrompt)) {
|
|
35788
|
+
const scriptArray = asStringArray(
|
|
35789
|
+
rawPrompt.script,
|
|
35790
|
+
`prompt.script for evaluator '${name16}' in '${evalId}'`
|
|
35791
|
+
);
|
|
35792
|
+
if (!scriptArray) {
|
|
35793
|
+
throw new Error(`Evaluator '${name16}' in '${evalId}': prompt object requires script array`);
|
|
35794
|
+
}
|
|
35795
|
+
const scriptPath = scriptArray[scriptArray.length - 1];
|
|
35796
|
+
const resolved = await resolveFileReference2(scriptPath, searchRoots);
|
|
35797
|
+
if (resolved.resolvedPath) {
|
|
35798
|
+
resolvedPromptScript = [...scriptArray.slice(0, -1), path32.resolve(resolved.resolvedPath)];
|
|
35799
|
+
} else {
|
|
35800
|
+
throw new Error(
|
|
35801
|
+
`Evaluator '${name16}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`
|
|
35802
|
+
);
|
|
35803
|
+
}
|
|
35804
|
+
if (isJsonObject2(rawPrompt.config)) {
|
|
35805
|
+
promptScriptConfig = rawPrompt.config;
|
|
35806
|
+
}
|
|
35807
|
+
} else if (typeof rawPrompt === "string") {
|
|
35808
|
+
prompt = rawPrompt;
|
|
35782
35809
|
const resolved = await resolveFileReference2(prompt, searchRoots);
|
|
35783
35810
|
if (resolved.resolvedPath) {
|
|
35784
35811
|
promptPath = path32.resolve(resolved.resolvedPath);
|
|
@@ -35797,12 +35824,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35797
35824
|
}
|
|
35798
35825
|
const _model = asString(rawEvaluator.model);
|
|
35799
35826
|
const rawRubrics = rawEvaluator.rubrics;
|
|
35800
|
-
const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics
|
|
35801
|
-
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
35802
|
-
description: asString(rubric.description) ?? "",
|
|
35803
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
35804
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
35805
|
-
})).filter((r) => r.description.length > 0) : void 0;
|
|
35827
|
+
const parsedRubrics = Array.isArray(rawRubrics) ? parseRubricItems(rawRubrics, name16, evalId) : void 0;
|
|
35806
35828
|
if (typeValue === "rubric") {
|
|
35807
35829
|
if (!parsedRubrics) {
|
|
35808
35830
|
logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': missing rubrics array`);
|
|
@@ -35822,13 +35844,26 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35822
35844
|
continue;
|
|
35823
35845
|
}
|
|
35824
35846
|
const weight = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35847
|
+
const knownProps = /* @__PURE__ */ new Set(["name", "type", "prompt", "model", "rubrics", "weight", "config"]);
|
|
35848
|
+
const config2 = {};
|
|
35849
|
+
for (const [key2, value] of Object.entries(rawEvaluator)) {
|
|
35850
|
+
if (!knownProps.has(key2) && value !== void 0) {
|
|
35851
|
+
config2[key2] = value;
|
|
35852
|
+
}
|
|
35853
|
+
}
|
|
35854
|
+
const topLevelConfig = isJsonObject2(rawEvaluator.config) ? rawEvaluator.config : {};
|
|
35855
|
+
const mergedConfig = { ...config2, ...topLevelConfig };
|
|
35856
|
+
const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
|
|
35825
35857
|
evaluators.push({
|
|
35826
35858
|
name: name16,
|
|
35827
35859
|
type: "llm_judge",
|
|
35828
35860
|
prompt,
|
|
35829
35861
|
promptPath,
|
|
35862
|
+
...promptPath ? { resolvedPromptPath: promptPath } : {},
|
|
35863
|
+
...resolvedPromptScript ? { resolvedPromptScript } : {},
|
|
35830
35864
|
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
|
|
35831
|
-
...weight !== void 0 ? { weight } : {}
|
|
35865
|
+
...weight !== void 0 ? { weight } : {},
|
|
35866
|
+
...finalConfig ? { config: finalConfig } : {}
|
|
35832
35867
|
});
|
|
35833
35868
|
}
|
|
35834
35869
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -35915,6 +35950,185 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
|
|
|
35915
35950
|
function isValidFieldAggregationType(value) {
|
|
35916
35951
|
return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
|
|
35917
35952
|
}
|
|
35953
|
+
/**
 * Normalize the `rubrics` array of an evaluator definition.
 *
 * Entries that are not objects are skipped with a warning. For each object:
 * - `id` defaults to `rubric-<1-based position>`;
 * - the outcome text is read from `expected_outcome`, falling back to `description`;
 * - `weight` defaults to 1 when it is not a number;
 * - `required_min_score`, when numeric, must be an integer in 0-10.
 *
 * When `score_ranges` is present it must be an array and is validated by
 * `parseScoreRanges`; such a rubric may omit the outcome text and gets no
 * default for `required`. Without `score_ranges`, a rubric lacking outcome
 * text is skipped with a warning and `required` defaults to `true`.
 *
 * @param {unknown[]} rawRubrics - Raw rubric entries.
 * @param {string} evaluatorName - Evaluator name, used in messages.
 * @param {string} evalId - Eval case id, used in messages.
 * @returns {object[] | undefined} Parsed rubric items, or `undefined` when none survive.
 * @throws {Error} On an out-of-range `required_min_score` or a non-array `score_ranges`.
 */
function parseRubricItems(rawRubrics, evaluatorName, evalId) {
  const parsed = [];
  for (let position = 0; position < rawRubrics.length; position += 1) {
    const entry = rawRubrics[position];
    if (!isJsonObject2(entry)) {
      logWarning2(
        `Skipping invalid rubric entry at index ${position} in evaluator '${evaluatorName}' (expected object)`
      );
      continue;
    }

    const id = asString(entry.id) ?? `rubric-${position + 1}`;
    const outcome = asString(entry.expected_outcome) ?? asString(entry.description) ?? "";
    const weight = typeof entry.weight === "number" ? entry.weight : 1;

    const minScore = entry.required_min_score;
    const hasMinScore = typeof minScore === "number";
    if (hasMinScore && !(Number.isInteger(minScore) && minScore >= 0 && minScore <= 10)) {
      throw new Error(
        `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
      );
    }
    const hasRequired = typeof entry.required === "boolean";

    const rangesInput = entry.score_ranges;
    if (rangesInput === void 0) {
      // Checklist-style rubric: the outcome text is mandatory.
      if (outcome.length === 0) {
        logWarning2(
          `Skipping rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': missing expected_outcome`
        );
        continue;
      }
      const checklistItem = {
        id,
        expected_outcome: outcome,
        weight,
        // Default to required: true if not specified (backward compatibility)
        required: hasRequired ? entry.required : true
      };
      if (hasMinScore) {
        checklistItem.required_min_score = minScore;
      }
      parsed.push(checklistItem);
      continue;
    }

    if (!Array.isArray(rangesInput)) {
      throw new Error(
        `Invalid score_ranges for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an array`
      );
    }
    const scoreRanges = parseScoreRanges(rangesInput, id, evaluatorName, evalId);

    // Score-range rubric: optional fields are only emitted when provided.
    const rangedItem = { id, weight };
    if (outcome.length > 0) {
      rangedItem.expected_outcome = outcome;
    }
    if (hasRequired) {
      rangedItem.required = entry.required;
    }
    if (hasMinScore) {
      rangedItem.required_min_score = minScore;
    }
    rangedItem.score_ranges = scoreRanges;
    parsed.push(rangedItem);
  }
  return parsed.length > 0 ? parsed : void 0;
}
|
|
36015
|
+
/**
 * Validate and normalize the `score_ranges` list of a single rubric.
 *
 * Each entry must be an object with a `score_range` of two integers
 * `[min, max]`, both within 0-10 and with `min <= max`, plus outcome text in
 * `expected_outcome` (falling back to `description`). Across all entries the
 * ranges must not overlap and must together cover every integer from 0 to 10.
 *
 * @param {unknown[]} rawRanges - Raw range entries.
 * @param {string} rubricId - Owning rubric id, used in error messages.
 * @param {string} evaluatorName - Evaluator name, used in error messages.
 * @param {string} evalId - Eval case id, used in error messages.
 * @returns {{score_range: [number, number], expected_outcome: string}[]}
 *   The validated ranges in their original order.
 * @throws {Error} On any malformed entry, overlap, or coverage gap.
 */
function parseScoreRanges(rawRanges, rubricId, evaluatorName, evalId) {
  const withinScale = (score) => score >= 0 && score <= 10;
  const ranges = [];

  for (let index = 0; index < rawRanges.length; index += 1) {
    const entry = rawRanges[index];
    if (!isJsonObject2(entry)) {
      throw new Error(
        `Invalid score_range entry at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': expected object`
      );
    }

    const pair = entry.score_range;
    const isNumberPair = Array.isArray(pair) && pair.length === 2 && typeof pair[0] === "number" && typeof pair[1] === "number";
    if (!isNumberPair) {
      throw new Error(
        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be [min, max] array of two numbers`
      );
    }

    const min = pair[0];
    const max = pair[1];
    if (!(Number.isInteger(min) && Number.isInteger(max))) {
      throw new Error(
        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be integers (got [${min}, ${max}])`
      );
    }
    if (!withinScale(min) || !withinScale(max)) {
      throw new Error(
        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be 0-10 (got [${min}, ${max}])`
      );
    }
    if (min > max) {
      throw new Error(
        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': min must be <= max (got [${min}, ${max}])`
      );
    }

    const outcome = asString(entry.expected_outcome) ?? asString(entry.description) ?? "";
    if (outcome.length === 0) {
      throw new Error(
        `Missing expected_outcome for score_range [${min}, ${max}] in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}'`
      );
    }
    ranges.push({ score_range: [min, max], expected_outcome: outcome });
  }

  // Overlap check: walk the ranges ordered by lower bound and compare neighbours.
  const ordered = ranges.slice().sort((left, right) => left.score_range[0] - right.score_range[0]);
  let previous = ordered[0];
  for (const current of ordered.slice(1)) {
    if (current.score_range[0] <= previous.score_range[1]) {
      throw new Error(
        `Overlapping score_ranges in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': [${previous.score_range[0]}, ${previous.score_range[1]}] overlaps with [${current.score_range[0]}, ${current.score_range[1]}]`
      );
    }
    previous = current;
  }

  // Coverage check: one flag per score 0..10 (bounds were validated above).
  const isCovered = new Array(11).fill(false);
  for (const range of ranges) {
    for (let score = range.score_range[0]; score <= range.score_range[1]; score += 1) {
      isCovered[score] = true;
    }
  }
  const missing = [];
  for (let score = 0; score <= 10; score += 1) {
    if (!isCovered[score]) {
      missing.push(score);
    }
  }
  if (missing.length > 0) {
    throw new Error(
      `Incomplete score_ranges coverage in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': missing coverage for scores: ${missing.join(", ")}. Ranges must cover all integers 0-10.`
    );
  }

  return ranges;
}
|
|
36085
|
+
/**
 * Build an implicit `llm_judge` evaluator from rubrics declared inline on an
 * eval case.
 *
 * Entries may be plain strings (used as the expected outcome, weight 1,
 * required) or objects. Anything else is ignored. Default ids are
 * `rubric-<n>`, where `n` counts only the string/object entries. Unlike
 * `parseRubricItems`, this path is lenient: score ranges are not
 * bounds-checked, and entries without usable content are silently dropped.
 *
 * @param {unknown[]} rawRubrics - Raw inline rubric entries.
 * @returns {{name: string, type: string, rubrics: object[]} | undefined}
 *   The evaluator config, or `undefined` when no rubric is usable.
 */
function parseInlineRubrics(rawRubrics) {
  const rubrics = [];
  let ordinal = 0;

  for (const entry of rawRubrics) {
    const isText = typeof entry === "string";
    if (!isText && !isJsonObject2(entry)) {
      continue;
    }
    // Numbering counts every accepted entry, even ones dropped further down.
    ordinal += 1;

    if (isText) {
      if (entry.length > 0) {
        rubrics.push({
          id: `rubric-${ordinal}`,
          expected_outcome: entry,
          weight: 1,
          required: true
        });
      }
      continue;
    }

    const outcome = asString(entry.expected_outcome) ?? asString(entry.description) ?? "";

    // Collect score ranges that are objects and carry outcome text.
    const scoreRanges = [];
    const rangesInput = entry.score_ranges;
    if (Array.isArray(rangesInput) && rangesInput.length > 0) {
      for (const range of rangesInput) {
        if (!isJsonObject2(range)) {
          continue;
        }
        const bounds = Array.isArray(range.score_range) ? range.score_range : [0, 10];
        const rangeOutcome = asString(range.expected_outcome) ?? asString(range.description) ?? "";
        if (rangeOutcome.length > 0) {
          scoreRanges.push({ score_range: bounds, expected_outcome: rangeOutcome });
        }
      }
    }

    const item = {
      id: asString(entry.id) ?? `rubric-${ordinal}`,
      weight: typeof entry.weight === "number" ? entry.weight : 1
    };
    const hasRequired = typeof entry.required === "boolean";
    const hasMinScore = typeof entry.required_min_score === "number";

    if (scoreRanges.length > 0) {
      // Score-range rubric: optional fields are emitted only when provided.
      if (outcome.length > 0) {
        item.expected_outcome = outcome;
      }
      if (hasRequired) {
        item.required = entry.required;
      }
      if (hasMinScore) {
        item.required_min_score = entry.required_min_score;
      }
      item.score_ranges = scoreRanges;
      rubrics.push(item);
      continue;
    }

    // Checklist rubric: useless without outcome text.
    if (outcome.length === 0) {
      continue;
    }
    item.expected_outcome = outcome;
    item.required = hasRequired ? entry.required : true;
    if (hasMinScore) {
      item.required_min_score = entry.required_min_score;
    }
    rubrics.push(item);
  }

  if (rubrics.length === 0) {
    return void 0;
  }
  return {
    name: "rubric",
    type: "llm_judge",
    rubrics
  };
}
|
|
35918
36132
|
function formatFileContents(parts) {
|
|
35919
36133
|
const fileCount = parts.filter((p) => p.isFile).length;
|
|
35920
36134
|
if (fileCount > 0) {
|
|
@@ -36167,6 +36381,63 @@ async function processExpectedMessages(options) {
|
|
|
36167
36381
|
}
|
|
36168
36382
|
return segments;
|
|
36169
36383
|
}
|
|
36384
|
+
/**
 * Expand the `input` shorthand of an eval case into a message list.
 *
 * - a string becomes a single `user` message;
 * - an array is filtered down to the entries accepted by `isTestMessage`;
 * - anything else (including `null`/`undefined`) yields `undefined`.
 *
 * @param {unknown} value - Raw `input` value.
 * @returns {object[] | undefined} Messages, or `undefined` when none result.
 */
function expandInputShorthand(value) {
  if (typeof value === "string") {
    return [{ role: "user", content: value }];
  }
  if (!Array.isArray(value)) {
    return void 0;
  }
  const accepted = value.filter((candidate) => isTestMessage(candidate));
  if (accepted.length === 0) {
    return void 0;
  }
  return accepted;
}
|
|
36397
|
+
/**
 * Expand the `expected_output` shorthand of an eval case into a message list.
 *
 * - a string becomes the content of a single `assistant` message;
 * - an array whose first element is an object with a `role` key is treated as
 *   a message list and filtered with `isTestMessage`; any other array is
 *   wrapped whole as structured `assistant` content;
 * - an object with a `role` key is treated as one message (kept only if
 *   `isTestMessage` accepts it); any other object is wrapped as content;
 * - everything else yields `undefined`.
 *
 * @param {unknown} value - Raw `expected_output` value.
 * @returns {object[] | undefined} Messages, or `undefined` when none result.
 */
function expandExpectedOutputShorthand(value) {
  const wrapAsAssistant = (content) => [{ role: "assistant", content }];

  if (value == null) {
    return void 0;
  }
  if (typeof value === "string") {
    return wrapAsAssistant(value);
  }

  if (Array.isArray(value)) {
    const looksLikeMessages = value.length > 0 && isJsonObject(value[0]) && "role" in value[0];
    if (!looksLikeMessages) {
      return wrapAsAssistant(value);
    }
    const accepted = value.filter((candidate) => isTestMessage(candidate));
    return accepted.length > 0 ? accepted : void 0;
  }

  if (!isJsonObject(value)) {
    return void 0;
  }
  if (!("role" in value)) {
    return wrapAsAssistant(value);
  }
  return isTestMessage(value) ? [value] : void 0;
}
|
|
36419
|
+
/**
 * Resolve the input messages of a raw eval case.
 *
 * The canonical `input_messages` field takes precedence: when present it must
 * be an array, and only entries accepted by `isTestMessage` are kept. When it
 * is absent, the `input` shorthand is expanded via `expandInputShorthand`.
 *
 * An explicit `null` is treated as absent, matching how
 * `expandInputShorthand` treats `null`. Without this, a record such as
 * `{"input_messages": null, "input": "..."}` would be rejected as incomplete
 * even though it carries a valid `input`.
 *
 * @param {object} raw - Raw eval case record.
 * @returns {object[] | undefined} Messages, or `undefined` when none are valid.
 */
function resolveInputMessages(raw) {
  const explicit = raw.input_messages;
  if (explicit == null) {
    return expandInputShorthand(raw.input);
  }
  if (!Array.isArray(explicit)) {
    // Present but malformed: do not silently fall back to the shorthand.
    return void 0;
  }
  const messages = explicit.filter((msg) => isTestMessage(msg));
  return messages.length > 0 ? messages : void 0;
}
|
|
36429
|
+
/**
 * Resolve the expected messages of a raw eval case.
 *
 * The canonical `expected_messages` field takes precedence: when present it
 * must be an array, and only entries accepted by `isTestMessage` are kept.
 * When it is absent, the `expected_output` shorthand is expanded via
 * `expandExpectedOutputShorthand`.
 *
 * An explicit `null` is treated as absent, matching how
 * `expandExpectedOutputShorthand` treats `null`. Without this, a record with
 * `"expected_messages": null` would have its `expected_output` ignored.
 *
 * @param {object} raw - Raw eval case record.
 * @returns {object[] | undefined} Messages, or `undefined` when none are valid.
 */
function resolveExpectedMessages(raw) {
  const explicit = raw.expected_messages;
  if (explicit == null) {
    return expandExpectedOutputShorthand(raw.expected_output);
  }
  if (!Array.isArray(explicit)) {
    // Present but malformed: do not silently fall back to the shorthand.
    return void 0;
  }
  const messages = explicit.filter((msg) => isTestMessage(msg));
  return messages.length > 0 ? messages : void 0;
}
|
|
36170
36441
|
var ANSI_YELLOW5 = "\x1B[33m";
|
|
36171
36442
|
var ANSI_RED = "\x1B[31m";
|
|
36172
36443
|
var ANSI_RESET5 = "\x1B[0m";
|
|
@@ -36226,7 +36497,7 @@ function parseJsonlContent(content, filePath) {
|
|
|
36226
36497
|
}
|
|
36227
36498
|
async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
|
|
36228
36499
|
const verbose = options?.verbose ?? false;
|
|
36229
|
-
const
|
|
36500
|
+
const filterPattern = options?.filter;
|
|
36230
36501
|
const absoluteTestPath = path52.resolve(evalFilePath);
|
|
36231
36502
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
36232
36503
|
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
@@ -36253,28 +36524,20 @@ async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
36253
36524
|
const evalcase = rawCases[lineIndex];
|
|
36254
36525
|
const lineNumber = lineIndex + 1;
|
|
36255
36526
|
const id = asString4(evalcase.id);
|
|
36256
|
-
if (
|
|
36527
|
+
if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
|
|
36257
36528
|
continue;
|
|
36258
36529
|
}
|
|
36259
36530
|
const conversationId = asString4(evalcase.conversation_id);
|
|
36260
36531
|
const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
|
|
36261
|
-
const
|
|
36262
|
-
const
|
|
36263
|
-
if (!id || !outcome || !
|
|
36532
|
+
const inputMessages = resolveInputMessages(evalcase);
|
|
36533
|
+
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
36534
|
+
if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
|
|
36264
36535
|
logError(
|
|
36265
|
-
`Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages`
|
|
36536
|
+
`Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages (or input)`
|
|
36266
36537
|
);
|
|
36267
36538
|
continue;
|
|
36268
36539
|
}
|
|
36269
|
-
const hasExpectedMessages =
|
|
36270
|
-
const inputMessages = inputMessagesValue.filter(
|
|
36271
|
-
(msg) => isTestMessage(msg)
|
|
36272
|
-
);
|
|
36273
|
-
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
36274
|
-
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
36275
|
-
logError(`Line ${lineNumber}: No valid expected message found for eval case: ${id}`);
|
|
36276
|
-
continue;
|
|
36277
|
-
}
|
|
36540
|
+
const hasExpectedMessages = expectedMessages.length > 0;
|
|
36278
36541
|
const guidelinePaths = [];
|
|
36279
36542
|
const inputTextParts = [];
|
|
36280
36543
|
const inputSegments = await processMessages({
|
|
@@ -36320,28 +36583,8 @@ async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
36320
36583
|
}
|
|
36321
36584
|
const inlineRubrics = evalcase.rubrics;
|
|
36322
36585
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
36323
|
-
const
|
|
36324
|
-
|
|
36325
|
-
return {
|
|
36326
|
-
id: `rubric-${index + 1}`,
|
|
36327
|
-
description: rubric,
|
|
36328
|
-
weight: 1,
|
|
36329
|
-
required: true
|
|
36330
|
-
};
|
|
36331
|
-
}
|
|
36332
|
-
return {
|
|
36333
|
-
id: asString4(rubric.id) ?? `rubric-${index + 1}`,
|
|
36334
|
-
description: asString4(rubric.description) ?? "",
|
|
36335
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
36336
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
36337
|
-
};
|
|
36338
|
-
}).filter((r) => r.description.length > 0);
|
|
36339
|
-
if (rubricItems.length > 0) {
|
|
36340
|
-
const rubricEvaluator = {
|
|
36341
|
-
name: "rubric",
|
|
36342
|
-
type: "llm_judge",
|
|
36343
|
-
rubrics: rubricItems
|
|
36344
|
-
};
|
|
36586
|
+
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
36587
|
+
if (rubricEvaluator) {
|
|
36345
36588
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
36346
36589
|
}
|
|
36347
36590
|
}
|
|
@@ -36645,7 +36888,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
36645
36888
|
return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
|
|
36646
36889
|
}
|
|
36647
36890
|
const verbose = options?.verbose ?? false;
|
|
36648
|
-
const
|
|
36891
|
+
const filterPattern = options?.filter;
|
|
36649
36892
|
const absoluteTestPath = path72.resolve(evalFilePath);
|
|
36650
36893
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
36651
36894
|
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
@@ -36675,28 +36918,20 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
36675
36918
|
}
|
|
36676
36919
|
const evalcase = rawEvalcase;
|
|
36677
36920
|
const id = asString6(evalcase.id);
|
|
36678
|
-
if (
|
|
36921
|
+
if (filterPattern && (!id || !micromatch3.isMatch(id, filterPattern))) {
|
|
36679
36922
|
continue;
|
|
36680
36923
|
}
|
|
36681
36924
|
const conversationId = asString6(evalcase.conversation_id);
|
|
36682
36925
|
const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
|
|
36683
|
-
const
|
|
36684
|
-
const
|
|
36685
|
-
if (!id || !outcome || !
|
|
36926
|
+
const inputMessages = resolveInputMessages(evalcase);
|
|
36927
|
+
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
36928
|
+
if (!id || !outcome || !inputMessages || inputMessages.length === 0) {
|
|
36686
36929
|
logError2(
|
|
36687
|
-
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
|
|
36930
|
+
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages (or input)`
|
|
36688
36931
|
);
|
|
36689
36932
|
continue;
|
|
36690
36933
|
}
|
|
36691
|
-
const hasExpectedMessages =
|
|
36692
|
-
const inputMessages = inputMessagesValue.filter(
|
|
36693
|
-
(msg) => isTestMessage(msg)
|
|
36694
|
-
);
|
|
36695
|
-
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
36696
|
-
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
36697
|
-
logError2(`No valid expected message found for eval case: ${id}`);
|
|
36698
|
-
continue;
|
|
36699
|
-
}
|
|
36934
|
+
const hasExpectedMessages = expectedMessages.length > 0;
|
|
36700
36935
|
const guidelinePaths = [];
|
|
36701
36936
|
const inputTextParts = [];
|
|
36702
36937
|
const inputSegments = await processMessages({
|
|
@@ -36740,28 +36975,8 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
36740
36975
|
}
|
|
36741
36976
|
const inlineRubrics = evalcase.rubrics;
|
|
36742
36977
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
36743
|
-
const
|
|
36744
|
-
|
|
36745
|
-
return {
|
|
36746
|
-
id: `rubric-${index + 1}`,
|
|
36747
|
-
description: rubric,
|
|
36748
|
-
weight: 1,
|
|
36749
|
-
required: true
|
|
36750
|
-
};
|
|
36751
|
-
}
|
|
36752
|
-
return {
|
|
36753
|
-
id: asString6(rubric.id) ?? `rubric-${index + 1}`,
|
|
36754
|
-
description: asString6(rubric.description) ?? "",
|
|
36755
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
36756
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
36757
|
-
};
|
|
36758
|
-
}).filter((r) => r.description.length > 0);
|
|
36759
|
-
if (rubricItems.length > 0) {
|
|
36760
|
-
const rubricEvaluator = {
|
|
36761
|
-
name: "rubric",
|
|
36762
|
-
type: "llm_judge",
|
|
36763
|
-
rubrics: rubricItems
|
|
36764
|
-
};
|
|
36978
|
+
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
36979
|
+
if (rubricEvaluator) {
|
|
36765
36980
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
36766
36981
|
}
|
|
36767
36982
|
}
|
|
@@ -37975,7 +38190,8 @@ var ToolCallSchema = external_exports.object({
|
|
|
37975
38190
|
input: external_exports.unknown().optional(),
|
|
37976
38191
|
output: external_exports.unknown().optional(),
|
|
37977
38192
|
id: external_exports.string().optional(),
|
|
37978
|
-
timestamp: external_exports.string().optional()
|
|
38193
|
+
timestamp: external_exports.string().optional(),
|
|
38194
|
+
duration_ms: external_exports.number().optional()
|
|
37979
38195
|
});
|
|
37980
38196
|
var OutputMessageInputSchema = external_exports.object({
|
|
37981
38197
|
role: external_exports.string(),
|
|
@@ -37983,6 +38199,7 @@ var OutputMessageInputSchema = external_exports.object({
|
|
|
37983
38199
|
content: external_exports.unknown().optional(),
|
|
37984
38200
|
tool_calls: external_exports.array(ToolCallSchema).optional(),
|
|
37985
38201
|
timestamp: external_exports.string().optional(),
|
|
38202
|
+
duration_ms: external_exports.number().optional(),
|
|
37986
38203
|
metadata: external_exports.record(external_exports.unknown()).optional()
|
|
37987
38204
|
});
|
|
37988
38205
|
var TokenUsageSchema = external_exports.object({
|
|
@@ -38021,8 +38238,16 @@ function convertOutputMessages(messages) {
|
|
|
38021
38238
|
role: msg.role,
|
|
38022
38239
|
name: msg.name,
|
|
38023
38240
|
content: msg.content,
|
|
38024
|
-
toolCalls: msg.tool_calls
|
|
38241
|
+
toolCalls: msg.tool_calls?.map((tc) => ({
|
|
38242
|
+
tool: tc.tool,
|
|
38243
|
+
input: tc.input,
|
|
38244
|
+
output: tc.output,
|
|
38245
|
+
id: tc.id,
|
|
38246
|
+
timestamp: tc.timestamp,
|
|
38247
|
+
durationMs: tc.duration_ms
|
|
38248
|
+
})),
|
|
38025
38249
|
timestamp: msg.timestamp,
|
|
38250
|
+
durationMs: msg.duration_ms,
|
|
38026
38251
|
metadata: msg.metadata
|
|
38027
38252
|
}));
|
|
38028
38253
|
}
|
|
@@ -41012,6 +41237,15 @@ var rubricEvaluationSchema = external_exports.object({
|
|
|
41012
41237
|
checks: external_exports.array(rubricCheckResultSchema).describe("Results for each rubric item"),
|
|
41013
41238
|
overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)")
|
|
41014
41239
|
});
|
|
41240
|
+
var scoreRangeCheckResultSchema = external_exports.object({
|
|
41241
|
+
id: external_exports.string().describe("The ID of the rubric criterion being scored"),
|
|
41242
|
+
score: external_exports.number().int().min(0).max(10).describe("Integer score 0-10 for this criterion"),
|
|
41243
|
+
reasoning: external_exports.string().describe("Brief explanation (1-2 sentences) for this score").optional()
|
|
41244
|
+
});
|
|
41245
|
+
var scoreRangeEvaluationSchema = external_exports.object({
|
|
41246
|
+
checks: external_exports.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
|
|
41247
|
+
overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)").optional()
|
|
41248
|
+
});
|
|
41015
41249
|
var LlmJudgeEvaluator = class {
|
|
41016
41250
|
kind = "llm_judge";
|
|
41017
41251
|
resolveJudgeProvider;
|
|
@@ -41097,6 +41331,10 @@ var LlmJudgeEvaluator = class {
|
|
|
41097
41331
|
`No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
|
|
41098
41332
|
);
|
|
41099
41333
|
}
|
|
41334
|
+
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
41335
|
+
if (hasScoreRanges) {
|
|
41336
|
+
return this.evaluateWithScoreRanges(context, judgeProvider, rubrics);
|
|
41337
|
+
}
|
|
41100
41338
|
const prompt = this.buildRubricPrompt(context, rubrics);
|
|
41101
41339
|
const systemPrompt = buildRubricOutputSchema();
|
|
41102
41340
|
const evaluatorRawRequest = {
|
|
@@ -41122,6 +41360,84 @@ var LlmJudgeEvaluator = class {
|
|
|
41122
41360
|
evaluatorRawRequest
|
|
41123
41361
|
};
|
|
41124
41362
|
}
|
|
41363
|
+
/**
|
|
41364
|
+
* Evaluate using score-range rubrics (analytic rubric scoring).
|
|
41365
|
+
* Each criterion is scored 0-10 and normalized to 0-1.
|
|
41366
|
+
*/
|
|
41367
|
+
async evaluateWithScoreRanges(context, judgeProvider, rubrics) {
|
|
41368
|
+
const prompt = this.buildScoreRangePrompt(context, rubrics);
|
|
41369
|
+
const systemPrompt = buildScoreRangeOutputSchema();
|
|
41370
|
+
const evaluatorRawRequest = {
|
|
41371
|
+
userPrompt: prompt,
|
|
41372
|
+
systemPrompt,
|
|
41373
|
+
target: judgeProvider.targetName
|
|
41374
|
+
};
|
|
41375
|
+
const { data } = await this.runWithRetry({
|
|
41376
|
+
context,
|
|
41377
|
+
judgeProvider,
|
|
41378
|
+
systemPrompt,
|
|
41379
|
+
userPrompt: prompt,
|
|
41380
|
+
schema: scoreRangeEvaluationSchema
|
|
41381
|
+
});
|
|
41382
|
+
const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
|
|
41383
|
+
return {
|
|
41384
|
+
score,
|
|
41385
|
+
verdict,
|
|
41386
|
+
hits,
|
|
41387
|
+
misses,
|
|
41388
|
+
expectedAspectCount: rubrics.length,
|
|
41389
|
+
reasoning: data.overall_reasoning,
|
|
41390
|
+
evaluatorRawRequest,
|
|
41391
|
+
details
|
|
41392
|
+
};
|
|
41393
|
+
}
|
|
41394
|
+
/**
|
|
41395
|
+
* Build prompt for score-range rubric evaluation.
|
|
41396
|
+
*/
|
|
41397
|
+
buildScoreRangePrompt(context, rubrics) {
|
|
41398
|
+
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
41399
|
+
const parts = [
|
|
41400
|
+
"You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
|
|
41401
|
+
"For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
|
|
41402
|
+
"",
|
|
41403
|
+
"[[ ## question ## ]]",
|
|
41404
|
+
formattedQuestion,
|
|
41405
|
+
"",
|
|
41406
|
+
"[[ ## expected_outcome ## ]]",
|
|
41407
|
+
context.evalCase.expected_outcome,
|
|
41408
|
+
""
|
|
41409
|
+
];
|
|
41410
|
+
if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
|
|
41411
|
+
parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
|
|
41412
|
+
}
|
|
41413
|
+
parts.push(
|
|
41414
|
+
"[[ ## candidate_answer ## ]]",
|
|
41415
|
+
context.candidate,
|
|
41416
|
+
"",
|
|
41417
|
+
"[[ ## scoring_criteria ## ]]"
|
|
41418
|
+
);
|
|
41419
|
+
for (const rubric of rubrics) {
|
|
41420
|
+
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
41421
|
+
const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
41422
|
+
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
41423
|
+
if (rubric.expected_outcome) {
|
|
41424
|
+
parts.push(`Description: ${rubric.expected_outcome}`);
|
|
41425
|
+
}
|
|
41426
|
+
if (rubric.score_ranges && rubric.score_ranges.length > 0) {
|
|
41427
|
+
parts.push("Score ranges:");
|
|
41428
|
+
for (const range of rubric.score_ranges) {
|
|
41429
|
+
const [min, max] = range.score_range;
|
|
41430
|
+
const rangeLabel = min === max ? `${min}` : `${min}-${max}`;
|
|
41431
|
+
parts.push(` - Score ${rangeLabel}: ${range.expected_outcome}`);
|
|
41432
|
+
}
|
|
41433
|
+
}
|
|
41434
|
+
}
|
|
41435
|
+
parts.push(
|
|
41436
|
+
"",
|
|
41437
|
+
"For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
|
|
41438
|
+
);
|
|
41439
|
+
return parts.join("\n");
|
|
41440
|
+
}
|
|
41125
41441
|
buildRubricPrompt(context, rubrics) {
|
|
41126
41442
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
41127
41443
|
const parts = [
|
|
@@ -41141,7 +41457,7 @@ var LlmJudgeEvaluator = class {
|
|
|
41141
41457
|
for (const rubric of rubrics) {
|
|
41142
41458
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
41143
41459
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
41144
|
-
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.
|
|
41460
|
+
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`);
|
|
41145
41461
|
}
|
|
41146
41462
|
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
41147
41463
|
return parts.join("\n");
|
|
@@ -41228,9 +41544,9 @@ function calculateRubricScore(result, rubrics) {
|
|
|
41228
41544
|
totalWeight += rubric.weight;
|
|
41229
41545
|
if (check2.satisfied) {
|
|
41230
41546
|
earnedWeight += rubric.weight;
|
|
41231
|
-
hits.push(`[${rubric.id}] ${rubric.
|
|
41547
|
+
hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check2.reasoning}`);
|
|
41232
41548
|
} else {
|
|
41233
|
-
misses.push(`[${rubric.id}] ${rubric.
|
|
41549
|
+
misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check2.reasoning}`);
|
|
41234
41550
|
if (rubric.required) {
|
|
41235
41551
|
failedRequired = true;
|
|
41236
41552
|
}
|
|
@@ -41240,6 +41556,76 @@ function calculateRubricScore(result, rubrics) {
|
|
|
41240
41556
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
41241
41557
|
return { score, verdict, hits, misses };
|
|
41242
41558
|
}
|
|
41559
|
+
function buildScoreRangeOutputSchema() {
|
|
41560
|
+
return `You are an expert evaluator. Score the candidate answer on each criterion.
|
|
41561
|
+
You must return a valid JSON object matching this schema:
|
|
41562
|
+
{
|
|
41563
|
+
"checks": [
|
|
41564
|
+
{
|
|
41565
|
+
"id": "string (criterion id)",
|
|
41566
|
+
"score": integer (0-10),
|
|
41567
|
+
"reasoning": "string (brief explanation for score)"
|
|
41568
|
+
}
|
|
41569
|
+
],
|
|
41570
|
+
"overall_reasoning": "string (summary, optional)"
|
|
41571
|
+
}
|
|
41572
|
+
|
|
41573
|
+
Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.`;
|
|
41574
|
+
}
|
|
41575
|
+
function calculateScoreRangeResult(result, rubrics) {
|
|
41576
|
+
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
41577
|
+
const hits = [];
|
|
41578
|
+
const misses = [];
|
|
41579
|
+
const rawScores = {};
|
|
41580
|
+
let totalWeight = 0;
|
|
41581
|
+
let weightedScoreSum = 0;
|
|
41582
|
+
let failedRequired = false;
|
|
41583
|
+
for (const check2 of result.checks) {
|
|
41584
|
+
const rubric = rubricMap.get(check2.id);
|
|
41585
|
+
if (!rubric) {
|
|
41586
|
+
continue;
|
|
41587
|
+
}
|
|
41588
|
+
const rawScore = Math.max(0, Math.min(10, check2.score));
|
|
41589
|
+
const normalizedScore = rawScore / 10;
|
|
41590
|
+
rawScores[rubric.id] = rawScore;
|
|
41591
|
+
totalWeight += rubric.weight;
|
|
41592
|
+
weightedScoreSum += normalizedScore * rubric.weight;
|
|
41593
|
+
let requiredMinScore;
|
|
41594
|
+
if (rubric.required_min_score !== void 0) {
|
|
41595
|
+
requiredMinScore = rubric.required_min_score;
|
|
41596
|
+
} else if (rubric.required === true) {
|
|
41597
|
+
requiredMinScore = 10;
|
|
41598
|
+
}
|
|
41599
|
+
const matchingRange = rubric.score_ranges?.find(
|
|
41600
|
+
(r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
|
|
41601
|
+
);
|
|
41602
|
+
const rangeDescription = matchingRange?.expected_outcome ?? "";
|
|
41603
|
+
const criterionLabel = rubric.expected_outcome ?? rubric.id;
|
|
41604
|
+
const reasoningText = check2.reasoning ? `: ${check2.reasoning}` : "";
|
|
41605
|
+
const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
|
|
41606
|
+
if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
|
|
41607
|
+
failedRequired = true;
|
|
41608
|
+
misses.push(scoreInfo);
|
|
41609
|
+
} else if (rawScore >= 7) {
|
|
41610
|
+
hits.push(scoreInfo);
|
|
41611
|
+
} else {
|
|
41612
|
+
misses.push(scoreInfo);
|
|
41613
|
+
}
|
|
41614
|
+
}
|
|
41615
|
+
const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
|
|
41616
|
+
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
41617
|
+
return {
|
|
41618
|
+
score,
|
|
41619
|
+
verdict,
|
|
41620
|
+
hits,
|
|
41621
|
+
misses,
|
|
41622
|
+
details: {
|
|
41623
|
+
raw_scores: rawScores,
|
|
41624
|
+
normalization: "score / 10",
|
|
41625
|
+
aggregation: "weighted_average"
|
|
41626
|
+
}
|
|
41627
|
+
};
|
|
41628
|
+
}
|
|
41243
41629
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
41244
41630
|
{{EVALUATOR_RESULTS_JSON}}
|
|
41245
41631
|
|
|
@@ -42055,6 +42441,27 @@ function argsMatch(expected, actual) {
|
|
|
42055
42441
|
}
|
|
42056
42442
|
return true;
|
|
42057
42443
|
}
|
|
42444
|
+
function checkLatency(toolName, maxDurationMs, actualDurationMs) {
|
|
42445
|
+
if (maxDurationMs === void 0) {
|
|
42446
|
+
return { status: "skip", message: "" };
|
|
42447
|
+
}
|
|
42448
|
+
if (actualDurationMs === void 0) {
|
|
42449
|
+
return {
|
|
42450
|
+
status: "skip",
|
|
42451
|
+
message: `No duration data for ${toolName}; latency assertion skipped`
|
|
42452
|
+
};
|
|
42453
|
+
}
|
|
42454
|
+
if (actualDurationMs <= maxDurationMs) {
|
|
42455
|
+
return {
|
|
42456
|
+
status: "pass",
|
|
42457
|
+
message: `${toolName} completed in ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
|
|
42458
|
+
};
|
|
42459
|
+
}
|
|
42460
|
+
return {
|
|
42461
|
+
status: "fail",
|
|
42462
|
+
message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
|
|
42463
|
+
};
|
|
42464
|
+
}
|
|
42058
42465
|
var ToolTrajectoryEvaluator = class {
|
|
42059
42466
|
kind = "tool_trajectory";
|
|
42060
42467
|
config;
|
|
@@ -42113,7 +42520,8 @@ var ToolTrajectoryEvaluator = class {
|
|
|
42113
42520
|
for (const call of message.toolCalls) {
|
|
42114
42521
|
toolCalls.push({
|
|
42115
42522
|
name: call.tool,
|
|
42116
|
-
args: call.input
|
|
42523
|
+
args: call.input,
|
|
42524
|
+
durationMs: call.durationMs
|
|
42117
42525
|
});
|
|
42118
42526
|
}
|
|
42119
42527
|
}
|
|
@@ -42181,17 +42589,27 @@ var ToolTrajectoryEvaluator = class {
|
|
|
42181
42589
|
}
|
|
42182
42590
|
const hits = [];
|
|
42183
42591
|
const misses = [];
|
|
42592
|
+
const warnings = [];
|
|
42184
42593
|
let actualIndex = 0;
|
|
42594
|
+
let sequenceHits = 0;
|
|
42595
|
+
let latencyHits = 0;
|
|
42596
|
+
let latencySkips = 0;
|
|
42597
|
+
const latencyAssertionCount = expected.filter(
|
|
42598
|
+
(item) => item.maxDurationMs !== void 0
|
|
42599
|
+
).length;
|
|
42185
42600
|
for (let i = 0; i < expected.length; i++) {
|
|
42186
42601
|
const expectedItem = expected[i];
|
|
42187
42602
|
const expectedTool = expectedItem.tool;
|
|
42188
42603
|
let found = false;
|
|
42189
42604
|
let argsMismatch = false;
|
|
42605
|
+
let matchedCall;
|
|
42190
42606
|
while (actualIndex < toolCalls.length) {
|
|
42191
42607
|
const actualCall = toolCalls[actualIndex];
|
|
42192
42608
|
if (actualCall.name === expectedTool) {
|
|
42193
42609
|
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
42194
42610
|
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
42611
|
+
sequenceHits++;
|
|
42612
|
+
matchedCall = actualCall;
|
|
42195
42613
|
actualIndex++;
|
|
42196
42614
|
found = true;
|
|
42197
42615
|
break;
|
|
@@ -42208,14 +42626,35 @@ var ToolTrajectoryEvaluator = class {
|
|
|
42208
42626
|
if (!found && !argsMismatch) {
|
|
42209
42627
|
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
42210
42628
|
}
|
|
42629
|
+
if (found && matchedCall) {
|
|
42630
|
+
const latencyResult = checkLatency(
|
|
42631
|
+
expectedTool,
|
|
42632
|
+
expectedItem.maxDurationMs,
|
|
42633
|
+
matchedCall.durationMs
|
|
42634
|
+
);
|
|
42635
|
+
if (latencyResult.status === "pass") {
|
|
42636
|
+
hits.push(latencyResult.message);
|
|
42637
|
+
latencyHits++;
|
|
42638
|
+
} else if (latencyResult.status === "fail") {
|
|
42639
|
+
misses.push(latencyResult.message);
|
|
42640
|
+
} else if (latencyResult.message) {
|
|
42641
|
+
warnings.push(latencyResult.message);
|
|
42642
|
+
latencySkips++;
|
|
42643
|
+
}
|
|
42644
|
+
}
|
|
42645
|
+
}
|
|
42646
|
+
for (const warning of warnings) {
|
|
42647
|
+
console.warn(`[tool_trajectory] ${warning}`);
|
|
42211
42648
|
}
|
|
42212
|
-
const
|
|
42649
|
+
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
42650
|
+
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
42651
|
+
const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
|
|
42213
42652
|
return {
|
|
42214
42653
|
score,
|
|
42215
42654
|
verdict: scoreToVerdict(score),
|
|
42216
42655
|
hits,
|
|
42217
42656
|
misses,
|
|
42218
|
-
expectedAspectCount:
|
|
42657
|
+
expectedAspectCount: totalAssertions
|
|
42219
42658
|
};
|
|
42220
42659
|
}
|
|
42221
42660
|
evaluateExact(toolCalls) {
|
|
@@ -42231,6 +42670,13 @@ var ToolTrajectoryEvaluator = class {
|
|
|
42231
42670
|
}
|
|
42232
42671
|
const hits = [];
|
|
42233
42672
|
const misses = [];
|
|
42673
|
+
const warnings = [];
|
|
42674
|
+
let sequenceHits = 0;
|
|
42675
|
+
let latencyHits = 0;
|
|
42676
|
+
let latencySkips = 0;
|
|
42677
|
+
const latencyAssertionCount = expected.filter(
|
|
42678
|
+
(item) => item.maxDurationMs !== void 0
|
|
42679
|
+
).length;
|
|
42234
42680
|
if (toolCalls.length !== expected.length) {
|
|
42235
42681
|
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
42236
42682
|
}
|
|
@@ -42240,26 +42686,50 @@ var ToolTrajectoryEvaluator = class {
|
|
|
42240
42686
|
const expectedTool = expectedItem.tool;
|
|
42241
42687
|
const actualCall = toolCalls[i];
|
|
42242
42688
|
const actualTool = actualCall.name;
|
|
42689
|
+
let sequenceMatched = false;
|
|
42243
42690
|
if (actualTool === expectedTool) {
|
|
42244
42691
|
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
42245
42692
|
hits.push(`Position ${i}: ${expectedTool}`);
|
|
42693
|
+
sequenceHits++;
|
|
42694
|
+
sequenceMatched = true;
|
|
42246
42695
|
} else {
|
|
42247
42696
|
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
42248
42697
|
}
|
|
42249
42698
|
} else {
|
|
42250
42699
|
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
42251
42700
|
}
|
|
42701
|
+
if (sequenceMatched) {
|
|
42702
|
+
const latencyResult = checkLatency(
|
|
42703
|
+
expectedTool,
|
|
42704
|
+
expectedItem.maxDurationMs,
|
|
42705
|
+
actualCall.durationMs
|
|
42706
|
+
);
|
|
42707
|
+
if (latencyResult.status === "pass") {
|
|
42708
|
+
hits.push(latencyResult.message);
|
|
42709
|
+
latencyHits++;
|
|
42710
|
+
} else if (latencyResult.status === "fail") {
|
|
42711
|
+
misses.push(latencyResult.message);
|
|
42712
|
+
} else if (latencyResult.message) {
|
|
42713
|
+
warnings.push(latencyResult.message);
|
|
42714
|
+
latencySkips++;
|
|
42715
|
+
}
|
|
42716
|
+
}
|
|
42252
42717
|
}
|
|
42253
42718
|
for (let i = checkLength; i < expected.length; i++) {
|
|
42254
42719
|
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
42255
42720
|
}
|
|
42256
|
-
const
|
|
42721
|
+
for (const warning of warnings) {
|
|
42722
|
+
console.warn(`[tool_trajectory] ${warning}`);
|
|
42723
|
+
}
|
|
42724
|
+
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
42725
|
+
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
42726
|
+
const score = totalAssertions > 0 ? (sequenceHits + latencyHits) / totalAssertions : 1;
|
|
42257
42727
|
return {
|
|
42258
42728
|
score,
|
|
42259
42729
|
verdict: scoreToVerdict(score),
|
|
42260
42730
|
hits,
|
|
42261
42731
|
misses,
|
|
42262
|
-
expectedAspectCount:
|
|
42732
|
+
expectedAspectCount: totalAssertions
|
|
42263
42733
|
};
|
|
42264
42734
|
}
|
|
42265
42735
|
};
|
|
@@ -42415,17 +42885,17 @@ async function runEvaluation(options) {
|
|
|
42415
42885
|
cache,
|
|
42416
42886
|
useCache,
|
|
42417
42887
|
now,
|
|
42418
|
-
|
|
42888
|
+
filter: filter2,
|
|
42419
42889
|
verbose,
|
|
42420
42890
|
evalCases: preloadedEvalCases,
|
|
42421
42891
|
onResult,
|
|
42422
42892
|
onProgress
|
|
42423
42893
|
} = options;
|
|
42424
|
-
const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose,
|
|
42425
|
-
const filteredEvalCases = filterEvalCases(evalCases,
|
|
42894
|
+
const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, filter: filter2 });
|
|
42895
|
+
const filteredEvalCases = filterEvalCases(evalCases, filter2);
|
|
42426
42896
|
if (filteredEvalCases.length === 0) {
|
|
42427
|
-
if (
|
|
42428
|
-
throw new Error(`
|
|
42897
|
+
if (filter2) {
|
|
42898
|
+
throw new Error(`No eval cases matched filter '${filter2}' in ${evalFilePath}`);
|
|
42429
42899
|
}
|
|
42430
42900
|
return [];
|
|
42431
42901
|
}
|
|
@@ -43001,7 +43471,10 @@ async function runEvaluatorList(options) {
|
|
|
43001
43471
|
attempt,
|
|
43002
43472
|
promptInputs,
|
|
43003
43473
|
now,
|
|
43004
|
-
judgeProvider
|
|
43474
|
+
judgeProvider,
|
|
43475
|
+
outputMessages,
|
|
43476
|
+
traceSummary,
|
|
43477
|
+
agentTimeoutMs
|
|
43005
43478
|
});
|
|
43006
43479
|
const weight = evaluator.weight ?? 1;
|
|
43007
43480
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -43336,9 +43809,22 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
43336
43809
|
attempt,
|
|
43337
43810
|
promptInputs,
|
|
43338
43811
|
now,
|
|
43339
|
-
judgeProvider
|
|
43812
|
+
judgeProvider,
|
|
43813
|
+
outputMessages,
|
|
43814
|
+
traceSummary,
|
|
43815
|
+
agentTimeoutMs
|
|
43340
43816
|
} = options;
|
|
43341
|
-
const customPrompt = await resolveCustomPrompt(
|
|
43817
|
+
const customPrompt = await resolveCustomPrompt(
|
|
43818
|
+
config2,
|
|
43819
|
+
{
|
|
43820
|
+
evalCase,
|
|
43821
|
+
candidate,
|
|
43822
|
+
outputMessages,
|
|
43823
|
+
traceSummary,
|
|
43824
|
+
config: config2.config
|
|
43825
|
+
},
|
|
43826
|
+
agentTimeoutMs
|
|
43827
|
+
);
|
|
43342
43828
|
return evaluatorRegistry.llm_judge.evaluate({
|
|
43343
43829
|
evalCase,
|
|
43344
43830
|
candidate,
|
|
@@ -43352,23 +43838,70 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
43352
43838
|
evaluator: config2
|
|
43353
43839
|
});
|
|
43354
43840
|
}
|
|
43355
|
-
async function resolveCustomPrompt(
|
|
43356
|
-
if (
|
|
43841
|
+
async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
|
|
43842
|
+
if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) {
|
|
43843
|
+
if (!context) {
|
|
43844
|
+
throw new Error("Context required for executable prompt templates");
|
|
43845
|
+
}
|
|
43846
|
+
return executePromptTemplate(
|
|
43847
|
+
promptConfig.resolvedPromptScript,
|
|
43848
|
+
context,
|
|
43849
|
+
promptConfig.config,
|
|
43850
|
+
timeoutMs
|
|
43851
|
+
);
|
|
43852
|
+
}
|
|
43853
|
+
const promptPath = promptConfig.resolvedPromptPath ?? promptConfig.promptPath;
|
|
43854
|
+
if (promptPath) {
|
|
43357
43855
|
try {
|
|
43358
|
-
const content = await readTextFile(
|
|
43856
|
+
const content = await readTextFile(promptPath);
|
|
43359
43857
|
return content;
|
|
43360
43858
|
} catch (error40) {
|
|
43361
43859
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
43362
|
-
console.warn(`Could not read custom prompt at ${
|
|
43860
|
+
console.warn(`Could not read custom prompt at ${promptPath}: ${message}`);
|
|
43861
|
+
}
|
|
43862
|
+
}
|
|
43863
|
+
const promptValue = promptConfig.prompt;
|
|
43864
|
+
if (typeof promptValue === "string") {
|
|
43865
|
+
return promptValue;
|
|
43866
|
+
}
|
|
43867
|
+
return void 0;
|
|
43868
|
+
}
|
|
43869
|
+
async function executePromptTemplate(script, context, config2, timeoutMs) {
|
|
43870
|
+
const payload = {
|
|
43871
|
+
question: context.evalCase.question,
|
|
43872
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
43873
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
43874
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
43875
|
+
candidateAnswer: context.candidate,
|
|
43876
|
+
outputMessages: context.outputMessages ?? null,
|
|
43877
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
43878
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
43879
|
+
(p) => !context.evalCase.guideline_paths.includes(p)
|
|
43880
|
+
),
|
|
43881
|
+
inputMessages: context.evalCase.input_messages,
|
|
43882
|
+
traceSummary: context.traceSummary ?? null,
|
|
43883
|
+
config: config2 ?? context.config ?? null
|
|
43884
|
+
};
|
|
43885
|
+
const inputJson = JSON.stringify(toSnakeCaseDeep2(payload), null, 2);
|
|
43886
|
+
const scriptPath = script[script.length - 1];
|
|
43887
|
+
const cwd = path15.dirname(scriptPath);
|
|
43888
|
+
try {
|
|
43889
|
+
const stdout = await executeScript(script, inputJson, timeoutMs, cwd);
|
|
43890
|
+
const prompt = stdout.trim();
|
|
43891
|
+
if (!prompt) {
|
|
43892
|
+
throw new Error("Prompt template produced empty output");
|
|
43363
43893
|
}
|
|
43894
|
+
return prompt;
|
|
43895
|
+
} catch (error40) {
|
|
43896
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
43897
|
+
throw new Error(`Prompt template execution failed: ${message}`);
|
|
43364
43898
|
}
|
|
43365
|
-
return config2.prompt;
|
|
43366
43899
|
}
|
|
43367
|
-
function filterEvalCases(evalCases,
|
|
43368
|
-
if (!
|
|
43900
|
+
function filterEvalCases(evalCases, filter2) {
|
|
43901
|
+
if (!filter2) {
|
|
43369
43902
|
return evalCases;
|
|
43370
43903
|
}
|
|
43371
|
-
return evalCases.filter((evalCase) => evalCase.id
|
|
43904
|
+
return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter2));
|
|
43372
43905
|
}
|
|
43373
43906
|
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
43374
43907
|
const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
|
|
@@ -43522,7 +44055,7 @@ function computeWeightedMean(entries) {
|
|
|
43522
44055
|
}
|
|
43523
44056
|
var rubricItemSchema = external_exports.object({
|
|
43524
44057
|
id: external_exports.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
43525
|
-
|
|
44058
|
+
expected_outcome: external_exports.string().describe("Concrete expected outcome for this rubric item"),
|
|
43526
44059
|
weight: external_exports.number().default(1).describe("Relative importance (default 1.0)"),
|
|
43527
44060
|
required: external_exports.boolean().default(true).describe("Whether this is a mandatory requirement")
|
|
43528
44061
|
});
|
|
@@ -43542,7 +44075,7 @@ You must return a valid JSON object matching this schema:
|
|
|
43542
44075
|
"rubrics": [
|
|
43543
44076
|
{
|
|
43544
44077
|
"id": "string (short identifier)",
|
|
43545
|
-
"
|
|
44078
|
+
"expected_outcome": "string (concrete expected outcome for this rubric item)",
|
|
43546
44079
|
"weight": number (default 1.0),
|
|
43547
44080
|
"required": boolean (default true)
|
|
43548
44081
|
}
|
|
@@ -43578,7 +44111,7 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
|
|
|
43578
44111
|
"Each rubric should:",
|
|
43579
44112
|
"- Be specific and testable",
|
|
43580
44113
|
"- Have a short, descriptive ID",
|
|
43581
|
-
"- Include a clear
|
|
44114
|
+
"- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)",
|
|
43582
44115
|
"- Indicate if it is required (mandatory) or optional",
|
|
43583
44116
|
"- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
|
|
43584
44117
|
"",
|
|
@@ -44439,17 +44972,31 @@ async function validateEvalFile(filePath) {
|
|
|
44439
44972
|
});
|
|
44440
44973
|
}
|
|
44441
44974
|
const inputMessages = evalCase.input_messages;
|
|
44442
|
-
|
|
44975
|
+
const inputAlias = evalCase.input;
|
|
44976
|
+
if (Array.isArray(inputMessages)) {
|
|
44977
|
+
validateMessages(inputMessages, `${location}.input_messages`, absolutePath, errors);
|
|
44978
|
+
} else if (inputAlias !== void 0) {
|
|
44979
|
+
if (typeof inputAlias === "string") {
|
|
44980
|
+
} else if (Array.isArray(inputAlias)) {
|
|
44981
|
+
validateMessages(inputAlias, `${location}.input`, absolutePath, errors);
|
|
44982
|
+
} else {
|
|
44983
|
+
errors.push({
|
|
44984
|
+
severity: "error",
|
|
44985
|
+
filePath: absolutePath,
|
|
44986
|
+
location: `${location}.input`,
|
|
44987
|
+
message: "Invalid 'input' field (must be a string or array of messages)"
|
|
44988
|
+
});
|
|
44989
|
+
}
|
|
44990
|
+
} else {
|
|
44443
44991
|
errors.push({
|
|
44444
44992
|
severity: "error",
|
|
44445
44993
|
filePath: absolutePath,
|
|
44446
44994
|
location: `${location}.input_messages`,
|
|
44447
|
-
message: "Missing or
|
|
44995
|
+
message: "Missing 'input_messages' or 'input' field (must provide one)"
|
|
44448
44996
|
});
|
|
44449
|
-
} else {
|
|
44450
|
-
validateMessages(inputMessages, `${location}.input_messages`, absolutePath, errors);
|
|
44451
44997
|
}
|
|
44452
44998
|
const expectedMessages = evalCase.expected_messages;
|
|
44999
|
+
const expectedOutputAlias = evalCase.expected_output;
|
|
44453
45000
|
if (expectedMessages !== void 0 && !Array.isArray(expectedMessages)) {
|
|
44454
45001
|
errors.push({
|
|
44455
45002
|
severity: "error",
|
|
@@ -44459,6 +45006,26 @@ async function validateEvalFile(filePath) {
|
|
|
44459
45006
|
});
|
|
44460
45007
|
} else if (Array.isArray(expectedMessages)) {
|
|
44461
45008
|
validateMessages(expectedMessages, `${location}.expected_messages`, absolutePath, errors);
|
|
45009
|
+
} else if (expectedOutputAlias !== void 0) {
|
|
45010
|
+
if (typeof expectedOutputAlias === "string") {
|
|
45011
|
+
} else if (Array.isArray(expectedOutputAlias)) {
|
|
45012
|
+
if (expectedOutputAlias.length > 0 && isObject2(expectedOutputAlias[0]) && "role" in expectedOutputAlias[0]) {
|
|
45013
|
+
validateMessages(
|
|
45014
|
+
expectedOutputAlias,
|
|
45015
|
+
`${location}.expected_output`,
|
|
45016
|
+
absolutePath,
|
|
45017
|
+
errors
|
|
45018
|
+
);
|
|
45019
|
+
}
|
|
45020
|
+
} else if (isObject2(expectedOutputAlias)) {
|
|
45021
|
+
} else {
|
|
45022
|
+
errors.push({
|
|
45023
|
+
severity: "error",
|
|
45024
|
+
filePath: absolutePath,
|
|
45025
|
+
location: `${location}.expected_output`,
|
|
45026
|
+
message: "Invalid 'expected_output' field (must be a string, object, or array)"
|
|
45027
|
+
});
|
|
45028
|
+
}
|
|
44462
45029
|
}
|
|
44463
45030
|
}
|
|
44464
45031
|
return {
|
|
@@ -45302,7 +45869,7 @@ function normalizeOptions(rawOptions) {
|
|
|
45302
45869
|
return {
|
|
45303
45870
|
target: normalizeString(rawOptions.target),
|
|
45304
45871
|
targetsPath: normalizeString(rawOptions.targets),
|
|
45305
|
-
|
|
45872
|
+
filter: normalizeString(rawOptions.filter),
|
|
45306
45873
|
workers: workers > 0 ? workers : void 0,
|
|
45307
45874
|
outPath: normalizeString(rawOptions.out),
|
|
45308
45875
|
format,
|
|
@@ -45427,9 +45994,9 @@ async function prepareFileMetadata(params) {
|
|
|
45427
45994
|
const inlineTargetLabel = `${selection.targetName} [provider=${providerLabel}]`;
|
|
45428
45995
|
const evalCases = await loadEvalCases(testFilePath, repoRoot, {
|
|
45429
45996
|
verbose: options.verbose,
|
|
45430
|
-
|
|
45997
|
+
filter: options.filter
|
|
45431
45998
|
});
|
|
45432
|
-
const filteredIds =
|
|
45999
|
+
const filteredIds = evalCases.map((value) => value.id);
|
|
45433
46000
|
return { evalIds: filteredIds, evalCases, selection, inlineTargetLabel };
|
|
45434
46001
|
}
|
|
45435
46002
|
async function runWithLimit(items, limit, task) {
|
|
@@ -45500,7 +46067,6 @@ async function runSingleEvalFile(params) {
|
|
|
45500
46067
|
agentTimeoutMs,
|
|
45501
46068
|
cache,
|
|
45502
46069
|
useCache: options.cache,
|
|
45503
|
-
evalId: options.evalId,
|
|
45504
46070
|
evalCases,
|
|
45505
46071
|
verbose: options.verbose,
|
|
45506
46072
|
maxConcurrency: resolvedWorkers,
|
|
@@ -45676,7 +46242,7 @@ var evalCommand = command3({
|
|
|
45676
46242
|
evalId: option3({
|
|
45677
46243
|
type: optional4(string6),
|
|
45678
46244
|
long: "eval-id",
|
|
45679
|
-
description:
|
|
46245
|
+
description: 'Filter eval cases by ID pattern (glob supported, e.g., "summary-*")'
|
|
45680
46246
|
}),
|
|
45681
46247
|
workers: option3({
|
|
45682
46248
|
type: number5,
|
|
@@ -45743,7 +46309,7 @@ var evalCommand = command3({
|
|
|
45743
46309
|
const rawOptions = {
|
|
45744
46310
|
target: args.target,
|
|
45745
46311
|
targets: args.targets,
|
|
45746
|
-
|
|
46312
|
+
filter: args.evalId,
|
|
45747
46313
|
workers: args.workers,
|
|
45748
46314
|
out: args.out,
|
|
45749
46315
|
outputFormat: args.outputFormat,
|
|
@@ -45902,14 +46468,12 @@ async function generateRubricsCommand(options) {
|
|
|
45902
46468
|
if (caseNode && isMap(caseNode)) {
|
|
45903
46469
|
caseNode.set(
|
|
45904
46470
|
"rubrics",
|
|
45905
|
-
rubrics.map(
|
|
45906
|
-
|
|
45907
|
-
|
|
45908
|
-
|
|
45909
|
-
|
|
45910
|
-
|
|
45911
|
-
})
|
|
45912
|
-
)
|
|
46471
|
+
rubrics.filter((r) => r.expected_outcome !== void 0).map((r) => ({
|
|
46472
|
+
id: r.id,
|
|
46473
|
+
expected_outcome: r.expected_outcome,
|
|
46474
|
+
weight: r.weight,
|
|
46475
|
+
required: r.required ?? true
|
|
46476
|
+
}))
|
|
45913
46477
|
);
|
|
45914
46478
|
}
|
|
45915
46479
|
updatedCount++;
|
|
@@ -46454,4 +47018,4 @@ export {
|
|
|
46454
47018
|
app,
|
|
46455
47019
|
runCli
|
|
46456
47020
|
};
|
|
46457
|
-
//# sourceMappingURL=chunk-
|
|
47021
|
+
//# sourceMappingURL=chunk-XREH4WAJ.js.map
|