agentv 0.21.3 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-A5T7W63L.js → chunk-QRY42RAP.js} +401 -10
- package/dist/chunk-QRY42RAP.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.js +1 -1
- package/package.json +5 -9
- package/dist/chunk-A5T7W63L.js.map +0 -1
|
@@ -34562,7 +34562,7 @@ function isTestMessage(value) {
|
|
|
34562
34562
|
}
|
|
34563
34563
|
return candidate.content.every(isJsonObject);
|
|
34564
34564
|
}
|
|
34565
|
-
var EVALUATOR_KIND_VALUES = ["
|
|
34565
|
+
var EVALUATOR_KIND_VALUES = ["code_judge", "llm_judge", "rubric", "composite"];
|
|
34566
34566
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
34567
34567
|
function isEvaluatorKind(value) {
|
|
34568
34568
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
@@ -34879,10 +34879,10 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
34879
34879
|
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
34880
34880
|
continue;
|
|
34881
34881
|
}
|
|
34882
|
-
if (typeValue === "
|
|
34882
|
+
if (typeValue === "code_judge") {
|
|
34883
34883
|
const script = asString2(rawEvaluator.script);
|
|
34884
34884
|
if (!script) {
|
|
34885
|
-
logWarning2(`Skipping
|
|
34885
|
+
logWarning2(`Skipping code_judge evaluator '${name16}' in '${evalId}': missing script`);
|
|
34886
34886
|
continue;
|
|
34887
34887
|
}
|
|
34888
34888
|
const cwd = asString2(rawEvaluator.cwd);
|
|
@@ -34893,7 +34893,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
34893
34893
|
resolvedCwd = path32.resolve(resolved.resolvedPath);
|
|
34894
34894
|
} else {
|
|
34895
34895
|
logWarning2(
|
|
34896
|
-
`
|
|
34896
|
+
`Code_judge evaluator '${name16}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
34897
34897
|
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
34898
34898
|
);
|
|
34899
34899
|
}
|
|
@@ -34909,6 +34909,105 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
34909
34909
|
});
|
|
34910
34910
|
continue;
|
|
34911
34911
|
}
|
|
34912
|
+
if (typeValue === "composite") {
|
|
34913
|
+
const rawMembers = rawEvaluator.evaluators;
|
|
34914
|
+
if (!Array.isArray(rawMembers)) {
|
|
34915
|
+
logWarning2(
|
|
34916
|
+
`Skipping composite evaluator '${name16}' in '${evalId}': missing evaluators array`
|
|
34917
|
+
);
|
|
34918
|
+
continue;
|
|
34919
|
+
}
|
|
34920
|
+
const rawAggregator = rawEvaluator.aggregator;
|
|
34921
|
+
if (!isJsonObject2(rawAggregator)) {
|
|
34922
|
+
logWarning2(`Skipping composite evaluator '${name16}' in '${evalId}': missing aggregator`);
|
|
34923
|
+
continue;
|
|
34924
|
+
}
|
|
34925
|
+
const aggregatorType = asString2(rawAggregator.type);
|
|
34926
|
+
if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
|
|
34927
|
+
logWarning2(
|
|
34928
|
+
`Skipping composite evaluator '${name16}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
34929
|
+
);
|
|
34930
|
+
continue;
|
|
34931
|
+
}
|
|
34932
|
+
const memberEvaluators = [];
|
|
34933
|
+
for (const rawMember of rawMembers) {
|
|
34934
|
+
if (!isJsonObject2(rawMember)) {
|
|
34935
|
+
logWarning2(`Skipping invalid member evaluator in composite '${name16}' (expected object)`);
|
|
34936
|
+
continue;
|
|
34937
|
+
}
|
|
34938
|
+
const memberName = asString2(rawMember.name);
|
|
34939
|
+
const memberType = rawMember.type;
|
|
34940
|
+
if (!memberName || !isEvaluatorKind(memberType)) {
|
|
34941
|
+
logWarning2(`Skipping member evaluator with invalid name/type in composite '${name16}'`);
|
|
34942
|
+
continue;
|
|
34943
|
+
}
|
|
34944
|
+
const memberConfigs = await parseEvaluators(
|
|
34945
|
+
{ evaluators: [rawMember] },
|
|
34946
|
+
void 0,
|
|
34947
|
+
searchRoots,
|
|
34948
|
+
`${evalId}:${name16}:${memberName}`
|
|
34949
|
+
);
|
|
34950
|
+
if (memberConfigs && memberConfigs.length > 0) {
|
|
34951
|
+
memberEvaluators.push(memberConfigs[0]);
|
|
34952
|
+
}
|
|
34953
|
+
}
|
|
34954
|
+
if (memberEvaluators.length === 0) {
|
|
34955
|
+
logWarning2(
|
|
34956
|
+
`Skipping composite evaluator '${name16}' in '${evalId}': no valid member evaluators`
|
|
34957
|
+
);
|
|
34958
|
+
continue;
|
|
34959
|
+
}
|
|
34960
|
+
let aggregator;
|
|
34961
|
+
if (aggregatorType === "weighted_average") {
|
|
34962
|
+
const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
|
|
34963
|
+
const parsedWeights = {};
|
|
34964
|
+
if (weights) {
|
|
34965
|
+
for (const [key2, value] of Object.entries(weights)) {
|
|
34966
|
+
if (typeof value === "number") {
|
|
34967
|
+
parsedWeights[key2] = value;
|
|
34968
|
+
}
|
|
34969
|
+
}
|
|
34970
|
+
}
|
|
34971
|
+
aggregator = {
|
|
34972
|
+
type: "weighted_average",
|
|
34973
|
+
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
34974
|
+
};
|
|
34975
|
+
} else if (aggregatorType === "code_judge") {
|
|
34976
|
+
const aggregatorPath = asString2(rawAggregator.path);
|
|
34977
|
+
if (!aggregatorPath) {
|
|
34978
|
+
logWarning2(
|
|
34979
|
+
`Skipping composite evaluator '${name16}' in '${evalId}': code_judge aggregator missing path`
|
|
34980
|
+
);
|
|
34981
|
+
continue;
|
|
34982
|
+
}
|
|
34983
|
+
aggregator = {
|
|
34984
|
+
type: "code_judge",
|
|
34985
|
+
path: aggregatorPath,
|
|
34986
|
+
cwd: searchRoots[0]
|
|
34987
|
+
};
|
|
34988
|
+
} else {
|
|
34989
|
+
const aggregatorPrompt = asString2(rawAggregator.prompt);
|
|
34990
|
+
let promptPath2;
|
|
34991
|
+
if (aggregatorPrompt) {
|
|
34992
|
+
const resolved = await resolveFileReference2(aggregatorPrompt, searchRoots);
|
|
34993
|
+
if (resolved.resolvedPath) {
|
|
34994
|
+
promptPath2 = path32.resolve(resolved.resolvedPath);
|
|
34995
|
+
}
|
|
34996
|
+
}
|
|
34997
|
+
aggregator = {
|
|
34998
|
+
type: "llm_judge",
|
|
34999
|
+
...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
|
|
35000
|
+
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
35001
|
+
};
|
|
35002
|
+
}
|
|
35003
|
+
evaluators.push({
|
|
35004
|
+
name: name16,
|
|
35005
|
+
type: "composite",
|
|
35006
|
+
evaluators: memberEvaluators,
|
|
35007
|
+
aggregator
|
|
35008
|
+
});
|
|
35009
|
+
continue;
|
|
35010
|
+
}
|
|
34912
35011
|
const prompt = asString2(rawEvaluator.prompt);
|
|
34913
35012
|
let promptPath;
|
|
34914
35013
|
if (prompt) {
|
|
@@ -37728,6 +37827,228 @@ function substituteVariables(template, variables) {
|
|
|
37728
37827
|
return variables[varName] ?? match;
|
|
37729
37828
|
});
|
|
37730
37829
|
}
|
|
37830
|
+
/**
 * Prompt template used by the `llm_judge` aggregator when the aggregator
 * config carries no `prompt`. Every `{{EVALUATOR_RESULTS_JSON}}` placeholder is
 * replaced with the pretty-printed JSON of the member results, keyed by
 * member name.
 */
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
{{EVALUATOR_RESULTS_JSON}}

Decide the final score and verdict based on all evaluator results.
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;

/**
 * Flattens one member outcome into the record stored under a composite
 * result's `evaluatorResults`.
 *
 * @param {{id: string, type: string, result: object}} member - Member outcome
 *   as produced by `CompositeEvaluator.evaluate`.
 * @param {number} [weight] - Weight to record; the `weight` key is omitted
 *   entirely when no weight is supplied.
 * @returns {object} A record holding copies of the member's hits and misses.
 */
function summarizeCompositeMember(member, weight) {
  const { result } = member;
  return {
    name: member.id,
    type: member.type,
    score: result.score,
    ...(weight === undefined ? {} : { weight }),
    verdict: result.verdict,
    hits: [...result.hits],
    misses: [...result.misses],
    reasoning: result.reasoning,
    evaluatorRawRequest: result.evaluatorRawRequest,
    evaluatorResults: result.evaluatorResults
  };
}

/**
 * Evaluator that runs a list of member evaluators and folds their results into
 * a single score using the configured aggregator (`weighted_average`,
 * `code_judge` or `llm_judge`).
 */
var CompositeEvaluator = class {
  kind = "composite";
  config;
  evaluatorFactory;
  cwd;

  /**
   * @param {object} options
   * @param {object} options.config - Composite config; `evaluators` and
   *   `aggregator` are read from it.
   * @param {{create: Function}} options.evaluatorFactory - Creates the
   *   evaluator instance for a member config.
   * @param {string} [options.cwd] - Working directory used for a `code_judge`
   *   aggregator whose config has no `cwd` of its own.
   */
  constructor(options) {
    this.config = options.config;
    this.evaluatorFactory = options.evaluatorFactory;
    this.cwd = options.cwd;
  }

  /**
   * Runs every member evaluator concurrently, then aggregates the results.
   * A member that throws (or a factory that throws) rejects the whole call.
   *
   * @param {object} context - Evaluation context, handed unchanged to the
   *   factory, to every member and to the aggregator.
   * @returns {Promise<object>} The aggregated evaluation result.
   */
  async evaluate(context) {
    const memberResults = await Promise.all(
      this.config.evaluators.map(async (memberConfig) => {
        const evaluator = this.evaluatorFactory.create(memberConfig, context);
        return {
          id: memberConfig.name,
          type: memberConfig.type,
          result: await evaluator.evaluate(context)
        };
      })
    );
    return this.aggregate(memberResults, context);
  }

  /**
   * Dispatches to the aggregator selected by `config.aggregator.type`.
   * Any type other than `code_judge` / `llm_judge` uses the weighted average.
   */
  async aggregate(results, context) {
    const aggregator = this.config.aggregator;
    switch (aggregator.type) {
      case "code_judge":
        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
      case "llm_judge":
        return this.runLlmAggregator(results, context, aggregator);
      default:
        return this.runWeightedAverage(results, aggregator.weights);
    }
  }

  /**
   * Combines member scores as a weighted mean.
   *
   * @param {Array<object>} results - Member outcomes (`id`, `type`, `result`).
   * @param {Object<string, number>} [weights] - Weight per member name;
   *   members without an entry weigh 1.
   * @returns {object} Aggregated result; hits and misses are prefixed with the
   *   owning member's name. The score is 0 when the total weight is not
   *   positive.
   */
  runWeightedAverage(results, weights) {
    let totalWeight = 0;
    let weightedSum = 0;
    const allHits = [];
    const allMisses = [];
    const reasoningParts = [];
    const evaluatorResults = [];
    for (const member of results) {
      const weight = weights?.[member.id] ?? 1;
      totalWeight += weight;
      weightedSum += member.result.score * weight;
      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
      if (member.result.reasoning) {
        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
      }
      evaluatorResults.push(summarizeCompositeMember(member, weight));
    }
    // Clamp first so score and verdict are derived from the same value, as in
    // the other two aggregators.
    const finalScore = clampScore(totalWeight > 0 ? weightedSum / totalWeight : 0);
    return {
      score: finalScore,
      verdict: scoreToVerdict(finalScore),
      hits: allHits,
      misses: allMisses,
      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
      evaluatorRawRequest: {
        aggregator: "weighted_average",
        ...weights ? { weights } : {}
      },
      evaluatorResults
    };
  }

  /**
   * Delegates aggregation to an external script. The script receives
   * `{ results: { [memberName]: result } }` as JSON input and its output is
   * parsed for `score`, `hits`, `misses`, `reasoning` and `verdict`.
   *
   * @param {Array<object>} results - Member outcomes.
   * @param {string} scriptPath - Script handed to `executeScript`.
   * @param {string} [cwd] - Working directory handed to `executeScript`.
   * @param {Object<string, number>} [weights] - Only used to annotate the
   *   per-member records; `aggregate` does not pass it.
   * @returns {Promise<object>} Aggregated result; a failing script yields a
   *   score of 0 with the error message recorded as a miss.
   */
  async runCodeAggregator(results, scriptPath, cwd, weights) {
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
    const evaluatorResults = results.map(
      (member) => summarizeCompositeMember(member, weights?.[member.id] ?? 1)
    );
    try {
      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
      const parsed = parseJsonSafe(stdout);
      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
      // Honor an explicit verdict from the script; otherwise derive it.
      const hasVerdict = parsed?.verdict === "pass" || parsed?.verdict === "fail" || parsed?.verdict === "borderline";
      const verdict = hasVerdict ? parsed.verdict : scoreToVerdict(score);
      return {
        score,
        verdict,
        hits,
        misses,
        expectedAspectCount: Math.max(hits.length + misses.length, 1),
        reasoning,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath
        },
        evaluatorResults
      };
    } catch (error40) {
      const message = error40 instanceof Error ? error40.message : String(error40);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`Code aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath,
          error: message
        },
        evaluatorResults
      };
    }
  }

  /**
   * Asks the judge provider to decide the final score from the member results.
   *
   * @param {Array<object>} results - Member outcomes.
   * @param {object} context - Must carry `judgeProvider`; `evalCase.id` and
   *   `attempt` are forwarded when the provider is called through `invoke`.
   * @param {object} config2 - Aggregator config; `prompt` overrides the
   *   default template.
   * @returns {Promise<object>} Aggregated result; a failing judge call or an
   *   unparsable answer yields a score of 0 with the error recorded as a miss.
   * @throws {Error} When the context has no judge provider.
   */
  async runLlmAggregator(results, context, config2) {
    const judgeProvider = context.judgeProvider;
    if (!judgeProvider) {
      throw new Error("No judge provider available for LLM aggregation");
    }
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const resultsJson = JSON.stringify(resultsObject, null, 2);
    const evaluatorResults = results.map((member) => summarizeCompositeMember(member));
    // NOTE(review): `config2.promptPath` is never read here; confirm that
    // `prompt` is expected to hold the template text itself.
    const promptTemplate = config2.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
    // A replacer function keeps `$&`, `$$`, `$1`... inside the JSON literal; a
    // replacement *string* would expand them as special patterns.
    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, () => resultsJson);
    const systemPrompt = buildOutputSchema();
    const evaluatorRawRequest = {
      aggregator: "llm_judge",
      userPrompt,
      systemPrompt,
      target: judgeProvider.targetName
    };
    try {
      let answerText;
      let providerReasoning;
      const model = judgeProvider.asLanguageModel?.();
      if (model) {
        const { text: text2 } = await generateText({
          model,
          system: systemPrompt,
          prompt: userPrompt
        });
        answerText = text2;
      } else {
        const response = await judgeProvider.invoke({
          question: userPrompt,
          systemPrompt,
          evalCaseId: context.evalCase.id,
          attempt: context.attempt
        });
        answerText = response.text ?? "";
        providerReasoning = response.reasoning;
      }
      const data = freeformEvaluationSchema.parse(parseJsonFromText(answerText));
      const score = clampScore(data.score);
      // At most four hits and four misses are kept from the judge's answer.
      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
      return {
        score,
        verdict: scoreToVerdict(score),
        hits,
        misses,
        expectedAspectCount: Math.max(hits.length + misses.length, 1),
        reasoning: data.reasoning ?? providerReasoning,
        evaluatorRawRequest,
        evaluatorResults
      };
    } catch (error40) {
      // Report the failure instead of returning an unexplained zero score.
      const message = error40 instanceof Error ? error40.message : String(error40);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`LLM aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: { ...evaluatorRawRequest, error: message },
        evaluatorResults
      };
    }
  }
};
|
|
37731
38052
|
var Node = class {
|
|
37732
38053
|
value;
|
|
37733
38054
|
next;
|
|
@@ -38418,6 +38739,57 @@ async function runEvaluatorList(options) {
|
|
|
38418
38739
|
promptInputs,
|
|
38419
38740
|
now
|
|
38420
38741
|
});
|
|
38742
|
+
scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
|
|
38743
|
+
evaluatorResults.push({
|
|
38744
|
+
name: evaluator.name,
|
|
38745
|
+
type: "code_judge",
|
|
38746
|
+
score: score2.score,
|
|
38747
|
+
verdict: score2.verdict,
|
|
38748
|
+
hits: score2.hits,
|
|
38749
|
+
misses: score2.misses,
|
|
38750
|
+
reasoning: score2.reasoning,
|
|
38751
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
38752
|
+
});
|
|
38753
|
+
}
|
|
38754
|
+
if (evaluator.type === "composite") {
|
|
38755
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path122.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
38756
|
+
const createEvaluator = (memberConfig) => {
|
|
38757
|
+
switch (memberConfig.type) {
|
|
38758
|
+
case "llm_judge":
|
|
38759
|
+
return evaluatorRegistry.llm_judge;
|
|
38760
|
+
case "code":
|
|
38761
|
+
return new CodeEvaluator({
|
|
38762
|
+
script: memberConfig.script,
|
|
38763
|
+
cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
|
|
38764
|
+
agentTimeoutMs
|
|
38765
|
+
});
|
|
38766
|
+
case "composite":
|
|
38767
|
+
return new CompositeEvaluator({
|
|
38768
|
+
config: memberConfig,
|
|
38769
|
+
cwd: evalFileDir,
|
|
38770
|
+
evaluatorFactory: { create: createEvaluator }
|
|
38771
|
+
});
|
|
38772
|
+
default: {
|
|
38773
|
+
const unknownConfig = memberConfig;
|
|
38774
|
+
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
38775
|
+
}
|
|
38776
|
+
}
|
|
38777
|
+
};
|
|
38778
|
+
const compositeEvaluator = new CompositeEvaluator({
|
|
38779
|
+
config: evaluator,
|
|
38780
|
+
cwd: evalFileDir,
|
|
38781
|
+
evaluatorFactory: { create: createEvaluator }
|
|
38782
|
+
});
|
|
38783
|
+
const score2 = await compositeEvaluator.evaluate({
|
|
38784
|
+
evalCase,
|
|
38785
|
+
candidate,
|
|
38786
|
+
target,
|
|
38787
|
+
provider,
|
|
38788
|
+
attempt,
|
|
38789
|
+
promptInputs,
|
|
38790
|
+
now,
|
|
38791
|
+
judgeProvider
|
|
38792
|
+
});
|
|
38421
38793
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
38422
38794
|
evaluatorResults.push({
|
|
38423
38795
|
name: evaluator.name,
|
|
@@ -38427,7 +38799,8 @@ async function runEvaluatorList(options) {
|
|
|
38427
38799
|
hits: score2.hits,
|
|
38428
38800
|
misses: score2.misses,
|
|
38429
38801
|
reasoning: score2.reasoning,
|
|
38430
|
-
evaluator_provider_request: score2.evaluatorRawRequest
|
|
38802
|
+
evaluator_provider_request: score2.evaluatorRawRequest,
|
|
38803
|
+
evaluator_results: mapChildResults(score2.evaluatorResults)
|
|
38431
38804
|
});
|
|
38432
38805
|
}
|
|
38433
38806
|
} catch (error40) {
|
|
@@ -38440,14 +38813,15 @@ async function runEvaluatorList(options) {
|
|
|
38440
38813
|
expectedAspectCount: 1,
|
|
38441
38814
|
reasoning: message
|
|
38442
38815
|
};
|
|
38816
|
+
const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
|
|
38443
38817
|
scored.push({
|
|
38444
38818
|
score: fallbackScore,
|
|
38445
38819
|
name: evaluator.name ?? "unknown",
|
|
38446
|
-
type:
|
|
38820
|
+
type: resultType ?? "llm_judge"
|
|
38447
38821
|
});
|
|
38448
38822
|
evaluatorResults.push({
|
|
38449
38823
|
name: evaluator.name ?? "unknown",
|
|
38450
|
-
type:
|
|
38824
|
+
type: resultType ?? "llm_judge",
|
|
38451
38825
|
score: 0,
|
|
38452
38826
|
verdict: "fail",
|
|
38453
38827
|
hits: [],
|
|
@@ -38665,6 +39039,23 @@ function isTimeoutLike(error40) {
|
|
|
38665
39039
|
const value = String(error40).toLowerCase();
|
|
38666
39040
|
return value.includes("timeout");
|
|
38667
39041
|
}
|
|
39042
|
+
/**
 * Converts a composite evaluator's nested member records into the
 * snake_case shape used in the reported results, recursing through each
 * record's own `evaluatorResults`.
 *
 * @param {Array<object>|undefined} children - Member records, if any.
 * @returns {Array<object>|undefined} The converted records, or `undefined`
 *   when `children` is missing or empty.
 */
function mapChildResults(children) {
  const nothingToMap = !children || children.length === 0;
  if (nothingToMap) {
    return undefined;
  }
  const toReportEntry = ({
    name,
    type,
    score,
    weight,
    verdict,
    hits,
    misses,
    reasoning,
    evaluatorRawRequest,
    evaluatorResults
  }) => ({
    name,
    type,
    score,
    weight,
    verdict,
    hits,
    misses,
    reasoning,
    evaluator_provider_request: evaluatorRawRequest,
    evaluator_results: mapChildResults(evaluatorResults)
  });
  return children.map((child) => toReportEntry(child));
}
|
|
38668
39059
|
var rubricItemSchema = external_exports.object({
|
|
38669
39060
|
id: external_exports.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
38670
39061
|
description: external_exports.string().describe("What this rubric checks for"),
|
|
@@ -40918,8 +41309,8 @@ var evalCommand = command({
|
|
|
40918
41309
|
workers: option({
|
|
40919
41310
|
type: number4,
|
|
40920
41311
|
long: "workers",
|
|
40921
|
-
description: "Number of parallel workers (default:
|
|
40922
|
-
defaultValue: () =>
|
|
41312
|
+
description: "Number of parallel workers (default: 3, max: 50). Can also be set per-target in targets.yaml",
|
|
41313
|
+
defaultValue: () => 3
|
|
40923
41314
|
}),
|
|
40924
41315
|
out: option({
|
|
40925
41316
|
type: optional2(string4),
|
|
@@ -41667,4 +42058,4 @@ export {
|
|
|
41667
42058
|
app,
|
|
41668
42059
|
runCli
|
|
41669
42060
|
};
|
|
41670
|
-
//# sourceMappingURL=chunk-
|
|
42061
|
+
//# sourceMappingURL=chunk-QRY42RAP.js.map
|