@agentv/core 0.26.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-NDEN3H2B.js → chunk-V3JCB3HI.js} +1 -1
- package/dist/chunk-V3JCB3HI.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +0 -44
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -45
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +51 -222
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +9 -45
- package/dist/index.d.ts +9 -45
- package/dist/index.js +52 -221
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-NDEN3H2B.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -32,7 +32,6 @@ var index_exports = {};
|
|
|
32
32
|
__export(index_exports, {
|
|
33
33
|
CodeEvaluator: () => CodeEvaluator,
|
|
34
34
|
CompositeEvaluator: () => CompositeEvaluator,
|
|
35
|
-
ExpectedMessagesEvaluator: () => ExpectedMessagesEvaluator,
|
|
36
35
|
LlmJudgeEvaluator: () => LlmJudgeEvaluator,
|
|
37
36
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
38
37
|
ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
|
|
@@ -50,7 +49,6 @@ __export(index_exports, {
|
|
|
50
49
|
generateRubrics: () => generateRubrics,
|
|
51
50
|
getHitCount: () => getHitCount,
|
|
52
51
|
isEvaluatorKind: () => isEvaluatorKind,
|
|
53
|
-
isExpectedToolCall: () => isExpectedToolCall,
|
|
54
52
|
isGuidelineFile: () => isGuidelineFile,
|
|
55
53
|
isJsonObject: () => isJsonObject,
|
|
56
54
|
isJsonValue: () => isJsonValue,
|
|
@@ -110,18 +108,23 @@ function isTestMessage(value) {
|
|
|
110
108
|
if (typeof candidate.content === "string") {
|
|
111
109
|
return true;
|
|
112
110
|
}
|
|
113
|
-
if (
|
|
114
|
-
return
|
|
111
|
+
if (Array.isArray(candidate.content) && candidate.content.every(isJsonObject)) {
|
|
112
|
+
return true;
|
|
113
|
+
}
|
|
114
|
+
if (Array.isArray(candidate.tool_calls) && candidate.tool_calls.length > 0) {
|
|
115
|
+
return true;
|
|
115
116
|
}
|
|
116
|
-
|
|
117
|
+
if (isJsonObject(candidate.content)) {
|
|
118
|
+
return true;
|
|
119
|
+
}
|
|
120
|
+
return false;
|
|
117
121
|
}
|
|
118
122
|
var EVALUATOR_KIND_VALUES = [
|
|
119
123
|
"code_judge",
|
|
120
124
|
"llm_judge",
|
|
121
125
|
"rubric",
|
|
122
126
|
"composite",
|
|
123
|
-
"tool_trajectory"
|
|
124
|
-
"expected_messages"
|
|
127
|
+
"tool_trajectory"
|
|
125
128
|
];
|
|
126
129
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
127
130
|
function isEvaluatorKind(value) {
|
|
@@ -142,13 +145,6 @@ function isTraceEvent(value) {
|
|
|
142
145
|
const candidate = value;
|
|
143
146
|
return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
|
|
144
147
|
}
|
|
145
|
-
function isExpectedToolCall(value) {
|
|
146
|
-
if (typeof value !== "object" || value === null) {
|
|
147
|
-
return false;
|
|
148
|
-
}
|
|
149
|
-
const candidate = value;
|
|
150
|
-
return typeof candidate.tool === "string";
|
|
151
|
-
}
|
|
152
148
|
function computeTraceSummary(trace) {
|
|
153
149
|
const toolCallCounts = {};
|
|
154
150
|
let errorCount = 0;
|
|
@@ -645,15 +641,6 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
645
641
|
});
|
|
646
642
|
continue;
|
|
647
643
|
}
|
|
648
|
-
if (typeValue === "expected_messages") {
|
|
649
|
-
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
650
|
-
evaluators.push({
|
|
651
|
-
name,
|
|
652
|
-
type: "expected_messages",
|
|
653
|
-
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
654
|
-
});
|
|
655
|
-
continue;
|
|
656
|
-
}
|
|
657
644
|
if (typeValue === "tool_trajectory") {
|
|
658
645
|
const mode = asString2(rawEvaluator.mode);
|
|
659
646
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
@@ -908,63 +895,6 @@ async function processMessages(options) {
|
|
|
908
895
|
}
|
|
909
896
|
return segments;
|
|
910
897
|
}
|
|
911
|
-
async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
912
|
-
if (typeof content === "string") {
|
|
913
|
-
return content;
|
|
914
|
-
}
|
|
915
|
-
if (!content) {
|
|
916
|
-
return "";
|
|
917
|
-
}
|
|
918
|
-
const parts = [];
|
|
919
|
-
for (const entry of content) {
|
|
920
|
-
if (typeof entry === "string") {
|
|
921
|
-
parts.push({ content: entry, isFile: false });
|
|
922
|
-
continue;
|
|
923
|
-
}
|
|
924
|
-
if (!isJsonObject(entry)) {
|
|
925
|
-
continue;
|
|
926
|
-
}
|
|
927
|
-
const segmentType = asString3(entry.type);
|
|
928
|
-
if (segmentType === "file") {
|
|
929
|
-
const rawValue = asString3(entry.value);
|
|
930
|
-
if (!rawValue) {
|
|
931
|
-
continue;
|
|
932
|
-
}
|
|
933
|
-
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
934
|
-
rawValue,
|
|
935
|
-
searchRoots
|
|
936
|
-
);
|
|
937
|
-
if (!resolvedPath) {
|
|
938
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
939
|
-
logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
940
|
-
continue;
|
|
941
|
-
}
|
|
942
|
-
try {
|
|
943
|
-
const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
944
|
-
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
945
|
-
if (verbose) {
|
|
946
|
-
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
947
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
948
|
-
}
|
|
949
|
-
} catch (error) {
|
|
950
|
-
logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
951
|
-
}
|
|
952
|
-
continue;
|
|
953
|
-
}
|
|
954
|
-
const textValue = asString3(entry.text);
|
|
955
|
-
if (typeof textValue === "string") {
|
|
956
|
-
parts.push({ content: textValue, isFile: false });
|
|
957
|
-
continue;
|
|
958
|
-
}
|
|
959
|
-
const valueValue = asString3(entry.value);
|
|
960
|
-
if (typeof valueValue === "string") {
|
|
961
|
-
parts.push({ content: valueValue, isFile: false });
|
|
962
|
-
continue;
|
|
963
|
-
}
|
|
964
|
-
parts.push({ content: JSON.stringify(entry), isFile: false });
|
|
965
|
-
}
|
|
966
|
-
return formatFileContents(parts);
|
|
967
|
-
}
|
|
968
898
|
function asString3(value) {
|
|
969
899
|
return typeof value === "string" ? value : void 0;
|
|
970
900
|
}
|
|
@@ -997,14 +927,15 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
997
927
|
}
|
|
998
928
|
}
|
|
999
929
|
async function processExpectedMessages(options) {
|
|
1000
|
-
const { messages, searchRoots,
|
|
930
|
+
const { messages, searchRoots, verbose } = options;
|
|
1001
931
|
const segments = [];
|
|
1002
932
|
for (const message of messages) {
|
|
933
|
+
const extendedMessage = message;
|
|
1003
934
|
const segment = {
|
|
1004
935
|
role: message.role
|
|
1005
936
|
};
|
|
1006
|
-
if (
|
|
1007
|
-
segment.
|
|
937
|
+
if (extendedMessage.name) {
|
|
938
|
+
segment.name = extendedMessage.name;
|
|
1008
939
|
}
|
|
1009
940
|
const content = message.content;
|
|
1010
941
|
if (typeof content === "string") {
|
|
@@ -1052,6 +983,13 @@ async function processExpectedMessages(options) {
|
|
|
1052
983
|
processedContent.push(cloneJsonObject(rawSegment));
|
|
1053
984
|
}
|
|
1054
985
|
segment.content = processedContent;
|
|
986
|
+
} else if (isJsonObject(content)) {
|
|
987
|
+
segment.content = cloneJsonObject(content);
|
|
988
|
+
}
|
|
989
|
+
if (extendedMessage.tool_calls && Array.isArray(extendedMessage.tool_calls)) {
|
|
990
|
+
segment.tool_calls = extendedMessage.tool_calls.map(
|
|
991
|
+
(tc) => isJsonObject(tc) ? cloneJsonObject(tc) : tc
|
|
992
|
+
);
|
|
1055
993
|
}
|
|
1056
994
|
segments.push(segment);
|
|
1057
995
|
}
|
|
@@ -1346,9 +1284,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1346
1284
|
logError(`No valid expected message found for eval case: ${id}`);
|
|
1347
1285
|
continue;
|
|
1348
1286
|
}
|
|
1349
|
-
if (expectedMessages.length > 1) {
|
|
1350
|
-
logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
1351
|
-
}
|
|
1352
1287
|
const guidelinePaths = [];
|
|
1353
1288
|
const inputTextParts = [];
|
|
1354
1289
|
const inputSegments = await processMessages({
|
|
@@ -1368,8 +1303,19 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1368
1303
|
verbose
|
|
1369
1304
|
}) : [];
|
|
1370
1305
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
1371
|
-
|
|
1372
|
-
|
|
1306
|
+
let referenceAnswer = "";
|
|
1307
|
+
if (outputSegments.length > 1) {
|
|
1308
|
+
referenceAnswer = JSON.stringify(outputSegments, null, 2);
|
|
1309
|
+
} else if (outputSegments.length === 1) {
|
|
1310
|
+
const singleMessage = outputSegments[0];
|
|
1311
|
+
if (typeof singleMessage.content === "string") {
|
|
1312
|
+
referenceAnswer = singleMessage.content;
|
|
1313
|
+
} else if (singleMessage.content) {
|
|
1314
|
+
referenceAnswer = JSON.stringify(singleMessage, null, 2);
|
|
1315
|
+
} else if (singleMessage.tool_calls) {
|
|
1316
|
+
referenceAnswer = JSON.stringify(singleMessage, null, 2);
|
|
1317
|
+
}
|
|
1318
|
+
}
|
|
1373
1319
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
1374
1320
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
1375
1321
|
let evaluators;
|
|
@@ -1424,7 +1370,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1424
1370
|
question,
|
|
1425
1371
|
input_messages: inputMessages,
|
|
1426
1372
|
input_segments: inputSegments,
|
|
1427
|
-
|
|
1373
|
+
expected_messages: outputSegments,
|
|
1428
1374
|
reference_answer: referenceAnswer,
|
|
1429
1375
|
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
|
|
1430
1376
|
guideline_patterns: guidelinePatterns,
|
|
@@ -3979,7 +3925,7 @@ var import_ai2 = require("ai");
|
|
|
3979
3925
|
var import_zod2 = require("zod");
|
|
3980
3926
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
3981
3927
|
|
|
3982
|
-
Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
3928
|
+
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
3983
3929
|
|
|
3984
3930
|
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
3985
3931
|
|
|
@@ -4037,7 +3983,7 @@ var LlmJudgeEvaluator = class {
|
|
|
4037
3983
|
const variables = {
|
|
4038
3984
|
[TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
4039
3985
|
[TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
|
|
4040
|
-
context.evalCase.
|
|
3986
|
+
context.evalCase.expected_messages,
|
|
4041
3987
|
null,
|
|
4042
3988
|
2
|
|
4043
3989
|
),
|
|
@@ -4256,7 +4202,9 @@ var CodeEvaluator = class {
|
|
|
4256
4202
|
input_files: context.evalCase.file_paths.filter(
|
|
4257
4203
|
(path15) => !context.evalCase.guideline_paths.includes(path15)
|
|
4258
4204
|
),
|
|
4259
|
-
input_messages: context.evalCase.input_messages
|
|
4205
|
+
input_messages: context.evalCase.input_messages,
|
|
4206
|
+
candidate_trace_file: context.candidateTraceRef ?? null,
|
|
4207
|
+
candidate_trace_summary: context.candidateTraceSummary ?? null
|
|
4260
4208
|
},
|
|
4261
4209
|
null,
|
|
4262
4210
|
2
|
|
@@ -4522,105 +4470,6 @@ var ToolTrajectoryEvaluator = class {
|
|
|
4522
4470
|
};
|
|
4523
4471
|
}
|
|
4524
4472
|
};
|
|
4525
|
-
var ExpectedMessagesEvaluator = class {
|
|
4526
|
-
kind = "expected_messages";
|
|
4527
|
-
evaluate(context) {
|
|
4528
|
-
const { candidateTrace, evalCase } = context;
|
|
4529
|
-
const expectedSegments = evalCase.expected_segments;
|
|
4530
|
-
const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
|
|
4531
|
-
if (expectedToolCalls.length === 0) {
|
|
4532
|
-
return {
|
|
4533
|
-
score: 1,
|
|
4534
|
-
verdict: "pass",
|
|
4535
|
-
hits: ["No tool_calls specified in expected_messages"],
|
|
4536
|
-
misses: [],
|
|
4537
|
-
expectedAspectCount: 1
|
|
4538
|
-
};
|
|
4539
|
-
}
|
|
4540
|
-
if (!candidateTrace || candidateTrace.length === 0) {
|
|
4541
|
-
return {
|
|
4542
|
-
score: 0,
|
|
4543
|
-
verdict: "fail",
|
|
4544
|
-
hits: [],
|
|
4545
|
-
misses: ["No trace available to validate tool_calls"],
|
|
4546
|
-
expectedAspectCount: expectedToolCalls.length
|
|
4547
|
-
};
|
|
4548
|
-
}
|
|
4549
|
-
const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
|
|
4550
|
-
return this.validateToolCalls(expectedToolCalls, actualToolCalls);
|
|
4551
|
-
}
|
|
4552
|
-
extractExpectedToolCalls(segments) {
|
|
4553
|
-
if (!segments) {
|
|
4554
|
-
return [];
|
|
4555
|
-
}
|
|
4556
|
-
const toolCalls = [];
|
|
4557
|
-
for (const segment of segments) {
|
|
4558
|
-
const role = segment.role;
|
|
4559
|
-
const segmentToolCalls = segment.tool_calls;
|
|
4560
|
-
if (role === "assistant" && Array.isArray(segmentToolCalls)) {
|
|
4561
|
-
for (const tc of segmentToolCalls) {
|
|
4562
|
-
if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
|
|
4563
|
-
const toolCall = tc;
|
|
4564
|
-
toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
|
|
4565
|
-
}
|
|
4566
|
-
}
|
|
4567
|
-
}
|
|
4568
|
-
}
|
|
4569
|
-
return toolCalls;
|
|
4570
|
-
}
|
|
4571
|
-
validateToolCalls(expected, actual) {
|
|
4572
|
-
const hits = [];
|
|
4573
|
-
const misses = [];
|
|
4574
|
-
for (let i = 0; i < expected.length; i++) {
|
|
4575
|
-
const expectedCall = expected[i];
|
|
4576
|
-
const actualCall = actual[i];
|
|
4577
|
-
if (!actualCall) {
|
|
4578
|
-
misses.push(
|
|
4579
|
-
`tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
|
|
4580
|
-
);
|
|
4581
|
-
continue;
|
|
4582
|
-
}
|
|
4583
|
-
if (actualCall.name !== expectedCall.tool) {
|
|
4584
|
-
misses.push(
|
|
4585
|
-
`tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
|
|
4586
|
-
);
|
|
4587
|
-
continue;
|
|
4588
|
-
}
|
|
4589
|
-
if (expectedCall.input !== void 0) {
|
|
4590
|
-
if (!this.deepEquals(expectedCall.input, actualCall.input)) {
|
|
4591
|
-
misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
|
|
4592
|
-
continue;
|
|
4593
|
-
}
|
|
4594
|
-
}
|
|
4595
|
-
hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
|
|
4596
|
-
}
|
|
4597
|
-
const totalChecks = expected.length || 1;
|
|
4598
|
-
const score = hits.length / totalChecks;
|
|
4599
|
-
return {
|
|
4600
|
-
score,
|
|
4601
|
-
verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
|
|
4602
|
-
hits,
|
|
4603
|
-
misses,
|
|
4604
|
-
expectedAspectCount: totalChecks
|
|
4605
|
-
};
|
|
4606
|
-
}
|
|
4607
|
-
deepEquals(a, b) {
|
|
4608
|
-
if (a === b) return true;
|
|
4609
|
-
if (typeof a !== typeof b) return false;
|
|
4610
|
-
if (typeof a !== "object" || a === null || b === null) return false;
|
|
4611
|
-
if (Array.isArray(a) && Array.isArray(b)) {
|
|
4612
|
-
if (a.length !== b.length) return false;
|
|
4613
|
-
return a.every((val, i) => this.deepEquals(val, b[i]));
|
|
4614
|
-
}
|
|
4615
|
-
if (Array.isArray(a) || Array.isArray(b)) return false;
|
|
4616
|
-
const aObj = a;
|
|
4617
|
-
const bObj = b;
|
|
4618
|
-
const aKeys = Object.keys(aObj);
|
|
4619
|
-
const bKeys = Object.keys(bObj);
|
|
4620
|
-
if (aKeys.length !== bKeys.length) return false;
|
|
4621
|
-
return aKeys.every((key) => this.deepEquals(aObj[key], bObj[key]));
|
|
4622
|
-
}
|
|
4623
|
-
};
|
|
4624
4473
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
4625
4474
|
{{EVALUATOR_RESULTS_JSON}}
|
|
4626
4475
|
|
|
@@ -5392,6 +5241,7 @@ async function runEvalCase(options) {
|
|
|
5392
5241
|
judgeProvider,
|
|
5393
5242
|
agentTimeoutMs,
|
|
5394
5243
|
candidateTrace,
|
|
5244
|
+
candidateTraceRef: providerResponse.traceRef,
|
|
5395
5245
|
candidateTraceSummary
|
|
5396
5246
|
});
|
|
5397
5247
|
} catch (error) {
|
|
@@ -5411,6 +5261,7 @@ async function evaluateCandidate(options) {
|
|
|
5411
5261
|
judgeProvider,
|
|
5412
5262
|
agentTimeoutMs,
|
|
5413
5263
|
candidateTrace,
|
|
5264
|
+
candidateTraceRef,
|
|
5414
5265
|
candidateTraceSummary
|
|
5415
5266
|
} = options;
|
|
5416
5267
|
const gradeTimestamp = nowFn();
|
|
@@ -5426,6 +5277,7 @@ async function evaluateCandidate(options) {
|
|
|
5426
5277
|
judgeProvider,
|
|
5427
5278
|
agentTimeoutMs,
|
|
5428
5279
|
candidateTrace,
|
|
5280
|
+
candidateTraceRef,
|
|
5429
5281
|
candidateTraceSummary
|
|
5430
5282
|
});
|
|
5431
5283
|
const completedAt = nowFn();
|
|
@@ -5480,6 +5332,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
5480
5332
|
judgeProvider,
|
|
5481
5333
|
agentTimeoutMs,
|
|
5482
5334
|
candidateTrace,
|
|
5335
|
+
candidateTraceRef,
|
|
5483
5336
|
candidateTraceSummary
|
|
5484
5337
|
} = options;
|
|
5485
5338
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
@@ -5496,6 +5349,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
5496
5349
|
judgeProvider,
|
|
5497
5350
|
agentTimeoutMs,
|
|
5498
5351
|
candidateTrace,
|
|
5352
|
+
candidateTraceRef,
|
|
5499
5353
|
candidateTraceSummary
|
|
5500
5354
|
});
|
|
5501
5355
|
}
|
|
@@ -5514,6 +5368,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
5514
5368
|
now,
|
|
5515
5369
|
judgeProvider,
|
|
5516
5370
|
candidateTrace,
|
|
5371
|
+
candidateTraceRef,
|
|
5517
5372
|
candidateTraceSummary
|
|
5518
5373
|
});
|
|
5519
5374
|
return { score };
|
|
@@ -5532,6 +5387,7 @@ async function runEvaluatorList(options) {
|
|
|
5532
5387
|
judgeProvider,
|
|
5533
5388
|
agentTimeoutMs,
|
|
5534
5389
|
candidateTrace,
|
|
5390
|
+
candidateTraceRef,
|
|
5535
5391
|
candidateTraceSummary
|
|
5536
5392
|
} = options;
|
|
5537
5393
|
const scored = [];
|
|
@@ -5578,7 +5434,9 @@ async function runEvaluatorList(options) {
|
|
|
5578
5434
|
provider,
|
|
5579
5435
|
attempt,
|
|
5580
5436
|
promptInputs,
|
|
5581
|
-
now
|
|
5437
|
+
now,
|
|
5438
|
+
candidateTraceRef,
|
|
5439
|
+
candidateTraceSummary
|
|
5582
5440
|
});
|
|
5583
5441
|
const weight = evaluator.weight ?? 1;
|
|
5584
5442
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -5616,8 +5474,6 @@ async function runEvaluatorList(options) {
|
|
|
5616
5474
|
return new ToolTrajectoryEvaluator({
|
|
5617
5475
|
config: memberConfig
|
|
5618
5476
|
});
|
|
5619
|
-
case "expected_messages":
|
|
5620
|
-
return new ExpectedMessagesEvaluator();
|
|
5621
5477
|
default: {
|
|
5622
5478
|
const unknownConfig = memberConfig;
|
|
5623
5479
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -5667,32 +5523,7 @@ async function runEvaluatorList(options) {
|
|
|
5667
5523
|
promptInputs,
|
|
5668
5524
|
now,
|
|
5669
5525
|
candidateTrace,
|
|
5670
|
-
|
|
5671
|
-
});
|
|
5672
|
-
const weight = evaluator.weight ?? 1;
|
|
5673
|
-
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
5674
|
-
evaluatorResults.push({
|
|
5675
|
-
name: evaluator.name,
|
|
5676
|
-
type: evaluator.type,
|
|
5677
|
-
score: score2.score,
|
|
5678
|
-
weight,
|
|
5679
|
-
verdict: score2.verdict,
|
|
5680
|
-
hits: score2.hits,
|
|
5681
|
-
misses: score2.misses,
|
|
5682
|
-
reasoning: score2.reasoning
|
|
5683
|
-
});
|
|
5684
|
-
}
|
|
5685
|
-
if (evaluator.type === "expected_messages") {
|
|
5686
|
-
const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
|
|
5687
|
-
const score2 = expectedMessagesEvaluator.evaluate({
|
|
5688
|
-
evalCase,
|
|
5689
|
-
candidate,
|
|
5690
|
-
target,
|
|
5691
|
-
provider,
|
|
5692
|
-
attempt,
|
|
5693
|
-
promptInputs,
|
|
5694
|
-
now,
|
|
5695
|
-
candidateTrace,
|
|
5526
|
+
candidateTraceRef,
|
|
5696
5527
|
candidateTraceSummary
|
|
5697
5528
|
});
|
|
5698
5529
|
const weight = evaluator.weight ?? 1;
|
|
@@ -6065,7 +5896,6 @@ function createAgentKernel() {
|
|
|
6065
5896
|
0 && (module.exports = {
|
|
6066
5897
|
CodeEvaluator,
|
|
6067
5898
|
CompositeEvaluator,
|
|
6068
|
-
ExpectedMessagesEvaluator,
|
|
6069
5899
|
LlmJudgeEvaluator,
|
|
6070
5900
|
TEST_MESSAGE_ROLES,
|
|
6071
5901
|
ToolTrajectoryEvaluator,
|
|
@@ -6083,7 +5913,6 @@ function createAgentKernel() {
|
|
|
6083
5913
|
generateRubrics,
|
|
6084
5914
|
getHitCount,
|
|
6085
5915
|
isEvaluatorKind,
|
|
6086
|
-
isExpectedToolCall,
|
|
6087
5916
|
isGuidelineFile,
|
|
6088
5917
|
isJsonObject,
|
|
6089
5918
|
isJsonValue,
|