@agentv/core 0.26.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-NDEN3H2B.js → chunk-V3JCB3HI.js} +1 -1
- package/dist/chunk-V3JCB3HI.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +0 -44
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -45
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +51 -222
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +9 -45
- package/dist/index.d.ts +9 -45
- package/dist/index.js +52 -221
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-NDEN3H2B.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
readTextFile,
|
|
10
10
|
resolveFileReference,
|
|
11
11
|
resolveTargetDefinition
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-V3JCB3HI.js";
|
|
13
13
|
|
|
14
14
|
// src/evaluation/types.ts
|
|
15
15
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -47,18 +47,23 @@ function isTestMessage(value) {
|
|
|
47
47
|
if (typeof candidate.content === "string") {
|
|
48
48
|
return true;
|
|
49
49
|
}
|
|
50
|
-
if (
|
|
51
|
-
return
|
|
50
|
+
if (Array.isArray(candidate.content) && candidate.content.every(isJsonObject)) {
|
|
51
|
+
return true;
|
|
52
|
+
}
|
|
53
|
+
if (Array.isArray(candidate.tool_calls) && candidate.tool_calls.length > 0) {
|
|
54
|
+
return true;
|
|
52
55
|
}
|
|
53
|
-
|
|
56
|
+
if (isJsonObject(candidate.content)) {
|
|
57
|
+
return true;
|
|
58
|
+
}
|
|
59
|
+
return false;
|
|
54
60
|
}
|
|
55
61
|
var EVALUATOR_KIND_VALUES = [
|
|
56
62
|
"code_judge",
|
|
57
63
|
"llm_judge",
|
|
58
64
|
"rubric",
|
|
59
65
|
"composite",
|
|
60
|
-
"tool_trajectory"
|
|
61
|
-
"expected_messages"
|
|
66
|
+
"tool_trajectory"
|
|
62
67
|
];
|
|
63
68
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
64
69
|
function isEvaluatorKind(value) {
|
|
@@ -79,13 +84,6 @@ function isTraceEvent(value) {
|
|
|
79
84
|
const candidate = value;
|
|
80
85
|
return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
|
|
81
86
|
}
|
|
82
|
-
function isExpectedToolCall(value) {
|
|
83
|
-
if (typeof value !== "object" || value === null) {
|
|
84
|
-
return false;
|
|
85
|
-
}
|
|
86
|
-
const candidate = value;
|
|
87
|
-
return typeof candidate.tool === "string";
|
|
88
|
-
}
|
|
89
87
|
function computeTraceSummary(trace) {
|
|
90
88
|
const toolCallCounts = {};
|
|
91
89
|
let errorCount = 0;
|
|
@@ -582,15 +580,6 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
582
580
|
});
|
|
583
581
|
continue;
|
|
584
582
|
}
|
|
585
|
-
if (typeValue === "expected_messages") {
|
|
586
|
-
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
587
|
-
evaluators.push({
|
|
588
|
-
name,
|
|
589
|
-
type: "expected_messages",
|
|
590
|
-
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
591
|
-
});
|
|
592
|
-
continue;
|
|
593
|
-
}
|
|
594
583
|
if (typeValue === "tool_trajectory") {
|
|
595
584
|
const mode = asString2(rawEvaluator.mode);
|
|
596
585
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
@@ -845,63 +834,6 @@ async function processMessages(options) {
|
|
|
845
834
|
}
|
|
846
835
|
return segments;
|
|
847
836
|
}
|
|
848
|
-
async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
849
|
-
if (typeof content === "string") {
|
|
850
|
-
return content;
|
|
851
|
-
}
|
|
852
|
-
if (!content) {
|
|
853
|
-
return "";
|
|
854
|
-
}
|
|
855
|
-
const parts = [];
|
|
856
|
-
for (const entry of content) {
|
|
857
|
-
if (typeof entry === "string") {
|
|
858
|
-
parts.push({ content: entry, isFile: false });
|
|
859
|
-
continue;
|
|
860
|
-
}
|
|
861
|
-
if (!isJsonObject(entry)) {
|
|
862
|
-
continue;
|
|
863
|
-
}
|
|
864
|
-
const segmentType = asString3(entry.type);
|
|
865
|
-
if (segmentType === "file") {
|
|
866
|
-
const rawValue = asString3(entry.value);
|
|
867
|
-
if (!rawValue) {
|
|
868
|
-
continue;
|
|
869
|
-
}
|
|
870
|
-
const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
|
|
871
|
-
rawValue,
|
|
872
|
-
searchRoots
|
|
873
|
-
);
|
|
874
|
-
if (!resolvedPath) {
|
|
875
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
876
|
-
logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
877
|
-
continue;
|
|
878
|
-
}
|
|
879
|
-
try {
|
|
880
|
-
const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
881
|
-
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
882
|
-
if (verbose) {
|
|
883
|
-
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
884
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
885
|
-
}
|
|
886
|
-
} catch (error) {
|
|
887
|
-
logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
888
|
-
}
|
|
889
|
-
continue;
|
|
890
|
-
}
|
|
891
|
-
const textValue = asString3(entry.text);
|
|
892
|
-
if (typeof textValue === "string") {
|
|
893
|
-
parts.push({ content: textValue, isFile: false });
|
|
894
|
-
continue;
|
|
895
|
-
}
|
|
896
|
-
const valueValue = asString3(entry.value);
|
|
897
|
-
if (typeof valueValue === "string") {
|
|
898
|
-
parts.push({ content: valueValue, isFile: false });
|
|
899
|
-
continue;
|
|
900
|
-
}
|
|
901
|
-
parts.push({ content: JSON.stringify(entry), isFile: false });
|
|
902
|
-
}
|
|
903
|
-
return formatFileContents(parts);
|
|
904
|
-
}
|
|
905
837
|
function asString3(value) {
|
|
906
838
|
return typeof value === "string" ? value : void 0;
|
|
907
839
|
}
|
|
@@ -934,14 +866,15 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
934
866
|
}
|
|
935
867
|
}
|
|
936
868
|
async function processExpectedMessages(options) {
|
|
937
|
-
const { messages, searchRoots,
|
|
869
|
+
const { messages, searchRoots, verbose } = options;
|
|
938
870
|
const segments = [];
|
|
939
871
|
for (const message of messages) {
|
|
872
|
+
const extendedMessage = message;
|
|
940
873
|
const segment = {
|
|
941
874
|
role: message.role
|
|
942
875
|
};
|
|
943
|
-
if (
|
|
944
|
-
segment.
|
|
876
|
+
if (extendedMessage.name) {
|
|
877
|
+
segment.name = extendedMessage.name;
|
|
945
878
|
}
|
|
946
879
|
const content = message.content;
|
|
947
880
|
if (typeof content === "string") {
|
|
@@ -989,6 +922,13 @@ async function processExpectedMessages(options) {
|
|
|
989
922
|
processedContent.push(cloneJsonObject(rawSegment));
|
|
990
923
|
}
|
|
991
924
|
segment.content = processedContent;
|
|
925
|
+
} else if (isJsonObject(content)) {
|
|
926
|
+
segment.content = cloneJsonObject(content);
|
|
927
|
+
}
|
|
928
|
+
if (extendedMessage.tool_calls && Array.isArray(extendedMessage.tool_calls)) {
|
|
929
|
+
segment.tool_calls = extendedMessage.tool_calls.map(
|
|
930
|
+
(tc) => isJsonObject(tc) ? cloneJsonObject(tc) : tc
|
|
931
|
+
);
|
|
992
932
|
}
|
|
993
933
|
segments.push(segment);
|
|
994
934
|
}
|
|
@@ -1283,9 +1223,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1283
1223
|
logError(`No valid expected message found for eval case: ${id}`);
|
|
1284
1224
|
continue;
|
|
1285
1225
|
}
|
|
1286
|
-
if (expectedMessages.length > 1) {
|
|
1287
|
-
logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
1288
|
-
}
|
|
1289
1226
|
const guidelinePaths = [];
|
|
1290
1227
|
const inputTextParts = [];
|
|
1291
1228
|
const inputSegments = await processMessages({
|
|
@@ -1305,8 +1242,19 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1305
1242
|
verbose
|
|
1306
1243
|
}) : [];
|
|
1307
1244
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
1308
|
-
|
|
1309
|
-
|
|
1245
|
+
let referenceAnswer = "";
|
|
1246
|
+
if (outputSegments.length > 1) {
|
|
1247
|
+
referenceAnswer = JSON.stringify(outputSegments, null, 2);
|
|
1248
|
+
} else if (outputSegments.length === 1) {
|
|
1249
|
+
const singleMessage = outputSegments[0];
|
|
1250
|
+
if (typeof singleMessage.content === "string") {
|
|
1251
|
+
referenceAnswer = singleMessage.content;
|
|
1252
|
+
} else if (singleMessage.content) {
|
|
1253
|
+
referenceAnswer = JSON.stringify(singleMessage, null, 2);
|
|
1254
|
+
} else if (singleMessage.tool_calls) {
|
|
1255
|
+
referenceAnswer = JSON.stringify(singleMessage, null, 2);
|
|
1256
|
+
}
|
|
1257
|
+
}
|
|
1310
1258
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
1311
1259
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
1312
1260
|
let evaluators;
|
|
@@ -1361,7 +1309,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1361
1309
|
question,
|
|
1362
1310
|
input_messages: inputMessages,
|
|
1363
1311
|
input_segments: inputSegments,
|
|
1364
|
-
|
|
1312
|
+
expected_messages: outputSegments,
|
|
1365
1313
|
reference_answer: referenceAnswer,
|
|
1366
1314
|
guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
|
|
1367
1315
|
guideline_patterns: guidelinePatterns,
|
|
@@ -3270,7 +3218,7 @@ import { generateText as generateText2 } from "ai";
|
|
|
3270
3218
|
import { z } from "zod";
|
|
3271
3219
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
3272
3220
|
|
|
3273
|
-
Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
3221
|
+
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
3274
3222
|
|
|
3275
3223
|
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
3276
3224
|
|
|
@@ -3328,7 +3276,7 @@ var LlmJudgeEvaluator = class {
|
|
|
3328
3276
|
const variables = {
|
|
3329
3277
|
[TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
3330
3278
|
[TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
|
|
3331
|
-
context.evalCase.
|
|
3279
|
+
context.evalCase.expected_messages,
|
|
3332
3280
|
null,
|
|
3333
3281
|
2
|
|
3334
3282
|
),
|
|
@@ -3547,7 +3495,9 @@ var CodeEvaluator = class {
|
|
|
3547
3495
|
input_files: context.evalCase.file_paths.filter(
|
|
3548
3496
|
(path13) => !context.evalCase.guideline_paths.includes(path13)
|
|
3549
3497
|
),
|
|
3550
|
-
input_messages: context.evalCase.input_messages
|
|
3498
|
+
input_messages: context.evalCase.input_messages,
|
|
3499
|
+
candidate_trace_file: context.candidateTraceRef ?? null,
|
|
3500
|
+
candidate_trace_summary: context.candidateTraceSummary ?? null
|
|
3551
3501
|
},
|
|
3552
3502
|
null,
|
|
3553
3503
|
2
|
|
@@ -3813,105 +3763,6 @@ var ToolTrajectoryEvaluator = class {
|
|
|
3813
3763
|
};
|
|
3814
3764
|
}
|
|
3815
3765
|
};
|
|
3816
|
-
var ExpectedMessagesEvaluator = class {
|
|
3817
|
-
kind = "expected_messages";
|
|
3818
|
-
evaluate(context) {
|
|
3819
|
-
const { candidateTrace, evalCase } = context;
|
|
3820
|
-
const expectedSegments = evalCase.expected_segments;
|
|
3821
|
-
const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
|
|
3822
|
-
if (expectedToolCalls.length === 0) {
|
|
3823
|
-
return {
|
|
3824
|
-
score: 1,
|
|
3825
|
-
verdict: "pass",
|
|
3826
|
-
hits: ["No tool_calls specified in expected_messages"],
|
|
3827
|
-
misses: [],
|
|
3828
|
-
expectedAspectCount: 1
|
|
3829
|
-
};
|
|
3830
|
-
}
|
|
3831
|
-
if (!candidateTrace || candidateTrace.length === 0) {
|
|
3832
|
-
return {
|
|
3833
|
-
score: 0,
|
|
3834
|
-
verdict: "fail",
|
|
3835
|
-
hits: [],
|
|
3836
|
-
misses: ["No trace available to validate tool_calls"],
|
|
3837
|
-
expectedAspectCount: expectedToolCalls.length
|
|
3838
|
-
};
|
|
3839
|
-
}
|
|
3840
|
-
const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
|
|
3841
|
-
return this.validateToolCalls(expectedToolCalls, actualToolCalls);
|
|
3842
|
-
}
|
|
3843
|
-
extractExpectedToolCalls(segments) {
|
|
3844
|
-
if (!segments) {
|
|
3845
|
-
return [];
|
|
3846
|
-
}
|
|
3847
|
-
const toolCalls = [];
|
|
3848
|
-
for (const segment of segments) {
|
|
3849
|
-
const role = segment.role;
|
|
3850
|
-
const segmentToolCalls = segment.tool_calls;
|
|
3851
|
-
if (role === "assistant" && Array.isArray(segmentToolCalls)) {
|
|
3852
|
-
for (const tc of segmentToolCalls) {
|
|
3853
|
-
if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
|
|
3854
|
-
const toolCall = tc;
|
|
3855
|
-
toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
|
|
3856
|
-
}
|
|
3857
|
-
}
|
|
3858
|
-
}
|
|
3859
|
-
}
|
|
3860
|
-
return toolCalls;
|
|
3861
|
-
}
|
|
3862
|
-
validateToolCalls(expected, actual) {
|
|
3863
|
-
const hits = [];
|
|
3864
|
-
const misses = [];
|
|
3865
|
-
for (let i = 0; i < expected.length; i++) {
|
|
3866
|
-
const expectedCall = expected[i];
|
|
3867
|
-
const actualCall = actual[i];
|
|
3868
|
-
if (!actualCall) {
|
|
3869
|
-
misses.push(
|
|
3870
|
-
`tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
|
|
3871
|
-
);
|
|
3872
|
-
continue;
|
|
3873
|
-
}
|
|
3874
|
-
if (actualCall.name !== expectedCall.tool) {
|
|
3875
|
-
misses.push(
|
|
3876
|
-
`tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
|
|
3877
|
-
);
|
|
3878
|
-
continue;
|
|
3879
|
-
}
|
|
3880
|
-
if (expectedCall.input !== void 0) {
|
|
3881
|
-
if (!this.deepEquals(expectedCall.input, actualCall.input)) {
|
|
3882
|
-
misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
|
|
3883
|
-
continue;
|
|
3884
|
-
}
|
|
3885
|
-
}
|
|
3886
|
-
hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
|
|
3887
|
-
}
|
|
3888
|
-
const totalChecks = expected.length || 1;
|
|
3889
|
-
const score = hits.length / totalChecks;
|
|
3890
|
-
return {
|
|
3891
|
-
score,
|
|
3892
|
-
verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
|
|
3893
|
-
hits,
|
|
3894
|
-
misses,
|
|
3895
|
-
expectedAspectCount: totalChecks
|
|
3896
|
-
};
|
|
3897
|
-
}
|
|
3898
|
-
deepEquals(a, b) {
|
|
3899
|
-
if (a === b) return true;
|
|
3900
|
-
if (typeof a !== typeof b) return false;
|
|
3901
|
-
if (typeof a !== "object" || a === null || b === null) return false;
|
|
3902
|
-
if (Array.isArray(a) && Array.isArray(b)) {
|
|
3903
|
-
if (a.length !== b.length) return false;
|
|
3904
|
-
return a.every((val, i) => this.deepEquals(val, b[i]));
|
|
3905
|
-
}
|
|
3906
|
-
if (Array.isArray(a) || Array.isArray(b)) return false;
|
|
3907
|
-
const aObj = a;
|
|
3908
|
-
const bObj = b;
|
|
3909
|
-
const aKeys = Object.keys(aObj);
|
|
3910
|
-
const bKeys = Object.keys(bObj);
|
|
3911
|
-
if (aKeys.length !== bKeys.length) return false;
|
|
3912
|
-
return aKeys.every((key) => this.deepEquals(aObj[key], bObj[key]));
|
|
3913
|
-
}
|
|
3914
|
-
};
|
|
3915
3766
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
3916
3767
|
{{EVALUATOR_RESULTS_JSON}}
|
|
3917
3768
|
|
|
@@ -4673,6 +4524,7 @@ async function runEvalCase(options) {
|
|
|
4673
4524
|
judgeProvider,
|
|
4674
4525
|
agentTimeoutMs,
|
|
4675
4526
|
candidateTrace,
|
|
4527
|
+
candidateTraceRef: providerResponse.traceRef,
|
|
4676
4528
|
candidateTraceSummary
|
|
4677
4529
|
});
|
|
4678
4530
|
} catch (error) {
|
|
@@ -4692,6 +4544,7 @@ async function evaluateCandidate(options) {
|
|
|
4692
4544
|
judgeProvider,
|
|
4693
4545
|
agentTimeoutMs,
|
|
4694
4546
|
candidateTrace,
|
|
4547
|
+
candidateTraceRef,
|
|
4695
4548
|
candidateTraceSummary
|
|
4696
4549
|
} = options;
|
|
4697
4550
|
const gradeTimestamp = nowFn();
|
|
@@ -4707,6 +4560,7 @@ async function evaluateCandidate(options) {
|
|
|
4707
4560
|
judgeProvider,
|
|
4708
4561
|
agentTimeoutMs,
|
|
4709
4562
|
candidateTrace,
|
|
4563
|
+
candidateTraceRef,
|
|
4710
4564
|
candidateTraceSummary
|
|
4711
4565
|
});
|
|
4712
4566
|
const completedAt = nowFn();
|
|
@@ -4761,6 +4615,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
4761
4615
|
judgeProvider,
|
|
4762
4616
|
agentTimeoutMs,
|
|
4763
4617
|
candidateTrace,
|
|
4618
|
+
candidateTraceRef,
|
|
4764
4619
|
candidateTraceSummary
|
|
4765
4620
|
} = options;
|
|
4766
4621
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
@@ -4777,6 +4632,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
4777
4632
|
judgeProvider,
|
|
4778
4633
|
agentTimeoutMs,
|
|
4779
4634
|
candidateTrace,
|
|
4635
|
+
candidateTraceRef,
|
|
4780
4636
|
candidateTraceSummary
|
|
4781
4637
|
});
|
|
4782
4638
|
}
|
|
@@ -4795,6 +4651,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
4795
4651
|
now,
|
|
4796
4652
|
judgeProvider,
|
|
4797
4653
|
candidateTrace,
|
|
4654
|
+
candidateTraceRef,
|
|
4798
4655
|
candidateTraceSummary
|
|
4799
4656
|
});
|
|
4800
4657
|
return { score };
|
|
@@ -4813,6 +4670,7 @@ async function runEvaluatorList(options) {
|
|
|
4813
4670
|
judgeProvider,
|
|
4814
4671
|
agentTimeoutMs,
|
|
4815
4672
|
candidateTrace,
|
|
4673
|
+
candidateTraceRef,
|
|
4816
4674
|
candidateTraceSummary
|
|
4817
4675
|
} = options;
|
|
4818
4676
|
const scored = [];
|
|
@@ -4859,7 +4717,9 @@ async function runEvaluatorList(options) {
|
|
|
4859
4717
|
provider,
|
|
4860
4718
|
attempt,
|
|
4861
4719
|
promptInputs,
|
|
4862
|
-
now
|
|
4720
|
+
now,
|
|
4721
|
+
candidateTraceRef,
|
|
4722
|
+
candidateTraceSummary
|
|
4863
4723
|
});
|
|
4864
4724
|
const weight = evaluator.weight ?? 1;
|
|
4865
4725
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -4897,8 +4757,6 @@ async function runEvaluatorList(options) {
|
|
|
4897
4757
|
return new ToolTrajectoryEvaluator({
|
|
4898
4758
|
config: memberConfig
|
|
4899
4759
|
});
|
|
4900
|
-
case "expected_messages":
|
|
4901
|
-
return new ExpectedMessagesEvaluator();
|
|
4902
4760
|
default: {
|
|
4903
4761
|
const unknownConfig = memberConfig;
|
|
4904
4762
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -4948,32 +4806,7 @@ async function runEvaluatorList(options) {
|
|
|
4948
4806
|
promptInputs,
|
|
4949
4807
|
now,
|
|
4950
4808
|
candidateTrace,
|
|
4951
|
-
|
|
4952
|
-
});
|
|
4953
|
-
const weight = evaluator.weight ?? 1;
|
|
4954
|
-
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
4955
|
-
evaluatorResults.push({
|
|
4956
|
-
name: evaluator.name,
|
|
4957
|
-
type: evaluator.type,
|
|
4958
|
-
score: score2.score,
|
|
4959
|
-
weight,
|
|
4960
|
-
verdict: score2.verdict,
|
|
4961
|
-
hits: score2.hits,
|
|
4962
|
-
misses: score2.misses,
|
|
4963
|
-
reasoning: score2.reasoning
|
|
4964
|
-
});
|
|
4965
|
-
}
|
|
4966
|
-
if (evaluator.type === "expected_messages") {
|
|
4967
|
-
const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
|
|
4968
|
-
const score2 = expectedMessagesEvaluator.evaluate({
|
|
4969
|
-
evalCase,
|
|
4970
|
-
candidate,
|
|
4971
|
-
target,
|
|
4972
|
-
provider,
|
|
4973
|
-
attempt,
|
|
4974
|
-
promptInputs,
|
|
4975
|
-
now,
|
|
4976
|
-
candidateTrace,
|
|
4809
|
+
candidateTraceRef,
|
|
4977
4810
|
candidateTraceSummary
|
|
4978
4811
|
});
|
|
4979
4812
|
const weight = evaluator.weight ?? 1;
|
|
@@ -5345,7 +5178,6 @@ function createAgentKernel() {
|
|
|
5345
5178
|
export {
|
|
5346
5179
|
CodeEvaluator,
|
|
5347
5180
|
CompositeEvaluator,
|
|
5348
|
-
ExpectedMessagesEvaluator,
|
|
5349
5181
|
LlmJudgeEvaluator,
|
|
5350
5182
|
TEST_MESSAGE_ROLES,
|
|
5351
5183
|
ToolTrajectoryEvaluator,
|
|
@@ -5363,7 +5195,6 @@ export {
|
|
|
5363
5195
|
generateRubrics,
|
|
5364
5196
|
getHitCount,
|
|
5365
5197
|
isEvaluatorKind,
|
|
5366
|
-
isExpectedToolCall,
|
|
5367
5198
|
isGuidelineFile,
|
|
5368
5199
|
isJsonObject,
|
|
5369
5200
|
isJsonValue,
|