agentv 0.26.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-6ZM7WVSC.js → chunk-IVIT4U6S.js} +54 -258
- package/dist/chunk-IVIT4U6S.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +20 -19
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +217 -217
- package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +67 -2
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +10 -68
- package/package.json +1 -1
- package/dist/chunk-6ZM7WVSC.js.map +0 -1
- package/dist/templates/agentv/.env.template +0 -23
|
@@ -164,7 +164,7 @@ import { access as access6, mkdir as mkdir7 } from "node:fs/promises";
|
|
|
164
164
|
import path19 from "node:path";
|
|
165
165
|
import { pathToFileURL } from "node:url";
|
|
166
166
|
|
|
167
|
-
// ../../packages/core/dist/chunk-
|
|
167
|
+
// ../../packages/core/dist/chunk-V3JCB3HI.js
|
|
168
168
|
import { constants } from "node:fs";
|
|
169
169
|
import { access, readFile } from "node:fs/promises";
|
|
170
170
|
import path from "node:path";
|
|
@@ -4211,7 +4211,7 @@ var coerce = {
|
|
|
4211
4211
|
};
|
|
4212
4212
|
var NEVER = INVALID;
|
|
4213
4213
|
|
|
4214
|
-
// ../../packages/core/dist/chunk-
|
|
4214
|
+
// ../../packages/core/dist/chunk-V3JCB3HI.js
|
|
4215
4215
|
async function fileExists(filePath) {
|
|
4216
4216
|
try {
|
|
4217
4217
|
await access(filePath, constants.F_OK);
|
|
@@ -34567,18 +34567,23 @@ function isTestMessage(value) {
|
|
|
34567
34567
|
if (typeof candidate.content === "string") {
|
|
34568
34568
|
return true;
|
|
34569
34569
|
}
|
|
34570
|
-
if (
|
|
34571
|
-
return
|
|
34570
|
+
if (Array.isArray(candidate.content) && candidate.content.every(isJsonObject)) {
|
|
34571
|
+
return true;
|
|
34572
34572
|
}
|
|
34573
|
-
|
|
34573
|
+
if (Array.isArray(candidate.tool_calls) && candidate.tool_calls.length > 0) {
|
|
34574
|
+
return true;
|
|
34575
|
+
}
|
|
34576
|
+
if (isJsonObject(candidate.content)) {
|
|
34577
|
+
return true;
|
|
34578
|
+
}
|
|
34579
|
+
return false;
|
|
34574
34580
|
}
|
|
34575
34581
|
var EVALUATOR_KIND_VALUES = [
|
|
34576
34582
|
"code_judge",
|
|
34577
34583
|
"llm_judge",
|
|
34578
34584
|
"rubric",
|
|
34579
34585
|
"composite",
|
|
34580
|
-
"tool_trajectory"
|
|
34581
|
-
"expected_messages"
|
|
34586
|
+
"tool_trajectory"
|
|
34582
34587
|
];
|
|
34583
34588
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
34584
34589
|
function isEvaluatorKind(value) {
|
|
@@ -35058,15 +35063,6 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35058
35063
|
});
|
|
35059
35064
|
continue;
|
|
35060
35065
|
}
|
|
35061
|
-
if (typeValue === "expected_messages") {
|
|
35062
|
-
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35063
|
-
evaluators.push({
|
|
35064
|
-
name: name16,
|
|
35065
|
-
type: "expected_messages",
|
|
35066
|
-
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
35067
|
-
});
|
|
35068
|
-
continue;
|
|
35069
|
-
}
|
|
35070
35066
|
if (typeValue === "tool_trajectory") {
|
|
35071
35067
|
const mode = asString2(rawEvaluator.mode);
|
|
35072
35068
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
@@ -35317,63 +35313,6 @@ async function processMessages(options) {
|
|
|
35317
35313
|
}
|
|
35318
35314
|
return segments;
|
|
35319
35315
|
}
|
|
35320
|
-
async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
35321
|
-
if (typeof content === "string") {
|
|
35322
|
-
return content;
|
|
35323
|
-
}
|
|
35324
|
-
if (!content) {
|
|
35325
|
-
return "";
|
|
35326
|
-
}
|
|
35327
|
-
const parts = [];
|
|
35328
|
-
for (const entry of content) {
|
|
35329
|
-
if (typeof entry === "string") {
|
|
35330
|
-
parts.push({ content: entry, isFile: false });
|
|
35331
|
-
continue;
|
|
35332
|
-
}
|
|
35333
|
-
if (!isJsonObject(entry)) {
|
|
35334
|
-
continue;
|
|
35335
|
-
}
|
|
35336
|
-
const segmentType = asString3(entry.type);
|
|
35337
|
-
if (segmentType === "file") {
|
|
35338
|
-
const rawValue = asString3(entry.value);
|
|
35339
|
-
if (!rawValue) {
|
|
35340
|
-
continue;
|
|
35341
|
-
}
|
|
35342
|
-
const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
|
|
35343
|
-
rawValue,
|
|
35344
|
-
searchRoots
|
|
35345
|
-
);
|
|
35346
|
-
if (!resolvedPath) {
|
|
35347
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
35348
|
-
logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
35349
|
-
continue;
|
|
35350
|
-
}
|
|
35351
|
-
try {
|
|
35352
|
-
const fileContent = (await readFile32(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
35353
|
-
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
35354
|
-
if (verbose) {
|
|
35355
|
-
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
35356
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
35357
|
-
}
|
|
35358
|
-
} catch (error40) {
|
|
35359
|
-
logWarning3(`Could not read file ${resolvedPath}: ${error40.message}`);
|
|
35360
|
-
}
|
|
35361
|
-
continue;
|
|
35362
|
-
}
|
|
35363
|
-
const textValue = asString3(entry.text);
|
|
35364
|
-
if (typeof textValue === "string") {
|
|
35365
|
-
parts.push({ content: textValue, isFile: false });
|
|
35366
|
-
continue;
|
|
35367
|
-
}
|
|
35368
|
-
const valueValue = asString3(entry.value);
|
|
35369
|
-
if (typeof valueValue === "string") {
|
|
35370
|
-
parts.push({ content: valueValue, isFile: false });
|
|
35371
|
-
continue;
|
|
35372
|
-
}
|
|
35373
|
-
parts.push({ content: JSON.stringify(entry), isFile: false });
|
|
35374
|
-
}
|
|
35375
|
-
return formatFileContents(parts);
|
|
35376
|
-
}
|
|
35377
35316
|
function asString3(value) {
|
|
35378
35317
|
return typeof value === "string" ? value : void 0;
|
|
35379
35318
|
}
|
|
@@ -35406,14 +35345,15 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
35406
35345
|
}
|
|
35407
35346
|
}
|
|
35408
35347
|
async function processExpectedMessages(options) {
|
|
35409
|
-
const { messages, searchRoots,
|
|
35348
|
+
const { messages, searchRoots, verbose } = options;
|
|
35410
35349
|
const segments = [];
|
|
35411
35350
|
for (const message of messages) {
|
|
35351
|
+
const extendedMessage = message;
|
|
35412
35352
|
const segment = {
|
|
35413
35353
|
role: message.role
|
|
35414
35354
|
};
|
|
35415
|
-
if (
|
|
35416
|
-
segment.
|
|
35355
|
+
if (extendedMessage.name) {
|
|
35356
|
+
segment.name = extendedMessage.name;
|
|
35417
35357
|
}
|
|
35418
35358
|
const content = message.content;
|
|
35419
35359
|
if (typeof content === "string") {
|
|
@@ -35461,6 +35401,13 @@ async function processExpectedMessages(options) {
|
|
|
35461
35401
|
processedContent.push(cloneJsonObject(rawSegment));
|
|
35462
35402
|
}
|
|
35463
35403
|
segment.content = processedContent;
|
|
35404
|
+
} else if (isJsonObject(content)) {
|
|
35405
|
+
segment.content = cloneJsonObject(content);
|
|
35406
|
+
}
|
|
35407
|
+
if (extendedMessage.tool_calls && Array.isArray(extendedMessage.tool_calls)) {
|
|
35408
|
+
segment.tool_calls = extendedMessage.tool_calls.map(
|
|
35409
|
+
(tc) => isJsonObject(tc) ? cloneJsonObject(tc) : tc
|
|
35410
|
+
);
|
|
35464
35411
|
}
|
|
35465
35412
|
segments.push(segment);
|
|
35466
35413
|
}
|
|
@@ -35749,9 +35696,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
35749
35696
|
logError(`No valid expected message found for eval case: ${id}`);
|
|
35750
35697
|
continue;
|
|
35751
35698
|
}
|
|
35752
|
-
if (expectedMessages.length > 1) {
|
|
35753
|
-
logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
35754
|
-
}
|
|
35755
35699
|
const guidelinePaths = [];
|
|
35756
35700
|
const inputTextParts = [];
|
|
35757
35701
|
const inputSegments = await processMessages({
|
|
@@ -35771,8 +35715,19 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
35771
35715
|
verbose
|
|
35772
35716
|
}) : [];
|
|
35773
35717
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
35774
|
-
|
|
35775
|
-
|
|
35718
|
+
let referenceAnswer = "";
|
|
35719
|
+
if (outputSegments.length > 1) {
|
|
35720
|
+
referenceAnswer = JSON.stringify(outputSegments, null, 2);
|
|
35721
|
+
} else if (outputSegments.length === 1) {
|
|
35722
|
+
const singleMessage = outputSegments[0];
|
|
35723
|
+
if (typeof singleMessage.content === "string") {
|
|
35724
|
+
referenceAnswer = singleMessage.content;
|
|
35725
|
+
} else if (singleMessage.content) {
|
|
35726
|
+
referenceAnswer = JSON.stringify(singleMessage, null, 2);
|
|
35727
|
+
} else if (singleMessage.tool_calls) {
|
|
35728
|
+
referenceAnswer = JSON.stringify(singleMessage, null, 2);
|
|
35729
|
+
}
|
|
35730
|
+
}
|
|
35776
35731
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
35777
35732
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
35778
35733
|
let evaluators;
|
|
@@ -35827,7 +35782,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
35827
35782
|
question,
|
|
35828
35783
|
input_messages: inputMessages,
|
|
35829
35784
|
input_segments: inputSegments,
|
|
35830
|
-
|
|
35785
|
+
expected_messages: outputSegments,
|
|
35831
35786
|
reference_answer: referenceAnswer,
|
|
35832
35787
|
guideline_paths: guidelinePaths.map((guidelinePath) => path62.resolve(guidelinePath)),
|
|
35833
35788
|
guideline_patterns: guidelinePatterns,
|
|
@@ -37669,7 +37624,7 @@ function createProvider(target) {
|
|
|
37669
37624
|
}
|
|
37670
37625
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
37671
37626
|
|
|
37672
|
-
Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
37627
|
+
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
37673
37628
|
|
|
37674
37629
|
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
37675
37630
|
|
|
@@ -37727,7 +37682,7 @@ var LlmJudgeEvaluator = class {
|
|
|
37727
37682
|
const variables = {
|
|
37728
37683
|
[TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
37729
37684
|
[TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
|
|
37730
|
-
context.evalCase.
|
|
37685
|
+
context.evalCase.expected_messages,
|
|
37731
37686
|
null,
|
|
37732
37687
|
2
|
|
37733
37688
|
),
|
|
@@ -37946,7 +37901,9 @@ var CodeEvaluator = class {
|
|
|
37946
37901
|
input_files: context.evalCase.file_paths.filter(
|
|
37947
37902
|
(path132) => !context.evalCase.guideline_paths.includes(path132)
|
|
37948
37903
|
),
|
|
37949
|
-
input_messages: context.evalCase.input_messages
|
|
37904
|
+
input_messages: context.evalCase.input_messages,
|
|
37905
|
+
candidate_trace_file: context.candidateTraceRef ?? null,
|
|
37906
|
+
candidate_trace_summary: context.candidateTraceSummary ?? null
|
|
37950
37907
|
},
|
|
37951
37908
|
null,
|
|
37952
37909
|
2
|
|
@@ -38212,105 +38169,6 @@ var ToolTrajectoryEvaluator = class {
|
|
|
38212
38169
|
};
|
|
38213
38170
|
}
|
|
38214
38171
|
};
|
|
38215
|
-
var ExpectedMessagesEvaluator = class {
|
|
38216
|
-
kind = "expected_messages";
|
|
38217
|
-
evaluate(context) {
|
|
38218
|
-
const { candidateTrace, evalCase } = context;
|
|
38219
|
-
const expectedSegments = evalCase.expected_segments;
|
|
38220
|
-
const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
|
|
38221
|
-
if (expectedToolCalls.length === 0) {
|
|
38222
|
-
return {
|
|
38223
|
-
score: 1,
|
|
38224
|
-
verdict: "pass",
|
|
38225
|
-
hits: ["No tool_calls specified in expected_messages"],
|
|
38226
|
-
misses: [],
|
|
38227
|
-
expectedAspectCount: 1
|
|
38228
|
-
};
|
|
38229
|
-
}
|
|
38230
|
-
if (!candidateTrace || candidateTrace.length === 0) {
|
|
38231
|
-
return {
|
|
38232
|
-
score: 0,
|
|
38233
|
-
verdict: "fail",
|
|
38234
|
-
hits: [],
|
|
38235
|
-
misses: ["No trace available to validate tool_calls"],
|
|
38236
|
-
expectedAspectCount: expectedToolCalls.length
|
|
38237
|
-
};
|
|
38238
|
-
}
|
|
38239
|
-
const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
|
|
38240
|
-
return this.validateToolCalls(expectedToolCalls, actualToolCalls);
|
|
38241
|
-
}
|
|
38242
|
-
extractExpectedToolCalls(segments) {
|
|
38243
|
-
if (!segments) {
|
|
38244
|
-
return [];
|
|
38245
|
-
}
|
|
38246
|
-
const toolCalls = [];
|
|
38247
|
-
for (const segment of segments) {
|
|
38248
|
-
const role = segment.role;
|
|
38249
|
-
const segmentToolCalls = segment.tool_calls;
|
|
38250
|
-
if (role === "assistant" && Array.isArray(segmentToolCalls)) {
|
|
38251
|
-
for (const tc of segmentToolCalls) {
|
|
38252
|
-
if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
|
|
38253
|
-
const toolCall = tc;
|
|
38254
|
-
toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
|
|
38255
|
-
}
|
|
38256
|
-
}
|
|
38257
|
-
}
|
|
38258
|
-
}
|
|
38259
|
-
return toolCalls;
|
|
38260
|
-
}
|
|
38261
|
-
validateToolCalls(expected, actual) {
|
|
38262
|
-
const hits = [];
|
|
38263
|
-
const misses = [];
|
|
38264
|
-
for (let i = 0; i < expected.length; i++) {
|
|
38265
|
-
const expectedCall = expected[i];
|
|
38266
|
-
const actualCall = actual[i];
|
|
38267
|
-
if (!actualCall) {
|
|
38268
|
-
misses.push(
|
|
38269
|
-
`tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
|
|
38270
|
-
);
|
|
38271
|
-
continue;
|
|
38272
|
-
}
|
|
38273
|
-
if (actualCall.name !== expectedCall.tool) {
|
|
38274
|
-
misses.push(
|
|
38275
|
-
`tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
|
|
38276
|
-
);
|
|
38277
|
-
continue;
|
|
38278
|
-
}
|
|
38279
|
-
if (expectedCall.input !== void 0) {
|
|
38280
|
-
if (!this.deepEquals(expectedCall.input, actualCall.input)) {
|
|
38281
|
-
misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
|
|
38282
|
-
continue;
|
|
38283
|
-
}
|
|
38284
|
-
}
|
|
38285
|
-
hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
|
|
38286
|
-
}
|
|
38287
|
-
const totalChecks = expected.length || 1;
|
|
38288
|
-
const score = hits.length / totalChecks;
|
|
38289
|
-
return {
|
|
38290
|
-
score,
|
|
38291
|
-
verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
|
|
38292
|
-
hits,
|
|
38293
|
-
misses,
|
|
38294
|
-
expectedAspectCount: totalChecks
|
|
38295
|
-
};
|
|
38296
|
-
}
|
|
38297
|
-
deepEquals(a, b) {
|
|
38298
|
-
if (a === b) return true;
|
|
38299
|
-
if (typeof a !== typeof b) return false;
|
|
38300
|
-
if (typeof a !== "object" || a === null || b === null) return false;
|
|
38301
|
-
if (Array.isArray(a) && Array.isArray(b)) {
|
|
38302
|
-
if (a.length !== b.length) return false;
|
|
38303
|
-
return a.every((val, i) => this.deepEquals(val, b[i]));
|
|
38304
|
-
}
|
|
38305
|
-
if (Array.isArray(a) || Array.isArray(b)) return false;
|
|
38306
|
-
const aObj = a;
|
|
38307
|
-
const bObj = b;
|
|
38308
|
-
const aKeys = Object.keys(aObj);
|
|
38309
|
-
const bKeys = Object.keys(bObj);
|
|
38310
|
-
if (aKeys.length !== bKeys.length) return false;
|
|
38311
|
-
return aKeys.every((key2) => this.deepEquals(aObj[key2], bObj[key2]));
|
|
38312
|
-
}
|
|
38313
|
-
};
|
|
38314
38172
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
38315
38173
|
{{EVALUATOR_RESULTS_JSON}}
|
|
38316
38174
|
|
|
@@ -39061,6 +38919,7 @@ async function runEvalCase(options) {
|
|
|
39061
38919
|
judgeProvider,
|
|
39062
38920
|
agentTimeoutMs,
|
|
39063
38921
|
candidateTrace,
|
|
38922
|
+
candidateTraceRef: providerResponse.traceRef,
|
|
39064
38923
|
candidateTraceSummary
|
|
39065
38924
|
});
|
|
39066
38925
|
} catch (error40) {
|
|
@@ -39080,6 +38939,7 @@ async function evaluateCandidate(options) {
|
|
|
39080
38939
|
judgeProvider,
|
|
39081
38940
|
agentTimeoutMs,
|
|
39082
38941
|
candidateTrace,
|
|
38942
|
+
candidateTraceRef,
|
|
39083
38943
|
candidateTraceSummary
|
|
39084
38944
|
} = options;
|
|
39085
38945
|
const gradeTimestamp = nowFn();
|
|
@@ -39095,6 +38955,7 @@ async function evaluateCandidate(options) {
|
|
|
39095
38955
|
judgeProvider,
|
|
39096
38956
|
agentTimeoutMs,
|
|
39097
38957
|
candidateTrace,
|
|
38958
|
+
candidateTraceRef,
|
|
39098
38959
|
candidateTraceSummary
|
|
39099
38960
|
});
|
|
39100
38961
|
const completedAt = nowFn();
|
|
@@ -39149,6 +39010,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
39149
39010
|
judgeProvider,
|
|
39150
39011
|
agentTimeoutMs,
|
|
39151
39012
|
candidateTrace,
|
|
39013
|
+
candidateTraceRef,
|
|
39152
39014
|
candidateTraceSummary
|
|
39153
39015
|
} = options;
|
|
39154
39016
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
@@ -39165,6 +39027,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
39165
39027
|
judgeProvider,
|
|
39166
39028
|
agentTimeoutMs,
|
|
39167
39029
|
candidateTrace,
|
|
39030
|
+
candidateTraceRef,
|
|
39168
39031
|
candidateTraceSummary
|
|
39169
39032
|
});
|
|
39170
39033
|
}
|
|
@@ -39183,6 +39046,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
39183
39046
|
now,
|
|
39184
39047
|
judgeProvider,
|
|
39185
39048
|
candidateTrace,
|
|
39049
|
+
candidateTraceRef,
|
|
39186
39050
|
candidateTraceSummary
|
|
39187
39051
|
});
|
|
39188
39052
|
return { score };
|
|
@@ -39201,6 +39065,7 @@ async function runEvaluatorList(options) {
|
|
|
39201
39065
|
judgeProvider,
|
|
39202
39066
|
agentTimeoutMs,
|
|
39203
39067
|
candidateTrace,
|
|
39068
|
+
candidateTraceRef,
|
|
39204
39069
|
candidateTraceSummary
|
|
39205
39070
|
} = options;
|
|
39206
39071
|
const scored = [];
|
|
@@ -39247,7 +39112,9 @@ async function runEvaluatorList(options) {
|
|
|
39247
39112
|
provider,
|
|
39248
39113
|
attempt,
|
|
39249
39114
|
promptInputs,
|
|
39250
|
-
now
|
|
39115
|
+
now,
|
|
39116
|
+
candidateTraceRef,
|
|
39117
|
+
candidateTraceSummary
|
|
39251
39118
|
});
|
|
39252
39119
|
const weight = evaluator.weight ?? 1;
|
|
39253
39120
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -39285,8 +39152,6 @@ async function runEvaluatorList(options) {
|
|
|
39285
39152
|
return new ToolTrajectoryEvaluator({
|
|
39286
39153
|
config: memberConfig
|
|
39287
39154
|
});
|
|
39288
|
-
case "expected_messages":
|
|
39289
|
-
return new ExpectedMessagesEvaluator();
|
|
39290
39155
|
default: {
|
|
39291
39156
|
const unknownConfig = memberConfig;
|
|
39292
39157
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -39336,32 +39201,7 @@ async function runEvaluatorList(options) {
|
|
|
39336
39201
|
promptInputs,
|
|
39337
39202
|
now,
|
|
39338
39203
|
candidateTrace,
|
|
39339
|
-
|
|
39340
|
-
});
|
|
39341
|
-
const weight = evaluator.weight ?? 1;
|
|
39342
|
-
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
39343
|
-
evaluatorResults.push({
|
|
39344
|
-
name: evaluator.name,
|
|
39345
|
-
type: evaluator.type,
|
|
39346
|
-
score: score2.score,
|
|
39347
|
-
weight,
|
|
39348
|
-
verdict: score2.verdict,
|
|
39349
|
-
hits: score2.hits,
|
|
39350
|
-
misses: score2.misses,
|
|
39351
|
-
reasoning: score2.reasoning
|
|
39352
|
-
});
|
|
39353
|
-
}
|
|
39354
|
-
if (evaluator.type === "expected_messages") {
|
|
39355
|
-
const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
|
|
39356
|
-
const score2 = expectedMessagesEvaluator.evaluate({
|
|
39357
|
-
evalCase,
|
|
39358
|
-
candidate,
|
|
39359
|
-
target,
|
|
39360
|
-
provider,
|
|
39361
|
-
attempt,
|
|
39362
|
-
promptInputs,
|
|
39363
|
-
now,
|
|
39364
|
-
candidateTrace,
|
|
39204
|
+
candidateTraceRef,
|
|
39365
39205
|
candidateTraceSummary
|
|
39366
39206
|
});
|
|
39367
39207
|
const weight = evaluator.weight ?? 1;
|
|
@@ -40649,26 +40489,6 @@ function validateMessages(messages, location, filePath, errors) {
|
|
|
40649
40489
|
message: `Invalid role '${role}'. Must be one of: ${validRoles.join(", ")}`
|
|
40650
40490
|
});
|
|
40651
40491
|
}
|
|
40652
|
-
const toolCalls = message.tool_calls;
|
|
40653
|
-
if (toolCalls !== void 0) {
|
|
40654
|
-
if (role !== "assistant") {
|
|
40655
|
-
errors.push({
|
|
40656
|
-
severity: "error",
|
|
40657
|
-
filePath,
|
|
40658
|
-
location: `${msgLocation}.tool_calls`,
|
|
40659
|
-
message: "tool_calls can only be specified on assistant messages"
|
|
40660
|
-
});
|
|
40661
|
-
} else if (!Array.isArray(toolCalls)) {
|
|
40662
|
-
errors.push({
|
|
40663
|
-
severity: "error",
|
|
40664
|
-
filePath,
|
|
40665
|
-
location: `${msgLocation}.tool_calls`,
|
|
40666
|
-
message: "tool_calls must be an array"
|
|
40667
|
-
});
|
|
40668
|
-
} else {
|
|
40669
|
-
validateToolCalls(toolCalls, `${msgLocation}.tool_calls`, filePath, errors);
|
|
40670
|
-
}
|
|
40671
|
-
}
|
|
40672
40492
|
const content = message.content;
|
|
40673
40493
|
if (typeof content === "string") {
|
|
40674
40494
|
validateContentForRoleMarkers(content, `${msgLocation}.content`, filePath, errors);
|
|
@@ -40733,30 +40553,6 @@ function validateContentForRoleMarkers(content, location, filePath, errors) {
|
|
|
40733
40553
|
}
|
|
40734
40554
|
}
|
|
40735
40555
|
}
|
|
40736
|
-
function validateToolCalls(toolCalls, location, filePath, errors) {
|
|
40737
|
-
for (let i = 0; i < toolCalls.length; i++) {
|
|
40738
|
-
const toolCall = toolCalls[i];
|
|
40739
|
-
const callLocation = `${location}[${i}]`;
|
|
40740
|
-
if (!isObject2(toolCall)) {
|
|
40741
|
-
errors.push({
|
|
40742
|
-
severity: "error",
|
|
40743
|
-
filePath,
|
|
40744
|
-
location: callLocation,
|
|
40745
|
-
message: "Tool call must be an object"
|
|
40746
|
-
});
|
|
40747
|
-
continue;
|
|
40748
|
-
}
|
|
40749
|
-
const tool2 = toolCall.tool;
|
|
40750
|
-
if (typeof tool2 !== "string" || tool2.trim().length === 0) {
|
|
40751
|
-
errors.push({
|
|
40752
|
-
severity: "error",
|
|
40753
|
-
filePath,
|
|
40754
|
-
location: `${callLocation}.tool`,
|
|
40755
|
-
message: "Missing or invalid 'tool' field (must be a non-empty string)"
|
|
40756
|
-
});
|
|
40757
|
-
}
|
|
40758
|
-
}
|
|
40759
|
-
}
|
|
40760
40556
|
function isObject22(value) {
|
|
40761
40557
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
40762
40558
|
}
|
|
@@ -42708,4 +42504,4 @@ export {
|
|
|
42708
42504
|
app,
|
|
42709
42505
|
runCli
|
|
42710
42506
|
};
|
|
42711
|
-
//# sourceMappingURL=chunk-6ZM7WVSC.js.map
|
|
42507
|
+
//# sourceMappingURL=chunk-IVIT4U6S.js.map
|