agentv 0.7.2 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -11386,6 +11386,7 @@ async function processMessages(options) {
|
|
|
11386
11386
|
}
|
|
11387
11387
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
11388
11388
|
const verbose = options?.verbose ?? false;
|
|
11389
|
+
const evalIdFilter = options?.evalId;
|
|
11389
11390
|
const absoluteTestPath = path8.resolve(evalFilePath);
|
|
11390
11391
|
if (!await fileExists2(absoluteTestPath)) {
|
|
11391
11392
|
throw new Error(`Test file not found: ${evalFilePath}`);
|
|
@@ -11417,62 +11418,39 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
11417
11418
|
const results = [];
|
|
11418
11419
|
for (const rawEvalcase of rawTestcases) {
|
|
11419
11420
|
if (!isJsonObject(rawEvalcase)) {
|
|
11420
|
-
logWarning("Skipping invalid
|
|
11421
|
+
logWarning("Skipping invalid eval case entry (expected object)");
|
|
11421
11422
|
continue;
|
|
11422
11423
|
}
|
|
11423
11424
|
const evalcase = rawEvalcase;
|
|
11424
11425
|
const id = asString(evalcase.id);
|
|
11426
|
+
if (evalIdFilter && id !== evalIdFilter) {
|
|
11427
|
+
continue;
|
|
11428
|
+
}
|
|
11425
11429
|
const conversationId = asString(evalcase.conversation_id);
|
|
11426
11430
|
const outcome = asString(evalcase.outcome);
|
|
11427
11431
|
const inputMessagesValue = evalcase.input_messages;
|
|
11428
11432
|
const expectedMessagesValue = evalcase.expected_messages;
|
|
11429
11433
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
11430
|
-
logWarning(`Skipping incomplete
|
|
11434
|
+
logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
11431
11435
|
continue;
|
|
11432
11436
|
}
|
|
11433
11437
|
if (!Array.isArray(expectedMessagesValue)) {
|
|
11434
|
-
logWarning(`
|
|
11438
|
+
logWarning(`Eval case '${id}' missing expected_messages array`);
|
|
11435
11439
|
continue;
|
|
11436
11440
|
}
|
|
11437
11441
|
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
11438
11442
|
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
11439
|
-
|
|
11440
|
-
|
|
11441
|
-
const systemMessages = inputMessages.filter((message) => message.role === "system");
|
|
11442
|
-
if (assistantMessages.length === 0) {
|
|
11443
|
-
logWarning(`No assistant message found for test case: ${id}`);
|
|
11443
|
+
if (expectedMessages.length === 0) {
|
|
11444
|
+
logWarning(`No expected message found for eval case: ${id}`);
|
|
11444
11445
|
continue;
|
|
11445
11446
|
}
|
|
11446
|
-
if (
|
|
11447
|
-
logWarning(`Multiple
|
|
11448
|
-
}
|
|
11449
|
-
if (systemMessages.length > 1) {
|
|
11450
|
-
logWarning(`Multiple system messages found for test case: ${id}, using first`);
|
|
11451
|
-
}
|
|
11452
|
-
let systemMessageContent;
|
|
11453
|
-
if (systemMessages.length > 0) {
|
|
11454
|
-
const content = systemMessages[0]?.content;
|
|
11455
|
-
if (typeof content === "string") {
|
|
11456
|
-
systemMessageContent = content;
|
|
11457
|
-
} else if (Array.isArray(content)) {
|
|
11458
|
-
const textParts = [];
|
|
11459
|
-
for (const segment of content) {
|
|
11460
|
-
if (isJsonObject(segment)) {
|
|
11461
|
-
const value = segment.value;
|
|
11462
|
-
if (typeof value === "string") {
|
|
11463
|
-
textParts.push(value);
|
|
11464
|
-
}
|
|
11465
|
-
}
|
|
11466
|
-
}
|
|
11467
|
-
if (textParts.length > 0) {
|
|
11468
|
-
systemMessageContent = textParts.join("\n\n");
|
|
11469
|
-
}
|
|
11470
|
-
}
|
|
11447
|
+
if (expectedMessages.length > 1) {
|
|
11448
|
+
logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
11471
11449
|
}
|
|
11472
11450
|
const guidelinePaths = [];
|
|
11473
11451
|
const inputTextParts = [];
|
|
11474
11452
|
const inputSegments = await processMessages({
|
|
11475
|
-
messages:
|
|
11453
|
+
messages: inputMessages,
|
|
11476
11454
|
searchRoots,
|
|
11477
11455
|
repoRootPath,
|
|
11478
11456
|
guidelinePatterns,
|
|
@@ -11482,7 +11460,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
11482
11460
|
verbose
|
|
11483
11461
|
});
|
|
11484
11462
|
const outputSegments = await processMessages({
|
|
11485
|
-
messages:
|
|
11463
|
+
messages: expectedMessages,
|
|
11486
11464
|
searchRoots,
|
|
11487
11465
|
repoRootPath,
|
|
11488
11466
|
guidelinePatterns,
|
|
@@ -11490,10 +11468,10 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
11490
11468
|
verbose
|
|
11491
11469
|
});
|
|
11492
11470
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
11493
|
-
const
|
|
11494
|
-
const referenceAnswer = await resolveAssistantContent(
|
|
11471
|
+
const expectedContent = expectedMessages[0]?.content;
|
|
11472
|
+
const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
|
|
11495
11473
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
11496
|
-
const
|
|
11474
|
+
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
11497
11475
|
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
11498
11476
|
const userFilePaths = [];
|
|
11499
11477
|
for (const segment of inputSegments) {
|
|
@@ -11512,19 +11490,18 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
11512
11490
|
question,
|
|
11513
11491
|
input_segments: inputSegments,
|
|
11514
11492
|
output_segments: outputSegments,
|
|
11515
|
-
system_message: systemMessageContent,
|
|
11516
11493
|
reference_answer: referenceAnswer,
|
|
11517
11494
|
guideline_paths: guidelinePaths.map((guidelinePath) => path8.resolve(guidelinePath)),
|
|
11518
11495
|
guideline_patterns: guidelinePatterns,
|
|
11519
11496
|
file_paths: allFilePaths,
|
|
11520
11497
|
code_snippets: codeSnippets,
|
|
11521
11498
|
expected_outcome: outcome,
|
|
11522
|
-
evaluator:
|
|
11499
|
+
evaluator: evalCaseEvaluatorKind,
|
|
11523
11500
|
evaluators
|
|
11524
11501
|
};
|
|
11525
11502
|
if (verbose) {
|
|
11526
11503
|
console.log(`
|
|
11527
|
-
[
|
|
11504
|
+
[Eval Case: ${id}]`);
|
|
11528
11505
|
if (testCase.guideline_paths.length > 0) {
|
|
11529
11506
|
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
11530
11507
|
for (const guidelinePath of testCase.guideline_paths) {
|
|
@@ -11583,7 +11560,7 @@ ${body}`);
|
|
|
11583
11560
|
}
|
|
11584
11561
|
const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
11585
11562
|
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
11586
|
-
return { question, guidelines
|
|
11563
|
+
return { question, guidelines };
|
|
11587
11564
|
}
|
|
11588
11565
|
async function fileExists2(absolutePath) {
|
|
11589
11566
|
try {
|
|
@@ -14031,7 +14008,6 @@ var CodeEvaluator = class {
|
|
|
14031
14008
|
expected_outcome: context2.evalCase.expected_outcome,
|
|
14032
14009
|
reference_answer: context2.evalCase.reference_answer,
|
|
14033
14010
|
candidate_answer: context2.candidate,
|
|
14034
|
-
system_message: context2.promptInputs.systemMessage ?? "",
|
|
14035
14011
|
guideline_paths: context2.evalCase.guideline_paths,
|
|
14036
14012
|
input_files: context2.evalCase.file_paths,
|
|
14037
14013
|
input_segments: context2.evalCase.input_segments
|
|
@@ -14262,7 +14238,7 @@ function validateConcurrency(concurrency) {
|
|
|
14262
14238
|
}
|
|
14263
14239
|
async function runEvaluation(options) {
|
|
14264
14240
|
const {
|
|
14265
|
-
testFilePath,
|
|
14241
|
+
testFilePath: evalFilePath,
|
|
14266
14242
|
repoRoot,
|
|
14267
14243
|
target,
|
|
14268
14244
|
targets,
|
|
@@ -14281,11 +14257,11 @@ async function runEvaluation(options) {
|
|
|
14281
14257
|
onProgress
|
|
14282
14258
|
} = options;
|
|
14283
14259
|
const load = loadEvalCases;
|
|
14284
|
-
const evalCases = await load(
|
|
14260
|
+
const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
|
|
14285
14261
|
const filteredEvalCases = filterEvalCases(evalCases, evalId);
|
|
14286
14262
|
if (filteredEvalCases.length === 0) {
|
|
14287
14263
|
if (evalId) {
|
|
14288
|
-
throw new Error(`
|
|
14264
|
+
throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
|
|
14289
14265
|
}
|
|
14290
14266
|
return [];
|
|
14291
14267
|
}
|
|
@@ -14664,8 +14640,7 @@ async function evaluateCandidate(options) {
|
|
|
14664
14640
|
const rawRequest = {
|
|
14665
14641
|
question: promptInputs.question,
|
|
14666
14642
|
...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
|
|
14667
|
-
guideline_paths: evalCase.guideline_paths
|
|
14668
|
-
system_message: promptInputs.systemMessage ?? ""
|
|
14643
|
+
guideline_paths: evalCase.guideline_paths
|
|
14669
14644
|
};
|
|
14670
14645
|
return {
|
|
14671
14646
|
eval_id: evalCase.id,
|
|
@@ -14929,7 +14904,6 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
14929
14904
|
question: promptInputs.question,
|
|
14930
14905
|
...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
|
|
14931
14906
|
guideline_paths: evalCase.guideline_paths,
|
|
14932
|
-
system_message: promptInputs.systemMessage ?? "",
|
|
14933
14907
|
error: message
|
|
14934
14908
|
};
|
|
14935
14909
|
return {
|
|
@@ -16837,7 +16811,7 @@ async function prepareFileMetadata(params) {
|
|
|
16837
16811
|
});
|
|
16838
16812
|
const providerLabel = options.dryRun ? `${selection.resolvedTarget.kind} (dry-run)` : selection.resolvedTarget.kind;
|
|
16839
16813
|
const inlineTargetLabel = `${selection.targetName} [provider=${providerLabel}]`;
|
|
16840
|
-
const evalCases = await loadEvalCases(testFilePath, repoRoot, { verbose: options.verbose });
|
|
16814
|
+
const evalCases = await loadEvalCases(testFilePath, repoRoot, { verbose: options.verbose, evalId: options.evalId });
|
|
16841
16815
|
const filteredIds = options.evalId ? evalCases.filter((value) => value.id === options.evalId).map((value) => value.id) : evalCases.map((value) => value.id);
|
|
16842
16816
|
return { evalIds: filteredIds, selection, inlineTargetLabel };
|
|
16843
16817
|
}
|
|
@@ -17538,4 +17512,4 @@ export {
|
|
|
17538
17512
|
createProgram,
|
|
17539
17513
|
runCli
|
|
17540
17514
|
};
|
|
17541
|
-
//# sourceMappingURL=chunk-
|
|
17515
|
+
//# sourceMappingURL=chunk-GWH4WZTW.js.map
|