@agentv/core 0.7.2 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +23 -49
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +3 -3
- package/dist/index.d.ts +3 -3
- package/dist/index.js +23 -49
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -382,6 +382,7 @@ async function processMessages(options) {
|
|
|
382
382
|
}
|
|
383
383
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
384
384
|
const verbose = options?.verbose ?? false;
|
|
385
|
+
const evalIdFilter = options?.evalId;
|
|
385
386
|
const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
|
|
386
387
|
if (!await fileExists2(absoluteTestPath)) {
|
|
387
388
|
throw new Error(`Test file not found: ${evalFilePath}`);
|
|
@@ -413,62 +414,39 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
413
414
|
const results = [];
|
|
414
415
|
for (const rawEvalcase of rawTestcases) {
|
|
415
416
|
if (!isJsonObject(rawEvalcase)) {
|
|
416
|
-
logWarning("Skipping invalid
|
|
417
|
+
logWarning("Skipping invalid eval case entry (expected object)");
|
|
417
418
|
continue;
|
|
418
419
|
}
|
|
419
420
|
const evalcase = rawEvalcase;
|
|
420
421
|
const id = asString(evalcase.id);
|
|
422
|
+
if (evalIdFilter && id !== evalIdFilter) {
|
|
423
|
+
continue;
|
|
424
|
+
}
|
|
421
425
|
const conversationId = asString(evalcase.conversation_id);
|
|
422
426
|
const outcome = asString(evalcase.outcome);
|
|
423
427
|
const inputMessagesValue = evalcase.input_messages;
|
|
424
428
|
const expectedMessagesValue = evalcase.expected_messages;
|
|
425
429
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
426
|
-
logWarning(`Skipping incomplete
|
|
430
|
+
logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
427
431
|
continue;
|
|
428
432
|
}
|
|
429
433
|
if (!Array.isArray(expectedMessagesValue)) {
|
|
430
|
-
logWarning(`
|
|
434
|
+
logWarning(`Eval case '${id}' missing expected_messages array`);
|
|
431
435
|
continue;
|
|
432
436
|
}
|
|
433
437
|
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
434
438
|
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
const systemMessages = inputMessages.filter((message) => message.role === "system");
|
|
438
|
-
if (assistantMessages.length === 0) {
|
|
439
|
-
logWarning(`No assistant message found for test case: ${id}`);
|
|
439
|
+
if (expectedMessages.length === 0) {
|
|
440
|
+
logWarning(`No expected message found for eval case: ${id}`);
|
|
440
441
|
continue;
|
|
441
442
|
}
|
|
442
|
-
if (
|
|
443
|
-
logWarning(`Multiple
|
|
444
|
-
}
|
|
445
|
-
if (systemMessages.length > 1) {
|
|
446
|
-
logWarning(`Multiple system messages found for test case: ${id}, using first`);
|
|
447
|
-
}
|
|
448
|
-
let systemMessageContent;
|
|
449
|
-
if (systemMessages.length > 0) {
|
|
450
|
-
const content = systemMessages[0]?.content;
|
|
451
|
-
if (typeof content === "string") {
|
|
452
|
-
systemMessageContent = content;
|
|
453
|
-
} else if (Array.isArray(content)) {
|
|
454
|
-
const textParts = [];
|
|
455
|
-
for (const segment of content) {
|
|
456
|
-
if (isJsonObject(segment)) {
|
|
457
|
-
const value = segment.value;
|
|
458
|
-
if (typeof value === "string") {
|
|
459
|
-
textParts.push(value);
|
|
460
|
-
}
|
|
461
|
-
}
|
|
462
|
-
}
|
|
463
|
-
if (textParts.length > 0) {
|
|
464
|
-
systemMessageContent = textParts.join("\n\n");
|
|
465
|
-
}
|
|
466
|
-
}
|
|
443
|
+
if (expectedMessages.length > 1) {
|
|
444
|
+
logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
467
445
|
}
|
|
468
446
|
const guidelinePaths = [];
|
|
469
447
|
const inputTextParts = [];
|
|
470
448
|
const inputSegments = await processMessages({
|
|
471
|
-
messages:
|
|
449
|
+
messages: inputMessages,
|
|
472
450
|
searchRoots,
|
|
473
451
|
repoRootPath,
|
|
474
452
|
guidelinePatterns,
|
|
@@ -478,7 +456,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
478
456
|
verbose
|
|
479
457
|
});
|
|
480
458
|
const outputSegments = await processMessages({
|
|
481
|
-
messages:
|
|
459
|
+
messages: expectedMessages,
|
|
482
460
|
searchRoots,
|
|
483
461
|
repoRootPath,
|
|
484
462
|
guidelinePatterns,
|
|
@@ -486,10 +464,10 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
486
464
|
verbose
|
|
487
465
|
});
|
|
488
466
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
489
|
-
const
|
|
490
|
-
const referenceAnswer = await resolveAssistantContent(
|
|
467
|
+
const expectedContent = expectedMessages[0]?.content;
|
|
468
|
+
const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
|
|
491
469
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
492
|
-
const
|
|
470
|
+
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
493
471
|
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
494
472
|
const userFilePaths = [];
|
|
495
473
|
for (const segment of inputSegments) {
|
|
@@ -508,19 +486,18 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
508
486
|
question,
|
|
509
487
|
input_segments: inputSegments,
|
|
510
488
|
output_segments: outputSegments,
|
|
511
|
-
system_message: systemMessageContent,
|
|
512
489
|
reference_answer: referenceAnswer,
|
|
513
490
|
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
|
|
514
491
|
guideline_patterns: guidelinePatterns,
|
|
515
492
|
file_paths: allFilePaths,
|
|
516
493
|
code_snippets: codeSnippets,
|
|
517
494
|
expected_outcome: outcome,
|
|
518
|
-
evaluator:
|
|
495
|
+
evaluator: evalCaseEvaluatorKind,
|
|
519
496
|
evaluators
|
|
520
497
|
};
|
|
521
498
|
if (verbose) {
|
|
522
499
|
console.log(`
|
|
523
|
-
[
|
|
500
|
+
[Eval Case: ${id}]`);
|
|
524
501
|
if (testCase.guideline_paths.length > 0) {
|
|
525
502
|
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
526
503
|
for (const guidelinePath of testCase.guideline_paths) {
|
|
@@ -579,7 +556,7 @@ ${body}`);
|
|
|
579
556
|
}
|
|
580
557
|
const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
581
558
|
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
582
|
-
return { question, guidelines
|
|
559
|
+
return { question, guidelines };
|
|
583
560
|
}
|
|
584
561
|
async function fileExists2(absolutePath) {
|
|
585
562
|
try {
|
|
@@ -3095,7 +3072,6 @@ var CodeEvaluator = class {
|
|
|
3095
3072
|
expected_outcome: context.evalCase.expected_outcome,
|
|
3096
3073
|
reference_answer: context.evalCase.reference_answer,
|
|
3097
3074
|
candidate_answer: context.candidate,
|
|
3098
|
-
system_message: context.promptInputs.systemMessage ?? "",
|
|
3099
3075
|
guideline_paths: context.evalCase.guideline_paths,
|
|
3100
3076
|
input_files: context.evalCase.file_paths,
|
|
3101
3077
|
input_segments: context.evalCase.input_segments
|
|
@@ -3337,7 +3313,7 @@ function validateConcurrency(concurrency) {
|
|
|
3337
3313
|
// src/evaluation/orchestrator.ts
|
|
3338
3314
|
async function runEvaluation(options) {
|
|
3339
3315
|
const {
|
|
3340
|
-
testFilePath,
|
|
3316
|
+
testFilePath: evalFilePath,
|
|
3341
3317
|
repoRoot,
|
|
3342
3318
|
target,
|
|
3343
3319
|
targets,
|
|
@@ -3356,11 +3332,11 @@ async function runEvaluation(options) {
|
|
|
3356
3332
|
onProgress
|
|
3357
3333
|
} = options;
|
|
3358
3334
|
const load = loadEvalCases;
|
|
3359
|
-
const evalCases = await load(
|
|
3335
|
+
const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
|
|
3360
3336
|
const filteredEvalCases = filterEvalCases(evalCases, evalId);
|
|
3361
3337
|
if (filteredEvalCases.length === 0) {
|
|
3362
3338
|
if (evalId) {
|
|
3363
|
-
throw new Error(`
|
|
3339
|
+
throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
|
|
3364
3340
|
}
|
|
3365
3341
|
return [];
|
|
3366
3342
|
}
|
|
@@ -3739,8 +3715,7 @@ async function evaluateCandidate(options) {
|
|
|
3739
3715
|
const rawRequest = {
|
|
3740
3716
|
question: promptInputs.question,
|
|
3741
3717
|
...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
|
|
3742
|
-
guideline_paths: evalCase.guideline_paths
|
|
3743
|
-
system_message: promptInputs.systemMessage ?? ""
|
|
3718
|
+
guideline_paths: evalCase.guideline_paths
|
|
3744
3719
|
};
|
|
3745
3720
|
return {
|
|
3746
3721
|
eval_id: evalCase.id,
|
|
@@ -4004,7 +3979,6 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
4004
3979
|
question: promptInputs.question,
|
|
4005
3980
|
...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
|
|
4006
3981
|
guideline_paths: evalCase.guideline_paths,
|
|
4007
|
-
system_message: promptInputs.systemMessage ?? "",
|
|
4008
3982
|
error: message
|
|
4009
3983
|
};
|
|
4010
3984
|
return {
|