@agentv/core 0.7.0 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-L7I5UTJU.js → chunk-UQLHF3T7.js} +12 -3
- package/dist/chunk-UQLHF3T7.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +143 -2
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.d.cts +1 -1
- package/dist/evaluation/validation/index.d.ts +1 -1
- package/dist/evaluation/validation/index.js +143 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +79 -135
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +3 -3
- package/dist/index.d.ts +3 -3
- package/dist/index.js +69 -132
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-L7I5UTJU.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -382,6 +382,7 @@ async function processMessages(options) {
|
|
|
382
382
|
}
|
|
383
383
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
384
384
|
const verbose = options?.verbose ?? false;
|
|
385
|
+
const evalIdFilter = options?.evalId;
|
|
385
386
|
const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
|
|
386
387
|
if (!await fileExists2(absoluteTestPath)) {
|
|
387
388
|
throw new Error(`Test file not found: ${evalFilePath}`);
|
|
@@ -413,62 +414,39 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
413
414
|
const results = [];
|
|
414
415
|
for (const rawEvalcase of rawTestcases) {
|
|
415
416
|
if (!isJsonObject(rawEvalcase)) {
|
|
416
|
-
logWarning("Skipping invalid
|
|
417
|
+
logWarning("Skipping invalid eval case entry (expected object)");
|
|
417
418
|
continue;
|
|
418
419
|
}
|
|
419
420
|
const evalcase = rawEvalcase;
|
|
420
421
|
const id = asString(evalcase.id);
|
|
422
|
+
if (evalIdFilter && id !== evalIdFilter) {
|
|
423
|
+
continue;
|
|
424
|
+
}
|
|
421
425
|
const conversationId = asString(evalcase.conversation_id);
|
|
422
426
|
const outcome = asString(evalcase.outcome);
|
|
423
427
|
const inputMessagesValue = evalcase.input_messages;
|
|
424
428
|
const expectedMessagesValue = evalcase.expected_messages;
|
|
425
429
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
426
|
-
logWarning(`Skipping incomplete
|
|
430
|
+
logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
427
431
|
continue;
|
|
428
432
|
}
|
|
429
433
|
if (!Array.isArray(expectedMessagesValue)) {
|
|
430
|
-
logWarning(`
|
|
434
|
+
logWarning(`Eval case '${id}' missing expected_messages array`);
|
|
431
435
|
continue;
|
|
432
436
|
}
|
|
433
437
|
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
434
438
|
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
const systemMessages = inputMessages.filter((message) => message.role === "system");
|
|
438
|
-
if (assistantMessages.length === 0) {
|
|
439
|
-
logWarning(`No assistant message found for test case: ${id}`);
|
|
439
|
+
if (expectedMessages.length === 0) {
|
|
440
|
+
logWarning(`No expected message found for eval case: ${id}`);
|
|
440
441
|
continue;
|
|
441
442
|
}
|
|
442
|
-
if (
|
|
443
|
-
logWarning(`Multiple
|
|
444
|
-
}
|
|
445
|
-
if (systemMessages.length > 1) {
|
|
446
|
-
logWarning(`Multiple system messages found for test case: ${id}, using first`);
|
|
447
|
-
}
|
|
448
|
-
let systemMessageContent;
|
|
449
|
-
if (systemMessages.length > 0) {
|
|
450
|
-
const content = systemMessages[0]?.content;
|
|
451
|
-
if (typeof content === "string") {
|
|
452
|
-
systemMessageContent = content;
|
|
453
|
-
} else if (Array.isArray(content)) {
|
|
454
|
-
const textParts = [];
|
|
455
|
-
for (const segment of content) {
|
|
456
|
-
if (isJsonObject(segment)) {
|
|
457
|
-
const value = segment.value;
|
|
458
|
-
if (typeof value === "string") {
|
|
459
|
-
textParts.push(value);
|
|
460
|
-
}
|
|
461
|
-
}
|
|
462
|
-
}
|
|
463
|
-
if (textParts.length > 0) {
|
|
464
|
-
systemMessageContent = textParts.join("\n\n");
|
|
465
|
-
}
|
|
466
|
-
}
|
|
443
|
+
if (expectedMessages.length > 1) {
|
|
444
|
+
logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
467
445
|
}
|
|
468
446
|
const guidelinePaths = [];
|
|
469
447
|
const inputTextParts = [];
|
|
470
448
|
const inputSegments = await processMessages({
|
|
471
|
-
messages:
|
|
449
|
+
messages: inputMessages,
|
|
472
450
|
searchRoots,
|
|
473
451
|
repoRootPath,
|
|
474
452
|
guidelinePatterns,
|
|
@@ -478,7 +456,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
478
456
|
verbose
|
|
479
457
|
});
|
|
480
458
|
const outputSegments = await processMessages({
|
|
481
|
-
messages:
|
|
459
|
+
messages: expectedMessages,
|
|
482
460
|
searchRoots,
|
|
483
461
|
repoRootPath,
|
|
484
462
|
guidelinePatterns,
|
|
@@ -486,10 +464,10 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
486
464
|
verbose
|
|
487
465
|
});
|
|
488
466
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
489
|
-
const
|
|
490
|
-
const referenceAnswer = await resolveAssistantContent(
|
|
467
|
+
const expectedContent = expectedMessages[0]?.content;
|
|
468
|
+
const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
|
|
491
469
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
492
|
-
const
|
|
470
|
+
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
493
471
|
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
494
472
|
const userFilePaths = [];
|
|
495
473
|
for (const segment of inputSegments) {
|
|
@@ -508,19 +486,18 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
508
486
|
question,
|
|
509
487
|
input_segments: inputSegments,
|
|
510
488
|
output_segments: outputSegments,
|
|
511
|
-
system_message: systemMessageContent,
|
|
512
489
|
reference_answer: referenceAnswer,
|
|
513
490
|
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
|
|
514
491
|
guideline_patterns: guidelinePatterns,
|
|
515
492
|
file_paths: allFilePaths,
|
|
516
493
|
code_snippets: codeSnippets,
|
|
517
494
|
expected_outcome: outcome,
|
|
518
|
-
evaluator:
|
|
495
|
+
evaluator: evalCaseEvaluatorKind,
|
|
519
496
|
evaluators
|
|
520
497
|
};
|
|
521
498
|
if (verbose) {
|
|
522
499
|
console.log(`
|
|
523
|
-
[
|
|
500
|
+
[Eval Case: ${id}]`);
|
|
524
501
|
if (testCase.guideline_paths.length > 0) {
|
|
525
502
|
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
526
503
|
for (const guidelinePath of testCase.guideline_paths) {
|
|
@@ -579,7 +556,7 @@ ${body}`);
|
|
|
579
556
|
}
|
|
580
557
|
const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
581
558
|
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
582
|
-
return { question, guidelines
|
|
559
|
+
return { question, guidelines };
|
|
583
560
|
}
|
|
584
561
|
async function fileExists2(absolutePath) {
|
|
585
562
|
try {
|
|
@@ -1338,7 +1315,6 @@ function pathToFileUri(filePath) {
|
|
|
1338
1315
|
var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
|
|
1339
1316
|
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
1340
1317
|
var PROMPT_FILENAME = "prompt.md";
|
|
1341
|
-
var FILES_DIR = "files";
|
|
1342
1318
|
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
1343
1319
|
var CodexProvider = class {
|
|
1344
1320
|
id;
|
|
@@ -1361,21 +1337,10 @@ var CodexProvider = class {
|
|
|
1361
1337
|
}
|
|
1362
1338
|
await this.ensureEnvironmentReady();
|
|
1363
1339
|
const inputFiles = normalizeInputFiles2(request.inputFiles);
|
|
1364
|
-
const originalGuidelines = new Set(
|
|
1365
|
-
collectGuidelineFiles(inputFiles, request.guideline_patterns).map((file) => import_node_path5.default.resolve(file))
|
|
1366
|
-
);
|
|
1367
1340
|
const workspaceRoot = await this.createWorkspace();
|
|
1368
1341
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
1369
1342
|
try {
|
|
1370
|
-
const
|
|
1371
|
-
inputFiles,
|
|
1372
|
-
workspaceRoot,
|
|
1373
|
-
originalGuidelines
|
|
1374
|
-
);
|
|
1375
|
-
const promptContent = buildPromptDocument(request, mirroredInputFiles, {
|
|
1376
|
-
guidelinePatterns: request.guideline_patterns,
|
|
1377
|
-
guidelineOverrides: guidelineMirrors
|
|
1378
|
-
});
|
|
1343
|
+
const promptContent = buildPromptDocument(request, inputFiles);
|
|
1379
1344
|
const promptFile = import_node_path5.default.join(workspaceRoot, PROMPT_FILENAME);
|
|
1380
1345
|
await (0, import_promises3.writeFile)(promptFile, promptContent, "utf8");
|
|
1381
1346
|
const args = this.buildCodexArgs();
|
|
@@ -1404,7 +1369,7 @@ var CodexProvider = class {
|
|
|
1404
1369
|
executable: this.resolvedExecutable ?? this.config.executable,
|
|
1405
1370
|
promptFile,
|
|
1406
1371
|
workspace: workspaceRoot,
|
|
1407
|
-
inputFiles
|
|
1372
|
+
inputFiles,
|
|
1408
1373
|
logFile: logger?.filePath
|
|
1409
1374
|
}
|
|
1410
1375
|
};
|
|
@@ -1459,37 +1424,6 @@ var CodexProvider = class {
|
|
|
1459
1424
|
throw error;
|
|
1460
1425
|
}
|
|
1461
1426
|
}
|
|
1462
|
-
async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
|
|
1463
|
-
if (!inputFiles || inputFiles.length === 0) {
|
|
1464
|
-
return {
|
|
1465
|
-
mirroredInputFiles: void 0,
|
|
1466
|
-
guidelineMirrors: /* @__PURE__ */ new Set()
|
|
1467
|
-
};
|
|
1468
|
-
}
|
|
1469
|
-
const filesRoot = import_node_path5.default.join(workspaceRoot, FILES_DIR);
|
|
1470
|
-
await (0, import_promises3.mkdir)(filesRoot, { recursive: true });
|
|
1471
|
-
const mirrored = [];
|
|
1472
|
-
const guidelineMirrors = /* @__PURE__ */ new Set();
|
|
1473
|
-
const nameCounts = /* @__PURE__ */ new Map();
|
|
1474
|
-
for (const inputFile of inputFiles) {
|
|
1475
|
-
const absoluteSource = import_node_path5.default.resolve(inputFile);
|
|
1476
|
-
const baseName = import_node_path5.default.basename(absoluteSource);
|
|
1477
|
-
const count = nameCounts.get(baseName) ?? 0;
|
|
1478
|
-
nameCounts.set(baseName, count + 1);
|
|
1479
|
-
const finalName = count === 0 ? baseName : `${baseName}.${count}`;
|
|
1480
|
-
const destination = import_node_path5.default.join(filesRoot, finalName);
|
|
1481
|
-
await (0, import_promises3.copyFile)(absoluteSource, destination);
|
|
1482
|
-
const resolvedDestination = import_node_path5.default.resolve(destination);
|
|
1483
|
-
mirrored.push(resolvedDestination);
|
|
1484
|
-
if (guidelineOriginals.has(absoluteSource)) {
|
|
1485
|
-
guidelineMirrors.add(resolvedDestination);
|
|
1486
|
-
}
|
|
1487
|
-
}
|
|
1488
|
-
return {
|
|
1489
|
-
mirroredInputFiles: mirrored,
|
|
1490
|
-
guidelineMirrors
|
|
1491
|
-
};
|
|
1492
|
-
}
|
|
1493
1427
|
async createWorkspace() {
|
|
1494
1428
|
return await (0, import_promises3.mkdtemp)(import_node_path5.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
|
|
1495
1429
|
}
|
|
@@ -2460,23 +2394,25 @@ function resolveOptionalString(source, env, description, options) {
|
|
|
2460
2394
|
if (trimmed.length === 0) {
|
|
2461
2395
|
return void 0;
|
|
2462
2396
|
}
|
|
2463
|
-
const
|
|
2464
|
-
if (
|
|
2465
|
-
|
|
2466
|
-
|
|
2397
|
+
const envVarMatch = trimmed.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
|
|
2398
|
+
if (envVarMatch) {
|
|
2399
|
+
const varName = envVarMatch[1];
|
|
2400
|
+
const envValue = env[varName];
|
|
2401
|
+
if (envValue !== void 0) {
|
|
2402
|
+
if (envValue.trim().length === 0) {
|
|
2403
|
+
throw new Error(`Environment variable '${varName}' for ${description} is empty`);
|
|
2404
|
+
}
|
|
2405
|
+
return envValue;
|
|
2467
2406
|
}
|
|
2468
|
-
|
|
2469
|
-
}
|
|
2470
|
-
const allowLiteral = options?.allowLiteral ?? false;
|
|
2471
|
-
const optionalEnv = options?.optionalEnv ?? false;
|
|
2472
|
-
const looksLikeEnv = isLikelyEnvReference(trimmed);
|
|
2473
|
-
if (looksLikeEnv) {
|
|
2407
|
+
const optionalEnv = options?.optionalEnv ?? false;
|
|
2474
2408
|
if (optionalEnv) {
|
|
2475
2409
|
return void 0;
|
|
2476
2410
|
}
|
|
2477
|
-
|
|
2478
|
-
|
|
2479
|
-
|
|
2411
|
+
throw new Error(`Environment variable '${varName}' required for ${description} is not set`);
|
|
2412
|
+
}
|
|
2413
|
+
const allowLiteral = options?.allowLiteral ?? false;
|
|
2414
|
+
if (!allowLiteral) {
|
|
2415
|
+
throw new Error(`${description} must use \${{ VARIABLE_NAME }} syntax for environment variables or be marked as allowing literals`);
|
|
2480
2416
|
}
|
|
2481
2417
|
return trimmed;
|
|
2482
2418
|
}
|
|
@@ -2523,9 +2459,6 @@ function resolveOptionalBoolean(source) {
|
|
|
2523
2459
|
}
|
|
2524
2460
|
throw new Error("expected boolean value");
|
|
2525
2461
|
}
|
|
2526
|
-
function isLikelyEnvReference(value) {
|
|
2527
|
-
return /^[A-Z0-9_]+$/.test(value);
|
|
2528
|
-
}
|
|
2529
2462
|
function resolveOptionalStringArray(source, env, description) {
|
|
2530
2463
|
if (source === void 0 || source === null) {
|
|
2531
2464
|
return void 0;
|
|
@@ -2546,21 +2479,25 @@ function resolveOptionalStringArray(source, env, description) {
|
|
|
2546
2479
|
if (trimmed.length === 0) {
|
|
2547
2480
|
throw new Error(`${description}[${i}] cannot be empty`);
|
|
2548
2481
|
}
|
|
2549
|
-
const
|
|
2550
|
-
if (
|
|
2551
|
-
|
|
2552
|
-
|
|
2482
|
+
const envVarMatch = trimmed.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
|
|
2483
|
+
if (envVarMatch) {
|
|
2484
|
+
const varName = envVarMatch[1];
|
|
2485
|
+
const envValue = env[varName];
|
|
2486
|
+
if (envValue !== void 0) {
|
|
2487
|
+
if (envValue.trim().length === 0) {
|
|
2488
|
+
throw new Error(`Environment variable '${varName}' for ${description}[${i}] is empty`);
|
|
2489
|
+
}
|
|
2490
|
+
resolved.push(envValue);
|
|
2491
|
+
continue;
|
|
2553
2492
|
}
|
|
2554
|
-
|
|
2555
|
-
} else {
|
|
2556
|
-
resolved.push(trimmed);
|
|
2493
|
+
throw new Error(`Environment variable '${varName}' for ${description}[${i}] is not set`);
|
|
2557
2494
|
}
|
|
2495
|
+
resolved.push(trimmed);
|
|
2558
2496
|
}
|
|
2559
2497
|
return resolved.length > 0 ? resolved : void 0;
|
|
2560
2498
|
}
|
|
2561
2499
|
|
|
2562
2500
|
// src/evaluation/providers/vscode.ts
|
|
2563
|
-
var import_promises4 = require("fs/promises");
|
|
2564
2501
|
var import_node_path6 = __toESM(require("path"), 1);
|
|
2565
2502
|
var import_subagent = require("subagent");
|
|
2566
2503
|
var VSCodeProvider = class {
|
|
@@ -2604,7 +2541,7 @@ var VSCodeProvider = class {
|
|
|
2604
2541
|
}
|
|
2605
2542
|
};
|
|
2606
2543
|
}
|
|
2607
|
-
const responseText = await (
|
|
2544
|
+
const responseText = await readTextFile(session.responseFile);
|
|
2608
2545
|
return {
|
|
2609
2546
|
text: responseText,
|
|
2610
2547
|
raw: {
|
|
@@ -2658,7 +2595,7 @@ var VSCodeProvider = class {
|
|
|
2658
2595
|
}
|
|
2659
2596
|
const responses = [];
|
|
2660
2597
|
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
2661
|
-
const responseText = await (
|
|
2598
|
+
const responseText = await readTextFile(responseFile);
|
|
2662
2599
|
responses.push({
|
|
2663
2600
|
text: responseText,
|
|
2664
2601
|
raw: {
|
|
@@ -2808,12 +2745,20 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
2808
2745
|
|
|
2809
2746
|
// src/evaluation/providers/targets-file.ts
|
|
2810
2747
|
var import_node_fs4 = require("fs");
|
|
2811
|
-
var
|
|
2748
|
+
var import_promises4 = require("fs/promises");
|
|
2812
2749
|
var import_node_path7 = __toESM(require("path"), 1);
|
|
2813
2750
|
var import_yaml2 = require("yaml");
|
|
2814
2751
|
|
|
2815
2752
|
// src/evaluation/providers/types.ts
|
|
2816
|
-
var
|
|
2753
|
+
var AGENT_PROVIDER_KINDS = [
|
|
2754
|
+
"codex",
|
|
2755
|
+
"vscode",
|
|
2756
|
+
"vscode-insiders"
|
|
2757
|
+
];
|
|
2758
|
+
var TARGETS_SCHEMA_V2 = "agentv-targets-v2.1";
|
|
2759
|
+
function isAgentProvider(provider) {
|
|
2760
|
+
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
2761
|
+
}
|
|
2817
2762
|
|
|
2818
2763
|
// src/evaluation/providers/targets-file.ts
|
|
2819
2764
|
function isRecord(value) {
|
|
@@ -2870,7 +2815,7 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
2870
2815
|
}
|
|
2871
2816
|
async function fileExists3(filePath) {
|
|
2872
2817
|
try {
|
|
2873
|
-
await (0,
|
|
2818
|
+
await (0, import_promises4.access)(filePath, import_node_fs4.constants.F_OK);
|
|
2874
2819
|
return true;
|
|
2875
2820
|
} catch {
|
|
2876
2821
|
return false;
|
|
@@ -2881,7 +2826,7 @@ async function readTargetDefinitions(filePath) {
|
|
|
2881
2826
|
if (!await fileExists3(absolutePath)) {
|
|
2882
2827
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
2883
2828
|
}
|
|
2884
|
-
const raw = await (0,
|
|
2829
|
+
const raw = await (0, import_promises4.readFile)(absolutePath, "utf8");
|
|
2885
2830
|
const parsed = (0, import_yaml2.parse)(raw);
|
|
2886
2831
|
if (!isRecord(parsed)) {
|
|
2887
2832
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
@@ -3127,7 +3072,6 @@ var CodeEvaluator = class {
|
|
|
3127
3072
|
expected_outcome: context.evalCase.expected_outcome,
|
|
3128
3073
|
reference_answer: context.evalCase.reference_answer,
|
|
3129
3074
|
candidate_answer: context.candidate,
|
|
3130
|
-
system_message: context.promptInputs.systemMessage ?? "",
|
|
3131
3075
|
guideline_paths: context.evalCase.guideline_paths,
|
|
3132
3076
|
input_files: context.evalCase.file_paths,
|
|
3133
3077
|
input_segments: context.evalCase.input_segments
|
|
@@ -3227,7 +3171,7 @@ function substituteVariables(template, variables) {
|
|
|
3227
3171
|
|
|
3228
3172
|
// src/evaluation/orchestrator.ts
|
|
3229
3173
|
var import_node_crypto3 = require("crypto");
|
|
3230
|
-
var
|
|
3174
|
+
var import_promises5 = require("fs/promises");
|
|
3231
3175
|
var import_node_path8 = __toESM(require("path"), 1);
|
|
3232
3176
|
|
|
3233
3177
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
@@ -3369,7 +3313,7 @@ function validateConcurrency(concurrency) {
|
|
|
3369
3313
|
// src/evaluation/orchestrator.ts
|
|
3370
3314
|
async function runEvaluation(options) {
|
|
3371
3315
|
const {
|
|
3372
|
-
testFilePath,
|
|
3316
|
+
testFilePath: evalFilePath,
|
|
3373
3317
|
repoRoot,
|
|
3374
3318
|
target,
|
|
3375
3319
|
targets,
|
|
@@ -3388,11 +3332,11 @@ async function runEvaluation(options) {
|
|
|
3388
3332
|
onProgress
|
|
3389
3333
|
} = options;
|
|
3390
3334
|
const load = loadEvalCases;
|
|
3391
|
-
const evalCases = await load(
|
|
3335
|
+
const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
|
|
3392
3336
|
const filteredEvalCases = filterEvalCases(evalCases, evalId);
|
|
3393
3337
|
if (filteredEvalCases.length === 0) {
|
|
3394
3338
|
if (evalId) {
|
|
3395
|
-
throw new Error(`
|
|
3339
|
+
throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
|
|
3396
3340
|
}
|
|
3397
3341
|
return [];
|
|
3398
3342
|
}
|
|
@@ -3546,7 +3490,8 @@ async function runEvaluation(options) {
|
|
|
3546
3490
|
target.name,
|
|
3547
3491
|
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
3548
3492
|
outcome.reason,
|
|
3549
|
-
promptInputs
|
|
3493
|
+
promptInputs,
|
|
3494
|
+
primaryProvider
|
|
3550
3495
|
);
|
|
3551
3496
|
results.push(errorResult);
|
|
3552
3497
|
if (onResult) {
|
|
@@ -3630,7 +3575,7 @@ async function runBatchEvaluation(options) {
|
|
|
3630
3575
|
agentTimeoutMs
|
|
3631
3576
|
});
|
|
3632
3577
|
} catch (error) {
|
|
3633
|
-
const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
3578
|
+
const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
3634
3579
|
results.push(errorResult);
|
|
3635
3580
|
if (onResult) {
|
|
3636
3581
|
await onResult(errorResult);
|
|
@@ -3707,7 +3652,7 @@ async function runEvalCase(options) {
|
|
|
3707
3652
|
attempt += 1;
|
|
3708
3653
|
continue;
|
|
3709
3654
|
}
|
|
3710
|
-
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
3655
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
3711
3656
|
}
|
|
3712
3657
|
}
|
|
3713
3658
|
if (!providerResponse) {
|
|
@@ -3716,7 +3661,8 @@ async function runEvalCase(options) {
|
|
|
3716
3661
|
target.name,
|
|
3717
3662
|
nowFn(),
|
|
3718
3663
|
lastError ?? new Error("Provider did not return a response"),
|
|
3719
|
-
promptInputs
|
|
3664
|
+
promptInputs,
|
|
3665
|
+
provider
|
|
3720
3666
|
);
|
|
3721
3667
|
}
|
|
3722
3668
|
if (cacheKey && cache && !cachedResponse) {
|
|
@@ -3736,7 +3682,7 @@ async function runEvalCase(options) {
|
|
|
3736
3682
|
agentTimeoutMs
|
|
3737
3683
|
});
|
|
3738
3684
|
} catch (error) {
|
|
3739
|
-
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
3685
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
3740
3686
|
}
|
|
3741
3687
|
}
|
|
3742
3688
|
async function evaluateCandidate(options) {
|
|
@@ -3768,9 +3714,8 @@ async function evaluateCandidate(options) {
|
|
|
3768
3714
|
const completedAt = nowFn();
|
|
3769
3715
|
const rawRequest = {
|
|
3770
3716
|
question: promptInputs.question,
|
|
3771
|
-
guidelines: promptInputs.guidelines,
|
|
3772
|
-
guideline_paths: evalCase.guideline_paths
|
|
3773
|
-
system_message: promptInputs.systemMessage ?? ""
|
|
3717
|
+
...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
|
|
3718
|
+
guideline_paths: evalCase.guideline_paths
|
|
3774
3719
|
};
|
|
3775
3720
|
return {
|
|
3776
3721
|
eval_id: evalCase.id,
|
|
@@ -3986,14 +3931,14 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
|
3986
3931
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
3987
3932
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
3988
3933
|
const filePath = import_node_path8.default.resolve(directory, filename);
|
|
3989
|
-
await (0,
|
|
3934
|
+
await (0, import_promises5.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
|
|
3990
3935
|
const payload = {
|
|
3991
3936
|
eval_id: evalCase.id,
|
|
3992
3937
|
question: promptInputs.question,
|
|
3993
3938
|
guidelines: promptInputs.guidelines,
|
|
3994
3939
|
guideline_paths: evalCase.guideline_paths
|
|
3995
3940
|
};
|
|
3996
|
-
await (0,
|
|
3941
|
+
await (0, import_promises5.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
3997
3942
|
}
|
|
3998
3943
|
function sanitizeFilename(value) {
|
|
3999
3944
|
if (!value) {
|
|
@@ -4028,13 +3973,12 @@ async function invokeProvider(provider, options) {
|
|
|
4028
3973
|
}
|
|
4029
3974
|
}
|
|
4030
3975
|
}
|
|
4031
|
-
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
|
|
3976
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
|
|
4032
3977
|
const message = error instanceof Error ? error.message : String(error);
|
|
4033
3978
|
const rawRequest = {
|
|
4034
3979
|
question: promptInputs.question,
|
|
4035
|
-
guidelines: promptInputs.guidelines,
|
|
3980
|
+
...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
|
|
4036
3981
|
guideline_paths: evalCase.guideline_paths,
|
|
4037
|
-
system_message: promptInputs.systemMessage ?? "",
|
|
4038
3982
|
error: message
|
|
4039
3983
|
};
|
|
4040
3984
|
return {
|