@agentv/core 0.7.5 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-7XM7HYRS.js → chunk-YQBJAT5I.js} +97 -67
- package/dist/chunk-YQBJAT5I.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +61 -69
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +51 -58
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +538 -192
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +136 -58
- package/dist/index.d.ts +136 -58
- package/dist/index.js +443 -127
- package/dist/index.js.map +1 -1
- package/package.json +1 -2
- package/dist/chunk-7XM7HYRS.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
readTextFile,
|
|
10
10
|
resolveFileReference,
|
|
11
11
|
resolveTargetDefinition
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-YQBJAT5I.js";
|
|
13
13
|
|
|
14
14
|
// src/evaluation/types.ts
|
|
15
15
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -268,14 +268,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
268
268
|
logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
269
269
|
continue;
|
|
270
270
|
}
|
|
271
|
-
|
|
272
|
-
logWarning(`Eval case '${id}' missing expected_messages array`);
|
|
273
|
-
continue;
|
|
274
|
-
}
|
|
271
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
275
272
|
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
276
|
-
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
277
|
-
if (expectedMessages.length === 0) {
|
|
278
|
-
logWarning(`No expected message found for eval case: ${id}`);
|
|
273
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
274
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
275
|
+
logWarning(`No valid expected message found for eval case: ${id}`);
|
|
279
276
|
continue;
|
|
280
277
|
}
|
|
281
278
|
if (expectedMessages.length > 1) {
|
|
@@ -293,17 +290,17 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
293
290
|
messageType: "input",
|
|
294
291
|
verbose
|
|
295
292
|
});
|
|
296
|
-
const outputSegments = await processMessages({
|
|
293
|
+
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
297
294
|
messages: expectedMessages,
|
|
298
295
|
searchRoots,
|
|
299
296
|
repoRootPath,
|
|
300
297
|
guidelinePatterns,
|
|
301
298
|
messageType: "output",
|
|
302
299
|
verbose
|
|
303
|
-
});
|
|
300
|
+
}) : [];
|
|
304
301
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
305
302
|
const expectedContent = expectedMessages[0]?.content;
|
|
306
|
-
const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
|
|
303
|
+
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
307
304
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
308
305
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
309
306
|
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
@@ -322,6 +319,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
322
319
|
dataset: datasetName,
|
|
323
320
|
conversation_id: conversationId,
|
|
324
321
|
question,
|
|
322
|
+
input_messages: inputMessages,
|
|
325
323
|
input_segments: inputSegments,
|
|
326
324
|
output_segments: outputSegments,
|
|
327
325
|
reference_answer: referenceAnswer,
|
|
@@ -349,6 +347,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
349
347
|
}
|
|
350
348
|
return results;
|
|
351
349
|
}
|
|
350
|
+
/**
 * Decides whether a conversation has to be rendered with explicit role markers.
 *
 * Markers are required as soon as any message carries the "assistant" or
 * "tool" role. Otherwise they are required only when more than one message
 * has visible content according to hasVisibleContent.
 *
 * @param {Array<{role: string}>} messages - Messages whose roles are inspected.
 * @param {Iterable<Array<object>>} processedSegmentsByMessage - One segment list per message.
 * @returns {boolean} True when role markers should be used.
 */
function needsRoleMarkers(messages, processedSegmentsByMessage) {
  const hasAssistantOrToolTurn = messages.some(({ role }) => role === "assistant" || role === "tool");
  if (hasAssistantOrToolTurn) {
    return true;
  }
  // Count how many messages actually contribute something visible.
  const visibleMessageCount = [...processedSegmentsByMessage].filter(
    (segments) => hasVisibleContent(segments)
  ).length;
  return visibleMessageCount > 1;
}
|
|
362
|
+
/**
 * Reports whether a list of segments contains anything that would be shown.
 *
 * A "text" segment counts when its `value` is non-blank, a "file" segment
 * counts when its `text` is non-blank. "guideline_ref" segments and any other
 * segment type never count.
 *
 * @param {Array<object>} segments - Segments belonging to a single message.
 * @returns {boolean} True when at least one segment has non-blank content.
 */
function hasVisibleContent(segments) {
  const isNonBlank = (candidate) => candidate !== void 0 && candidate.trim().length > 0;
  return segments.some((segment) => {
    switch (asString(segment.type)) {
      case "text":
        return isNonBlank(asString(segment.value));
      case "file":
        return isNonBlank(asString(segment.text));
      default:
        // Covers "guideline_ref" as well as unknown segment types.
        return false;
    }
  });
}
|
|
379
|
+
/**
 * Renders one segment as prompt text.
 *
 * - "text": the segment's `value`.
 * - "guideline_ref": `<Attached: path>` when a path is present.
 * - "file": a `=== path ===` header followed by the file text, when both the
 *   path and the text are present.
 *
 * @param {object} segment - Segment with a `type` and type-specific fields.
 * @returns {string | undefined} The formatted text, or undefined when the
 *   segment has nothing to render or its type is not recognised.
 */
function formatSegment(segment) {
  switch (asString(segment.type)) {
    case "text":
      return asString(segment.value);
    case "guideline_ref": {
      const attachedPath = asString(segment.path);
      return attachedPath ? `<Attached: ${attachedPath}>` : void 0;
    }
    case "file": {
      const body = asString(segment.text);
      const location = asString(segment.path);
      return body && location ? `=== ${location} ===\n${body}` : void 0;
    }
    default:
      return void 0;
  }
}
|
|
352
398
|
async function buildPromptInputs(testCase) {
|
|
353
399
|
const guidelineContents = [];
|
|
354
400
|
for (const rawPath of testCase.guideline_paths) {
|
|
@@ -365,36 +411,168 @@ ${content}`);
|
|
|
365
411
|
logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
366
412
|
}
|
|
367
413
|
}
|
|
368
|
-
const
|
|
414
|
+
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
415
|
+
const segmentsByMessage = [];
|
|
416
|
+
const fileContentsByPath = /* @__PURE__ */ new Map();
|
|
369
417
|
for (const segment of testCase.input_segments) {
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
const pathValue = segment.path;
|
|
373
|
-
const textValue = segment.text;
|
|
374
|
-
const label = typeof pathValue === "string" ? pathValue : "file";
|
|
375
|
-
const body = typeof textValue === "string" ? textValue : "";
|
|
376
|
-
questionParts.push(`=== ${label} ===
|
|
377
|
-
${body}`);
|
|
378
|
-
continue;
|
|
418
|
+
if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
|
|
419
|
+
fileContentsByPath.set(segment.path, segment.text);
|
|
379
420
|
}
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
421
|
+
}
|
|
422
|
+
for (const message of testCase.input_messages) {
|
|
423
|
+
const messageSegments = [];
|
|
424
|
+
if (typeof message.content === "string") {
|
|
425
|
+
if (message.content.trim().length > 0) {
|
|
426
|
+
messageSegments.push({ type: "text", value: message.content });
|
|
427
|
+
}
|
|
428
|
+
} else if (Array.isArray(message.content)) {
|
|
429
|
+
for (const segment of message.content) {
|
|
430
|
+
if (typeof segment === "string") {
|
|
431
|
+
if (segment.trim().length > 0) {
|
|
432
|
+
messageSegments.push({ type: "text", value: segment });
|
|
433
|
+
}
|
|
434
|
+
} else if (isJsonObject(segment)) {
|
|
435
|
+
const type = asString(segment.type);
|
|
436
|
+
if (type === "file") {
|
|
437
|
+
const value = asString(segment.value);
|
|
438
|
+
if (!value) continue;
|
|
439
|
+
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
440
|
+
messageSegments.push({ type: "guideline_ref", path: value });
|
|
441
|
+
continue;
|
|
442
|
+
}
|
|
443
|
+
const fileText = fileContentsByPath.get(value);
|
|
444
|
+
if (fileText !== void 0) {
|
|
445
|
+
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
446
|
+
}
|
|
447
|
+
} else if (type === "text") {
|
|
448
|
+
const textValue = asString(segment.value);
|
|
449
|
+
if (textValue && textValue.trim().length > 0) {
|
|
450
|
+
messageSegments.push({ type: "text", value: textValue });
|
|
451
|
+
}
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
}
|
|
455
|
+
}
|
|
456
|
+
segmentsByMessage.push(messageSegments);
|
|
457
|
+
}
|
|
458
|
+
const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
|
|
459
|
+
let question;
|
|
460
|
+
if (useRoleMarkers) {
|
|
461
|
+
const messageParts = [];
|
|
462
|
+
for (let i = 0; i < testCase.input_messages.length; i++) {
|
|
463
|
+
const message = testCase.input_messages[i];
|
|
464
|
+
const segments = segmentsByMessage[i];
|
|
465
|
+
if (!hasVisibleContent(segments)) {
|
|
466
|
+
continue;
|
|
467
|
+
}
|
|
468
|
+
const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
|
|
469
|
+
const contentParts = [];
|
|
470
|
+
for (const segment of segments) {
|
|
471
|
+
const formattedContent = formatSegment(segment);
|
|
472
|
+
if (formattedContent) {
|
|
473
|
+
contentParts.push(formattedContent);
|
|
474
|
+
}
|
|
475
|
+
}
|
|
476
|
+
if (contentParts.length > 0) {
|
|
477
|
+
const messageContent = contentParts.join("\n");
|
|
478
|
+
messageParts.push(`@[${roleLabel}]:
|
|
479
|
+
${messageContent}`);
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
question = messageParts.join("\n\n");
|
|
483
|
+
} else {
|
|
484
|
+
const questionParts = [];
|
|
485
|
+
for (const segment of testCase.input_segments) {
|
|
486
|
+
const formattedContent = formatSegment(segment);
|
|
487
|
+
if (formattedContent) {
|
|
488
|
+
questionParts.push(formattedContent);
|
|
489
|
+
}
|
|
490
|
+
}
|
|
491
|
+
if (testCase.code_snippets.length > 0) {
|
|
492
|
+
questionParts.push(testCase.code_snippets.join("\n"));
|
|
493
|
+
}
|
|
494
|
+
question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
495
|
+
}
|
|
496
|
+
const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
|
|
497
|
+
messages: testCase.input_messages,
|
|
498
|
+
segmentsByMessage,
|
|
499
|
+
guidelinePatterns: testCase.guideline_patterns,
|
|
500
|
+
guidelineContent: guidelines
|
|
501
|
+
}) : void 0;
|
|
502
|
+
return { question, guidelines, chatPrompt };
|
|
503
|
+
}
|
|
504
|
+
function buildChatPromptFromSegments(options) {
|
|
505
|
+
const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
|
|
506
|
+
if (messages.length === 0) {
|
|
507
|
+
return void 0;
|
|
508
|
+
}
|
|
509
|
+
const systemSegments = [];
|
|
510
|
+
if (systemPrompt && systemPrompt.trim().length > 0) {
|
|
511
|
+
systemSegments.push(systemPrompt.trim());
|
|
512
|
+
}
|
|
513
|
+
if (guidelineContent && guidelineContent.trim().length > 0) {
|
|
514
|
+
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
515
|
+
|
|
516
|
+
${guidelineContent.trim()}`);
|
|
517
|
+
}
|
|
518
|
+
let startIndex = 0;
|
|
519
|
+
while (startIndex < messages.length && messages[startIndex].role === "system") {
|
|
520
|
+
const segments = segmentsByMessage[startIndex];
|
|
521
|
+
const contentParts = [];
|
|
522
|
+
for (const segment of segments) {
|
|
523
|
+
const formatted = formatSegment(segment);
|
|
524
|
+
if (formatted) {
|
|
525
|
+
contentParts.push(formatted);
|
|
384
526
|
}
|
|
385
|
-
continue;
|
|
386
527
|
}
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
questionParts.push(genericValue);
|
|
528
|
+
if (contentParts.length > 0) {
|
|
529
|
+
systemSegments.push(contentParts.join("\n"));
|
|
390
530
|
}
|
|
531
|
+
startIndex += 1;
|
|
391
532
|
}
|
|
392
|
-
|
|
393
|
-
|
|
533
|
+
const chatPrompt = [];
|
|
534
|
+
if (systemSegments.length > 0) {
|
|
535
|
+
chatPrompt.push({
|
|
536
|
+
role: "system",
|
|
537
|
+
content: systemSegments.join("\n\n")
|
|
538
|
+
});
|
|
394
539
|
}
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
540
|
+
for (let i = startIndex; i < messages.length; i++) {
|
|
541
|
+
const message = messages[i];
|
|
542
|
+
const segments = segmentsByMessage[i];
|
|
543
|
+
const contentParts = [];
|
|
544
|
+
let role = message.role;
|
|
545
|
+
let name;
|
|
546
|
+
if (role === "system") {
|
|
547
|
+
role = "assistant";
|
|
548
|
+
contentParts.push("@[System]:");
|
|
549
|
+
} else if (role === "tool") {
|
|
550
|
+
role = "function";
|
|
551
|
+
name = "tool";
|
|
552
|
+
}
|
|
553
|
+
for (const segment of segments) {
|
|
554
|
+
if (segment.type === "guideline_ref") {
|
|
555
|
+
continue;
|
|
556
|
+
}
|
|
557
|
+
const formatted = formatSegment(segment);
|
|
558
|
+
if (formatted) {
|
|
559
|
+
const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
|
|
560
|
+
if (isGuidelineRef) {
|
|
561
|
+
continue;
|
|
562
|
+
}
|
|
563
|
+
contentParts.push(formatted);
|
|
564
|
+
}
|
|
565
|
+
}
|
|
566
|
+
if (contentParts.length === 0) {
|
|
567
|
+
continue;
|
|
568
|
+
}
|
|
569
|
+
chatPrompt.push({
|
|
570
|
+
role,
|
|
571
|
+
content: contentParts.join("\n"),
|
|
572
|
+
...name ? { name } : {}
|
|
573
|
+
});
|
|
574
|
+
}
|
|
575
|
+
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
398
576
|
}
|
|
399
577
|
async function fileExists2(absolutePath) {
|
|
400
578
|
try {
|
|
@@ -591,21 +769,14 @@ import { AxAI } from "@ax-llm/ax";
|
|
|
591
769
|
var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
|
|
592
770
|
function buildChatPrompt(request) {
|
|
593
771
|
if (request.chatPrompt) {
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
} else {
|
|
601
|
-
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
602
|
-
}
|
|
603
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
604
|
-
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
605
|
-
|
|
606
|
-
${request.guidelines.trim()}`);
|
|
772
|
+
const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
|
|
773
|
+
if (hasSystemMessage) {
|
|
774
|
+
return request.chatPrompt;
|
|
775
|
+
}
|
|
776
|
+
const systemContent2 = resolveSystemContent(request);
|
|
777
|
+
return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
|
|
607
778
|
}
|
|
608
|
-
const systemContent =
|
|
779
|
+
const systemContent = resolveSystemContent(request);
|
|
609
780
|
const userContent = request.question.trim();
|
|
610
781
|
const prompt = [
|
|
611
782
|
{
|
|
@@ -619,6 +790,21 @@ ${request.guidelines.trim()}`);
|
|
|
619
790
|
];
|
|
620
791
|
return prompt;
|
|
621
792
|
}
|
|
793
|
+
/**
 * Builds the system message text for a provider request.
 *
 * The first section is `request.metadata.systemPrompt` (trimmed) when it is a
 * non-blank string, otherwise DEFAULT_SYSTEM_PROMPT. Non-blank
 * `request.guidelines` are appended under a "[[ ## Guidelines ## ]]" heading.
 * Sections are separated by a blank line.
 *
 * @param {object} request - Provider request; reads `metadata.systemPrompt` and `guidelines`.
 * @returns {string} The assembled system content.
 */
function resolveSystemContent(request) {
  const rawOverride = request.metadata?.systemPrompt;
  const override = typeof rawOverride === "string" ? rawOverride.trim() : "";
  const sections = [override.length > 0 ? override : DEFAULT_SYSTEM_PROMPT];

  const guidelineText = request.guidelines ? request.guidelines.trim() : "";
  if (guidelineText.length > 0) {
    sections.push(`[[ ## Guidelines ## ]]\n\n${guidelineText}`);
  }
  return sections.join("\n\n");
}
|
|
622
808
|
function extractModelConfig(request, defaults) {
|
|
623
809
|
const temperature = request.temperature ?? defaults.temperature;
|
|
624
810
|
const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
|
|
@@ -662,6 +848,67 @@ function ensureChatResponse(result) {
|
|
|
662
848
|
}
|
|
663
849
|
return result;
|
|
664
850
|
}
|
|
851
|
+
/**
 * Classifies an error as retryable or not.
 *
 * Checks, in order:
 * 1. a numeric `status` property, compared against `retryableStatusCodes`;
 * 2. an "HTTP <3 digits>" fragment in a string `message`, compared the same way;
 * 3. a `name` equal to "AxAIServiceNetworkError".
 * The first check that applies decides the result.
 *
 * @param {unknown} error - The thrown value.
 * @param {number[]} retryableStatusCodes - HTTP status codes that should be retried.
 * @returns {boolean} True when the error should be retried.
 */
function isRetryableError(error, retryableStatusCodes) {
  if (typeof error !== "object" || error === null) {
    return false;
  }
  if ("status" in error && typeof error.status === "number") {
    return retryableStatusCodes.includes(error.status);
  }
  const embeddedStatus =
    "message" in error && typeof error.message === "string"
      ? /HTTP (\d{3})/.exec(error.message)
      : null;
  if (embeddedStatus) {
    return retryableStatusCodes.includes(Number.parseInt(embeddedStatus[1], 10));
  }
  return "name" in error && error.name === "AxAIServiceNetworkError";
}
|
|
870
|
+
/**
 * Computes the backoff delay for a retry attempt.
 *
 * The base delay grows exponentially (`initialDelayMs * backoffFactor ** attempt`)
 * and is capped at `maxDelayMs`. A random jitter factor in [0.75, 1.25) is then
 * applied. The jittered value is clamped again so the returned delay never
 * exceeds `maxDelayMs`; without the second clamp the jitter could push the
 * delay up to 25% above the configured maximum.
 *
 * @param {number} attempt - Zero-based attempt index.
 * @param {{initialDelayMs: number, maxDelayMs: number, backoffFactor: number}} config - Backoff settings.
 * @returns {number} Delay in milliseconds, at most `config.maxDelayMs`.
 */
function calculateRetryDelay(attempt, config) {
  const baseDelay = Math.min(
    config.maxDelayMs,
    config.initialDelayMs * config.backoffFactor ** attempt
  );
  const jitteredDelay = baseDelay * (0.75 + Math.random() * 0.5);
  return Math.min(config.maxDelayMs, jitteredDelay);
}
|
|
877
|
+
/**
 * Pauses for the given number of milliseconds.
 *
 * @param {number} ms - Time to wait, passed to setTimeout.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
async function sleep(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
880
|
+
/**
 * Runs `fn`, retrying with exponential backoff when it fails with a retryable error.
 *
 * Defaults when `retryConfig` omits a field: 3 retries, 1000 ms initial delay,
 * 60000 ms maximum delay, backoff factor 2, and retryable status codes
 * 500, 408, 429, 502, 503, 504.
 *
 * Cancellation: the signal is checked before every attempt and after every
 * backoff wait. The backoff wait also ends as soon as the signal fires, so a
 * cancelled request does not sit out the remaining delay.
 *
 * @param {() => Promise<any>} fn - Operation to attempt.
 * @param {object} [retryConfig] - Optional overrides: maxRetries, initialDelayMs,
 *   maxDelayMs, backoffFactor, retryableStatusCodes.
 * @param {AbortSignal} [signal] - Optional cancellation signal.
 * @returns {Promise<any>} The value `fn` resolves with.
 * @throws {Error} "Request aborted: ..." when the signal is aborted; otherwise the
 *   non-retryable error, or the last error once retries are exhausted.
 */
async function withRetry(fn, retryConfig, signal) {
  const config = {
    maxRetries: retryConfig?.maxRetries ?? 3,
    initialDelayMs: retryConfig?.initialDelayMs ?? 1e3,
    maxDelayMs: retryConfig?.maxDelayMs ?? 6e4,
    backoffFactor: retryConfig?.backoffFactor ?? 2,
    retryableStatusCodes: retryConfig?.retryableStatusCodes ?? [500, 408, 429, 502, 503, 504]
  };

  const throwIfAborted = () => {
    if (signal?.aborted) {
      throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
    }
  };

  // Resolves after `ms`, or earlier if the signal aborts. It never rejects:
  // the caller's abort check right after the wait produces the error.
  const waitForBackoff = (ms) =>
    new Promise((resolve) => {
      if (!signal || typeof signal.addEventListener !== "function") {
        setTimeout(resolve, ms);
        return;
      }
      if (signal.aborted) {
        resolve();
        return;
      }
      const finish = () => {
        clearTimeout(timer);
        signal.removeEventListener("abort", finish);
        resolve();
      };
      const timer = setTimeout(finish, ms);
      signal.addEventListener("abort", finish, { once: true });
    });

  let lastError;
  for (let attempt = 0; attempt <= config.maxRetries; attempt++) {
    throwIfAborted();
    try {
      return await fn();
    } catch (error) {
      lastError = error;
      if (attempt >= config.maxRetries) {
        break;
      }
      if (!isRetryableError(error, config.retryableStatusCodes)) {
        throw error;
      }
      await waitForBackoff(calculateRetryDelay(attempt, config));
      throwIfAborted();
    }
  }
  throw lastError;
}
|
|
665
912
|
var AzureProvider = class {
|
|
666
913
|
constructor(targetName, config) {
|
|
667
914
|
this.config = config;
|
|
@@ -671,6 +918,7 @@ var AzureProvider = class {
|
|
|
671
918
|
temperature: config.temperature,
|
|
672
919
|
maxOutputTokens: config.maxOutputTokens
|
|
673
920
|
};
|
|
921
|
+
this.retryConfig = config.retry;
|
|
674
922
|
this.ai = AxAI.create({
|
|
675
923
|
name: "azure-openai",
|
|
676
924
|
apiKey: config.apiKey,
|
|
@@ -687,16 +935,21 @@ var AzureProvider = class {
|
|
|
687
935
|
targetName;
|
|
688
936
|
ai;
|
|
689
937
|
defaults;
|
|
938
|
+
retryConfig;
|
|
690
939
|
async invoke(request) {
|
|
691
940
|
const chatPrompt = buildChatPrompt(request);
|
|
692
941
|
const modelConfig = extractModelConfig(request, this.defaults);
|
|
693
|
-
const response = await
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
942
|
+
const response = await withRetry(
|
|
943
|
+
async () => await this.ai.chat(
|
|
944
|
+
{
|
|
945
|
+
chatPrompt,
|
|
946
|
+
model: this.config.deploymentName,
|
|
947
|
+
...modelConfig ? { modelConfig } : {}
|
|
948
|
+
},
|
|
949
|
+
request.signal ? { abortSignal: request.signal } : void 0
|
|
950
|
+
),
|
|
951
|
+
this.retryConfig,
|
|
952
|
+
request.signal
|
|
700
953
|
);
|
|
701
954
|
return mapResponse(ensureChatResponse(response));
|
|
702
955
|
}
|
|
@@ -714,6 +967,7 @@ var AnthropicProvider = class {
|
|
|
714
967
|
maxOutputTokens: config.maxOutputTokens,
|
|
715
968
|
thinkingBudget: config.thinkingBudget
|
|
716
969
|
};
|
|
970
|
+
this.retryConfig = config.retry;
|
|
717
971
|
this.ai = AxAI.create({
|
|
718
972
|
name: "anthropic",
|
|
719
973
|
apiKey: config.apiKey
|
|
@@ -724,16 +978,21 @@ var AnthropicProvider = class {
|
|
|
724
978
|
targetName;
|
|
725
979
|
ai;
|
|
726
980
|
defaults;
|
|
981
|
+
retryConfig;
|
|
727
982
|
async invoke(request) {
|
|
728
983
|
const chatPrompt = buildChatPrompt(request);
|
|
729
984
|
const modelConfig = extractModelConfig(request, this.defaults);
|
|
730
|
-
const response = await
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
985
|
+
const response = await withRetry(
|
|
986
|
+
async () => await this.ai.chat(
|
|
987
|
+
{
|
|
988
|
+
chatPrompt,
|
|
989
|
+
model: this.config.model,
|
|
990
|
+
...modelConfig ? { modelConfig } : {}
|
|
991
|
+
},
|
|
992
|
+
request.signal ? { abortSignal: request.signal } : void 0
|
|
993
|
+
),
|
|
994
|
+
this.retryConfig,
|
|
995
|
+
request.signal
|
|
737
996
|
);
|
|
738
997
|
return mapResponse(ensureChatResponse(response));
|
|
739
998
|
}
|
|
@@ -750,6 +1009,7 @@ var GeminiProvider = class {
|
|
|
750
1009
|
temperature: config.temperature,
|
|
751
1010
|
maxOutputTokens: config.maxOutputTokens
|
|
752
1011
|
};
|
|
1012
|
+
this.retryConfig = config.retry;
|
|
753
1013
|
this.ai = AxAI.create({
|
|
754
1014
|
name: "google-gemini",
|
|
755
1015
|
apiKey: config.apiKey
|
|
@@ -760,16 +1020,21 @@ var GeminiProvider = class {
|
|
|
760
1020
|
targetName;
|
|
761
1021
|
ai;
|
|
762
1022
|
defaults;
|
|
1023
|
+
retryConfig;
|
|
763
1024
|
async invoke(request) {
|
|
764
1025
|
const chatPrompt = buildChatPrompt(request);
|
|
765
1026
|
const modelConfig = extractModelConfig(request, this.defaults);
|
|
766
|
-
const response = await
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
1027
|
+
const response = await withRetry(
|
|
1028
|
+
async () => await this.ai.chat(
|
|
1029
|
+
{
|
|
1030
|
+
chatPrompt,
|
|
1031
|
+
model: this.config.model,
|
|
1032
|
+
...modelConfig ? { modelConfig } : {}
|
|
1033
|
+
},
|
|
1034
|
+
request.signal ? { abortSignal: request.signal } : void 0
|
|
1035
|
+
),
|
|
1036
|
+
this.retryConfig,
|
|
1037
|
+
request.signal
|
|
773
1038
|
);
|
|
774
1039
|
return mapResponse(ensureChatResponse(response));
|
|
775
1040
|
}
|
|
@@ -839,10 +1104,9 @@ var CliProvider = class {
|
|
|
839
1104
|
const outputFilePath = generateOutputFilePath(request.evalCaseId);
|
|
840
1105
|
const templateValues = buildTemplateValues(request, this.config, outputFilePath);
|
|
841
1106
|
const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
|
|
842
|
-
const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
|
|
843
1107
|
const result = await this.runCommand(renderedCommand, {
|
|
844
1108
|
cwd: this.config.cwd,
|
|
845
|
-
env,
|
|
1109
|
+
env: process.env,
|
|
846
1110
|
timeoutMs: this.config.timeoutMs,
|
|
847
1111
|
signal: request.signal
|
|
848
1112
|
});
|
|
@@ -931,10 +1195,9 @@ var CliProvider = class {
|
|
|
931
1195
|
generateOutputFilePath("healthcheck")
|
|
932
1196
|
)
|
|
933
1197
|
);
|
|
934
|
-
const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
|
|
935
1198
|
const result = await this.runCommand(renderedCommand, {
|
|
936
1199
|
cwd: healthcheck.cwd ?? this.config.cwd,
|
|
937
|
-
env,
|
|
1200
|
+
env: process.env,
|
|
938
1201
|
timeoutMs,
|
|
939
1202
|
signal
|
|
940
1203
|
});
|
|
@@ -2167,20 +2430,13 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
2167
2430
|
}
|
|
2168
2431
|
const name = value.name;
|
|
2169
2432
|
const provider = value.provider;
|
|
2170
|
-
const settings = value.settings;
|
|
2171
|
-
const judgeTarget = value.judge_target;
|
|
2172
2433
|
if (typeof name !== "string" || name.trim().length === 0) {
|
|
2173
2434
|
throw new Error(`targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`);
|
|
2174
2435
|
}
|
|
2175
2436
|
if (typeof provider !== "string" || provider.trim().length === 0) {
|
|
2176
2437
|
throw new Error(`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider'`);
|
|
2177
2438
|
}
|
|
2178
|
-
return
|
|
2179
|
-
name,
|
|
2180
|
-
provider,
|
|
2181
|
-
settings: isRecord(settings) ? settings : void 0,
|
|
2182
|
-
judge_target: typeof judgeTarget === "string" ? judgeTarget : void 0
|
|
2183
|
-
};
|
|
2439
|
+
return value;
|
|
2184
2440
|
}
|
|
2185
2441
|
async function fileExists3(filePath) {
|
|
2186
2442
|
try {
|
|
@@ -2260,19 +2516,21 @@ var LlmJudgeEvaluator = class {
|
|
|
2260
2516
|
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2261
2517
|
}
|
|
2262
2518
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
2263
|
-
|
|
2264
|
-
|
|
2519
|
+
const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
|
|
2520
|
+
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
2521
|
+
let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
|
|
2522
|
+
let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
|
|
2265
2523
|
if (systemPrompt && hasTemplateVariables(systemPrompt)) {
|
|
2266
2524
|
const variables = {
|
|
2267
2525
|
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
2268
2526
|
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
2269
2527
|
candidate_answer: context.candidate,
|
|
2270
|
-
reference_answer: context.evalCase.reference_answer,
|
|
2528
|
+
reference_answer: context.evalCase.reference_answer ?? "",
|
|
2271
2529
|
expected_outcome: context.evalCase.expected_outcome,
|
|
2272
|
-
question:
|
|
2530
|
+
question: formattedQuestion
|
|
2273
2531
|
};
|
|
2274
2532
|
prompt = substituteVariables(systemPrompt, variables);
|
|
2275
|
-
systemPrompt =
|
|
2533
|
+
systemPrompt = buildSystemPrompt(hasReferenceAnswer);
|
|
2276
2534
|
}
|
|
2277
2535
|
const metadata = {
|
|
2278
2536
|
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
@@ -2310,38 +2568,51 @@ var LlmJudgeEvaluator = class {
|
|
|
2310
2568
|
};
|
|
2311
2569
|
}
|
|
2312
2570
|
};
|
|
2313
|
-
|
|
2314
|
-
|
|
2315
|
-
|
|
2316
|
-
|
|
2317
|
-
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
|
|
2321
|
-
|
|
2322
|
-
|
|
2323
|
-
|
|
2324
|
-
|
|
2325
|
-
|
|
2326
|
-
|
|
2327
|
-
|
|
2328
|
-
|
|
2329
|
-
|
|
2571
|
+
/**
 * Assembles the judge's system prompt.
 *
 * The prompt always contains the evaluator role statement, the conciseness
 * instruction and the JSON response schema. Guidance on treating the
 * reference_answer as a gold standard is inserted after the role statement
 * only when `hasReferenceAnswer` is truthy.
 *
 * @param {boolean} hasReferenceAnswer - Whether the eval case supplies a reference answer.
 * @returns {string} Prompt lines joined with "\n".
 */
function buildSystemPrompt(hasReferenceAnswer) {
  const referenceGuidance = hasReferenceAnswer
    ? [
        "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
        ""
      ]
    : [];
  const promptLines = [
    "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
    "",
    ...referenceGuidance,
    "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
    "",
    "You must respond with a single JSON object matching this schema:",
    "",
    "{",
    ' "score": <number between 0.0 and 1.0>,',
    ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
    ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
    ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
    "}"
  ];
  return promptLines.join("\n");
}
|
|
2596
|
+
function buildQualityPrompt(evalCase, candidate, question) {
|
|
2330
2597
|
const parts = [
|
|
2331
2598
|
"[[ ## expected_outcome ## ]]",
|
|
2332
2599
|
evalCase.expected_outcome.trim(),
|
|
2333
2600
|
"",
|
|
2334
2601
|
"[[ ## question ## ]]",
|
|
2335
|
-
|
|
2336
|
-
""
|
|
2337
|
-
"[[ ## reference_answer ## ]]",
|
|
2338
|
-
evalCase.reference_answer.trim(),
|
|
2339
|
-
"",
|
|
2340
|
-
"[[ ## candidate_answer ## ]]",
|
|
2341
|
-
candidate.trim(),
|
|
2342
|
-
"",
|
|
2343
|
-
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
2602
|
+
question.trim(),
|
|
2603
|
+
""
|
|
2344
2604
|
];
|
|
2605
|
+
if (hasNonEmptyReferenceAnswer(evalCase)) {
|
|
2606
|
+
parts.push(
|
|
2607
|
+
"[[ ## reference_answer ## ]]",
|
|
2608
|
+
evalCase.reference_answer.trim(),
|
|
2609
|
+
""
|
|
2610
|
+
);
|
|
2611
|
+
}
|
|
2612
|
+
parts.push(
|
|
2613
|
+
"[[ ## candidate_answer ## ]]",
|
|
2614
|
+
candidate.trim()
|
|
2615
|
+
);
|
|
2345
2616
|
return parts.join("\n");
|
|
2346
2617
|
}
|
|
2347
2618
|
function clampScore(value) {
|
|
@@ -2424,6 +2695,9 @@ function extractJsonBlob(text) {
|
|
|
2424
2695
|
/**
 * Tests whether a value is a string with at least one non-whitespace character.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} True for non-blank strings, false for everything else.
 */
function isNonEmptyString(value) {
  if (typeof value !== "string") {
    return false;
  }
  return value.trim() !== "";
}
|
|
2698
|
+
/**
 * Tells whether an eval case carries a usable reference answer.
 *
 * Only a string with at least one non-whitespace character counts. A missing,
 * null or non-string `reference_answer` yields false instead of throwing when
 * `.trim()` is called on it.
 *
 * @param {{reference_answer?: unknown}} evalCase - Eval case to inspect.
 * @returns {boolean} True when `reference_answer` is a non-blank string.
 */
function hasNonEmptyReferenceAnswer(evalCase) {
  const reference = evalCase.reference_answer;
  return typeof reference === "string" && reference.trim().length > 0;
}
|
|
2427
2701
|
var CodeEvaluator = class {
|
|
2428
2702
|
kind = "code";
|
|
2429
2703
|
script;
|
|
@@ -2821,10 +3095,11 @@ async function runEvaluation(options) {
|
|
|
2821
3095
|
await onProgress({
|
|
2822
3096
|
workerId,
|
|
2823
3097
|
evalId: evalCase.id,
|
|
2824
|
-
status: "completed",
|
|
3098
|
+
status: result.error ? "failed" : "completed",
|
|
2825
3099
|
startedAt: 0,
|
|
2826
3100
|
// Not used for completed status
|
|
2827
|
-
completedAt: Date.now()
|
|
3101
|
+
completedAt: Date.now(),
|
|
3102
|
+
error: result.error
|
|
2828
3103
|
});
|
|
2829
3104
|
}
|
|
2830
3105
|
if (onResult) {
|
|
@@ -3081,11 +3356,27 @@ async function evaluateCandidate(options) {
|
|
|
3081
3356
|
agentTimeoutMs
|
|
3082
3357
|
});
|
|
3083
3358
|
const completedAt = nowFn();
|
|
3084
|
-
|
|
3085
|
-
|
|
3086
|
-
|
|
3087
|
-
|
|
3088
|
-
|
|
3359
|
+
let agentProviderRequest;
|
|
3360
|
+
let lmProviderRequest;
|
|
3361
|
+
if (isAgentProvider(provider)) {
|
|
3362
|
+
agentProviderRequest = {
|
|
3363
|
+
question: promptInputs.question,
|
|
3364
|
+
guideline_paths: evalCase.guideline_paths
|
|
3365
|
+
};
|
|
3366
|
+
} else {
|
|
3367
|
+
if (promptInputs.chatPrompt) {
|
|
3368
|
+
lmProviderRequest = {
|
|
3369
|
+
chat_prompt: promptInputs.chatPrompt,
|
|
3370
|
+
guideline_paths: evalCase.guideline_paths
|
|
3371
|
+
};
|
|
3372
|
+
} else {
|
|
3373
|
+
lmProviderRequest = {
|
|
3374
|
+
question: promptInputs.question,
|
|
3375
|
+
guidelines: promptInputs.guidelines,
|
|
3376
|
+
guideline_paths: evalCase.guideline_paths
|
|
3377
|
+
};
|
|
3378
|
+
}
|
|
3379
|
+
}
|
|
3089
3380
|
return {
|
|
3090
3381
|
eval_id: evalCase.id,
|
|
3091
3382
|
dataset: evalCase.dataset,
|
|
@@ -3099,7 +3390,8 @@ async function evaluateCandidate(options) {
|
|
|
3099
3390
|
timestamp: completedAt.toISOString(),
|
|
3100
3391
|
reasoning: score.reasoning,
|
|
3101
3392
|
raw_aspects: score.rawAspects,
|
|
3102
|
-
|
|
3393
|
+
agent_provider_request: agentProviderRequest,
|
|
3394
|
+
lm_provider_request: lmProviderRequest,
|
|
3103
3395
|
evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
3104
3396
|
evaluator_results: evaluatorResults
|
|
3105
3397
|
};
|
|
@@ -3328,6 +3620,7 @@ async function invokeProvider(provider, options) {
|
|
|
3328
3620
|
question: promptInputs.question,
|
|
3329
3621
|
guidelines: promptInputs.guidelines,
|
|
3330
3622
|
guideline_patterns: evalCase.guideline_patterns,
|
|
3623
|
+
chatPrompt: promptInputs.chatPrompt,
|
|
3331
3624
|
inputFiles: evalCase.file_paths,
|
|
3332
3625
|
evalCaseId: evalCase.id,
|
|
3333
3626
|
attempt,
|
|
@@ -3344,12 +3637,30 @@ async function invokeProvider(provider, options) {
|
|
|
3344
3637
|
}
|
|
3345
3638
|
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
|
|
3346
3639
|
const message = error instanceof Error ? error.message : String(error);
|
|
3347
|
-
|
|
3348
|
-
|
|
3349
|
-
|
|
3350
|
-
|
|
3351
|
-
|
|
3352
|
-
|
|
3640
|
+
let agentProviderRequest;
|
|
3641
|
+
let lmProviderRequest;
|
|
3642
|
+
if (isAgentProvider(provider)) {
|
|
3643
|
+
agentProviderRequest = {
|
|
3644
|
+
question: promptInputs.question,
|
|
3645
|
+
guideline_paths: evalCase.guideline_paths,
|
|
3646
|
+
error: message
|
|
3647
|
+
};
|
|
3648
|
+
} else {
|
|
3649
|
+
if (promptInputs.chatPrompt) {
|
|
3650
|
+
lmProviderRequest = {
|
|
3651
|
+
chat_prompt: promptInputs.chatPrompt,
|
|
3652
|
+
guideline_paths: evalCase.guideline_paths,
|
|
3653
|
+
error: message
|
|
3654
|
+
};
|
|
3655
|
+
} else {
|
|
3656
|
+
lmProviderRequest = {
|
|
3657
|
+
question: promptInputs.question,
|
|
3658
|
+
guidelines: promptInputs.guidelines,
|
|
3659
|
+
guideline_paths: evalCase.guideline_paths,
|
|
3660
|
+
error: message
|
|
3661
|
+
};
|
|
3662
|
+
}
|
|
3663
|
+
}
|
|
3353
3664
|
return {
|
|
3354
3665
|
eval_id: evalCase.id,
|
|
3355
3666
|
dataset: evalCase.dataset,
|
|
@@ -3362,7 +3673,9 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
3362
3673
|
target: targetName,
|
|
3363
3674
|
timestamp: timestamp.toISOString(),
|
|
3364
3675
|
raw_aspects: [],
|
|
3365
|
-
|
|
3676
|
+
agent_provider_request: agentProviderRequest,
|
|
3677
|
+
lm_provider_request: lmProviderRequest,
|
|
3678
|
+
error: message
|
|
3366
3679
|
};
|
|
3367
3680
|
}
|
|
3368
3681
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
@@ -3373,6 +3686,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
|
3373
3686
|
hash.update(promptInputs.question);
|
|
3374
3687
|
hash.update(promptInputs.guidelines);
|
|
3375
3688
|
hash.update(promptInputs.systemMessage ?? "");
|
|
3689
|
+
if (promptInputs.chatPrompt) {
|
|
3690
|
+
hash.update(JSON.stringify(promptInputs.chatPrompt));
|
|
3691
|
+
}
|
|
3376
3692
|
return hash.digest("hex");
|
|
3377
3693
|
}
|
|
3378
3694
|
function isTimeoutLike(error) {
|