@agentv/core 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-SNTZFB24.js → chunk-YQBJAT5I.js} +1 -1
- package/dist/{chunk-SNTZFB24.js.map → chunk-YQBJAT5I.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs +30 -13
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +21 -4
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +335 -91
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +67 -62
- package/dist/index.d.ts +67 -62
- package/dist/index.js +336 -92
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
readTextFile,
|
|
10
10
|
resolveFileReference,
|
|
11
11
|
resolveTargetDefinition
|
|
12
|
-
} from "./chunk-SNTZFB24.js";
|
|
12
|
+
} from "./chunk-YQBJAT5I.js";
|
|
13
13
|
|
|
14
14
|
// src/evaluation/types.ts
|
|
15
15
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -268,14 +268,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
268
268
|
logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
269
269
|
continue;
|
|
270
270
|
}
|
|
271
|
-
|
|
272
|
-
logWarning(`Eval case '${id}' missing expected_messages array`);
|
|
273
|
-
continue;
|
|
274
|
-
}
|
|
271
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
275
272
|
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
276
|
-
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
277
|
-
if (expectedMessages.length === 0) {
|
|
278
|
-
logWarning(`No expected message found for eval case: ${id}`);
|
|
273
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
274
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
275
|
+
logWarning(`No valid expected message found for eval case: ${id}`);
|
|
279
276
|
continue;
|
|
280
277
|
}
|
|
281
278
|
if (expectedMessages.length > 1) {
|
|
@@ -293,17 +290,17 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
293
290
|
messageType: "input",
|
|
294
291
|
verbose
|
|
295
292
|
});
|
|
296
|
-
const outputSegments = await processMessages({
|
|
293
|
+
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
297
294
|
messages: expectedMessages,
|
|
298
295
|
searchRoots,
|
|
299
296
|
repoRootPath,
|
|
300
297
|
guidelinePatterns,
|
|
301
298
|
messageType: "output",
|
|
302
299
|
verbose
|
|
303
|
-
});
|
|
300
|
+
}) : [];
|
|
304
301
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
305
302
|
const expectedContent = expectedMessages[0]?.content;
|
|
306
|
-
const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
|
|
303
|
+
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
307
304
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
308
305
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
309
306
|
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
@@ -322,6 +319,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
322
319
|
dataset: datasetName,
|
|
323
320
|
conversation_id: conversationId,
|
|
324
321
|
question,
|
|
322
|
+
input_messages: inputMessages,
|
|
325
323
|
input_segments: inputSegments,
|
|
326
324
|
output_segments: outputSegments,
|
|
327
325
|
reference_answer: referenceAnswer,
|
|
@@ -349,6 +347,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
349
347
|
}
|
|
350
348
|
return results;
|
|
351
349
|
}
|
|
350
|
+
function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
351
|
+
if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
|
|
352
|
+
return true;
|
|
353
|
+
}
|
|
354
|
+
let messagesWithContent = 0;
|
|
355
|
+
for (const segments of processedSegmentsByMessage) {
|
|
356
|
+
if (hasVisibleContent(segments)) {
|
|
357
|
+
messagesWithContent++;
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
return messagesWithContent > 1;
|
|
361
|
+
}
|
|
362
|
+
function hasVisibleContent(segments) {
|
|
363
|
+
return segments.some((segment) => {
|
|
364
|
+
const type = asString(segment.type);
|
|
365
|
+
if (type === "text") {
|
|
366
|
+
const value = asString(segment.value);
|
|
367
|
+
return value !== void 0 && value.trim().length > 0;
|
|
368
|
+
}
|
|
369
|
+
if (type === "guideline_ref") {
|
|
370
|
+
return false;
|
|
371
|
+
}
|
|
372
|
+
if (type === "file") {
|
|
373
|
+
const text = asString(segment.text);
|
|
374
|
+
return text !== void 0 && text.trim().length > 0;
|
|
375
|
+
}
|
|
376
|
+
return false;
|
|
377
|
+
});
|
|
378
|
+
}
|
|
379
|
+
function formatSegment(segment) {
|
|
380
|
+
const type = asString(segment.type);
|
|
381
|
+
if (type === "text") {
|
|
382
|
+
return asString(segment.value);
|
|
383
|
+
}
|
|
384
|
+
if (type === "guideline_ref") {
|
|
385
|
+
const refPath = asString(segment.path);
|
|
386
|
+
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
387
|
+
}
|
|
388
|
+
if (type === "file") {
|
|
389
|
+
const text = asString(segment.text);
|
|
390
|
+
const filePath = asString(segment.path);
|
|
391
|
+
if (text && filePath) {
|
|
392
|
+
return `=== ${filePath} ===
|
|
393
|
+
${text}`;
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
return void 0;
|
|
397
|
+
}
|
|
352
398
|
async function buildPromptInputs(testCase) {
|
|
353
399
|
const guidelineContents = [];
|
|
354
400
|
for (const rawPath of testCase.guideline_paths) {
|
|
@@ -365,36 +411,168 @@ ${content}`);
|
|
|
365
411
|
logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
366
412
|
}
|
|
367
413
|
}
|
|
368
|
-
const
|
|
414
|
+
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
415
|
+
const segmentsByMessage = [];
|
|
416
|
+
const fileContentsByPath = /* @__PURE__ */ new Map();
|
|
369
417
|
for (const segment of testCase.input_segments) {
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
const pathValue = segment.path;
|
|
373
|
-
const textValue = segment.text;
|
|
374
|
-
const label = typeof pathValue === "string" ? pathValue : "file";
|
|
375
|
-
const body = typeof textValue === "string" ? textValue : "";
|
|
376
|
-
questionParts.push(`=== ${label} ===
|
|
377
|
-
${body}`);
|
|
378
|
-
continue;
|
|
418
|
+
if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
|
|
419
|
+
fileContentsByPath.set(segment.path, segment.text);
|
|
379
420
|
}
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
421
|
+
}
|
|
422
|
+
for (const message of testCase.input_messages) {
|
|
423
|
+
const messageSegments = [];
|
|
424
|
+
if (typeof message.content === "string") {
|
|
425
|
+
if (message.content.trim().length > 0) {
|
|
426
|
+
messageSegments.push({ type: "text", value: message.content });
|
|
427
|
+
}
|
|
428
|
+
} else if (Array.isArray(message.content)) {
|
|
429
|
+
for (const segment of message.content) {
|
|
430
|
+
if (typeof segment === "string") {
|
|
431
|
+
if (segment.trim().length > 0) {
|
|
432
|
+
messageSegments.push({ type: "text", value: segment });
|
|
433
|
+
}
|
|
434
|
+
} else if (isJsonObject(segment)) {
|
|
435
|
+
const type = asString(segment.type);
|
|
436
|
+
if (type === "file") {
|
|
437
|
+
const value = asString(segment.value);
|
|
438
|
+
if (!value) continue;
|
|
439
|
+
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
440
|
+
messageSegments.push({ type: "guideline_ref", path: value });
|
|
441
|
+
continue;
|
|
442
|
+
}
|
|
443
|
+
const fileText = fileContentsByPath.get(value);
|
|
444
|
+
if (fileText !== void 0) {
|
|
445
|
+
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
446
|
+
}
|
|
447
|
+
} else if (type === "text") {
|
|
448
|
+
const textValue = asString(segment.value);
|
|
449
|
+
if (textValue && textValue.trim().length > 0) {
|
|
450
|
+
messageSegments.push({ type: "text", value: textValue });
|
|
451
|
+
}
|
|
452
|
+
}
|
|
453
|
+
}
|
|
384
454
|
}
|
|
385
|
-
continue;
|
|
386
455
|
}
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
456
|
+
segmentsByMessage.push(messageSegments);
|
|
457
|
+
}
|
|
458
|
+
const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
|
|
459
|
+
let question;
|
|
460
|
+
if (useRoleMarkers) {
|
|
461
|
+
const messageParts = [];
|
|
462
|
+
for (let i = 0; i < testCase.input_messages.length; i++) {
|
|
463
|
+
const message = testCase.input_messages[i];
|
|
464
|
+
const segments = segmentsByMessage[i];
|
|
465
|
+
if (!hasVisibleContent(segments)) {
|
|
466
|
+
continue;
|
|
467
|
+
}
|
|
468
|
+
const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
|
|
469
|
+
const contentParts = [];
|
|
470
|
+
for (const segment of segments) {
|
|
471
|
+
const formattedContent = formatSegment(segment);
|
|
472
|
+
if (formattedContent) {
|
|
473
|
+
contentParts.push(formattedContent);
|
|
474
|
+
}
|
|
475
|
+
}
|
|
476
|
+
if (contentParts.length > 0) {
|
|
477
|
+
const messageContent = contentParts.join("\n");
|
|
478
|
+
messageParts.push(`@[${roleLabel}]:
|
|
479
|
+
${messageContent}`);
|
|
480
|
+
}
|
|
390
481
|
}
|
|
482
|
+
question = messageParts.join("\n\n");
|
|
483
|
+
} else {
|
|
484
|
+
const questionParts = [];
|
|
485
|
+
for (const segment of testCase.input_segments) {
|
|
486
|
+
const formattedContent = formatSegment(segment);
|
|
487
|
+
if (formattedContent) {
|
|
488
|
+
questionParts.push(formattedContent);
|
|
489
|
+
}
|
|
490
|
+
}
|
|
491
|
+
if (testCase.code_snippets.length > 0) {
|
|
492
|
+
questionParts.push(testCase.code_snippets.join("\n"));
|
|
493
|
+
}
|
|
494
|
+
question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
391
495
|
}
|
|
392
|
-
|
|
393
|
-
|
|
496
|
+
const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
|
|
497
|
+
messages: testCase.input_messages,
|
|
498
|
+
segmentsByMessage,
|
|
499
|
+
guidelinePatterns: testCase.guideline_patterns,
|
|
500
|
+
guidelineContent: guidelines
|
|
501
|
+
}) : void 0;
|
|
502
|
+
return { question, guidelines, chatPrompt };
|
|
503
|
+
}
|
|
504
|
+
function buildChatPromptFromSegments(options) {
|
|
505
|
+
const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
|
|
506
|
+
if (messages.length === 0) {
|
|
507
|
+
return void 0;
|
|
394
508
|
}
|
|
395
|
-
const
|
|
396
|
-
|
|
397
|
-
|
|
509
|
+
const systemSegments = [];
|
|
510
|
+
if (systemPrompt && systemPrompt.trim().length > 0) {
|
|
511
|
+
systemSegments.push(systemPrompt.trim());
|
|
512
|
+
}
|
|
513
|
+
if (guidelineContent && guidelineContent.trim().length > 0) {
|
|
514
|
+
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
515
|
+
|
|
516
|
+
${guidelineContent.trim()}`);
|
|
517
|
+
}
|
|
518
|
+
let startIndex = 0;
|
|
519
|
+
while (startIndex < messages.length && messages[startIndex].role === "system") {
|
|
520
|
+
const segments = segmentsByMessage[startIndex];
|
|
521
|
+
const contentParts = [];
|
|
522
|
+
for (const segment of segments) {
|
|
523
|
+
const formatted = formatSegment(segment);
|
|
524
|
+
if (formatted) {
|
|
525
|
+
contentParts.push(formatted);
|
|
526
|
+
}
|
|
527
|
+
}
|
|
528
|
+
if (contentParts.length > 0) {
|
|
529
|
+
systemSegments.push(contentParts.join("\n"));
|
|
530
|
+
}
|
|
531
|
+
startIndex += 1;
|
|
532
|
+
}
|
|
533
|
+
const chatPrompt = [];
|
|
534
|
+
if (systemSegments.length > 0) {
|
|
535
|
+
chatPrompt.push({
|
|
536
|
+
role: "system",
|
|
537
|
+
content: systemSegments.join("\n\n")
|
|
538
|
+
});
|
|
539
|
+
}
|
|
540
|
+
for (let i = startIndex; i < messages.length; i++) {
|
|
541
|
+
const message = messages[i];
|
|
542
|
+
const segments = segmentsByMessage[i];
|
|
543
|
+
const contentParts = [];
|
|
544
|
+
let role = message.role;
|
|
545
|
+
let name;
|
|
546
|
+
if (role === "system") {
|
|
547
|
+
role = "assistant";
|
|
548
|
+
contentParts.push("@[System]:");
|
|
549
|
+
} else if (role === "tool") {
|
|
550
|
+
role = "function";
|
|
551
|
+
name = "tool";
|
|
552
|
+
}
|
|
553
|
+
for (const segment of segments) {
|
|
554
|
+
if (segment.type === "guideline_ref") {
|
|
555
|
+
continue;
|
|
556
|
+
}
|
|
557
|
+
const formatted = formatSegment(segment);
|
|
558
|
+
if (formatted) {
|
|
559
|
+
const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
|
|
560
|
+
if (isGuidelineRef) {
|
|
561
|
+
continue;
|
|
562
|
+
}
|
|
563
|
+
contentParts.push(formatted);
|
|
564
|
+
}
|
|
565
|
+
}
|
|
566
|
+
if (contentParts.length === 0) {
|
|
567
|
+
continue;
|
|
568
|
+
}
|
|
569
|
+
chatPrompt.push({
|
|
570
|
+
role,
|
|
571
|
+
content: contentParts.join("\n"),
|
|
572
|
+
...name ? { name } : {}
|
|
573
|
+
});
|
|
574
|
+
}
|
|
575
|
+
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
398
576
|
}
|
|
399
577
|
async function fileExists2(absolutePath) {
|
|
400
578
|
try {
|
|
@@ -591,21 +769,14 @@ import { AxAI } from "@ax-llm/ax";
|
|
|
591
769
|
var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
|
|
592
770
|
function buildChatPrompt(request) {
|
|
593
771
|
if (request.chatPrompt) {
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
} else {
|
|
601
|
-
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
602
|
-
}
|
|
603
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
604
|
-
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
605
|
-
|
|
606
|
-
${request.guidelines.trim()}`);
|
|
772
|
+
const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
|
|
773
|
+
if (hasSystemMessage) {
|
|
774
|
+
return request.chatPrompt;
|
|
775
|
+
}
|
|
776
|
+
const systemContent2 = resolveSystemContent(request);
|
|
777
|
+
return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
|
|
607
778
|
}
|
|
608
|
-
const systemContent =
|
|
779
|
+
const systemContent = resolveSystemContent(request);
|
|
609
780
|
const userContent = request.question.trim();
|
|
610
781
|
const prompt = [
|
|
611
782
|
{
|
|
@@ -619,6 +790,21 @@ ${request.guidelines.trim()}`);
|
|
|
619
790
|
];
|
|
620
791
|
return prompt;
|
|
621
792
|
}
|
|
793
|
+
function resolveSystemContent(request) {
|
|
794
|
+
const systemSegments = [];
|
|
795
|
+
const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
|
|
796
|
+
if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
|
|
797
|
+
systemSegments.push(metadataSystemPrompt.trim());
|
|
798
|
+
} else {
|
|
799
|
+
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
800
|
+
}
|
|
801
|
+
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
802
|
+
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
803
|
+
|
|
804
|
+
${request.guidelines.trim()}`);
|
|
805
|
+
}
|
|
806
|
+
return systemSegments.join("\n\n");
|
|
807
|
+
}
|
|
622
808
|
function extractModelConfig(request, defaults) {
|
|
623
809
|
const temperature = request.temperature ?? defaults.temperature;
|
|
624
810
|
const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
|
|
@@ -2330,19 +2516,21 @@ var LlmJudgeEvaluator = class {
|
|
|
2330
2516
|
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2331
2517
|
}
|
|
2332
2518
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
2333
|
-
|
|
2334
|
-
|
|
2519
|
+
const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
|
|
2520
|
+
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
2521
|
+
let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
|
|
2522
|
+
let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
|
|
2335
2523
|
if (systemPrompt && hasTemplateVariables(systemPrompt)) {
|
|
2336
2524
|
const variables = {
|
|
2337
2525
|
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
2338
2526
|
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
2339
2527
|
candidate_answer: context.candidate,
|
|
2340
|
-
reference_answer: context.evalCase.reference_answer,
|
|
2528
|
+
reference_answer: context.evalCase.reference_answer ?? "",
|
|
2341
2529
|
expected_outcome: context.evalCase.expected_outcome,
|
|
2342
|
-
question:
|
|
2530
|
+
question: formattedQuestion
|
|
2343
2531
|
};
|
|
2344
2532
|
prompt = substituteVariables(systemPrompt, variables);
|
|
2345
|
-
systemPrompt =
|
|
2533
|
+
systemPrompt = buildSystemPrompt(hasReferenceAnswer);
|
|
2346
2534
|
}
|
|
2347
2535
|
const metadata = {
|
|
2348
2536
|
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
@@ -2380,38 +2568,51 @@ var LlmJudgeEvaluator = class {
|
|
|
2380
2568
|
};
|
|
2381
2569
|
}
|
|
2382
2570
|
};
|
|
2383
|
-
|
|
2384
|
-
|
|
2385
|
-
|
|
2386
|
-
|
|
2387
|
-
|
|
2388
|
-
|
|
2389
|
-
|
|
2390
|
-
|
|
2391
|
-
|
|
2392
|
-
|
|
2393
|
-
|
|
2394
|
-
|
|
2395
|
-
|
|
2396
|
-
|
|
2397
|
-
|
|
2398
|
-
|
|
2399
|
-
|
|
2571
|
+
function buildSystemPrompt(hasReferenceAnswer) {
|
|
2572
|
+
const basePrompt = [
|
|
2573
|
+
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
2574
|
+
""
|
|
2575
|
+
];
|
|
2576
|
+
if (hasReferenceAnswer) {
|
|
2577
|
+
basePrompt.push(
|
|
2578
|
+
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
|
|
2579
|
+
""
|
|
2580
|
+
);
|
|
2581
|
+
}
|
|
2582
|
+
basePrompt.push(
|
|
2583
|
+
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
2584
|
+
"",
|
|
2585
|
+
"You must respond with a single JSON object matching this schema:",
|
|
2586
|
+
"",
|
|
2587
|
+
"{",
|
|
2588
|
+
' "score": <number between 0.0 and 1.0>,',
|
|
2589
|
+
' "hits": [<array of strings, max 4 items, brief specific achievements>],',
|
|
2590
|
+
' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
|
|
2591
|
+
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
2592
|
+
"}"
|
|
2593
|
+
);
|
|
2594
|
+
return basePrompt.join("\n");
|
|
2595
|
+
}
|
|
2596
|
+
function buildQualityPrompt(evalCase, candidate, question) {
|
|
2400
2597
|
const parts = [
|
|
2401
2598
|
"[[ ## expected_outcome ## ]]",
|
|
2402
2599
|
evalCase.expected_outcome.trim(),
|
|
2403
2600
|
"",
|
|
2404
2601
|
"[[ ## question ## ]]",
|
|
2405
|
-
|
|
2406
|
-
""
|
|
2407
|
-
"[[ ## reference_answer ## ]]",
|
|
2408
|
-
evalCase.reference_answer.trim(),
|
|
2409
|
-
"",
|
|
2410
|
-
"[[ ## candidate_answer ## ]]",
|
|
2411
|
-
candidate.trim(),
|
|
2412
|
-
"",
|
|
2413
|
-
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
2602
|
+
question.trim(),
|
|
2603
|
+
""
|
|
2414
2604
|
];
|
|
2605
|
+
if (hasNonEmptyReferenceAnswer(evalCase)) {
|
|
2606
|
+
parts.push(
|
|
2607
|
+
"[[ ## reference_answer ## ]]",
|
|
2608
|
+
evalCase.reference_answer.trim(),
|
|
2609
|
+
""
|
|
2610
|
+
);
|
|
2611
|
+
}
|
|
2612
|
+
parts.push(
|
|
2613
|
+
"[[ ## candidate_answer ## ]]",
|
|
2614
|
+
candidate.trim()
|
|
2615
|
+
);
|
|
2415
2616
|
return parts.join("\n");
|
|
2416
2617
|
}
|
|
2417
2618
|
function clampScore(value) {
|
|
@@ -2494,6 +2695,9 @@ function extractJsonBlob(text) {
|
|
|
2494
2695
|
function isNonEmptyString(value) {
|
|
2495
2696
|
return typeof value === "string" && value.trim().length > 0;
|
|
2496
2697
|
}
|
|
2698
|
+
function hasNonEmptyReferenceAnswer(evalCase) {
|
|
2699
|
+
return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
|
|
2700
|
+
}
|
|
2497
2701
|
var CodeEvaluator = class {
|
|
2498
2702
|
kind = "code";
|
|
2499
2703
|
script;
|
|
@@ -3152,11 +3356,27 @@ async function evaluateCandidate(options) {
|
|
|
3152
3356
|
agentTimeoutMs
|
|
3153
3357
|
});
|
|
3154
3358
|
const completedAt = nowFn();
|
|
3155
|
-
|
|
3156
|
-
|
|
3157
|
-
|
|
3158
|
-
|
|
3159
|
-
|
|
3359
|
+
let agentProviderRequest;
|
|
3360
|
+
let lmProviderRequest;
|
|
3361
|
+
if (isAgentProvider(provider)) {
|
|
3362
|
+
agentProviderRequest = {
|
|
3363
|
+
question: promptInputs.question,
|
|
3364
|
+
guideline_paths: evalCase.guideline_paths
|
|
3365
|
+
};
|
|
3366
|
+
} else {
|
|
3367
|
+
if (promptInputs.chatPrompt) {
|
|
3368
|
+
lmProviderRequest = {
|
|
3369
|
+
chat_prompt: promptInputs.chatPrompt,
|
|
3370
|
+
guideline_paths: evalCase.guideline_paths
|
|
3371
|
+
};
|
|
3372
|
+
} else {
|
|
3373
|
+
lmProviderRequest = {
|
|
3374
|
+
question: promptInputs.question,
|
|
3375
|
+
guidelines: promptInputs.guidelines,
|
|
3376
|
+
guideline_paths: evalCase.guideline_paths
|
|
3377
|
+
};
|
|
3378
|
+
}
|
|
3379
|
+
}
|
|
3160
3380
|
return {
|
|
3161
3381
|
eval_id: evalCase.id,
|
|
3162
3382
|
dataset: evalCase.dataset,
|
|
@@ -3170,7 +3390,8 @@ async function evaluateCandidate(options) {
|
|
|
3170
3390
|
timestamp: completedAt.toISOString(),
|
|
3171
3391
|
reasoning: score.reasoning,
|
|
3172
3392
|
raw_aspects: score.rawAspects,
|
|
3173
|
-
|
|
3393
|
+
agent_provider_request: agentProviderRequest,
|
|
3394
|
+
lm_provider_request: lmProviderRequest,
|
|
3174
3395
|
evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
3175
3396
|
evaluator_results: evaluatorResults
|
|
3176
3397
|
};
|
|
@@ -3399,6 +3620,7 @@ async function invokeProvider(provider, options) {
|
|
|
3399
3620
|
question: promptInputs.question,
|
|
3400
3621
|
guidelines: promptInputs.guidelines,
|
|
3401
3622
|
guideline_patterns: evalCase.guideline_patterns,
|
|
3623
|
+
chatPrompt: promptInputs.chatPrompt,
|
|
3402
3624
|
inputFiles: evalCase.file_paths,
|
|
3403
3625
|
evalCaseId: evalCase.id,
|
|
3404
3626
|
attempt,
|
|
@@ -3415,12 +3637,30 @@ async function invokeProvider(provider, options) {
|
|
|
3415
3637
|
}
|
|
3416
3638
|
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
|
|
3417
3639
|
const message = error instanceof Error ? error.message : String(error);
|
|
3418
|
-
|
|
3419
|
-
|
|
3420
|
-
|
|
3421
|
-
|
|
3422
|
-
|
|
3423
|
-
|
|
3640
|
+
let agentProviderRequest;
|
|
3641
|
+
let lmProviderRequest;
|
|
3642
|
+
if (isAgentProvider(provider)) {
|
|
3643
|
+
agentProviderRequest = {
|
|
3644
|
+
question: promptInputs.question,
|
|
3645
|
+
guideline_paths: evalCase.guideline_paths,
|
|
3646
|
+
error: message
|
|
3647
|
+
};
|
|
3648
|
+
} else {
|
|
3649
|
+
if (promptInputs.chatPrompt) {
|
|
3650
|
+
lmProviderRequest = {
|
|
3651
|
+
chat_prompt: promptInputs.chatPrompt,
|
|
3652
|
+
guideline_paths: evalCase.guideline_paths,
|
|
3653
|
+
error: message
|
|
3654
|
+
};
|
|
3655
|
+
} else {
|
|
3656
|
+
lmProviderRequest = {
|
|
3657
|
+
question: promptInputs.question,
|
|
3658
|
+
guidelines: promptInputs.guidelines,
|
|
3659
|
+
guideline_paths: evalCase.guideline_paths,
|
|
3660
|
+
error: message
|
|
3661
|
+
};
|
|
3662
|
+
}
|
|
3663
|
+
}
|
|
3424
3664
|
return {
|
|
3425
3665
|
eval_id: evalCase.id,
|
|
3426
3666
|
dataset: evalCase.dataset,
|
|
@@ -3433,7 +3673,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
3433
3673
|
target: targetName,
|
|
3434
3674
|
timestamp: timestamp.toISOString(),
|
|
3435
3675
|
raw_aspects: [],
|
|
3436
|
-
|
|
3676
|
+
agent_provider_request: agentProviderRequest,
|
|
3677
|
+
lm_provider_request: lmProviderRequest,
|
|
3437
3678
|
error: message
|
|
3438
3679
|
};
|
|
3439
3680
|
}
|
|
@@ -3445,6 +3686,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
|
3445
3686
|
hash.update(promptInputs.question);
|
|
3446
3687
|
hash.update(promptInputs.guidelines);
|
|
3447
3688
|
hash.update(promptInputs.systemMessage ?? "");
|
|
3689
|
+
if (promptInputs.chatPrompt) {
|
|
3690
|
+
hash.update(JSON.stringify(promptInputs.chatPrompt));
|
|
3691
|
+
}
|
|
3448
3692
|
return hash.digest("hex");
|
|
3449
3693
|
}
|
|
3450
3694
|
function isTimeoutLike(error) {
|