@agentv/core 0.9.0 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -9,7 +9,7 @@ import {
  readTextFile,
  resolveFileReference,
  resolveTargetDefinition
- } from "./chunk-SNTZFB24.js";
+ } from "./chunk-YQBJAT5I.js";
 
  // src/evaluation/types.ts
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -73,6 +73,33 @@ var ANSI_YELLOW = "\x1B[33m";
  var ANSI_RESET = "\x1B[0m";
  var SCHEMA_EVAL_V2 = "agentv-eval-v2";
  var SCHEMA_CONFIG_V2 = "agentv-config-v2";
+ async function readTestSuiteMetadata(testFilePath) {
+ try {
+ const absolutePath = path.resolve(testFilePath);
+ const content = await readFile(absolutePath, "utf8");
+ const parsed = parse(content);
+ if (!isJsonObject(parsed)) {
+ return {};
+ }
+ return { target: extractTargetFromSuite(parsed) };
+ } catch {
+ return {};
+ }
+ }
+ function extractTargetFromSuite(suite) {
+ const execution = suite.execution;
+ if (execution && typeof execution === "object" && !Array.isArray(execution)) {
+ const executionTarget = execution.target;
+ if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
+ return executionTarget.trim();
+ }
+ }
+ const targetValue = suite.target;
+ if (typeof targetValue === "string" && targetValue.trim().length > 0) {
+ return targetValue.trim();
+ }
+ return void 0;
+ }
  async function loadConfig(evalFilePath, repoRoot) {
  const directories = buildDirectoryChain(evalFilePath, repoRoot);
  for (const directory of directories) {
@@ -249,6 +276,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
  }
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
+ const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
+ const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
  const results = [];
  for (const rawEvalcase of rawTestcases) {
  if (!isJsonObject(rawEvalcase)) {
@@ -268,14 +297,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
  logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
  continue;
  }
- if (!Array.isArray(expectedMessagesValue)) {
- logWarning(`Eval case '${id}' missing expected_messages array`);
- continue;
- }
+ const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
  const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
- const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
- if (expectedMessages.length === 0) {
- logWarning(`No expected message found for eval case: ${id}`);
+ const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
+ if (hasExpectedMessages && expectedMessages.length === 0) {
+ logWarning(`No valid expected message found for eval case: ${id}`);
  continue;
  }
  if (expectedMessages.length > 1) {
@@ -293,20 +319,20 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
  messageType: "input",
  verbose
  });
- const outputSegments = await processMessages({
+ const outputSegments = hasExpectedMessages ? await processMessages({
  messages: expectedMessages,
  searchRoots,
  repoRootPath,
  guidelinePatterns,
  messageType: "output",
  verbose
- });
+ }) : [];
  const codeSnippets = extractCodeBlocks(inputSegments);
  const expectedContent = expectedMessages[0]?.content;
- const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
+ const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
- const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
+ const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
  const userFilePaths = [];
  for (const segment of inputSegments) {
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -322,6 +348,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
  dataset: datasetName,
  conversation_id: conversationId,
  question,
+ input_messages: inputMessages,
  input_segments: inputSegments,
  output_segments: outputSegments,
  reference_answer: referenceAnswer,
@@ -349,6 +376,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
  }
  return results;
  }
+ function needsRoleMarkers(messages, processedSegmentsByMessage) {
+ if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
+ return true;
+ }
+ let messagesWithContent = 0;
+ for (const segments of processedSegmentsByMessage) {
+ if (hasVisibleContent(segments)) {
+ messagesWithContent++;
+ }
+ }
+ return messagesWithContent > 1;
+ }
+ function hasVisibleContent(segments) {
+ return segments.some((segment) => {
+ const type = asString(segment.type);
+ if (type === "text") {
+ const value = asString(segment.value);
+ return value !== void 0 && value.trim().length > 0;
+ }
+ if (type === "guideline_ref") {
+ return false;
+ }
+ if (type === "file") {
+ const text = asString(segment.text);
+ return text !== void 0 && text.trim().length > 0;
+ }
+ return false;
+ });
+ }
+ function formatSegment(segment) {
+ const type = asString(segment.type);
+ if (type === "text") {
+ return asString(segment.value);
+ }
+ if (type === "guideline_ref") {
+ const refPath = asString(segment.path);
+ return refPath ? `<Attached: ${refPath}>` : void 0;
+ }
+ if (type === "file") {
+ const text = asString(segment.text);
+ const filePath = asString(segment.path);
+ if (text && filePath) {
+ return `=== ${filePath} ===
+ ${text}`;
+ }
+ }
+ return void 0;
+ }
  async function buildPromptInputs(testCase) {
  const guidelineContents = [];
  for (const rawPath of testCase.guideline_paths) {
@@ -365,36 +440,168 @@ ${content}`);
  logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
  }
  }
- const questionParts = [];
+ const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
+ const segmentsByMessage = [];
+ const fileContentsByPath = /* @__PURE__ */ new Map();
  for (const segment of testCase.input_segments) {
- const typeValue = segment.type;
- if (typeof typeValue === "string" && typeValue === "file") {
- const pathValue = segment.path;
- const textValue = segment.text;
- const label = typeof pathValue === "string" ? pathValue : "file";
- const body = typeof textValue === "string" ? textValue : "";
- questionParts.push(`=== ${label} ===
- ${body}`);
- continue;
+ if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
+ fileContentsByPath.set(segment.path, segment.text);
  }
- if (typeof typeValue === "string" && typeValue === "text") {
- const value = segment.value;
- if (typeof value === "string") {
- questionParts.push(value);
+ }
+ for (const message of testCase.input_messages) {
+ const messageSegments = [];
+ if (typeof message.content === "string") {
+ if (message.content.trim().length > 0) {
+ messageSegments.push({ type: "text", value: message.content });
+ }
+ } else if (Array.isArray(message.content)) {
+ for (const segment of message.content) {
+ if (typeof segment === "string") {
+ if (segment.trim().length > 0) {
+ messageSegments.push({ type: "text", value: segment });
+ }
+ } else if (isJsonObject(segment)) {
+ const type = asString(segment.type);
+ if (type === "file") {
+ const value = asString(segment.value);
+ if (!value) continue;
+ if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
+ messageSegments.push({ type: "guideline_ref", path: value });
+ continue;
+ }
+ const fileText = fileContentsByPath.get(value);
+ if (fileText !== void 0) {
+ messageSegments.push({ type: "file", text: fileText, path: value });
+ }
+ } else if (type === "text") {
+ const textValue = asString(segment.value);
+ if (textValue && textValue.trim().length > 0) {
+ messageSegments.push({ type: "text", value: textValue });
+ }
+ }
+ }
  }
- continue;
  }
- const genericValue = segment.value;
- if (typeof genericValue === "string") {
- questionParts.push(genericValue);
+ segmentsByMessage.push(messageSegments);
+ }
+ const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
+ let question;
+ if (useRoleMarkers) {
+ const messageParts = [];
+ for (let i = 0; i < testCase.input_messages.length; i++) {
+ const message = testCase.input_messages[i];
+ const segments = segmentsByMessage[i];
+ if (!hasVisibleContent(segments)) {
+ continue;
+ }
+ const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
+ const contentParts = [];
+ for (const segment of segments) {
+ const formattedContent = formatSegment(segment);
+ if (formattedContent) {
+ contentParts.push(formattedContent);
+ }
+ }
+ if (contentParts.length > 0) {
+ const messageContent = contentParts.join("\n");
+ messageParts.push(`@[${roleLabel}]:
+ ${messageContent}`);
+ }
+ }
+ question = messageParts.join("\n\n");
+ } else {
+ const questionParts = [];
+ for (const segment of testCase.input_segments) {
+ const formattedContent = formatSegment(segment);
+ if (formattedContent) {
+ questionParts.push(formattedContent);
+ }
  }
+ if (testCase.code_snippets.length > 0) {
+ questionParts.push(testCase.code_snippets.join("\n"));
+ }
+ question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
  }
- if (testCase.code_snippets.length > 0) {
- questionParts.push(testCase.code_snippets.join("\n"));
+ const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
+ messages: testCase.input_messages,
+ segmentsByMessage,
+ guidelinePatterns: testCase.guideline_patterns,
+ guidelineContent: guidelines
+ }) : void 0;
+ return { question, guidelines, chatPrompt };
+ }
+ function buildChatPromptFromSegments(options) {
+ const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
+ if (messages.length === 0) {
+ return void 0;
  }
- const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
- const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
- return { question, guidelines };
+ const systemSegments = [];
+ if (systemPrompt && systemPrompt.trim().length > 0) {
+ systemSegments.push(systemPrompt.trim());
+ }
+ if (guidelineContent && guidelineContent.trim().length > 0) {
+ systemSegments.push(`[[ ## Guidelines ## ]]
+
+ ${guidelineContent.trim()}`);
+ }
+ let startIndex = 0;
+ while (startIndex < messages.length && messages[startIndex].role === "system") {
+ const segments = segmentsByMessage[startIndex];
+ const contentParts = [];
+ for (const segment of segments) {
+ const formatted = formatSegment(segment);
+ if (formatted) {
+ contentParts.push(formatted);
+ }
+ }
+ if (contentParts.length > 0) {
+ systemSegments.push(contentParts.join("\n"));
+ }
+ startIndex += 1;
+ }
+ const chatPrompt = [];
+ if (systemSegments.length > 0) {
+ chatPrompt.push({
+ role: "system",
+ content: systemSegments.join("\n\n")
+ });
+ }
+ for (let i = startIndex; i < messages.length; i++) {
+ const message = messages[i];
+ const segments = segmentsByMessage[i];
+ const contentParts = [];
+ let role = message.role;
+ let name;
+ if (role === "system") {
+ role = "assistant";
+ contentParts.push("@[System]:");
+ } else if (role === "tool") {
+ role = "function";
+ name = "tool";
+ }
+ for (const segment of segments) {
+ if (segment.type === "guideline_ref") {
+ continue;
+ }
+ const formatted = formatSegment(segment);
+ if (formatted) {
+ const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
+ if (isGuidelineRef) {
+ continue;
+ }
+ contentParts.push(formatted);
+ }
+ }
+ if (contentParts.length === 0) {
+ continue;
+ }
+ chatPrompt.push({
+ role,
+ content: contentParts.join("\n"),
+ ...name ? { name } : {}
+ });
+ }
+ return chatPrompt.length > 0 ? chatPrompt : void 0;
  }
  async function fileExists2(absolutePath) {
  try {
@@ -492,9 +699,9 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
  }
  return parts.join(" ");
  }
- async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
+ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
  const execution = rawEvalCase.execution;
- const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators;
+ const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
  if (candidateEvaluators === void 0) {
  return void 0;
  }
@@ -532,6 +739,8 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
  resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
  );
  }
+ } else {
+ resolvedCwd = searchRoots[0];
  }
  evaluators.push({
  name,
@@ -560,8 +769,7 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
  name,
  type: "llm_judge",
  prompt,
- promptPath,
- model
+ promptPath
  });
  }
  return evaluators.length > 0 ? evaluators : void 0;
@@ -591,21 +799,14 @@ import { AxAI } from "@ax-llm/ax";
  var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
  function buildChatPrompt(request) {
  if (request.chatPrompt) {
- return request.chatPrompt;
- }
- const systemSegments = [];
- const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
- if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
- systemSegments.push(metadataSystemPrompt.trim());
- } else {
- systemSegments.push(DEFAULT_SYSTEM_PROMPT);
- }
- if (request.guidelines && request.guidelines.trim().length > 0) {
- systemSegments.push(`[[ ## Guidelines ## ]]
-
- ${request.guidelines.trim()}`);
+ const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
+ if (hasSystemMessage) {
+ return request.chatPrompt;
+ }
+ const systemContent2 = resolveSystemContent(request);
+ return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
  }
- const systemContent = systemSegments.join("\n\n");
+ const systemContent = resolveSystemContent(request);
  const userContent = request.question.trim();
  const prompt = [
  {
@@ -619,6 +820,21 @@ ${request.guidelines.trim()}`);
  ];
  return prompt;
  }
+ function resolveSystemContent(request) {
+ const systemSegments = [];
+ const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
+ if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
+ systemSegments.push(metadataSystemPrompt.trim());
+ } else {
+ systemSegments.push(DEFAULT_SYSTEM_PROMPT);
+ }
+ if (request.guidelines && request.guidelines.trim().length > 0) {
+ systemSegments.push(`[[ ## Guidelines ## ]]
+
+ ${request.guidelines.trim()}`);
+ }
+ return systemSegments.join("\n\n");
+ }
  function extractModelConfig(request, defaults) {
  const temperature = request.temperature ?? defaults.temperature;
  const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
@@ -2330,24 +2546,23 @@ var LlmJudgeEvaluator = class {
  return this.evaluateWithPrompt(context, judgeProvider);
  }
  async evaluateWithPrompt(context, judgeProvider) {
- let prompt = buildQualityPrompt(context.evalCase, context.candidate);
- let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
+ const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
+ let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
+ let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
  if (systemPrompt && hasTemplateVariables(systemPrompt)) {
  const variables = {
  input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
  output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
  candidate_answer: context.candidate,
- reference_answer: context.evalCase.reference_answer,
+ reference_answer: context.evalCase.reference_answer ?? "",
  expected_outcome: context.evalCase.expected_outcome,
- question: context.evalCase.question
+ question: formattedQuestion
  };
  prompt = substituteVariables(systemPrompt, variables);
- systemPrompt = QUALITY_SYSTEM_PROMPT;
+ systemPrompt = buildSystemPrompt(hasReferenceAnswer);
  }
- const metadata = {
- ...systemPrompt !== void 0 ? { systemPrompt } : {},
- ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
- };
+ const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
  const response = await judgeProvider.invoke({
  question: prompt,
  metadata,
@@ -2367,8 +2582,7 @@ var LlmJudgeEvaluator = class {
  provider: judgeProvider.id,
  prompt,
  target: context.target.name,
- ...systemPrompt !== void 0 ? { systemPrompt } : {},
- ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
+ ...systemPrompt !== void 0 && { systemPrompt }
  };
  return {
  score,
@@ -2380,38 +2594,51 @@ var LlmJudgeEvaluator = class {
  };
  }
  };
- var QUALITY_SYSTEM_PROMPT = [
- "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
- "",
- "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
- "",
- "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
- "",
- "You must respond with a single JSON object matching this schema:",
- "",
- "{",
- ' "score": <number between 0.0 and 1.0>,',
- ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
- ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
- ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
- "}"
- ].join("\n");
- function buildQualityPrompt(evalCase, candidate) {
+ function buildSystemPrompt(hasReferenceAnswer) {
+ const basePrompt = [
+ "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
+ ""
+ ];
+ if (hasReferenceAnswer) {
+ basePrompt.push(
+ "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
+ ""
+ );
+ }
+ basePrompt.push(
+ "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
+ "",
+ "You must respond with a single JSON object matching this schema:",
+ "",
+ "{",
+ ' "score": <number between 0.0 and 1.0>,',
+ ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
+ ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
+ ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
+ "}"
+ );
+ return basePrompt.join("\n");
+ }
+ function buildQualityPrompt(evalCase, candidate, question) {
  const parts = [
  "[[ ## expected_outcome ## ]]",
  evalCase.expected_outcome.trim(),
  "",
  "[[ ## question ## ]]",
- evalCase.question.trim(),
- "",
- "[[ ## reference_answer ## ]]",
- evalCase.reference_answer.trim(),
- "",
- "[[ ## candidate_answer ## ]]",
- candidate.trim(),
- "",
- "Respond with a single JSON object matching the schema described in the system prompt."
+ question.trim(),
+ ""
  ];
+ if (hasNonEmptyReferenceAnswer(evalCase)) {
+ parts.push(
+ "[[ ## reference_answer ## ]]",
+ evalCase.reference_answer.trim(),
+ ""
+ );
+ }
+ parts.push(
+ "[[ ## candidate_answer ## ]]",
+ candidate.trim()
+ );
  return parts.join("\n");
  }
  function clampScore(value) {
@@ -2494,6 +2721,9 @@ function extractJsonBlob(text) {
  function isNonEmptyString(value) {
  return typeof value === "string" && value.trim().length > 0;
  }
+ function hasNonEmptyReferenceAnswer(evalCase) {
+ return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
+ }
  var CodeEvaluator = class {
  kind = "code";
  script;
@@ -3152,11 +3382,27 @@ async function evaluateCandidate(options) {
  agentTimeoutMs
  });
  const completedAt = nowFn();
- const rawRequest = {
- question: promptInputs.question,
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
- guideline_paths: evalCase.guideline_paths
- };
+ let agentProviderRequest;
+ let lmProviderRequest;
+ if (isAgentProvider(provider)) {
+ agentProviderRequest = {
+ question: promptInputs.question,
+ guideline_paths: evalCase.guideline_paths
+ };
+ } else {
+ if (promptInputs.chatPrompt) {
+ lmProviderRequest = {
+ chat_prompt: promptInputs.chatPrompt,
+ guideline_paths: evalCase.guideline_paths
+ };
+ } else {
+ lmProviderRequest = {
+ question: promptInputs.question,
+ guidelines: promptInputs.guidelines,
+ guideline_paths: evalCase.guideline_paths
+ };
+ }
+ }
  return {
  eval_id: evalCase.id,
  dataset: evalCase.dataset,
@@ -3170,7 +3416,8 @@ async function evaluateCandidate(options) {
  timestamp: completedAt.toISOString(),
  reasoning: score.reasoning,
  raw_aspects: score.rawAspects,
- raw_request: rawRequest,
+ agent_provider_request: agentProviderRequest,
+ lm_provider_request: lmProviderRequest,
  evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
  evaluator_results: evaluatorResults
  };
@@ -3329,8 +3576,7 @@ async function runLlmJudgeEvaluator(options) {
  now,
  judgeProvider,
  systemPrompt: customPrompt,
- evaluator: config,
- judgeModel: config.model
+ evaluator: config
  });
  }
  async function resolveCustomPrompt(config) {
@@ -3399,6 +3645,7 @@ async function invokeProvider(provider, options) {
  question: promptInputs.question,
  guidelines: promptInputs.guidelines,
  guideline_patterns: evalCase.guideline_patterns,
+ chatPrompt: promptInputs.chatPrompt,
  inputFiles: evalCase.file_paths,
  evalCaseId: evalCase.id,
  attempt,
@@ -3415,12 +3662,30 @@ async function invokeProvider(provider, options) {
  }
  function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
  const message = error instanceof Error ? error.message : String(error);
- const rawRequest = {
- question: promptInputs.question,
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
- guideline_paths: evalCase.guideline_paths,
- error: message
- };
+ let agentProviderRequest;
+ let lmProviderRequest;
+ if (isAgentProvider(provider)) {
+ agentProviderRequest = {
+ question: promptInputs.question,
+ guideline_paths: evalCase.guideline_paths,
+ error: message
+ };
+ } else {
+ if (promptInputs.chatPrompt) {
+ lmProviderRequest = {
+ chat_prompt: promptInputs.chatPrompt,
+ guideline_paths: evalCase.guideline_paths,
+ error: message
+ };
+ } else {
+ lmProviderRequest = {
+ question: promptInputs.question,
+ guidelines: promptInputs.guidelines,
+ guideline_paths: evalCase.guideline_paths,
+ error: message
+ };
+ }
+ }
  return {
  eval_id: evalCase.id,
  dataset: evalCase.dataset,
@@ -3433,7 +3698,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
  target: targetName,
  timestamp: timestamp.toISOString(),
  raw_aspects: [],
- raw_request: rawRequest,
+ agent_provider_request: agentProviderRequest,
+ lm_provider_request: lmProviderRequest,
  error: message
  };
  }
@@ -3445,6 +3711,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
  hash.update(promptInputs.question);
  hash.update(promptInputs.guidelines);
  hash.update(promptInputs.systemMessage ?? "");
+ if (promptInputs.chatPrompt) {
+ hash.update(JSON.stringify(promptInputs.chatPrompt));
+ }
  return hash.digest("hex");
  }
  function isTimeoutLike(error) {
@@ -3492,6 +3761,7 @@ export {
  loadEvalCases,
  normalizeLineEndings,
  readTargetDefinitions,
+ readTestSuiteMetadata,
  readTextFile,
  resolveAndCreateProvider,
  resolveFileReference,