@agentv/core 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -9,7 +9,7 @@ import {
9
9
  readTextFile,
10
10
  resolveFileReference,
11
11
  resolveTargetDefinition
12
- } from "./chunk-SNTZFB24.js";
12
+ } from "./chunk-YQBJAT5I.js";
13
13
 
14
14
  // src/evaluation/types.ts
15
15
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -268,14 +268,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
268
268
  logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
269
269
  continue;
270
270
  }
271
- if (!Array.isArray(expectedMessagesValue)) {
272
- logWarning(`Eval case '${id}' missing expected_messages array`);
273
- continue;
274
- }
271
+ const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
275
272
  const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
276
- const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
277
- if (expectedMessages.length === 0) {
278
- logWarning(`No expected message found for eval case: ${id}`);
273
+ const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
274
+ if (hasExpectedMessages && expectedMessages.length === 0) {
275
+ logWarning(`No valid expected message found for eval case: ${id}`);
279
276
  continue;
280
277
  }
281
278
  if (expectedMessages.length > 1) {
@@ -293,17 +290,17 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
293
290
  messageType: "input",
294
291
  verbose
295
292
  });
296
- const outputSegments = await processMessages({
293
+ const outputSegments = hasExpectedMessages ? await processMessages({
297
294
  messages: expectedMessages,
298
295
  searchRoots,
299
296
  repoRootPath,
300
297
  guidelinePatterns,
301
298
  messageType: "output",
302
299
  verbose
303
- });
300
+ }) : [];
304
301
  const codeSnippets = extractCodeBlocks(inputSegments);
305
302
  const expectedContent = expectedMessages[0]?.content;
306
- const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
303
+ const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
307
304
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
308
305
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
309
306
  const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
@@ -322,6 +319,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
322
319
  dataset: datasetName,
323
320
  conversation_id: conversationId,
324
321
  question,
322
+ input_messages: inputMessages,
325
323
  input_segments: inputSegments,
326
324
  output_segments: outputSegments,
327
325
  reference_answer: referenceAnswer,
@@ -349,6 +347,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
349
347
  }
350
348
  return results;
351
349
  }
350
/**
 * Decide whether a conversation has to be rendered with per-message role markers.
 *
 * Markers are needed as soon as any message has the role "assistant" or "tool",
 * or when more than one message carries visible content.
 *
 * @param {Array<{role: string}>} messages - Input messages of the eval case.
 * @param {Iterable<Array<object>>} processedSegmentsByMessage - Processed segments, one list per message.
 * @returns {boolean} True when role markers should be emitted.
 */
function needsRoleMarkers(messages, processedSegmentsByMessage) {
  const hasNonUserTurn = messages.some(({ role }) => role === "assistant" || role === "tool");
  if (hasNonUserTurn) {
    return true;
  }
  // Count the messages that would actually show up in the rendered prompt.
  const visibleCount = Array.from(processedSegmentsByMessage).filter(
    (segments) => hasVisibleContent(segments)
  ).length;
  return visibleCount > 1;
}
362
/**
 * Report whether a list of processed segments contains anything that renders as content.
 *
 * A "text" segment counts when its `value` is a non-blank string, a "file" segment
 * counts when its `text` is a non-blank string. "guideline_ref" segments and any
 * other segment type never count.
 *
 * @param {Array<object>} segments - Processed segments belonging to one message.
 * @returns {boolean} True when at least one segment has visible content.
 */
function hasVisibleContent(segments) {
  const isNonBlank = (candidate) => candidate !== void 0 && candidate.trim().length > 0;
  return segments.some((segment) => {
    switch (asString(segment.type)) {
      case "text":
        return isNonBlank(asString(segment.value));
      case "file":
        return isNonBlank(asString(segment.text));
      default:
        // Covers "guideline_ref" as well as unknown segment types.
        return false;
    }
  });
}
379
/**
 * Render a single processed segment as prompt text.
 *
 * - "text": the segment's `value`.
 * - "guideline_ref": `<Attached: path>` when a path is present.
 * - "file": a `=== path ===` header followed by the file text, when both are present.
 *
 * @param {object} segment - Processed segment with a `type` field.
 * @returns {string | undefined} The formatted text, or undefined when nothing can be rendered.
 */
function formatSegment(segment) {
  switch (asString(segment.type)) {
    case "text":
      return asString(segment.value);
    case "guideline_ref": {
      const refPath = asString(segment.path);
      return refPath ? `<Attached: ${refPath}>` : void 0;
    }
    case "file": {
      const text = asString(segment.text);
      const filePath = asString(segment.path);
      return text && filePath ? `=== ${filePath} ===\n${text}` : void 0;
    }
    default:
      return void 0;
  }
}
352
398
  async function buildPromptInputs(testCase) {
353
399
  const guidelineContents = [];
354
400
  for (const rawPath of testCase.guideline_paths) {
@@ -365,36 +411,168 @@ ${content}`);
365
411
  logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
366
412
  }
367
413
  }
368
- const questionParts = [];
414
+ const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
415
+ const segmentsByMessage = [];
416
+ const fileContentsByPath = /* @__PURE__ */ new Map();
369
417
  for (const segment of testCase.input_segments) {
370
- const typeValue = segment.type;
371
- if (typeof typeValue === "string" && typeValue === "file") {
372
- const pathValue = segment.path;
373
- const textValue = segment.text;
374
- const label = typeof pathValue === "string" ? pathValue : "file";
375
- const body = typeof textValue === "string" ? textValue : "";
376
- questionParts.push(`=== ${label} ===
377
- ${body}`);
378
- continue;
418
+ if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
419
+ fileContentsByPath.set(segment.path, segment.text);
379
420
  }
380
- if (typeof typeValue === "string" && typeValue === "text") {
381
- const value = segment.value;
382
- if (typeof value === "string") {
383
- questionParts.push(value);
421
+ }
422
+ for (const message of testCase.input_messages) {
423
+ const messageSegments = [];
424
+ if (typeof message.content === "string") {
425
+ if (message.content.trim().length > 0) {
426
+ messageSegments.push({ type: "text", value: message.content });
427
+ }
428
+ } else if (Array.isArray(message.content)) {
429
+ for (const segment of message.content) {
430
+ if (typeof segment === "string") {
431
+ if (segment.trim().length > 0) {
432
+ messageSegments.push({ type: "text", value: segment });
433
+ }
434
+ } else if (isJsonObject(segment)) {
435
+ const type = asString(segment.type);
436
+ if (type === "file") {
437
+ const value = asString(segment.value);
438
+ if (!value) continue;
439
+ if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
440
+ messageSegments.push({ type: "guideline_ref", path: value });
441
+ continue;
442
+ }
443
+ const fileText = fileContentsByPath.get(value);
444
+ if (fileText !== void 0) {
445
+ messageSegments.push({ type: "file", text: fileText, path: value });
446
+ }
447
+ } else if (type === "text") {
448
+ const textValue = asString(segment.value);
449
+ if (textValue && textValue.trim().length > 0) {
450
+ messageSegments.push({ type: "text", value: textValue });
451
+ }
452
+ }
453
+ }
384
454
  }
385
- continue;
386
455
  }
387
- const genericValue = segment.value;
388
- if (typeof genericValue === "string") {
389
- questionParts.push(genericValue);
456
+ segmentsByMessage.push(messageSegments);
457
+ }
458
+ const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
459
+ let question;
460
+ if (useRoleMarkers) {
461
+ const messageParts = [];
462
+ for (let i = 0; i < testCase.input_messages.length; i++) {
463
+ const message = testCase.input_messages[i];
464
+ const segments = segmentsByMessage[i];
465
+ if (!hasVisibleContent(segments)) {
466
+ continue;
467
+ }
468
+ const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
469
+ const contentParts = [];
470
+ for (const segment of segments) {
471
+ const formattedContent = formatSegment(segment);
472
+ if (formattedContent) {
473
+ contentParts.push(formattedContent);
474
+ }
475
+ }
476
+ if (contentParts.length > 0) {
477
+ const messageContent = contentParts.join("\n");
478
+ messageParts.push(`@[${roleLabel}]:
479
+ ${messageContent}`);
480
+ }
390
481
  }
482
+ question = messageParts.join("\n\n");
483
+ } else {
484
+ const questionParts = [];
485
+ for (const segment of testCase.input_segments) {
486
+ const formattedContent = formatSegment(segment);
487
+ if (formattedContent) {
488
+ questionParts.push(formattedContent);
489
+ }
490
+ }
491
+ if (testCase.code_snippets.length > 0) {
492
+ questionParts.push(testCase.code_snippets.join("\n"));
493
+ }
494
+ question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
391
495
  }
392
- if (testCase.code_snippets.length > 0) {
393
- questionParts.push(testCase.code_snippets.join("\n"));
496
+ const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
497
+ messages: testCase.input_messages,
498
+ segmentsByMessage,
499
+ guidelinePatterns: testCase.guideline_patterns,
500
+ guidelineContent: guidelines
501
+ }) : void 0;
502
+ return { question, guidelines, chatPrompt };
503
+ }
504
+ function buildChatPromptFromSegments(options) {
505
+ const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
506
+ if (messages.length === 0) {
507
+ return void 0;
394
508
  }
395
- const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
396
- const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
397
- return { question, guidelines };
509
+ const systemSegments = [];
510
+ if (systemPrompt && systemPrompt.trim().length > 0) {
511
+ systemSegments.push(systemPrompt.trim());
512
+ }
513
+ if (guidelineContent && guidelineContent.trim().length > 0) {
514
+ systemSegments.push(`[[ ## Guidelines ## ]]
515
+
516
+ ${guidelineContent.trim()}`);
517
+ }
518
+ let startIndex = 0;
519
+ while (startIndex < messages.length && messages[startIndex].role === "system") {
520
+ const segments = segmentsByMessage[startIndex];
521
+ const contentParts = [];
522
+ for (const segment of segments) {
523
+ const formatted = formatSegment(segment);
524
+ if (formatted) {
525
+ contentParts.push(formatted);
526
+ }
527
+ }
528
+ if (contentParts.length > 0) {
529
+ systemSegments.push(contentParts.join("\n"));
530
+ }
531
+ startIndex += 1;
532
+ }
533
+ const chatPrompt = [];
534
+ if (systemSegments.length > 0) {
535
+ chatPrompt.push({
536
+ role: "system",
537
+ content: systemSegments.join("\n\n")
538
+ });
539
+ }
540
+ for (let i = startIndex; i < messages.length; i++) {
541
+ const message = messages[i];
542
+ const segments = segmentsByMessage[i];
543
+ const contentParts = [];
544
+ let role = message.role;
545
+ let name;
546
+ if (role === "system") {
547
+ role = "assistant";
548
+ contentParts.push("@[System]:");
549
+ } else if (role === "tool") {
550
+ role = "function";
551
+ name = "tool";
552
+ }
553
+ for (const segment of segments) {
554
+ if (segment.type === "guideline_ref") {
555
+ continue;
556
+ }
557
+ const formatted = formatSegment(segment);
558
+ if (formatted) {
559
+ const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
560
+ if (isGuidelineRef) {
561
+ continue;
562
+ }
563
+ contentParts.push(formatted);
564
+ }
565
+ }
566
+ if (contentParts.length === 0) {
567
+ continue;
568
+ }
569
+ chatPrompt.push({
570
+ role,
571
+ content: contentParts.join("\n"),
572
+ ...name ? { name } : {}
573
+ });
574
+ }
575
+ return chatPrompt.length > 0 ? chatPrompt : void 0;
398
576
  }
399
577
  async function fileExists2(absolutePath) {
400
578
  try {
@@ -591,21 +769,14 @@ import { AxAI } from "@ax-llm/ax";
591
769
  var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
592
770
  function buildChatPrompt(request) {
593
771
  if (request.chatPrompt) {
594
- return request.chatPrompt;
595
- }
596
- const systemSegments = [];
597
- const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
598
- if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
599
- systemSegments.push(metadataSystemPrompt.trim());
600
- } else {
601
- systemSegments.push(DEFAULT_SYSTEM_PROMPT);
602
- }
603
- if (request.guidelines && request.guidelines.trim().length > 0) {
604
- systemSegments.push(`[[ ## Guidelines ## ]]
605
-
606
- ${request.guidelines.trim()}`);
772
+ const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
773
+ if (hasSystemMessage) {
774
+ return request.chatPrompt;
775
+ }
776
+ const systemContent2 = resolveSystemContent(request);
777
+ return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
607
778
  }
608
- const systemContent = systemSegments.join("\n\n");
779
+ const systemContent = resolveSystemContent(request);
609
780
  const userContent = request.question.trim();
610
781
  const prompt = [
611
782
  {
@@ -619,6 +790,21 @@ ${request.guidelines.trim()}`);
619
790
  ];
620
791
  return prompt;
621
792
  }
793
/**
 * Build the system message content for a provider request.
 *
 * Uses the trimmed `request.metadata.systemPrompt` when it is a non-blank string,
 * otherwise the default system prompt. A non-blank `request.guidelines` is appended
 * as a "[[ ## Guidelines ## ]]" section, separated by a blank line.
 *
 * @param {object} request - Provider request; reads `metadata.systemPrompt` and `guidelines`.
 * @returns {string} The assembled system content.
 */
function resolveSystemContent(request) {
  const rawPrompt = request.metadata?.systemPrompt;
  const customPrompt = typeof rawPrompt === "string" ? rawPrompt.trim() : "";
  const sections = [customPrompt.length > 0 ? customPrompt : DEFAULT_SYSTEM_PROMPT];

  const guidelineText = request.guidelines ? request.guidelines.trim() : "";
  if (guidelineText.length > 0) {
    sections.push(`[[ ## Guidelines ## ]]\n\n${guidelineText}`);
  }
  return sections.join("\n\n");
}
622
808
  function extractModelConfig(request, defaults) {
623
809
  const temperature = request.temperature ?? defaults.temperature;
624
810
  const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
@@ -2330,19 +2516,21 @@ var LlmJudgeEvaluator = class {
2330
2516
  return this.evaluateWithPrompt(context, judgeProvider);
2331
2517
  }
2332
2518
  async evaluateWithPrompt(context, judgeProvider) {
2333
- let prompt = buildQualityPrompt(context.evalCase, context.candidate);
2334
- let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
2519
+ const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
2520
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
2521
+ let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
2522
+ let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
2335
2523
  if (systemPrompt && hasTemplateVariables(systemPrompt)) {
2336
2524
  const variables = {
2337
2525
  input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
2338
2526
  output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
2339
2527
  candidate_answer: context.candidate,
2340
- reference_answer: context.evalCase.reference_answer,
2528
+ reference_answer: context.evalCase.reference_answer ?? "",
2341
2529
  expected_outcome: context.evalCase.expected_outcome,
2342
- question: context.evalCase.question
2530
+ question: formattedQuestion
2343
2531
  };
2344
2532
  prompt = substituteVariables(systemPrompt, variables);
2345
- systemPrompt = QUALITY_SYSTEM_PROMPT;
2533
+ systemPrompt = buildSystemPrompt(hasReferenceAnswer);
2346
2534
  }
2347
2535
  const metadata = {
2348
2536
  ...systemPrompt !== void 0 ? { systemPrompt } : {},
@@ -2380,38 +2568,51 @@ var LlmJudgeEvaluator = class {
2380
2568
  };
2381
2569
  }
2382
2570
  };
2383
- var QUALITY_SYSTEM_PROMPT = [
2384
- "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
2385
- "",
2386
- "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
2387
- "",
2388
- "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
2389
- "",
2390
- "You must respond with a single JSON object matching this schema:",
2391
- "",
2392
- "{",
2393
- ' "score": <number between 0.0 and 1.0>,',
2394
- ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
2395
- ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
2396
- ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
2397
- "}"
2398
- ].join("\n");
2399
- function buildQualityPrompt(evalCase, candidate) {
2571
/**
 * Assemble the judge system prompt.
 *
 * The paragraph that tells the judge to treat reference_answer as a gold standard
 * is only included when a reference answer is actually available.
 *
 * @param {boolean} hasReferenceAnswer - Whether the eval case has a non-empty reference answer.
 * @returns {string} The newline-joined system prompt.
 */
function buildSystemPrompt(hasReferenceAnswer) {
  const referenceGuidance = hasReferenceAnswer ? [
    "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
    ""
  ] : [];
  return [
    "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
    "",
    ...referenceGuidance,
    "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
    "",
    "You must respond with a single JSON object matching this schema:",
    "",
    "{",
    ' "score": <number between 0.0 and 1.0>,',
    ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
    ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
    ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
    "}"
  ].join("\n");
}
2596
+ function buildQualityPrompt(evalCase, candidate, question) {
2400
2597
  const parts = [
2401
2598
  "[[ ## expected_outcome ## ]]",
2402
2599
  evalCase.expected_outcome.trim(),
2403
2600
  "",
2404
2601
  "[[ ## question ## ]]",
2405
- evalCase.question.trim(),
2406
- "",
2407
- "[[ ## reference_answer ## ]]",
2408
- evalCase.reference_answer.trim(),
2409
- "",
2410
- "[[ ## candidate_answer ## ]]",
2411
- candidate.trim(),
2412
- "",
2413
- "Respond with a single JSON object matching the schema described in the system prompt."
2602
+ question.trim(),
2603
+ ""
2414
2604
  ];
2605
+ if (hasNonEmptyReferenceAnswer(evalCase)) {
2606
+ parts.push(
2607
+ "[[ ## reference_answer ## ]]",
2608
+ evalCase.reference_answer.trim(),
2609
+ ""
2610
+ );
2611
+ }
2612
+ parts.push(
2613
+ "[[ ## candidate_answer ## ]]",
2614
+ candidate.trim()
2615
+ );
2415
2616
  return parts.join("\n");
2416
2617
  }
2417
2618
  function clampScore(value) {
@@ -2494,6 +2695,9 @@ function extractJsonBlob(text) {
2494
2695
  function isNonEmptyString(value) {
2495
2696
  return typeof value === "string" && value.trim().length > 0;
2496
2697
  }
2698
/**
 * Report whether an eval case carries a usable reference answer.
 *
 * Only a string with at least one non-whitespace character counts. A missing,
 * null or non-string `reference_answer` yields false instead of throwing on
 * `.trim()`, so callers can use this as a plain guard.
 *
 * @param {{reference_answer?: unknown}} evalCase - Eval case to inspect.
 * @returns {boolean} True when `reference_answer` is a non-blank string.
 */
function hasNonEmptyReferenceAnswer(evalCase) {
  const answer = evalCase.reference_answer;
  return typeof answer === "string" && answer.trim().length > 0;
}
2497
2701
  var CodeEvaluator = class {
2498
2702
  kind = "code";
2499
2703
  script;
@@ -3152,11 +3356,27 @@ async function evaluateCandidate(options) {
3152
3356
  agentTimeoutMs
3153
3357
  });
3154
3358
  const completedAt = nowFn();
3155
- const rawRequest = {
3156
- question: promptInputs.question,
3157
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
3158
- guideline_paths: evalCase.guideline_paths
3159
- };
3359
+ let agentProviderRequest;
3360
+ let lmProviderRequest;
3361
+ if (isAgentProvider(provider)) {
3362
+ agentProviderRequest = {
3363
+ question: promptInputs.question,
3364
+ guideline_paths: evalCase.guideline_paths
3365
+ };
3366
+ } else {
3367
+ if (promptInputs.chatPrompt) {
3368
+ lmProviderRequest = {
3369
+ chat_prompt: promptInputs.chatPrompt,
3370
+ guideline_paths: evalCase.guideline_paths
3371
+ };
3372
+ } else {
3373
+ lmProviderRequest = {
3374
+ question: promptInputs.question,
3375
+ guidelines: promptInputs.guidelines,
3376
+ guideline_paths: evalCase.guideline_paths
3377
+ };
3378
+ }
3379
+ }
3160
3380
  return {
3161
3381
  eval_id: evalCase.id,
3162
3382
  dataset: evalCase.dataset,
@@ -3170,7 +3390,8 @@ async function evaluateCandidate(options) {
3170
3390
  timestamp: completedAt.toISOString(),
3171
3391
  reasoning: score.reasoning,
3172
3392
  raw_aspects: score.rawAspects,
3173
- raw_request: rawRequest,
3393
+ agent_provider_request: agentProviderRequest,
3394
+ lm_provider_request: lmProviderRequest,
3174
3395
  evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
3175
3396
  evaluator_results: evaluatorResults
3176
3397
  };
@@ -3399,6 +3620,7 @@ async function invokeProvider(provider, options) {
3399
3620
  question: promptInputs.question,
3400
3621
  guidelines: promptInputs.guidelines,
3401
3622
  guideline_patterns: evalCase.guideline_patterns,
3623
+ chatPrompt: promptInputs.chatPrompt,
3402
3624
  inputFiles: evalCase.file_paths,
3403
3625
  evalCaseId: evalCase.id,
3404
3626
  attempt,
@@ -3415,12 +3637,30 @@ async function invokeProvider(provider, options) {
3415
3637
  }
3416
3638
  function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
3417
3639
  const message = error instanceof Error ? error.message : String(error);
3418
- const rawRequest = {
3419
- question: promptInputs.question,
3420
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
3421
- guideline_paths: evalCase.guideline_paths,
3422
- error: message
3423
- };
3640
+ let agentProviderRequest;
3641
+ let lmProviderRequest;
3642
+ if (isAgentProvider(provider)) {
3643
+ agentProviderRequest = {
3644
+ question: promptInputs.question,
3645
+ guideline_paths: evalCase.guideline_paths,
3646
+ error: message
3647
+ };
3648
+ } else {
3649
+ if (promptInputs.chatPrompt) {
3650
+ lmProviderRequest = {
3651
+ chat_prompt: promptInputs.chatPrompt,
3652
+ guideline_paths: evalCase.guideline_paths,
3653
+ error: message
3654
+ };
3655
+ } else {
3656
+ lmProviderRequest = {
3657
+ question: promptInputs.question,
3658
+ guidelines: promptInputs.guidelines,
3659
+ guideline_paths: evalCase.guideline_paths,
3660
+ error: message
3661
+ };
3662
+ }
3663
+ }
3424
3664
  return {
3425
3665
  eval_id: evalCase.id,
3426
3666
  dataset: evalCase.dataset,
@@ -3433,7 +3673,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
3433
3673
  target: targetName,
3434
3674
  timestamp: timestamp.toISOString(),
3435
3675
  raw_aspects: [],
3436
- raw_request: rawRequest,
3676
+ agent_provider_request: agentProviderRequest,
3677
+ lm_provider_request: lmProviderRequest,
3437
3678
  error: message
3438
3679
  };
3439
3680
  }
@@ -3445,6 +3686,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
3445
3686
  hash.update(promptInputs.question);
3446
3687
  hash.update(promptInputs.guidelines);
3447
3688
  hash.update(promptInputs.systemMessage ?? "");
3689
+ if (promptInputs.chatPrompt) {
3690
+ hash.update(JSON.stringify(promptInputs.chatPrompt));
3691
+ }
3448
3692
  return hash.digest("hex");
3449
3693
  }
3450
3694
  function isTimeoutLike(error) {