@agentv/core 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -434,14 +434,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
434
434
  logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
435
435
  continue;
436
436
  }
437
- if (!Array.isArray(expectedMessagesValue)) {
438
- logWarning(`Eval case '${id}' missing expected_messages array`);
439
- continue;
440
- }
437
+ const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
441
438
  const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
442
- const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
443
- if (expectedMessages.length === 0) {
444
- logWarning(`No expected message found for eval case: ${id}`);
439
+ const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
440
+ if (hasExpectedMessages && expectedMessages.length === 0) {
441
+ logWarning(`No valid expected message found for eval case: ${id}`);
445
442
  continue;
446
443
  }
447
444
  if (expectedMessages.length > 1) {
@@ -459,17 +456,17 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
459
456
  messageType: "input",
460
457
  verbose
461
458
  });
462
- const outputSegments = await processMessages({
459
+ const outputSegments = hasExpectedMessages ? await processMessages({
463
460
  messages: expectedMessages,
464
461
  searchRoots,
465
462
  repoRootPath,
466
463
  guidelinePatterns,
467
464
  messageType: "output",
468
465
  verbose
469
- });
466
+ }) : [];
470
467
  const codeSnippets = extractCodeBlocks(inputSegments);
471
468
  const expectedContent = expectedMessages[0]?.content;
472
- const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
469
+ const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
473
470
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
474
471
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
475
472
  const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
@@ -488,6 +485,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
488
485
  dataset: datasetName,
489
486
  conversation_id: conversationId,
490
487
  question,
488
+ input_messages: inputMessages,
491
489
  input_segments: inputSegments,
492
490
  output_segments: outputSegments,
493
491
  reference_answer: referenceAnswer,
@@ -515,6 +513,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
515
513
  }
516
514
  return results;
517
515
  }
516
+ function needsRoleMarkers(messages, processedSegmentsByMessage) {
517
+ if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
518
+ return true;
519
+ }
520
+ let messagesWithContent = 0;
521
+ for (const segments of processedSegmentsByMessage) {
522
+ if (hasVisibleContent(segments)) {
523
+ messagesWithContent++;
524
+ }
525
+ }
526
+ return messagesWithContent > 1;
527
+ }
528
+ function hasVisibleContent(segments) {
529
+ return segments.some((segment) => {
530
+ const type = asString(segment.type);
531
+ if (type === "text") {
532
+ const value = asString(segment.value);
533
+ return value !== void 0 && value.trim().length > 0;
534
+ }
535
+ if (type === "guideline_ref") {
536
+ return false;
537
+ }
538
+ if (type === "file") {
539
+ const text = asString(segment.text);
540
+ return text !== void 0 && text.trim().length > 0;
541
+ }
542
+ return false;
543
+ });
544
+ }
545
+ function formatSegment(segment) {
546
+ const type = asString(segment.type);
547
+ if (type === "text") {
548
+ return asString(segment.value);
549
+ }
550
+ if (type === "guideline_ref") {
551
+ const refPath = asString(segment.path);
552
+ return refPath ? `<Attached: ${refPath}>` : void 0;
553
+ }
554
+ if (type === "file") {
555
+ const text = asString(segment.text);
556
+ const filePath = asString(segment.path);
557
+ if (text && filePath) {
558
+ return `=== ${filePath} ===
559
+ ${text}`;
560
+ }
561
+ }
562
+ return void 0;
563
+ }
518
564
  async function buildPromptInputs(testCase) {
519
565
  const guidelineContents = [];
520
566
  for (const rawPath of testCase.guideline_paths) {
@@ -531,36 +577,168 @@ ${content}`);
531
577
  logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
532
578
  }
533
579
  }
534
- const questionParts = [];
580
+ const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
581
+ const segmentsByMessage = [];
582
+ const fileContentsByPath = /* @__PURE__ */ new Map();
535
583
  for (const segment of testCase.input_segments) {
536
- const typeValue = segment.type;
537
- if (typeof typeValue === "string" && typeValue === "file") {
538
- const pathValue = segment.path;
539
- const textValue = segment.text;
540
- const label = typeof pathValue === "string" ? pathValue : "file";
541
- const body = typeof textValue === "string" ? textValue : "";
542
- questionParts.push(`=== ${label} ===
543
- ${body}`);
544
- continue;
584
+ if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
585
+ fileContentsByPath.set(segment.path, segment.text);
545
586
  }
546
- if (typeof typeValue === "string" && typeValue === "text") {
547
- const value = segment.value;
548
- if (typeof value === "string") {
549
- questionParts.push(value);
587
+ }
588
+ for (const message of testCase.input_messages) {
589
+ const messageSegments = [];
590
+ if (typeof message.content === "string") {
591
+ if (message.content.trim().length > 0) {
592
+ messageSegments.push({ type: "text", value: message.content });
593
+ }
594
+ } else if (Array.isArray(message.content)) {
595
+ for (const segment of message.content) {
596
+ if (typeof segment === "string") {
597
+ if (segment.trim().length > 0) {
598
+ messageSegments.push({ type: "text", value: segment });
599
+ }
600
+ } else if (isJsonObject(segment)) {
601
+ const type = asString(segment.type);
602
+ if (type === "file") {
603
+ const value = asString(segment.value);
604
+ if (!value) continue;
605
+ if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
606
+ messageSegments.push({ type: "guideline_ref", path: value });
607
+ continue;
608
+ }
609
+ const fileText = fileContentsByPath.get(value);
610
+ if (fileText !== void 0) {
611
+ messageSegments.push({ type: "file", text: fileText, path: value });
612
+ }
613
+ } else if (type === "text") {
614
+ const textValue = asString(segment.value);
615
+ if (textValue && textValue.trim().length > 0) {
616
+ messageSegments.push({ type: "text", value: textValue });
617
+ }
618
+ }
619
+ }
550
620
  }
551
- continue;
552
621
  }
553
- const genericValue = segment.value;
554
- if (typeof genericValue === "string") {
555
- questionParts.push(genericValue);
622
+ segmentsByMessage.push(messageSegments);
623
+ }
624
+ const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
625
+ let question;
626
+ if (useRoleMarkers) {
627
+ const messageParts = [];
628
+ for (let i = 0; i < testCase.input_messages.length; i++) {
629
+ const message = testCase.input_messages[i];
630
+ const segments = segmentsByMessage[i];
631
+ if (!hasVisibleContent(segments)) {
632
+ continue;
633
+ }
634
+ const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
635
+ const contentParts = [];
636
+ for (const segment of segments) {
637
+ const formattedContent = formatSegment(segment);
638
+ if (formattedContent) {
639
+ contentParts.push(formattedContent);
640
+ }
641
+ }
642
+ if (contentParts.length > 0) {
643
+ const messageContent = contentParts.join("\n");
644
+ messageParts.push(`@[${roleLabel}]:
645
+ ${messageContent}`);
646
+ }
556
647
  }
648
+ question = messageParts.join("\n\n");
649
+ } else {
650
+ const questionParts = [];
651
+ for (const segment of testCase.input_segments) {
652
+ const formattedContent = formatSegment(segment);
653
+ if (formattedContent) {
654
+ questionParts.push(formattedContent);
655
+ }
656
+ }
657
+ if (testCase.code_snippets.length > 0) {
658
+ questionParts.push(testCase.code_snippets.join("\n"));
659
+ }
660
+ question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
557
661
  }
558
- if (testCase.code_snippets.length > 0) {
559
- questionParts.push(testCase.code_snippets.join("\n"));
662
+ const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
663
+ messages: testCase.input_messages,
664
+ segmentsByMessage,
665
+ guidelinePatterns: testCase.guideline_patterns,
666
+ guidelineContent: guidelines
667
+ }) : void 0;
668
+ return { question, guidelines, chatPrompt };
669
+ }
670
+ function buildChatPromptFromSegments(options) {
671
+ const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
672
+ if (messages.length === 0) {
673
+ return void 0;
560
674
  }
561
- const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
562
- const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
563
- return { question, guidelines };
675
+ const systemSegments = [];
676
+ if (systemPrompt && systemPrompt.trim().length > 0) {
677
+ systemSegments.push(systemPrompt.trim());
678
+ }
679
+ if (guidelineContent && guidelineContent.trim().length > 0) {
680
+ systemSegments.push(`[[ ## Guidelines ## ]]
681
+
682
+ ${guidelineContent.trim()}`);
683
+ }
684
+ let startIndex = 0;
685
+ while (startIndex < messages.length && messages[startIndex].role === "system") {
686
+ const segments = segmentsByMessage[startIndex];
687
+ const contentParts = [];
688
+ for (const segment of segments) {
689
+ const formatted = formatSegment(segment);
690
+ if (formatted) {
691
+ contentParts.push(formatted);
692
+ }
693
+ }
694
+ if (contentParts.length > 0) {
695
+ systemSegments.push(contentParts.join("\n"));
696
+ }
697
+ startIndex += 1;
698
+ }
699
+ const chatPrompt = [];
700
+ if (systemSegments.length > 0) {
701
+ chatPrompt.push({
702
+ role: "system",
703
+ content: systemSegments.join("\n\n")
704
+ });
705
+ }
706
+ for (let i = startIndex; i < messages.length; i++) {
707
+ const message = messages[i];
708
+ const segments = segmentsByMessage[i];
709
+ const contentParts = [];
710
+ let role = message.role;
711
+ let name;
712
+ if (role === "system") {
713
+ role = "assistant";
714
+ contentParts.push("@[System]:");
715
+ } else if (role === "tool") {
716
+ role = "function";
717
+ name = "tool";
718
+ }
719
+ for (const segment of segments) {
720
+ if (segment.type === "guideline_ref") {
721
+ continue;
722
+ }
723
+ const formatted = formatSegment(segment);
724
+ if (formatted) {
725
+ const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
726
+ if (isGuidelineRef) {
727
+ continue;
728
+ }
729
+ contentParts.push(formatted);
730
+ }
731
+ }
732
+ if (contentParts.length === 0) {
733
+ continue;
734
+ }
735
+ chatPrompt.push({
736
+ role,
737
+ content: contentParts.join("\n"),
738
+ ...name ? { name } : {}
739
+ });
740
+ }
741
+ return chatPrompt.length > 0 ? chatPrompt : void 0;
564
742
  }
565
743
  async function fileExists2(absolutePath) {
566
744
  try {
@@ -757,21 +935,14 @@ var import_ax = require("@ax-llm/ax");
757
935
  var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
758
936
  function buildChatPrompt(request) {
759
937
  if (request.chatPrompt) {
760
- return request.chatPrompt;
761
- }
762
- const systemSegments = [];
763
- const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
764
- if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
765
- systemSegments.push(metadataSystemPrompt.trim());
766
- } else {
767
- systemSegments.push(DEFAULT_SYSTEM_PROMPT);
768
- }
769
- if (request.guidelines && request.guidelines.trim().length > 0) {
770
- systemSegments.push(`[[ ## Guidelines ## ]]
771
-
772
- ${request.guidelines.trim()}`);
938
+ const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
939
+ if (hasSystemMessage) {
940
+ return request.chatPrompt;
941
+ }
942
+ const systemContent2 = resolveSystemContent(request);
943
+ return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
773
944
  }
774
- const systemContent = systemSegments.join("\n\n");
945
+ const systemContent = resolveSystemContent(request);
775
946
  const userContent = request.question.trim();
776
947
  const prompt = [
777
948
  {
@@ -785,6 +956,21 @@ ${request.guidelines.trim()}`);
785
956
  ];
786
957
  return prompt;
787
958
  }
959
+ function resolveSystemContent(request) {
960
+ const systemSegments = [];
961
+ const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
962
+ if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
963
+ systemSegments.push(metadataSystemPrompt.trim());
964
+ } else {
965
+ systemSegments.push(DEFAULT_SYSTEM_PROMPT);
966
+ }
967
+ if (request.guidelines && request.guidelines.trim().length > 0) {
968
+ systemSegments.push(`[[ ## Guidelines ## ]]
969
+
970
+ ${request.guidelines.trim()}`);
971
+ }
972
+ return systemSegments.join("\n\n");
973
+ }
788
974
  function extractModelConfig(request, defaults) {
789
975
  const temperature = request.temperature ?? defaults.temperature;
790
976
  const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
@@ -3020,19 +3206,21 @@ var LlmJudgeEvaluator = class {
3020
3206
  return this.evaluateWithPrompt(context, judgeProvider);
3021
3207
  }
3022
3208
  async evaluateWithPrompt(context, judgeProvider) {
3023
- let prompt = buildQualityPrompt(context.evalCase, context.candidate);
3024
- let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
3209
+ const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
3210
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
3211
+ let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
3212
+ let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
3025
3213
  if (systemPrompt && hasTemplateVariables(systemPrompt)) {
3026
3214
  const variables = {
3027
3215
  input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
3028
3216
  output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
3029
3217
  candidate_answer: context.candidate,
3030
- reference_answer: context.evalCase.reference_answer,
3218
+ reference_answer: context.evalCase.reference_answer ?? "",
3031
3219
  expected_outcome: context.evalCase.expected_outcome,
3032
- question: context.evalCase.question
3220
+ question: formattedQuestion
3033
3221
  };
3034
3222
  prompt = substituteVariables(systemPrompt, variables);
3035
- systemPrompt = QUALITY_SYSTEM_PROMPT;
3223
+ systemPrompt = buildSystemPrompt(hasReferenceAnswer);
3036
3224
  }
3037
3225
  const metadata = {
3038
3226
  ...systemPrompt !== void 0 ? { systemPrompt } : {},
@@ -3070,38 +3258,51 @@ var LlmJudgeEvaluator = class {
3070
3258
  };
3071
3259
  }
3072
3260
  };
3073
- var QUALITY_SYSTEM_PROMPT = [
3074
- "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
3075
- "",
3076
- "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
3077
- "",
3078
- "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
3079
- "",
3080
- "You must respond with a single JSON object matching this schema:",
3081
- "",
3082
- "{",
3083
- ' "score": <number between 0.0 and 1.0>,',
3084
- ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
3085
- ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
3086
- ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
3087
- "}"
3088
- ].join("\n");
3089
- function buildQualityPrompt(evalCase, candidate) {
3261
+ function buildSystemPrompt(hasReferenceAnswer) {
3262
+ const basePrompt = [
3263
+ "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
3264
+ ""
3265
+ ];
3266
+ if (hasReferenceAnswer) {
3267
+ basePrompt.push(
3268
+ "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
3269
+ ""
3270
+ );
3271
+ }
3272
+ basePrompt.push(
3273
+ "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
3274
+ "",
3275
+ "You must respond with a single JSON object matching this schema:",
3276
+ "",
3277
+ "{",
3278
+ ' "score": <number between 0.0 and 1.0>,',
3279
+ ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
3280
+ ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
3281
+ ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
3282
+ "}"
3283
+ );
3284
+ return basePrompt.join("\n");
3285
+ }
3286
+ function buildQualityPrompt(evalCase, candidate, question) {
3090
3287
  const parts = [
3091
3288
  "[[ ## expected_outcome ## ]]",
3092
3289
  evalCase.expected_outcome.trim(),
3093
3290
  "",
3094
3291
  "[[ ## question ## ]]",
3095
- evalCase.question.trim(),
3096
- "",
3097
- "[[ ## reference_answer ## ]]",
3098
- evalCase.reference_answer.trim(),
3099
- "",
3100
- "[[ ## candidate_answer ## ]]",
3101
- candidate.trim(),
3102
- "",
3103
- "Respond with a single JSON object matching the schema described in the system prompt."
3292
+ question.trim(),
3293
+ ""
3104
3294
  ];
3295
+ if (hasNonEmptyReferenceAnswer(evalCase)) {
3296
+ parts.push(
3297
+ "[[ ## reference_answer ## ]]",
3298
+ evalCase.reference_answer.trim(),
3299
+ ""
3300
+ );
3301
+ }
3302
+ parts.push(
3303
+ "[[ ## candidate_answer ## ]]",
3304
+ candidate.trim()
3305
+ );
3105
3306
  return parts.join("\n");
3106
3307
  }
3107
3308
  function clampScore(value) {
@@ -3184,6 +3385,9 @@ function extractJsonBlob(text) {
3184
3385
  function isNonEmptyString(value) {
3185
3386
  return typeof value === "string" && value.trim().length > 0;
3186
3387
  }
3388
+ function hasNonEmptyReferenceAnswer(evalCase) {
3389
+ return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
3390
+ }
3187
3391
  var CodeEvaluator = class {
3188
3392
  kind = "code";
3189
3393
  script;
@@ -3842,11 +4046,27 @@ async function evaluateCandidate(options) {
3842
4046
  agentTimeoutMs
3843
4047
  });
3844
4048
  const completedAt = nowFn();
3845
- const rawRequest = {
3846
- question: promptInputs.question,
3847
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
3848
- guideline_paths: evalCase.guideline_paths
3849
- };
4049
+ let agentProviderRequest;
4050
+ let lmProviderRequest;
4051
+ if (isAgentProvider(provider)) {
4052
+ agentProviderRequest = {
4053
+ question: promptInputs.question,
4054
+ guideline_paths: evalCase.guideline_paths
4055
+ };
4056
+ } else {
4057
+ if (promptInputs.chatPrompt) {
4058
+ lmProviderRequest = {
4059
+ chat_prompt: promptInputs.chatPrompt,
4060
+ guideline_paths: evalCase.guideline_paths
4061
+ };
4062
+ } else {
4063
+ lmProviderRequest = {
4064
+ question: promptInputs.question,
4065
+ guidelines: promptInputs.guidelines,
4066
+ guideline_paths: evalCase.guideline_paths
4067
+ };
4068
+ }
4069
+ }
3850
4070
  return {
3851
4071
  eval_id: evalCase.id,
3852
4072
  dataset: evalCase.dataset,
@@ -3860,7 +4080,8 @@ async function evaluateCandidate(options) {
3860
4080
  timestamp: completedAt.toISOString(),
3861
4081
  reasoning: score.reasoning,
3862
4082
  raw_aspects: score.rawAspects,
3863
- raw_request: rawRequest,
4083
+ agent_provider_request: agentProviderRequest,
4084
+ lm_provider_request: lmProviderRequest,
3864
4085
  evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
3865
4086
  evaluator_results: evaluatorResults
3866
4087
  };
@@ -4089,6 +4310,7 @@ async function invokeProvider(provider, options) {
4089
4310
  question: promptInputs.question,
4090
4311
  guidelines: promptInputs.guidelines,
4091
4312
  guideline_patterns: evalCase.guideline_patterns,
4313
+ chatPrompt: promptInputs.chatPrompt,
4092
4314
  inputFiles: evalCase.file_paths,
4093
4315
  evalCaseId: evalCase.id,
4094
4316
  attempt,
@@ -4105,12 +4327,30 @@ async function invokeProvider(provider, options) {
4105
4327
  }
4106
4328
  function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
4107
4329
  const message = error instanceof Error ? error.message : String(error);
4108
- const rawRequest = {
4109
- question: promptInputs.question,
4110
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
4111
- guideline_paths: evalCase.guideline_paths,
4112
- error: message
4113
- };
4330
+ let agentProviderRequest;
4331
+ let lmProviderRequest;
4332
+ if (isAgentProvider(provider)) {
4333
+ agentProviderRequest = {
4334
+ question: promptInputs.question,
4335
+ guideline_paths: evalCase.guideline_paths,
4336
+ error: message
4337
+ };
4338
+ } else {
4339
+ if (promptInputs.chatPrompt) {
4340
+ lmProviderRequest = {
4341
+ chat_prompt: promptInputs.chatPrompt,
4342
+ guideline_paths: evalCase.guideline_paths,
4343
+ error: message
4344
+ };
4345
+ } else {
4346
+ lmProviderRequest = {
4347
+ question: promptInputs.question,
4348
+ guidelines: promptInputs.guidelines,
4349
+ guideline_paths: evalCase.guideline_paths,
4350
+ error: message
4351
+ };
4352
+ }
4353
+ }
4114
4354
  return {
4115
4355
  eval_id: evalCase.id,
4116
4356
  dataset: evalCase.dataset,
@@ -4123,7 +4363,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
4123
4363
  target: targetName,
4124
4364
  timestamp: timestamp.toISOString(),
4125
4365
  raw_aspects: [],
4126
- raw_request: rawRequest,
4366
+ agent_provider_request: agentProviderRequest,
4367
+ lm_provider_request: lmProviderRequest,
4127
4368
  error: message
4128
4369
  };
4129
4370
  }
@@ -4135,6 +4376,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
4135
4376
  hash.update(promptInputs.question);
4136
4377
  hash.update(promptInputs.guidelines);
4137
4378
  hash.update(promptInputs.systemMessage ?? "");
4379
+ if (promptInputs.chatPrompt) {
4380
+ hash.update(JSON.stringify(promptInputs.chatPrompt));
4381
+ }
4138
4382
  return hash.digest("hex");
4139
4383
  }
4140
4384
  function isTimeoutLike(error) {