@agentv/core 0.7.5 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -434,14 +434,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
  logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
  continue;
  }
- if (!Array.isArray(expectedMessagesValue)) {
- logWarning(`Eval case '${id}' missing expected_messages array`);
- continue;
- }
+ const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
  const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
- const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
- if (expectedMessages.length === 0) {
- logWarning(`No expected message found for eval case: ${id}`);
+ const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
+ if (hasExpectedMessages && expectedMessages.length === 0) {
+ logWarning(`No valid expected message found for eval case: ${id}`);
  continue;
  }
  if (expectedMessages.length > 1) {
@@ -459,17 +456,17 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
  messageType: "input",
  verbose
  });
- const outputSegments = await processMessages({
+ const outputSegments = hasExpectedMessages ? await processMessages({
  messages: expectedMessages,
  searchRoots,
  repoRootPath,
  guidelinePatterns,
  messageType: "output",
  verbose
- });
+ }) : [];
  const codeSnippets = extractCodeBlocks(inputSegments);
  const expectedContent = expectedMessages[0]?.content;
- const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
+ const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
  const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
@@ -488,6 +485,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
  dataset: datasetName,
  conversation_id: conversationId,
  question,
+ input_messages: inputMessages,
  input_segments: inputSegments,
  output_segments: outputSegments,
  reference_answer: referenceAnswer,
@@ -515,6 +513,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
  }
  return results;
  }
+ function needsRoleMarkers(messages, processedSegmentsByMessage) {
+ if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
+ return true;
+ }
+ let messagesWithContent = 0;
+ for (const segments of processedSegmentsByMessage) {
+ if (hasVisibleContent(segments)) {
+ messagesWithContent++;
+ }
+ }
+ return messagesWithContent > 1;
+ }
+ function hasVisibleContent(segments) {
+ return segments.some((segment) => {
+ const type = asString(segment.type);
+ if (type === "text") {
+ const value = asString(segment.value);
+ return value !== void 0 && value.trim().length > 0;
+ }
+ if (type === "guideline_ref") {
+ return false;
+ }
+ if (type === "file") {
+ const text = asString(segment.text);
+ return text !== void 0 && text.trim().length > 0;
+ }
+ return false;
+ });
+ }
+ function formatSegment(segment) {
+ const type = asString(segment.type);
+ if (type === "text") {
+ return asString(segment.value);
+ }
+ if (type === "guideline_ref") {
+ const refPath = asString(segment.path);
+ return refPath ? `<Attached: ${refPath}>` : void 0;
+ }
+ if (type === "file") {
+ const text = asString(segment.text);
+ const filePath = asString(segment.path);
+ if (text && filePath) {
+ return `=== ${filePath} ===
+ ${text}`;
+ }
+ }
+ return void 0;
+ }
  async function buildPromptInputs(testCase) {
  const guidelineContents = [];
  for (const rawPath of testCase.guideline_paths) {
@@ -531,36 +577,168 @@ ${content}`);
  logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
  }
  }
- const questionParts = [];
+ const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
+ const segmentsByMessage = [];
+ const fileContentsByPath = /* @__PURE__ */ new Map();
  for (const segment of testCase.input_segments) {
- const typeValue = segment.type;
- if (typeof typeValue === "string" && typeValue === "file") {
- const pathValue = segment.path;
- const textValue = segment.text;
- const label = typeof pathValue === "string" ? pathValue : "file";
- const body = typeof textValue === "string" ? textValue : "";
- questionParts.push(`=== ${label} ===
- ${body}`);
- continue;
+ if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
+ fileContentsByPath.set(segment.path, segment.text);
  }
- if (typeof typeValue === "string" && typeValue === "text") {
- const value = segment.value;
- if (typeof value === "string") {
- questionParts.push(value);
+ }
+ for (const message of testCase.input_messages) {
+ const messageSegments = [];
+ if (typeof message.content === "string") {
+ if (message.content.trim().length > 0) {
+ messageSegments.push({ type: "text", value: message.content });
+ }
+ } else if (Array.isArray(message.content)) {
+ for (const segment of message.content) {
+ if (typeof segment === "string") {
+ if (segment.trim().length > 0) {
+ messageSegments.push({ type: "text", value: segment });
+ }
+ } else if (isJsonObject(segment)) {
+ const type = asString(segment.type);
+ if (type === "file") {
+ const value = asString(segment.value);
+ if (!value) continue;
+ if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
+ messageSegments.push({ type: "guideline_ref", path: value });
+ continue;
+ }
+ const fileText = fileContentsByPath.get(value);
+ if (fileText !== void 0) {
+ messageSegments.push({ type: "file", text: fileText, path: value });
+ }
+ } else if (type === "text") {
+ const textValue = asString(segment.value);
+ if (textValue && textValue.trim().length > 0) {
+ messageSegments.push({ type: "text", value: textValue });
+ }
+ }
+ }
+ }
+ }
+ segmentsByMessage.push(messageSegments);
+ }
+ const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
+ let question;
+ if (useRoleMarkers) {
+ const messageParts = [];
+ for (let i = 0; i < testCase.input_messages.length; i++) {
+ const message = testCase.input_messages[i];
+ const segments = segmentsByMessage[i];
+ if (!hasVisibleContent(segments)) {
+ continue;
+ }
+ const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
+ const contentParts = [];
+ for (const segment of segments) {
+ const formattedContent = formatSegment(segment);
+ if (formattedContent) {
+ contentParts.push(formattedContent);
+ }
+ }
+ if (contentParts.length > 0) {
+ const messageContent = contentParts.join("\n");
+ messageParts.push(`@[${roleLabel}]:
+ ${messageContent}`);
  }
- continue;
  }
- const genericValue = segment.value;
- if (typeof genericValue === "string") {
- questionParts.push(genericValue);
+ question = messageParts.join("\n\n");
+ } else {
+ const questionParts = [];
+ for (const segment of testCase.input_segments) {
+ const formattedContent = formatSegment(segment);
+ if (formattedContent) {
+ questionParts.push(formattedContent);
+ }
  }
+ if (testCase.code_snippets.length > 0) {
+ questionParts.push(testCase.code_snippets.join("\n"));
+ }
+ question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
  }
- if (testCase.code_snippets.length > 0) {
- questionParts.push(testCase.code_snippets.join("\n"));
+ const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
+ messages: testCase.input_messages,
+ segmentsByMessage,
+ guidelinePatterns: testCase.guideline_patterns,
+ guidelineContent: guidelines
+ }) : void 0;
+ return { question, guidelines, chatPrompt };
+ }
+ function buildChatPromptFromSegments(options) {
+ const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
+ if (messages.length === 0) {
+ return void 0;
  }
- const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
- const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
- return { question, guidelines };
+ const systemSegments = [];
+ if (systemPrompt && systemPrompt.trim().length > 0) {
+ systemSegments.push(systemPrompt.trim());
+ }
+ if (guidelineContent && guidelineContent.trim().length > 0) {
+ systemSegments.push(`[[ ## Guidelines ## ]]
+
+ ${guidelineContent.trim()}`);
+ }
+ let startIndex = 0;
+ while (startIndex < messages.length && messages[startIndex].role === "system") {
+ const segments = segmentsByMessage[startIndex];
+ const contentParts = [];
+ for (const segment of segments) {
+ const formatted = formatSegment(segment);
+ if (formatted) {
+ contentParts.push(formatted);
+ }
+ }
+ if (contentParts.length > 0) {
+ systemSegments.push(contentParts.join("\n"));
+ }
+ startIndex += 1;
+ }
+ const chatPrompt = [];
+ if (systemSegments.length > 0) {
+ chatPrompt.push({
+ role: "system",
+ content: systemSegments.join("\n\n")
+ });
+ }
+ for (let i = startIndex; i < messages.length; i++) {
+ const message = messages[i];
+ const segments = segmentsByMessage[i];
+ const contentParts = [];
+ let role = message.role;
+ let name;
+ if (role === "system") {
+ role = "assistant";
+ contentParts.push("@[System]:");
+ } else if (role === "tool") {
+ role = "function";
+ name = "tool";
+ }
+ for (const segment of segments) {
+ if (segment.type === "guideline_ref") {
+ continue;
+ }
+ const formatted = formatSegment(segment);
+ if (formatted) {
+ const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
+ if (isGuidelineRef) {
+ continue;
+ }
+ contentParts.push(formatted);
+ }
+ }
+ if (contentParts.length === 0) {
+ continue;
+ }
+ chatPrompt.push({
+ role,
+ content: contentParts.join("\n"),
+ ...name ? { name } : {}
+ });
+ }
+ return chatPrompt.length > 0 ? chatPrompt : void 0;
  }
  async function fileExists2(absolutePath) {
  try {
@@ -757,21 +935,14 @@ var import_ax = require("@ax-llm/ax");
  var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
  function buildChatPrompt(request) {
  if (request.chatPrompt) {
- return request.chatPrompt;
- }
- const systemSegments = [];
- const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
- if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
- systemSegments.push(metadataSystemPrompt.trim());
- } else {
- systemSegments.push(DEFAULT_SYSTEM_PROMPT);
- }
- if (request.guidelines && request.guidelines.trim().length > 0) {
- systemSegments.push(`[[ ## Guidelines ## ]]
-
- ${request.guidelines.trim()}`);
+ const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
+ if (hasSystemMessage) {
+ return request.chatPrompt;
+ }
+ const systemContent2 = resolveSystemContent(request);
+ return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
  }
- const systemContent = systemSegments.join("\n\n");
+ const systemContent = resolveSystemContent(request);
  const userContent = request.question.trim();
  const prompt = [
  {
@@ -785,6 +956,21 @@ ${request.guidelines.trim()}`);
  ];
  return prompt;
  }
+ function resolveSystemContent(request) {
+ const systemSegments = [];
+ const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
+ if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
+ systemSegments.push(metadataSystemPrompt.trim());
+ } else {
+ systemSegments.push(DEFAULT_SYSTEM_PROMPT);
+ }
+ if (request.guidelines && request.guidelines.trim().length > 0) {
+ systemSegments.push(`[[ ## Guidelines ## ]]
+
+ ${request.guidelines.trim()}`);
+ }
+ return systemSegments.join("\n\n");
+ }
  function extractModelConfig(request, defaults) {
  const temperature = request.temperature ?? defaults.temperature;
  const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
@@ -828,6 +1014,67 @@ function ensureChatResponse(result) {
  }
  return result;
  }
+ function isRetryableError(error, retryableStatusCodes) {
+ if (!error || typeof error !== "object") {
+ return false;
+ }
+ if ("status" in error && typeof error.status === "number") {
+ return retryableStatusCodes.includes(error.status);
+ }
+ if ("message" in error && typeof error.message === "string") {
+ const match = error.message.match(/HTTP (\d{3})/);
+ if (match) {
+ const status = Number.parseInt(match[1], 10);
+ return retryableStatusCodes.includes(status);
+ }
+ }
+ if ("name" in error && error.name === "AxAIServiceNetworkError") {
+ return true;
+ }
+ return false;
+ }
+ function calculateRetryDelay(attempt, config) {
+ const delay = Math.min(
+ config.maxDelayMs,
+ config.initialDelayMs * config.backoffFactor ** attempt
+ );
+ return delay * (0.75 + Math.random() * 0.5);
+ }
+ async function sleep(ms) {
+ return new Promise((resolve) => setTimeout(resolve, ms));
+ }
+ async function withRetry(fn, retryConfig, signal) {
+ const config = {
+ maxRetries: retryConfig?.maxRetries ?? 3,
+ initialDelayMs: retryConfig?.initialDelayMs ?? 1e3,
+ maxDelayMs: retryConfig?.maxDelayMs ?? 6e4,
+ backoffFactor: retryConfig?.backoffFactor ?? 2,
+ retryableStatusCodes: retryConfig?.retryableStatusCodes ?? [500, 408, 429, 502, 503, 504]
+ };
+ let lastError;
+ for (let attempt = 0; attempt <= config.maxRetries; attempt++) {
+ if (signal?.aborted) {
+ throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
+ }
+ try {
+ return await fn();
+ } catch (error) {
+ lastError = error;
+ if (attempt >= config.maxRetries) {
+ break;
+ }
+ if (!isRetryableError(error, config.retryableStatusCodes)) {
+ throw error;
+ }
+ const delay = calculateRetryDelay(attempt, config);
+ await sleep(delay);
+ if (signal?.aborted) {
+ throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
+ }
+ }
+ }
+ throw lastError;
+ }
  var AzureProvider = class {
  constructor(targetName, config) {
  this.config = config;
@@ -837,6 +1084,7 @@ var AzureProvider = class {
  temperature: config.temperature,
  maxOutputTokens: config.maxOutputTokens
  };
+ this.retryConfig = config.retry;
  this.ai = import_ax.AxAI.create({
  name: "azure-openai",
  apiKey: config.apiKey,
@@ -853,16 +1101,21 @@ var AzureProvider = class {
  targetName;
  ai;
  defaults;
+ retryConfig;
  async invoke(request) {
  const chatPrompt = buildChatPrompt(request);
  const modelConfig = extractModelConfig(request, this.defaults);
- const response = await this.ai.chat(
- {
- chatPrompt,
- model: this.config.deploymentName,
- ...modelConfig ? { modelConfig } : {}
- },
- request.signal ? { abortSignal: request.signal } : void 0
+ const response = await withRetry(
+ async () => await this.ai.chat(
+ {
+ chatPrompt,
+ model: this.config.deploymentName,
+ ...modelConfig ? { modelConfig } : {}
+ },
+ request.signal ? { abortSignal: request.signal } : void 0
+ ),
+ this.retryConfig,
+ request.signal
  );
  return mapResponse(ensureChatResponse(response));
  }
@@ -880,6 +1133,7 @@ var AnthropicProvider = class {
  maxOutputTokens: config.maxOutputTokens,
  thinkingBudget: config.thinkingBudget
  };
+ this.retryConfig = config.retry;
  this.ai = import_ax.AxAI.create({
  name: "anthropic",
  apiKey: config.apiKey
@@ -890,16 +1144,21 @@ var AnthropicProvider = class {
  targetName;
  ai;
  defaults;
+ retryConfig;
  async invoke(request) {
  const chatPrompt = buildChatPrompt(request);
  const modelConfig = extractModelConfig(request, this.defaults);
- const response = await this.ai.chat(
- {
- chatPrompt,
- model: this.config.model,
- ...modelConfig ? { modelConfig } : {}
- },
- request.signal ? { abortSignal: request.signal } : void 0
+ const response = await withRetry(
+ async () => await this.ai.chat(
+ {
+ chatPrompt,
+ model: this.config.model,
+ ...modelConfig ? { modelConfig } : {}
+ },
+ request.signal ? { abortSignal: request.signal } : void 0
+ ),
+ this.retryConfig,
+ request.signal
  );
  return mapResponse(ensureChatResponse(response));
  }
@@ -916,6 +1175,7 @@ var GeminiProvider = class {
  temperature: config.temperature,
  maxOutputTokens: config.maxOutputTokens
  };
+ this.retryConfig = config.retry;
  this.ai = import_ax.AxAI.create({
  name: "google-gemini",
  apiKey: config.apiKey
@@ -926,16 +1186,21 @@ var GeminiProvider = class {
  targetName;
  ai;
  defaults;
+ retryConfig;
  async invoke(request) {
  const chatPrompt = buildChatPrompt(request);
  const modelConfig = extractModelConfig(request, this.defaults);
- const response = await this.ai.chat(
- {
- chatPrompt,
- model: this.config.model,
- ...modelConfig ? { modelConfig } : {}
- },
- request.signal ? { abortSignal: request.signal } : void 0
+ const response = await withRetry(
+ async () => await this.ai.chat(
+ {
+ chatPrompt,
+ model: this.config.model,
+ ...modelConfig ? { modelConfig } : {}
+ },
+ request.signal ? { abortSignal: request.signal } : void 0
+ ),
+ this.retryConfig,
+ request.signal
  );
  return mapResponse(ensureChatResponse(response));
  }
@@ -1005,10 +1270,9 @@ var CliProvider = class {
  const outputFilePath = generateOutputFilePath(request.evalCaseId);
  const templateValues = buildTemplateValues(request, this.config, outputFilePath);
  const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
- const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
  const result = await this.runCommand(renderedCommand, {
  cwd: this.config.cwd,
- env,
+ env: process.env,
  timeoutMs: this.config.timeoutMs,
  signal: request.signal
  });
@@ -1097,10 +1361,9 @@ var CliProvider = class {
  generateOutputFilePath("healthcheck")
  )
  );
- const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
  const result = await this.runCommand(renderedCommand, {
  cwd: healthcheck.cwd ?? this.config.cwd,
- env,
+ env: process.env,
  timeoutMs,
  signal
  });
@@ -2051,10 +2314,9 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID
  var BASE_TARGET_SCHEMA = import_zod.z.object({
  name: import_zod.z.string().min(1, "target name is required"),
  provider: import_zod.z.string().min(1, "provider is required"),
- settings: import_zod.z.record(import_zod.z.unknown()).optional(),
  judge_target: import_zod.z.string().optional(),
  workers: import_zod.z.number().int().min(1).optional()
- });
+ }).passthrough();
  var DEFAULT_AZURE_API_VERSION = "2024-10-01-preview";
  function normalizeAzureApiVersion(value) {
  if (!value) {
@@ -2067,11 +2329,43 @@ function normalizeAzureApiVersion(value) {
  const withoutPrefix = trimmed.replace(/^api[-_]?version\s*=\s*/i, "").trim();
  return withoutPrefix.length > 0 ? withoutPrefix : DEFAULT_AZURE_API_VERSION;
  }
+ function resolveRetryConfig(target) {
+ const maxRetries = resolveOptionalNumber(
+ target.max_retries ?? target.maxRetries,
+ `${target.name} max retries`
+ );
+ const initialDelayMs = resolveOptionalNumber(
+ target.retry_initial_delay_ms ?? target.retryInitialDelayMs,
+ `${target.name} retry initial delay`
+ );
+ const maxDelayMs = resolveOptionalNumber(
+ target.retry_max_delay_ms ?? target.retryMaxDelayMs,
+ `${target.name} retry max delay`
+ );
+ const backoffFactor = resolveOptionalNumber(
+ target.retry_backoff_factor ?? target.retryBackoffFactor,
+ `${target.name} retry backoff factor`
+ );
+ const retryableStatusCodes = resolveOptionalNumberArray(
+ target.retry_status_codes ?? target.retryStatusCodes,
+ `${target.name} retry status codes`
+ );
+ if (maxRetries === void 0 && initialDelayMs === void 0 && maxDelayMs === void 0 && backoffFactor === void 0 && retryableStatusCodes === void 0) {
+ return void 0;
+ }
+ return {
+ maxRetries,
+ initialDelayMs,
+ maxDelayMs,
+ backoffFactor,
+ retryableStatusCodes
+ };
+ }
  function resolveTargetDefinition(definition, env = process.env) {
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
  const provider = parsed.provider.toLowerCase();
  const providerBatching = resolveOptionalBoolean(
- parsed.settings?.provider_batching ?? parsed.settings?.providerBatching
+ parsed.provider_batching ?? parsed.providerBatching
  );
  switch (provider) {
  case "azure":
@@ -2147,13 +2441,12 @@ function resolveTargetDefinition(definition, env = process.env) {
  }
  }
  function resolveAzureConfig(target, env) {
- const settings = target.settings ?? {};
- const endpointSource = settings.endpoint ?? settings.resource ?? settings.resourceName;
- const apiKeySource = settings.api_key ?? settings.apiKey;
- const deploymentSource = settings.deployment ?? settings.deploymentName ?? settings.model;
- const versionSource = settings.version ?? settings.api_version;
- const temperatureSource = settings.temperature;
- const maxTokensSource = settings.max_output_tokens ?? settings.maxTokens;
+ const endpointSource = target.endpoint ?? target.resource ?? target.resourceName;
+ const apiKeySource = target.api_key ?? target.apiKey;
+ const deploymentSource = target.deployment ?? target.deploymentName ?? target.model;
+ const versionSource = target.version ?? target.api_version;
+ const temperatureSource = target.temperature;
+ const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
  const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
  const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
  const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
@@ -2165,58 +2458,61 @@ function resolveAzureConfig(target, env) {
  maxTokensSource,
  `${target.name} max output tokens`
  );
+ const retry = resolveRetryConfig(target);
  return {
  resourceName,
  deploymentName,
  apiKey,
  version,
  temperature,
- maxOutputTokens
+ maxOutputTokens,
+ retry
  };
  }
  function resolveAnthropicConfig(target, env) {
- const settings = target.settings ?? {};
- const apiKeySource = settings.api_key ?? settings.apiKey;
- const modelSource = settings.model ?? settings.deployment ?? settings.variant;
- const temperatureSource = settings.temperature;
- const maxTokensSource = settings.max_output_tokens ?? settings.maxTokens;
- const thinkingBudgetSource = settings.thinking_budget ?? settings.thinkingBudget;
+ const apiKeySource = target.api_key ?? target.apiKey;
+ const modelSource = target.model ?? target.deployment ?? target.variant;
+ const temperatureSource = target.temperature;
+ const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
+ const thinkingBudgetSource = target.thinking_budget ?? target.thinkingBudget;
  const apiKey = resolveString(apiKeySource, env, `${target.name} Anthropic api key`);
  const model = resolveString(modelSource, env, `${target.name} Anthropic model`);
+ const retry = resolveRetryConfig(target);
  return {
  apiKey,
  model,
  temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
  maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
- thinkingBudget: resolveOptionalNumber(thinkingBudgetSource, `${target.name} thinking budget`)
+ thinkingBudget: resolveOptionalNumber(thinkingBudgetSource, `${target.name} thinking budget`),
+ retry
  };
  }
  function resolveGeminiConfig(target, env) {
- const settings = target.settings ?? {};
- const apiKeySource = settings.api_key ?? settings.apiKey;
- const modelSource = settings.model ?? settings.deployment ?? settings.variant;
- const temperatureSource = settings.temperature;
- const maxTokensSource = settings.max_output_tokens ?? settings.maxTokens;
+ const apiKeySource = target.api_key ?? target.apiKey;
+ const modelSource = target.model ?? target.deployment ?? target.variant;
+ const temperatureSource = target.temperature;
+ const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
  const apiKey = resolveString(apiKeySource, env, `${target.name} Google API key`);
  const model = resolveOptionalString(modelSource, env, `${target.name} Gemini model`, {
  allowLiteral: true,
  optionalEnv: true
  }) ?? "gemini-2.5-flash";
+ const retry = resolveRetryConfig(target);
  return {
  apiKey,
  model,
  temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
- maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`)
+ maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
+ retry
  };
  }
  function resolveCodexConfig(target, env) {
- const settings = target.settings ?? {};
- const executableSource = settings.executable ?? settings.command ?? settings.binary;
- const argsSource = settings.args ?? settings.arguments;
- const cwdSource = settings.cwd;
- const timeoutSource = settings.timeout_seconds ?? settings.timeoutSeconds;
- const logDirSource = settings.log_dir ?? settings.logDir ?? settings.log_directory ?? settings.logDirectory;
- const logFormatSource = settings.log_format ?? settings.logFormat ?? settings.log_output_format ?? settings.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
+ const executableSource = target.executable ?? target.command ?? target.binary;
+ const argsSource = target.args ?? target.arguments;
+ const cwdSource = target.cwd;
+ const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
+ const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
+ const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
  const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
  allowLiteral: true,
  optionalEnv: true
@@ -2255,21 +2551,19 @@ function normalizeCodexLogFormat(value) {
  throw new Error("codex log format must be 'summary' or 'json'");
  }
  function resolveMockConfig(target) {
- const settings = target.settings ?? {};
- const response = typeof settings.response === "string" ? settings.response : void 0;
+ const response = typeof target.response === "string" ? target.response : void 0;
  return { response };
  }
  function resolveVSCodeConfig(target, env, insiders) {
- const settings = target.settings ?? {};
- const workspaceTemplateEnvVar = resolveOptionalLiteralString(settings.workspace_template ?? settings.workspaceTemplate);
+ const workspaceTemplateEnvVar = resolveOptionalLiteralString(target.workspace_template ?? target.workspaceTemplate);
  const workspaceTemplate = workspaceTemplateEnvVar ? resolveOptionalString(workspaceTemplateEnvVar, env, `${target.name} workspace template path`, {
  allowLiteral: false,
  optionalEnv: true
  }) : void 0;
- const commandSource = settings.vscode_cmd ?? settings.command;
- const waitSource = settings.wait;
- const dryRunSource = settings.dry_run ?? settings.dryRun;
- const subagentRootSource = settings.subagent_root ?? settings.subagentRoot;
+ const commandSource = target.vscode_cmd ?? target.command;
+ const waitSource = target.wait;
+ const dryRunSource = target.dry_run ?? target.dryRun;
+ const subagentRootSource = target.subagent_root ?? target.subagentRoot;
  const defaultCommand = insiders ? "code-insiders" : "code";
  const command = resolveOptionalLiteralString(commandSource) ?? defaultCommand;
  return {
@@ -2284,18 +2578,16 @@ function resolveVSCodeConfig(target, env, insiders) {
  };
  }
  function resolveCliConfig(target, env) {
- const settings = target.settings ?? {};
- const commandTemplateSource = settings.command_template ?? settings.commandTemplate;
+ const commandTemplateSource = target.command_template ?? target.commandTemplate;
  const filesFormat = resolveOptionalLiteralString(
- settings.files_format ?? settings.filesFormat ?? settings.attachments_format ?? settings.attachmentsFormat
+ target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
  );
- const cwd = resolveOptionalString(settings.cwd, env, `${target.name} working directory`, {
+ const cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
  allowLiteral: true,
  optionalEnv: true
  });
- const envOverrides = resolveEnvOverrides(settings.env, env, target.name);
- const timeoutMs = resolveTimeoutMs(settings.timeout_seconds ?? settings.timeoutSeconds, `${target.name} timeout`);
- const healthcheck = resolveCliHealthcheck(settings.healthcheck, env, target.name);
+ const timeoutMs = resolveTimeoutMs(target.timeout_seconds ?? target.timeoutSeconds, `${target.name} timeout`);
+ const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name);
  const commandTemplate = resolveString(
  commandTemplateSource,
  env,
@@ -2307,29 +2599,10 @@ function resolveCliConfig(target, env) {
  commandTemplate,
  filesFormat,
  cwd,
- env: envOverrides,
  timeoutMs,
  healthcheck
  };
  }
- function resolveEnvOverrides(source, env, targetName) {
- if (source === void 0 || source === null) {
- return void 0;
- }
- if (typeof source !== "object" || Array.isArray(source)) {
- throw new Error(`${targetName} env overrides must be an object map of strings`);
- }
- const entries = Object.entries(source);
- const resolved = {};
- for (const [key, value] of entries) {
- if (typeof value !== "string") {
- throw new Error(`${targetName} env override '${key}' must be a string`);
- }
- const resolvedValue = resolveString(value, env, `${targetName} env override '${key}'`);
- resolved[key] = resolvedValue;
- }
- return Object.keys(resolved).length > 0 ? resolved : void 0;
- }
  function resolveTimeoutMs(source, description) {
  const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
  if (seconds === void 0) {
@@ -2525,6 +2798,26 @@ function resolveOptionalStringArray(source, env, description) {
  }
  return resolved.length > 0 ? resolved : void 0;
  }
+ function resolveOptionalNumberArray(source, description) {
+ if (source === void 0 || source === null) {
+ return void 0;
+ }
+ if (!Array.isArray(source)) {
+ throw new Error(`${description} must be an array of numbers`);
+ }
+ if (source.length === 0) {
+ return void 0;
+ }
+ const resolved = [];
+ for (let i = 0; i < source.length; i++) {
+ const item = source[i];
+ if (typeof item !== "number" || !Number.isFinite(item)) {
+ throw new Error(`${description}[${i}] must be a number`);
+ }
+ resolved.push(item);
+ }
+ return resolved.length > 0 ? resolved : void 0;
+ }

  // src/evaluation/providers/vscode.ts
  var import_node_path6 = __toESM(require("path"), 1);
@@ -2784,7 +3077,7 @@ var AGENT_PROVIDER_KINDS = [
  "vscode",
  "vscode-insiders"
  ];
- var TARGETS_SCHEMA_V2 = "agentv-targets-v2.1";
+ var TARGETS_SCHEMA_V2 = "agentv-targets-v2.2";
  function isAgentProvider(provider) {
  return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
  }
@@ -2827,20 +3120,13 @@ function assertTargetDefinition(value, index, filePath) {
  }
  const name = value.name;
  const provider = value.provider;
- const settings = value.settings;
- const judgeTarget = value.judge_target;
  if (typeof name !== "string" || name.trim().length === 0) {
  throw new Error(`targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`);
  }
  if (typeof provider !== "string" || provider.trim().length === 0) {
  throw new Error(`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider'`);
  }
- return {
- name,
- provider,
- settings: isRecord(settings) ? settings : void 0,
- judge_target: typeof judgeTarget === "string" ? judgeTarget : void 0
- };
+ return value;
  }
  async function fileExists3(filePath) {
  try {
@@ -2920,19 +3206,21 @@ var LlmJudgeEvaluator = class {
  return this.evaluateWithPrompt(context, judgeProvider);
  }
  async evaluateWithPrompt(context, judgeProvider) {
- let prompt = buildQualityPrompt(context.evalCase, context.candidate);
- let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
+ const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
+ let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
+ let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
  if (systemPrompt && hasTemplateVariables(systemPrompt)) {
  const variables = {
  input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
  output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
  candidate_answer: context.candidate,
- reference_answer: context.evalCase.reference_answer,
+ reference_answer: context.evalCase.reference_answer ?? "",
  expected_outcome: context.evalCase.expected_outcome,
- question: context.evalCase.question
+ question: formattedQuestion
  };
  prompt = substituteVariables(systemPrompt, variables);
- systemPrompt = QUALITY_SYSTEM_PROMPT;
+ systemPrompt = buildSystemPrompt(hasReferenceAnswer);
  }
  const metadata = {
  ...systemPrompt !== void 0 ? { systemPrompt } : {},
@@ -2970,38 +3258,51 @@ var LlmJudgeEvaluator = class {
  };
  }
  };
- var QUALITY_SYSTEM_PROMPT = [
- "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
- "",
- "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
- "",
- "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
- "",
- "You must respond with a single JSON object matching this schema:",
- "",
- "{",
- ' "score": <number between 0.0 and 1.0>,',
- ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
- ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
- ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
- "}"
- ].join("\n");
- function buildQualityPrompt(evalCase, candidate) {
+ function buildSystemPrompt(hasReferenceAnswer) {
+ const basePrompt = [
+ "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
+ ""
+ ];
+ if (hasReferenceAnswer) {
+ basePrompt.push(
+ "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
+ ""
+ );
+ }
+ basePrompt.push(
+ "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
+ "",
+ "You must respond with a single JSON object matching this schema:",
+ "",
+ "{",
+ ' "score": <number between 0.0 and 1.0>,',
+ ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
+ ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
+ ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
+ "}"
+ );
+ return basePrompt.join("\n");
+ }
+ function buildQualityPrompt(evalCase, candidate, question) {
  const parts = [
  "[[ ## expected_outcome ## ]]",
  evalCase.expected_outcome.trim(),
  "",
  "[[ ## question ## ]]",
- evalCase.question.trim(),
- "",
- "[[ ## reference_answer ## ]]",
- evalCase.reference_answer.trim(),
- "",
- "[[ ## candidate_answer ## ]]",
- candidate.trim(),
- "",
- "Respond with a single JSON object matching the schema described in the system prompt."
+ question.trim(),
+ ""
  ];
+ if (hasNonEmptyReferenceAnswer(evalCase)) {
+ parts.push(
+ "[[ ## reference_answer ## ]]",
+ evalCase.reference_answer.trim(),
+ ""
+ );
+ }
+ parts.push(
+ "[[ ## candidate_answer ## ]]",
+ candidate.trim()
+ );
  return parts.join("\n");
  }
  function clampScore(value) {
@@ -3084,6 +3385,9 @@ function extractJsonBlob(text) {
  function isNonEmptyString(value) {
  return typeof value === "string" && value.trim().length > 0;
  }
+ function hasNonEmptyReferenceAnswer(evalCase) {
+ return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
+ }
  var CodeEvaluator = class {
  kind = "code";
  script;
@@ -3481,10 +3785,11 @@ async function runEvaluation(options) {
  await onProgress({
  workerId,
  evalId: evalCase.id,
- status: "completed",
+ status: result.error ? "failed" : "completed",
  startedAt: 0,
  // Not used for completed status
- completedAt: Date.now()
+ completedAt: Date.now(),
+ error: result.error
  });
  }
  if (onResult) {
@@ -3741,11 +4046,27 @@ async function evaluateCandidate(options) {
  agentTimeoutMs
  });
  const completedAt = nowFn();
- const rawRequest = {
- question: promptInputs.question,
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
- guideline_paths: evalCase.guideline_paths
- };
+ let agentProviderRequest;
+ let lmProviderRequest;
+ if (isAgentProvider(provider)) {
+ agentProviderRequest = {
+ question: promptInputs.question,
+ guideline_paths: evalCase.guideline_paths
+ };
+ } else {
+ if (promptInputs.chatPrompt) {
+ lmProviderRequest = {
+ chat_prompt: promptInputs.chatPrompt,
+ guideline_paths: evalCase.guideline_paths
+ };
+ } else {
+ lmProviderRequest = {
+ question: promptInputs.question,
+ guidelines: promptInputs.guidelines,
+ guideline_paths: evalCase.guideline_paths
+ };
+ }
+ }
  return {
  eval_id: evalCase.id,
  dataset: evalCase.dataset,
@@ -3759,7 +4080,8 @@ async function evaluateCandidate(options) {
  timestamp: completedAt.toISOString(),
  reasoning: score.reasoning,
  raw_aspects: score.rawAspects,
- raw_request: rawRequest,
+ agent_provider_request: agentProviderRequest,
+ lm_provider_request: lmProviderRequest,
  evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
  evaluator_results: evaluatorResults
  };
@@ -3988,6 +4310,7 @@ async function invokeProvider(provider, options) {
  question: promptInputs.question,
  guidelines: promptInputs.guidelines,
  guideline_patterns: evalCase.guideline_patterns,
+ chatPrompt: promptInputs.chatPrompt,
  inputFiles: evalCase.file_paths,
  evalCaseId: evalCase.id,
  attempt,
@@ -4004,12 +4327,30 @@ async function invokeProvider(provider, options) {
  }
  function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
  const message = error instanceof Error ? error.message : String(error);
- const rawRequest = {
- question: promptInputs.question,
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
- guideline_paths: evalCase.guideline_paths,
- error: message
- };
+ let agentProviderRequest;
+ let lmProviderRequest;
+ if (isAgentProvider(provider)) {
+ agentProviderRequest = {
+ question: promptInputs.question,
+ guideline_paths: evalCase.guideline_paths,
+ error: message
+ };
+ } else {
+ if (promptInputs.chatPrompt) {
+ lmProviderRequest = {
+ chat_prompt: promptInputs.chatPrompt,
+ guideline_paths: evalCase.guideline_paths,
+ error: message
+ };
+ } else {
+ lmProviderRequest = {
+ question: promptInputs.question,
+ guidelines: promptInputs.guidelines,
+ guideline_paths: evalCase.guideline_paths,
+ error: message
+ };
+ }
+ }
  return {
  eval_id: evalCase.id,
  dataset: evalCase.dataset,
@@ -4022,7 +4363,9 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
  target: targetName,
  timestamp: timestamp.toISOString(),
  raw_aspects: [],
- raw_request: rawRequest
+ agent_provider_request: agentProviderRequest,
+ lm_provider_request: lmProviderRequest,
+ error: message
  };
  }
  function createCacheKey(provider, target, evalCase, promptInputs) {
@@ -4033,6 +4376,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
  hash.update(promptInputs.question);
  hash.update(promptInputs.guidelines);
  hash.update(promptInputs.systemMessage ?? "");
+ if (promptInputs.chatPrompt) {
+ hash.update(JSON.stringify(promptInputs.chatPrompt));
+ }
  return hash.digest("hex");
  }
  function isTimeoutLike(error) {