@agentv/core 0.7.5 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -9,7 +9,7 @@ import {
9
9
  readTextFile,
10
10
  resolveFileReference,
11
11
  resolveTargetDefinition
12
- } from "./chunk-7XM7HYRS.js";
12
+ } from "./chunk-YQBJAT5I.js";
13
13
 
14
14
  // src/evaluation/types.ts
15
15
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -268,14 +268,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
268
268
  logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
269
269
  continue;
270
270
  }
271
- if (!Array.isArray(expectedMessagesValue)) {
272
- logWarning(`Eval case '${id}' missing expected_messages array`);
273
- continue;
274
- }
271
+ const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
275
272
  const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
276
- const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
277
- if (expectedMessages.length === 0) {
278
- logWarning(`No expected message found for eval case: ${id}`);
273
+ const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
274
+ if (hasExpectedMessages && expectedMessages.length === 0) {
275
+ logWarning(`No valid expected message found for eval case: ${id}`);
279
276
  continue;
280
277
  }
281
278
  if (expectedMessages.length > 1) {
@@ -293,17 +290,17 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
293
290
  messageType: "input",
294
291
  verbose
295
292
  });
296
- const outputSegments = await processMessages({
293
+ const outputSegments = hasExpectedMessages ? await processMessages({
297
294
  messages: expectedMessages,
298
295
  searchRoots,
299
296
  repoRootPath,
300
297
  guidelinePatterns,
301
298
  messageType: "output",
302
299
  verbose
303
- });
300
+ }) : [];
304
301
  const codeSnippets = extractCodeBlocks(inputSegments);
305
302
  const expectedContent = expectedMessages[0]?.content;
306
- const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
303
+ const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
307
304
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
308
305
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
309
306
  const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
@@ -322,6 +319,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
322
319
  dataset: datasetName,
323
320
  conversation_id: conversationId,
324
321
  question,
322
+ input_messages: inputMessages,
325
323
  input_segments: inputSegments,
326
324
  output_segments: outputSegments,
327
325
  reference_answer: referenceAnswer,
@@ -349,6 +347,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
349
347
  }
350
348
  return results;
351
349
  }
350
+ function needsRoleMarkers(messages, processedSegmentsByMessage) {
351
+ if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
352
+ return true;
353
+ }
354
+ let messagesWithContent = 0;
355
+ for (const segments of processedSegmentsByMessage) {
356
+ if (hasVisibleContent(segments)) {
357
+ messagesWithContent++;
358
+ }
359
+ }
360
+ return messagesWithContent > 1;
361
+ }
362
+ function hasVisibleContent(segments) {
363
+ return segments.some((segment) => {
364
+ const type = asString(segment.type);
365
+ if (type === "text") {
366
+ const value = asString(segment.value);
367
+ return value !== void 0 && value.trim().length > 0;
368
+ }
369
+ if (type === "guideline_ref") {
370
+ return false;
371
+ }
372
+ if (type === "file") {
373
+ const text = asString(segment.text);
374
+ return text !== void 0 && text.trim().length > 0;
375
+ }
376
+ return false;
377
+ });
378
+ }
379
+ function formatSegment(segment) {
380
+ const type = asString(segment.type);
381
+ if (type === "text") {
382
+ return asString(segment.value);
383
+ }
384
+ if (type === "guideline_ref") {
385
+ const refPath = asString(segment.path);
386
+ return refPath ? `<Attached: ${refPath}>` : void 0;
387
+ }
388
+ if (type === "file") {
389
+ const text = asString(segment.text);
390
+ const filePath = asString(segment.path);
391
+ if (text && filePath) {
392
+ return `=== ${filePath} ===
393
+ ${text}`;
394
+ }
395
+ }
396
+ return void 0;
397
+ }
352
398
  async function buildPromptInputs(testCase) {
353
399
  const guidelineContents = [];
354
400
  for (const rawPath of testCase.guideline_paths) {
@@ -365,36 +411,168 @@ ${content}`);
365
411
  logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
366
412
  }
367
413
  }
368
- const questionParts = [];
414
+ const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
415
+ const segmentsByMessage = [];
416
+ const fileContentsByPath = /* @__PURE__ */ new Map();
369
417
  for (const segment of testCase.input_segments) {
370
- const typeValue = segment.type;
371
- if (typeof typeValue === "string" && typeValue === "file") {
372
- const pathValue = segment.path;
373
- const textValue = segment.text;
374
- const label = typeof pathValue === "string" ? pathValue : "file";
375
- const body = typeof textValue === "string" ? textValue : "";
376
- questionParts.push(`=== ${label} ===
377
- ${body}`);
378
- continue;
418
+ if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
419
+ fileContentsByPath.set(segment.path, segment.text);
379
420
  }
380
- if (typeof typeValue === "string" && typeValue === "text") {
381
- const value = segment.value;
382
- if (typeof value === "string") {
383
- questionParts.push(value);
421
+ }
422
+ for (const message of testCase.input_messages) {
423
+ const messageSegments = [];
424
+ if (typeof message.content === "string") {
425
+ if (message.content.trim().length > 0) {
426
+ messageSegments.push({ type: "text", value: message.content });
427
+ }
428
+ } else if (Array.isArray(message.content)) {
429
+ for (const segment of message.content) {
430
+ if (typeof segment === "string") {
431
+ if (segment.trim().length > 0) {
432
+ messageSegments.push({ type: "text", value: segment });
433
+ }
434
+ } else if (isJsonObject(segment)) {
435
+ const type = asString(segment.type);
436
+ if (type === "file") {
437
+ const value = asString(segment.value);
438
+ if (!value) continue;
439
+ if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
440
+ messageSegments.push({ type: "guideline_ref", path: value });
441
+ continue;
442
+ }
443
+ const fileText = fileContentsByPath.get(value);
444
+ if (fileText !== void 0) {
445
+ messageSegments.push({ type: "file", text: fileText, path: value });
446
+ }
447
+ } else if (type === "text") {
448
+ const textValue = asString(segment.value);
449
+ if (textValue && textValue.trim().length > 0) {
450
+ messageSegments.push({ type: "text", value: textValue });
451
+ }
452
+ }
453
+ }
454
+ }
455
+ }
456
+ segmentsByMessage.push(messageSegments);
457
+ }
458
+ const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
459
+ let question;
460
+ if (useRoleMarkers) {
461
+ const messageParts = [];
462
+ for (let i = 0; i < testCase.input_messages.length; i++) {
463
+ const message = testCase.input_messages[i];
464
+ const segments = segmentsByMessage[i];
465
+ if (!hasVisibleContent(segments)) {
466
+ continue;
467
+ }
468
+ const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
469
+ const contentParts = [];
470
+ for (const segment of segments) {
471
+ const formattedContent = formatSegment(segment);
472
+ if (formattedContent) {
473
+ contentParts.push(formattedContent);
474
+ }
475
+ }
476
+ if (contentParts.length > 0) {
477
+ const messageContent = contentParts.join("\n");
478
+ messageParts.push(`@[${roleLabel}]:
479
+ ${messageContent}`);
480
+ }
481
+ }
482
+ question = messageParts.join("\n\n");
483
+ } else {
484
+ const questionParts = [];
485
+ for (const segment of testCase.input_segments) {
486
+ const formattedContent = formatSegment(segment);
487
+ if (formattedContent) {
488
+ questionParts.push(formattedContent);
489
+ }
490
+ }
491
+ if (testCase.code_snippets.length > 0) {
492
+ questionParts.push(testCase.code_snippets.join("\n"));
493
+ }
494
+ question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
495
+ }
496
+ const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
497
+ messages: testCase.input_messages,
498
+ segmentsByMessage,
499
+ guidelinePatterns: testCase.guideline_patterns,
500
+ guidelineContent: guidelines
501
+ }) : void 0;
502
+ return { question, guidelines, chatPrompt };
503
+ }
504
+ function buildChatPromptFromSegments(options) {
505
+ const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
506
+ if (messages.length === 0) {
507
+ return void 0;
508
+ }
509
+ const systemSegments = [];
510
+ if (systemPrompt && systemPrompt.trim().length > 0) {
511
+ systemSegments.push(systemPrompt.trim());
512
+ }
513
+ if (guidelineContent && guidelineContent.trim().length > 0) {
514
+ systemSegments.push(`[[ ## Guidelines ## ]]
515
+
516
+ ${guidelineContent.trim()}`);
517
+ }
518
+ let startIndex = 0;
519
+ while (startIndex < messages.length && messages[startIndex].role === "system") {
520
+ const segments = segmentsByMessage[startIndex];
521
+ const contentParts = [];
522
+ for (const segment of segments) {
523
+ const formatted = formatSegment(segment);
524
+ if (formatted) {
525
+ contentParts.push(formatted);
384
526
  }
385
- continue;
386
527
  }
387
- const genericValue = segment.value;
388
- if (typeof genericValue === "string") {
389
- questionParts.push(genericValue);
528
+ if (contentParts.length > 0) {
529
+ systemSegments.push(contentParts.join("\n"));
390
530
  }
531
+ startIndex += 1;
391
532
  }
392
- if (testCase.code_snippets.length > 0) {
393
- questionParts.push(testCase.code_snippets.join("\n"));
533
+ const chatPrompt = [];
534
+ if (systemSegments.length > 0) {
535
+ chatPrompt.push({
536
+ role: "system",
537
+ content: systemSegments.join("\n\n")
538
+ });
394
539
  }
395
- const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
396
- const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
397
- return { question, guidelines };
540
+ for (let i = startIndex; i < messages.length; i++) {
541
+ const message = messages[i];
542
+ const segments = segmentsByMessage[i];
543
+ const contentParts = [];
544
+ let role = message.role;
545
+ let name;
546
+ if (role === "system") {
547
+ role = "assistant";
548
+ contentParts.push("@[System]:");
549
+ } else if (role === "tool") {
550
+ role = "function";
551
+ name = "tool";
552
+ }
553
+ for (const segment of segments) {
554
+ if (segment.type === "guideline_ref") {
555
+ continue;
556
+ }
557
+ const formatted = formatSegment(segment);
558
+ if (formatted) {
559
+ const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
560
+ if (isGuidelineRef) {
561
+ continue;
562
+ }
563
+ contentParts.push(formatted);
564
+ }
565
+ }
566
+ if (contentParts.length === 0) {
567
+ continue;
568
+ }
569
+ chatPrompt.push({
570
+ role,
571
+ content: contentParts.join("\n"),
572
+ ...name ? { name } : {}
573
+ });
574
+ }
575
+ return chatPrompt.length > 0 ? chatPrompt : void 0;
398
576
  }
399
577
  async function fileExists2(absolutePath) {
400
578
  try {
@@ -591,21 +769,14 @@ import { AxAI } from "@ax-llm/ax";
591
769
  var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
592
770
  function buildChatPrompt(request) {
593
771
  if (request.chatPrompt) {
594
- return request.chatPrompt;
595
- }
596
- const systemSegments = [];
597
- const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
598
- if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
599
- systemSegments.push(metadataSystemPrompt.trim());
600
- } else {
601
- systemSegments.push(DEFAULT_SYSTEM_PROMPT);
602
- }
603
- if (request.guidelines && request.guidelines.trim().length > 0) {
604
- systemSegments.push(`[[ ## Guidelines ## ]]
605
-
606
- ${request.guidelines.trim()}`);
772
+ const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
773
+ if (hasSystemMessage) {
774
+ return request.chatPrompt;
775
+ }
776
+ const systemContent2 = resolveSystemContent(request);
777
+ return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
607
778
  }
608
- const systemContent = systemSegments.join("\n\n");
779
+ const systemContent = resolveSystemContent(request);
609
780
  const userContent = request.question.trim();
610
781
  const prompt = [
611
782
  {
@@ -619,6 +790,21 @@ ${request.guidelines.trim()}`);
619
790
  ];
620
791
  return prompt;
621
792
  }
793
+ function resolveSystemContent(request) {
794
+ const systemSegments = [];
795
+ const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
796
+ if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
797
+ systemSegments.push(metadataSystemPrompt.trim());
798
+ } else {
799
+ systemSegments.push(DEFAULT_SYSTEM_PROMPT);
800
+ }
801
+ if (request.guidelines && request.guidelines.trim().length > 0) {
802
+ systemSegments.push(`[[ ## Guidelines ## ]]
803
+
804
+ ${request.guidelines.trim()}`);
805
+ }
806
+ return systemSegments.join("\n\n");
807
+ }
622
808
  function extractModelConfig(request, defaults) {
623
809
  const temperature = request.temperature ?? defaults.temperature;
624
810
  const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
@@ -662,6 +848,67 @@ function ensureChatResponse(result) {
662
848
  }
663
849
  return result;
664
850
  }
851
+ function isRetryableError(error, retryableStatusCodes) {
852
+ if (!error || typeof error !== "object") {
853
+ return false;
854
+ }
855
+ if ("status" in error && typeof error.status === "number") {
856
+ return retryableStatusCodes.includes(error.status);
857
+ }
858
+ if ("message" in error && typeof error.message === "string") {
859
+ const match = error.message.match(/HTTP (\d{3})/);
860
+ if (match) {
861
+ const status = Number.parseInt(match[1], 10);
862
+ return retryableStatusCodes.includes(status);
863
+ }
864
+ }
865
+ if ("name" in error && error.name === "AxAIServiceNetworkError") {
866
+ return true;
867
+ }
868
+ return false;
869
+ }
870
+ function calculateRetryDelay(attempt, config) {
871
+ const delay = Math.min(
872
+ config.maxDelayMs,
873
+ config.initialDelayMs * config.backoffFactor ** attempt
874
+ );
875
+ return delay * (0.75 + Math.random() * 0.5);
876
+ }
877
+ async function sleep(ms) {
878
+ return new Promise((resolve) => setTimeout(resolve, ms));
879
+ }
880
+ async function withRetry(fn, retryConfig, signal) {
881
+ const config = {
882
+ maxRetries: retryConfig?.maxRetries ?? 3,
883
+ initialDelayMs: retryConfig?.initialDelayMs ?? 1e3,
884
+ maxDelayMs: retryConfig?.maxDelayMs ?? 6e4,
885
+ backoffFactor: retryConfig?.backoffFactor ?? 2,
886
+ retryableStatusCodes: retryConfig?.retryableStatusCodes ?? [500, 408, 429, 502, 503, 504]
887
+ };
888
+ let lastError;
889
+ for (let attempt = 0; attempt <= config.maxRetries; attempt++) {
890
+ if (signal?.aborted) {
891
+ throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
892
+ }
893
+ try {
894
+ return await fn();
895
+ } catch (error) {
896
+ lastError = error;
897
+ if (attempt >= config.maxRetries) {
898
+ break;
899
+ }
900
+ if (!isRetryableError(error, config.retryableStatusCodes)) {
901
+ throw error;
902
+ }
903
+ const delay = calculateRetryDelay(attempt, config);
904
+ await sleep(delay);
905
+ if (signal?.aborted) {
906
+ throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
907
+ }
908
+ }
909
+ }
910
+ throw lastError;
911
+ }
665
912
  var AzureProvider = class {
666
913
  constructor(targetName, config) {
667
914
  this.config = config;
@@ -671,6 +918,7 @@ var AzureProvider = class {
671
918
  temperature: config.temperature,
672
919
  maxOutputTokens: config.maxOutputTokens
673
920
  };
921
+ this.retryConfig = config.retry;
674
922
  this.ai = AxAI.create({
675
923
  name: "azure-openai",
676
924
  apiKey: config.apiKey,
@@ -687,16 +935,21 @@ var AzureProvider = class {
687
935
  targetName;
688
936
  ai;
689
937
  defaults;
938
+ retryConfig;
690
939
  async invoke(request) {
691
940
  const chatPrompt = buildChatPrompt(request);
692
941
  const modelConfig = extractModelConfig(request, this.defaults);
693
- const response = await this.ai.chat(
694
- {
695
- chatPrompt,
696
- model: this.config.deploymentName,
697
- ...modelConfig ? { modelConfig } : {}
698
- },
699
- request.signal ? { abortSignal: request.signal } : void 0
942
+ const response = await withRetry(
943
+ async () => await this.ai.chat(
944
+ {
945
+ chatPrompt,
946
+ model: this.config.deploymentName,
947
+ ...modelConfig ? { modelConfig } : {}
948
+ },
949
+ request.signal ? { abortSignal: request.signal } : void 0
950
+ ),
951
+ this.retryConfig,
952
+ request.signal
700
953
  );
701
954
  return mapResponse(ensureChatResponse(response));
702
955
  }
@@ -714,6 +967,7 @@ var AnthropicProvider = class {
714
967
  maxOutputTokens: config.maxOutputTokens,
715
968
  thinkingBudget: config.thinkingBudget
716
969
  };
970
+ this.retryConfig = config.retry;
717
971
  this.ai = AxAI.create({
718
972
  name: "anthropic",
719
973
  apiKey: config.apiKey
@@ -724,16 +978,21 @@ var AnthropicProvider = class {
724
978
  targetName;
725
979
  ai;
726
980
  defaults;
981
+ retryConfig;
727
982
  async invoke(request) {
728
983
  const chatPrompt = buildChatPrompt(request);
729
984
  const modelConfig = extractModelConfig(request, this.defaults);
730
- const response = await this.ai.chat(
731
- {
732
- chatPrompt,
733
- model: this.config.model,
734
- ...modelConfig ? { modelConfig } : {}
735
- },
736
- request.signal ? { abortSignal: request.signal } : void 0
985
+ const response = await withRetry(
986
+ async () => await this.ai.chat(
987
+ {
988
+ chatPrompt,
989
+ model: this.config.model,
990
+ ...modelConfig ? { modelConfig } : {}
991
+ },
992
+ request.signal ? { abortSignal: request.signal } : void 0
993
+ ),
994
+ this.retryConfig,
995
+ request.signal
737
996
  );
738
997
  return mapResponse(ensureChatResponse(response));
739
998
  }
@@ -750,6 +1009,7 @@ var GeminiProvider = class {
750
1009
  temperature: config.temperature,
751
1010
  maxOutputTokens: config.maxOutputTokens
752
1011
  };
1012
+ this.retryConfig = config.retry;
753
1013
  this.ai = AxAI.create({
754
1014
  name: "google-gemini",
755
1015
  apiKey: config.apiKey
@@ -760,16 +1020,21 @@ var GeminiProvider = class {
760
1020
  targetName;
761
1021
  ai;
762
1022
  defaults;
1023
+ retryConfig;
763
1024
  async invoke(request) {
764
1025
  const chatPrompt = buildChatPrompt(request);
765
1026
  const modelConfig = extractModelConfig(request, this.defaults);
766
- const response = await this.ai.chat(
767
- {
768
- chatPrompt,
769
- model: this.config.model,
770
- ...modelConfig ? { modelConfig } : {}
771
- },
772
- request.signal ? { abortSignal: request.signal } : void 0
1027
+ const response = await withRetry(
1028
+ async () => await this.ai.chat(
1029
+ {
1030
+ chatPrompt,
1031
+ model: this.config.model,
1032
+ ...modelConfig ? { modelConfig } : {}
1033
+ },
1034
+ request.signal ? { abortSignal: request.signal } : void 0
1035
+ ),
1036
+ this.retryConfig,
1037
+ request.signal
773
1038
  );
774
1039
  return mapResponse(ensureChatResponse(response));
775
1040
  }
@@ -839,10 +1104,9 @@ var CliProvider = class {
839
1104
  const outputFilePath = generateOutputFilePath(request.evalCaseId);
840
1105
  const templateValues = buildTemplateValues(request, this.config, outputFilePath);
841
1106
  const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
842
- const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
843
1107
  const result = await this.runCommand(renderedCommand, {
844
1108
  cwd: this.config.cwd,
845
- env,
1109
+ env: process.env,
846
1110
  timeoutMs: this.config.timeoutMs,
847
1111
  signal: request.signal
848
1112
  });
@@ -931,10 +1195,9 @@ var CliProvider = class {
931
1195
  generateOutputFilePath("healthcheck")
932
1196
  )
933
1197
  );
934
- const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
935
1198
  const result = await this.runCommand(renderedCommand, {
936
1199
  cwd: healthcheck.cwd ?? this.config.cwd,
937
- env,
1200
+ env: process.env,
938
1201
  timeoutMs,
939
1202
  signal
940
1203
  });
@@ -2167,20 +2430,13 @@ function assertTargetDefinition(value, index, filePath) {
2167
2430
  }
2168
2431
  const name = value.name;
2169
2432
  const provider = value.provider;
2170
- const settings = value.settings;
2171
- const judgeTarget = value.judge_target;
2172
2433
  if (typeof name !== "string" || name.trim().length === 0) {
2173
2434
  throw new Error(`targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`);
2174
2435
  }
2175
2436
  if (typeof provider !== "string" || provider.trim().length === 0) {
2176
2437
  throw new Error(`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider'`);
2177
2438
  }
2178
- return {
2179
- name,
2180
- provider,
2181
- settings: isRecord(settings) ? settings : void 0,
2182
- judge_target: typeof judgeTarget === "string" ? judgeTarget : void 0
2183
- };
2439
+ return value;
2184
2440
  }
2185
2441
  async function fileExists3(filePath) {
2186
2442
  try {
@@ -2260,19 +2516,21 @@ var LlmJudgeEvaluator = class {
2260
2516
  return this.evaluateWithPrompt(context, judgeProvider);
2261
2517
  }
2262
2518
  async evaluateWithPrompt(context, judgeProvider) {
2263
- let prompt = buildQualityPrompt(context.evalCase, context.candidate);
2264
- let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
2519
+ const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
2520
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
2521
+ let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
2522
+ let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
2265
2523
  if (systemPrompt && hasTemplateVariables(systemPrompt)) {
2266
2524
  const variables = {
2267
2525
  input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
2268
2526
  output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
2269
2527
  candidate_answer: context.candidate,
2270
- reference_answer: context.evalCase.reference_answer,
2528
+ reference_answer: context.evalCase.reference_answer ?? "",
2271
2529
  expected_outcome: context.evalCase.expected_outcome,
2272
- question: context.evalCase.question
2530
+ question: formattedQuestion
2273
2531
  };
2274
2532
  prompt = substituteVariables(systemPrompt, variables);
2275
- systemPrompt = QUALITY_SYSTEM_PROMPT;
2533
+ systemPrompt = buildSystemPrompt(hasReferenceAnswer);
2276
2534
  }
2277
2535
  const metadata = {
2278
2536
  ...systemPrompt !== void 0 ? { systemPrompt } : {},
@@ -2310,38 +2568,51 @@ var LlmJudgeEvaluator = class {
2310
2568
  };
2311
2569
  }
2312
2570
  };
2313
- var QUALITY_SYSTEM_PROMPT = [
2314
- "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
2315
- "",
2316
- "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
2317
- "",
2318
- "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
2319
- "",
2320
- "You must respond with a single JSON object matching this schema:",
2321
- "",
2322
- "{",
2323
- ' "score": <number between 0.0 and 1.0>,',
2324
- ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
2325
- ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
2326
- ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
2327
- "}"
2328
- ].join("\n");
2329
- function buildQualityPrompt(evalCase, candidate) {
2571
+ function buildSystemPrompt(hasReferenceAnswer) {
2572
+ const basePrompt = [
2573
+ "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
2574
+ ""
2575
+ ];
2576
+ if (hasReferenceAnswer) {
2577
+ basePrompt.push(
2578
+ "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
2579
+ ""
2580
+ );
2581
+ }
2582
+ basePrompt.push(
2583
+ "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
2584
+ "",
2585
+ "You must respond with a single JSON object matching this schema:",
2586
+ "",
2587
+ "{",
2588
+ ' "score": <number between 0.0 and 1.0>,',
2589
+ ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
2590
+ ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
2591
+ ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
2592
+ "}"
2593
+ );
2594
+ return basePrompt.join("\n");
2595
+ }
2596
+ function buildQualityPrompt(evalCase, candidate, question) {
2330
2597
  const parts = [
2331
2598
  "[[ ## expected_outcome ## ]]",
2332
2599
  evalCase.expected_outcome.trim(),
2333
2600
  "",
2334
2601
  "[[ ## question ## ]]",
2335
- evalCase.question.trim(),
2336
- "",
2337
- "[[ ## reference_answer ## ]]",
2338
- evalCase.reference_answer.trim(),
2339
- "",
2340
- "[[ ## candidate_answer ## ]]",
2341
- candidate.trim(),
2342
- "",
2343
- "Respond with a single JSON object matching the schema described in the system prompt."
2602
+ question.trim(),
2603
+ ""
2344
2604
  ];
2605
+ if (hasNonEmptyReferenceAnswer(evalCase)) {
2606
+ parts.push(
2607
+ "[[ ## reference_answer ## ]]",
2608
+ evalCase.reference_answer.trim(),
2609
+ ""
2610
+ );
2611
+ }
2612
+ parts.push(
2613
+ "[[ ## candidate_answer ## ]]",
2614
+ candidate.trim()
2615
+ );
2345
2616
  return parts.join("\n");
2346
2617
  }
2347
2618
  function clampScore(value) {
@@ -2424,6 +2695,9 @@ function extractJsonBlob(text) {
2424
2695
  function isNonEmptyString(value) {
2425
2696
  return typeof value === "string" && value.trim().length > 0;
2426
2697
  }
2698
+ function hasNonEmptyReferenceAnswer(evalCase) {
2699
+ return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
2700
+ }
2427
2701
  var CodeEvaluator = class {
2428
2702
  kind = "code";
2429
2703
  script;
@@ -2821,10 +3095,11 @@ async function runEvaluation(options) {
2821
3095
  await onProgress({
2822
3096
  workerId,
2823
3097
  evalId: evalCase.id,
2824
- status: "completed",
3098
+ status: result.error ? "failed" : "completed",
2825
3099
  startedAt: 0,
2826
3100
  // Not used for completed status
2827
- completedAt: Date.now()
3101
+ completedAt: Date.now(),
3102
+ error: result.error
2828
3103
  });
2829
3104
  }
2830
3105
  if (onResult) {
@@ -3081,11 +3356,27 @@ async function evaluateCandidate(options) {
3081
3356
  agentTimeoutMs
3082
3357
  });
3083
3358
  const completedAt = nowFn();
3084
- const rawRequest = {
3085
- question: promptInputs.question,
3086
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
3087
- guideline_paths: evalCase.guideline_paths
3088
- };
3359
+ let agentProviderRequest;
3360
+ let lmProviderRequest;
3361
+ if (isAgentProvider(provider)) {
3362
+ agentProviderRequest = {
3363
+ question: promptInputs.question,
3364
+ guideline_paths: evalCase.guideline_paths
3365
+ };
3366
+ } else {
3367
+ if (promptInputs.chatPrompt) {
3368
+ lmProviderRequest = {
3369
+ chat_prompt: promptInputs.chatPrompt,
3370
+ guideline_paths: evalCase.guideline_paths
3371
+ };
3372
+ } else {
3373
+ lmProviderRequest = {
3374
+ question: promptInputs.question,
3375
+ guidelines: promptInputs.guidelines,
3376
+ guideline_paths: evalCase.guideline_paths
3377
+ };
3378
+ }
3379
+ }
3089
3380
  return {
3090
3381
  eval_id: evalCase.id,
3091
3382
  dataset: evalCase.dataset,
@@ -3099,7 +3390,8 @@ async function evaluateCandidate(options) {
3099
3390
  timestamp: completedAt.toISOString(),
3100
3391
  reasoning: score.reasoning,
3101
3392
  raw_aspects: score.rawAspects,
3102
- raw_request: rawRequest,
3393
+ agent_provider_request: agentProviderRequest,
3394
+ lm_provider_request: lmProviderRequest,
3103
3395
  evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
3104
3396
  evaluator_results: evaluatorResults
3105
3397
  };
@@ -3328,6 +3620,7 @@ async function invokeProvider(provider, options) {
3328
3620
  question: promptInputs.question,
3329
3621
  guidelines: promptInputs.guidelines,
3330
3622
  guideline_patterns: evalCase.guideline_patterns,
3623
+ chatPrompt: promptInputs.chatPrompt,
3331
3624
  inputFiles: evalCase.file_paths,
3332
3625
  evalCaseId: evalCase.id,
3333
3626
  attempt,
@@ -3344,12 +3637,30 @@ async function invokeProvider(provider, options) {
3344
3637
  }
3345
3638
  function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
3346
3639
  const message = error instanceof Error ? error.message : String(error);
3347
- const rawRequest = {
3348
- question: promptInputs.question,
3349
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
3350
- guideline_paths: evalCase.guideline_paths,
3351
- error: message
3352
- };
3640
+ let agentProviderRequest;
3641
+ let lmProviderRequest;
3642
+ if (isAgentProvider(provider)) {
3643
+ agentProviderRequest = {
3644
+ question: promptInputs.question,
3645
+ guideline_paths: evalCase.guideline_paths,
3646
+ error: message
3647
+ };
3648
+ } else {
3649
+ if (promptInputs.chatPrompt) {
3650
+ lmProviderRequest = {
3651
+ chat_prompt: promptInputs.chatPrompt,
3652
+ guideline_paths: evalCase.guideline_paths,
3653
+ error: message
3654
+ };
3655
+ } else {
3656
+ lmProviderRequest = {
3657
+ question: promptInputs.question,
3658
+ guidelines: promptInputs.guidelines,
3659
+ guideline_paths: evalCase.guideline_paths,
3660
+ error: message
3661
+ };
3662
+ }
3663
+ }
3353
3664
  return {
3354
3665
  eval_id: evalCase.id,
3355
3666
  dataset: evalCase.dataset,
@@ -3362,7 +3673,9 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
3362
3673
  target: targetName,
3363
3674
  timestamp: timestamp.toISOString(),
3364
3675
  raw_aspects: [],
3365
- raw_request: rawRequest
3676
+ agent_provider_request: agentProviderRequest,
3677
+ lm_provider_request: lmProviderRequest,
3678
+ error: message
3366
3679
  };
3367
3680
  }
3368
3681
  function createCacheKey(provider, target, evalCase, promptInputs) {
@@ -3373,6 +3686,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
3373
3686
  hash.update(promptInputs.question);
3374
3687
  hash.update(promptInputs.guidelines);
3375
3688
  hash.update(promptInputs.systemMessage ?? "");
3689
+ if (promptInputs.chatPrompt) {
3690
+ hash.update(JSON.stringify(promptInputs.chatPrompt));
3691
+ }
3376
3692
  return hash.digest("hex");
3377
3693
  }
3378
3694
  function isTimeoutLike(error) {