agentv 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -590,7 +590,7 @@ import fg from "fast-glob";
590
590
  import { stat as stat3 } from "node:fs/promises";
591
591
  import path15 from "node:path";
592
592
 
593
- // ../../packages/core/dist/chunk-SNTZFB24.js
593
+ // ../../packages/core/dist/chunk-YQBJAT5I.js
594
594
  import { constants } from "node:fs";
595
595
  import { access, readFile } from "node:fs/promises";
596
596
  import path from "node:path";
@@ -4636,7 +4636,7 @@ var coerce = {
4636
4636
  };
4637
4637
  var NEVER = INVALID;
4638
4638
 
4639
- // ../../packages/core/dist/chunk-SNTZFB24.js
4639
+ // ../../packages/core/dist/chunk-YQBJAT5I.js
4640
4640
  async function fileExists(filePath) {
4641
4641
  try {
4642
4642
  await access(filePath, constants.F_OK);
@@ -11947,14 +11947,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
11947
11947
  logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
11948
11948
  continue;
11949
11949
  }
11950
- if (!Array.isArray(expectedMessagesValue)) {
11951
- logWarning(`Eval case '${id}' missing expected_messages array`);
11952
- continue;
11953
- }
11950
+ const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
11954
11951
  const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
11955
- const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
11956
- if (expectedMessages.length === 0) {
11957
- logWarning(`No expected message found for eval case: ${id}`);
11952
+ const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
11953
+ if (hasExpectedMessages && expectedMessages.length === 0) {
11954
+ logWarning(`No valid expected message found for eval case: ${id}`);
11958
11955
  continue;
11959
11956
  }
11960
11957
  if (expectedMessages.length > 1) {
@@ -11972,17 +11969,17 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
11972
11969
  messageType: "input",
11973
11970
  verbose
11974
11971
  });
11975
- const outputSegments = await processMessages({
11972
+ const outputSegments = hasExpectedMessages ? await processMessages({
11976
11973
  messages: expectedMessages,
11977
11974
  searchRoots,
11978
11975
  repoRootPath,
11979
11976
  guidelinePatterns,
11980
11977
  messageType: "output",
11981
11978
  verbose
11982
- });
11979
+ }) : [];
11983
11980
  const codeSnippets = extractCodeBlocks(inputSegments);
11984
11981
  const expectedContent = expectedMessages[0]?.content;
11985
- const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
11982
+ const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
11986
11983
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
11987
11984
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
11988
11985
  const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
@@ -12001,6 +11998,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
12001
11998
  dataset: datasetName,
12002
11999
  conversation_id: conversationId,
12003
12000
  question,
12001
+ input_messages: inputMessages,
12004
12002
  input_segments: inputSegments,
12005
12003
  output_segments: outputSegments,
12006
12004
  reference_answer: referenceAnswer,
@@ -12028,6 +12026,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
12028
12026
  }
12029
12027
  return results;
12030
12028
  }
12029
/**
 * Decide whether a conversation has to be rendered with explicit role markers.
 *
 * Markers are required as soon as any message carries the "assistant" or
 * "tool" role. Otherwise they are required only when more than one message
 * has visible content according to hasVisibleContent.
 *
 * @param {Array<{role: string}>} messages - Input messages of the eval case.
 * @param {Iterable<Array<object>>} processedSegmentsByMessage - One segment list per message.
 * @returns {boolean} True when role markers should be emitted.
 */
function needsRoleMarkers(messages, processedSegmentsByMessage) {
  const hasNonUserTurn = messages.some(({ role }) => role === "assistant" || role === "tool");
  if (hasNonUserTurn) {
    return true;
  }
  // Count the messages that actually contribute something to the prompt.
  const populated = [...processedSegmentsByMessage].filter((segments) => hasVisibleContent(segments));
  return populated.length > 1;
}
12041
/**
 * Report whether a message's segment list contains anything visible.
 *
 * A "text" segment counts when its `value` is a non-blank string and a "file"
 * segment counts when its `text` is a non-blank string. "guideline_ref"
 * segments and every other segment type never count.
 *
 * @param {Array<object>} segments - Processed segments of a single message.
 * @returns {boolean} True when at least one segment carries visible content.
 */
function hasVisibleContent(segments) {
  const isNonBlank = (candidate) => candidate !== void 0 && candidate.trim().length > 0;
  return segments.some((segment) => {
    switch (asString(segment.type)) {
      case "text":
        return isNonBlank(asString(segment.value));
      case "file":
        return isNonBlank(asString(segment.text));
      default:
        // Covers "guideline_ref" as well as unknown segment types.
        return false;
    }
  });
}
12058
/**
 * Render one processed segment as prompt text.
 *
 * - "text": the segment's `value` as returned by asString.
 * - "guideline_ref": an `<Attached: path>` placeholder when `path` is set.
 * - "file": a `=== path ===` header followed by the file text, when both the
 *   text and the path are non-empty.
 *
 * @param {object} segment - Segment with a `type` discriminator.
 * @returns {string | undefined} The rendered text, or undefined when the
 *   segment produces no output.
 */
function formatSegment(segment) {
  switch (asString(segment.type)) {
    case "text":
      return asString(segment.value);
    case "guideline_ref": {
      const attachedPath = asString(segment.path);
      return attachedPath ? `<Attached: ${attachedPath}>` : void 0;
    }
    case "file": {
      const body = asString(segment.text);
      const location = asString(segment.path);
      return body && location ? `=== ${location} ===\n${body}` : void 0;
    }
    default:
      return void 0;
  }
}
12031
12077
  async function buildPromptInputs(testCase) {
12032
12078
  const guidelineContents = [];
12033
12079
  for (const rawPath of testCase.guideline_paths) {
@@ -12044,36 +12090,168 @@ ${content}`);
12044
12090
  logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
12045
12091
  }
12046
12092
  }
12047
- const questionParts = [];
12093
+ const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
12094
+ const segmentsByMessage = [];
12095
+ const fileContentsByPath = /* @__PURE__ */ new Map();
12048
12096
  for (const segment of testCase.input_segments) {
12049
- const typeValue = segment.type;
12050
- if (typeof typeValue === "string" && typeValue === "file") {
12051
- const pathValue = segment.path;
12052
- const textValue = segment.text;
12053
- const label = typeof pathValue === "string" ? pathValue : "file";
12054
- const body = typeof textValue === "string" ? textValue : "";
12055
- questionParts.push(`=== ${label} ===
12056
- ${body}`);
12057
- continue;
12097
+ if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
12098
+ fileContentsByPath.set(segment.path, segment.text);
12058
12099
  }
12059
- if (typeof typeValue === "string" && typeValue === "text") {
12060
- const value = segment.value;
12061
- if (typeof value === "string") {
12062
- questionParts.push(value);
12100
+ }
12101
+ for (const message of testCase.input_messages) {
12102
+ const messageSegments = [];
12103
+ if (typeof message.content === "string") {
12104
+ if (message.content.trim().length > 0) {
12105
+ messageSegments.push({ type: "text", value: message.content });
12106
+ }
12107
+ } else if (Array.isArray(message.content)) {
12108
+ for (const segment of message.content) {
12109
+ if (typeof segment === "string") {
12110
+ if (segment.trim().length > 0) {
12111
+ messageSegments.push({ type: "text", value: segment });
12112
+ }
12113
+ } else if (isJsonObject(segment)) {
12114
+ const type = asString(segment.type);
12115
+ if (type === "file") {
12116
+ const value = asString(segment.value);
12117
+ if (!value) continue;
12118
+ if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
12119
+ messageSegments.push({ type: "guideline_ref", path: value });
12120
+ continue;
12121
+ }
12122
+ const fileText = fileContentsByPath.get(value);
12123
+ if (fileText !== void 0) {
12124
+ messageSegments.push({ type: "file", text: fileText, path: value });
12125
+ }
12126
+ } else if (type === "text") {
12127
+ const textValue = asString(segment.value);
12128
+ if (textValue && textValue.trim().length > 0) {
12129
+ messageSegments.push({ type: "text", value: textValue });
12130
+ }
12131
+ }
12132
+ }
12063
12133
  }
12064
- continue;
12065
12134
  }
12066
- const genericValue = segment.value;
12067
- if (typeof genericValue === "string") {
12068
- questionParts.push(genericValue);
12135
+ segmentsByMessage.push(messageSegments);
12136
+ }
12137
+ const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
12138
+ let question;
12139
+ if (useRoleMarkers) {
12140
+ const messageParts = [];
12141
+ for (let i6 = 0; i6 < testCase.input_messages.length; i6++) {
12142
+ const message = testCase.input_messages[i6];
12143
+ const segments = segmentsByMessage[i6];
12144
+ if (!hasVisibleContent(segments)) {
12145
+ continue;
12146
+ }
12147
+ const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
12148
+ const contentParts = [];
12149
+ for (const segment of segments) {
12150
+ const formattedContent = formatSegment(segment);
12151
+ if (formattedContent) {
12152
+ contentParts.push(formattedContent);
12153
+ }
12154
+ }
12155
+ if (contentParts.length > 0) {
12156
+ const messageContent = contentParts.join("\n");
12157
+ messageParts.push(`@[${roleLabel}]:
12158
+ ${messageContent}`);
12159
+ }
12160
+ }
12161
+ question = messageParts.join("\n\n");
12162
+ } else {
12163
+ const questionParts = [];
12164
+ for (const segment of testCase.input_segments) {
12165
+ const formattedContent = formatSegment(segment);
12166
+ if (formattedContent) {
12167
+ questionParts.push(formattedContent);
12168
+ }
12169
+ }
12170
+ if (testCase.code_snippets.length > 0) {
12171
+ questionParts.push(testCase.code_snippets.join("\n"));
12069
12172
  }
12173
+ question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
12070
12174
  }
12071
- if (testCase.code_snippets.length > 0) {
12072
- questionParts.push(testCase.code_snippets.join("\n"));
12175
+ const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
12176
+ messages: testCase.input_messages,
12177
+ segmentsByMessage,
12178
+ guidelinePatterns: testCase.guideline_patterns,
12179
+ guidelineContent: guidelines
12180
+ }) : void 0;
12181
+ return { question, guidelines, chatPrompt };
12182
+ }
12183
+ function buildChatPromptFromSegments(options) {
12184
+ const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
12185
+ if (messages.length === 0) {
12186
+ return void 0;
12073
12187
  }
12074
- const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
12075
- const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
12076
- return { question, guidelines };
12188
+ const systemSegments = [];
12189
+ if (systemPrompt && systemPrompt.trim().length > 0) {
12190
+ systemSegments.push(systemPrompt.trim());
12191
+ }
12192
+ if (guidelineContent && guidelineContent.trim().length > 0) {
12193
+ systemSegments.push(`[[ ## Guidelines ## ]]
12194
+
12195
+ ${guidelineContent.trim()}`);
12196
+ }
12197
+ let startIndex = 0;
12198
+ while (startIndex < messages.length && messages[startIndex].role === "system") {
12199
+ const segments = segmentsByMessage[startIndex];
12200
+ const contentParts = [];
12201
+ for (const segment of segments) {
12202
+ const formatted = formatSegment(segment);
12203
+ if (formatted) {
12204
+ contentParts.push(formatted);
12205
+ }
12206
+ }
12207
+ if (contentParts.length > 0) {
12208
+ systemSegments.push(contentParts.join("\n"));
12209
+ }
12210
+ startIndex += 1;
12211
+ }
12212
+ const chatPrompt = [];
12213
+ if (systemSegments.length > 0) {
12214
+ chatPrompt.push({
12215
+ role: "system",
12216
+ content: systemSegments.join("\n\n")
12217
+ });
12218
+ }
12219
+ for (let i6 = startIndex; i6 < messages.length; i6++) {
12220
+ const message = messages[i6];
12221
+ const segments = segmentsByMessage[i6];
12222
+ const contentParts = [];
12223
+ let role = message.role;
12224
+ let name;
12225
+ if (role === "system") {
12226
+ role = "assistant";
12227
+ contentParts.push("@[System]:");
12228
+ } else if (role === "tool") {
12229
+ role = "function";
12230
+ name = "tool";
12231
+ }
12232
+ for (const segment of segments) {
12233
+ if (segment.type === "guideline_ref") {
12234
+ continue;
12235
+ }
12236
+ const formatted = formatSegment(segment);
12237
+ if (formatted) {
12238
+ const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
12239
+ if (isGuidelineRef) {
12240
+ continue;
12241
+ }
12242
+ contentParts.push(formatted);
12243
+ }
12244
+ }
12245
+ if (contentParts.length === 0) {
12246
+ continue;
12247
+ }
12248
+ chatPrompt.push({
12249
+ role,
12250
+ content: contentParts.join("\n"),
12251
+ ...name ? { name } : {}
12252
+ });
12253
+ }
12254
+ return chatPrompt.length > 0 ? chatPrompt : void 0;
12077
12255
  }
12078
12256
  async function fileExists2(absolutePath) {
12079
12257
  try {
@@ -12267,21 +12445,14 @@ ${detailBlock}${ANSI_RESET}`);
12267
12445
  var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
12268
12446
  function buildChatPrompt(request) {
12269
12447
  if (request.chatPrompt) {
12270
- return request.chatPrompt;
12271
- }
12272
- const systemSegments = [];
12273
- const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
12274
- if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
12275
- systemSegments.push(metadataSystemPrompt.trim());
12276
- } else {
12277
- systemSegments.push(DEFAULT_SYSTEM_PROMPT);
12278
- }
12279
- if (request.guidelines && request.guidelines.trim().length > 0) {
12280
- systemSegments.push(`[[ ## Guidelines ## ]]
12281
-
12282
- ${request.guidelines.trim()}`);
12448
+ const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
12449
+ if (hasSystemMessage) {
12450
+ return request.chatPrompt;
12451
+ }
12452
+ const systemContent2 = resolveSystemContent(request);
12453
+ return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
12283
12454
  }
12284
- const systemContent = systemSegments.join("\n\n");
12455
+ const systemContent = resolveSystemContent(request);
12285
12456
  const userContent = request.question.trim();
12286
12457
  const prompt = [
12287
12458
  {
@@ -12295,6 +12466,21 @@ ${request.guidelines.trim()}`);
12295
12466
  ];
12296
12467
  return prompt;
12297
12468
  }
12469
/**
 * Assemble the system message text for a provider request.
 *
 * The first section is `request.metadata.systemPrompt` (trimmed) when it is a
 * non-blank string, otherwise DEFAULT_SYSTEM_PROMPT. When `request.guidelines`
 * is non-blank, a "[[ ## Guidelines ## ]]" section with the trimmed guidelines
 * is appended. Sections are separated by a blank line.
 *
 * @param {object} request - Provider request; reads `metadata.systemPrompt` and `guidelines`.
 * @returns {string} The combined system prompt.
 */
function resolveSystemContent(request) {
  const rawPrompt = request.metadata?.systemPrompt;
  const trimmedPrompt = typeof rawPrompt === "string" ? rawPrompt.trim() : "";
  const sections = [trimmedPrompt.length > 0 ? trimmedPrompt : DEFAULT_SYSTEM_PROMPT];
  const guidelines = request.guidelines;
  if (guidelines && guidelines.trim().length > 0) {
    sections.push(`[[ ## Guidelines ## ]]\n\n${guidelines.trim()}`);
  }
  return sections.join("\n\n");
}
12298
12484
  function extractModelConfig(request, defaults) {
12299
12485
  const temperature = request.temperature ?? defaults.temperature;
12300
12486
  const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
@@ -13955,19 +14141,21 @@ var LlmJudgeEvaluator = class {
13955
14141
  return this.evaluateWithPrompt(context2, judgeProvider);
13956
14142
  }
13957
14143
  async evaluateWithPrompt(context2, judgeProvider) {
13958
- let prompt = buildQualityPrompt(context2.evalCase, context2.candidate);
13959
- let systemPrompt = context2.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
14144
+ const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context2.evalCase);
14145
+ const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
14146
+ let prompt = buildQualityPrompt(context2.evalCase, context2.candidate, formattedQuestion);
14147
+ let systemPrompt = context2.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
13960
14148
  if (systemPrompt && hasTemplateVariables(systemPrompt)) {
13961
14149
  const variables = {
13962
14150
  input_messages: JSON.stringify(context2.evalCase.input_segments, null, 2),
13963
14151
  output_messages: JSON.stringify(context2.evalCase.output_segments, null, 2),
13964
14152
  candidate_answer: context2.candidate,
13965
- reference_answer: context2.evalCase.reference_answer,
14153
+ reference_answer: context2.evalCase.reference_answer ?? "",
13966
14154
  expected_outcome: context2.evalCase.expected_outcome,
13967
- question: context2.evalCase.question
14155
+ question: formattedQuestion
13968
14156
  };
13969
14157
  prompt = substituteVariables(systemPrompt, variables);
13970
- systemPrompt = QUALITY_SYSTEM_PROMPT;
14158
+ systemPrompt = buildSystemPrompt(hasReferenceAnswer);
13971
14159
  }
13972
14160
  const metadata = {
13973
14161
  ...systemPrompt !== void 0 ? { systemPrompt } : {},
@@ -14005,38 +14193,51 @@ var LlmJudgeEvaluator = class {
14005
14193
  };
14006
14194
  }
14007
14195
  };
14008
- var QUALITY_SYSTEM_PROMPT = [
14009
- "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
14010
- "",
14011
- "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
14012
- "",
14013
- "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
14014
- "",
14015
- "You must respond with a single JSON object matching this schema:",
14016
- "",
14017
- "{",
14018
- ' "score": <number between 0.0 and 1.0>,',
14019
- ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
14020
- ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
14021
- ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
14022
- "}"
14023
- ].join("\n");
14024
- function buildQualityPrompt(evalCase, candidate) {
14196
/**
 * Build the judge's system prompt.
 *
 * The paragraph that tells the judge to treat reference_answer as a gold
 * standard is included only when `hasReferenceAnswer` is truthy; the rest of
 * the prompt (role statement, brevity instruction, JSON response schema) is
 * always present.
 *
 * @param {boolean} hasReferenceAnswer - Whether the eval case has a reference answer.
 * @returns {string} The prompt lines joined with newlines.
 */
function buildSystemPrompt(hasReferenceAnswer) {
  const referenceGuidance = hasReferenceAnswer ? [
    "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
    ""
  ] : [];
  return [
    "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
    "",
    ...referenceGuidance,
    "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
    "",
    "You must respond with a single JSON object matching this schema:",
    "",
    "{",
    ' "score": <number between 0.0 and 1.0>,',
    ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
    ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
    ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
    "}"
  ].join("\n");
}
14221
+ function buildQualityPrompt(evalCase, candidate, question) {
14025
14222
  const parts = [
14026
14223
  "[[ ## expected_outcome ## ]]",
14027
14224
  evalCase.expected_outcome.trim(),
14028
14225
  "",
14029
14226
  "[[ ## question ## ]]",
14030
- evalCase.question.trim(),
14031
- "",
14032
- "[[ ## reference_answer ## ]]",
14033
- evalCase.reference_answer.trim(),
14034
- "",
14035
- "[[ ## candidate_answer ## ]]",
14036
- candidate.trim(),
14037
- "",
14038
- "Respond with a single JSON object matching the schema described in the system prompt."
14227
+ question.trim(),
14228
+ ""
14039
14229
  ];
14230
+ if (hasNonEmptyReferenceAnswer(evalCase)) {
14231
+ parts.push(
14232
+ "[[ ## reference_answer ## ]]",
14233
+ evalCase.reference_answer.trim(),
14234
+ ""
14235
+ );
14236
+ }
14237
+ parts.push(
14238
+ "[[ ## candidate_answer ## ]]",
14239
+ candidate.trim()
14240
+ );
14040
14241
  return parts.join("\n");
14041
14242
  }
14042
14243
  function clampScore(value) {
@@ -14119,6 +14320,9 @@ function extractJsonBlob(text) {
14119
14320
  function isNonEmptyString(value) {
14120
14321
  return typeof value === "string" && value.trim().length > 0;
14121
14322
  }
14323
/**
 * Report whether an eval case carries a usable reference answer.
 *
 * Only a string with at least one non-whitespace character counts. A missing,
 * null or non-string `reference_answer` yields false instead of throwing on
 * `.trim()`, which is the same check isNonEmptyString applies to its argument.
 *
 * @param {{reference_answer?: unknown}} evalCase - Eval case to inspect.
 * @returns {boolean} True when `reference_answer` is a non-blank string.
 */
function hasNonEmptyReferenceAnswer(evalCase) {
  const referenceAnswer = evalCase.reference_answer;
  return typeof referenceAnswer === "string" && referenceAnswer.trim().length > 0;
}
14122
14326
  var CodeEvaluator = class {
14123
14327
  kind = "code";
14124
14328
  script;
@@ -14766,11 +14970,27 @@ async function evaluateCandidate(options) {
14766
14970
  agentTimeoutMs
14767
14971
  });
14768
14972
  const completedAt = nowFn();
14769
- const rawRequest = {
14770
- question: promptInputs.question,
14771
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
14772
- guideline_paths: evalCase.guideline_paths
14773
- };
14973
+ let agentProviderRequest;
14974
+ let lmProviderRequest;
14975
+ if (isAgentProvider(provider)) {
14976
+ agentProviderRequest = {
14977
+ question: promptInputs.question,
14978
+ guideline_paths: evalCase.guideline_paths
14979
+ };
14980
+ } else {
14981
+ if (promptInputs.chatPrompt) {
14982
+ lmProviderRequest = {
14983
+ chat_prompt: promptInputs.chatPrompt,
14984
+ guideline_paths: evalCase.guideline_paths
14985
+ };
14986
+ } else {
14987
+ lmProviderRequest = {
14988
+ question: promptInputs.question,
14989
+ guidelines: promptInputs.guidelines,
14990
+ guideline_paths: evalCase.guideline_paths
14991
+ };
14992
+ }
14993
+ }
14774
14994
  return {
14775
14995
  eval_id: evalCase.id,
14776
14996
  dataset: evalCase.dataset,
@@ -14784,7 +15004,8 @@ async function evaluateCandidate(options) {
14784
15004
  timestamp: completedAt.toISOString(),
14785
15005
  reasoning: score.reasoning,
14786
15006
  raw_aspects: score.rawAspects,
14787
- raw_request: rawRequest,
15007
+ agent_provider_request: agentProviderRequest,
15008
+ lm_provider_request: lmProviderRequest,
14788
15009
  evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
14789
15010
  evaluator_results: evaluatorResults
14790
15011
  };
@@ -15013,6 +15234,7 @@ async function invokeProvider(provider, options) {
15013
15234
  question: promptInputs.question,
15014
15235
  guidelines: promptInputs.guidelines,
15015
15236
  guideline_patterns: evalCase.guideline_patterns,
15237
+ chatPrompt: promptInputs.chatPrompt,
15016
15238
  inputFiles: evalCase.file_paths,
15017
15239
  evalCaseId: evalCase.id,
15018
15240
  attempt,
@@ -15029,12 +15251,30 @@ async function invokeProvider(provider, options) {
15029
15251
  }
15030
15252
  function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
15031
15253
  const message = error instanceof Error ? error.message : String(error);
15032
- const rawRequest = {
15033
- question: promptInputs.question,
15034
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
15035
- guideline_paths: evalCase.guideline_paths,
15036
- error: message
15037
- };
15254
+ let agentProviderRequest;
15255
+ let lmProviderRequest;
15256
+ if (isAgentProvider(provider)) {
15257
+ agentProviderRequest = {
15258
+ question: promptInputs.question,
15259
+ guideline_paths: evalCase.guideline_paths,
15260
+ error: message
15261
+ };
15262
+ } else {
15263
+ if (promptInputs.chatPrompt) {
15264
+ lmProviderRequest = {
15265
+ chat_prompt: promptInputs.chatPrompt,
15266
+ guideline_paths: evalCase.guideline_paths,
15267
+ error: message
15268
+ };
15269
+ } else {
15270
+ lmProviderRequest = {
15271
+ question: promptInputs.question,
15272
+ guidelines: promptInputs.guidelines,
15273
+ guideline_paths: evalCase.guideline_paths,
15274
+ error: message
15275
+ };
15276
+ }
15277
+ }
15038
15278
  return {
15039
15279
  eval_id: evalCase.id,
15040
15280
  dataset: evalCase.dataset,
@@ -15047,7 +15287,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
15047
15287
  target: targetName,
15048
15288
  timestamp: timestamp.toISOString(),
15049
15289
  raw_aspects: [],
15050
- raw_request: rawRequest,
15290
+ agent_provider_request: agentProviderRequest,
15291
+ lm_provider_request: lmProviderRequest,
15051
15292
  error: message
15052
15293
  };
15053
15294
  }
@@ -15059,6 +15300,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
15059
15300
  hash.update(promptInputs.question);
15060
15301
  hash.update(promptInputs.guidelines);
15061
15302
  hash.update(promptInputs.systemMessage ?? "");
15303
+ if (promptInputs.chatPrompt) {
15304
+ hash.update(JSON.stringify(promptInputs.chatPrompt));
15305
+ }
15062
15306
  return hash.digest("hex");
15063
15307
  }
15064
15308
  function isTimeoutLike(error) {
@@ -15486,8 +15730,6 @@ import { stripVTControlCharacters } from "node:util";
15486
15730
  var ESC = "\x1B[";
15487
15731
  var CLEAR_LINE = `${ESC}K`;
15488
15732
  var MOVE_CURSOR_UP = `${ESC}1A`;
15489
- var SYNC_START = `${ESC}?2026h`;
15490
- var SYNC_END = `${ESC}?2026l`;
15491
15733
  var ProgressDisplay = class {
15492
15734
  workers = /* @__PURE__ */ new Map();
15493
15735
  maxWorkers;
@@ -15963,14 +16205,14 @@ async function validateEvalFile(filePath) {
15963
16205
  validateMessages(inputMessages, `${location}.input_messages`, absolutePath, errors);
15964
16206
  }
15965
16207
  const expectedMessages = evalCase["expected_messages"];
15966
- if (!Array.isArray(expectedMessages)) {
16208
+ if (expectedMessages !== void 0 && !Array.isArray(expectedMessages)) {
15967
16209
  errors.push({
15968
16210
  severity: "error",
15969
16211
  filePath: absolutePath,
15970
16212
  location: `${location}.expected_messages`,
15971
- message: "Missing or invalid 'expected_messages' field (must be an array)"
16213
+ message: "Invalid 'expected_messages' field (must be an array if provided)"
15972
16214
  });
15973
- } else {
16215
+ } else if (Array.isArray(expectedMessages)) {
15974
16216
  validateMessages(expectedMessages, `${location}.expected_messages`, absolutePath, errors);
15975
16217
  }
15976
16218
  }
@@ -16006,11 +16248,13 @@ function validateMessages(messages, location, filePath, errors) {
16006
16248
  }
16007
16249
  const content = message["content"];
16008
16250
  if (typeof content === "string") {
16251
+ validateContentForRoleMarkers(content, `${msgLocation}.content`, filePath, errors);
16009
16252
  } else if (Array.isArray(content)) {
16010
16253
  for (let j2 = 0; j2 < content.length; j2++) {
16011
16254
  const contentItem = content[j2];
16012
16255
  const contentLocation = `${msgLocation}.content[${j2}]`;
16013
16256
  if (typeof contentItem === "string") {
16257
+ validateContentForRoleMarkers(contentItem, contentLocation, filePath, errors);
16014
16258
  } else if (isObject(contentItem)) {
16015
16259
  const type = contentItem["type"];
16016
16260
  if (typeof type !== "string") {
@@ -16030,6 +16274,8 @@ function validateMessages(messages, location, filePath, errors) {
16030
16274
  location: `${contentLocation}.value`,
16031
16275
  message: "Content with type 'text' must have a 'value' field"
16032
16276
  });
16277
+ } else {
16278
+ validateContentForRoleMarkers(value, `${contentLocation}.value`, filePath, errors);
16033
16279
  }
16034
16280
  }
16035
16281
  } else {
@@ -16051,6 +16297,19 @@ function validateMessages(messages, location, filePath, errors) {
16051
16297
  }
16052
16298
  }
16053
16299
  }
16300
/**
 * Append a warning to `errors` for every role marker found in `content`.
 *
 * Matching is case-insensitive and checks "@[System]:", "@[User]:",
 * "@[Assistant]:" and "@[Tool]:" in that order; each marker present produces
 * one warning entry. Non-string content is ignored.
 *
 * @param {unknown} content - Message text to scan.
 * @param {string} location - Location label copied into each warning.
 * @param {string} filePath - File path copied into each warning.
 * @param {Array<object>} errors - Collector that warnings are pushed onto (mutated).
 * @returns {void}
 */
function validateContentForRoleMarkers(content, location, filePath, errors) {
  // Guard so a malformed content value cannot crash validation with a TypeError.
  if (typeof content !== "string") {
    return;
  }
  const markers = ["@[System]:", "@[User]:", "@[Assistant]:", "@[Tool]:"];
  // Lower-case the content once instead of once per marker.
  const loweredContent = content.toLowerCase();
  for (const marker of markers) {
    if (loweredContent.includes(marker.toLowerCase())) {
      errors.push({
        severity: "warning",
        filePath,
        location,
        message: `Content contains potential role marker '${marker}'. This may confuse agentic providers or cause prompt injection.`
      });
    }
  }
}
16054
16313
  function isObject2(value) {
16055
16314
  return typeof value === "object" && value !== null && !Array.isArray(value);
16056
16315
  }
@@ -17665,4 +17924,4 @@ export {
17665
17924
  createProgram,
17666
17925
  runCli
17667
17926
  };
17668
- //# sourceMappingURL=chunk-X2VVUCIB.js.map
17927
+ //# sourceMappingURL=chunk-J5HK75TC.js.map