agentv 0.9.0 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -590,7 +590,7 @@ import fg from "fast-glob";
  import { stat as stat3 } from "node:fs/promises";
  import path15 from "node:path";

- // ../../packages/core/dist/chunk-SNTZFB24.js
+ // ../../packages/core/dist/chunk-YQBJAT5I.js
  import { constants } from "node:fs";
  import { access, readFile } from "node:fs/promises";
  import path from "node:path";
@@ -4636,7 +4636,7 @@ var coerce = {
  };
  var NEVER = INVALID;

- // ../../packages/core/dist/chunk-SNTZFB24.js
+ // ../../packages/core/dist/chunk-YQBJAT5I.js
  async function fileExists(filePath) {
  try {
  await access(filePath, constants.F_OK);
@@ -11752,6 +11752,33 @@ var ANSI_YELLOW = "\x1B[33m";
  var ANSI_RESET = "\x1B[0m";
  var SCHEMA_EVAL_V2 = "agentv-eval-v2";
  var SCHEMA_CONFIG_V2 = "agentv-config-v2";
+ async function readTestSuiteMetadata(testFilePath) {
+ try {
+ const absolutePath = path8.resolve(testFilePath);
+ const content = await readFile3(absolutePath, "utf8");
+ const parsed = parse3(content);
+ if (!isJsonObject(parsed)) {
+ return {};
+ }
+ return { target: extractTargetFromSuite(parsed) };
+ } catch {
+ return {};
+ }
+ }
+ function extractTargetFromSuite(suite) {
+ const execution = suite.execution;
+ if (execution && typeof execution === "object" && !Array.isArray(execution)) {
+ const executionTarget = execution.target;
+ if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
+ return executionTarget.trim();
+ }
+ }
+ const targetValue = suite.target;
+ if (typeof targetValue === "string" && targetValue.trim().length > 0) {
+ return targetValue.trim();
+ }
+ return void 0;
+ }
  async function loadConfig(evalFilePath, repoRoot) {
  const directories = buildDirectoryChain(evalFilePath, repoRoot);
  for (const directory of directories) {
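A minimal sketch (resolveSuiteTarget is a hypothetical name, not part of the package) of the precedence the new extractTargetFromSuite helper implements: execution.target wins over a top-level target, and both values are trimmed.

    // Sketch only: mirrors the precedence in the helper added above.
    function resolveSuiteTarget(suite) {
      const execution = suite.execution;
      const fromExecution = execution && typeof execution === "object" && !Array.isArray(execution) ? execution.target : undefined;
      if (typeof fromExecution === "string" && fromExecution.trim().length > 0) {
        return fromExecution.trim();
      }
      if (typeof suite.target === "string" && suite.target.trim().length > 0) {
        return suite.target.trim();
      }
      return undefined;
    }

    console.log(resolveSuiteTarget({ target: "fallback", execution: { target: "  primary  " } })); // "primary"
    console.log(resolveSuiteTarget({ target: "fallback" })); // "fallback"
    console.log(resolveSuiteTarget({})); // undefined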
@@ -11928,6 +11955,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
  }
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
+ const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
+ const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
  const results = [];
  for (const rawEvalcase of rawTestcases) {
  if (!isJsonObject(rawEvalcase)) {
@@ -11947,14 +11976,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
  logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
  continue;
  }
- if (!Array.isArray(expectedMessagesValue)) {
- logWarning(`Eval case '${id}' missing expected_messages array`);
- continue;
- }
+ const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
  const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
- const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
- if (expectedMessages.length === 0) {
- logWarning(`No expected message found for eval case: ${id}`);
+ const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
+ if (hasExpectedMessages && expectedMessages.length === 0) {
+ logWarning(`No valid expected message found for eval case: ${id}`);
  continue;
  }
  if (expectedMessages.length > 1) {
@@ -11972,20 +11998,20 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
  messageType: "input",
  verbose
  });
- const outputSegments = await processMessages({
+ const outputSegments = hasExpectedMessages ? await processMessages({
  messages: expectedMessages,
  searchRoots,
  repoRootPath,
  guidelinePatterns,
  messageType: "output",
  verbose
- });
+ }) : [];
  const codeSnippets = extractCodeBlocks(inputSegments);
  const expectedContent = expectedMessages[0]?.content;
- const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
+ const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
- const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
+ const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
  const userFilePaths = [];
  for (const segment of inputSegments) {
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -12001,6 +12027,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
  dataset: datasetName,
  conversation_id: conversationId,
  question,
+ input_messages: inputMessages,
  input_segments: inputSegments,
  output_segments: outputSegments,
  reference_answer: referenceAnswer,
@@ -12028,6 +12055,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
  }
  return results;
  }
+ function needsRoleMarkers(messages, processedSegmentsByMessage) {
+ if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
+ return true;
+ }
+ let messagesWithContent = 0;
+ for (const segments of processedSegmentsByMessage) {
+ if (hasVisibleContent(segments)) {
+ messagesWithContent++;
+ }
+ }
+ return messagesWithContent > 1;
+ }
+ function hasVisibleContent(segments) {
+ return segments.some((segment) => {
+ const type = asString(segment.type);
+ if (type === "text") {
+ const value = asString(segment.value);
+ return value !== void 0 && value.trim().length > 0;
+ }
+ if (type === "guideline_ref") {
+ return false;
+ }
+ if (type === "file") {
+ const text = asString(segment.text);
+ return text !== void 0 && text.trim().length > 0;
+ }
+ return false;
+ });
+ }
+ function formatSegment(segment) {
+ const type = asString(segment.type);
+ if (type === "text") {
+ return asString(segment.value);
+ }
+ if (type === "guideline_ref") {
+ const refPath = asString(segment.path);
+ return refPath ? `<Attached: ${refPath}>` : void 0;
+ }
+ if (type === "file") {
+ const text = asString(segment.text);
+ const filePath = asString(segment.path);
+ if (text && filePath) {
+ return `=== ${filePath} ===
+ ${text}`;
+ }
+ }
+ return void 0;
+ }
  async function buildPromptInputs(testCase) {
  const guidelineContents = [];
  for (const rawPath of testCase.guideline_paths) {
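A minimal, self-contained sketch (not the package's export surface, and omitting the undefined-path guards) of the three renderings the added formatSegment applies when flattening segments into prompt text.

    // Sketch only: reproduces the segment renderings added above.
    function formatSegmentSketch(segment) {
      if (segment.type === "text") return segment.value;
      if (segment.type === "guideline_ref") return `<Attached: ${segment.path}>`;
      if (segment.type === "file") return `=== ${segment.path} ===\n${segment.text}`;
      return undefined;
    }

    console.log(formatSegmentSketch({ type: "text", value: "Fix the failing test" }));
    console.log(formatSegmentSketch({ type: "guideline_ref", path: "docs/style.md" })); // "<Attached: docs/style.md>"
    console.log(formatSegmentSketch({ type: "file", path: "src/app.js", text: "export {};" }));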
@@ -12044,36 +12119,168 @@ ${content}`);
  logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
  }
  }
- const questionParts = [];
+ const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
+ const segmentsByMessage = [];
+ const fileContentsByPath = /* @__PURE__ */ new Map();
  for (const segment of testCase.input_segments) {
- const typeValue = segment.type;
- if (typeof typeValue === "string" && typeValue === "file") {
- const pathValue = segment.path;
- const textValue = segment.text;
- const label = typeof pathValue === "string" ? pathValue : "file";
- const body = typeof textValue === "string" ? textValue : "";
- questionParts.push(`=== ${label} ===
- ${body}`);
- continue;
+ if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
+ fileContentsByPath.set(segment.path, segment.text);
  }
- if (typeof typeValue === "string" && typeValue === "text") {
- const value = segment.value;
- if (typeof value === "string") {
- questionParts.push(value);
+ }
+ for (const message of testCase.input_messages) {
+ const messageSegments = [];
+ if (typeof message.content === "string") {
+ if (message.content.trim().length > 0) {
+ messageSegments.push({ type: "text", value: message.content });
+ }
+ } else if (Array.isArray(message.content)) {
+ for (const segment of message.content) {
+ if (typeof segment === "string") {
+ if (segment.trim().length > 0) {
+ messageSegments.push({ type: "text", value: segment });
+ }
+ } else if (isJsonObject(segment)) {
+ const type = asString(segment.type);
+ if (type === "file") {
+ const value = asString(segment.value);
+ if (!value) continue;
+ if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
+ messageSegments.push({ type: "guideline_ref", path: value });
+ continue;
+ }
+ const fileText = fileContentsByPath.get(value);
+ if (fileText !== void 0) {
+ messageSegments.push({ type: "file", text: fileText, path: value });
+ }
+ } else if (type === "text") {
+ const textValue = asString(segment.value);
+ if (textValue && textValue.trim().length > 0) {
+ messageSegments.push({ type: "text", value: textValue });
+ }
+ }
+ }
+ }
  }
- continue;
  }
- const genericValue = segment.value;
- if (typeof genericValue === "string") {
- questionParts.push(genericValue);
+ segmentsByMessage.push(messageSegments);
+ }
+ const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
+ let question;
+ if (useRoleMarkers) {
+ const messageParts = [];
+ for (let i6 = 0; i6 < testCase.input_messages.length; i6++) {
+ const message = testCase.input_messages[i6];
+ const segments = segmentsByMessage[i6];
+ if (!hasVisibleContent(segments)) {
+ continue;
+ }
+ const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
+ const contentParts = [];
+ for (const segment of segments) {
+ const formattedContent = formatSegment(segment);
+ if (formattedContent) {
+ contentParts.push(formattedContent);
+ }
+ }
+ if (contentParts.length > 0) {
+ const messageContent = contentParts.join("\n");
+ messageParts.push(`@[${roleLabel}]:
+ ${messageContent}`);
+ }
+ }
+ question = messageParts.join("\n\n");
+ } else {
+ const questionParts = [];
+ for (const segment of testCase.input_segments) {
+ const formattedContent = formatSegment(segment);
+ if (formattedContent) {
+ questionParts.push(formattedContent);
+ }
  }
+ if (testCase.code_snippets.length > 0) {
+ questionParts.push(testCase.code_snippets.join("\n"));
+ }
+ question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
  }
- if (testCase.code_snippets.length > 0) {
- questionParts.push(testCase.code_snippets.join("\n"));
+ const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
+ messages: testCase.input_messages,
+ segmentsByMessage,
+ guidelinePatterns: testCase.guideline_patterns,
+ guidelineContent: guidelines
+ }) : void 0;
+ return { question, guidelines, chatPrompt };
+ }
+ function buildChatPromptFromSegments(options) {
+ const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
+ if (messages.length === 0) {
+ return void 0;
  }
- const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
- const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
- return { question, guidelines };
+ const systemSegments = [];
+ if (systemPrompt && systemPrompt.trim().length > 0) {
+ systemSegments.push(systemPrompt.trim());
+ }
+ if (guidelineContent && guidelineContent.trim().length > 0) {
+ systemSegments.push(`[[ ## Guidelines ## ]]
+
+ ${guidelineContent.trim()}`);
+ }
+ let startIndex = 0;
+ while (startIndex < messages.length && messages[startIndex].role === "system") {
+ const segments = segmentsByMessage[startIndex];
+ const contentParts = [];
+ for (const segment of segments) {
+ const formatted = formatSegment(segment);
+ if (formatted) {
+ contentParts.push(formatted);
+ }
+ }
+ if (contentParts.length > 0) {
+ systemSegments.push(contentParts.join("\n"));
+ }
+ startIndex += 1;
+ }
+ const chatPrompt = [];
+ if (systemSegments.length > 0) {
+ chatPrompt.push({
+ role: "system",
+ content: systemSegments.join("\n\n")
+ });
+ }
+ for (let i6 = startIndex; i6 < messages.length; i6++) {
+ const message = messages[i6];
+ const segments = segmentsByMessage[i6];
+ const contentParts = [];
+ let role = message.role;
+ let name;
+ if (role === "system") {
+ role = "assistant";
+ contentParts.push("@[System]:");
+ } else if (role === "tool") {
+ role = "function";
+ name = "tool";
+ }
+ for (const segment of segments) {
+ if (segment.type === "guideline_ref") {
+ continue;
+ }
+ const formatted = formatSegment(segment);
+ if (formatted) {
+ const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
+ if (isGuidelineRef) {
+ continue;
+ }
+ contentParts.push(formatted);
+ }
+ }
+ if (contentParts.length === 0) {
+ continue;
+ }
+ chatPrompt.push({
+ role,
+ content: contentParts.join("\n"),
+ ...name ? { name } : {}
+ });
+ }
+ return chatPrompt.length > 0 ? chatPrompt : void 0;
  }
  async function fileExists2(absolutePath) {
  try {
@@ -12171,9 +12378,9 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
  }
  return parts.join(" ");
  }
- async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
+ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
  const execution = rawEvalCase.execution;
- const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators;
+ const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
  if (candidateEvaluators === void 0) {
  return void 0;
  }
@@ -12211,6 +12418,8 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
  resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
  );
  }
+ } else {
+ resolvedCwd = searchRoots[0];
  }
  evaluators.push({
  name,
@@ -12239,8 +12448,7 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
  name,
  type: "llm_judge",
  prompt,
- promptPath,
- model
+ promptPath
  });
  }
  return evaluators.length > 0 ? evaluators : void 0;
@@ -12267,21 +12475,14 @@ ${detailBlock}${ANSI_RESET}`);
  var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
  function buildChatPrompt(request) {
  if (request.chatPrompt) {
- return request.chatPrompt;
- }
- const systemSegments = [];
- const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
- if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
- systemSegments.push(metadataSystemPrompt.trim());
- } else {
- systemSegments.push(DEFAULT_SYSTEM_PROMPT);
- }
- if (request.guidelines && request.guidelines.trim().length > 0) {
- systemSegments.push(`[[ ## Guidelines ## ]]
-
- ${request.guidelines.trim()}`);
+ const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
+ if (hasSystemMessage) {
+ return request.chatPrompt;
+ }
+ const systemContent2 = resolveSystemContent(request);
+ return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
  }
- const systemContent = systemSegments.join("\n\n");
+ const systemContent = resolveSystemContent(request);
  const userContent = request.question.trim();
  const prompt = [
  {
@@ -12295,6 +12496,21 @@ ${request.guidelines.trim()}`);
  ];
  return prompt;
  }
+ function resolveSystemContent(request) {
+ const systemSegments = [];
+ const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
+ if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
+ systemSegments.push(metadataSystemPrompt.trim());
+ } else {
+ systemSegments.push(DEFAULT_SYSTEM_PROMPT);
+ }
+ if (request.guidelines && request.guidelines.trim().length > 0) {
+ systemSegments.push(`[[ ## Guidelines ## ]]
+
+ ${request.guidelines.trim()}`);
+ }
+ return systemSegments.join("\n\n");
+ }
  function extractModelConfig(request, defaults) {
  const temperature = request.temperature ?? defaults.temperature;
  const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
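A minimal sketch (the guideline text is invented for illustration; the default prompt string is quoted from the diff) of the system content the new resolveSystemContent helper assembles, and of how buildChatPrompt now prepends it to a caller-supplied chatPrompt only when that prompt has no system message yet.

    // Sketch only: not a call into the package.
    const DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
    const guidelines = "Prefer small, focused commits."; // hypothetical guideline content
    const systemContent = [DEFAULT_SYSTEM_PROMPT, `[[ ## Guidelines ## ]]\n\n${guidelines}`].join("\n\n");

    const chatPrompt = [{ role: "user", content: "Summarize the change." }]; // no system message yet
    const finalPrompt = chatPrompt.some((m) => m.role === "system")
      ? chatPrompt
      : [{ role: "system", content: systemContent }, ...chatPrompt];
    console.log(finalPrompt[0].role); // "system"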
@@ -13955,24 +14171,23 @@ var LlmJudgeEvaluator = class {
  return this.evaluateWithPrompt(context2, judgeProvider);
  }
  async evaluateWithPrompt(context2, judgeProvider) {
- let prompt = buildQualityPrompt(context2.evalCase, context2.candidate);
- let systemPrompt = context2.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
+ const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context2.evalCase);
+ const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
+ let prompt = buildQualityPrompt(context2.evalCase, context2.candidate, formattedQuestion);
+ let systemPrompt = context2.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
  if (systemPrompt && hasTemplateVariables(systemPrompt)) {
  const variables = {
  input_messages: JSON.stringify(context2.evalCase.input_segments, null, 2),
  output_messages: JSON.stringify(context2.evalCase.output_segments, null, 2),
  candidate_answer: context2.candidate,
- reference_answer: context2.evalCase.reference_answer,
+ reference_answer: context2.evalCase.reference_answer ?? "",
  expected_outcome: context2.evalCase.expected_outcome,
- question: context2.evalCase.question
+ question: formattedQuestion
  };
  prompt = substituteVariables(systemPrompt, variables);
- systemPrompt = QUALITY_SYSTEM_PROMPT;
+ systemPrompt = buildSystemPrompt(hasReferenceAnswer);
  }
- const metadata = {
- ...systemPrompt !== void 0 ? { systemPrompt } : {},
- ...context2.judgeModel !== void 0 ? { model: context2.judgeModel } : {}
- };
+ const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
  const response = await judgeProvider.invoke({
  question: prompt,
  metadata,
@@ -13992,8 +14207,7 @@ var LlmJudgeEvaluator = class {
  provider: judgeProvider.id,
  prompt,
  target: context2.target.name,
- ...systemPrompt !== void 0 ? { systemPrompt } : {},
- ...context2.judgeModel !== void 0 ? { model: context2.judgeModel } : {}
+ ...systemPrompt !== void 0 && { systemPrompt }
  };
  return {
  score,
@@ -14005,38 +14219,51 @@ var LlmJudgeEvaluator = class {
  };
  }
  };
- var QUALITY_SYSTEM_PROMPT = [
- "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
- "",
- "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
- "",
- "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
- "",
- "You must respond with a single JSON object matching this schema:",
- "",
- "{",
- ' "score": <number between 0.0 and 1.0>,',
- ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
- ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
- ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
- "}"
- ].join("\n");
- function buildQualityPrompt(evalCase, candidate) {
+ function buildSystemPrompt(hasReferenceAnswer) {
+ const basePrompt = [
+ "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
+ ""
+ ];
+ if (hasReferenceAnswer) {
+ basePrompt.push(
+ "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
+ ""
+ );
+ }
+ basePrompt.push(
+ "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
+ "",
+ "You must respond with a single JSON object matching this schema:",
+ "",
+ "{",
+ ' "score": <number between 0.0 and 1.0>,',
+ ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
+ ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
+ ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
+ "}"
+ );
+ return basePrompt.join("\n");
+ }
+ function buildQualityPrompt(evalCase, candidate, question) {
  const parts = [
  "[[ ## expected_outcome ## ]]",
  evalCase.expected_outcome.trim(),
  "",
  "[[ ## question ## ]]",
- evalCase.question.trim(),
- "",
- "[[ ## reference_answer ## ]]",
- evalCase.reference_answer.trim(),
- "",
- "[[ ## candidate_answer ## ]]",
- candidate.trim(),
- "",
- "Respond with a single JSON object matching the schema described in the system prompt."
+ question.trim(),
+ ""
  ];
+ if (hasNonEmptyReferenceAnswer(evalCase)) {
+ parts.push(
+ "[[ ## reference_answer ## ]]",
+ evalCase.reference_answer.trim(),
+ ""
+ );
+ }
+ parts.push(
+ "[[ ## candidate_answer ## ]]",
+ candidate.trim()
+ );
  return parts.join("\n");
  }
  function clampScore(value) {
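An illustrative sketch (qualityPromptSketch is a hypothetical name, not a call into the package) of the section order the reworked buildQualityPrompt emits; the reference_answer block is now included only when the eval case has a non-empty reference answer.

    // Sketch only: assembles the same bracketed sections as the code above.
    function qualityPromptSketch({ expectedOutcome, question, referenceAnswer, candidate }) {
      const parts = ["[[ ## expected_outcome ## ]]", expectedOutcome.trim(), "", "[[ ## question ## ]]", question.trim(), ""];
      if (referenceAnswer && referenceAnswer.trim().length > 0) {
        parts.push("[[ ## reference_answer ## ]]", referenceAnswer.trim(), "");
      }
      parts.push("[[ ## candidate_answer ## ]]", candidate.trim());
      return parts.join("\n");
    }

    console.log(qualityPromptSketch({
      expectedOutcome: "Explains the fix",
      question: "Why did the test fail?",
      candidate: "Because of an off-by-one error."
    })); // no reference_answer section is emitted here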
@@ -14119,6 +14346,9 @@ function extractJsonBlob(text) {
  function isNonEmptyString(value) {
  return typeof value === "string" && value.trim().length > 0;
  }
+ function hasNonEmptyReferenceAnswer(evalCase) {
+ return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
+ }
  var CodeEvaluator = class {
  kind = "code";
  script;
@@ -14766,11 +14996,27 @@ async function evaluateCandidate(options) {
  agentTimeoutMs
  });
  const completedAt = nowFn();
- const rawRequest = {
- question: promptInputs.question,
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
- guideline_paths: evalCase.guideline_paths
- };
+ let agentProviderRequest;
+ let lmProviderRequest;
+ if (isAgentProvider(provider)) {
+ agentProviderRequest = {
+ question: promptInputs.question,
+ guideline_paths: evalCase.guideline_paths
+ };
+ } else {
+ if (promptInputs.chatPrompt) {
+ lmProviderRequest = {
+ chat_prompt: promptInputs.chatPrompt,
+ guideline_paths: evalCase.guideline_paths
+ };
+ } else {
+ lmProviderRequest = {
+ question: promptInputs.question,
+ guidelines: promptInputs.guidelines,
+ guideline_paths: evalCase.guideline_paths
+ };
+ }
+ }
  return {
  eval_id: evalCase.id,
  dataset: evalCase.dataset,
@@ -14784,7 +15030,8 @@ async function evaluateCandidate(options) {
  timestamp: completedAt.toISOString(),
  reasoning: score.reasoning,
  raw_aspects: score.rawAspects,
- raw_request: rawRequest,
+ agent_provider_request: agentProviderRequest,
+ lm_provider_request: lmProviderRequest,
  evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
  evaluator_results: evaluatorResults
  };
@@ -14943,8 +15190,7 @@ async function runLlmJudgeEvaluator(options) {
  now,
  judgeProvider,
  systemPrompt: customPrompt,
- evaluator: config,
- judgeModel: config.model
+ evaluator: config
  });
  }
  async function resolveCustomPrompt(config) {
@@ -15013,6 +15259,7 @@ async function invokeProvider(provider, options) {
  question: promptInputs.question,
  guidelines: promptInputs.guidelines,
  guideline_patterns: evalCase.guideline_patterns,
+ chatPrompt: promptInputs.chatPrompt,
  inputFiles: evalCase.file_paths,
  evalCaseId: evalCase.id,
  attempt,
@@ -15029,12 +15276,30 @@ async function invokeProvider(provider, options) {
  }
  function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
  const message = error instanceof Error ? error.message : String(error);
- const rawRequest = {
- question: promptInputs.question,
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
- guideline_paths: evalCase.guideline_paths,
- error: message
- };
+ let agentProviderRequest;
+ let lmProviderRequest;
+ if (isAgentProvider(provider)) {
+ agentProviderRequest = {
+ question: promptInputs.question,
+ guideline_paths: evalCase.guideline_paths,
+ error: message
+ };
+ } else {
+ if (promptInputs.chatPrompt) {
+ lmProviderRequest = {
+ chat_prompt: promptInputs.chatPrompt,
+ guideline_paths: evalCase.guideline_paths,
+ error: message
+ };
+ } else {
+ lmProviderRequest = {
+ question: promptInputs.question,
+ guidelines: promptInputs.guidelines,
+ guideline_paths: evalCase.guideline_paths,
+ error: message
+ };
+ }
+ }
  return {
  eval_id: evalCase.id,
  dataset: evalCase.dataset,
@@ -15047,7 +15312,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
  target: targetName,
  timestamp: timestamp.toISOString(),
  raw_aspects: [],
- raw_request: rawRequest,
+ agent_provider_request: agentProviderRequest,
+ lm_provider_request: lmProviderRequest,
  error: message
  };
  }
@@ -15059,6 +15325,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
  hash.update(promptInputs.question);
  hash.update(promptInputs.guidelines);
  hash.update(promptInputs.systemMessage ?? "");
+ if (promptInputs.chatPrompt) {
+ hash.update(JSON.stringify(promptInputs.chatPrompt));
+ }
  return hash.digest("hex");
  }
  function isTimeoutLike(error) {
@@ -15486,8 +15755,6 @@ import { stripVTControlCharacters } from "node:util";
  var ESC = "\x1B[";
  var CLEAR_LINE = `${ESC}K`;
  var MOVE_CURSOR_UP = `${ESC}1A`;
- var SYNC_START = `${ESC}?2026h`;
- var SYNC_END = `${ESC}?2026l`;
  var ProgressDisplay = class {
  workers = /* @__PURE__ */ new Map();
  maxWorkers;
@@ -15963,14 +16230,14 @@ async function validateEvalFile(filePath) {
  validateMessages(inputMessages, `${location}.input_messages`, absolutePath, errors);
  }
  const expectedMessages = evalCase["expected_messages"];
- if (!Array.isArray(expectedMessages)) {
+ if (expectedMessages !== void 0 && !Array.isArray(expectedMessages)) {
  errors.push({
  severity: "error",
  filePath: absolutePath,
  location: `${location}.expected_messages`,
- message: "Missing or invalid 'expected_messages' field (must be an array)"
+ message: "Invalid 'expected_messages' field (must be an array if provided)"
  });
- } else {
+ } else if (Array.isArray(expectedMessages)) {
  validateMessages(expectedMessages, `${location}.expected_messages`, absolutePath, errors);
  }
  }
@@ -16006,11 +16273,13 @@ function validateMessages(messages, location, filePath, errors) {
  }
  const content = message["content"];
  if (typeof content === "string") {
+ validateContentForRoleMarkers(content, `${msgLocation}.content`, filePath, errors);
  } else if (Array.isArray(content)) {
  for (let j2 = 0; j2 < content.length; j2++) {
  const contentItem = content[j2];
  const contentLocation = `${msgLocation}.content[${j2}]`;
  if (typeof contentItem === "string") {
+ validateContentForRoleMarkers(contentItem, contentLocation, filePath, errors);
  } else if (isObject(contentItem)) {
  const type = contentItem["type"];
  if (typeof type !== "string") {
@@ -16030,6 +16299,8 @@ function validateMessages(messages, location, filePath, errors) {
  location: `${contentLocation}.value`,
  message: "Content with type 'text' must have a 'value' field"
  });
+ } else {
+ validateContentForRoleMarkers(value, `${contentLocation}.value`, filePath, errors);
  }
  }
  } else {
@@ -16051,6 +16322,19 @@ function validateMessages(messages, location, filePath, errors) {
  }
  }
  }
+ function validateContentForRoleMarkers(content, location, filePath, errors) {
+ const markers = ["@[System]:", "@[User]:", "@[Assistant]:", "@[Tool]:"];
+ for (const marker of markers) {
+ if (content.toLowerCase().includes(marker.toLowerCase())) {
+ errors.push({
+ severity: "warning",
+ filePath,
+ location,
+ message: `Content contains potential role marker '${marker}'. This may confuse agentic providers or cause prompt injection.`
+ });
+ }
+ }
+ }
  function isObject2(value) {
  return typeof value === "object" && value !== null && !Array.isArray(value);
  }
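A minimal sketch (containsRoleMarker is a hypothetical name) of the case-insensitive check the new validateContentForRoleMarkers applies before emitting a warning.

    // Sketch only: same marker list and case-insensitive matching as the validator above.
    const MARKERS = ["@[System]:", "@[User]:", "@[Assistant]:", "@[Tool]:"];
    function containsRoleMarker(content) {
      return MARKERS.some((marker) => content.toLowerCase().includes(marker.toLowerCase()));
    }

    console.log(containsRoleMarker("Reply as @[assistant]: sounds good")); // true -> warning emitted
    console.log(containsRoleMarker("Plain user text")); // false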
@@ -16659,9 +16943,8 @@ async function validateMessagesFileRefs(messages, location, searchRoots, filePat

  // src/commands/eval/targets.ts
  import { constants as constants5 } from "node:fs";
- import { access as access5, readFile as readFile6 } from "node:fs/promises";
+ import { access as access5 } from "node:fs/promises";
  import path13 from "node:path";
- import { parse as parse6 } from "yaml";
  var TARGET_FILE_CANDIDATES = [
  "targets.yaml",
  "targets.yml",
@@ -16683,18 +16966,8 @@ async function fileExists5(filePath) {
  }
  }
  async function readTestSuiteTarget(testFilePath) {
- try {
- const raw = await readFile6(path13.resolve(testFilePath), "utf8");
- const parsed = parse6(raw);
- if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
- const targetValue = parsed.target;
- if (typeof targetValue === "string" && targetValue.trim().length > 0) {
- return targetValue.trim();
- }
- }
- } catch {
- }
- return void 0;
+ const metadata = await readTestSuiteMetadata(testFilePath);
+ return metadata.target;
  }
  async function discoverTargetsFile(options) {
  const { explicitPath, testFilePath, repoRoot, cwd } = options;
@@ -17665,4 +17938,4 @@ export {
  createProgram,
  runCli
  };
- //# sourceMappingURL=chunk-X2VVUCIB.js.map
+ //# sourceMappingURL=chunk-72BHGHIT.js.map