npm - agentv - Versions diffs - 0.26.0 → 1.2.0 - Mend

agentv 0.26.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/dist/{chunk-6ZM7WVSC.js → chunk-IVIT4U6S.js} +54 -258
package/dist/chunk-IVIT4U6S.js.map +1 -0
package/dist/cli.js +1 -1
package/dist/cli.js.map +1 -1
package/dist/index.js +1 -1
package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +20 -19
package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +217 -217
package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +67 -2
package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +10 -68
package/package.json +1 -1
package/dist/chunk-6ZM7WVSC.js.map +0 -1
package/dist/templates/agentv/.env.template +0 -23

package/dist/{chunk-6ZM7WVSC.js → chunk-IVIT4U6S.js} RENAMED

@@ -164,7 +164,7 @@ import { access as access6, mkdir as mkdir7 } from "node:fs/promises";
 import path19 from "node:path";
 import { pathToFileURL } from "node:url";
-// ../../packages/core/dist/chunk-NDEN3H2B.js
+// ../../packages/core/dist/chunk-V3JCB3HI.js
 import { constants } from "node:fs";
 import { access, readFile } from "node:fs/promises";
 import path from "node:path";
@@ -4211,7 +4211,7 @@ var coerce = {
 };
 var NEVER = INVALID;
-// ../../packages/core/dist/chunk-NDEN3H2B.js
+// ../../packages/core/dist/chunk-V3JCB3HI.js
 async function fileExists(filePath) {
   try {
     await access(filePath, constants.F_OK);
@@ -34567,18 +34567,23 @@ function isTestMessage(value) {
   if (typeof candidate.content === "string") {
     return true;
   }
-  if (!Array.isArray(candidate.content)) {
-    return false;
+  if (Array.isArray(candidate.content) && candidate.content.every(isJsonObject)) {
+    return true;
   }
-  return candidate.content.every(isJsonObject);
+  if (Array.isArray(candidate.tool_calls) && candidate.tool_calls.length > 0) {
+    return true;
+  }
+  if (isJsonObject(candidate.content)) {
+    return true;
+  }
+  return false;
 }
 var EVALUATOR_KIND_VALUES = [
   "code_judge",
   "llm_judge",
   "rubric",
   "composite",
-  "tool_trajectory",
-  "expected_messages"
+  "tool_trajectory"
 ];
 var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
 function isEvaluatorKind(value) {
@@ -35058,15 +35063,6 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       });
       continue;
     }
-    if (typeValue === "expected_messages") {
-      const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
-      evaluators.push({
-        name: name16,
-        type: "expected_messages",
-        ...weight2 !== void 0 ? { weight: weight2 } : {}
-      });
-      continue;
-    }
     if (typeValue === "tool_trajectory") {
       const mode = asString2(rawEvaluator.mode);
       if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
@@ -35317,63 +35313,6 @@ async function processMessages(options) {
   }
   return segments;
 }
-async function resolveAssistantContent(content, searchRoots, verbose) {
-  if (typeof content === "string") {
-    return content;
-  }
-  if (!content) {
-    return "";
-  }
-  const parts = [];
-  for (const entry of content) {
-    if (typeof entry === "string") {
-      parts.push({ content: entry, isFile: false });
-      continue;
-    }
-    if (!isJsonObject(entry)) {
-      continue;
-    }
-    const segmentType = asString3(entry.type);
-    if (segmentType === "file") {
-      const rawValue = asString3(entry.value);
-      if (!rawValue) {
-        continue;
-      }
-      const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
-        rawValue,
-        searchRoots
-      );
-      if (!resolvedPath) {
-        const attempts = attempted.length ? ["  Tried:", ...attempted.map((candidate) => `    ${candidate}`)] : void 0;
-        logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
-        continue;
-      }
-      try {
-        const fileContent = (await readFile32(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
-        parts.push({ content: fileContent, isFile: true, displayPath });
-        if (verbose) {
-          console.log(`  [Expected Assistant File] Found: ${displayPath}`);
-          console.log(`    Resolved to: ${resolvedPath}`);
-        }
-      } catch (error40) {
-        logWarning3(`Could not read file ${resolvedPath}: ${error40.message}`);
-      }
-      continue;
-    }
-    const textValue = asString3(entry.text);
-    if (typeof textValue === "string") {
-      parts.push({ content: textValue, isFile: false });
-      continue;
-    }
-    const valueValue = asString3(entry.value);
-    if (typeof valueValue === "string") {
-      parts.push({ content: valueValue, isFile: false });
-      continue;
-    }
-    parts.push({ content: JSON.stringify(entry), isFile: false });
-  }
-  return formatFileContents(parts);
-}
 function asString3(value) {
   return typeof value === "string" ? value : void 0;
 }
@@ -35406,14 +35345,15 @@ ${detailBlock}${ANSI_RESET4}`);
   }
 }
 async function processExpectedMessages(options) {
-  const { messages, searchRoots, repoRootPath, verbose } = options;
+  const { messages, searchRoots, verbose } = options;
   const segments = [];
   for (const message of messages) {
+    const extendedMessage = message;
     const segment = {
       role: message.role
     };
-    if (message.role === "assistant" && message.tool_calls !== void 0) {
-      segment.tool_calls = message.tool_calls;
+    if (extendedMessage.name) {
+      segment.name = extendedMessage.name;
     }
     const content = message.content;
     if (typeof content === "string") {
@@ -35461,6 +35401,13 @@ async function processExpectedMessages(options) {
         processedContent.push(cloneJsonObject(rawSegment));
       }
       segment.content = processedContent;
+    } else if (isJsonObject(content)) {
+      segment.content = cloneJsonObject(content);
+    }
+    if (extendedMessage.tool_calls && Array.isArray(extendedMessage.tool_calls)) {
+      segment.tool_calls = extendedMessage.tool_calls.map(
+        (tc) => isJsonObject(tc) ? cloneJsonObject(tc) : tc
+      );
     }
     segments.push(segment);
   }
@@ -35749,9 +35696,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       logError(`No valid expected message found for eval case: ${id}`);
       continue;
     }
-    if (expectedMessages.length > 1) {
-      logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
-    }
     const guidelinePaths = [];
     const inputTextParts = [];
     const inputSegments = await processMessages({
@@ -35771,8 +35715,19 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       verbose
     }) : [];
     const codeSnippets = extractCodeBlocks(inputSegments);
-    const expectedContent = expectedMessages[0]?.content;
-    const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
+    let referenceAnswer = "";
+    if (outputSegments.length > 1) {
+      referenceAnswer = JSON.stringify(outputSegments, null, 2);
+    } else if (outputSegments.length === 1) {
+      const singleMessage = outputSegments[0];
+      if (typeof singleMessage.content === "string") {
+        referenceAnswer = singleMessage.content;
+      } else if (singleMessage.content) {
+        referenceAnswer = JSON.stringify(singleMessage, null, 2);
+      } else if (singleMessage.tool_calls) {
+        referenceAnswer = JSON.stringify(singleMessage, null, 2);
+      }
+    }
     const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
     const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
     let evaluators;
@@ -35827,7 +35782,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       question,
       input_messages: inputMessages,
       input_segments: inputSegments,
-      expected_segments: outputSegments,
+      expected_messages: outputSegments,
       reference_answer: referenceAnswer,
       guideline_paths: guidelinePaths.map((guidelinePath) => path62.resolve(guidelinePath)),
       guideline_patterns: guidelinePatterns,
@@ -37669,7 +37624,7 @@ function createProvider(target) {
 }
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
-Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
+Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
 Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
@@ -37727,7 +37682,7 @@ var LlmJudgeEvaluator = class {
     const variables = {
       [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
       [TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
-        context.evalCase.expected_segments,
+        context.evalCase.expected_messages,
         null,
         2
       ),
@@ -37946,7 +37901,9 @@ var CodeEvaluator = class {
         input_files: context.evalCase.file_paths.filter(
           (path132) => !context.evalCase.guideline_paths.includes(path132)
         ),
-        input_messages: context.evalCase.input_messages
+        input_messages: context.evalCase.input_messages,
+        candidate_trace_file: context.candidateTraceRef ?? null,
+        candidate_trace_summary: context.candidateTraceSummary ?? null
       },
       null,
       2
@@ -38212,105 +38169,6 @@ var ToolTrajectoryEvaluator = class {
     };
   }
 };
-var ExpectedMessagesEvaluator = class {
-  kind = "expected_messages";
-  evaluate(context) {
-    const { candidateTrace, evalCase } = context;
-    const expectedSegments = evalCase.expected_segments;
-    const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
-    if (expectedToolCalls.length === 0) {
-      return {
-        score: 1,
-        verdict: "pass",
-        hits: ["No tool_calls specified in expected_messages"],
-        misses: [],
-        expectedAspectCount: 1
-      };
-    }
-    if (!candidateTrace || candidateTrace.length === 0) {
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: ["No trace available to validate tool_calls"],
-        expectedAspectCount: expectedToolCalls.length
-      };
-    }
-    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
-    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
-  }
-  extractExpectedToolCalls(segments) {
-    if (!segments) {
-      return [];
-    }
-    const toolCalls = [];
-    for (const segment of segments) {
-      const role = segment.role;
-      const segmentToolCalls = segment.tool_calls;
-      if (role === "assistant" && Array.isArray(segmentToolCalls)) {
-        for (const tc of segmentToolCalls) {
-          if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
-            const toolCall = tc;
-            toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
-          }
-        }
-      }
-    }
-    return toolCalls;
-  }
-  validateToolCalls(expected, actual) {
-    const hits = [];
-    const misses = [];
-    for (let i = 0; i < expected.length; i++) {
-      const expectedCall = expected[i];
-      const actualCall = actual[i];
-      if (!actualCall) {
-        misses.push(
-          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
-        );
-        continue;
-      }
-      if (actualCall.name !== expectedCall.tool) {
-        misses.push(
-          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
-        );
-        continue;
-      }
-      if (expectedCall.input !== void 0) {
-        if (!this.deepEquals(expectedCall.input, actualCall.input)) {
-          misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
-          continue;
-        }
-      }
-      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
-    }
-    const totalChecks = expected.length || 1;
-    const score = hits.length / totalChecks;
-    return {
-      score,
-      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
-      hits,
-      misses,
-      expectedAspectCount: totalChecks
-    };
-  }
-  deepEquals(a, b) {
-    if (a === b) return true;
-    if (typeof a !== typeof b) return false;
-    if (typeof a !== "object" || a === null || b === null) return false;
-    if (Array.isArray(a) && Array.isArray(b)) {
-      if (a.length !== b.length) return false;
-      return a.every((val, i) => this.deepEquals(val, b[i]));
-    }
-    if (Array.isArray(a) || Array.isArray(b)) return false;
-    const aObj = a;
-    const bObj = b;
-    const aKeys = Object.keys(aObj);
-    const bKeys = Object.keys(bObj);
-    if (aKeys.length !== bKeys.length) return false;
-    return aKeys.every((key2) => this.deepEquals(aObj[key2], bObj[key2]));
-  }
-};
 var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
 {{EVALUATOR_RESULTS_JSON}}
@@ -39061,6 +38919,7 @@ async function runEvalCase(options) {
       judgeProvider,
       agentTimeoutMs,
       candidateTrace,
+      candidateTraceRef: providerResponse.traceRef,
       candidateTraceSummary
     });
   } catch (error40) {
@@ -39080,6 +38939,7 @@ async function evaluateCandidate(options) {
     judgeProvider,
     agentTimeoutMs,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   } = options;
   const gradeTimestamp = nowFn();
@@ -39095,6 +38955,7 @@ async function evaluateCandidate(options) {
     judgeProvider,
     agentTimeoutMs,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   });
   const completedAt = nowFn();
@@ -39149,6 +39010,7 @@ async function runEvaluatorsForCase(options) {
     judgeProvider,
     agentTimeoutMs,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   } = options;
   if (evalCase.evaluators && evalCase.evaluators.length > 0) {
@@ -39165,6 +39027,7 @@ async function runEvaluatorsForCase(options) {
       judgeProvider,
       agentTimeoutMs,
       candidateTrace,
+      candidateTraceRef,
       candidateTraceSummary
     });
   }
@@ -39183,6 +39046,7 @@ async function runEvaluatorsForCase(options) {
     now,
     judgeProvider,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   });
   return { score };
@@ -39201,6 +39065,7 @@ async function runEvaluatorList(options) {
     judgeProvider,
     agentTimeoutMs,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   } = options;
   const scored = [];
@@ -39247,7 +39112,9 @@ async function runEvaluatorList(options) {
           provider,
           attempt,
           promptInputs,
-          now
+          now,
+          candidateTraceRef,
+          candidateTraceSummary
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -39285,8 +39152,6 @@ async function runEvaluatorList(options) {
               return new ToolTrajectoryEvaluator({
                 config: memberConfig
               });
-            case "expected_messages":
-              return new ExpectedMessagesEvaluator();
             default: {
               const unknownConfig = memberConfig;
               throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -39336,32 +39201,7 @@ async function runEvaluatorList(options) {
           promptInputs,
           now,
           candidateTrace,
-          candidateTraceSummary
-        });
-        const weight = evaluator.weight ?? 1;
-        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
-        evaluatorResults.push({
-          name: evaluator.name,
-          type: evaluator.type,
-          score: score2.score,
-          weight,
-          verdict: score2.verdict,
-          hits: score2.hits,
-          misses: score2.misses,
-          reasoning: score2.reasoning
-        });
-      }
-      if (evaluator.type === "expected_messages") {
-        const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
-        const score2 = expectedMessagesEvaluator.evaluate({
-          evalCase,
-          candidate,
-          target,
-          provider,
-          attempt,
-          promptInputs,
-          now,
-          candidateTrace,
+          candidateTraceRef,
           candidateTraceSummary
         });
         const weight = evaluator.weight ?? 1;
@@ -40649,26 +40489,6 @@ function validateMessages(messages, location, filePath, errors) {
         message: `Invalid role '${role}'. Must be one of: ${validRoles.join(", ")}`
       });
     }
-    const toolCalls = message.tool_calls;
-    if (toolCalls !== void 0) {
-      if (role !== "assistant") {
-        errors.push({
-          severity: "error",
-          filePath,
-          location: `${msgLocation}.tool_calls`,
-          message: "tool_calls can only be specified on assistant messages"
-        });
-      } else if (!Array.isArray(toolCalls)) {
-        errors.push({
-          severity: "error",
-          filePath,
-          location: `${msgLocation}.tool_calls`,
-          message: "tool_calls must be an array"
-        });
-      } else {
-        validateToolCalls(toolCalls, `${msgLocation}.tool_calls`, filePath, errors);
-      }
-    }
     const content = message.content;
     if (typeof content === "string") {
       validateContentForRoleMarkers(content, `${msgLocation}.content`, filePath, errors);
@@ -40733,30 +40553,6 @@ function validateContentForRoleMarkers(content, location, filePath, errors) {
     }
   }
 }
-function validateToolCalls(toolCalls, location, filePath, errors) {
-  for (let i = 0; i < toolCalls.length; i++) {
-    const toolCall = toolCalls[i];
-    const callLocation = `${location}[${i}]`;
-    if (!isObject2(toolCall)) {
-      errors.push({
-        severity: "error",
-        filePath,
-        location: callLocation,
-        message: "Tool call must be an object"
-      });
-      continue;
-    }
-    const tool2 = toolCall.tool;
-    if (typeof tool2 !== "string" || tool2.trim().length === 0) {
-      errors.push({
-        severity: "error",
-        filePath,
-        location: `${callLocation}.tool`,
-        message: "Missing or invalid 'tool' field (must be a non-empty string)"
-      });
-    }
-  }
-}
 function isObject22(value) {
   return typeof value === "object" && value !== null && !Array.isArray(value);
 }
@@ -42708,4 +42504,4 @@ export {
   app,
   runCli
 };
-//# sourceMappingURL=chunk-6ZM7WVSC.js.map
+//# sourceMappingURL=chunk-IVIT4U6S.js.map