npm - @agentv/core - Versions diffs - 0.26.0 → 1.2.0 - Mend

@agentv/core 0.26.0 → 1.2.0

This diff compares the contents of two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (14)

package/dist/{chunk-NDEN3H2B.js → chunk-V3JCB3HI.js} +1 -1
package/dist/chunk-V3JCB3HI.js.map +1 -0
package/dist/evaluation/validation/index.cjs +0 -44
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +1 -45
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +51 -222
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +9 -45
package/dist/index.d.ts +9 -45
package/dist/index.js +52 -221
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/chunk-NDEN3H2B.js.map +0 -1

package/dist/index.cjs (changed)

@@ -32,7 +32,6 @@ var index_exports = {};
 __export(index_exports, {
   CodeEvaluator: () => CodeEvaluator,
   CompositeEvaluator: () => CompositeEvaluator,
-  ExpectedMessagesEvaluator: () => ExpectedMessagesEvaluator,
   LlmJudgeEvaluator: () => LlmJudgeEvaluator,
   TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
   ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
@@ -50,7 +49,6 @@ __export(index_exports, {
   generateRubrics: () => generateRubrics,
   getHitCount: () => getHitCount,
   isEvaluatorKind: () => isEvaluatorKind,
-  isExpectedToolCall: () => isExpectedToolCall,
   isGuidelineFile: () => isGuidelineFile,
   isJsonObject: () => isJsonObject,
   isJsonValue: () => isJsonValue,
@@ -110,18 +108,23 @@ function isTestMessage(value) {
   if (typeof candidate.content === "string") {
     return true;
   }
-  if (!Array.isArray(candidate.content)) {
-    return false;
+  if (Array.isArray(candidate.content) && candidate.content.every(isJsonObject)) {
+    return true;
+  }
+  if (Array.isArray(candidate.tool_calls) && candidate.tool_calls.length > 0) {
+    return true;
   }
-  return candidate.content.every(isJsonObject);
+  if (isJsonObject(candidate.content)) {
+    return true;
+  }
+  return false;
 }
 var EVALUATOR_KIND_VALUES = [
   "code_judge",
   "llm_judge",
   "rubric",
   "composite",
-  "tool_trajectory",
-  "expected_messages"
+  "tool_trajectory"
 ];
 var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
 function isEvaluatorKind(value) {
@@ -142,13 +145,6 @@ function isTraceEvent(value) {
   const candidate = value;
   return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
 }
-function isExpectedToolCall(value) {
-  if (typeof value !== "object" || value === null) {
-    return false;
-  }
-  const candidate = value;
-  return typeof candidate.tool === "string";
-}
 function computeTraceSummary(trace) {
   const toolCallCounts = {};
   let errorCount = 0;
@@ -645,15 +641,6 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       });
       continue;
     }
-    if (typeValue === "expected_messages") {
-      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      evaluators.push({
-        name,
-        type: "expected_messages",
-        ...weight2 !== void 0 ? { weight: weight2 } : {}
-      });
-      continue;
-    }
     if (typeValue === "tool_trajectory") {
       const mode = asString2(rawEvaluator.mode);
       if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
@@ -908,63 +895,6 @@ async function processMessages(options) {
   }
   return segments;
 }
-async function resolveAssistantContent(content, searchRoots, verbose) {
-  if (typeof content === "string") {
-    return content;
-  }
-  if (!content) {
-    return "";
-  }
-  const parts = [];
-  for (const entry of content) {
-    if (typeof entry === "string") {
-      parts.push({ content: entry, isFile: false });
-      continue;
-    }
-    if (!isJsonObject(entry)) {
-      continue;
-    }
-    const segmentType = asString3(entry.type);
-    if (segmentType === "file") {
-      const rawValue = asString3(entry.value);
-      if (!rawValue) {
-        continue;
-      }
-      const { displayPath, resolvedPath, attempted } = await resolveFileReference(
-        rawValue,
-        searchRoots
-      );
-      if (!resolvedPath) {
-        const attempts = attempted.length ? ["  Tried:", ...attempted.map((candidate) => `    ${candidate}`)] : void 0;
-        logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
-        continue;
-      }
-      try {
-        const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
-        parts.push({ content: fileContent, isFile: true, displayPath });
-        if (verbose) {
-          console.log(`  [Expected Assistant File] Found: ${displayPath}`);
-          console.log(`    Resolved to: ${resolvedPath}`);
-        }
-      } catch (error) {
-        logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
-      }
-      continue;
-    }
-    const textValue = asString3(entry.text);
-    if (typeof textValue === "string") {
-      parts.push({ content: textValue, isFile: false });
-      continue;
-    }
-    const valueValue = asString3(entry.value);
-    if (typeof valueValue === "string") {
-      parts.push({ content: valueValue, isFile: false });
-      continue;
-    }
-    parts.push({ content: JSON.stringify(entry), isFile: false });
-  }
-  return formatFileContents(parts);
-}
 function asString3(value) {
   return typeof value === "string" ? value : void 0;
 }
@@ -997,14 +927,15 @@ ${detailBlock}${ANSI_RESET4}`);
   }
 }
 async function processExpectedMessages(options) {
-  const { messages, searchRoots, repoRootPath, verbose } = options;
+  const { messages, searchRoots, verbose } = options;
   const segments = [];
   for (const message of messages) {
+    const extendedMessage = message;
     const segment = {
       role: message.role
     };
-    if (message.role === "assistant" && message.tool_calls !== void 0) {
-      segment.tool_calls = message.tool_calls;
+    if (extendedMessage.name) {
+      segment.name = extendedMessage.name;
     }
     const content = message.content;
     if (typeof content === "string") {
@@ -1052,6 +983,13 @@ async function processExpectedMessages(options) {
         processedContent.push(cloneJsonObject(rawSegment));
       }
       segment.content = processedContent;
+    } else if (isJsonObject(content)) {
+      segment.content = cloneJsonObject(content);
+    }
+    if (extendedMessage.tool_calls && Array.isArray(extendedMessage.tool_calls)) {
+      segment.tool_calls = extendedMessage.tool_calls.map(
+        (tc) => isJsonObject(tc) ? cloneJsonObject(tc) : tc
+      );
     }
     segments.push(segment);
   }
@@ -1346,9 +1284,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       logError(`No valid expected message found for eval case: ${id}`);
       continue;
     }
-    if (expectedMessages.length > 1) {
-      logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
-    }
     const guidelinePaths = [];
     const inputTextParts = [];
     const inputSegments = await processMessages({
@@ -1368,8 +1303,19 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       verbose
     }) : [];
     const codeSnippets = extractCodeBlocks(inputSegments);
-    const expectedContent = expectedMessages[0]?.content;
-    const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
+    let referenceAnswer = "";
+    if (outputSegments.length > 1) {
+      referenceAnswer = JSON.stringify(outputSegments, null, 2);
+    } else if (outputSegments.length === 1) {
+      const singleMessage = outputSegments[0];
+      if (typeof singleMessage.content === "string") {
+        referenceAnswer = singleMessage.content;
+      } else if (singleMessage.content) {
+        referenceAnswer = JSON.stringify(singleMessage, null, 2);
+      } else if (singleMessage.tool_calls) {
+        referenceAnswer = JSON.stringify(singleMessage, null, 2);
+      }
+    }
     const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
     const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
     let evaluators;
@@ -1424,7 +1370,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       question,
       input_messages: inputMessages,
       input_segments: inputSegments,
-      expected_segments: outputSegments,
+      expected_messages: outputSegments,
       reference_answer: referenceAnswer,
       guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
       guideline_patterns: guidelinePatterns,
@@ -3979,7 +3925,7 @@ var import_ai2 = require("ai");
 var import_zod2 = require("zod");
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
-Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
+Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
 Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
@@ -4037,7 +3983,7 @@ var LlmJudgeEvaluator = class {
     const variables = {
       [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
       [TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
-        context.evalCase.expected_segments,
+        context.evalCase.expected_messages,
         null,
         2
       ),
@@ -4256,7 +4202,9 @@ var CodeEvaluator = class {
         input_files: context.evalCase.file_paths.filter(
           (path15) => !context.evalCase.guideline_paths.includes(path15)
         ),
-        input_messages: context.evalCase.input_messages
+        input_messages: context.evalCase.input_messages,
+        candidate_trace_file: context.candidateTraceRef ?? null,
+        candidate_trace_summary: context.candidateTraceSummary ?? null
       },
       null,
       2
@@ -4522,105 +4470,6 @@ var ToolTrajectoryEvaluator = class {
     };
   }
 };
-var ExpectedMessagesEvaluator = class {
-  kind = "expected_messages";
-  evaluate(context) {
-    const { candidateTrace, evalCase } = context;
-    const expectedSegments = evalCase.expected_segments;
-    const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
-    if (expectedToolCalls.length === 0) {
-      return {
-        score: 1,
-        verdict: "pass",
-        hits: ["No tool_calls specified in expected_messages"],
-        misses: [],
-        expectedAspectCount: 1
-      };
-    }
-    if (!candidateTrace || candidateTrace.length === 0) {
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: ["No trace available to validate tool_calls"],
-        expectedAspectCount: expectedToolCalls.length
-      };
-    }
-    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
-    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
-  }
-  extractExpectedToolCalls(segments) {
-    if (!segments) {
-      return [];
-    }
-    const toolCalls = [];
-    for (const segment of segments) {
-      const role = segment.role;
-      const segmentToolCalls = segment.tool_calls;
-      if (role === "assistant" && Array.isArray(segmentToolCalls)) {
-        for (const tc of segmentToolCalls) {
-          if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
-            const toolCall = tc;
-            toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
-          }
-        }
-      }
-    }
-    return toolCalls;
-  }
-  validateToolCalls(expected, actual) {
-    const hits = [];
-    const misses = [];
-    for (let i = 0; i < expected.length; i++) {
-      const expectedCall = expected[i];
-      const actualCall = actual[i];
-      if (!actualCall) {
-        misses.push(
-          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
-        );
-        continue;
-      }
-      if (actualCall.name !== expectedCall.tool) {
-        misses.push(
-          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
-        );
-        continue;
-      }
-      if (expectedCall.input !== void 0) {
-        if (!this.deepEquals(expectedCall.input, actualCall.input)) {
-          misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
-          continue;
-        }
-      }
-      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
-    }
-    const totalChecks = expected.length || 1;
-    const score = hits.length / totalChecks;
-    return {
-      score,
-      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
-      hits,
-      misses,
-      expectedAspectCount: totalChecks
-    };
-  }
-  deepEquals(a, b) {
-    if (a === b) return true;
-    if (typeof a !== typeof b) return false;
-    if (typeof a !== "object" || a === null || b === null) return false;
-    if (Array.isArray(a) && Array.isArray(b)) {
-      if (a.length !== b.length) return false;
-      return a.every((val, i) => this.deepEquals(val, b[i]));
-    }
-    if (Array.isArray(a) || Array.isArray(b)) return false;
-    const aObj = a;
-    const bObj = b;
-    const aKeys = Object.keys(aObj);
-    const bKeys = Object.keys(bObj);
-    if (aKeys.length !== bKeys.length) return false;
-    return aKeys.every((key) => this.deepEquals(aObj[key], bObj[key]));
-  }
-};
 var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
 {{EVALUATOR_RESULTS_JSON}}
@@ -5392,6 +5241,7 @@ async function runEvalCase(options) {
       judgeProvider,
       agentTimeoutMs,
       candidateTrace,
+      candidateTraceRef: providerResponse.traceRef,
       candidateTraceSummary
     });
   } catch (error) {
@@ -5411,6 +5261,7 @@ async function evaluateCandidate(options) {
     judgeProvider,
     agentTimeoutMs,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   } = options;
   const gradeTimestamp = nowFn();
@@ -5426,6 +5277,7 @@ async function evaluateCandidate(options) {
     judgeProvider,
     agentTimeoutMs,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   });
   const completedAt = nowFn();
@@ -5480,6 +5332,7 @@ async function runEvaluatorsForCase(options) {
     judgeProvider,
     agentTimeoutMs,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   } = options;
   if (evalCase.evaluators && evalCase.evaluators.length > 0) {
@@ -5496,6 +5349,7 @@ async function runEvaluatorsForCase(options) {
       judgeProvider,
       agentTimeoutMs,
       candidateTrace,
+      candidateTraceRef,
       candidateTraceSummary
     });
   }
@@ -5514,6 +5368,7 @@ async function runEvaluatorsForCase(options) {
     now,
     judgeProvider,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   });
   return { score };
@@ -5532,6 +5387,7 @@ async function runEvaluatorList(options) {
     judgeProvider,
     agentTimeoutMs,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   } = options;
   const scored = [];
@@ -5578,7 +5434,9 @@ async function runEvaluatorList(options) {
           provider,
           attempt,
           promptInputs,
-          now
+          now,
+          candidateTraceRef,
+          candidateTraceSummary
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -5616,8 +5474,6 @@ async function runEvaluatorList(options) {
               return new ToolTrajectoryEvaluator({
                 config: memberConfig
               });
-            case "expected_messages":
-              return new ExpectedMessagesEvaluator();
             default: {
               const unknownConfig = memberConfig;
               throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -5667,32 +5523,7 @@ async function runEvaluatorList(options) {
           promptInputs,
           now,
           candidateTrace,
-          candidateTraceSummary
-        });
-        const weight = evaluator.weight ?? 1;
-        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
-        evaluatorResults.push({
-          name: evaluator.name,
-          type: evaluator.type,
-          score: score2.score,
-          weight,
-          verdict: score2.verdict,
-          hits: score2.hits,
-          misses: score2.misses,
-          reasoning: score2.reasoning
-        });
-      }
-      if (evaluator.type === "expected_messages") {
-        const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
-        const score2 = expectedMessagesEvaluator.evaluate({
-          evalCase,
-          candidate,
-          target,
-          provider,
-          attempt,
-          promptInputs,
-          now,
-          candidateTrace,
+          candidateTraceRef,
           candidateTraceSummary
         });
         const weight = evaluator.weight ?? 1;
@@ -6065,7 +5896,6 @@ function createAgentKernel() {
 0 && (module.exports = {
   CodeEvaluator,
   CompositeEvaluator,
-  ExpectedMessagesEvaluator,
   LlmJudgeEvaluator,
   TEST_MESSAGE_ROLES,
   ToolTrajectoryEvaluator,
@@ -6083,7 +5913,6 @@ function createAgentKernel() {
   generateRubrics,
   getHitCount,
   isEvaluatorKind,
-  isExpectedToolCall,
   isGuidelineFile,
   isJsonObject,
   isJsonValue,