npm - @agentv/core - Versions diffs - 0.25.0 → 0.26.0 - Mend

@agentv/core 0.25.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

package/dist/{chunk-OYTL3LNN.js → chunk-NDEN3H2B.js} +5 -2
package/dist/chunk-NDEN3H2B.js.map +1 -0
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +1 -1
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +82 -21
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +6 -0
package/dist/index.d.ts +6 -0
package/dist/index.js +79 -21
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/chunk-OYTL3LNN.js.map +0 -1

package/dist/index.d.cts CHANGED

@@ -56,6 +56,8 @@ interface ToolTrajectoryEvaluatorConfig {
     readonly minimums?: Readonly<Record<string, number>>;
     /** Expected tool sequence (for in_order/exact modes) */
     readonly expected?: readonly ToolTrajectoryExpectedItem[];
+    /** Optional weight for top-level aggregation (defaults to 1.0) */
+    readonly weight?: number;
 }
 /**
  * Expected tool call item in a trajectory sequence.
@@ -186,6 +188,7 @@ type CodeEvaluatorConfig = {
     readonly resolvedScriptPath?: string;
     readonly cwd?: string;
     readonly resolvedCwd?: string;
+    readonly weight?: number;
 };
 type LlmJudgeEvaluatorConfig = {
     readonly name: string;
@@ -193,6 +196,7 @@ type LlmJudgeEvaluatorConfig = {
     readonly prompt?: string;
     readonly promptPath?: string;
     readonly rubrics?: readonly RubricItem[];
+    readonly weight?: number;
 };
 type RubricItem = {
     readonly id: string;
@@ -218,10 +222,12 @@ type CompositeEvaluatorConfig = {
     readonly type: 'composite';
     readonly evaluators: readonly EvaluatorConfig[];
     readonly aggregator: CompositeAggregatorConfig;
+    readonly weight?: number;
 };
 type ExpectedMessagesEvaluatorConfig = {
     readonly name: string;
     readonly type: 'expected_messages';
+    readonly weight?: number;
 };
 type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedMessagesEvaluatorConfig;
 /**

package/dist/index.d.ts CHANGED

@@ -56,6 +56,8 @@ interface ToolTrajectoryEvaluatorConfig {
     readonly minimums?: Readonly<Record<string, number>>;
     /** Expected tool sequence (for in_order/exact modes) */
     readonly expected?: readonly ToolTrajectoryExpectedItem[];
+    /** Optional weight for top-level aggregation (defaults to 1.0) */
+    readonly weight?: number;
 }
 /**
  * Expected tool call item in a trajectory sequence.
@@ -186,6 +188,7 @@ type CodeEvaluatorConfig = {
     readonly resolvedScriptPath?: string;
     readonly cwd?: string;
     readonly resolvedCwd?: string;
+    readonly weight?: number;
 };
 type LlmJudgeEvaluatorConfig = {
     readonly name: string;
@@ -193,6 +196,7 @@ type LlmJudgeEvaluatorConfig = {
     readonly prompt?: string;
     readonly promptPath?: string;
     readonly rubrics?: readonly RubricItem[];
+    readonly weight?: number;
 };
 type RubricItem = {
     readonly id: string;
@@ -218,10 +222,12 @@ type CompositeEvaluatorConfig = {
     readonly type: 'composite';
     readonly evaluators: readonly EvaluatorConfig[];
     readonly aggregator: CompositeAggregatorConfig;
+    readonly weight?: number;
 };
 type ExpectedMessagesEvaluatorConfig = {
     readonly name: string;
     readonly type: 'expected_messages';
+    readonly weight?: number;
 };
 type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedMessagesEvaluatorConfig;
 /**

package/dist/index.js CHANGED

@@ -9,7 +9,7 @@ import {
   readTextFile,
   resolveFileReference,
   resolveTargetDefinition
-} from "./chunk-OYTL3LNN.js";
+} from "./chunk-NDEN3H2B.js";
 // src/evaluation/types.ts
 var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -455,6 +455,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
         continue;
       }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       const cwd = asString2(rawEvaluator.cwd);
       let resolvedCwd;
       if (cwd) {
@@ -475,7 +476,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         type: "code",
         script,
         cwd,
-        resolvedCwd
+        resolvedCwd,
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
       });
       continue;
     }
@@ -570,18 +572,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           ...promptPath2 ? { promptPath: promptPath2 } : {}
         };
       }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       evaluators.push({
         name,
         type: "composite",
         evaluators: memberEvaluators,
-        aggregator
+        aggregator,
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
       });
       continue;
     }
     if (typeValue === "expected_messages") {
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       evaluators.push({
         name,
-        type: "expected_messages"
+        type: "expected_messages",
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
       });
       continue;
     }
@@ -637,12 +643,14 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         );
         continue;
       }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       const config = {
         name,
         type: "tool_trajectory",
         mode,
         ...minimums ? { minimums } : {},
-        ...expected ? { expected } : {}
+        ...expected ? { expected } : {},
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
       };
       evaluators.push(config);
       continue;
@@ -683,19 +691,23 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
         continue;
       }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       evaluators.push({
         name,
         type: "llm_judge",
-        rubrics: parsedRubrics
+        rubrics: parsedRubrics,
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
       });
       continue;
     }
+    const weight = validateWeight(rawEvaluator.weight, name, evalId);
     evaluators.push({
       name,
       type: "llm_judge",
       prompt,
       promptPath,
-      ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
+      ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
+      ...weight !== void 0 ? { weight } : {}
     });
   }
   return evaluators.length > 0 ? evaluators : void 0;
@@ -725,6 +737,27 @@ ${detailBlock}${ANSI_RESET3}`);
     console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
   }
 }
+function validateWeight(rawWeight, evaluatorName, evalId) {
+  if (rawWeight === void 0) {
+    return void 0;
+  }
+  if (typeof rawWeight !== "number") {
+    throw new Error(
+      `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be a number`
+    );
+  }
+  if (!Number.isFinite(rawWeight)) {
+    throw new Error(
+      `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be finite (got ${rawWeight})`
+    );
+  }
+  if (rawWeight < 0) {
+    throw new Error(
+      `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be non-negative (got ${rawWeight})`
+    );
+  }
+  return rawWeight;
+}
 // src/evaluation/loaders/message-processor.ts
 import { readFile as readFile3 } from "node:fs/promises";
@@ -3510,9 +3543,11 @@ var CodeEvaluator = class {
         expected_outcome: context.evalCase.expected_outcome,
         reference_answer: context.evalCase.reference_answer,
         candidate_answer: context.candidate,
-        guideline_paths: context.evalCase.guideline_paths,
-        input_files: context.evalCase.file_paths,
-        input_segments: context.evalCase.input_segments
+        guideline_files: context.evalCase.guideline_paths,
+        input_files: context.evalCase.file_paths.filter(
+          (path13) => !context.evalCase.guideline_paths.includes(path13)
+        ),
+        input_messages: context.evalCase.input_messages
       },
       null,
       2
@@ -4685,14 +4720,12 @@ async function evaluateCandidate(options) {
   } else {
     if (promptInputs.chatPrompt) {
       lmProviderRequest = {
-        chat_prompt: promptInputs.chatPrompt,
-        guideline_paths: evalCase.guideline_paths
+        chat_prompt: promptInputs.chatPrompt
       };
     } else {
       lmProviderRequest = {
         question: promptInputs.question,
-        guidelines: promptInputs.guidelines,
-        guideline_paths: evalCase.guideline_paths
+        guidelines: promptInputs.guidelines
       };
     }
   }
@@ -4799,11 +4832,13 @@ async function runEvaluatorList(options) {
           now,
           judgeProvider
         });
-        scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
         evaluatorResults.push({
           name: evaluator.name,
           type: evaluator.type,
           score: score2.score,
+          weight,
           verdict: score2.verdict,
           hits: score2.hits,
           misses: score2.misses,
@@ -4826,11 +4861,13 @@ async function runEvaluatorList(options) {
           promptInputs,
           now
         });
-        scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
         evaluatorResults.push({
           name: evaluator.name,
           type: "code_judge",
           score: score2.score,
+          weight,
           verdict: score2.verdict,
           hits: score2.hits,
           misses: score2.misses,
@@ -4883,11 +4920,13 @@ async function runEvaluatorList(options) {
           now,
           judgeProvider
         });
-        scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
         evaluatorResults.push({
           name: evaluator.name,
           type: evaluator.type,
           score: score2.score,
+          weight,
           verdict: score2.verdict,
           hits: score2.hits,
           misses: score2.misses,
@@ -4911,11 +4950,13 @@ async function runEvaluatorList(options) {
           candidateTrace,
           candidateTraceSummary
         });
-        scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
         evaluatorResults.push({
           name: evaluator.name,
           type: evaluator.type,
           score: score2.score,
+          weight,
           verdict: score2.verdict,
           hits: score2.hits,
           misses: score2.misses,
@@ -4935,11 +4976,13 @@ async function runEvaluatorList(options) {
           candidateTrace,
           candidateTraceSummary
         });
-        scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
         evaluatorResults.push({
           name: evaluator.name,
           type: evaluator.type,
           score: score2.score,
+          weight,
           verdict: score2.verdict,
           hits: score2.hits,
           misses: score2.misses,
@@ -4957,15 +5000,18 @@ async function runEvaluatorList(options) {
         reasoning: message
       };
       const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
+      const weight = evaluator.weight ?? 1;
       scored.push({
         score: fallbackScore,
         name: evaluator.name ?? "unknown",
-        type: resultType ?? "llm_judge"
+        type: resultType ?? "llm_judge",
+        weight
       });
       evaluatorResults.push({
         name: evaluator.name ?? "unknown",
         type: resultType ?? "llm_judge",
         score: 0,
+        weight,
         verdict: "fail",
         hits: [],
         misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
@@ -4973,7 +5019,9 @@ async function runEvaluatorList(options) {
       });
     }
   }
-  const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
+  const aggregateScore = scored.length > 0 ? computeWeightedMean(
+    scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
+  ) : 0;
   const hits = scored.flatMap((entry) => entry.score.hits);
   const misses = scored.flatMap((entry) => entry.score.misses);
   const expectedAspectCount = scored.reduce(
@@ -5199,6 +5247,16 @@ function mapChildResults(children) {
     evaluator_results: mapChildResults(child.evaluatorResults)
   }));
 }
+function computeWeightedMean(entries) {
+  let totalWeight = 0;
+  let weightedSum = 0;
+  for (const entry of entries) {
+    const weight = entry.weight ?? 1;
+    totalWeight += weight;
+    weightedSum += entry.score * weight;
+  }
+  return totalWeight > 0 ? weightedSum / totalWeight : 0;
+}
 // src/evaluation/generators/rubric-generator.ts
 import { generateText as generateText3 } from "ai";