npm - @agentv/core - Versions diffs - 0.25.0 → 1.0.0 - Mend

@agentv/core 0.25.0 → 1.0.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (13)

package/dist/{chunk-OYTL3LNN.js → chunk-V3JCB3HI.js} +5 -2
package/dist/chunk-V3JCB3HI.js.map +1 -0
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +1 -1
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +93 -32
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +13 -7
package/dist/index.d.ts +13 -7
package/dist/index.js +89 -31
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/chunk-OYTL3LNN.js.map +0 -1

package/dist/index.d.cts CHANGED Viewed

@@ -56,6 +56,8 @@ interface ToolTrajectoryEvaluatorConfig {
     readonly minimums?: Readonly<Record<string, number>>;
     /** Expected tool sequence (for in_order/exact modes) */
     readonly expected?: readonly ToolTrajectoryExpectedItem[];
+    /** Optional weight for top-level aggregation (defaults to 1.0) */
+    readonly weight?: number;
 }
 /**
  * Expected tool call item in a trajectory sequence.
@@ -176,7 +178,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
  * Guard validating raw test messages.
  */
 declare function isTestMessage(value: unknown): value is TestMessage;
-declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_messages"];
+declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_tool_calls"];
 type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
 declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
 type CodeEvaluatorConfig = {
@@ -186,6 +188,7 @@ type CodeEvaluatorConfig = {
     readonly resolvedScriptPath?: string;
     readonly cwd?: string;
     readonly resolvedCwd?: string;
+    readonly weight?: number;
 };
 type LlmJudgeEvaluatorConfig = {
     readonly name: string;
@@ -193,6 +196,7 @@ type LlmJudgeEvaluatorConfig = {
     readonly prompt?: string;
     readonly promptPath?: string;
     readonly rubrics?: readonly RubricItem[];
+    readonly weight?: number;
 };
 type RubricItem = {
     readonly id: string;
@@ -218,12 +222,14 @@ type CompositeEvaluatorConfig = {
     readonly type: 'composite';
     readonly evaluators: readonly EvaluatorConfig[];
     readonly aggregator: CompositeAggregatorConfig;
+    readonly weight?: number;
 };
-type ExpectedMessagesEvaluatorConfig = {
+type ExpectedToolCallsEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'expected_messages';
+    readonly type: 'expected_tool_calls';
+    readonly weight?: number;
 };
-type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedMessagesEvaluatorConfig;
+type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedToolCallsEvaluatorConfig;
 /**
  * Eval case definition sourced from AgentV specs.
  */
@@ -764,8 +770,8 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
  * Extracts tool_calls from assistant messages in expected_messages and compares them
  * sequentially against tool_call events in the trace.
  */
-declare class ExpectedMessagesEvaluator implements Evaluator {
-    readonly kind = "expected_messages";
+declare class ExpectedToolCallsEvaluator implements Evaluator {
+    readonly kind = "expected_tool_calls";
     evaluate(context: EvaluationContext): EvaluationScore;
     private extractExpectedToolCalls;
     private validateToolCalls;
@@ -861,4 +867,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, ExpectedMessagesEvaluator, type ExpectedMessagesEvaluatorConfig, type ExpectedToolCall, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, 
normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
+export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExpectedToolCall, ExpectedToolCallsEvaluator, type ExpectedToolCallsEvaluatorConfig, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, 
normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };

package/dist/index.d.ts CHANGED Viewed

@@ -56,6 +56,8 @@ interface ToolTrajectoryEvaluatorConfig {
     readonly minimums?: Readonly<Record<string, number>>;
     /** Expected tool sequence (for in_order/exact modes) */
     readonly expected?: readonly ToolTrajectoryExpectedItem[];
+    /** Optional weight for top-level aggregation (defaults to 1.0) */
+    readonly weight?: number;
 }
 /**
  * Expected tool call item in a trajectory sequence.
@@ -176,7 +178,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
  * Guard validating raw test messages.
  */
 declare function isTestMessage(value: unknown): value is TestMessage;
-declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_messages"];
+declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_tool_calls"];
 type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
 declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
 type CodeEvaluatorConfig = {
@@ -186,6 +188,7 @@ type CodeEvaluatorConfig = {
     readonly resolvedScriptPath?: string;
     readonly cwd?: string;
     readonly resolvedCwd?: string;
+    readonly weight?: number;
 };
 type LlmJudgeEvaluatorConfig = {
     readonly name: string;
@@ -193,6 +196,7 @@ type LlmJudgeEvaluatorConfig = {
     readonly prompt?: string;
     readonly promptPath?: string;
     readonly rubrics?: readonly RubricItem[];
+    readonly weight?: number;
 };
 type RubricItem = {
     readonly id: string;
@@ -218,12 +222,14 @@ type CompositeEvaluatorConfig = {
     readonly type: 'composite';
     readonly evaluators: readonly EvaluatorConfig[];
     readonly aggregator: CompositeAggregatorConfig;
+    readonly weight?: number;
 };
-type ExpectedMessagesEvaluatorConfig = {
+type ExpectedToolCallsEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'expected_messages';
+    readonly type: 'expected_tool_calls';
+    readonly weight?: number;
 };
-type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedMessagesEvaluatorConfig;
+type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedToolCallsEvaluatorConfig;
 /**
  * Eval case definition sourced from AgentV specs.
  */
@@ -764,8 +770,8 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
  * Extracts tool_calls from assistant messages in expected_messages and compares them
  * sequentially against tool_call events in the trace.
  */
-declare class ExpectedMessagesEvaluator implements Evaluator {
-    readonly kind = "expected_messages";
+declare class ExpectedToolCallsEvaluator implements Evaluator {
+    readonly kind = "expected_tool_calls";
     evaluate(context: EvaluationContext): EvaluationScore;
     private extractExpectedToolCalls;
     private validateToolCalls;
@@ -861,4 +867,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, ExpectedMessagesEvaluator, type ExpectedMessagesEvaluatorConfig, type ExpectedToolCall, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, 
normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
+export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExpectedToolCall, ExpectedToolCallsEvaluator, type ExpectedToolCallsEvaluatorConfig, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, 
normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };

package/dist/index.js CHANGED Viewed

@@ -9,7 +9,7 @@ import {
   readTextFile,
   resolveFileReference,
   resolveTargetDefinition
-} from "./chunk-OYTL3LNN.js";
+} from "./chunk-V3JCB3HI.js";
 // src/evaluation/types.ts
 var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -58,7 +58,7 @@ var EVALUATOR_KIND_VALUES = [
   "rubric",
   "composite",
   "tool_trajectory",
-  "expected_messages"
+  "expected_tool_calls"
 ];
 var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
 function isEvaluatorKind(value) {
@@ -455,6 +455,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
         continue;
       }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       const cwd = asString2(rawEvaluator.cwd);
       let resolvedCwd;
       if (cwd) {
@@ -475,7 +476,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         type: "code",
         script,
         cwd,
-        resolvedCwd
+        resolvedCwd,
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
       });
       continue;
     }
@@ -570,18 +572,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           ...promptPath2 ? { promptPath: promptPath2 } : {}
         };
       }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       evaluators.push({
         name,
         type: "composite",
         evaluators: memberEvaluators,
-        aggregator
+        aggregator,
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
       });
       continue;
     }
-    if (typeValue === "expected_messages") {
+    if (typeValue === "expected_tool_calls") {
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       evaluators.push({
         name,
-        type: "expected_messages"
+        type: "expected_tool_calls",
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
       });
       continue;
     }
@@ -637,12 +643,14 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         );
         continue;
       }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       const config = {
         name,
         type: "tool_trajectory",
         mode,
         ...minimums ? { minimums } : {},
-        ...expected ? { expected } : {}
+        ...expected ? { expected } : {},
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
       };
       evaluators.push(config);
       continue;
@@ -683,19 +691,23 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
         continue;
       }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       evaluators.push({
         name,
         type: "llm_judge",
-        rubrics: parsedRubrics
+        rubrics: parsedRubrics,
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
       });
       continue;
     }
+    const weight = validateWeight(rawEvaluator.weight, name, evalId);
     evaluators.push({
       name,
       type: "llm_judge",
       prompt,
       promptPath,
-      ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
+      ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
+      ...weight !== void 0 ? { weight } : {}
     });
   }
   return evaluators.length > 0 ? evaluators : void 0;
@@ -725,6 +737,27 @@ ${detailBlock}${ANSI_RESET3}`);
     console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
   }
 }
+function validateWeight(rawWeight, evaluatorName, evalId) {
+  if (rawWeight === void 0) {
+    return void 0;
+  }
+  if (typeof rawWeight !== "number") {
+    throw new Error(
+      `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be a number`
+    );
+  }
+  if (!Number.isFinite(rawWeight)) {
+    throw new Error(
+      `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be finite (got ${rawWeight})`
+    );
+  }
+  if (rawWeight < 0) {
+    throw new Error(
+      `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be non-negative (got ${rawWeight})`
+    );
+  }
+  return rawWeight;
+}
 // src/evaluation/loaders/message-processor.ts
 import { readFile as readFile3 } from "node:fs/promises";
@@ -3510,9 +3543,11 @@ var CodeEvaluator = class {
         expected_outcome: context.evalCase.expected_outcome,
         reference_answer: context.evalCase.reference_answer,
         candidate_answer: context.candidate,
-        guideline_paths: context.evalCase.guideline_paths,
-        input_files: context.evalCase.file_paths,
-        input_segments: context.evalCase.input_segments
+        guideline_files: context.evalCase.guideline_paths,
+        input_files: context.evalCase.file_paths.filter(
+          (path13) => !context.evalCase.guideline_paths.includes(path13)
+        ),
+        input_messages: context.evalCase.input_messages
       },
       null,
       2
@@ -3778,8 +3813,8 @@ var ToolTrajectoryEvaluator = class {
     };
   }
 };
-var ExpectedMessagesEvaluator = class {
-  kind = "expected_messages";
+var ExpectedToolCallsEvaluator = class {
+  kind = "expected_tool_calls";
   evaluate(context) {
     const { candidateTrace, evalCase } = context;
     const expectedSegments = evalCase.expected_segments;
@@ -4685,14 +4720,12 @@ async function evaluateCandidate(options) {
   } else {
     if (promptInputs.chatPrompt) {
       lmProviderRequest = {
-        chat_prompt: promptInputs.chatPrompt,
-        guideline_paths: evalCase.guideline_paths
+        chat_prompt: promptInputs.chatPrompt
       };
     } else {
       lmProviderRequest = {
         question: promptInputs.question,
-        guidelines: promptInputs.guidelines,
-        guideline_paths: evalCase.guideline_paths
+        guidelines: promptInputs.guidelines
       };
     }
   }
@@ -4799,11 +4832,13 @@ async function runEvaluatorList(options) {
           now,
           judgeProvider
         });
-        scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
         evaluatorResults.push({
           name: evaluator.name,
           type: evaluator.type,
           score: score2.score,
+          weight,
           verdict: score2.verdict,
           hits: score2.hits,
           misses: score2.misses,
@@ -4826,11 +4861,13 @@ async function runEvaluatorList(options) {
           promptInputs,
           now
         });
-        scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
         evaluatorResults.push({
           name: evaluator.name,
           type: "code_judge",
           score: score2.score,
+          weight,
           verdict: score2.verdict,
           hits: score2.hits,
           misses: score2.misses,
@@ -4860,8 +4897,8 @@ async function runEvaluatorList(options) {
               return new ToolTrajectoryEvaluator({
                 config: memberConfig
               });
-            case "expected_messages":
-              return new ExpectedMessagesEvaluator();
+            case "expected_tool_calls":
+              return new ExpectedToolCallsEvaluator();
             default: {
               const unknownConfig = memberConfig;
               throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -4883,11 +4920,13 @@ async function runEvaluatorList(options) {
           now,
           judgeProvider
         });
-        scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
         evaluatorResults.push({
           name: evaluator.name,
           type: evaluator.type,
           score: score2.score,
+          weight,
           verdict: score2.verdict,
           hits: score2.hits,
           misses: score2.misses,
@@ -4911,20 +4950,22 @@ async function runEvaluatorList(options) {
           candidateTrace,
           candidateTraceSummary
         });
-        scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
         evaluatorResults.push({
           name: evaluator.name,
           type: evaluator.type,
           score: score2.score,
+          weight,
           verdict: score2.verdict,
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning
         });
       }
-      if (evaluator.type === "expected_messages") {
-        const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
-        const score2 = expectedMessagesEvaluator.evaluate({
+      if (evaluator.type === "expected_tool_calls") {
+        const expectedToolCallsEvaluator = new ExpectedToolCallsEvaluator();
+        const score2 = expectedToolCallsEvaluator.evaluate({
           evalCase,
           candidate,
           target,
@@ -4935,11 +4976,13 @@ async function runEvaluatorList(options) {
           candidateTrace,
           candidateTraceSummary
         });
-        scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
         evaluatorResults.push({
           name: evaluator.name,
           type: evaluator.type,
           score: score2.score,
+          weight,
           verdict: score2.verdict,
           hits: score2.hits,
           misses: score2.misses,
@@ -4957,15 +5000,18 @@ async function runEvaluatorList(options) {
         reasoning: message
       };
       const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
+      const weight = evaluator.weight ?? 1;
       scored.push({
         score: fallbackScore,
         name: evaluator.name ?? "unknown",
-        type: resultType ?? "llm_judge"
+        type: resultType ?? "llm_judge",
+        weight
       });
       evaluatorResults.push({
         name: evaluator.name ?? "unknown",
         type: resultType ?? "llm_judge",
         score: 0,
+        weight,
         verdict: "fail",
         hits: [],
         misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
@@ -4973,7 +5019,9 @@ async function runEvaluatorList(options) {
       });
     }
   }
-  const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
+  const aggregateScore = scored.length > 0 ? computeWeightedMean(
+    scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
+  ) : 0;
   const hits = scored.flatMap((entry) => entry.score.hits);
   const misses = scored.flatMap((entry) => entry.score.misses);
   const expectedAspectCount = scored.reduce(
@@ -5199,6 +5247,16 @@ function mapChildResults(children) {
     evaluator_results: mapChildResults(child.evaluatorResults)
   }));
 }
+function computeWeightedMean(entries) {
+  let totalWeight = 0;
+  let weightedSum = 0;
+  for (const entry of entries) {
+    const weight = entry.weight ?? 1;
+    totalWeight += weight;
+    weightedSum += entry.score * weight;
+  }
+  return totalWeight > 0 ? weightedSum / totalWeight : 0;
+}
 // src/evaluation/generators/rubric-generator.ts
 import { generateText as generateText3 } from "ai";
@@ -5287,7 +5345,7 @@ function createAgentKernel() {
 export {
   CodeEvaluator,
   CompositeEvaluator,
-  ExpectedMessagesEvaluator,
+  ExpectedToolCallsEvaluator,
   LlmJudgeEvaluator,
   TEST_MESSAGE_ROLES,
   ToolTrajectoryEvaluator,