@agentv/core 0.25.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -56,6 +56,8 @@ interface ToolTrajectoryEvaluatorConfig {
56
56
  readonly minimums?: Readonly<Record<string, number>>;
57
57
  /** Expected tool sequence (for in_order/exact modes) */
58
58
  readonly expected?: readonly ToolTrajectoryExpectedItem[];
59
+ /** Optional weight for top-level aggregation (defaults to 1.0) */
60
+ readonly weight?: number;
59
61
  }
60
62
  /**
61
63
  * Expected tool call item in a trajectory sequence.
@@ -176,7 +178,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
176
178
  * Guard validating raw test messages.
177
179
  */
178
180
  declare function isTestMessage(value: unknown): value is TestMessage;
179
- declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_messages"];
181
+ declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_tool_calls"];
180
182
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
181
183
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
182
184
  type CodeEvaluatorConfig = {
@@ -186,6 +188,7 @@ type CodeEvaluatorConfig = {
186
188
  readonly resolvedScriptPath?: string;
187
189
  readonly cwd?: string;
188
190
  readonly resolvedCwd?: string;
191
+ readonly weight?: number;
189
192
  };
190
193
  type LlmJudgeEvaluatorConfig = {
191
194
  readonly name: string;
@@ -193,6 +196,7 @@ type LlmJudgeEvaluatorConfig = {
193
196
  readonly prompt?: string;
194
197
  readonly promptPath?: string;
195
198
  readonly rubrics?: readonly RubricItem[];
199
+ readonly weight?: number;
196
200
  };
197
201
  type RubricItem = {
198
202
  readonly id: string;
@@ -218,12 +222,14 @@ type CompositeEvaluatorConfig = {
218
222
  readonly type: 'composite';
219
223
  readonly evaluators: readonly EvaluatorConfig[];
220
224
  readonly aggregator: CompositeAggregatorConfig;
225
+ readonly weight?: number;
221
226
  };
222
- type ExpectedMessagesEvaluatorConfig = {
227
+ type ExpectedToolCallsEvaluatorConfig = {
223
228
  readonly name: string;
224
- readonly type: 'expected_messages';
229
+ readonly type: 'expected_tool_calls';
230
+ readonly weight?: number;
225
231
  };
226
- type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedMessagesEvaluatorConfig;
232
+ type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedToolCallsEvaluatorConfig;
227
233
  /**
228
234
  * Eval case definition sourced from AgentV specs.
229
235
  */
@@ -764,8 +770,8 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
764
770
  * Extracts tool_calls from assistant messages in expected_messages and compares them
765
771
  * sequentially against tool_call events in the trace.
766
772
  */
767
- declare class ExpectedMessagesEvaluator implements Evaluator {
768
- readonly kind = "expected_messages";
773
+ declare class ExpectedToolCallsEvaluator implements Evaluator {
774
+ readonly kind = "expected_tool_calls";
769
775
  evaluate(context: EvaluationContext): EvaluationScore;
770
776
  private extractExpectedToolCalls;
771
777
  private validateToolCalls;
@@ -861,4 +867,4 @@ type AgentKernel = {
861
867
  };
862
868
  declare function createAgentKernel(): AgentKernel;
863
869
 
864
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, ExpectedMessagesEvaluator, type ExpectedMessagesEvaluatorConfig, type ExpectedToolCall, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
870
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExpectedToolCall, ExpectedToolCallsEvaluator, type ExpectedToolCallsEvaluatorConfig, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
package/dist/index.d.ts CHANGED
@@ -56,6 +56,8 @@ interface ToolTrajectoryEvaluatorConfig {
56
56
  readonly minimums?: Readonly<Record<string, number>>;
57
57
  /** Expected tool sequence (for in_order/exact modes) */
58
58
  readonly expected?: readonly ToolTrajectoryExpectedItem[];
59
+ /** Optional weight for top-level aggregation (defaults to 1.0) */
60
+ readonly weight?: number;
59
61
  }
60
62
  /**
61
63
  * Expected tool call item in a trajectory sequence.
@@ -176,7 +178,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
176
178
  * Guard validating raw test messages.
177
179
  */
178
180
  declare function isTestMessage(value: unknown): value is TestMessage;
179
- declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_messages"];
181
+ declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "expected_tool_calls"];
180
182
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
181
183
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
182
184
  type CodeEvaluatorConfig = {
@@ -186,6 +188,7 @@ type CodeEvaluatorConfig = {
186
188
  readonly resolvedScriptPath?: string;
187
189
  readonly cwd?: string;
188
190
  readonly resolvedCwd?: string;
191
+ readonly weight?: number;
189
192
  };
190
193
  type LlmJudgeEvaluatorConfig = {
191
194
  readonly name: string;
@@ -193,6 +196,7 @@ type LlmJudgeEvaluatorConfig = {
193
196
  readonly prompt?: string;
194
197
  readonly promptPath?: string;
195
198
  readonly rubrics?: readonly RubricItem[];
199
+ readonly weight?: number;
196
200
  };
197
201
  type RubricItem = {
198
202
  readonly id: string;
@@ -218,12 +222,14 @@ type CompositeEvaluatorConfig = {
218
222
  readonly type: 'composite';
219
223
  readonly evaluators: readonly EvaluatorConfig[];
220
224
  readonly aggregator: CompositeAggregatorConfig;
225
+ readonly weight?: number;
221
226
  };
222
- type ExpectedMessagesEvaluatorConfig = {
227
+ type ExpectedToolCallsEvaluatorConfig = {
223
228
  readonly name: string;
224
- readonly type: 'expected_messages';
229
+ readonly type: 'expected_tool_calls';
230
+ readonly weight?: number;
225
231
  };
226
- type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedMessagesEvaluatorConfig;
232
+ type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedToolCallsEvaluatorConfig;
227
233
  /**
228
234
  * Eval case definition sourced from AgentV specs.
229
235
  */
@@ -764,8 +770,8 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
764
770
  * Extracts tool_calls from assistant messages in expected_messages and compares them
765
771
  * sequentially against tool_call events in the trace.
766
772
  */
767
- declare class ExpectedMessagesEvaluator implements Evaluator {
768
- readonly kind = "expected_messages";
773
+ declare class ExpectedToolCallsEvaluator implements Evaluator {
774
+ readonly kind = "expected_tool_calls";
769
775
  evaluate(context: EvaluationContext): EvaluationScore;
770
776
  private extractExpectedToolCalls;
771
777
  private validateToolCalls;
@@ -861,4 +867,4 @@ type AgentKernel = {
861
867
  };
862
868
  declare function createAgentKernel(): AgentKernel;
863
869
 
864
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, ExpectedMessagesEvaluator, type ExpectedMessagesEvaluatorConfig, type ExpectedToolCall, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
870
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExpectedToolCall, ExpectedToolCallsEvaluator, type ExpectedToolCallsEvaluatorConfig, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TestMessageToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceEvent, type TraceEventType, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isExpectedToolCall, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, isTraceEvent, isTraceEventType, listTargetNames, loadEvalCases, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries };
package/dist/index.js CHANGED
@@ -9,7 +9,7 @@ import {
9
9
  readTextFile,
10
10
  resolveFileReference,
11
11
  resolveTargetDefinition
12
- } from "./chunk-OYTL3LNN.js";
12
+ } from "./chunk-V3JCB3HI.js";
13
13
 
14
14
  // src/evaluation/types.ts
15
15
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -58,7 +58,7 @@ var EVALUATOR_KIND_VALUES = [
58
58
  "rubric",
59
59
  "composite",
60
60
  "tool_trajectory",
61
- "expected_messages"
61
+ "expected_tool_calls"
62
62
  ];
63
63
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
64
64
  function isEvaluatorKind(value) {
@@ -455,6 +455,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
455
455
  logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
456
456
  continue;
457
457
  }
458
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
458
459
  const cwd = asString2(rawEvaluator.cwd);
459
460
  let resolvedCwd;
460
461
  if (cwd) {
@@ -475,7 +476,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
475
476
  type: "code",
476
477
  script,
477
478
  cwd,
478
- resolvedCwd
479
+ resolvedCwd,
480
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
479
481
  });
480
482
  continue;
481
483
  }
@@ -570,18 +572,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
570
572
  ...promptPath2 ? { promptPath: promptPath2 } : {}
571
573
  };
572
574
  }
575
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
573
576
  evaluators.push({
574
577
  name,
575
578
  type: "composite",
576
579
  evaluators: memberEvaluators,
577
- aggregator
580
+ aggregator,
581
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
578
582
  });
579
583
  continue;
580
584
  }
581
- if (typeValue === "expected_messages") {
585
+ if (typeValue === "expected_tool_calls") {
586
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
582
587
  evaluators.push({
583
588
  name,
584
- type: "expected_messages"
589
+ type: "expected_tool_calls",
590
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
585
591
  });
586
592
  continue;
587
593
  }
@@ -637,12 +643,14 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
637
643
  );
638
644
  continue;
639
645
  }
646
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
640
647
  const config = {
641
648
  name,
642
649
  type: "tool_trajectory",
643
650
  mode,
644
651
  ...minimums ? { minimums } : {},
645
- ...expected ? { expected } : {}
652
+ ...expected ? { expected } : {},
653
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
646
654
  };
647
655
  evaluators.push(config);
648
656
  continue;
@@ -683,19 +691,23 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
683
691
  logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
684
692
  continue;
685
693
  }
694
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
686
695
  evaluators.push({
687
696
  name,
688
697
  type: "llm_judge",
689
- rubrics: parsedRubrics
698
+ rubrics: parsedRubrics,
699
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
690
700
  });
691
701
  continue;
692
702
  }
703
+ const weight = validateWeight(rawEvaluator.weight, name, evalId);
693
704
  evaluators.push({
694
705
  name,
695
706
  type: "llm_judge",
696
707
  prompt,
697
708
  promptPath,
698
- ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
709
+ ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
710
+ ...weight !== void 0 ? { weight } : {}
699
711
  });
700
712
  }
701
713
  return evaluators.length > 0 ? evaluators : void 0;
@@ -725,6 +737,27 @@ ${detailBlock}${ANSI_RESET3}`);
725
737
  console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
726
738
  }
727
739
  }
740
+ function validateWeight(rawWeight, evaluatorName, evalId) {
741
+ if (rawWeight === void 0) {
742
+ return void 0;
743
+ }
744
+ if (typeof rawWeight !== "number") {
745
+ throw new Error(
746
+ `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be a number`
747
+ );
748
+ }
749
+ if (!Number.isFinite(rawWeight)) {
750
+ throw new Error(
751
+ `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be finite (got ${rawWeight})`
752
+ );
753
+ }
754
+ if (rawWeight < 0) {
755
+ throw new Error(
756
+ `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be non-negative (got ${rawWeight})`
757
+ );
758
+ }
759
+ return rawWeight;
760
+ }
728
761
 
729
762
  // src/evaluation/loaders/message-processor.ts
730
763
  import { readFile as readFile3 } from "node:fs/promises";
@@ -3510,9 +3543,11 @@ var CodeEvaluator = class {
3510
3543
  expected_outcome: context.evalCase.expected_outcome,
3511
3544
  reference_answer: context.evalCase.reference_answer,
3512
3545
  candidate_answer: context.candidate,
3513
- guideline_paths: context.evalCase.guideline_paths,
3514
- input_files: context.evalCase.file_paths,
3515
- input_segments: context.evalCase.input_segments
3546
+ guideline_files: context.evalCase.guideline_paths,
3547
+ input_files: context.evalCase.file_paths.filter(
3548
+ (path13) => !context.evalCase.guideline_paths.includes(path13)
3549
+ ),
3550
+ input_messages: context.evalCase.input_messages
3516
3551
  },
3517
3552
  null,
3518
3553
  2
@@ -3778,8 +3813,8 @@ var ToolTrajectoryEvaluator = class {
3778
3813
  };
3779
3814
  }
3780
3815
  };
3781
- var ExpectedMessagesEvaluator = class {
3782
- kind = "expected_messages";
3816
+ var ExpectedToolCallsEvaluator = class {
3817
+ kind = "expected_tool_calls";
3783
3818
  evaluate(context) {
3784
3819
  const { candidateTrace, evalCase } = context;
3785
3820
  const expectedSegments = evalCase.expected_segments;
@@ -4685,14 +4720,12 @@ async function evaluateCandidate(options) {
4685
4720
  } else {
4686
4721
  if (promptInputs.chatPrompt) {
4687
4722
  lmProviderRequest = {
4688
- chat_prompt: promptInputs.chatPrompt,
4689
- guideline_paths: evalCase.guideline_paths
4723
+ chat_prompt: promptInputs.chatPrompt
4690
4724
  };
4691
4725
  } else {
4692
4726
  lmProviderRequest = {
4693
4727
  question: promptInputs.question,
4694
- guidelines: promptInputs.guidelines,
4695
- guideline_paths: evalCase.guideline_paths
4728
+ guidelines: promptInputs.guidelines
4696
4729
  };
4697
4730
  }
4698
4731
  }
@@ -4799,11 +4832,13 @@ async function runEvaluatorList(options) {
4799
4832
  now,
4800
4833
  judgeProvider
4801
4834
  });
4802
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4835
+ const weight = evaluator.weight ?? 1;
4836
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
4803
4837
  evaluatorResults.push({
4804
4838
  name: evaluator.name,
4805
4839
  type: evaluator.type,
4806
4840
  score: score2.score,
4841
+ weight,
4807
4842
  verdict: score2.verdict,
4808
4843
  hits: score2.hits,
4809
4844
  misses: score2.misses,
@@ -4826,11 +4861,13 @@ async function runEvaluatorList(options) {
4826
4861
  promptInputs,
4827
4862
  now
4828
4863
  });
4829
- scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
4864
+ const weight = evaluator.weight ?? 1;
4865
+ scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
4830
4866
  evaluatorResults.push({
4831
4867
  name: evaluator.name,
4832
4868
  type: "code_judge",
4833
4869
  score: score2.score,
4870
+ weight,
4834
4871
  verdict: score2.verdict,
4835
4872
  hits: score2.hits,
4836
4873
  misses: score2.misses,
@@ -4860,8 +4897,8 @@ async function runEvaluatorList(options) {
4860
4897
  return new ToolTrajectoryEvaluator({
4861
4898
  config: memberConfig
4862
4899
  });
4863
- case "expected_messages":
4864
- return new ExpectedMessagesEvaluator();
4900
+ case "expected_tool_calls":
4901
+ return new ExpectedToolCallsEvaluator();
4865
4902
  default: {
4866
4903
  const unknownConfig = memberConfig;
4867
4904
  throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -4883,11 +4920,13 @@ async function runEvaluatorList(options) {
4883
4920
  now,
4884
4921
  judgeProvider
4885
4922
  });
4886
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4923
+ const weight = evaluator.weight ?? 1;
4924
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
4887
4925
  evaluatorResults.push({
4888
4926
  name: evaluator.name,
4889
4927
  type: evaluator.type,
4890
4928
  score: score2.score,
4929
+ weight,
4891
4930
  verdict: score2.verdict,
4892
4931
  hits: score2.hits,
4893
4932
  misses: score2.misses,
@@ -4911,20 +4950,22 @@ async function runEvaluatorList(options) {
4911
4950
  candidateTrace,
4912
4951
  candidateTraceSummary
4913
4952
  });
4914
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4953
+ const weight = evaluator.weight ?? 1;
4954
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
4915
4955
  evaluatorResults.push({
4916
4956
  name: evaluator.name,
4917
4957
  type: evaluator.type,
4918
4958
  score: score2.score,
4959
+ weight,
4919
4960
  verdict: score2.verdict,
4920
4961
  hits: score2.hits,
4921
4962
  misses: score2.misses,
4922
4963
  reasoning: score2.reasoning
4923
4964
  });
4924
4965
  }
4925
- if (evaluator.type === "expected_messages") {
4926
- const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
4927
- const score2 = expectedMessagesEvaluator.evaluate({
4966
+ if (evaluator.type === "expected_tool_calls") {
4967
+ const expectedToolCallsEvaluator = new ExpectedToolCallsEvaluator();
4968
+ const score2 = expectedToolCallsEvaluator.evaluate({
4928
4969
  evalCase,
4929
4970
  candidate,
4930
4971
  target,
@@ -4935,11 +4976,13 @@ async function runEvaluatorList(options) {
4935
4976
  candidateTrace,
4936
4977
  candidateTraceSummary
4937
4978
  });
4938
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4979
+ const weight = evaluator.weight ?? 1;
4980
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
4939
4981
  evaluatorResults.push({
4940
4982
  name: evaluator.name,
4941
4983
  type: evaluator.type,
4942
4984
  score: score2.score,
4985
+ weight,
4943
4986
  verdict: score2.verdict,
4944
4987
  hits: score2.hits,
4945
4988
  misses: score2.misses,
@@ -4957,15 +5000,18 @@ async function runEvaluatorList(options) {
4957
5000
  reasoning: message
4958
5001
  };
4959
5002
  const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
5003
+ const weight = evaluator.weight ?? 1;
4960
5004
  scored.push({
4961
5005
  score: fallbackScore,
4962
5006
  name: evaluator.name ?? "unknown",
4963
- type: resultType ?? "llm_judge"
5007
+ type: resultType ?? "llm_judge",
5008
+ weight
4964
5009
  });
4965
5010
  evaluatorResults.push({
4966
5011
  name: evaluator.name ?? "unknown",
4967
5012
  type: resultType ?? "llm_judge",
4968
5013
  score: 0,
5014
+ weight,
4969
5015
  verdict: "fail",
4970
5016
  hits: [],
4971
5017
  misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
@@ -4973,7 +5019,9 @@ async function runEvaluatorList(options) {
4973
5019
  });
4974
5020
  }
4975
5021
  }
4976
- const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
5022
+ const aggregateScore = scored.length > 0 ? computeWeightedMean(
5023
+ scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
5024
+ ) : 0;
4977
5025
  const hits = scored.flatMap((entry) => entry.score.hits);
4978
5026
  const misses = scored.flatMap((entry) => entry.score.misses);
4979
5027
  const expectedAspectCount = scored.reduce(
@@ -5199,6 +5247,16 @@ function mapChildResults(children) {
5199
5247
  evaluator_results: mapChildResults(child.evaluatorResults)
5200
5248
  }));
5201
5249
  }
5250
+ function computeWeightedMean(entries) {
5251
+ let totalWeight = 0;
5252
+ let weightedSum = 0;
5253
+ for (const entry of entries) {
5254
+ const weight = entry.weight ?? 1;
5255
+ totalWeight += weight;
5256
+ weightedSum += entry.score * weight;
5257
+ }
5258
+ return totalWeight > 0 ? weightedSum / totalWeight : 0;
5259
+ }
5202
5260
 
5203
5261
  // src/evaluation/generators/rubric-generator.ts
5204
5262
  import { generateText as generateText3 } from "ai";
@@ -5287,7 +5345,7 @@ function createAgentKernel() {
5287
5345
  export {
5288
5346
  CodeEvaluator,
5289
5347
  CompositeEvaluator,
5290
- ExpectedMessagesEvaluator,
5348
+ ExpectedToolCallsEvaluator,
5291
5349
  LlmJudgeEvaluator,
5292
5350
  TEST_MESSAGE_ROLES,
5293
5351
  ToolTrajectoryEvaluator,