@agentv/core 0.25.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -56,6 +56,8 @@ interface ToolTrajectoryEvaluatorConfig {
56
56
  readonly minimums?: Readonly<Record<string, number>>;
57
57
  /** Expected tool sequence (for in_order/exact modes) */
58
58
  readonly expected?: readonly ToolTrajectoryExpectedItem[];
59
+ /** Optional weight for top-level aggregation (defaults to 1.0) */
60
+ readonly weight?: number;
59
61
  }
60
62
  /**
61
63
  * Expected tool call item in a trajectory sequence.
@@ -186,6 +188,7 @@ type CodeEvaluatorConfig = {
186
188
  readonly resolvedScriptPath?: string;
187
189
  readonly cwd?: string;
188
190
  readonly resolvedCwd?: string;
191
+ readonly weight?: number;
189
192
  };
190
193
  type LlmJudgeEvaluatorConfig = {
191
194
  readonly name: string;
@@ -193,6 +196,7 @@ type LlmJudgeEvaluatorConfig = {
193
196
  readonly prompt?: string;
194
197
  readonly promptPath?: string;
195
198
  readonly rubrics?: readonly RubricItem[];
199
+ readonly weight?: number;
196
200
  };
197
201
  type RubricItem = {
198
202
  readonly id: string;
@@ -218,10 +222,12 @@ type CompositeEvaluatorConfig = {
218
222
  readonly type: 'composite';
219
223
  readonly evaluators: readonly EvaluatorConfig[];
220
224
  readonly aggregator: CompositeAggregatorConfig;
225
+ readonly weight?: number;
221
226
  };
222
227
  type ExpectedMessagesEvaluatorConfig = {
223
228
  readonly name: string;
224
229
  readonly type: 'expected_messages';
230
+ readonly weight?: number;
225
231
  };
226
232
  type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedMessagesEvaluatorConfig;
227
233
  /**
package/dist/index.d.ts CHANGED
@@ -56,6 +56,8 @@ interface ToolTrajectoryEvaluatorConfig {
56
56
  readonly minimums?: Readonly<Record<string, number>>;
57
57
  /** Expected tool sequence (for in_order/exact modes) */
58
58
  readonly expected?: readonly ToolTrajectoryExpectedItem[];
59
+ /** Optional weight for top-level aggregation (defaults to 1.0) */
60
+ readonly weight?: number;
59
61
  }
60
62
  /**
61
63
  * Expected tool call item in a trajectory sequence.
@@ -186,6 +188,7 @@ type CodeEvaluatorConfig = {
186
188
  readonly resolvedScriptPath?: string;
187
189
  readonly cwd?: string;
188
190
  readonly resolvedCwd?: string;
191
+ readonly weight?: number;
189
192
  };
190
193
  type LlmJudgeEvaluatorConfig = {
191
194
  readonly name: string;
@@ -193,6 +196,7 @@ type LlmJudgeEvaluatorConfig = {
193
196
  readonly prompt?: string;
194
197
  readonly promptPath?: string;
195
198
  readonly rubrics?: readonly RubricItem[];
199
+ readonly weight?: number;
196
200
  };
197
201
  type RubricItem = {
198
202
  readonly id: string;
@@ -218,10 +222,12 @@ type CompositeEvaluatorConfig = {
218
222
  readonly type: 'composite';
219
223
  readonly evaluators: readonly EvaluatorConfig[];
220
224
  readonly aggregator: CompositeAggregatorConfig;
225
+ readonly weight?: number;
221
226
  };
222
227
  type ExpectedMessagesEvaluatorConfig = {
223
228
  readonly name: string;
224
229
  readonly type: 'expected_messages';
230
+ readonly weight?: number;
225
231
  };
226
232
  type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedMessagesEvaluatorConfig;
227
233
  /**
package/dist/index.js CHANGED
@@ -9,7 +9,7 @@ import {
9
9
  readTextFile,
10
10
  resolveFileReference,
11
11
  resolveTargetDefinition
12
- } from "./chunk-OYTL3LNN.js";
12
+ } from "./chunk-NDEN3H2B.js";
13
13
 
14
14
  // src/evaluation/types.ts
15
15
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -455,6 +455,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
455
455
  logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
456
456
  continue;
457
457
  }
458
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
458
459
  const cwd = asString2(rawEvaluator.cwd);
459
460
  let resolvedCwd;
460
461
  if (cwd) {
@@ -475,7 +476,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
475
476
  type: "code",
476
477
  script,
477
478
  cwd,
478
- resolvedCwd
479
+ resolvedCwd,
480
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
479
481
  });
480
482
  continue;
481
483
  }
@@ -570,18 +572,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
570
572
  ...promptPath2 ? { promptPath: promptPath2 } : {}
571
573
  };
572
574
  }
575
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
573
576
  evaluators.push({
574
577
  name,
575
578
  type: "composite",
576
579
  evaluators: memberEvaluators,
577
- aggregator
580
+ aggregator,
581
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
578
582
  });
579
583
  continue;
580
584
  }
581
585
  if (typeValue === "expected_messages") {
586
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
582
587
  evaluators.push({
583
588
  name,
584
- type: "expected_messages"
589
+ type: "expected_messages",
590
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
585
591
  });
586
592
  continue;
587
593
  }
@@ -637,12 +643,14 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
637
643
  );
638
644
  continue;
639
645
  }
646
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
640
647
  const config = {
641
648
  name,
642
649
  type: "tool_trajectory",
643
650
  mode,
644
651
  ...minimums ? { minimums } : {},
645
- ...expected ? { expected } : {}
652
+ ...expected ? { expected } : {},
653
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
646
654
  };
647
655
  evaluators.push(config);
648
656
  continue;
@@ -683,19 +691,23 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
683
691
  logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
684
692
  continue;
685
693
  }
694
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
686
695
  evaluators.push({
687
696
  name,
688
697
  type: "llm_judge",
689
- rubrics: parsedRubrics
698
+ rubrics: parsedRubrics,
699
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
690
700
  });
691
701
  continue;
692
702
  }
703
+ const weight = validateWeight(rawEvaluator.weight, name, evalId);
693
704
  evaluators.push({
694
705
  name,
695
706
  type: "llm_judge",
696
707
  prompt,
697
708
  promptPath,
698
- ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
709
+ ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
710
+ ...weight !== void 0 ? { weight } : {}
699
711
  });
700
712
  }
701
713
  return evaluators.length > 0 ? evaluators : void 0;
@@ -725,6 +737,27 @@ ${detailBlock}${ANSI_RESET3}`);
725
737
  console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
726
738
  }
727
739
  }
740
+ function validateWeight(rawWeight, evaluatorName, evalId) {
741
+ if (rawWeight === void 0) {
742
+ return void 0;
743
+ }
744
+ if (typeof rawWeight !== "number") {
745
+ throw new Error(
746
+ `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be a number`
747
+ );
748
+ }
749
+ if (!Number.isFinite(rawWeight)) {
750
+ throw new Error(
751
+ `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be finite (got ${rawWeight})`
752
+ );
753
+ }
754
+ if (rawWeight < 0) {
755
+ throw new Error(
756
+ `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be non-negative (got ${rawWeight})`
757
+ );
758
+ }
759
+ return rawWeight;
760
+ }
728
761
 
729
762
  // src/evaluation/loaders/message-processor.ts
730
763
  import { readFile as readFile3 } from "node:fs/promises";
@@ -3510,9 +3543,11 @@ var CodeEvaluator = class {
3510
3543
  expected_outcome: context.evalCase.expected_outcome,
3511
3544
  reference_answer: context.evalCase.reference_answer,
3512
3545
  candidate_answer: context.candidate,
3513
- guideline_paths: context.evalCase.guideline_paths,
3514
- input_files: context.evalCase.file_paths,
3515
- input_segments: context.evalCase.input_segments
3546
+ guideline_files: context.evalCase.guideline_paths,
3547
+ input_files: context.evalCase.file_paths.filter(
3548
+ (path13) => !context.evalCase.guideline_paths.includes(path13)
3549
+ ),
3550
+ input_messages: context.evalCase.input_messages
3516
3551
  },
3517
3552
  null,
3518
3553
  2
@@ -4685,14 +4720,12 @@ async function evaluateCandidate(options) {
4685
4720
  } else {
4686
4721
  if (promptInputs.chatPrompt) {
4687
4722
  lmProviderRequest = {
4688
- chat_prompt: promptInputs.chatPrompt,
4689
- guideline_paths: evalCase.guideline_paths
4723
+ chat_prompt: promptInputs.chatPrompt
4690
4724
  };
4691
4725
  } else {
4692
4726
  lmProviderRequest = {
4693
4727
  question: promptInputs.question,
4694
- guidelines: promptInputs.guidelines,
4695
- guideline_paths: evalCase.guideline_paths
4728
+ guidelines: promptInputs.guidelines
4696
4729
  };
4697
4730
  }
4698
4731
  }
@@ -4799,11 +4832,13 @@ async function runEvaluatorList(options) {
4799
4832
  now,
4800
4833
  judgeProvider
4801
4834
  });
4802
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4835
+ const weight = evaluator.weight ?? 1;
4836
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
4803
4837
  evaluatorResults.push({
4804
4838
  name: evaluator.name,
4805
4839
  type: evaluator.type,
4806
4840
  score: score2.score,
4841
+ weight,
4807
4842
  verdict: score2.verdict,
4808
4843
  hits: score2.hits,
4809
4844
  misses: score2.misses,
@@ -4826,11 +4861,13 @@ async function runEvaluatorList(options) {
4826
4861
  promptInputs,
4827
4862
  now
4828
4863
  });
4829
- scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
4864
+ const weight = evaluator.weight ?? 1;
4865
+ scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
4830
4866
  evaluatorResults.push({
4831
4867
  name: evaluator.name,
4832
4868
  type: "code_judge",
4833
4869
  score: score2.score,
4870
+ weight,
4834
4871
  verdict: score2.verdict,
4835
4872
  hits: score2.hits,
4836
4873
  misses: score2.misses,
@@ -4883,11 +4920,13 @@ async function runEvaluatorList(options) {
4883
4920
  now,
4884
4921
  judgeProvider
4885
4922
  });
4886
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4923
+ const weight = evaluator.weight ?? 1;
4924
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
4887
4925
  evaluatorResults.push({
4888
4926
  name: evaluator.name,
4889
4927
  type: evaluator.type,
4890
4928
  score: score2.score,
4929
+ weight,
4891
4930
  verdict: score2.verdict,
4892
4931
  hits: score2.hits,
4893
4932
  misses: score2.misses,
@@ -4911,11 +4950,13 @@ async function runEvaluatorList(options) {
4911
4950
  candidateTrace,
4912
4951
  candidateTraceSummary
4913
4952
  });
4914
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4953
+ const weight = evaluator.weight ?? 1;
4954
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
4915
4955
  evaluatorResults.push({
4916
4956
  name: evaluator.name,
4917
4957
  type: evaluator.type,
4918
4958
  score: score2.score,
4959
+ weight,
4919
4960
  verdict: score2.verdict,
4920
4961
  hits: score2.hits,
4921
4962
  misses: score2.misses,
@@ -4935,11 +4976,13 @@ async function runEvaluatorList(options) {
4935
4976
  candidateTrace,
4936
4977
  candidateTraceSummary
4937
4978
  });
4938
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4979
+ const weight = evaluator.weight ?? 1;
4980
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
4939
4981
  evaluatorResults.push({
4940
4982
  name: evaluator.name,
4941
4983
  type: evaluator.type,
4942
4984
  score: score2.score,
4985
+ weight,
4943
4986
  verdict: score2.verdict,
4944
4987
  hits: score2.hits,
4945
4988
  misses: score2.misses,
@@ -4957,15 +5000,18 @@ async function runEvaluatorList(options) {
4957
5000
  reasoning: message
4958
5001
  };
4959
5002
  const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
5003
+ const weight = evaluator.weight ?? 1;
4960
5004
  scored.push({
4961
5005
  score: fallbackScore,
4962
5006
  name: evaluator.name ?? "unknown",
4963
- type: resultType ?? "llm_judge"
5007
+ type: resultType ?? "llm_judge",
5008
+ weight
4964
5009
  });
4965
5010
  evaluatorResults.push({
4966
5011
  name: evaluator.name ?? "unknown",
4967
5012
  type: resultType ?? "llm_judge",
4968
5013
  score: 0,
5014
+ weight,
4969
5015
  verdict: "fail",
4970
5016
  hits: [],
4971
5017
  misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
@@ -4973,7 +5019,9 @@ async function runEvaluatorList(options) {
4973
5019
  });
4974
5020
  }
4975
5021
  }
4976
- const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
5022
+ const aggregateScore = scored.length > 0 ? computeWeightedMean(
5023
+ scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
5024
+ ) : 0;
4977
5025
  const hits = scored.flatMap((entry) => entry.score.hits);
4978
5026
  const misses = scored.flatMap((entry) => entry.score.misses);
4979
5027
  const expectedAspectCount = scored.reduce(
@@ -5199,6 +5247,16 @@ function mapChildResults(children) {
5199
5247
  evaluator_results: mapChildResults(child.evaluatorResults)
5200
5248
  }));
5201
5249
  }
5250
+ function computeWeightedMean(entries) {
5251
+ let totalWeight = 0;
5252
+ let weightedSum = 0;
5253
+ for (const entry of entries) {
5254
+ const weight = entry.weight ?? 1;
5255
+ totalWeight += weight;
5256
+ weightedSum += entry.score * weight;
5257
+ }
5258
+ return totalWeight > 0 ? weightedSum / totalWeight : 0;
5259
+ }
5202
5260
 
5203
5261
  // src/evaluation/generators/rubric-generator.ts
5204
5262
  import { generateText as generateText3 } from "ai";