@agentv/core 0.25.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-OYTL3LNN.js → chunk-NDEN3H2B.js} +5 -2
- package/dist/chunk-NDEN3H2B.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +82 -21
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +79 -21
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-OYTL3LNN.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -56,6 +56,8 @@ interface ToolTrajectoryEvaluatorConfig {
|
|
|
56
56
|
readonly minimums?: Readonly<Record<string, number>>;
|
|
57
57
|
/** Expected tool sequence (for in_order/exact modes) */
|
|
58
58
|
readonly expected?: readonly ToolTrajectoryExpectedItem[];
|
|
59
|
+
/** Optional weight for top-level aggregation (defaults to 1.0) */
|
|
60
|
+
readonly weight?: number;
|
|
59
61
|
}
|
|
60
62
|
/**
|
|
61
63
|
* Expected tool call item in a trajectory sequence.
|
|
@@ -186,6 +188,7 @@ type CodeEvaluatorConfig = {
|
|
|
186
188
|
readonly resolvedScriptPath?: string;
|
|
187
189
|
readonly cwd?: string;
|
|
188
190
|
readonly resolvedCwd?: string;
|
|
191
|
+
readonly weight?: number;
|
|
189
192
|
};
|
|
190
193
|
type LlmJudgeEvaluatorConfig = {
|
|
191
194
|
readonly name: string;
|
|
@@ -193,6 +196,7 @@ type LlmJudgeEvaluatorConfig = {
|
|
|
193
196
|
readonly prompt?: string;
|
|
194
197
|
readonly promptPath?: string;
|
|
195
198
|
readonly rubrics?: readonly RubricItem[];
|
|
199
|
+
readonly weight?: number;
|
|
196
200
|
};
|
|
197
201
|
type RubricItem = {
|
|
198
202
|
readonly id: string;
|
|
@@ -218,10 +222,12 @@ type CompositeEvaluatorConfig = {
|
|
|
218
222
|
readonly type: 'composite';
|
|
219
223
|
readonly evaluators: readonly EvaluatorConfig[];
|
|
220
224
|
readonly aggregator: CompositeAggregatorConfig;
|
|
225
|
+
readonly weight?: number;
|
|
221
226
|
};
|
|
222
227
|
type ExpectedMessagesEvaluatorConfig = {
|
|
223
228
|
readonly name: string;
|
|
224
229
|
readonly type: 'expected_messages';
|
|
230
|
+
readonly weight?: number;
|
|
225
231
|
};
|
|
226
232
|
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedMessagesEvaluatorConfig;
|
|
227
233
|
/**
|
package/dist/index.d.ts
CHANGED
|
@@ -56,6 +56,8 @@ interface ToolTrajectoryEvaluatorConfig {
|
|
|
56
56
|
readonly minimums?: Readonly<Record<string, number>>;
|
|
57
57
|
/** Expected tool sequence (for in_order/exact modes) */
|
|
58
58
|
readonly expected?: readonly ToolTrajectoryExpectedItem[];
|
|
59
|
+
/** Optional weight for top-level aggregation (defaults to 1.0) */
|
|
60
|
+
readonly weight?: number;
|
|
59
61
|
}
|
|
60
62
|
/**
|
|
61
63
|
* Expected tool call item in a trajectory sequence.
|
|
@@ -186,6 +188,7 @@ type CodeEvaluatorConfig = {
|
|
|
186
188
|
readonly resolvedScriptPath?: string;
|
|
187
189
|
readonly cwd?: string;
|
|
188
190
|
readonly resolvedCwd?: string;
|
|
191
|
+
readonly weight?: number;
|
|
189
192
|
};
|
|
190
193
|
type LlmJudgeEvaluatorConfig = {
|
|
191
194
|
readonly name: string;
|
|
@@ -193,6 +196,7 @@ type LlmJudgeEvaluatorConfig = {
|
|
|
193
196
|
readonly prompt?: string;
|
|
194
197
|
readonly promptPath?: string;
|
|
195
198
|
readonly rubrics?: readonly RubricItem[];
|
|
199
|
+
readonly weight?: number;
|
|
196
200
|
};
|
|
197
201
|
type RubricItem = {
|
|
198
202
|
readonly id: string;
|
|
@@ -218,10 +222,12 @@ type CompositeEvaluatorConfig = {
|
|
|
218
222
|
readonly type: 'composite';
|
|
219
223
|
readonly evaluators: readonly EvaluatorConfig[];
|
|
220
224
|
readonly aggregator: CompositeAggregatorConfig;
|
|
225
|
+
readonly weight?: number;
|
|
221
226
|
};
|
|
222
227
|
type ExpectedMessagesEvaluatorConfig = {
|
|
223
228
|
readonly name: string;
|
|
224
229
|
readonly type: 'expected_messages';
|
|
230
|
+
readonly weight?: number;
|
|
225
231
|
};
|
|
226
232
|
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | ExpectedMessagesEvaluatorConfig;
|
|
227
233
|
/**
|
package/dist/index.js
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
readTextFile,
|
|
10
10
|
resolveFileReference,
|
|
11
11
|
resolveTargetDefinition
|
|
12
|
-
} from "./chunk-OYTL3LNN.js";
|
|
12
|
+
} from "./chunk-NDEN3H2B.js";
|
|
13
13
|
|
|
14
14
|
// src/evaluation/types.ts
|
|
15
15
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -455,6 +455,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
455
455
|
logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
|
|
456
456
|
continue;
|
|
457
457
|
}
|
|
458
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
458
459
|
const cwd = asString2(rawEvaluator.cwd);
|
|
459
460
|
let resolvedCwd;
|
|
460
461
|
if (cwd) {
|
|
@@ -475,7 +476,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
475
476
|
type: "code",
|
|
476
477
|
script,
|
|
477
478
|
cwd,
|
|
478
|
-
resolvedCwd
|
|
479
|
+
resolvedCwd,
|
|
480
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
479
481
|
});
|
|
480
482
|
continue;
|
|
481
483
|
}
|
|
@@ -570,18 +572,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
570
572
|
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
571
573
|
};
|
|
572
574
|
}
|
|
575
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
573
576
|
evaluators.push({
|
|
574
577
|
name,
|
|
575
578
|
type: "composite",
|
|
576
579
|
evaluators: memberEvaluators,
|
|
577
|
-
aggregator
|
|
580
|
+
aggregator,
|
|
581
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
578
582
|
});
|
|
579
583
|
continue;
|
|
580
584
|
}
|
|
581
585
|
if (typeValue === "expected_messages") {
|
|
586
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
582
587
|
evaluators.push({
|
|
583
588
|
name,
|
|
584
|
-
type: "expected_messages"
|
|
589
|
+
type: "expected_messages",
|
|
590
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
585
591
|
});
|
|
586
592
|
continue;
|
|
587
593
|
}
|
|
@@ -637,12 +643,14 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
637
643
|
);
|
|
638
644
|
continue;
|
|
639
645
|
}
|
|
646
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
640
647
|
const config = {
|
|
641
648
|
name,
|
|
642
649
|
type: "tool_trajectory",
|
|
643
650
|
mode,
|
|
644
651
|
...minimums ? { minimums } : {},
|
|
645
|
-
...expected ? { expected } : {}
|
|
652
|
+
...expected ? { expected } : {},
|
|
653
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
646
654
|
};
|
|
647
655
|
evaluators.push(config);
|
|
648
656
|
continue;
|
|
@@ -683,19 +691,23 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
683
691
|
logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
|
|
684
692
|
continue;
|
|
685
693
|
}
|
|
694
|
+
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
686
695
|
evaluators.push({
|
|
687
696
|
name,
|
|
688
697
|
type: "llm_judge",
|
|
689
|
-
rubrics: parsedRubrics
|
|
698
|
+
rubrics: parsedRubrics,
|
|
699
|
+
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
690
700
|
});
|
|
691
701
|
continue;
|
|
692
702
|
}
|
|
703
|
+
const weight = validateWeight(rawEvaluator.weight, name, evalId);
|
|
693
704
|
evaluators.push({
|
|
694
705
|
name,
|
|
695
706
|
type: "llm_judge",
|
|
696
707
|
prompt,
|
|
697
708
|
promptPath,
|
|
698
|
-
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
|
|
709
|
+
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
|
|
710
|
+
...weight !== void 0 ? { weight } : {}
|
|
699
711
|
});
|
|
700
712
|
}
|
|
701
713
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -725,6 +737,27 @@ ${detailBlock}${ANSI_RESET3}`);
|
|
|
725
737
|
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
|
|
726
738
|
}
|
|
727
739
|
}
|
|
740
|
+
function validateWeight(rawWeight, evaluatorName, evalId) {
|
|
741
|
+
if (rawWeight === void 0) {
|
|
742
|
+
return void 0;
|
|
743
|
+
}
|
|
744
|
+
if (typeof rawWeight !== "number") {
|
|
745
|
+
throw new Error(
|
|
746
|
+
`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be a number`
|
|
747
|
+
);
|
|
748
|
+
}
|
|
749
|
+
if (!Number.isFinite(rawWeight)) {
|
|
750
|
+
throw new Error(
|
|
751
|
+
`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be finite (got ${rawWeight})`
|
|
752
|
+
);
|
|
753
|
+
}
|
|
754
|
+
if (rawWeight < 0) {
|
|
755
|
+
throw new Error(
|
|
756
|
+
`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be non-negative (got ${rawWeight})`
|
|
757
|
+
);
|
|
758
|
+
}
|
|
759
|
+
return rawWeight;
|
|
760
|
+
}
|
|
728
761
|
|
|
729
762
|
// src/evaluation/loaders/message-processor.ts
|
|
730
763
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
@@ -3510,9 +3543,11 @@ var CodeEvaluator = class {
|
|
|
3510
3543
|
expected_outcome: context.evalCase.expected_outcome,
|
|
3511
3544
|
reference_answer: context.evalCase.reference_answer,
|
|
3512
3545
|
candidate_answer: context.candidate,
|
|
3513
|
-
|
|
3514
|
-
input_files: context.evalCase.file_paths
|
|
3515
|
-
|
|
3546
|
+
guideline_files: context.evalCase.guideline_paths,
|
|
3547
|
+
input_files: context.evalCase.file_paths.filter(
|
|
3548
|
+
(path13) => !context.evalCase.guideline_paths.includes(path13)
|
|
3549
|
+
),
|
|
3550
|
+
input_messages: context.evalCase.input_messages
|
|
3516
3551
|
},
|
|
3517
3552
|
null,
|
|
3518
3553
|
2
|
|
@@ -4685,14 +4720,12 @@ async function evaluateCandidate(options) {
|
|
|
4685
4720
|
} else {
|
|
4686
4721
|
if (promptInputs.chatPrompt) {
|
|
4687
4722
|
lmProviderRequest = {
|
|
4688
|
-
chat_prompt: promptInputs.chatPrompt
|
|
4689
|
-
guideline_paths: evalCase.guideline_paths
|
|
4723
|
+
chat_prompt: promptInputs.chatPrompt
|
|
4690
4724
|
};
|
|
4691
4725
|
} else {
|
|
4692
4726
|
lmProviderRequest = {
|
|
4693
4727
|
question: promptInputs.question,
|
|
4694
|
-
guidelines: promptInputs.guidelines
|
|
4695
|
-
guideline_paths: evalCase.guideline_paths
|
|
4728
|
+
guidelines: promptInputs.guidelines
|
|
4696
4729
|
};
|
|
4697
4730
|
}
|
|
4698
4731
|
}
|
|
@@ -4799,11 +4832,13 @@ async function runEvaluatorList(options) {
|
|
|
4799
4832
|
now,
|
|
4800
4833
|
judgeProvider
|
|
4801
4834
|
});
|
|
4802
|
-
|
|
4835
|
+
const weight = evaluator.weight ?? 1;
|
|
4836
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
4803
4837
|
evaluatorResults.push({
|
|
4804
4838
|
name: evaluator.name,
|
|
4805
4839
|
type: evaluator.type,
|
|
4806
4840
|
score: score2.score,
|
|
4841
|
+
weight,
|
|
4807
4842
|
verdict: score2.verdict,
|
|
4808
4843
|
hits: score2.hits,
|
|
4809
4844
|
misses: score2.misses,
|
|
@@ -4826,11 +4861,13 @@ async function runEvaluatorList(options) {
|
|
|
4826
4861
|
promptInputs,
|
|
4827
4862
|
now
|
|
4828
4863
|
});
|
|
4829
|
-
|
|
4864
|
+
const weight = evaluator.weight ?? 1;
|
|
4865
|
+
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
4830
4866
|
evaluatorResults.push({
|
|
4831
4867
|
name: evaluator.name,
|
|
4832
4868
|
type: "code_judge",
|
|
4833
4869
|
score: score2.score,
|
|
4870
|
+
weight,
|
|
4834
4871
|
verdict: score2.verdict,
|
|
4835
4872
|
hits: score2.hits,
|
|
4836
4873
|
misses: score2.misses,
|
|
@@ -4883,11 +4920,13 @@ async function runEvaluatorList(options) {
|
|
|
4883
4920
|
now,
|
|
4884
4921
|
judgeProvider
|
|
4885
4922
|
});
|
|
4886
|
-
|
|
4923
|
+
const weight = evaluator.weight ?? 1;
|
|
4924
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
4887
4925
|
evaluatorResults.push({
|
|
4888
4926
|
name: evaluator.name,
|
|
4889
4927
|
type: evaluator.type,
|
|
4890
4928
|
score: score2.score,
|
|
4929
|
+
weight,
|
|
4891
4930
|
verdict: score2.verdict,
|
|
4892
4931
|
hits: score2.hits,
|
|
4893
4932
|
misses: score2.misses,
|
|
@@ -4911,11 +4950,13 @@ async function runEvaluatorList(options) {
|
|
|
4911
4950
|
candidateTrace,
|
|
4912
4951
|
candidateTraceSummary
|
|
4913
4952
|
});
|
|
4914
|
-
|
|
4953
|
+
const weight = evaluator.weight ?? 1;
|
|
4954
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
4915
4955
|
evaluatorResults.push({
|
|
4916
4956
|
name: evaluator.name,
|
|
4917
4957
|
type: evaluator.type,
|
|
4918
4958
|
score: score2.score,
|
|
4959
|
+
weight,
|
|
4919
4960
|
verdict: score2.verdict,
|
|
4920
4961
|
hits: score2.hits,
|
|
4921
4962
|
misses: score2.misses,
|
|
@@ -4935,11 +4976,13 @@ async function runEvaluatorList(options) {
|
|
|
4935
4976
|
candidateTrace,
|
|
4936
4977
|
candidateTraceSummary
|
|
4937
4978
|
});
|
|
4938
|
-
|
|
4979
|
+
const weight = evaluator.weight ?? 1;
|
|
4980
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
4939
4981
|
evaluatorResults.push({
|
|
4940
4982
|
name: evaluator.name,
|
|
4941
4983
|
type: evaluator.type,
|
|
4942
4984
|
score: score2.score,
|
|
4985
|
+
weight,
|
|
4943
4986
|
verdict: score2.verdict,
|
|
4944
4987
|
hits: score2.hits,
|
|
4945
4988
|
misses: score2.misses,
|
|
@@ -4957,15 +5000,18 @@ async function runEvaluatorList(options) {
|
|
|
4957
5000
|
reasoning: message
|
|
4958
5001
|
};
|
|
4959
5002
|
const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
|
|
5003
|
+
const weight = evaluator.weight ?? 1;
|
|
4960
5004
|
scored.push({
|
|
4961
5005
|
score: fallbackScore,
|
|
4962
5006
|
name: evaluator.name ?? "unknown",
|
|
4963
|
-
type: resultType ?? "llm_judge"
|
|
5007
|
+
type: resultType ?? "llm_judge",
|
|
5008
|
+
weight
|
|
4964
5009
|
});
|
|
4965
5010
|
evaluatorResults.push({
|
|
4966
5011
|
name: evaluator.name ?? "unknown",
|
|
4967
5012
|
type: resultType ?? "llm_judge",
|
|
4968
5013
|
score: 0,
|
|
5014
|
+
weight,
|
|
4969
5015
|
verdict: "fail",
|
|
4970
5016
|
hits: [],
|
|
4971
5017
|
misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
|
|
@@ -4973,7 +5019,9 @@ async function runEvaluatorList(options) {
|
|
|
4973
5019
|
});
|
|
4974
5020
|
}
|
|
4975
5021
|
}
|
|
4976
|
-
const aggregateScore = scored.length > 0 ?
|
|
5022
|
+
const aggregateScore = scored.length > 0 ? computeWeightedMean(
|
|
5023
|
+
scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
5024
|
+
) : 0;
|
|
4977
5025
|
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
4978
5026
|
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
4979
5027
|
const expectedAspectCount = scored.reduce(
|
|
@@ -5199,6 +5247,16 @@ function mapChildResults(children) {
|
|
|
5199
5247
|
evaluator_results: mapChildResults(child.evaluatorResults)
|
|
5200
5248
|
}));
|
|
5201
5249
|
}
|
|
5250
|
+
function computeWeightedMean(entries) {
|
|
5251
|
+
let totalWeight = 0;
|
|
5252
|
+
let weightedSum = 0;
|
|
5253
|
+
for (const entry of entries) {
|
|
5254
|
+
const weight = entry.weight ?? 1;
|
|
5255
|
+
totalWeight += weight;
|
|
5256
|
+
weightedSum += entry.score * weight;
|
|
5257
|
+
}
|
|
5258
|
+
return totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
5259
|
+
}
|
|
5202
5260
|
|
|
5203
5261
|
// src/evaluation/generators/rubric-generator.ts
|
|
5204
5262
|
import { generateText as generateText3 } from "ai";
|