@agentv/core 3.13.0 → 3.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-4XWPXNQM.js → chunk-ZB3AUPES.js} +1 -3
- package/dist/chunk-ZB3AUPES.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +0 -2
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +44 -31
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +12 -21
- package/dist/index.d.ts +12 -21
- package/dist/index.js +45 -30
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-4XWPXNQM.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1581,8 +1581,6 @@ function isTestMessage(value) {
|
|
|
1581
1581
|
var EVALUATOR_KIND_VALUES = [
|
|
1582
1582
|
"code-grader",
|
|
1583
1583
|
"llm-grader",
|
|
1584
|
-
"code-judge",
|
|
1585
|
-
"llm-judge",
|
|
1586
1584
|
"rubric",
|
|
1587
1585
|
"composite",
|
|
1588
1586
|
"tool-trajectory",
|
|
@@ -2449,6 +2447,9 @@ var ANSI_RESET5 = "\x1B[0m";
|
|
|
2449
2447
|
function normalizeEvaluatorType(type) {
|
|
2450
2448
|
return type.replace(/_/g, "-");
|
|
2451
2449
|
}
|
|
2450
|
+
function isDeprecatedJudgeType(type) {
|
|
2451
|
+
return type === "code-judge" || type === "llm-judge";
|
|
2452
|
+
}
|
|
2452
2453
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
2453
2454
|
const execution = rawEvalCase.execution;
|
|
2454
2455
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
@@ -2511,6 +2512,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2511
2512
|
const rawName = asString(rawEvaluator.name);
|
|
2512
2513
|
const rawType = rawEvaluator.type;
|
|
2513
2514
|
const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
|
|
2515
|
+
if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
|
|
2516
|
+
logWarning2(
|
|
2517
|
+
`Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
|
|
2518
|
+
);
|
|
2519
|
+
continue;
|
|
2520
|
+
}
|
|
2514
2521
|
const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
|
|
2515
2522
|
if (typeof typeValue !== "string") {
|
|
2516
2523
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
@@ -2543,7 +2550,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2543
2550
|
});
|
|
2544
2551
|
continue;
|
|
2545
2552
|
}
|
|
2546
|
-
if (typeValue === "code-grader"
|
|
2553
|
+
if (typeValue === "code-grader") {
|
|
2547
2554
|
let command;
|
|
2548
2555
|
if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
|
|
2549
2556
|
console.warn(
|
|
@@ -2653,7 +2660,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2653
2660
|
continue;
|
|
2654
2661
|
}
|
|
2655
2662
|
const aggregatorType = asString(rawAggregator.type);
|
|
2656
|
-
|
|
2663
|
+
const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
|
|
2664
|
+
if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
|
|
2665
|
+
logWarning2(
|
|
2666
|
+
`Skipping composite evaluator '${name}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
|
|
2667
|
+
);
|
|
2668
|
+
continue;
|
|
2669
|
+
}
|
|
2670
|
+
if (normalizedAggregatorType !== "weighted_average" && normalizedAggregatorType !== "code-grader" && normalizedAggregatorType !== "llm-grader" && normalizedAggregatorType !== "threshold") {
|
|
2657
2671
|
logWarning2(
|
|
2658
2672
|
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
2659
2673
|
);
|
|
@@ -2688,7 +2702,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2688
2702
|
continue;
|
|
2689
2703
|
}
|
|
2690
2704
|
let aggregator;
|
|
2691
|
-
if (
|
|
2705
|
+
if (normalizedAggregatorType === "weighted_average") {
|
|
2692
2706
|
const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
|
|
2693
2707
|
const parsedWeights = {};
|
|
2694
2708
|
if (weights) {
|
|
@@ -2702,7 +2716,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2702
2716
|
type: "weighted_average",
|
|
2703
2717
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
2704
2718
|
};
|
|
2705
|
-
} else if (
|
|
2719
|
+
} else if (normalizedAggregatorType === "code-grader") {
|
|
2706
2720
|
const aggregatorPath = asString(rawAggregator.path);
|
|
2707
2721
|
if (!aggregatorPath) {
|
|
2708
2722
|
logWarning2(
|
|
@@ -2715,7 +2729,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2715
2729
|
path: aggregatorPath,
|
|
2716
2730
|
cwd: searchRoots[0]
|
|
2717
2731
|
};
|
|
2718
|
-
} else if (
|
|
2732
|
+
} else if (normalizedAggregatorType === "threshold") {
|
|
2719
2733
|
const thresholdValue = rawAggregator.threshold;
|
|
2720
2734
|
if (typeof thresholdValue !== "number" || thresholdValue < 0 || thresholdValue > 1) {
|
|
2721
2735
|
logWarning2(
|
|
@@ -3463,10 +3477,15 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
3463
3477
|
return void 0;
|
|
3464
3478
|
}
|
|
3465
3479
|
const normalized = normalizeEvaluatorType(candidate);
|
|
3480
|
+
if (isDeprecatedJudgeType(normalized)) {
|
|
3481
|
+
throw new Error(
|
|
3482
|
+
`Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
|
|
3483
|
+
);
|
|
3484
|
+
}
|
|
3466
3485
|
if (isEvaluatorKind(normalized)) {
|
|
3467
3486
|
return normalized;
|
|
3468
3487
|
}
|
|
3469
|
-
logWarning2(`Unknown
|
|
3488
|
+
logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
|
|
3470
3489
|
return void 0;
|
|
3471
3490
|
}
|
|
3472
3491
|
function asString(value) {
|
|
@@ -4899,9 +4918,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
4899
4918
|
case "ends_with":
|
|
4900
4919
|
return `Output ends with '${entry.value}'`;
|
|
4901
4920
|
case "llm-grader":
|
|
4902
|
-
case "llm_grader":
|
|
4903
|
-
case "llm-judge":
|
|
4904
|
-
case "llm_judge": {
|
|
4921
|
+
case "llm_grader": {
|
|
4905
4922
|
if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
|
|
4906
4923
|
return null;
|
|
4907
4924
|
}
|
|
@@ -4914,9 +4931,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
4914
4931
|
return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
|
|
4915
4932
|
}
|
|
4916
4933
|
case "code-grader":
|
|
4917
|
-
case "code_grader":
|
|
4918
|
-
case "code-judge":
|
|
4919
|
-
case "code_judge": {
|
|
4934
|
+
case "code_grader": {
|
|
4920
4935
|
const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
|
|
4921
4936
|
const desc = typeof entry.description === "string" ? entry.description : void 0;
|
|
4922
4937
|
return codeGraderInstruction(graderName, desc);
|
|
@@ -4947,7 +4962,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
4947
4962
|
}
|
|
4948
4963
|
}
|
|
4949
4964
|
function assertionToNaturalLanguageList(entry) {
|
|
4950
|
-
if (entry.type === "llm-grader" || entry.type === "llm_grader"
|
|
4965
|
+
if (entry.type === "llm-grader" || entry.type === "llm_grader") {
|
|
4951
4966
|
if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
|
|
4952
4967
|
return entry.rubrics.map((r) => r.outcome ?? r.criteria ?? r.id).filter((s) => typeof s === "string");
|
|
4953
4968
|
}
|
|
@@ -13168,7 +13183,7 @@ function toCamelCaseDeep(obj) {
|
|
|
13168
13183
|
// src/evaluation/evaluators/code-evaluator.ts
|
|
13169
13184
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
13170
13185
|
var CodeEvaluator = class {
|
|
13171
|
-
kind = "code-
|
|
13186
|
+
kind = "code-grader";
|
|
13172
13187
|
command;
|
|
13173
13188
|
cwd;
|
|
13174
13189
|
agentTimeoutMs;
|
|
@@ -13187,7 +13202,7 @@ var CodeEvaluator = class {
|
|
|
13187
13202
|
if (outputForPayload) {
|
|
13188
13203
|
const serialized = JSON.stringify(outputForPayload);
|
|
13189
13204
|
if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
|
|
13190
|
-
const tmpDir = await (0, import_promises26.mkdtemp)((0, import_node_path36.join)((0, import_node_os5.tmpdir)(), "agentv-
|
|
13205
|
+
const tmpDir = await (0, import_promises26.mkdtemp)((0, import_node_path36.join)((0, import_node_os5.tmpdir)(), "agentv-grader-"));
|
|
13191
13206
|
outputPath = (0, import_node_path36.join)(tmpDir, "output.json");
|
|
13192
13207
|
await (0, import_promises26.writeFile)(outputPath, serialized);
|
|
13193
13208
|
outputForPayload = null;
|
|
@@ -13477,7 +13492,7 @@ var LlmGraderEvaluator = class {
|
|
|
13477
13492
|
return this.evaluateWithDelegatedAgent(context2, graderProvider);
|
|
13478
13493
|
}
|
|
13479
13494
|
const config = context2.evaluator;
|
|
13480
|
-
if (
|
|
13495
|
+
if (config?.type === "llm-grader" && config.rubrics && config.rubrics.length > 0) {
|
|
13481
13496
|
return this.evaluateWithRubrics(context2, graderProvider, config.rubrics);
|
|
13482
13497
|
}
|
|
13483
13498
|
return this.evaluateFreeform(context2, graderProvider);
|
|
@@ -13662,7 +13677,7 @@ ${context2.fileChanges}`;
|
|
|
13662
13677
|
const systemPrompt = this.buildAgentSystemPrompt(context2);
|
|
13663
13678
|
const userPrompt = this.buildAgentUserPrompt(context2);
|
|
13664
13679
|
const config = context2.evaluator;
|
|
13665
|
-
const rubrics = config?.type === "llm-grader"
|
|
13680
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
13666
13681
|
const fsTools = createFilesystemTools(workspacePath);
|
|
13667
13682
|
const evaluatorRawRequest = {
|
|
13668
13683
|
mode: "built-in",
|
|
@@ -13758,7 +13773,7 @@ ${context2.fileChanges}`;
|
|
|
13758
13773
|
};
|
|
13759
13774
|
}
|
|
13760
13775
|
const config = context2.evaluator;
|
|
13761
|
-
const rubrics = config?.type === "llm-grader"
|
|
13776
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
13762
13777
|
const details = {
|
|
13763
13778
|
mode: modeLabel,
|
|
13764
13779
|
grader_target: provider.targetName
|
|
@@ -13798,7 +13813,7 @@ ${context2.fileChanges}`;
|
|
|
13798
13813
|
*/
|
|
13799
13814
|
buildAgentSystemPrompt(context2) {
|
|
13800
13815
|
const config = context2.evaluator;
|
|
13801
|
-
const rubrics = config?.type === "llm-grader"
|
|
13816
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
13802
13817
|
const parts = [
|
|
13803
13818
|
"You are an expert evaluator with access to the workspace filesystem.",
|
|
13804
13819
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
@@ -13829,7 +13844,7 @@ ${context2.fileChanges}`;
|
|
|
13829
13844
|
return substituteVariables(this.evaluatorTemplate, variables);
|
|
13830
13845
|
}
|
|
13831
13846
|
const config = context2.evaluator;
|
|
13832
|
-
const rubrics = config?.type === "llm-grader"
|
|
13847
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
13833
13848
|
const parts = [
|
|
13834
13849
|
"Evaluate the candidate answer by investigating the workspace.",
|
|
13835
13850
|
"",
|
|
@@ -13872,7 +13887,7 @@ ${context2.fileChanges}`;
|
|
|
13872
13887
|
buildDelegatedPrompt(context2) {
|
|
13873
13888
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
13874
13889
|
const config = context2.evaluator;
|
|
13875
|
-
const rubrics = config?.type === "llm-grader"
|
|
13890
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
13876
13891
|
if (this.evaluatorTemplate) {
|
|
13877
13892
|
const variables = {
|
|
13878
13893
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
@@ -14369,10 +14384,8 @@ var CompositeEvaluator = class {
|
|
|
14369
14384
|
const aggregator = this.config.aggregator;
|
|
14370
14385
|
switch (aggregator.type) {
|
|
14371
14386
|
case "code-grader":
|
|
14372
|
-
case "code-judge":
|
|
14373
14387
|
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
14374
14388
|
case "llm-grader":
|
|
14375
|
-
case "llm-judge":
|
|
14376
14389
|
return this.runLlmAggregator(results, context2, aggregator);
|
|
14377
14390
|
case "threshold":
|
|
14378
14391
|
return this.runThreshold(results, aggregator.threshold);
|
|
@@ -16794,7 +16807,7 @@ var endsWithFactory = (config) => {
|
|
|
16794
16807
|
};
|
|
16795
16808
|
function createBuiltinRegistry() {
|
|
16796
16809
|
const registry = new EvaluatorRegistry();
|
|
16797
|
-
registry.register("llm-grader", llmGraderFactory).register("
|
|
16810
|
+
registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
|
|
16798
16811
|
const fn = config[INLINE_ASSERT_FN];
|
|
16799
16812
|
if (!fn) {
|
|
16800
16813
|
throw new Error(
|
|
@@ -19512,7 +19525,7 @@ function filterEvalCases(evalCases, filter) {
|
|
|
19512
19525
|
return evalCases.filter((evalCase) => import_micromatch3.default.isMatch(evalCase.id, filter));
|
|
19513
19526
|
}
|
|
19514
19527
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
19515
|
-
const llmGrader = overrides?.["llm-grader"] ??
|
|
19528
|
+
const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
|
|
19516
19529
|
resolveGraderProvider: async (context2) => {
|
|
19517
19530
|
if (context2.graderProvider) {
|
|
19518
19531
|
return context2.graderProvider;
|
|
@@ -20356,10 +20369,10 @@ var OtelTraceExporter = class {
|
|
|
20356
20369
|
}
|
|
20357
20370
|
if (result.scores) {
|
|
20358
20371
|
for (const score of result.scores) {
|
|
20359
|
-
rootSpan.addEvent(`agentv.
|
|
20360
|
-
"agentv.
|
|
20361
|
-
"agentv.
|
|
20362
|
-
...score.verdict ? { "agentv.
|
|
20372
|
+
rootSpan.addEvent(`agentv.grader.${score.name}`, {
|
|
20373
|
+
"agentv.grader.score": score.score,
|
|
20374
|
+
"agentv.grader.type": score.type,
|
|
20375
|
+
...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
|
|
20363
20376
|
});
|
|
20364
20377
|
}
|
|
20365
20378
|
}
|