agentv 2.13.0 → 2.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -14
- package/dist/{chunk-UWDI4UVN.js → chunk-5646K2XJ.js} +15 -14
- package/dist/{chunk-UWDI4UVN.js.map → chunk-5646K2XJ.js.map} +1 -1
- package/dist/{chunk-FSBZM3HT.js → chunk-OQN2GDEU.js} +188 -162
- package/dist/chunk-OQN2GDEU.js.map +1 -0
- package/dist/{chunk-M6JYP6A6.js → chunk-YVWP4Z3W.js} +26 -26
- package/dist/chunk-YVWP4Z3W.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-CCUHG3SN.js → dist-QR5OZ4DH.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-P3D5O673.js → interactive-Z6ZV5OGM.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-FSBZM3HT.js.map +0 -1
- package/dist/chunk-M6JYP6A6.js.map +0 -1
- /package/dist/{dist-CCUHG3SN.js.map → dist-QR5OZ4DH.js.map} +0 -0
- /package/dist/{interactive-P3D5O673.js.map → interactive-Z6ZV5OGM.js.map} +0 -0
|
@@ -148,7 +148,7 @@ var require_dist = __commonJS({
|
|
|
148
148
|
}
|
|
149
149
|
});
|
|
150
150
|
|
|
151
|
-
// ../../packages/core/dist/chunk-
|
|
151
|
+
// ../../packages/core/dist/chunk-N55K52OO.js
|
|
152
152
|
import { constants } from "node:fs";
|
|
153
153
|
import { access, readFile } from "node:fs/promises";
|
|
154
154
|
import path from "node:path";
|
|
@@ -4195,7 +4195,7 @@ var coerce = {
|
|
|
4195
4195
|
};
|
|
4196
4196
|
var NEVER = INVALID;
|
|
4197
4197
|
|
|
4198
|
-
// ../../packages/core/dist/chunk-
|
|
4198
|
+
// ../../packages/core/dist/chunk-N55K52OO.js
|
|
4199
4199
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
4200
4200
|
var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
|
|
4201
4201
|
var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
|
|
@@ -4243,27 +4243,27 @@ function isTestMessage(value) {
|
|
|
4243
4243
|
return false;
|
|
4244
4244
|
}
|
|
4245
4245
|
var EVALUATOR_KIND_VALUES = [
|
|
4246
|
-
"
|
|
4247
|
-
"
|
|
4246
|
+
"code-judge",
|
|
4247
|
+
"llm-judge",
|
|
4248
4248
|
"rubric",
|
|
4249
4249
|
"composite",
|
|
4250
|
-
"
|
|
4251
|
-
"
|
|
4250
|
+
"tool-trajectory",
|
|
4251
|
+
"field-accuracy",
|
|
4252
4252
|
"latency",
|
|
4253
4253
|
"cost",
|
|
4254
|
-
"
|
|
4255
|
-
"
|
|
4256
|
-
"
|
|
4254
|
+
"token-usage",
|
|
4255
|
+
"execution-metrics",
|
|
4256
|
+
"agent-judge",
|
|
4257
4257
|
"contains",
|
|
4258
|
-
"
|
|
4259
|
-
"
|
|
4258
|
+
"contains-any",
|
|
4259
|
+
"contains-all",
|
|
4260
4260
|
"icontains",
|
|
4261
|
-
"
|
|
4262
|
-
"
|
|
4263
|
-
"
|
|
4264
|
-
"
|
|
4261
|
+
"icontains-any",
|
|
4262
|
+
"icontains-all",
|
|
4263
|
+
"starts-with",
|
|
4264
|
+
"ends-with",
|
|
4265
4265
|
"regex",
|
|
4266
|
-
"
|
|
4266
|
+
"is-json",
|
|
4267
4267
|
"equals",
|
|
4268
4268
|
"rubrics"
|
|
4269
4269
|
];
|
|
@@ -33960,7 +33960,7 @@ import { createServer } from "node:http";
|
|
|
33960
33960
|
import fs2 from "node:fs/promises";
|
|
33961
33961
|
import path30 from "node:path";
|
|
33962
33962
|
import { createHash as createHash2, randomUUID as randomUUID7 } from "node:crypto";
|
|
33963
|
-
import { mkdir as mkdir12 } from "node:fs/promises";
|
|
33963
|
+
import { mkdir as mkdir12, stat as stat7 } from "node:fs/promises";
|
|
33964
33964
|
import path37 from "node:path";
|
|
33965
33965
|
import micromatch4 from "micromatch";
|
|
33966
33966
|
import { readFileSync } from "node:fs";
|
|
@@ -34605,6 +34605,9 @@ function validateTemplateVariables(content, source) {
|
|
|
34605
34605
|
}
|
|
34606
34606
|
var ANSI_YELLOW4 = "\x1B[33m";
|
|
34607
34607
|
var ANSI_RESET4 = "\x1B[0m";
|
|
34608
|
+
function normalizeEvaluatorType(type) {
|
|
34609
|
+
return type.replace(/_/g, "-");
|
|
34610
|
+
}
|
|
34608
34611
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
34609
34612
|
const execution = rawEvalCase.execution;
|
|
34610
34613
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
@@ -34635,7 +34638,8 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34635
34638
|
continue;
|
|
34636
34639
|
}
|
|
34637
34640
|
const rawName = asString(rawEvaluator.name);
|
|
34638
|
-
const
|
|
34641
|
+
const rawType = rawEvaluator.type;
|
|
34642
|
+
const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
|
|
34639
34643
|
const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
|
|
34640
34644
|
if (typeof typeValue !== "string") {
|
|
34641
34645
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
@@ -34668,25 +34672,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34668
34672
|
});
|
|
34669
34673
|
continue;
|
|
34670
34674
|
}
|
|
34671
|
-
if (typeValue === "
|
|
34675
|
+
if (typeValue === "code-judge") {
|
|
34672
34676
|
let command;
|
|
34673
34677
|
const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
|
|
34674
34678
|
if (typeof rawCommand === "string") {
|
|
34675
34679
|
const trimmed = rawCommand.trim();
|
|
34676
34680
|
if (trimmed.length === 0) {
|
|
34677
34681
|
throw new Error(
|
|
34678
|
-
`Invalid
|
|
34682
|
+
`Invalid code-judge command for evaluator '${name16}' in '${evalId}': command cannot be empty`
|
|
34679
34683
|
);
|
|
34680
34684
|
}
|
|
34681
34685
|
command = parseCommandToArgv(trimmed);
|
|
34682
34686
|
} else {
|
|
34683
34687
|
command = asStringArray(
|
|
34684
34688
|
rawCommand,
|
|
34685
|
-
`
|
|
34689
|
+
`code-judge command for evaluator '${name16}' in '${evalId}'`
|
|
34686
34690
|
);
|
|
34687
34691
|
}
|
|
34688
34692
|
if (!command) {
|
|
34689
|
-
logWarning2(`Skipping
|
|
34693
|
+
logWarning2(`Skipping code-judge evaluator '${name16}' in '${evalId}': missing command`);
|
|
34690
34694
|
continue;
|
|
34691
34695
|
}
|
|
34692
34696
|
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
@@ -34747,7 +34751,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34747
34751
|
}
|
|
34748
34752
|
evaluators.push({
|
|
34749
34753
|
name: name16,
|
|
34750
|
-
type: "code",
|
|
34754
|
+
type: "code-judge",
|
|
34751
34755
|
command,
|
|
34752
34756
|
cwd,
|
|
34753
34757
|
resolvedCwd,
|
|
@@ -34773,7 +34777,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34773
34777
|
continue;
|
|
34774
34778
|
}
|
|
34775
34779
|
const aggregatorType = asString(rawAggregator.type);
|
|
34776
|
-
if (aggregatorType !== "weighted_average" && aggregatorType !== "
|
|
34780
|
+
if (aggregatorType !== "weighted_average" && aggregatorType !== "code-judge" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
|
|
34777
34781
|
logWarning2(
|
|
34778
34782
|
`Skipping composite evaluator '${name16}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
34779
34783
|
);
|
|
@@ -34822,16 +34826,16 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34822
34826
|
type: "weighted_average",
|
|
34823
34827
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
34824
34828
|
};
|
|
34825
|
-
} else if (aggregatorType === "
|
|
34829
|
+
} else if (aggregatorType === "code-judge") {
|
|
34826
34830
|
const aggregatorPath = asString(rawAggregator.path);
|
|
34827
34831
|
if (!aggregatorPath) {
|
|
34828
34832
|
logWarning2(
|
|
34829
|
-
`Skipping composite evaluator '${name16}' in '${evalId}':
|
|
34833
|
+
`Skipping composite evaluator '${name16}' in '${evalId}': code-judge aggregator missing path`
|
|
34830
34834
|
);
|
|
34831
34835
|
continue;
|
|
34832
34836
|
}
|
|
34833
34837
|
aggregator = {
|
|
34834
|
-
type: "
|
|
34838
|
+
type: "code-judge",
|
|
34835
34839
|
path: aggregatorPath,
|
|
34836
34840
|
cwd: searchRoots[0]
|
|
34837
34841
|
};
|
|
@@ -34857,7 +34861,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34857
34861
|
}
|
|
34858
34862
|
}
|
|
34859
34863
|
aggregator = {
|
|
34860
|
-
type: "
|
|
34864
|
+
type: "llm-judge",
|
|
34861
34865
|
...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
|
|
34862
34866
|
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
34863
34867
|
};
|
|
@@ -34875,11 +34879,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34875
34879
|
});
|
|
34876
34880
|
continue;
|
|
34877
34881
|
}
|
|
34878
|
-
if (typeValue === "
|
|
34882
|
+
if (typeValue === "tool-trajectory") {
|
|
34879
34883
|
const mode = asString(rawEvaluator.mode);
|
|
34880
34884
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact" && mode !== "subset" && mode !== "superset") {
|
|
34881
34885
|
logWarning2(
|
|
34882
|
-
`Skipping
|
|
34886
|
+
`Skipping tool-trajectory evaluator '${name16}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
|
|
34883
34887
|
);
|
|
34884
34888
|
continue;
|
|
34885
34889
|
}
|
|
@@ -34888,7 +34892,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34888
34892
|
if (rawMinimums !== void 0) {
|
|
34889
34893
|
if (!isJsonObject2(rawMinimums)) {
|
|
34890
34894
|
logWarning2(
|
|
34891
|
-
`Skipping
|
|
34895
|
+
`Skipping tool-trajectory evaluator '${name16}' in '${evalId}': minimums must be an object`
|
|
34892
34896
|
);
|
|
34893
34897
|
continue;
|
|
34894
34898
|
}
|
|
@@ -34914,7 +34918,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34914
34918
|
argsMatch2 = rawArgsMatch;
|
|
34915
34919
|
} else {
|
|
34916
34920
|
logWarning2(
|
|
34917
|
-
`Invalid args_match '${rawArgsMatch}' for
|
|
34921
|
+
`Invalid args_match '${rawArgsMatch}' for tool-trajectory evaluator '${name16}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
|
|
34918
34922
|
);
|
|
34919
34923
|
}
|
|
34920
34924
|
}
|
|
@@ -34924,7 +34928,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34924
34928
|
if (rawExpected !== void 0) {
|
|
34925
34929
|
if (!Array.isArray(rawExpected)) {
|
|
34926
34930
|
logWarning2(
|
|
34927
|
-
`Skipping
|
|
34931
|
+
`Skipping tool-trajectory evaluator '${name16}' in '${evalId}': expected must be an array`
|
|
34928
34932
|
);
|
|
34929
34933
|
continue;
|
|
34930
34934
|
}
|
|
@@ -34970,13 +34974,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34970
34974
|
}
|
|
34971
34975
|
if (mode === "any_order" && !minimums) {
|
|
34972
34976
|
logWarning2(
|
|
34973
|
-
`Skipping
|
|
34977
|
+
`Skipping tool-trajectory evaluator '${name16}' in '${evalId}': any_order mode requires minimums`
|
|
34974
34978
|
);
|
|
34975
34979
|
continue;
|
|
34976
34980
|
}
|
|
34977
34981
|
if ((mode === "in_order" || mode === "exact" || mode === "subset" || mode === "superset") && !expected) {
|
|
34978
34982
|
logWarning2(
|
|
34979
|
-
`Skipping
|
|
34983
|
+
`Skipping tool-trajectory evaluator '${name16}' in '${evalId}': ${mode} mode requires expected`
|
|
34980
34984
|
);
|
|
34981
34985
|
continue;
|
|
34982
34986
|
}
|
|
@@ -34984,7 +34988,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34984
34988
|
const required22 = parseRequired(rawEvaluator.required);
|
|
34985
34989
|
const config22 = {
|
|
34986
34990
|
name: name16,
|
|
34987
|
-
type: "
|
|
34991
|
+
type: "tool-trajectory",
|
|
34988
34992
|
mode,
|
|
34989
34993
|
...minimums ? { minimums } : {},
|
|
34990
34994
|
...expected ? { expected } : {},
|
|
@@ -34996,17 +35000,17 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34996
35000
|
evaluators.push(config22);
|
|
34997
35001
|
continue;
|
|
34998
35002
|
}
|
|
34999
|
-
if (typeValue === "
|
|
35003
|
+
if (typeValue === "field-accuracy") {
|
|
35000
35004
|
const rawFields = rawEvaluator.fields;
|
|
35001
35005
|
if (!Array.isArray(rawFields)) {
|
|
35002
35006
|
logWarning2(
|
|
35003
|
-
`Skipping
|
|
35007
|
+
`Skipping field-accuracy evaluator '${name16}' in '${evalId}': missing fields array`
|
|
35004
35008
|
);
|
|
35005
35009
|
continue;
|
|
35006
35010
|
}
|
|
35007
35011
|
if (rawFields.length === 0) {
|
|
35008
35012
|
logWarning2(
|
|
35009
|
-
`Skipping
|
|
35013
|
+
`Skipping field-accuracy evaluator '${name16}' in '${evalId}': fields array is empty`
|
|
35010
35014
|
);
|
|
35011
35015
|
continue;
|
|
35012
35016
|
}
|
|
@@ -35014,7 +35018,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35014
35018
|
for (const rawField of rawFields) {
|
|
35015
35019
|
if (!isJsonObject2(rawField)) {
|
|
35016
35020
|
logWarning2(
|
|
35017
|
-
`Skipping invalid field entry in
|
|
35021
|
+
`Skipping invalid field entry in field-accuracy evaluator '${name16}' (expected object)`
|
|
35018
35022
|
);
|
|
35019
35023
|
continue;
|
|
35020
35024
|
}
|
|
@@ -35022,13 +35026,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35022
35026
|
const match = asString(rawField.match);
|
|
35023
35027
|
if (!fieldPath) {
|
|
35024
35028
|
logWarning2(
|
|
35025
|
-
`Skipping field without path in
|
|
35029
|
+
`Skipping field without path in field-accuracy evaluator '${name16}' in '${evalId}'`
|
|
35026
35030
|
);
|
|
35027
35031
|
continue;
|
|
35028
35032
|
}
|
|
35029
35033
|
if (!match || !isValidFieldMatchType(match)) {
|
|
35030
35034
|
logWarning2(
|
|
35031
|
-
`Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name16}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a
|
|
35035
|
+
`Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name16}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code-judge evaluator.`
|
|
35032
35036
|
);
|
|
35033
35037
|
continue;
|
|
35034
35038
|
}
|
|
@@ -35045,7 +35049,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35045
35049
|
}
|
|
35046
35050
|
if (fields.length === 0) {
|
|
35047
35051
|
logWarning2(
|
|
35048
|
-
`Skipping
|
|
35052
|
+
`Skipping field-accuracy evaluator '${name16}' in '${evalId}': no valid fields found`
|
|
35049
35053
|
);
|
|
35050
35054
|
continue;
|
|
35051
35055
|
}
|
|
@@ -35055,7 +35059,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35055
35059
|
const required22 = parseRequired(rawEvaluator.required);
|
|
35056
35060
|
evaluators.push({
|
|
35057
35061
|
name: name16,
|
|
35058
|
-
type: "
|
|
35062
|
+
type: "field-accuracy",
|
|
35059
35063
|
fields,
|
|
35060
35064
|
...validAggregation ? { aggregation: validAggregation } : {},
|
|
35061
35065
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
@@ -35104,7 +35108,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35104
35108
|
});
|
|
35105
35109
|
continue;
|
|
35106
35110
|
}
|
|
35107
|
-
if (typeValue === "
|
|
35111
|
+
if (typeValue === "token-usage") {
|
|
35108
35112
|
const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
|
|
35109
35113
|
const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
|
|
35110
35114
|
const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
|
|
@@ -35118,7 +35122,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35118
35122
|
if (raw === void 0) continue;
|
|
35119
35123
|
if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
|
|
35120
35124
|
logWarning2(
|
|
35121
|
-
`Skipping
|
|
35125
|
+
`Skipping token-usage evaluator '${name16}' in '${evalId}': ${key} must be a non-negative finite number`
|
|
35122
35126
|
);
|
|
35123
35127
|
continue;
|
|
35124
35128
|
}
|
|
@@ -35126,7 +35130,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35126
35130
|
}
|
|
35127
35131
|
if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
|
|
35128
35132
|
logWarning2(
|
|
35129
|
-
`Skipping
|
|
35133
|
+
`Skipping token-usage evaluator '${name16}' in '${evalId}': must set at least one of max_total, max_input, max_output`
|
|
35130
35134
|
);
|
|
35131
35135
|
continue;
|
|
35132
35136
|
}
|
|
@@ -35134,7 +35138,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35134
35138
|
const required22 = parseRequired(rawEvaluator.required);
|
|
35135
35139
|
evaluators.push({
|
|
35136
35140
|
name: name16,
|
|
35137
|
-
type: "
|
|
35141
|
+
type: "token-usage",
|
|
35138
35142
|
...validLimits,
|
|
35139
35143
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35140
35144
|
...required22 !== void 0 ? { required: required22 } : {},
|
|
@@ -35142,7 +35146,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35142
35146
|
});
|
|
35143
35147
|
continue;
|
|
35144
35148
|
}
|
|
35145
|
-
if (typeValue === "
|
|
35149
|
+
if (typeValue === "execution-metrics") {
|
|
35146
35150
|
const maxToolCalls = rawEvaluator.max_tool_calls ?? rawEvaluator.maxToolCalls;
|
|
35147
35151
|
const maxLlmCalls = rawEvaluator.max_llm_calls ?? rawEvaluator.maxLlmCalls;
|
|
35148
35152
|
const maxTokens = rawEvaluator.max_tokens ?? rawEvaluator.maxTokens;
|
|
@@ -35165,7 +35169,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35165
35169
|
if (raw === void 0) continue;
|
|
35166
35170
|
if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
|
|
35167
35171
|
logWarning2(
|
|
35168
|
-
`Skipping
|
|
35172
|
+
`Skipping execution-metrics evaluator '${name16}' in '${evalId}': ${key} must be a non-negative finite number`
|
|
35169
35173
|
);
|
|
35170
35174
|
hasError = true;
|
|
35171
35175
|
break;
|
|
@@ -35178,7 +35182,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35178
35182
|
const hasThreshold = validThresholds.max_tool_calls !== void 0 || validThresholds.max_llm_calls !== void 0 || validThresholds.max_tokens !== void 0 || validThresholds.max_cost_usd !== void 0 || validThresholds.max_duration_ms !== void 0 || validThresholds.target_exploration_ratio !== void 0;
|
|
35179
35183
|
if (!hasThreshold) {
|
|
35180
35184
|
logWarning2(
|
|
35181
|
-
`Skipping
|
|
35185
|
+
`Skipping execution-metrics evaluator '${name16}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
|
|
35182
35186
|
);
|
|
35183
35187
|
continue;
|
|
35184
35188
|
}
|
|
@@ -35186,7 +35190,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35186
35190
|
const required22 = parseRequired(rawEvaluator.required);
|
|
35187
35191
|
evaluators.push({
|
|
35188
35192
|
name: name16,
|
|
35189
|
-
type: "
|
|
35193
|
+
type: "execution-metrics",
|
|
35190
35194
|
...validThresholds,
|
|
35191
35195
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35192
35196
|
...required22 !== void 0 ? { required: required22 } : {},
|
|
@@ -35194,13 +35198,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35194
35198
|
});
|
|
35195
35199
|
continue;
|
|
35196
35200
|
}
|
|
35197
|
-
if (typeValue === "
|
|
35201
|
+
if (typeValue === "agent-judge") {
|
|
35198
35202
|
const rawMaxSteps = rawEvaluator.max_steps ?? rawEvaluator.maxSteps;
|
|
35199
35203
|
let maxSteps;
|
|
35200
35204
|
if (rawMaxSteps !== void 0) {
|
|
35201
35205
|
if (typeof rawMaxSteps !== "number" || !Number.isInteger(rawMaxSteps) || rawMaxSteps < 1 || rawMaxSteps > 50) {
|
|
35202
35206
|
logWarning2(
|
|
35203
|
-
`Skipping
|
|
35207
|
+
`Skipping agent-judge evaluator '${name16}' in '${evalId}': max_steps must be an integer 1-50`
|
|
35204
35208
|
);
|
|
35205
35209
|
continue;
|
|
35206
35210
|
}
|
|
@@ -35211,7 +35215,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35211
35215
|
if (rawTemperature !== void 0) {
|
|
35212
35216
|
if (typeof rawTemperature !== "number" || rawTemperature < 0 || rawTemperature > 2) {
|
|
35213
35217
|
logWarning2(
|
|
35214
|
-
`Skipping
|
|
35218
|
+
`Skipping agent-judge evaluator '${name16}' in '${evalId}': temperature must be a number 0-2`
|
|
35215
35219
|
);
|
|
35216
35220
|
continue;
|
|
35217
35221
|
}
|
|
@@ -35234,7 +35238,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35234
35238
|
const required22 = parseRequired(rawEvaluator.required);
|
|
35235
35239
|
evaluators.push({
|
|
35236
35240
|
name: name16,
|
|
35237
|
-
type: "
|
|
35241
|
+
type: "agent-judge",
|
|
35238
35242
|
...agentPrompt ? { prompt: agentPrompt } : {},
|
|
35239
35243
|
...agentPromptPath ? { promptPath: agentPromptPath, resolvedPromptPath: agentPromptPath } : {},
|
|
35240
35244
|
...agentParsedRubrics && agentParsedRubrics.length > 0 ? { rubrics: agentParsedRubrics } : {},
|
|
@@ -35265,7 +35269,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35265
35269
|
});
|
|
35266
35270
|
continue;
|
|
35267
35271
|
}
|
|
35268
|
-
if (typeValue === "
|
|
35272
|
+
if (typeValue === "contains-any" || typeValue === "contains-all") {
|
|
35269
35273
|
const value = asStringArrayStrict(rawEvaluator.value);
|
|
35270
35274
|
if (!value || value.length === 0) {
|
|
35271
35275
|
logWarning2(
|
|
@@ -35303,7 +35307,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35303
35307
|
});
|
|
35304
35308
|
continue;
|
|
35305
35309
|
}
|
|
35306
|
-
if (typeValue === "
|
|
35310
|
+
if (typeValue === "icontains-any" || typeValue === "icontains-all") {
|
|
35307
35311
|
const value = asStringArrayStrict(rawEvaluator.value);
|
|
35308
35312
|
if (!value || value.length === 0) {
|
|
35309
35313
|
logWarning2(
|
|
@@ -35323,7 +35327,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35323
35327
|
});
|
|
35324
35328
|
continue;
|
|
35325
35329
|
}
|
|
35326
|
-
if (typeValue === "
|
|
35330
|
+
if (typeValue === "starts-with" || typeValue === "ends-with") {
|
|
35327
35331
|
const value = asString(rawEvaluator.value);
|
|
35328
35332
|
if (!value) {
|
|
35329
35333
|
logWarning2(`Skipping ${typeValue} evaluator '${name16}' in '${evalId}': missing value`);
|
|
@@ -35361,12 +35365,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35361
35365
|
});
|
|
35362
35366
|
continue;
|
|
35363
35367
|
}
|
|
35364
|
-
if (typeValue === "
|
|
35368
|
+
if (typeValue === "is-json") {
|
|
35365
35369
|
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35366
35370
|
const required22 = parseRequired(rawEvaluator.required);
|
|
35367
35371
|
evaluators.push({
|
|
35368
35372
|
name: name16,
|
|
35369
|
-
type: "
|
|
35373
|
+
type: "is-json",
|
|
35370
35374
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35371
35375
|
...required22 !== void 0 ? { required: required22 } : {},
|
|
35372
35376
|
...negate !== void 0 ? { negate } : {}
|
|
@@ -35414,7 +35418,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35414
35418
|
const required22 = parseRequired(rawEvaluator.required);
|
|
35415
35419
|
evaluators.push({
|
|
35416
35420
|
name: name16,
|
|
35417
|
-
type: "
|
|
35421
|
+
type: "llm-judge",
|
|
35418
35422
|
rubrics: parsedCriteria,
|
|
35419
35423
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35420
35424
|
...required22 !== void 0 ? { required: required22 } : {},
|
|
@@ -35481,7 +35485,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35481
35485
|
const required22 = parseRequired(rawEvaluator.required);
|
|
35482
35486
|
evaluators.push({
|
|
35483
35487
|
name: name16,
|
|
35484
|
-
type: "
|
|
35488
|
+
type: "llm-judge",
|
|
35485
35489
|
rubrics: parsedRubrics,
|
|
35486
35490
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35487
35491
|
...required22 !== void 0 ? { required: required22 } : {},
|
|
@@ -35513,7 +35517,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35513
35517
|
const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
|
|
35514
35518
|
evaluators.push({
|
|
35515
35519
|
name: name16,
|
|
35516
|
-
type: "
|
|
35520
|
+
type: "llm-judge",
|
|
35517
35521
|
prompt,
|
|
35518
35522
|
promptPath,
|
|
35519
35523
|
...promptPath ? { resolvedPromptPath: promptPath } : {},
|
|
@@ -35529,15 +35533,15 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35529
35533
|
}
|
|
35530
35534
|
var ASSERTION_TYPES = /* @__PURE__ */ new Set([
|
|
35531
35535
|
"contains",
|
|
35532
|
-
"
|
|
35533
|
-
"
|
|
35536
|
+
"contains-any",
|
|
35537
|
+
"contains-all",
|
|
35534
35538
|
"icontains",
|
|
35535
|
-
"
|
|
35536
|
-
"
|
|
35537
|
-
"
|
|
35538
|
-
"
|
|
35539
|
+
"icontains-any",
|
|
35540
|
+
"icontains-all",
|
|
35541
|
+
"starts-with",
|
|
35542
|
+
"ends-with",
|
|
35539
35543
|
"regex",
|
|
35540
|
-
"
|
|
35544
|
+
"is-json",
|
|
35541
35545
|
"equals",
|
|
35542
35546
|
"rubrics"
|
|
35543
35547
|
]);
|
|
@@ -35550,24 +35554,24 @@ function generateAssertionName(typeValue, rawEvaluator) {
|
|
|
35550
35554
|
switch (typeValue) {
|
|
35551
35555
|
case "contains":
|
|
35552
35556
|
return value ? `contains-${value}` : "contains";
|
|
35553
|
-
case "
|
|
35554
|
-
return arrayValue ? `
|
|
35555
|
-
case "
|
|
35556
|
-
return arrayValue ? `
|
|
35557
|
+
case "contains-any":
|
|
35558
|
+
return arrayValue ? `contains-any-${arrayValue.length}` : "contains-any";
|
|
35559
|
+
case "contains-all":
|
|
35560
|
+
return arrayValue ? `contains-all-${arrayValue.length}` : "contains-all";
|
|
35557
35561
|
case "icontains":
|
|
35558
35562
|
return value ? `icontains-${value}` : "icontains";
|
|
35559
|
-
case "
|
|
35560
|
-
return arrayValue ? `
|
|
35561
|
-
case "
|
|
35562
|
-
return arrayValue ? `
|
|
35563
|
-
case "
|
|
35564
|
-
return value ? `
|
|
35565
|
-
case "
|
|
35566
|
-
return value ? `
|
|
35563
|
+
case "icontains-any":
|
|
35564
|
+
return arrayValue ? `icontains-any-${arrayValue.length}` : "icontains-any";
|
|
35565
|
+
case "icontains-all":
|
|
35566
|
+
return arrayValue ? `icontains-all-${arrayValue.length}` : "icontains-all";
|
|
35567
|
+
case "starts-with":
|
|
35568
|
+
return value ? `starts-with-${value}` : "starts-with";
|
|
35569
|
+
case "ends-with":
|
|
35570
|
+
return value ? `ends-with-${value}` : "ends-with";
|
|
35567
35571
|
case "regex":
|
|
35568
35572
|
return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
|
|
35569
|
-
case "
|
|
35570
|
-
return "
|
|
35573
|
+
case "is-json":
|
|
35574
|
+
return "is-json";
|
|
35571
35575
|
case "equals":
|
|
35572
35576
|
return value ? `equals-${value}` : "equals";
|
|
35573
35577
|
case "rubrics":
|
|
@@ -35580,8 +35584,9 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
35580
35584
|
if (typeof candidate !== "string") {
|
|
35581
35585
|
return void 0;
|
|
35582
35586
|
}
|
|
35583
|
-
|
|
35584
|
-
|
|
35587
|
+
const normalized = normalizeEvaluatorType(candidate);
|
|
35588
|
+
if (isEvaluatorKind(normalized)) {
|
|
35589
|
+
return normalized;
|
|
35585
35590
|
}
|
|
35586
35591
|
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
35587
35592
|
return void 0;
|
|
@@ -35627,6 +35632,16 @@ function parseCommandToArgv(command) {
|
|
|
35627
35632
|
function isJsonObject2(value) {
|
|
35628
35633
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
35629
35634
|
}
|
|
35635
|
+
var CRITERIA_CONSUMER_TYPES = /* @__PURE__ */ new Set(["llm-judge", "agent-judge", "code-judge"]);
|
|
35636
|
+
function warnUnconsumedCriteria(criteria, evaluators, testId) {
|
|
35637
|
+
if (!criteria?.trim() || !evaluators || evaluators.length === 0) return;
|
|
35638
|
+
const hasConsumer = evaluators.some((e) => CRITERIA_CONSUMER_TYPES.has(e.type));
|
|
35639
|
+
if (!hasConsumer) {
|
|
35640
|
+
logWarning2(
|
|
35641
|
+
`Test '${testId}': criteria is defined but no evaluator in assert will evaluate it. Add 'type: llm-judge' to assert, or remove criteria if it is documentation-only.`
|
|
35642
|
+
);
|
|
35643
|
+
}
|
|
35644
|
+
}
|
|
35630
35645
|
function logWarning2(message, details) {
|
|
35631
35646
|
if (details && details.length > 0) {
|
|
35632
35647
|
const detailBlock = details.join("\n");
|
|
@@ -35876,7 +35891,7 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
35876
35891
|
}
|
|
35877
35892
|
return {
|
|
35878
35893
|
name: "rubric",
|
|
35879
|
-
type: "
|
|
35894
|
+
type: "llm-judge",
|
|
35880
35895
|
rubrics: rubricItems
|
|
35881
35896
|
};
|
|
35882
35897
|
}
|
|
@@ -36243,7 +36258,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
36243
36258
|
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
36244
36259
|
const fallbackDataset = path6.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
36245
36260
|
const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
|
|
36246
|
-
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "
|
|
36261
|
+
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-judge";
|
|
36247
36262
|
const globalExecution = sidecar.execution;
|
|
36248
36263
|
if (verbose) {
|
|
36249
36264
|
console.log(`
|
|
@@ -36331,6 +36346,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
36331
36346
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
36332
36347
|
}
|
|
36333
36348
|
}
|
|
36349
|
+
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
36334
36350
|
const userFilePaths = [];
|
|
36335
36351
|
for (const segment of inputSegments) {
|
|
36336
36352
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -36714,7 +36730,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
36714
36730
|
const fallbackDataset = path8.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
36715
36731
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
36716
36732
|
const rawTestcases = resolveTests(suite);
|
|
36717
|
-
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "
|
|
36733
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-judge";
|
|
36718
36734
|
const evalFileDir = path8.dirname(absoluteTestPath);
|
|
36719
36735
|
let expandedTestcases;
|
|
36720
36736
|
if (typeof rawTestcases === "string") {
|
|
@@ -36811,6 +36827,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
36811
36827
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
36812
36828
|
}
|
|
36813
36829
|
}
|
|
36830
|
+
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
36814
36831
|
const userFilePaths = [];
|
|
36815
36832
|
for (const segment of inputSegments) {
|
|
36816
36833
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -42612,7 +42629,7 @@ function toCamelCaseDeep(obj) {
|
|
|
42612
42629
|
}
|
|
42613
42630
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
42614
42631
|
var CodeEvaluator = class {
|
|
42615
|
-
kind = "code";
|
|
42632
|
+
kind = "code-judge";
|
|
42616
42633
|
command;
|
|
42617
42634
|
cwd;
|
|
42618
42635
|
agentTimeoutMs;
|
|
@@ -42813,7 +42830,7 @@ var scoreRangeEvaluationSchema = external_exports.object({
|
|
|
42813
42830
|
overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)").optional()
|
|
42814
42831
|
});
|
|
42815
42832
|
var LlmJudgeEvaluator = class {
|
|
42816
|
-
kind = "
|
|
42833
|
+
kind = "llm-judge";
|
|
42817
42834
|
resolveJudgeProvider;
|
|
42818
42835
|
maxOutputTokens;
|
|
42819
42836
|
temperature;
|
|
@@ -42830,7 +42847,7 @@ var LlmJudgeEvaluator = class {
|
|
|
42830
42847
|
throw new Error("No judge provider available for LLM grading");
|
|
42831
42848
|
}
|
|
42832
42849
|
const config2 = context.evaluator;
|
|
42833
|
-
if (config2?.type === "
|
|
42850
|
+
if (config2?.type === "llm-judge" && config2.rubrics && config2.rubrics.length > 0) {
|
|
42834
42851
|
return this.evaluateWithRubrics(context, judgeProvider, config2.rubrics);
|
|
42835
42852
|
}
|
|
42836
42853
|
return this.evaluateFreeform(context, judgeProvider);
|
|
@@ -42904,7 +42921,7 @@ ${context.fileChanges}`;
|
|
|
42904
42921
|
async evaluateWithRubrics(context, judgeProvider, rubrics) {
|
|
42905
42922
|
if (!rubrics || rubrics.length === 0) {
|
|
42906
42923
|
throw new Error(
|
|
42907
|
-
`No rubrics found for evaluator "${context.evaluator?.name ?? "
|
|
42924
|
+
`No rubrics found for evaluator "${context.evaluator?.name ?? "llm-judge"}". Run "agentv generate rubrics" first.`
|
|
42908
42925
|
);
|
|
42909
42926
|
}
|
|
42910
42927
|
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
@@ -43238,9 +43255,9 @@ var CompositeEvaluator = class {
|
|
|
43238
43255
|
async aggregate(results, context) {
|
|
43239
43256
|
const aggregator = this.config.aggregator;
|
|
43240
43257
|
switch (aggregator.type) {
|
|
43241
|
-
case "
|
|
43258
|
+
case "code-judge":
|
|
43242
43259
|
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
43243
|
-
case "
|
|
43260
|
+
case "llm-judge":
|
|
43244
43261
|
return this.runLlmAggregator(results, context, aggregator);
|
|
43245
43262
|
case "threshold":
|
|
43246
43263
|
return this.runThreshold(results, aggregator.threshold);
|
|
@@ -43383,7 +43400,7 @@ var CompositeEvaluator = class {
|
|
|
43383
43400
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
43384
43401
|
reasoning,
|
|
43385
43402
|
evaluatorRawRequest: {
|
|
43386
|
-
aggregator: "
|
|
43403
|
+
aggregator: "code-judge",
|
|
43387
43404
|
script: scriptPath
|
|
43388
43405
|
},
|
|
43389
43406
|
scores
|
|
@@ -43398,7 +43415,7 @@ var CompositeEvaluator = class {
|
|
|
43398
43415
|
expectedAspectCount: 1,
|
|
43399
43416
|
reasoning: message,
|
|
43400
43417
|
evaluatorRawRequest: {
|
|
43401
|
-
aggregator: "
|
|
43418
|
+
aggregator: "code-judge",
|
|
43402
43419
|
script: scriptPath,
|
|
43403
43420
|
error: message
|
|
43404
43421
|
},
|
|
@@ -43429,7 +43446,7 @@ var CompositeEvaluator = class {
|
|
|
43429
43446
|
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
43430
43447
|
const systemPrompt = buildOutputSchema();
|
|
43431
43448
|
const evaluatorRawRequest = {
|
|
43432
|
-
aggregator: "
|
|
43449
|
+
aggregator: "llm-judge",
|
|
43433
43450
|
userPrompt,
|
|
43434
43451
|
systemPrompt,
|
|
43435
43452
|
target: judgeProvider.targetName
|
|
@@ -43537,7 +43554,7 @@ var CostEvaluator = class {
|
|
|
43537
43554
|
}
|
|
43538
43555
|
};
|
|
43539
43556
|
var ExecutionMetricsEvaluator = class {
|
|
43540
|
-
kind = "
|
|
43557
|
+
kind = "execution-metrics";
|
|
43541
43558
|
config;
|
|
43542
43559
|
constructor(options) {
|
|
43543
43560
|
this.config = options.config;
|
|
@@ -43563,7 +43580,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
43563
43580
|
expectedAspectCount: 1,
|
|
43564
43581
|
reasoning: "Execution metrics not available - no trace summary provided",
|
|
43565
43582
|
evaluatorRawRequest: {
|
|
43566
|
-
type: "
|
|
43583
|
+
type: "execution-metrics",
|
|
43567
43584
|
config: this.extractConfiguredThresholds(),
|
|
43568
43585
|
actual: null
|
|
43569
43586
|
}
|
|
@@ -43672,7 +43689,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
43672
43689
|
if (actualMetrics.exploration_ratio !== void 0) {
|
|
43673
43690
|
reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
|
|
43674
43691
|
}
|
|
43675
|
-
const reasoning = reasoningParts.length > 0 ? `
|
|
43692
|
+
const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
|
|
43676
43693
|
return {
|
|
43677
43694
|
score,
|
|
43678
43695
|
verdict: scoreToVerdict(score),
|
|
@@ -43681,7 +43698,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
43681
43698
|
expectedAspectCount: totalChecks || 1,
|
|
43682
43699
|
reasoning,
|
|
43683
43700
|
evaluatorRawRequest: {
|
|
43684
|
-
type: "
|
|
43701
|
+
type: "execution-metrics",
|
|
43685
43702
|
config: this.extractConfiguredThresholds(),
|
|
43686
43703
|
actual: this.filterDefinedMetrics(actualMetrics)
|
|
43687
43704
|
}
|
|
@@ -43767,7 +43784,7 @@ var MONTH_NAMES = {
|
|
|
43767
43784
|
december: 11
|
|
43768
43785
|
};
|
|
43769
43786
|
var FieldAccuracyEvaluator = class {
|
|
43770
|
-
kind = "
|
|
43787
|
+
kind = "field-accuracy";
|
|
43771
43788
|
config;
|
|
43772
43789
|
constructor(options) {
|
|
43773
43790
|
this.config = options.config;
|
|
@@ -44213,7 +44230,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
|
44213
44230
|
".dylib"
|
|
44214
44231
|
]);
|
|
44215
44232
|
var AgentJudgeEvaluator = class {
|
|
44216
|
-
kind = "
|
|
44233
|
+
kind = "agent-judge";
|
|
44217
44234
|
resolveJudgeProvider;
|
|
44218
44235
|
maxSteps;
|
|
44219
44236
|
temperature;
|
|
@@ -44238,24 +44255,24 @@ var AgentJudgeEvaluator = class {
|
|
|
44238
44255
|
async evaluateBuiltIn(context) {
|
|
44239
44256
|
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
44240
44257
|
if (!judgeProvider) {
|
|
44241
|
-
throw new Error("No judge provider available for
|
|
44258
|
+
throw new Error("No judge provider available for agent-judge evaluation");
|
|
44242
44259
|
}
|
|
44243
44260
|
const model = judgeProvider.asLanguageModel?.();
|
|
44244
44261
|
if (!model) {
|
|
44245
44262
|
throw new Error(
|
|
44246
|
-
`Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in
|
|
44263
|
+
`Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent-judge mode`
|
|
44247
44264
|
);
|
|
44248
44265
|
}
|
|
44249
44266
|
const workspacePath = context.workspacePath;
|
|
44250
44267
|
if (!workspacePath) {
|
|
44251
44268
|
throw new Error(
|
|
44252
|
-
"
|
|
44269
|
+
"agent-judge evaluator requires a workspace_template target (workspacePath is not set)"
|
|
44253
44270
|
);
|
|
44254
44271
|
}
|
|
44255
44272
|
const systemPrompt = this.buildSystemPrompt(context);
|
|
44256
44273
|
const userPrompt = this.buildUserPrompt(context);
|
|
44257
44274
|
const config2 = context.evaluator;
|
|
44258
|
-
const rubrics = config2?.type === "
|
|
44275
|
+
const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
|
|
44259
44276
|
const fsTools = createFilesystemTools(workspacePath);
|
|
44260
44277
|
const evaluatorRawRequest = {
|
|
44261
44278
|
mode: "built-in",
|
|
@@ -44286,7 +44303,7 @@ var AgentJudgeEvaluator = class {
|
|
|
44286
44303
|
score: 0,
|
|
44287
44304
|
verdict: "fail",
|
|
44288
44305
|
hits: [],
|
|
44289
|
-
misses: [`
|
|
44306
|
+
misses: [`agent-judge built-in evaluation failed: ${message}`],
|
|
44290
44307
|
expectedAspectCount: 1,
|
|
44291
44308
|
evaluatorRawRequest,
|
|
44292
44309
|
details: { mode: "built-in", error: message }
|
|
@@ -44318,14 +44335,14 @@ var AgentJudgeEvaluator = class {
|
|
|
44318
44335
|
score: 0,
|
|
44319
44336
|
verdict: "fail",
|
|
44320
44337
|
hits: [],
|
|
44321
|
-
misses: ["
|
|
44338
|
+
misses: ["agent-judge judge_target returned no assistant response"],
|
|
44322
44339
|
expectedAspectCount: 1,
|
|
44323
44340
|
evaluatorRawRequest,
|
|
44324
44341
|
details: { mode: "judge_target", judge_target: provider.targetName }
|
|
44325
44342
|
};
|
|
44326
44343
|
}
|
|
44327
44344
|
const config2 = context.evaluator;
|
|
44328
|
-
const rubrics = config2?.type === "
|
|
44345
|
+
const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
|
|
44329
44346
|
const details = {
|
|
44330
44347
|
mode: "judge_target",
|
|
44331
44348
|
judge_target: provider.targetName
|
|
@@ -44337,7 +44354,7 @@ var AgentJudgeEvaluator = class {
|
|
|
44337
44354
|
score: 0,
|
|
44338
44355
|
verdict: "fail",
|
|
44339
44356
|
hits: [],
|
|
44340
|
-
misses: [`
|
|
44357
|
+
misses: [`agent-judge judge_target evaluation failed: ${message}`],
|
|
44341
44358
|
expectedAspectCount: 1,
|
|
44342
44359
|
evaluatorRawRequest,
|
|
44343
44360
|
details: {
|
|
@@ -44388,7 +44405,7 @@ var AgentJudgeEvaluator = class {
|
|
|
44388
44405
|
score: 0,
|
|
44389
44406
|
verdict: "fail",
|
|
44390
44407
|
hits: [],
|
|
44391
|
-
misses: ["Failed to parse
|
|
44408
|
+
misses: ["Failed to parse agent-judge response as valid evaluation JSON"],
|
|
44392
44409
|
expectedAspectCount: 1,
|
|
44393
44410
|
evaluatorRawRequest,
|
|
44394
44411
|
details
|
|
@@ -44401,7 +44418,7 @@ var AgentJudgeEvaluator = class {
|
|
|
44401
44418
|
*/
|
|
44402
44419
|
buildSystemPrompt(context) {
|
|
44403
44420
|
const config2 = context.evaluator;
|
|
44404
|
-
const rubrics = config2?.type === "
|
|
44421
|
+
const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
|
|
44405
44422
|
const parts = [
|
|
44406
44423
|
"You are an expert evaluator with access to the workspace filesystem.",
|
|
44407
44424
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
@@ -44432,7 +44449,7 @@ var AgentJudgeEvaluator = class {
|
|
|
44432
44449
|
return substituteVariables(this.evaluatorTemplate, variables);
|
|
44433
44450
|
}
|
|
44434
44451
|
const config2 = context.evaluator;
|
|
44435
|
-
const rubrics = config2?.type === "
|
|
44452
|
+
const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
|
|
44436
44453
|
const parts = [
|
|
44437
44454
|
"Evaluate the candidate answer by investigating the workspace.",
|
|
44438
44455
|
"",
|
|
@@ -44475,7 +44492,7 @@ var AgentJudgeEvaluator = class {
|
|
|
44475
44492
|
buildDelegatedPrompt(context) {
|
|
44476
44493
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
44477
44494
|
const config2 = context.evaluator;
|
|
44478
|
-
const rubrics = config2?.type === "
|
|
44495
|
+
const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
|
|
44479
44496
|
if (this.evaluatorTemplate) {
|
|
44480
44497
|
const variables = {
|
|
44481
44498
|
[TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
|
|
@@ -44557,11 +44574,11 @@ function createFilesystemTools(workspacePath) {
|
|
|
44557
44574
|
execute: async (input) => {
|
|
44558
44575
|
try {
|
|
44559
44576
|
const resolved = resolveSandboxed(workspacePath, input.path);
|
|
44560
|
-
const
|
|
44561
|
-
if (
|
|
44577
|
+
const stat8 = await fs2.stat(resolved);
|
|
44578
|
+
if (stat8.isDirectory()) {
|
|
44562
44579
|
return { error: `'${input.path}' is a directory, not a file` };
|
|
44563
44580
|
}
|
|
44564
|
-
const buffer = Buffer.alloc(Math.min(
|
|
44581
|
+
const buffer = Buffer.alloc(Math.min(stat8.size, MAX_FILE_SIZE));
|
|
44565
44582
|
const fd = await fs2.open(resolved, "r");
|
|
44566
44583
|
try {
|
|
44567
44584
|
await fd.read(buffer, 0, buffer.length, 0);
|
|
@@ -44569,8 +44586,8 @@ function createFilesystemTools(workspacePath) {
|
|
|
44569
44586
|
await fd.close();
|
|
44570
44587
|
}
|
|
44571
44588
|
const content = buffer.toString("utf-8");
|
|
44572
|
-
const truncated =
|
|
44573
|
-
return { content, truncated, size:
|
|
44589
|
+
const truncated = stat8.size > MAX_FILE_SIZE;
|
|
44590
|
+
return { content, truncated, size: stat8.size };
|
|
44574
44591
|
} catch (error40) {
|
|
44575
44592
|
return { error: error40 instanceof Error ? error40.message : String(error40) };
|
|
44576
44593
|
}
|
|
@@ -44614,8 +44631,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
44614
44631
|
const ext = path30.extname(entry.name).toLowerCase();
|
|
44615
44632
|
if (BINARY_EXTENSIONS.has(ext)) continue;
|
|
44616
44633
|
try {
|
|
44617
|
-
const
|
|
44618
|
-
if (
|
|
44634
|
+
const stat8 = await fs2.stat(fullPath);
|
|
44635
|
+
if (stat8.size > MAX_FILE_SIZE) continue;
|
|
44619
44636
|
const content = await fs2.readFile(fullPath, "utf-8");
|
|
44620
44637
|
const lines = content.split("\n");
|
|
44621
44638
|
for (let i = 0; i < lines.length; i++) {
|
|
@@ -44773,7 +44790,7 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
44773
44790
|
};
|
|
44774
44791
|
}
|
|
44775
44792
|
var TokenUsageEvaluator = class {
|
|
44776
|
-
kind = "
|
|
44793
|
+
kind = "token-usage";
|
|
44777
44794
|
config;
|
|
44778
44795
|
constructor(options) {
|
|
44779
44796
|
this.config = options.config;
|
|
@@ -44796,7 +44813,7 @@ var TokenUsageEvaluator = class {
|
|
|
44796
44813
|
expectedAspectCount,
|
|
44797
44814
|
reasoning: "Token usage not reported by provider",
|
|
44798
44815
|
evaluatorRawRequest: {
|
|
44799
|
-
type: "
|
|
44816
|
+
type: "token-usage",
|
|
44800
44817
|
max_total: maxTotal ?? null,
|
|
44801
44818
|
max_input: maxInput ?? null,
|
|
44802
44819
|
max_output: maxOutput ?? null,
|
|
@@ -44838,9 +44855,9 @@ var TokenUsageEvaluator = class {
|
|
|
44838
44855
|
hits,
|
|
44839
44856
|
misses,
|
|
44840
44857
|
expectedAspectCount,
|
|
44841
|
-
reasoning: `
|
|
44858
|
+
reasoning: `token-usage input=${input}, output=${output}, cached=${cached2}, total=${total}`,
|
|
44842
44859
|
evaluatorRawRequest: {
|
|
44843
|
-
type: "
|
|
44860
|
+
type: "token-usage",
|
|
44844
44861
|
max_total: maxTotal ?? null,
|
|
44845
44862
|
max_input: maxInput ?? null,
|
|
44846
44863
|
max_output: maxOutput ?? null,
|
|
@@ -44923,7 +44940,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
|
|
|
44923
44940
|
};
|
|
44924
44941
|
}
|
|
44925
44942
|
var ToolTrajectoryEvaluator = class {
|
|
44926
|
-
kind = "
|
|
44943
|
+
kind = "tool-trajectory";
|
|
44927
44944
|
config;
|
|
44928
44945
|
constructor(options) {
|
|
44929
44946
|
this.config = options.config;
|
|
@@ -45111,7 +45128,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
45111
45128
|
}
|
|
45112
45129
|
}
|
|
45113
45130
|
for (const warning of warnings) {
|
|
45114
|
-
console.warn(`[
|
|
45131
|
+
console.warn(`[tool-trajectory] ${warning}`);
|
|
45115
45132
|
}
|
|
45116
45133
|
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
45117
45134
|
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
@@ -45187,7 +45204,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
45187
45204
|
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
45188
45205
|
}
|
|
45189
45206
|
for (const warning of warnings) {
|
|
45190
|
-
console.warn(`[
|
|
45207
|
+
console.warn(`[tool-trajectory] ${warning}`);
|
|
45191
45208
|
}
|
|
45192
45209
|
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
45193
45210
|
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
@@ -45655,7 +45672,7 @@ var llmJudgeFactory = (config2, context) => {
|
|
|
45655
45672
|
const c = config2;
|
|
45656
45673
|
const { llmJudge, agentTimeoutMs } = context;
|
|
45657
45674
|
return {
|
|
45658
|
-
kind: "
|
|
45675
|
+
kind: "llm-judge",
|
|
45659
45676
|
async evaluate(evalContext) {
|
|
45660
45677
|
const customPrompt = await resolveCustomPrompt(
|
|
45661
45678
|
c,
|
|
@@ -45744,7 +45761,7 @@ var agentJudgeFactory = (config2, context) => {
|
|
|
45744
45761
|
customPrompt = readFileSync(c.resolvedPromptPath, "utf-8");
|
|
45745
45762
|
} catch (error40) {
|
|
45746
45763
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
45747
|
-
console.warn(`Could not read
|
|
45764
|
+
console.warn(`Could not read agent-judge prompt at ${c.resolvedPromptPath}: ${message}`);
|
|
45748
45765
|
}
|
|
45749
45766
|
} else if (c.prompt) {
|
|
45750
45767
|
customPrompt = c.prompt;
|
|
@@ -45754,7 +45771,7 @@ var agentJudgeFactory = (config2, context) => {
|
|
|
45754
45771
|
judgeTargetProvider = targetResolver(c.target);
|
|
45755
45772
|
if (!judgeTargetProvider) {
|
|
45756
45773
|
throw new Error(
|
|
45757
|
-
`
|
|
45774
|
+
`agent-judge evaluator '${c.name}': target '${c.target}' not found in targets`
|
|
45758
45775
|
);
|
|
45759
45776
|
}
|
|
45760
45777
|
}
|
|
@@ -45798,7 +45815,7 @@ var regexFactory = (config2) => {
|
|
|
45798
45815
|
});
|
|
45799
45816
|
};
|
|
45800
45817
|
var isJsonFactory = () => {
|
|
45801
|
-
return new DeterministicAssertionEvaluator("
|
|
45818
|
+
return new DeterministicAssertionEvaluator("is-json", (ctx) => {
|
|
45802
45819
|
const result = runIsJsonAssertion(ctx.candidate);
|
|
45803
45820
|
return {
|
|
45804
45821
|
score: result.score,
|
|
@@ -45826,7 +45843,7 @@ var equalsFactory = (config2) => {
|
|
|
45826
45843
|
};
|
|
45827
45844
|
var containsAnyFactory = (config2) => {
|
|
45828
45845
|
const c = config2;
|
|
45829
|
-
return new DeterministicAssertionEvaluator("
|
|
45846
|
+
return new DeterministicAssertionEvaluator("contains-any", (ctx) => {
|
|
45830
45847
|
const result = runContainsAnyAssertion(ctx.candidate, c.value);
|
|
45831
45848
|
return {
|
|
45832
45849
|
score: result.score,
|
|
@@ -45840,7 +45857,7 @@ var containsAnyFactory = (config2) => {
|
|
|
45840
45857
|
};
|
|
45841
45858
|
var containsAllFactory = (config2) => {
|
|
45842
45859
|
const c = config2;
|
|
45843
|
-
return new DeterministicAssertionEvaluator("
|
|
45860
|
+
return new DeterministicAssertionEvaluator("contains-all", (ctx) => {
|
|
45844
45861
|
const result = runContainsAllAssertion(ctx.candidate, c.value);
|
|
45845
45862
|
return {
|
|
45846
45863
|
score: result.score,
|
|
@@ -45868,7 +45885,7 @@ var icontainsFactory = (config2) => {
|
|
|
45868
45885
|
};
|
|
45869
45886
|
var icontainsAnyFactory = (config2) => {
|
|
45870
45887
|
const c = config2;
|
|
45871
|
-
return new DeterministicAssertionEvaluator("
|
|
45888
|
+
return new DeterministicAssertionEvaluator("icontains-any", (ctx) => {
|
|
45872
45889
|
const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
|
|
45873
45890
|
return {
|
|
45874
45891
|
score: result.score,
|
|
@@ -45882,7 +45899,7 @@ var icontainsAnyFactory = (config2) => {
|
|
|
45882
45899
|
};
|
|
45883
45900
|
var icontainsAllFactory = (config2) => {
|
|
45884
45901
|
const c = config2;
|
|
45885
|
-
return new DeterministicAssertionEvaluator("
|
|
45902
|
+
return new DeterministicAssertionEvaluator("icontains-all", (ctx) => {
|
|
45886
45903
|
const result = runIcontainsAllAssertion(ctx.candidate, c.value);
|
|
45887
45904
|
return {
|
|
45888
45905
|
score: result.score,
|
|
@@ -45896,7 +45913,7 @@ var icontainsAllFactory = (config2) => {
|
|
|
45896
45913
|
};
|
|
45897
45914
|
var startsWithFactory = (config2) => {
|
|
45898
45915
|
const c = config2;
|
|
45899
|
-
return new DeterministicAssertionEvaluator("
|
|
45916
|
+
return new DeterministicAssertionEvaluator("starts-with", (ctx) => {
|
|
45900
45917
|
const result = runStartsWithAssertion(ctx.candidate, c.value);
|
|
45901
45918
|
return {
|
|
45902
45919
|
score: result.score,
|
|
@@ -45910,7 +45927,7 @@ var startsWithFactory = (config2) => {
|
|
|
45910
45927
|
};
|
|
45911
45928
|
var endsWithFactory = (config2) => {
|
|
45912
45929
|
const c = config2;
|
|
45913
|
-
return new DeterministicAssertionEvaluator("
|
|
45930
|
+
return new DeterministicAssertionEvaluator("ends-with", (ctx) => {
|
|
45914
45931
|
const result = runEndsWithAssertion(ctx.candidate, c.value);
|
|
45915
45932
|
return {
|
|
45916
45933
|
score: result.score,
|
|
@@ -45924,7 +45941,7 @@ var endsWithFactory = (config2) => {
|
|
|
45924
45941
|
};
|
|
45925
45942
|
function createBuiltinRegistry() {
|
|
45926
45943
|
const registry2 = new EvaluatorRegistry();
|
|
45927
|
-
registry2.register("
|
|
45944
|
+
registry2.register("llm-judge", llmJudgeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("agent-judge", agentJudgeFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory);
|
|
45928
45945
|
return registry2;
|
|
45929
45946
|
}
|
|
45930
45947
|
async function discoverAssertions(registry2, baseDir) {
|
|
@@ -46636,7 +46653,7 @@ async function runEvaluation(options) {
|
|
|
46636
46653
|
};
|
|
46637
46654
|
if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
|
|
46638
46655
|
throw new Error(
|
|
46639
|
-
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g.,
|
|
46656
|
+
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure-base).`
|
|
46640
46657
|
);
|
|
46641
46658
|
}
|
|
46642
46659
|
const targetResolver = (name16) => {
|
|
@@ -46707,7 +46724,7 @@ async function runEvaluation(options) {
|
|
|
46707
46724
|
const rawTemplate = suiteWorkspace?.template ?? getWorkspaceTemplate(target);
|
|
46708
46725
|
const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
|
|
46709
46726
|
const workspaceTemplate = resolvedTemplate?.dir;
|
|
46710
|
-
|
|
46727
|
+
let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
|
|
46711
46728
|
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
46712
46729
|
const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
46713
46730
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
@@ -46728,6 +46745,14 @@ async function runEvaluation(options) {
|
|
|
46728
46745
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
46729
46746
|
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
46730
46747
|
}
|
|
46748
|
+
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
46749
|
+
const copiedWorkspaceFile = path37.join(sharedWorkspacePath, path37.basename(suiteWorkspaceFile));
|
|
46750
|
+
try {
|
|
46751
|
+
await stat7(copiedWorkspaceFile);
|
|
46752
|
+
suiteWorkspaceFile = copiedWorkspaceFile;
|
|
46753
|
+
} catch {
|
|
46754
|
+
}
|
|
46755
|
+
}
|
|
46731
46756
|
} else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
|
|
46732
46757
|
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
46733
46758
|
await mkdir12(sharedWorkspacePath, { recursive: true });
|
|
@@ -47206,6 +47231,14 @@ async function runEvalCase(options) {
|
|
|
47206
47231
|
"template_error"
|
|
47207
47232
|
);
|
|
47208
47233
|
}
|
|
47234
|
+
if (caseWorkspaceFile && workspacePath) {
|
|
47235
|
+
const copiedFile = path37.join(workspacePath, path37.basename(caseWorkspaceFile));
|
|
47236
|
+
try {
|
|
47237
|
+
await stat7(copiedFile);
|
|
47238
|
+
caseWorkspaceFile = copiedFile;
|
|
47239
|
+
} catch {
|
|
47240
|
+
}
|
|
47241
|
+
}
|
|
47209
47242
|
}
|
|
47210
47243
|
if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
47211
47244
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
@@ -47715,8 +47748,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
47715
47748
|
workspacePath
|
|
47716
47749
|
});
|
|
47717
47750
|
}
|
|
47718
|
-
const evaluatorKind = evalCase.evaluator ?? "
|
|
47719
|
-
const activeEvaluator = evaluators[evaluatorKind] ?? evaluators
|
|
47751
|
+
const evaluatorKind = evalCase.evaluator ?? "llm-judge";
|
|
47752
|
+
const activeEvaluator = evaluators[evaluatorKind] ?? evaluators["llm-judge"];
|
|
47720
47753
|
if (!activeEvaluator) {
|
|
47721
47754
|
throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
|
|
47722
47755
|
}
|
|
@@ -47799,25 +47832,24 @@ async function runEvaluatorList(options) {
|
|
|
47799
47832
|
availableTargets,
|
|
47800
47833
|
agentTimeoutMs,
|
|
47801
47834
|
evalFileDir,
|
|
47802
|
-
llmJudge: evaluatorRegistry
|
|
47835
|
+
llmJudge: evaluatorRegistry["llm-judge"],
|
|
47803
47836
|
registry: typeRegistry
|
|
47804
47837
|
};
|
|
47805
47838
|
for (const evaluatorConfig of evaluators ?? []) {
|
|
47806
47839
|
try {
|
|
47807
47840
|
const evaluatorInstance = await typeRegistry.create(evaluatorConfig, dispatchContext);
|
|
47808
47841
|
const score2 = await evaluatorInstance.evaluate(evalContext);
|
|
47809
|
-
const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
|
|
47810
47842
|
const weight = evaluatorConfig.weight ?? 1;
|
|
47811
47843
|
scored.push({
|
|
47812
47844
|
score: score2,
|
|
47813
47845
|
name: evaluatorConfig.name,
|
|
47814
|
-
type:
|
|
47846
|
+
type: evaluatorConfig.type,
|
|
47815
47847
|
weight,
|
|
47816
47848
|
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
47817
47849
|
});
|
|
47818
47850
|
scores.push({
|
|
47819
47851
|
name: evaluatorConfig.name,
|
|
47820
|
-
type:
|
|
47852
|
+
type: evaluatorConfig.type,
|
|
47821
47853
|
score: score2.score,
|
|
47822
47854
|
weight,
|
|
47823
47855
|
verdict: score2.verdict,
|
|
@@ -47839,18 +47871,17 @@ async function runEvaluatorList(options) {
|
|
|
47839
47871
|
expectedAspectCount: 1,
|
|
47840
47872
|
reasoning: message
|
|
47841
47873
|
};
|
|
47842
|
-
const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
|
|
47843
47874
|
const weight = evaluatorConfig.weight ?? 1;
|
|
47844
47875
|
scored.push({
|
|
47845
47876
|
score: fallbackScore,
|
|
47846
47877
|
name: evaluatorConfig.name ?? "unknown",
|
|
47847
|
-
type:
|
|
47878
|
+
type: evaluatorConfig.type ?? "llm-judge",
|
|
47848
47879
|
weight,
|
|
47849
47880
|
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
47850
47881
|
});
|
|
47851
47882
|
scores.push({
|
|
47852
47883
|
name: evaluatorConfig.name ?? "unknown",
|
|
47853
|
-
type:
|
|
47884
|
+
type: evaluatorConfig.type ?? "llm-judge",
|
|
47854
47885
|
score: 0,
|
|
47855
47886
|
weight,
|
|
47856
47887
|
verdict: "fail",
|
|
@@ -47911,7 +47942,7 @@ function filterEvalCases(evalCases, filter2) {
|
|
|
47911
47942
|
return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter2));
|
|
47912
47943
|
}
|
|
47913
47944
|
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
47914
|
-
const llmJudge = overrides?.
|
|
47945
|
+
const llmJudge = overrides?.["llm-judge"] ?? new LlmJudgeEvaluator({
|
|
47915
47946
|
resolveJudgeProvider: async (context) => {
|
|
47916
47947
|
if (context.judgeProvider) {
|
|
47917
47948
|
return context.judgeProvider;
|
|
@@ -47921,7 +47952,7 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
47921
47952
|
});
|
|
47922
47953
|
return {
|
|
47923
47954
|
...overrides,
|
|
47924
|
-
|
|
47955
|
+
"llm-judge": llmJudge
|
|
47925
47956
|
};
|
|
47926
47957
|
}
|
|
47927
47958
|
async function invokeProvider(provider, options) {
|
|
@@ -48177,12 +48208,7 @@ async function evaluate(config2) {
|
|
|
48177
48208
|
};
|
|
48178
48209
|
}
|
|
48179
48210
|
function mapAssertionType(type) {
|
|
48180
|
-
|
|
48181
|
-
case "code_judge":
|
|
48182
|
-
return "code";
|
|
48183
|
-
default:
|
|
48184
|
-
return type;
|
|
48185
|
-
}
|
|
48211
|
+
return type.replace(/_/g, "-");
|
|
48186
48212
|
}
|
|
48187
48213
|
function computeSummary(results, durationMs) {
|
|
48188
48214
|
const total = results.length;
|
|
@@ -49011,4 +49037,4 @@ export {
|
|
|
49011
49037
|
OtelStreamingObserver,
|
|
49012
49038
|
createAgentKernel
|
|
49013
49039
|
};
|
|
49014
|
-
//# sourceMappingURL=chunk-
|
|
49040
|
+
//# sourceMappingURL=chunk-OQN2GDEU.js.map
|