agentv 2.12.0 → 2.14.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -14
- package/dist/{chunk-YBJX5CP6.js → chunk-K2APOWTE.js} +213 -29
- package/dist/chunk-K2APOWTE.js.map +1 -0
- package/dist/{chunk-LUHCYBMD.js → chunk-OQN2GDEU.js} +251 -164
- package/dist/chunk-OQN2GDEU.js.map +1 -0
- package/dist/{chunk-6KU2ZUFJ.js → chunk-ZSSGXZX6.js} +39 -77
- package/dist/chunk-ZSSGXZX6.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-OPPA4P5R.js → dist-QR5OZ4DH.js} +4 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-TOUKPSHP.js → interactive-WF6UO63B.js} +3 -3
- package/package.json +4 -2
- package/dist/chunk-6KU2ZUFJ.js.map +0 -1
- package/dist/chunk-LUHCYBMD.js.map +0 -1
- package/dist/chunk-YBJX5CP6.js.map +0 -1
- /package/dist/{dist-OPPA4P5R.js.map → dist-QR5OZ4DH.js.map} +0 -0
- /package/dist/{interactive-TOUKPSHP.js.map → interactive-WF6UO63B.js.map} +0 -0
|
@@ -148,7 +148,7 @@ var require_dist = __commonJS({
|
|
|
148
148
|
}
|
|
149
149
|
});
|
|
150
150
|
|
|
151
|
-
// ../../packages/core/dist/chunk-
|
|
151
|
+
// ../../packages/core/dist/chunk-N55K52OO.js
|
|
152
152
|
import { constants } from "node:fs";
|
|
153
153
|
import { access, readFile } from "node:fs/promises";
|
|
154
154
|
import path from "node:path";
|
|
@@ -4195,7 +4195,7 @@ var coerce = {
|
|
|
4195
4195
|
};
|
|
4196
4196
|
var NEVER = INVALID;
|
|
4197
4197
|
|
|
4198
|
-
// ../../packages/core/dist/chunk-
|
|
4198
|
+
// ../../packages/core/dist/chunk-N55K52OO.js
|
|
4199
4199
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
4200
4200
|
var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
|
|
4201
4201
|
var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
|
|
@@ -4243,27 +4243,27 @@ function isTestMessage(value) {
|
|
|
4243
4243
|
return false;
|
|
4244
4244
|
}
|
|
4245
4245
|
var EVALUATOR_KIND_VALUES = [
|
|
4246
|
-
"
|
|
4247
|
-
"
|
|
4246
|
+
"code-judge",
|
|
4247
|
+
"llm-judge",
|
|
4248
4248
|
"rubric",
|
|
4249
4249
|
"composite",
|
|
4250
|
-
"
|
|
4251
|
-
"
|
|
4250
|
+
"tool-trajectory",
|
|
4251
|
+
"field-accuracy",
|
|
4252
4252
|
"latency",
|
|
4253
4253
|
"cost",
|
|
4254
|
-
"
|
|
4255
|
-
"
|
|
4256
|
-
"
|
|
4254
|
+
"token-usage",
|
|
4255
|
+
"execution-metrics",
|
|
4256
|
+
"agent-judge",
|
|
4257
4257
|
"contains",
|
|
4258
|
-
"
|
|
4259
|
-
"
|
|
4258
|
+
"contains-any",
|
|
4259
|
+
"contains-all",
|
|
4260
4260
|
"icontains",
|
|
4261
|
-
"
|
|
4262
|
-
"
|
|
4263
|
-
"
|
|
4264
|
-
"
|
|
4261
|
+
"icontains-any",
|
|
4262
|
+
"icontains-all",
|
|
4263
|
+
"starts-with",
|
|
4264
|
+
"ends-with",
|
|
4265
4265
|
"regex",
|
|
4266
|
-
"
|
|
4266
|
+
"is-json",
|
|
4267
4267
|
"equals",
|
|
4268
4268
|
"rubrics"
|
|
4269
4269
|
];
|
|
@@ -33960,7 +33960,7 @@ import { createServer } from "node:http";
|
|
|
33960
33960
|
import fs2 from "node:fs/promises";
|
|
33961
33961
|
import path30 from "node:path";
|
|
33962
33962
|
import { createHash as createHash2, randomUUID as randomUUID7 } from "node:crypto";
|
|
33963
|
-
import { mkdir as mkdir12 } from "node:fs/promises";
|
|
33963
|
+
import { mkdir as mkdir12, stat as stat7 } from "node:fs/promises";
|
|
33964
33964
|
import path37 from "node:path";
|
|
33965
33965
|
import micromatch4 from "micromatch";
|
|
33966
33966
|
import { readFileSync } from "node:fs";
|
|
@@ -34331,6 +34331,11 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
34331
34331
|
continue;
|
|
34332
34332
|
}
|
|
34333
34333
|
const config2 = parsed;
|
|
34334
|
+
const requiredVersion = parsed.required_version;
|
|
34335
|
+
if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
|
|
34336
|
+
logWarning(`Invalid required_version in ${configPath}, expected string`);
|
|
34337
|
+
continue;
|
|
34338
|
+
}
|
|
34334
34339
|
const guidelinePatterns = config2.guideline_patterns;
|
|
34335
34340
|
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
34336
34341
|
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
@@ -34354,6 +34359,7 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
34354
34359
|
configPath
|
|
34355
34360
|
);
|
|
34356
34361
|
return {
|
|
34362
|
+
required_version: requiredVersion,
|
|
34357
34363
|
guideline_patterns: guidelinePatterns,
|
|
34358
34364
|
eval_patterns: evalPatterns,
|
|
34359
34365
|
execution: executionDefaults
|
|
@@ -34497,6 +34503,22 @@ function extractTotalBudgetUsd(suite) {
|
|
|
34497
34503
|
);
|
|
34498
34504
|
return void 0;
|
|
34499
34505
|
}
|
|
34506
|
+
function extractFailOnError(suite) {
|
|
34507
|
+
const execution = suite.execution;
|
|
34508
|
+
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
34509
|
+
return void 0;
|
|
34510
|
+
}
|
|
34511
|
+
const executionObj = execution;
|
|
34512
|
+
const raw = executionObj.fail_on_error ?? executionObj.failOnError;
|
|
34513
|
+
if (raw === void 0 || raw === null) {
|
|
34514
|
+
return void 0;
|
|
34515
|
+
}
|
|
34516
|
+
if (typeof raw === "boolean") {
|
|
34517
|
+
return raw;
|
|
34518
|
+
}
|
|
34519
|
+
logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
|
|
34520
|
+
return void 0;
|
|
34521
|
+
}
|
|
34500
34522
|
function parseExecutionDefaults(raw, configPath) {
|
|
34501
34523
|
if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
|
|
34502
34524
|
return void 0;
|
|
@@ -34583,6 +34605,9 @@ function validateTemplateVariables(content, source) {
|
|
|
34583
34605
|
}
|
|
34584
34606
|
var ANSI_YELLOW4 = "\x1B[33m";
|
|
34585
34607
|
var ANSI_RESET4 = "\x1B[0m";
|
|
34608
|
+
function normalizeEvaluatorType(type) {
|
|
34609
|
+
return type.replace(/_/g, "-");
|
|
34610
|
+
}
|
|
34586
34611
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
34587
34612
|
const execution = rawEvalCase.execution;
|
|
34588
34613
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
@@ -34613,7 +34638,8 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34613
34638
|
continue;
|
|
34614
34639
|
}
|
|
34615
34640
|
const rawName = asString(rawEvaluator.name);
|
|
34616
|
-
const
|
|
34641
|
+
const rawType = rawEvaluator.type;
|
|
34642
|
+
const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
|
|
34617
34643
|
const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
|
|
34618
34644
|
if (typeof typeValue !== "string") {
|
|
34619
34645
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
@@ -34646,25 +34672,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34646
34672
|
});
|
|
34647
34673
|
continue;
|
|
34648
34674
|
}
|
|
34649
|
-
if (typeValue === "
|
|
34675
|
+
if (typeValue === "code-judge") {
|
|
34650
34676
|
let command;
|
|
34651
34677
|
const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
|
|
34652
34678
|
if (typeof rawCommand === "string") {
|
|
34653
34679
|
const trimmed = rawCommand.trim();
|
|
34654
34680
|
if (trimmed.length === 0) {
|
|
34655
34681
|
throw new Error(
|
|
34656
|
-
`Invalid
|
|
34682
|
+
`Invalid code-judge command for evaluator '${name16}' in '${evalId}': command cannot be empty`
|
|
34657
34683
|
);
|
|
34658
34684
|
}
|
|
34659
34685
|
command = parseCommandToArgv(trimmed);
|
|
34660
34686
|
} else {
|
|
34661
34687
|
command = asStringArray(
|
|
34662
34688
|
rawCommand,
|
|
34663
|
-
`
|
|
34689
|
+
`code-judge command for evaluator '${name16}' in '${evalId}'`
|
|
34664
34690
|
);
|
|
34665
34691
|
}
|
|
34666
34692
|
if (!command) {
|
|
34667
|
-
logWarning2(`Skipping
|
|
34693
|
+
logWarning2(`Skipping code-judge evaluator '${name16}' in '${evalId}': missing command`);
|
|
34668
34694
|
continue;
|
|
34669
34695
|
}
|
|
34670
34696
|
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
@@ -34725,7 +34751,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34725
34751
|
}
|
|
34726
34752
|
evaluators.push({
|
|
34727
34753
|
name: name16,
|
|
34728
|
-
type: "code",
|
|
34754
|
+
type: "code-judge",
|
|
34729
34755
|
command,
|
|
34730
34756
|
cwd,
|
|
34731
34757
|
resolvedCwd,
|
|
@@ -34751,7 +34777,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34751
34777
|
continue;
|
|
34752
34778
|
}
|
|
34753
34779
|
const aggregatorType = asString(rawAggregator.type);
|
|
34754
|
-
if (aggregatorType !== "weighted_average" && aggregatorType !== "
|
|
34780
|
+
if (aggregatorType !== "weighted_average" && aggregatorType !== "code-judge" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
|
|
34755
34781
|
logWarning2(
|
|
34756
34782
|
`Skipping composite evaluator '${name16}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
34757
34783
|
);
|
|
@@ -34800,16 +34826,16 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34800
34826
|
type: "weighted_average",
|
|
34801
34827
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
34802
34828
|
};
|
|
34803
|
-
} else if (aggregatorType === "
|
|
34829
|
+
} else if (aggregatorType === "code-judge") {
|
|
34804
34830
|
const aggregatorPath = asString(rawAggregator.path);
|
|
34805
34831
|
if (!aggregatorPath) {
|
|
34806
34832
|
logWarning2(
|
|
34807
|
-
`Skipping composite evaluator '${name16}' in '${evalId}':
|
|
34833
|
+
`Skipping composite evaluator '${name16}' in '${evalId}': code-judge aggregator missing path`
|
|
34808
34834
|
);
|
|
34809
34835
|
continue;
|
|
34810
34836
|
}
|
|
34811
34837
|
aggregator = {
|
|
34812
|
-
type: "
|
|
34838
|
+
type: "code-judge",
|
|
34813
34839
|
path: aggregatorPath,
|
|
34814
34840
|
cwd: searchRoots[0]
|
|
34815
34841
|
};
|
|
@@ -34835,7 +34861,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34835
34861
|
}
|
|
34836
34862
|
}
|
|
34837
34863
|
aggregator = {
|
|
34838
|
-
type: "
|
|
34864
|
+
type: "llm-judge",
|
|
34839
34865
|
...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
|
|
34840
34866
|
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
34841
34867
|
};
|
|
@@ -34853,11 +34879,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34853
34879
|
});
|
|
34854
34880
|
continue;
|
|
34855
34881
|
}
|
|
34856
|
-
if (typeValue === "
|
|
34882
|
+
if (typeValue === "tool-trajectory") {
|
|
34857
34883
|
const mode = asString(rawEvaluator.mode);
|
|
34858
34884
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact" && mode !== "subset" && mode !== "superset") {
|
|
34859
34885
|
logWarning2(
|
|
34860
|
-
`Skipping
|
|
34886
|
+
`Skipping tool-trajectory evaluator '${name16}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
|
|
34861
34887
|
);
|
|
34862
34888
|
continue;
|
|
34863
34889
|
}
|
|
@@ -34866,7 +34892,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34866
34892
|
if (rawMinimums !== void 0) {
|
|
34867
34893
|
if (!isJsonObject2(rawMinimums)) {
|
|
34868
34894
|
logWarning2(
|
|
34869
|
-
`Skipping
|
|
34895
|
+
`Skipping tool-trajectory evaluator '${name16}' in '${evalId}': minimums must be an object`
|
|
34870
34896
|
);
|
|
34871
34897
|
continue;
|
|
34872
34898
|
}
|
|
@@ -34892,7 +34918,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34892
34918
|
argsMatch2 = rawArgsMatch;
|
|
34893
34919
|
} else {
|
|
34894
34920
|
logWarning2(
|
|
34895
|
-
`Invalid args_match '${rawArgsMatch}' for
|
|
34921
|
+
`Invalid args_match '${rawArgsMatch}' for tool-trajectory evaluator '${name16}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
|
|
34896
34922
|
);
|
|
34897
34923
|
}
|
|
34898
34924
|
}
|
|
@@ -34902,7 +34928,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34902
34928
|
if (rawExpected !== void 0) {
|
|
34903
34929
|
if (!Array.isArray(rawExpected)) {
|
|
34904
34930
|
logWarning2(
|
|
34905
|
-
`Skipping
|
|
34931
|
+
`Skipping tool-trajectory evaluator '${name16}' in '${evalId}': expected must be an array`
|
|
34906
34932
|
);
|
|
34907
34933
|
continue;
|
|
34908
34934
|
}
|
|
@@ -34948,13 +34974,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34948
34974
|
}
|
|
34949
34975
|
if (mode === "any_order" && !minimums) {
|
|
34950
34976
|
logWarning2(
|
|
34951
|
-
`Skipping
|
|
34977
|
+
`Skipping tool-trajectory evaluator '${name16}' in '${evalId}': any_order mode requires minimums`
|
|
34952
34978
|
);
|
|
34953
34979
|
continue;
|
|
34954
34980
|
}
|
|
34955
34981
|
if ((mode === "in_order" || mode === "exact" || mode === "subset" || mode === "superset") && !expected) {
|
|
34956
34982
|
logWarning2(
|
|
34957
|
-
`Skipping
|
|
34983
|
+
`Skipping tool-trajectory evaluator '${name16}' in '${evalId}': ${mode} mode requires expected`
|
|
34958
34984
|
);
|
|
34959
34985
|
continue;
|
|
34960
34986
|
}
|
|
@@ -34962,7 +34988,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34962
34988
|
const required22 = parseRequired(rawEvaluator.required);
|
|
34963
34989
|
const config22 = {
|
|
34964
34990
|
name: name16,
|
|
34965
|
-
type: "
|
|
34991
|
+
type: "tool-trajectory",
|
|
34966
34992
|
mode,
|
|
34967
34993
|
...minimums ? { minimums } : {},
|
|
34968
34994
|
...expected ? { expected } : {},
|
|
@@ -34974,17 +35000,17 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34974
35000
|
evaluators.push(config22);
|
|
34975
35001
|
continue;
|
|
34976
35002
|
}
|
|
34977
|
-
if (typeValue === "
|
|
35003
|
+
if (typeValue === "field-accuracy") {
|
|
34978
35004
|
const rawFields = rawEvaluator.fields;
|
|
34979
35005
|
if (!Array.isArray(rawFields)) {
|
|
34980
35006
|
logWarning2(
|
|
34981
|
-
`Skipping
|
|
35007
|
+
`Skipping field-accuracy evaluator '${name16}' in '${evalId}': missing fields array`
|
|
34982
35008
|
);
|
|
34983
35009
|
continue;
|
|
34984
35010
|
}
|
|
34985
35011
|
if (rawFields.length === 0) {
|
|
34986
35012
|
logWarning2(
|
|
34987
|
-
`Skipping
|
|
35013
|
+
`Skipping field-accuracy evaluator '${name16}' in '${evalId}': fields array is empty`
|
|
34988
35014
|
);
|
|
34989
35015
|
continue;
|
|
34990
35016
|
}
|
|
@@ -34992,7 +35018,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
34992
35018
|
for (const rawField of rawFields) {
|
|
34993
35019
|
if (!isJsonObject2(rawField)) {
|
|
34994
35020
|
logWarning2(
|
|
34995
|
-
`Skipping invalid field entry in
|
|
35021
|
+
`Skipping invalid field entry in field-accuracy evaluator '${name16}' (expected object)`
|
|
34996
35022
|
);
|
|
34997
35023
|
continue;
|
|
34998
35024
|
}
|
|
@@ -35000,13 +35026,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35000
35026
|
const match = asString(rawField.match);
|
|
35001
35027
|
if (!fieldPath) {
|
|
35002
35028
|
logWarning2(
|
|
35003
|
-
`Skipping field without path in
|
|
35029
|
+
`Skipping field without path in field-accuracy evaluator '${name16}' in '${evalId}'`
|
|
35004
35030
|
);
|
|
35005
35031
|
continue;
|
|
35006
35032
|
}
|
|
35007
35033
|
if (!match || !isValidFieldMatchType(match)) {
|
|
35008
35034
|
logWarning2(
|
|
35009
|
-
`Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name16}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a
|
|
35035
|
+
`Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name16}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code-judge evaluator.`
|
|
35010
35036
|
);
|
|
35011
35037
|
continue;
|
|
35012
35038
|
}
|
|
@@ -35023,7 +35049,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35023
35049
|
}
|
|
35024
35050
|
if (fields.length === 0) {
|
|
35025
35051
|
logWarning2(
|
|
35026
|
-
`Skipping
|
|
35052
|
+
`Skipping field-accuracy evaluator '${name16}' in '${evalId}': no valid fields found`
|
|
35027
35053
|
);
|
|
35028
35054
|
continue;
|
|
35029
35055
|
}
|
|
@@ -35033,7 +35059,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35033
35059
|
const required22 = parseRequired(rawEvaluator.required);
|
|
35034
35060
|
evaluators.push({
|
|
35035
35061
|
name: name16,
|
|
35036
|
-
type: "
|
|
35062
|
+
type: "field-accuracy",
|
|
35037
35063
|
fields,
|
|
35038
35064
|
...validAggregation ? { aggregation: validAggregation } : {},
|
|
35039
35065
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
@@ -35082,7 +35108,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35082
35108
|
});
|
|
35083
35109
|
continue;
|
|
35084
35110
|
}
|
|
35085
|
-
if (typeValue === "
|
|
35111
|
+
if (typeValue === "token-usage") {
|
|
35086
35112
|
const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
|
|
35087
35113
|
const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
|
|
35088
35114
|
const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
|
|
@@ -35096,7 +35122,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35096
35122
|
if (raw === void 0) continue;
|
|
35097
35123
|
if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
|
|
35098
35124
|
logWarning2(
|
|
35099
|
-
`Skipping
|
|
35125
|
+
`Skipping token-usage evaluator '${name16}' in '${evalId}': ${key} must be a non-negative finite number`
|
|
35100
35126
|
);
|
|
35101
35127
|
continue;
|
|
35102
35128
|
}
|
|
@@ -35104,7 +35130,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35104
35130
|
}
|
|
35105
35131
|
if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
|
|
35106
35132
|
logWarning2(
|
|
35107
|
-
`Skipping
|
|
35133
|
+
`Skipping token-usage evaluator '${name16}' in '${evalId}': must set at least one of max_total, max_input, max_output`
|
|
35108
35134
|
);
|
|
35109
35135
|
continue;
|
|
35110
35136
|
}
|
|
@@ -35112,7 +35138,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35112
35138
|
const required22 = parseRequired(rawEvaluator.required);
|
|
35113
35139
|
evaluators.push({
|
|
35114
35140
|
name: name16,
|
|
35115
|
-
type: "
|
|
35141
|
+
type: "token-usage",
|
|
35116
35142
|
...validLimits,
|
|
35117
35143
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35118
35144
|
...required22 !== void 0 ? { required: required22 } : {},
|
|
@@ -35120,7 +35146,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35120
35146
|
});
|
|
35121
35147
|
continue;
|
|
35122
35148
|
}
|
|
35123
|
-
if (typeValue === "
|
|
35149
|
+
if (typeValue === "execution-metrics") {
|
|
35124
35150
|
const maxToolCalls = rawEvaluator.max_tool_calls ?? rawEvaluator.maxToolCalls;
|
|
35125
35151
|
const maxLlmCalls = rawEvaluator.max_llm_calls ?? rawEvaluator.maxLlmCalls;
|
|
35126
35152
|
const maxTokens = rawEvaluator.max_tokens ?? rawEvaluator.maxTokens;
|
|
@@ -35143,7 +35169,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35143
35169
|
if (raw === void 0) continue;
|
|
35144
35170
|
if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
|
|
35145
35171
|
logWarning2(
|
|
35146
|
-
`Skipping
|
|
35172
|
+
`Skipping execution-metrics evaluator '${name16}' in '${evalId}': ${key} must be a non-negative finite number`
|
|
35147
35173
|
);
|
|
35148
35174
|
hasError = true;
|
|
35149
35175
|
break;
|
|
@@ -35156,7 +35182,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35156
35182
|
const hasThreshold = validThresholds.max_tool_calls !== void 0 || validThresholds.max_llm_calls !== void 0 || validThresholds.max_tokens !== void 0 || validThresholds.max_cost_usd !== void 0 || validThresholds.max_duration_ms !== void 0 || validThresholds.target_exploration_ratio !== void 0;
|
|
35157
35183
|
if (!hasThreshold) {
|
|
35158
35184
|
logWarning2(
|
|
35159
|
-
`Skipping
|
|
35185
|
+
`Skipping execution-metrics evaluator '${name16}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
|
|
35160
35186
|
);
|
|
35161
35187
|
continue;
|
|
35162
35188
|
}
|
|
@@ -35164,7 +35190,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35164
35190
|
const required22 = parseRequired(rawEvaluator.required);
|
|
35165
35191
|
evaluators.push({
|
|
35166
35192
|
name: name16,
|
|
35167
|
-
type: "
|
|
35193
|
+
type: "execution-metrics",
|
|
35168
35194
|
...validThresholds,
|
|
35169
35195
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35170
35196
|
...required22 !== void 0 ? { required: required22 } : {},
|
|
@@ -35172,13 +35198,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35172
35198
|
});
|
|
35173
35199
|
continue;
|
|
35174
35200
|
}
|
|
35175
|
-
if (typeValue === "
|
|
35201
|
+
if (typeValue === "agent-judge") {
|
|
35176
35202
|
const rawMaxSteps = rawEvaluator.max_steps ?? rawEvaluator.maxSteps;
|
|
35177
35203
|
let maxSteps;
|
|
35178
35204
|
if (rawMaxSteps !== void 0) {
|
|
35179
35205
|
if (typeof rawMaxSteps !== "number" || !Number.isInteger(rawMaxSteps) || rawMaxSteps < 1 || rawMaxSteps > 50) {
|
|
35180
35206
|
logWarning2(
|
|
35181
|
-
`Skipping
|
|
35207
|
+
`Skipping agent-judge evaluator '${name16}' in '${evalId}': max_steps must be an integer 1-50`
|
|
35182
35208
|
);
|
|
35183
35209
|
continue;
|
|
35184
35210
|
}
|
|
@@ -35189,7 +35215,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35189
35215
|
if (rawTemperature !== void 0) {
|
|
35190
35216
|
if (typeof rawTemperature !== "number" || rawTemperature < 0 || rawTemperature > 2) {
|
|
35191
35217
|
logWarning2(
|
|
35192
|
-
`Skipping
|
|
35218
|
+
`Skipping agent-judge evaluator '${name16}' in '${evalId}': temperature must be a number 0-2`
|
|
35193
35219
|
);
|
|
35194
35220
|
continue;
|
|
35195
35221
|
}
|
|
@@ -35212,7 +35238,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35212
35238
|
const required22 = parseRequired(rawEvaluator.required);
|
|
35213
35239
|
evaluators.push({
|
|
35214
35240
|
name: name16,
|
|
35215
|
-
type: "
|
|
35241
|
+
type: "agent-judge",
|
|
35216
35242
|
...agentPrompt ? { prompt: agentPrompt } : {},
|
|
35217
35243
|
...agentPromptPath ? { promptPath: agentPromptPath, resolvedPromptPath: agentPromptPath } : {},
|
|
35218
35244
|
...agentParsedRubrics && agentParsedRubrics.length > 0 ? { rubrics: agentParsedRubrics } : {},
|
|
@@ -35243,7 +35269,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35243
35269
|
});
|
|
35244
35270
|
continue;
|
|
35245
35271
|
}
|
|
35246
|
-
if (typeValue === "
|
|
35272
|
+
if (typeValue === "contains-any" || typeValue === "contains-all") {
|
|
35247
35273
|
const value = asStringArrayStrict(rawEvaluator.value);
|
|
35248
35274
|
if (!value || value.length === 0) {
|
|
35249
35275
|
logWarning2(
|
|
@@ -35281,7 +35307,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35281
35307
|
});
|
|
35282
35308
|
continue;
|
|
35283
35309
|
}
|
|
35284
|
-
if (typeValue === "
|
|
35310
|
+
if (typeValue === "icontains-any" || typeValue === "icontains-all") {
|
|
35285
35311
|
const value = asStringArrayStrict(rawEvaluator.value);
|
|
35286
35312
|
if (!value || value.length === 0) {
|
|
35287
35313
|
logWarning2(
|
|
@@ -35301,7 +35327,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35301
35327
|
});
|
|
35302
35328
|
continue;
|
|
35303
35329
|
}
|
|
35304
|
-
if (typeValue === "
|
|
35330
|
+
if (typeValue === "starts-with" || typeValue === "ends-with") {
|
|
35305
35331
|
const value = asString(rawEvaluator.value);
|
|
35306
35332
|
if (!value) {
|
|
35307
35333
|
logWarning2(`Skipping ${typeValue} evaluator '${name16}' in '${evalId}': missing value`);
|
|
@@ -35339,12 +35365,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35339
35365
|
});
|
|
35340
35366
|
continue;
|
|
35341
35367
|
}
|
|
35342
|
-
if (typeValue === "
|
|
35368
|
+
if (typeValue === "is-json") {
|
|
35343
35369
|
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35344
35370
|
const required22 = parseRequired(rawEvaluator.required);
|
|
35345
35371
|
evaluators.push({
|
|
35346
35372
|
name: name16,
|
|
35347
|
-
type: "
|
|
35373
|
+
type: "is-json",
|
|
35348
35374
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35349
35375
|
...required22 !== void 0 ? { required: required22 } : {},
|
|
35350
35376
|
...negate !== void 0 ? { negate } : {}
|
|
@@ -35392,7 +35418,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35392
35418
|
const required22 = parseRequired(rawEvaluator.required);
|
|
35393
35419
|
evaluators.push({
|
|
35394
35420
|
name: name16,
|
|
35395
|
-
type: "
|
|
35421
|
+
type: "llm-judge",
|
|
35396
35422
|
rubrics: parsedCriteria,
|
|
35397
35423
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35398
35424
|
...required22 !== void 0 ? { required: required22 } : {},
|
|
@@ -35459,7 +35485,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35459
35485
|
const required22 = parseRequired(rawEvaluator.required);
|
|
35460
35486
|
evaluators.push({
|
|
35461
35487
|
name: name16,
|
|
35462
|
-
type: "
|
|
35488
|
+
type: "llm-judge",
|
|
35463
35489
|
rubrics: parsedRubrics,
|
|
35464
35490
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
35465
35491
|
...required22 !== void 0 ? { required: required22 } : {},
|
|
@@ -35491,7 +35517,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35491
35517
|
const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
|
|
35492
35518
|
evaluators.push({
|
|
35493
35519
|
name: name16,
|
|
35494
|
-
type: "
|
|
35520
|
+
type: "llm-judge",
|
|
35495
35521
|
prompt,
|
|
35496
35522
|
promptPath,
|
|
35497
35523
|
...promptPath ? { resolvedPromptPath: promptPath } : {},
|
|
@@ -35507,15 +35533,15 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
35507
35533
|
}
|
|
35508
35534
|
var ASSERTION_TYPES = /* @__PURE__ */ new Set([
|
|
35509
35535
|
"contains",
|
|
35510
|
-
"
|
|
35511
|
-
"
|
|
35536
|
+
"contains-any",
|
|
35537
|
+
"contains-all",
|
|
35512
35538
|
"icontains",
|
|
35513
|
-
"
|
|
35514
|
-
"
|
|
35515
|
-
"
|
|
35516
|
-
"
|
|
35539
|
+
"icontains-any",
|
|
35540
|
+
"icontains-all",
|
|
35541
|
+
"starts-with",
|
|
35542
|
+
"ends-with",
|
|
35517
35543
|
"regex",
|
|
35518
|
-
"
|
|
35544
|
+
"is-json",
|
|
35519
35545
|
"equals",
|
|
35520
35546
|
"rubrics"
|
|
35521
35547
|
]);
|
|
@@ -35528,24 +35554,24 @@ function generateAssertionName(typeValue, rawEvaluator) {
|
|
|
35528
35554
|
switch (typeValue) {
|
|
35529
35555
|
case "contains":
|
|
35530
35556
|
return value ? `contains-${value}` : "contains";
|
|
35531
|
-
case "
|
|
35532
|
-
return arrayValue ? `
|
|
35533
|
-
case "
|
|
35534
|
-
return arrayValue ? `
|
|
35557
|
+
case "contains-any":
|
|
35558
|
+
return arrayValue ? `contains-any-${arrayValue.length}` : "contains-any";
|
|
35559
|
+
case "contains-all":
|
|
35560
|
+
return arrayValue ? `contains-all-${arrayValue.length}` : "contains-all";
|
|
35535
35561
|
case "icontains":
|
|
35536
35562
|
return value ? `icontains-${value}` : "icontains";
|
|
35537
|
-
case "
|
|
35538
|
-
return arrayValue ? `
|
|
35539
|
-
case "
|
|
35540
|
-
return arrayValue ? `
|
|
35541
|
-
case "
|
|
35542
|
-
return value ? `
|
|
35543
|
-
case "
|
|
35544
|
-
return value ? `
|
|
35563
|
+
case "icontains-any":
|
|
35564
|
+
return arrayValue ? `icontains-any-${arrayValue.length}` : "icontains-any";
|
|
35565
|
+
case "icontains-all":
|
|
35566
|
+
return arrayValue ? `icontains-all-${arrayValue.length}` : "icontains-all";
|
|
35567
|
+
case "starts-with":
|
|
35568
|
+
return value ? `starts-with-${value}` : "starts-with";
|
|
35569
|
+
case "ends-with":
|
|
35570
|
+
return value ? `ends-with-${value}` : "ends-with";
|
|
35545
35571
|
case "regex":
|
|
35546
35572
|
return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
|
|
35547
|
-
case "
|
|
35548
|
-
return "
|
|
35573
|
+
case "is-json":
|
|
35574
|
+
return "is-json";
|
|
35549
35575
|
case "equals":
|
|
35550
35576
|
return value ? `equals-${value}` : "equals";
|
|
35551
35577
|
case "rubrics":
|
|
@@ -35558,8 +35584,9 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
35558
35584
|
if (typeof candidate !== "string") {
|
|
35559
35585
|
return void 0;
|
|
35560
35586
|
}
|
|
35561
|
-
|
|
35562
|
-
|
|
35587
|
+
const normalized = normalizeEvaluatorType(candidate);
|
|
35588
|
+
if (isEvaluatorKind(normalized)) {
|
|
35589
|
+
return normalized;
|
|
35563
35590
|
}
|
|
35564
35591
|
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
35565
35592
|
return void 0;
|
|
@@ -35605,6 +35632,16 @@ function parseCommandToArgv(command) {
|
|
|
35605
35632
|
function isJsonObject2(value) {
|
|
35606
35633
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
35607
35634
|
}
|
|
35635
|
+
var CRITERIA_CONSUMER_TYPES = /* @__PURE__ */ new Set(["llm-judge", "agent-judge", "code-judge"]);
|
|
35636
|
+
function warnUnconsumedCriteria(criteria, evaluators, testId) {
|
|
35637
|
+
if (!criteria?.trim() || !evaluators || evaluators.length === 0) return;
|
|
35638
|
+
const hasConsumer = evaluators.some((e) => CRITERIA_CONSUMER_TYPES.has(e.type));
|
|
35639
|
+
if (!hasConsumer) {
|
|
35640
|
+
logWarning2(
|
|
35641
|
+
`Test '${testId}': criteria is defined but no evaluator in assert will evaluate it. Add 'type: llm-judge' to assert, or remove criteria if it is documentation-only.`
|
|
35642
|
+
);
|
|
35643
|
+
}
|
|
35644
|
+
}
|
|
35608
35645
|
function logWarning2(message, details) {
|
|
35609
35646
|
if (details && details.length > 0) {
|
|
35610
35647
|
const detailBlock = details.join("\n");
|
|
@@ -35854,7 +35891,7 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
35854
35891
|
}
|
|
35855
35892
|
return {
|
|
35856
35893
|
name: "rubric",
|
|
35857
|
-
type: "
|
|
35894
|
+
type: "llm-judge",
|
|
35858
35895
|
rubrics: rubricItems
|
|
35859
35896
|
};
|
|
35860
35897
|
}
|
|
@@ -36221,7 +36258,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
36221
36258
|
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
36222
36259
|
const fallbackDataset = path6.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
36223
36260
|
const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
|
|
36224
|
-
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "
|
|
36261
|
+
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-judge";
|
|
36225
36262
|
const globalExecution = sidecar.execution;
|
|
36226
36263
|
if (verbose) {
|
|
36227
36264
|
console.log(`
|
|
@@ -36309,6 +36346,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
36309
36346
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
36310
36347
|
}
|
|
36311
36348
|
}
|
|
36349
|
+
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
36312
36350
|
const userFilePaths = [];
|
|
36313
36351
|
for (const segment of inputSegments) {
|
|
36314
36352
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -36653,13 +36691,15 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
36653
36691
|
}
|
|
36654
36692
|
const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
|
|
36655
36693
|
const metadata = parseMetadata(parsed);
|
|
36694
|
+
const failOnError = extractFailOnError(parsed);
|
|
36656
36695
|
return {
|
|
36657
36696
|
tests,
|
|
36658
36697
|
trials: extractTrialsConfig(parsed),
|
|
36659
36698
|
targets: extractTargetsFromSuite(parsed),
|
|
36660
36699
|
cacheConfig: extractCacheConfig(parsed),
|
|
36661
36700
|
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
36662
|
-
...metadata !== void 0 && { metadata }
|
|
36701
|
+
...metadata !== void 0 && { metadata },
|
|
36702
|
+
...failOnError !== void 0 && { failOnError }
|
|
36663
36703
|
};
|
|
36664
36704
|
}
|
|
36665
36705
|
var loadEvalSuite = loadTestSuite;
|
|
@@ -36690,7 +36730,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
36690
36730
|
const fallbackDataset = path8.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
36691
36731
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
36692
36732
|
const rawTestcases = resolveTests(suite);
|
|
36693
|
-
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "
|
|
36733
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-judge";
|
|
36694
36734
|
const evalFileDir = path8.dirname(absoluteTestPath);
|
|
36695
36735
|
let expandedTestcases;
|
|
36696
36736
|
if (typeof rawTestcases === "string") {
|
|
@@ -36787,6 +36827,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
36787
36827
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
36788
36828
|
}
|
|
36789
36829
|
}
|
|
36830
|
+
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
36790
36831
|
const userFilePaths = [];
|
|
36791
36832
|
for (const segment of inputSegments) {
|
|
36792
36833
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -42588,7 +42629,7 @@ function toCamelCaseDeep(obj) {
|
|
|
42588
42629
|
}
|
|
42589
42630
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
42590
42631
|
var CodeEvaluator = class {
|
|
42591
|
-
kind = "code";
|
|
42632
|
+
kind = "code-judge";
|
|
42592
42633
|
command;
|
|
42593
42634
|
cwd;
|
|
42594
42635
|
agentTimeoutMs;
|
|
@@ -42789,7 +42830,7 @@ var scoreRangeEvaluationSchema = external_exports.object({
|
|
|
42789
42830
|
overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)").optional()
|
|
42790
42831
|
});
|
|
42791
42832
|
var LlmJudgeEvaluator = class {
|
|
42792
|
-
kind = "
|
|
42833
|
+
kind = "llm-judge";
|
|
42793
42834
|
resolveJudgeProvider;
|
|
42794
42835
|
maxOutputTokens;
|
|
42795
42836
|
temperature;
|
|
@@ -42806,7 +42847,7 @@ var LlmJudgeEvaluator = class {
|
|
|
42806
42847
|
throw new Error("No judge provider available for LLM grading");
|
|
42807
42848
|
}
|
|
42808
42849
|
const config2 = context.evaluator;
|
|
42809
|
-
if (config2?.type === "
|
|
42850
|
+
if (config2?.type === "llm-judge" && config2.rubrics && config2.rubrics.length > 0) {
|
|
42810
42851
|
return this.evaluateWithRubrics(context, judgeProvider, config2.rubrics);
|
|
42811
42852
|
}
|
|
42812
42853
|
return this.evaluateFreeform(context, judgeProvider);
|
|
@@ -42880,7 +42921,7 @@ ${context.fileChanges}`;
|
|
|
42880
42921
|
async evaluateWithRubrics(context, judgeProvider, rubrics) {
|
|
42881
42922
|
if (!rubrics || rubrics.length === 0) {
|
|
42882
42923
|
throw new Error(
|
|
42883
|
-
`No rubrics found for evaluator "${context.evaluator?.name ?? "
|
|
42924
|
+
`No rubrics found for evaluator "${context.evaluator?.name ?? "llm-judge"}". Run "agentv generate rubrics" first.`
|
|
42884
42925
|
);
|
|
42885
42926
|
}
|
|
42886
42927
|
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
@@ -43214,9 +43255,9 @@ var CompositeEvaluator = class {
|
|
|
43214
43255
|
async aggregate(results, context) {
|
|
43215
43256
|
const aggregator = this.config.aggregator;
|
|
43216
43257
|
switch (aggregator.type) {
|
|
43217
|
-
case "
|
|
43258
|
+
case "code-judge":
|
|
43218
43259
|
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
43219
|
-
case "
|
|
43260
|
+
case "llm-judge":
|
|
43220
43261
|
return this.runLlmAggregator(results, context, aggregator);
|
|
43221
43262
|
case "threshold":
|
|
43222
43263
|
return this.runThreshold(results, aggregator.threshold);
|
|
@@ -43359,7 +43400,7 @@ var CompositeEvaluator = class {
|
|
|
43359
43400
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
43360
43401
|
reasoning,
|
|
43361
43402
|
evaluatorRawRequest: {
|
|
43362
|
-
aggregator: "
|
|
43403
|
+
aggregator: "code-judge",
|
|
43363
43404
|
script: scriptPath
|
|
43364
43405
|
},
|
|
43365
43406
|
scores
|
|
@@ -43374,7 +43415,7 @@ var CompositeEvaluator = class {
|
|
|
43374
43415
|
expectedAspectCount: 1,
|
|
43375
43416
|
reasoning: message,
|
|
43376
43417
|
evaluatorRawRequest: {
|
|
43377
|
-
aggregator: "
|
|
43418
|
+
aggregator: "code-judge",
|
|
43378
43419
|
script: scriptPath,
|
|
43379
43420
|
error: message
|
|
43380
43421
|
},
|
|
@@ -43405,7 +43446,7 @@ var CompositeEvaluator = class {
|
|
|
43405
43446
|
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
43406
43447
|
const systemPrompt = buildOutputSchema();
|
|
43407
43448
|
const evaluatorRawRequest = {
|
|
43408
|
-
aggregator: "
|
|
43449
|
+
aggregator: "llm-judge",
|
|
43409
43450
|
userPrompt,
|
|
43410
43451
|
systemPrompt,
|
|
43411
43452
|
target: judgeProvider.targetName
|
|
@@ -43513,7 +43554,7 @@ var CostEvaluator = class {
|
|
|
43513
43554
|
}
|
|
43514
43555
|
};
|
|
43515
43556
|
var ExecutionMetricsEvaluator = class {
|
|
43516
|
-
kind = "
|
|
43557
|
+
kind = "execution-metrics";
|
|
43517
43558
|
config;
|
|
43518
43559
|
constructor(options) {
|
|
43519
43560
|
this.config = options.config;
|
|
@@ -43539,7 +43580,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
43539
43580
|
expectedAspectCount: 1,
|
|
43540
43581
|
reasoning: "Execution metrics not available - no trace summary provided",
|
|
43541
43582
|
evaluatorRawRequest: {
|
|
43542
|
-
type: "
|
|
43583
|
+
type: "execution-metrics",
|
|
43543
43584
|
config: this.extractConfiguredThresholds(),
|
|
43544
43585
|
actual: null
|
|
43545
43586
|
}
|
|
@@ -43648,7 +43689,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
43648
43689
|
if (actualMetrics.exploration_ratio !== void 0) {
|
|
43649
43690
|
reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
|
|
43650
43691
|
}
|
|
43651
|
-
const reasoning = reasoningParts.length > 0 ? `
|
|
43692
|
+
const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
|
|
43652
43693
|
return {
|
|
43653
43694
|
score,
|
|
43654
43695
|
verdict: scoreToVerdict(score),
|
|
@@ -43657,7 +43698,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
43657
43698
|
expectedAspectCount: totalChecks || 1,
|
|
43658
43699
|
reasoning,
|
|
43659
43700
|
evaluatorRawRequest: {
|
|
43660
|
-
type: "
|
|
43701
|
+
type: "execution-metrics",
|
|
43661
43702
|
config: this.extractConfiguredThresholds(),
|
|
43662
43703
|
actual: this.filterDefinedMetrics(actualMetrics)
|
|
43663
43704
|
}
|
|
@@ -43743,7 +43784,7 @@ var MONTH_NAMES = {
|
|
|
43743
43784
|
december: 11
|
|
43744
43785
|
};
|
|
43745
43786
|
var FieldAccuracyEvaluator = class {
|
|
43746
|
-
kind = "
|
|
43787
|
+
kind = "field-accuracy";
|
|
43747
43788
|
config;
|
|
43748
43789
|
constructor(options) {
|
|
43749
43790
|
this.config = options.config;
|
|
@@ -44189,7 +44230,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
|
44189
44230
|
".dylib"
|
|
44190
44231
|
]);
|
|
44191
44232
|
var AgentJudgeEvaluator = class {
|
|
44192
|
-
kind = "
|
|
44233
|
+
kind = "agent-judge";
|
|
44193
44234
|
resolveJudgeProvider;
|
|
44194
44235
|
maxSteps;
|
|
44195
44236
|
temperature;
|
|
@@ -44214,24 +44255,24 @@ var AgentJudgeEvaluator = class {
|
|
|
44214
44255
|
async evaluateBuiltIn(context) {
|
|
44215
44256
|
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
44216
44257
|
if (!judgeProvider) {
|
|
44217
|
-
throw new Error("No judge provider available for
|
|
44258
|
+
throw new Error("No judge provider available for agent-judge evaluation");
|
|
44218
44259
|
}
|
|
44219
44260
|
const model = judgeProvider.asLanguageModel?.();
|
|
44220
44261
|
if (!model) {
|
|
44221
44262
|
throw new Error(
|
|
44222
|
-
`Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in
|
|
44263
|
+
`Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent-judge mode`
|
|
44223
44264
|
);
|
|
44224
44265
|
}
|
|
44225
44266
|
const workspacePath = context.workspacePath;
|
|
44226
44267
|
if (!workspacePath) {
|
|
44227
44268
|
throw new Error(
|
|
44228
|
-
"
|
|
44269
|
+
"agent-judge evaluator requires a workspace_template target (workspacePath is not set)"
|
|
44229
44270
|
);
|
|
44230
44271
|
}
|
|
44231
44272
|
const systemPrompt = this.buildSystemPrompt(context);
|
|
44232
44273
|
const userPrompt = this.buildUserPrompt(context);
|
|
44233
44274
|
const config2 = context.evaluator;
|
|
44234
|
-
const rubrics = config2?.type === "
|
|
44275
|
+
const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
|
|
44235
44276
|
const fsTools = createFilesystemTools(workspacePath);
|
|
44236
44277
|
const evaluatorRawRequest = {
|
|
44237
44278
|
mode: "built-in",
|
|
@@ -44262,7 +44303,7 @@ var AgentJudgeEvaluator = class {
|
|
|
44262
44303
|
score: 0,
|
|
44263
44304
|
verdict: "fail",
|
|
44264
44305
|
hits: [],
|
|
44265
|
-
misses: [`
|
|
44306
|
+
misses: [`agent-judge built-in evaluation failed: ${message}`],
|
|
44266
44307
|
expectedAspectCount: 1,
|
|
44267
44308
|
evaluatorRawRequest,
|
|
44268
44309
|
details: { mode: "built-in", error: message }
|
|
@@ -44294,14 +44335,14 @@ var AgentJudgeEvaluator = class {
|
|
|
44294
44335
|
score: 0,
|
|
44295
44336
|
verdict: "fail",
|
|
44296
44337
|
hits: [],
|
|
44297
|
-
misses: ["
|
|
44338
|
+
misses: ["agent-judge judge_target returned no assistant response"],
|
|
44298
44339
|
expectedAspectCount: 1,
|
|
44299
44340
|
evaluatorRawRequest,
|
|
44300
44341
|
details: { mode: "judge_target", judge_target: provider.targetName }
|
|
44301
44342
|
};
|
|
44302
44343
|
}
|
|
44303
44344
|
const config2 = context.evaluator;
|
|
44304
|
-
const rubrics = config2?.type === "
|
|
44345
|
+
const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
|
|
44305
44346
|
const details = {
|
|
44306
44347
|
mode: "judge_target",
|
|
44307
44348
|
judge_target: provider.targetName
|
|
@@ -44313,7 +44354,7 @@ var AgentJudgeEvaluator = class {
|
|
|
44313
44354
|
score: 0,
|
|
44314
44355
|
verdict: "fail",
|
|
44315
44356
|
hits: [],
|
|
44316
|
-
misses: [`
|
|
44357
|
+
misses: [`agent-judge judge_target evaluation failed: ${message}`],
|
|
44317
44358
|
expectedAspectCount: 1,
|
|
44318
44359
|
evaluatorRawRequest,
|
|
44319
44360
|
details: {
|
|
@@ -44364,7 +44405,7 @@ var AgentJudgeEvaluator = class {
|
|
|
44364
44405
|
score: 0,
|
|
44365
44406
|
verdict: "fail",
|
|
44366
44407
|
hits: [],
|
|
44367
|
-
misses: ["Failed to parse
|
|
44408
|
+
misses: ["Failed to parse agent-judge response as valid evaluation JSON"],
|
|
44368
44409
|
expectedAspectCount: 1,
|
|
44369
44410
|
evaluatorRawRequest,
|
|
44370
44411
|
details
|
|
@@ -44377,7 +44418,7 @@ var AgentJudgeEvaluator = class {
|
|
|
44377
44418
|
*/
|
|
44378
44419
|
buildSystemPrompt(context) {
|
|
44379
44420
|
const config2 = context.evaluator;
|
|
44380
|
-
const rubrics = config2?.type === "
|
|
44421
|
+
const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
|
|
44381
44422
|
const parts = [
|
|
44382
44423
|
"You are an expert evaluator with access to the workspace filesystem.",
|
|
44383
44424
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
@@ -44408,7 +44449,7 @@ var AgentJudgeEvaluator = class {
|
|
|
44408
44449
|
return substituteVariables(this.evaluatorTemplate, variables);
|
|
44409
44450
|
}
|
|
44410
44451
|
const config2 = context.evaluator;
|
|
44411
|
-
const rubrics = config2?.type === "
|
|
44452
|
+
const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
|
|
44412
44453
|
const parts = [
|
|
44413
44454
|
"Evaluate the candidate answer by investigating the workspace.",
|
|
44414
44455
|
"",
|
|
@@ -44451,7 +44492,7 @@ var AgentJudgeEvaluator = class {
|
|
|
44451
44492
|
buildDelegatedPrompt(context) {
|
|
44452
44493
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
44453
44494
|
const config2 = context.evaluator;
|
|
44454
|
-
const rubrics = config2?.type === "
|
|
44495
|
+
const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
|
|
44455
44496
|
if (this.evaluatorTemplate) {
|
|
44456
44497
|
const variables = {
|
|
44457
44498
|
[TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
|
|
@@ -44533,11 +44574,11 @@ function createFilesystemTools(workspacePath) {
|
|
|
44533
44574
|
execute: async (input) => {
|
|
44534
44575
|
try {
|
|
44535
44576
|
const resolved = resolveSandboxed(workspacePath, input.path);
|
|
44536
|
-
const
|
|
44537
|
-
if (
|
|
44577
|
+
const stat8 = await fs2.stat(resolved);
|
|
44578
|
+
if (stat8.isDirectory()) {
|
|
44538
44579
|
return { error: `'${input.path}' is a directory, not a file` };
|
|
44539
44580
|
}
|
|
44540
|
-
const buffer = Buffer.alloc(Math.min(
|
|
44581
|
+
const buffer = Buffer.alloc(Math.min(stat8.size, MAX_FILE_SIZE));
|
|
44541
44582
|
const fd = await fs2.open(resolved, "r");
|
|
44542
44583
|
try {
|
|
44543
44584
|
await fd.read(buffer, 0, buffer.length, 0);
|
|
@@ -44545,8 +44586,8 @@ function createFilesystemTools(workspacePath) {
|
|
|
44545
44586
|
await fd.close();
|
|
44546
44587
|
}
|
|
44547
44588
|
const content = buffer.toString("utf-8");
|
|
44548
|
-
const truncated =
|
|
44549
|
-
return { content, truncated, size:
|
|
44589
|
+
const truncated = stat8.size > MAX_FILE_SIZE;
|
|
44590
|
+
return { content, truncated, size: stat8.size };
|
|
44550
44591
|
} catch (error40) {
|
|
44551
44592
|
return { error: error40 instanceof Error ? error40.message : String(error40) };
|
|
44552
44593
|
}
|
|
@@ -44590,8 +44631,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
44590
44631
|
const ext = path30.extname(entry.name).toLowerCase();
|
|
44591
44632
|
if (BINARY_EXTENSIONS.has(ext)) continue;
|
|
44592
44633
|
try {
|
|
44593
|
-
const
|
|
44594
|
-
if (
|
|
44634
|
+
const stat8 = await fs2.stat(fullPath);
|
|
44635
|
+
if (stat8.size > MAX_FILE_SIZE) continue;
|
|
44595
44636
|
const content = await fs2.readFile(fullPath, "utf-8");
|
|
44596
44637
|
const lines = content.split("\n");
|
|
44597
44638
|
for (let i = 0; i < lines.length; i++) {
|
|
@@ -44749,7 +44790,7 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
44749
44790
|
};
|
|
44750
44791
|
}
|
|
44751
44792
|
var TokenUsageEvaluator = class {
|
|
44752
|
-
kind = "
|
|
44793
|
+
kind = "token-usage";
|
|
44753
44794
|
config;
|
|
44754
44795
|
constructor(options) {
|
|
44755
44796
|
this.config = options.config;
|
|
@@ -44772,7 +44813,7 @@ var TokenUsageEvaluator = class {
|
|
|
44772
44813
|
expectedAspectCount,
|
|
44773
44814
|
reasoning: "Token usage not reported by provider",
|
|
44774
44815
|
evaluatorRawRequest: {
|
|
44775
|
-
type: "
|
|
44816
|
+
type: "token-usage",
|
|
44776
44817
|
max_total: maxTotal ?? null,
|
|
44777
44818
|
max_input: maxInput ?? null,
|
|
44778
44819
|
max_output: maxOutput ?? null,
|
|
@@ -44814,9 +44855,9 @@ var TokenUsageEvaluator = class {
|
|
|
44814
44855
|
hits,
|
|
44815
44856
|
misses,
|
|
44816
44857
|
expectedAspectCount,
|
|
44817
|
-
reasoning: `
|
|
44858
|
+
reasoning: `token-usage input=${input}, output=${output}, cached=${cached2}, total=${total}`,
|
|
44818
44859
|
evaluatorRawRequest: {
|
|
44819
|
-
type: "
|
|
44860
|
+
type: "token-usage",
|
|
44820
44861
|
max_total: maxTotal ?? null,
|
|
44821
44862
|
max_input: maxInput ?? null,
|
|
44822
44863
|
max_output: maxOutput ?? null,
|
|
@@ -44899,7 +44940,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
|
|
|
44899
44940
|
};
|
|
44900
44941
|
}
|
|
44901
44942
|
var ToolTrajectoryEvaluator = class {
|
|
44902
|
-
kind = "
|
|
44943
|
+
kind = "tool-trajectory";
|
|
44903
44944
|
config;
|
|
44904
44945
|
constructor(options) {
|
|
44905
44946
|
this.config = options.config;
|
|
@@ -45087,7 +45128,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
45087
45128
|
}
|
|
45088
45129
|
}
|
|
45089
45130
|
for (const warning of warnings) {
|
|
45090
|
-
console.warn(`[
|
|
45131
|
+
console.warn(`[tool-trajectory] ${warning}`);
|
|
45091
45132
|
}
|
|
45092
45133
|
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
45093
45134
|
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
@@ -45163,7 +45204,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
45163
45204
|
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
45164
45205
|
}
|
|
45165
45206
|
for (const warning of warnings) {
|
|
45166
|
-
console.warn(`[
|
|
45207
|
+
console.warn(`[tool-trajectory] ${warning}`);
|
|
45167
45208
|
}
|
|
45168
45209
|
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
45169
45210
|
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
@@ -45631,7 +45672,7 @@ var llmJudgeFactory = (config2, context) => {
|
|
|
45631
45672
|
const c = config2;
|
|
45632
45673
|
const { llmJudge, agentTimeoutMs } = context;
|
|
45633
45674
|
return {
|
|
45634
|
-
kind: "
|
|
45675
|
+
kind: "llm-judge",
|
|
45635
45676
|
async evaluate(evalContext) {
|
|
45636
45677
|
const customPrompt = await resolveCustomPrompt(
|
|
45637
45678
|
c,
|
|
@@ -45720,7 +45761,7 @@ var agentJudgeFactory = (config2, context) => {
|
|
|
45720
45761
|
customPrompt = readFileSync(c.resolvedPromptPath, "utf-8");
|
|
45721
45762
|
} catch (error40) {
|
|
45722
45763
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
45723
|
-
console.warn(`Could not read
|
|
45764
|
+
console.warn(`Could not read agent-judge prompt at ${c.resolvedPromptPath}: ${message}`);
|
|
45724
45765
|
}
|
|
45725
45766
|
} else if (c.prompt) {
|
|
45726
45767
|
customPrompt = c.prompt;
|
|
@@ -45730,7 +45771,7 @@ var agentJudgeFactory = (config2, context) => {
|
|
|
45730
45771
|
judgeTargetProvider = targetResolver(c.target);
|
|
45731
45772
|
if (!judgeTargetProvider) {
|
|
45732
45773
|
throw new Error(
|
|
45733
|
-
`
|
|
45774
|
+
`agent-judge evaluator '${c.name}': target '${c.target}' not found in targets`
|
|
45734
45775
|
);
|
|
45735
45776
|
}
|
|
45736
45777
|
}
|
|
@@ -45774,7 +45815,7 @@ var regexFactory = (config2) => {
|
|
|
45774
45815
|
});
|
|
45775
45816
|
};
|
|
45776
45817
|
var isJsonFactory = () => {
|
|
45777
|
-
return new DeterministicAssertionEvaluator("
|
|
45818
|
+
return new DeterministicAssertionEvaluator("is-json", (ctx) => {
|
|
45778
45819
|
const result = runIsJsonAssertion(ctx.candidate);
|
|
45779
45820
|
return {
|
|
45780
45821
|
score: result.score,
|
|
@@ -45802,7 +45843,7 @@ var equalsFactory = (config2) => {
|
|
|
45802
45843
|
};
|
|
45803
45844
|
var containsAnyFactory = (config2) => {
|
|
45804
45845
|
const c = config2;
|
|
45805
|
-
return new DeterministicAssertionEvaluator("
|
|
45846
|
+
return new DeterministicAssertionEvaluator("contains-any", (ctx) => {
|
|
45806
45847
|
const result = runContainsAnyAssertion(ctx.candidate, c.value);
|
|
45807
45848
|
return {
|
|
45808
45849
|
score: result.score,
|
|
@@ -45816,7 +45857,7 @@ var containsAnyFactory = (config2) => {
|
|
|
45816
45857
|
};
|
|
45817
45858
|
var containsAllFactory = (config2) => {
|
|
45818
45859
|
const c = config2;
|
|
45819
|
-
return new DeterministicAssertionEvaluator("
|
|
45860
|
+
return new DeterministicAssertionEvaluator("contains-all", (ctx) => {
|
|
45820
45861
|
const result = runContainsAllAssertion(ctx.candidate, c.value);
|
|
45821
45862
|
return {
|
|
45822
45863
|
score: result.score,
|
|
@@ -45844,7 +45885,7 @@ var icontainsFactory = (config2) => {
|
|
|
45844
45885
|
};
|
|
45845
45886
|
var icontainsAnyFactory = (config2) => {
|
|
45846
45887
|
const c = config2;
|
|
45847
|
-
return new DeterministicAssertionEvaluator("
|
|
45888
|
+
return new DeterministicAssertionEvaluator("icontains-any", (ctx) => {
|
|
45848
45889
|
const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
|
|
45849
45890
|
return {
|
|
45850
45891
|
score: result.score,
|
|
@@ -45858,7 +45899,7 @@ var icontainsAnyFactory = (config2) => {
|
|
|
45858
45899
|
};
|
|
45859
45900
|
var icontainsAllFactory = (config2) => {
|
|
45860
45901
|
const c = config2;
|
|
45861
|
-
return new DeterministicAssertionEvaluator("
|
|
45902
|
+
return new DeterministicAssertionEvaluator("icontains-all", (ctx) => {
|
|
45862
45903
|
const result = runIcontainsAllAssertion(ctx.candidate, c.value);
|
|
45863
45904
|
return {
|
|
45864
45905
|
score: result.score,
|
|
@@ -45872,7 +45913,7 @@ var icontainsAllFactory = (config2) => {
|
|
|
45872
45913
|
};
|
|
45873
45914
|
var startsWithFactory = (config2) => {
|
|
45874
45915
|
const c = config2;
|
|
45875
|
-
return new DeterministicAssertionEvaluator("
|
|
45916
|
+
return new DeterministicAssertionEvaluator("starts-with", (ctx) => {
|
|
45876
45917
|
const result = runStartsWithAssertion(ctx.candidate, c.value);
|
|
45877
45918
|
return {
|
|
45878
45919
|
score: result.score,
|
|
@@ -45886,7 +45927,7 @@ var startsWithFactory = (config2) => {
|
|
|
45886
45927
|
};
|
|
45887
45928
|
var endsWithFactory = (config2) => {
|
|
45888
45929
|
const c = config2;
|
|
45889
|
-
return new DeterministicAssertionEvaluator("
|
|
45930
|
+
return new DeterministicAssertionEvaluator("ends-with", (ctx) => {
|
|
45890
45931
|
const result = runEndsWithAssertion(ctx.candidate, c.value);
|
|
45891
45932
|
return {
|
|
45892
45933
|
score: result.score,
|
|
@@ -45900,7 +45941,7 @@ var endsWithFactory = (config2) => {
|
|
|
45900
45941
|
};
|
|
45901
45942
|
function createBuiltinRegistry() {
|
|
45902
45943
|
const registry2 = new EvaluatorRegistry();
|
|
45903
|
-
registry2.register("
|
|
45944
|
+
registry2.register("llm-judge", llmJudgeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("agent-judge", agentJudgeFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory);
|
|
45904
45945
|
return registry2;
|
|
45905
45946
|
}
|
|
45906
45947
|
async function discoverAssertions(registry2, baseDir) {
|
|
@@ -46553,7 +46594,8 @@ async function runEvaluation(options) {
|
|
|
46553
46594
|
cleanupWorkspaces,
|
|
46554
46595
|
trials,
|
|
46555
46596
|
streamCallbacks,
|
|
46556
|
-
totalBudgetUsd
|
|
46597
|
+
totalBudgetUsd,
|
|
46598
|
+
failOnError
|
|
46557
46599
|
} = options;
|
|
46558
46600
|
let useCache = options.useCache;
|
|
46559
46601
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -46611,7 +46653,7 @@ async function runEvaluation(options) {
|
|
|
46611
46653
|
};
|
|
46612
46654
|
if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
|
|
46613
46655
|
throw new Error(
|
|
46614
|
-
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g.,
|
|
46656
|
+
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure-base).`
|
|
46615
46657
|
);
|
|
46616
46658
|
}
|
|
46617
46659
|
const targetResolver = (name16) => {
|
|
@@ -46682,7 +46724,7 @@ async function runEvaluation(options) {
|
|
|
46682
46724
|
const rawTemplate = suiteWorkspace?.template ?? getWorkspaceTemplate(target);
|
|
46683
46725
|
const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
|
|
46684
46726
|
const workspaceTemplate = resolvedTemplate?.dir;
|
|
46685
|
-
|
|
46727
|
+
let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
|
|
46686
46728
|
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
46687
46729
|
const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
46688
46730
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
@@ -46703,6 +46745,14 @@ async function runEvaluation(options) {
|
|
|
46703
46745
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
46704
46746
|
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
46705
46747
|
}
|
|
46748
|
+
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
46749
|
+
const copiedWorkspaceFile = path37.join(sharedWorkspacePath, path37.basename(suiteWorkspaceFile));
|
|
46750
|
+
try {
|
|
46751
|
+
await stat7(copiedWorkspaceFile);
|
|
46752
|
+
suiteWorkspaceFile = copiedWorkspaceFile;
|
|
46753
|
+
} catch {
|
|
46754
|
+
}
|
|
46755
|
+
}
|
|
46706
46756
|
} else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
|
|
46707
46757
|
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
46708
46758
|
await mkdir12(sharedWorkspacePath, { recursive: true });
|
|
@@ -46749,6 +46799,7 @@ async function runEvaluation(options) {
|
|
|
46749
46799
|
let beforeAllOutputAttached = false;
|
|
46750
46800
|
let cumulativeBudgetCost = 0;
|
|
46751
46801
|
let budgetExhausted = false;
|
|
46802
|
+
let failOnErrorTriggered = false;
|
|
46752
46803
|
const promises = filteredEvalCases.map(
|
|
46753
46804
|
(evalCase) => limit(async () => {
|
|
46754
46805
|
const workerId = nextWorkerId++;
|
|
@@ -46787,6 +46838,37 @@ async function runEvaluation(options) {
|
|
|
46787
46838
|
}
|
|
46788
46839
|
return budgetResult;
|
|
46789
46840
|
}
|
|
46841
|
+
if (failOnError === true && failOnErrorTriggered) {
|
|
46842
|
+
const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
|
|
46843
|
+
const haltResult = {
|
|
46844
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
46845
|
+
testId: evalCase.id,
|
|
46846
|
+
dataset: evalCase.dataset,
|
|
46847
|
+
score: 0,
|
|
46848
|
+
hits: [],
|
|
46849
|
+
misses: [],
|
|
46850
|
+
answer: "",
|
|
46851
|
+
target: target.name,
|
|
46852
|
+
error: errorMsg,
|
|
46853
|
+
executionStatus: "execution_error",
|
|
46854
|
+
failureStage: "setup",
|
|
46855
|
+
failureReasonCode: "error_threshold_exceeded",
|
|
46856
|
+
executionError: { message: errorMsg, stage: "setup" }
|
|
46857
|
+
};
|
|
46858
|
+
if (onProgress) {
|
|
46859
|
+
await onProgress({
|
|
46860
|
+
workerId,
|
|
46861
|
+
testId: evalCase.id,
|
|
46862
|
+
status: "failed",
|
|
46863
|
+
completedAt: Date.now(),
|
|
46864
|
+
error: haltResult.error
|
|
46865
|
+
});
|
|
46866
|
+
}
|
|
46867
|
+
if (onResult) {
|
|
46868
|
+
await onResult(haltResult);
|
|
46869
|
+
}
|
|
46870
|
+
return haltResult;
|
|
46871
|
+
}
|
|
46790
46872
|
if (onProgress) {
|
|
46791
46873
|
await onProgress({
|
|
46792
46874
|
workerId,
|
|
@@ -46839,6 +46921,9 @@ async function runEvaluation(options) {
|
|
|
46839
46921
|
}
|
|
46840
46922
|
}
|
|
46841
46923
|
}
|
|
46924
|
+
if (failOnError === true && result.executionStatus === "execution_error") {
|
|
46925
|
+
failOnErrorTriggered = true;
|
|
46926
|
+
}
|
|
46842
46927
|
if (beforeAllOutput && !beforeAllOutputAttached) {
|
|
46843
46928
|
result = { ...result, beforeAllOutput };
|
|
46844
46929
|
beforeAllOutputAttached = true;
|
|
@@ -47146,6 +47231,14 @@ async function runEvalCase(options) {
|
|
|
47146
47231
|
"template_error"
|
|
47147
47232
|
);
|
|
47148
47233
|
}
|
|
47234
|
+
if (caseWorkspaceFile && workspacePath) {
|
|
47235
|
+
const copiedFile = path37.join(workspacePath, path37.basename(caseWorkspaceFile));
|
|
47236
|
+
try {
|
|
47237
|
+
await stat7(copiedFile);
|
|
47238
|
+
caseWorkspaceFile = copiedFile;
|
|
47239
|
+
} catch {
|
|
47240
|
+
}
|
|
47241
|
+
}
|
|
47149
47242
|
}
|
|
47150
47243
|
if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
47151
47244
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
@@ -47655,8 +47748,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
47655
47748
|
workspacePath
|
|
47656
47749
|
});
|
|
47657
47750
|
}
|
|
47658
|
-
const evaluatorKind = evalCase.evaluator ?? "
|
|
47659
|
-
const activeEvaluator = evaluators[evaluatorKind] ?? evaluators
|
|
47751
|
+
const evaluatorKind = evalCase.evaluator ?? "llm-judge";
|
|
47752
|
+
const activeEvaluator = evaluators[evaluatorKind] ?? evaluators["llm-judge"];
|
|
47660
47753
|
if (!activeEvaluator) {
|
|
47661
47754
|
throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
|
|
47662
47755
|
}
|
|
@@ -47739,25 +47832,24 @@ async function runEvaluatorList(options) {
|
|
|
47739
47832
|
availableTargets,
|
|
47740
47833
|
agentTimeoutMs,
|
|
47741
47834
|
evalFileDir,
|
|
47742
|
-
llmJudge: evaluatorRegistry
|
|
47835
|
+
llmJudge: evaluatorRegistry["llm-judge"],
|
|
47743
47836
|
registry: typeRegistry
|
|
47744
47837
|
};
|
|
47745
47838
|
for (const evaluatorConfig of evaluators ?? []) {
|
|
47746
47839
|
try {
|
|
47747
47840
|
const evaluatorInstance = await typeRegistry.create(evaluatorConfig, dispatchContext);
|
|
47748
47841
|
const score2 = await evaluatorInstance.evaluate(evalContext);
|
|
47749
|
-
const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
|
|
47750
47842
|
const weight = evaluatorConfig.weight ?? 1;
|
|
47751
47843
|
scored.push({
|
|
47752
47844
|
score: score2,
|
|
47753
47845
|
name: evaluatorConfig.name,
|
|
47754
|
-
type:
|
|
47846
|
+
type: evaluatorConfig.type,
|
|
47755
47847
|
weight,
|
|
47756
47848
|
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
47757
47849
|
});
|
|
47758
47850
|
scores.push({
|
|
47759
47851
|
name: evaluatorConfig.name,
|
|
47760
|
-
type:
|
|
47852
|
+
type: evaluatorConfig.type,
|
|
47761
47853
|
score: score2.score,
|
|
47762
47854
|
weight,
|
|
47763
47855
|
verdict: score2.verdict,
|
|
@@ -47779,18 +47871,17 @@ async function runEvaluatorList(options) {
|
|
|
47779
47871
|
expectedAspectCount: 1,
|
|
47780
47872
|
reasoning: message
|
|
47781
47873
|
};
|
|
47782
|
-
const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
|
|
47783
47874
|
const weight = evaluatorConfig.weight ?? 1;
|
|
47784
47875
|
scored.push({
|
|
47785
47876
|
score: fallbackScore,
|
|
47786
47877
|
name: evaluatorConfig.name ?? "unknown",
|
|
47787
|
-
type:
|
|
47878
|
+
type: evaluatorConfig.type ?? "llm-judge",
|
|
47788
47879
|
weight,
|
|
47789
47880
|
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
47790
47881
|
});
|
|
47791
47882
|
scores.push({
|
|
47792
47883
|
name: evaluatorConfig.name ?? "unknown",
|
|
47793
|
-
type:
|
|
47884
|
+
type: evaluatorConfig.type ?? "llm-judge",
|
|
47794
47885
|
score: 0,
|
|
47795
47886
|
weight,
|
|
47796
47887
|
verdict: "fail",
|
|
@@ -47851,7 +47942,7 @@ function filterEvalCases(evalCases, filter2) {
|
|
|
47851
47942
|
return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter2));
|
|
47852
47943
|
}
|
|
47853
47944
|
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
47854
|
-
const llmJudge = overrides?.
|
|
47945
|
+
const llmJudge = overrides?.["llm-judge"] ?? new LlmJudgeEvaluator({
|
|
47855
47946
|
resolveJudgeProvider: async (context) => {
|
|
47856
47947
|
if (context.judgeProvider) {
|
|
47857
47948
|
return context.judgeProvider;
|
|
@@ -47861,7 +47952,7 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
47861
47952
|
});
|
|
47862
47953
|
return {
|
|
47863
47954
|
...overrides,
|
|
47864
|
-
|
|
47955
|
+
"llm-judge": llmJudge
|
|
47865
47956
|
};
|
|
47866
47957
|
}
|
|
47867
47958
|
async function invokeProvider(provider, options) {
|
|
@@ -48117,12 +48208,7 @@ async function evaluate(config2) {
|
|
|
48117
48208
|
};
|
|
48118
48209
|
}
|
|
48119
48210
|
function mapAssertionType(type) {
|
|
48120
|
-
|
|
48121
|
-
case "code_judge":
|
|
48122
|
-
return "code";
|
|
48123
|
-
default:
|
|
48124
|
-
return type;
|
|
48125
|
-
}
|
|
48211
|
+
return type.replace(/_/g, "-");
|
|
48126
48212
|
}
|
|
48127
48213
|
function computeSummary(results, durationMs) {
|
|
48128
48214
|
const total = results.length;
|
|
@@ -48851,6 +48937,7 @@ export {
|
|
|
48851
48937
|
extractTargetsFromTestCase,
|
|
48852
48938
|
extractTrialsConfig,
|
|
48853
48939
|
extractCacheConfig,
|
|
48940
|
+
extractFailOnError,
|
|
48854
48941
|
detectFormat,
|
|
48855
48942
|
buildPromptInputs,
|
|
48856
48943
|
readTestSuiteMetadata,
|
|
@@ -48950,4 +49037,4 @@ export {
|
|
|
48950
49037
|
OtelStreamingObserver,
|
|
48951
49038
|
createAgentKernel
|
|
48952
49039
|
};
|
|
48953
|
-
//# sourceMappingURL=chunk-
|
|
49040
|
+
//# sourceMappingURL=chunk-OQN2GDEU.js.map
|