@agentv/core 2.13.0 → 2.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -17,7 +17,7 @@ import {
17
17
  readTextFile,
18
18
  resolveFileReference,
19
19
  resolveTargetDefinition
20
- } from "./chunk-JHER2LQ5.js";
20
+ } from "./chunk-N55K52OO.js";
21
21
  import {
22
22
  OtlpJsonFileExporter
23
23
  } from "./chunk-HFSYZHGF.js";
@@ -682,6 +682,9 @@ function validateTemplateVariables(content, source) {
682
682
  // src/evaluation/loaders/evaluator-parser.ts
683
683
  var ANSI_YELLOW4 = "\x1B[33m";
684
684
  var ANSI_RESET4 = "\x1B[0m";
685
+ function normalizeEvaluatorType(type) {
686
+ return type.replace(/_/g, "-");
687
+ }
685
688
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
686
689
  const execution = rawEvalCase.execution;
687
690
  const executionObject = isJsonObject2(execution) ? execution : void 0;
@@ -712,7 +715,8 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
712
715
  continue;
713
716
  }
714
717
  const rawName = asString(rawEvaluator.name);
715
- const typeValue = rawEvaluator.type;
718
+ const rawType = rawEvaluator.type;
719
+ const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
716
720
  const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
717
721
  if (typeof typeValue !== "string") {
718
722
  logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
@@ -745,25 +749,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
745
749
  });
746
750
  continue;
747
751
  }
748
- if (typeValue === "code_judge") {
752
+ if (typeValue === "code-judge") {
749
753
  let command;
750
754
  const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
751
755
  if (typeof rawCommand === "string") {
752
756
  const trimmed = rawCommand.trim();
753
757
  if (trimmed.length === 0) {
754
758
  throw new Error(
755
- `Invalid code_judge command for evaluator '${name}' in '${evalId}': command cannot be empty`
759
+ `Invalid code-judge command for evaluator '${name}' in '${evalId}': command cannot be empty`
756
760
  );
757
761
  }
758
762
  command = parseCommandToArgv(trimmed);
759
763
  } else {
760
764
  command = asStringArray(
761
765
  rawCommand,
762
- `code_judge command for evaluator '${name}' in '${evalId}'`
766
+ `code-judge command for evaluator '${name}' in '${evalId}'`
763
767
  );
764
768
  }
765
769
  if (!command) {
766
- logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing command`);
770
+ logWarning2(`Skipping code-judge evaluator '${name}' in '${evalId}': missing command`);
767
771
  continue;
768
772
  }
769
773
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
@@ -824,7 +828,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
824
828
  }
825
829
  evaluators.push({
826
830
  name,
827
- type: "code",
831
+ type: "code-judge",
828
832
  command,
829
833
  cwd,
830
834
  resolvedCwd,
@@ -850,7 +854,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
850
854
  continue;
851
855
  }
852
856
  const aggregatorType = asString(rawAggregator.type);
853
- if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge" && aggregatorType !== "threshold") {
857
+ if (aggregatorType !== "weighted_average" && aggregatorType !== "code-judge" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
854
858
  logWarning2(
855
859
  `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
856
860
  );
@@ -899,16 +903,16 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
899
903
  type: "weighted_average",
900
904
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
901
905
  };
902
- } else if (aggregatorType === "code_judge") {
906
+ } else if (aggregatorType === "code-judge") {
903
907
  const aggregatorPath = asString(rawAggregator.path);
904
908
  if (!aggregatorPath) {
905
909
  logWarning2(
906
- `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
910
+ `Skipping composite evaluator '${name}' in '${evalId}': code-judge aggregator missing path`
907
911
  );
908
912
  continue;
909
913
  }
910
914
  aggregator = {
911
- type: "code_judge",
915
+ type: "code-judge",
912
916
  path: aggregatorPath,
913
917
  cwd: searchRoots[0]
914
918
  };
@@ -934,7 +938,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
934
938
  }
935
939
  }
936
940
  aggregator = {
937
- type: "llm_judge",
941
+ type: "llm-judge",
938
942
  ...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
939
943
  ...promptPath2 ? { promptPath: promptPath2 } : {}
940
944
  };
@@ -952,11 +956,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
952
956
  });
953
957
  continue;
954
958
  }
955
- if (typeValue === "tool_trajectory") {
959
+ if (typeValue === "tool-trajectory") {
956
960
  const mode = asString(rawEvaluator.mode);
957
961
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact" && mode !== "subset" && mode !== "superset") {
958
962
  logWarning2(
959
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
963
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
960
964
  );
961
965
  continue;
962
966
  }
@@ -965,7 +969,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
965
969
  if (rawMinimums !== void 0) {
966
970
  if (!isJsonObject2(rawMinimums)) {
967
971
  logWarning2(
968
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
972
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
969
973
  );
970
974
  continue;
971
975
  }
@@ -991,7 +995,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
991
995
  argsMatch2 = rawArgsMatch;
992
996
  } else {
993
997
  logWarning2(
994
- `Invalid args_match '${rawArgsMatch}' for tool_trajectory evaluator '${name}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
998
+ `Invalid args_match '${rawArgsMatch}' for tool-trajectory evaluator '${name}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
995
999
  );
996
1000
  }
997
1001
  }
@@ -1001,7 +1005,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1001
1005
  if (rawExpected !== void 0) {
1002
1006
  if (!Array.isArray(rawExpected)) {
1003
1007
  logWarning2(
1004
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
1008
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': expected must be an array`
1005
1009
  );
1006
1010
  continue;
1007
1011
  }
@@ -1047,13 +1051,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1047
1051
  }
1048
1052
  if (mode === "any_order" && !minimums) {
1049
1053
  logWarning2(
1050
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
1054
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
1051
1055
  );
1052
1056
  continue;
1053
1057
  }
1054
1058
  if ((mode === "in_order" || mode === "exact" || mode === "subset" || mode === "superset") && !expected) {
1055
1059
  logWarning2(
1056
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
1060
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
1057
1061
  );
1058
1062
  continue;
1059
1063
  }
@@ -1061,7 +1065,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1061
1065
  const required2 = parseRequired(rawEvaluator.required);
1062
1066
  const config2 = {
1063
1067
  name,
1064
- type: "tool_trajectory",
1068
+ type: "tool-trajectory",
1065
1069
  mode,
1066
1070
  ...minimums ? { minimums } : {},
1067
1071
  ...expected ? { expected } : {},
@@ -1073,17 +1077,17 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1073
1077
  evaluators.push(config2);
1074
1078
  continue;
1075
1079
  }
1076
- if (typeValue === "field_accuracy") {
1080
+ if (typeValue === "field-accuracy") {
1077
1081
  const rawFields = rawEvaluator.fields;
1078
1082
  if (!Array.isArray(rawFields)) {
1079
1083
  logWarning2(
1080
- `Skipping field_accuracy evaluator '${name}' in '${evalId}': missing fields array`
1084
+ `Skipping field-accuracy evaluator '${name}' in '${evalId}': missing fields array`
1081
1085
  );
1082
1086
  continue;
1083
1087
  }
1084
1088
  if (rawFields.length === 0) {
1085
1089
  logWarning2(
1086
- `Skipping field_accuracy evaluator '${name}' in '${evalId}': fields array is empty`
1090
+ `Skipping field-accuracy evaluator '${name}' in '${evalId}': fields array is empty`
1087
1091
  );
1088
1092
  continue;
1089
1093
  }
@@ -1091,7 +1095,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1091
1095
  for (const rawField of rawFields) {
1092
1096
  if (!isJsonObject2(rawField)) {
1093
1097
  logWarning2(
1094
- `Skipping invalid field entry in field_accuracy evaluator '${name}' (expected object)`
1098
+ `Skipping invalid field entry in field-accuracy evaluator '${name}' (expected object)`
1095
1099
  );
1096
1100
  continue;
1097
1101
  }
@@ -1099,13 +1103,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1099
1103
  const match = asString(rawField.match);
1100
1104
  if (!fieldPath) {
1101
1105
  logWarning2(
1102
- `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
1106
+ `Skipping field without path in field-accuracy evaluator '${name}' in '${evalId}'`
1103
1107
  );
1104
1108
  continue;
1105
1109
  }
1106
1110
  if (!match || !isValidFieldMatchType(match)) {
1107
1111
  logWarning2(
1108
- `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code_judge evaluator.`
1112
+ `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code-judge evaluator.`
1109
1113
  );
1110
1114
  continue;
1111
1115
  }
@@ -1122,7 +1126,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1122
1126
  }
1123
1127
  if (fields.length === 0) {
1124
1128
  logWarning2(
1125
- `Skipping field_accuracy evaluator '${name}' in '${evalId}': no valid fields found`
1129
+ `Skipping field-accuracy evaluator '${name}' in '${evalId}': no valid fields found`
1126
1130
  );
1127
1131
  continue;
1128
1132
  }
@@ -1132,7 +1136,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1132
1136
  const required2 = parseRequired(rawEvaluator.required);
1133
1137
  evaluators.push({
1134
1138
  name,
1135
- type: "field_accuracy",
1139
+ type: "field-accuracy",
1136
1140
  fields,
1137
1141
  ...validAggregation ? { aggregation: validAggregation } : {},
1138
1142
  ...weight2 !== void 0 ? { weight: weight2 } : {},
@@ -1181,7 +1185,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1181
1185
  });
1182
1186
  continue;
1183
1187
  }
1184
- if (typeValue === "token_usage") {
1188
+ if (typeValue === "token-usage") {
1185
1189
  const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
1186
1190
  const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
1187
1191
  const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
@@ -1195,7 +1199,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1195
1199
  if (raw === void 0) continue;
1196
1200
  if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
1197
1201
  logWarning2(
1198
- `Skipping token_usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
1202
+ `Skipping token-usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
1199
1203
  );
1200
1204
  continue;
1201
1205
  }
@@ -1203,7 +1207,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1203
1207
  }
1204
1208
  if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
1205
1209
  logWarning2(
1206
- `Skipping token_usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
1210
+ `Skipping token-usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
1207
1211
  );
1208
1212
  continue;
1209
1213
  }
@@ -1211,7 +1215,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1211
1215
  const required2 = parseRequired(rawEvaluator.required);
1212
1216
  evaluators.push({
1213
1217
  name,
1214
- type: "token_usage",
1218
+ type: "token-usage",
1215
1219
  ...validLimits,
1216
1220
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1217
1221
  ...required2 !== void 0 ? { required: required2 } : {},
@@ -1219,7 +1223,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1219
1223
  });
1220
1224
  continue;
1221
1225
  }
1222
- if (typeValue === "execution_metrics") {
1226
+ if (typeValue === "execution-metrics") {
1223
1227
  const maxToolCalls = rawEvaluator.max_tool_calls ?? rawEvaluator.maxToolCalls;
1224
1228
  const maxLlmCalls = rawEvaluator.max_llm_calls ?? rawEvaluator.maxLlmCalls;
1225
1229
  const maxTokens = rawEvaluator.max_tokens ?? rawEvaluator.maxTokens;
@@ -1242,7 +1246,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1242
1246
  if (raw === void 0) continue;
1243
1247
  if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
1244
1248
  logWarning2(
1245
- `Skipping execution_metrics evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
1249
+ `Skipping execution-metrics evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
1246
1250
  );
1247
1251
  hasError = true;
1248
1252
  break;
@@ -1255,7 +1259,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1255
1259
  const hasThreshold = validThresholds.max_tool_calls !== void 0 || validThresholds.max_llm_calls !== void 0 || validThresholds.max_tokens !== void 0 || validThresholds.max_cost_usd !== void 0 || validThresholds.max_duration_ms !== void 0 || validThresholds.target_exploration_ratio !== void 0;
1256
1260
  if (!hasThreshold) {
1257
1261
  logWarning2(
1258
- `Skipping execution_metrics evaluator '${name}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
1262
+ `Skipping execution-metrics evaluator '${name}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
1259
1263
  );
1260
1264
  continue;
1261
1265
  }
@@ -1263,7 +1267,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1263
1267
  const required2 = parseRequired(rawEvaluator.required);
1264
1268
  evaluators.push({
1265
1269
  name,
1266
- type: "execution_metrics",
1270
+ type: "execution-metrics",
1267
1271
  ...validThresholds,
1268
1272
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1269
1273
  ...required2 !== void 0 ? { required: required2 } : {},
@@ -1271,13 +1275,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1271
1275
  });
1272
1276
  continue;
1273
1277
  }
1274
- if (typeValue === "agent_judge") {
1278
+ if (typeValue === "agent-judge") {
1275
1279
  const rawMaxSteps = rawEvaluator.max_steps ?? rawEvaluator.maxSteps;
1276
1280
  let maxSteps;
1277
1281
  if (rawMaxSteps !== void 0) {
1278
1282
  if (typeof rawMaxSteps !== "number" || !Number.isInteger(rawMaxSteps) || rawMaxSteps < 1 || rawMaxSteps > 50) {
1279
1283
  logWarning2(
1280
- `Skipping agent_judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`
1284
+ `Skipping agent-judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`
1281
1285
  );
1282
1286
  continue;
1283
1287
  }
@@ -1288,7 +1292,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1288
1292
  if (rawTemperature !== void 0) {
1289
1293
  if (typeof rawTemperature !== "number" || rawTemperature < 0 || rawTemperature > 2) {
1290
1294
  logWarning2(
1291
- `Skipping agent_judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`
1295
+ `Skipping agent-judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`
1292
1296
  );
1293
1297
  continue;
1294
1298
  }
@@ -1311,7 +1315,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1311
1315
  const required2 = parseRequired(rawEvaluator.required);
1312
1316
  evaluators.push({
1313
1317
  name,
1314
- type: "agent_judge",
1318
+ type: "agent-judge",
1315
1319
  ...agentPrompt ? { prompt: agentPrompt } : {},
1316
1320
  ...agentPromptPath ? { promptPath: agentPromptPath, resolvedPromptPath: agentPromptPath } : {},
1317
1321
  ...agentParsedRubrics && agentParsedRubrics.length > 0 ? { rubrics: agentParsedRubrics } : {},
@@ -1342,7 +1346,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1342
1346
  });
1343
1347
  continue;
1344
1348
  }
1345
- if (typeValue === "contains_any" || typeValue === "contains_all") {
1349
+ if (typeValue === "contains-any" || typeValue === "contains-all") {
1346
1350
  const value = asStringArrayStrict(rawEvaluator.value);
1347
1351
  if (!value || value.length === 0) {
1348
1352
  logWarning2(
@@ -1380,7 +1384,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1380
1384
  });
1381
1385
  continue;
1382
1386
  }
1383
- if (typeValue === "icontains_any" || typeValue === "icontains_all") {
1387
+ if (typeValue === "icontains-any" || typeValue === "icontains-all") {
1384
1388
  const value = asStringArrayStrict(rawEvaluator.value);
1385
1389
  if (!value || value.length === 0) {
1386
1390
  logWarning2(
@@ -1400,7 +1404,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1400
1404
  });
1401
1405
  continue;
1402
1406
  }
1403
- if (typeValue === "starts_with" || typeValue === "ends_with") {
1407
+ if (typeValue === "starts-with" || typeValue === "ends-with") {
1404
1408
  const value = asString(rawEvaluator.value);
1405
1409
  if (!value) {
1406
1410
  logWarning2(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
@@ -1438,12 +1442,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1438
1442
  });
1439
1443
  continue;
1440
1444
  }
1441
- if (typeValue === "is_json") {
1445
+ if (typeValue === "is-json") {
1442
1446
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1443
1447
  const required2 = parseRequired(rawEvaluator.required);
1444
1448
  evaluators.push({
1445
1449
  name,
1446
- type: "is_json",
1450
+ type: "is-json",
1447
1451
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1448
1452
  ...required2 !== void 0 ? { required: required2 } : {},
1449
1453
  ...negate !== void 0 ? { negate } : {}
@@ -1491,7 +1495,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1491
1495
  const required2 = parseRequired(rawEvaluator.required);
1492
1496
  evaluators.push({
1493
1497
  name,
1494
- type: "llm_judge",
1498
+ type: "llm-judge",
1495
1499
  rubrics: parsedCriteria,
1496
1500
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1497
1501
  ...required2 !== void 0 ? { required: required2 } : {},
@@ -1558,7 +1562,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1558
1562
  const required2 = parseRequired(rawEvaluator.required);
1559
1563
  evaluators.push({
1560
1564
  name,
1561
- type: "llm_judge",
1565
+ type: "llm-judge",
1562
1566
  rubrics: parsedRubrics,
1563
1567
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1564
1568
  ...required2 !== void 0 ? { required: required2 } : {},
@@ -1590,7 +1594,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1590
1594
  const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
1591
1595
  evaluators.push({
1592
1596
  name,
1593
- type: "llm_judge",
1597
+ type: "llm-judge",
1594
1598
  prompt,
1595
1599
  promptPath,
1596
1600
  ...promptPath ? { resolvedPromptPath: promptPath } : {},
@@ -1606,15 +1610,15 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1606
1610
  }
1607
1611
  var ASSERTION_TYPES = /* @__PURE__ */ new Set([
1608
1612
  "contains",
1609
- "contains_any",
1610
- "contains_all",
1613
+ "contains-any",
1614
+ "contains-all",
1611
1615
  "icontains",
1612
- "icontains_any",
1613
- "icontains_all",
1614
- "starts_with",
1615
- "ends_with",
1616
+ "icontains-any",
1617
+ "icontains-all",
1618
+ "starts-with",
1619
+ "ends-with",
1616
1620
  "regex",
1617
- "is_json",
1621
+ "is-json",
1618
1622
  "equals",
1619
1623
  "rubrics"
1620
1624
  ]);
@@ -1627,24 +1631,24 @@ function generateAssertionName(typeValue, rawEvaluator) {
1627
1631
  switch (typeValue) {
1628
1632
  case "contains":
1629
1633
  return value ? `contains-${value}` : "contains";
1630
- case "contains_any":
1631
- return arrayValue ? `contains_any-${arrayValue.length}` : "contains_any";
1632
- case "contains_all":
1633
- return arrayValue ? `contains_all-${arrayValue.length}` : "contains_all";
1634
+ case "contains-any":
1635
+ return arrayValue ? `contains-any-${arrayValue.length}` : "contains-any";
1636
+ case "contains-all":
1637
+ return arrayValue ? `contains-all-${arrayValue.length}` : "contains-all";
1634
1638
  case "icontains":
1635
1639
  return value ? `icontains-${value}` : "icontains";
1636
- case "icontains_any":
1637
- return arrayValue ? `icontains_any-${arrayValue.length}` : "icontains_any";
1638
- case "icontains_all":
1639
- return arrayValue ? `icontains_all-${arrayValue.length}` : "icontains_all";
1640
- case "starts_with":
1641
- return value ? `starts_with-${value}` : "starts_with";
1642
- case "ends_with":
1643
- return value ? `ends_with-${value}` : "ends_with";
1640
+ case "icontains-any":
1641
+ return arrayValue ? `icontains-any-${arrayValue.length}` : "icontains-any";
1642
+ case "icontains-all":
1643
+ return arrayValue ? `icontains-all-${arrayValue.length}` : "icontains-all";
1644
+ case "starts-with":
1645
+ return value ? `starts-with-${value}` : "starts-with";
1646
+ case "ends-with":
1647
+ return value ? `ends-with-${value}` : "ends-with";
1644
1648
  case "regex":
1645
1649
  return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
1646
- case "is_json":
1647
- return "is_json";
1650
+ case "is-json":
1651
+ return "is-json";
1648
1652
  case "equals":
1649
1653
  return value ? `equals-${value}` : "equals";
1650
1654
  case "rubrics":
@@ -1657,8 +1661,9 @@ function coerceEvaluator(candidate, contextId) {
1657
1661
  if (typeof candidate !== "string") {
1658
1662
  return void 0;
1659
1663
  }
1660
- if (isEvaluatorKind(candidate)) {
1661
- return candidate;
1664
+ const normalized = normalizeEvaluatorType(candidate);
1665
+ if (isEvaluatorKind(normalized)) {
1666
+ return normalized;
1662
1667
  }
1663
1668
  logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
1664
1669
  return void 0;
@@ -1704,6 +1709,16 @@ function parseCommandToArgv(command) {
1704
1709
  function isJsonObject2(value) {
1705
1710
  return typeof value === "object" && value !== null && !Array.isArray(value);
1706
1711
  }
1712
+ var CRITERIA_CONSUMER_TYPES = /* @__PURE__ */ new Set(["llm-judge", "agent-judge", "code-judge"]);
1713
+ function warnUnconsumedCriteria(criteria, evaluators, testId) {
1714
+ if (!criteria?.trim() || !evaluators || evaluators.length === 0) return;
1715
+ const hasConsumer = evaluators.some((e) => CRITERIA_CONSUMER_TYPES.has(e.type));
1716
+ if (!hasConsumer) {
1717
+ logWarning2(
1718
+ `Test '${testId}': criteria is defined but no evaluator in assert will evaluate it. Add 'type: llm-judge' to assert, or remove criteria if it is documentation-only.`
1719
+ );
1720
+ }
1721
+ }
1707
1722
  function logWarning2(message, details) {
1708
1723
  if (details && details.length > 0) {
1709
1724
  const detailBlock = details.join("\n");
@@ -1953,7 +1968,7 @@ function parseInlineRubrics(rawRubrics) {
1953
1968
  }
1954
1969
  return {
1955
1970
  name: "rubric",
1956
- type: "llm_judge",
1971
+ type: "llm-judge",
1957
1972
  rubrics: rubricItems
1958
1973
  };
1959
1974
  }
@@ -2338,7 +2353,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2338
2353
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
2339
2354
  const fallbackDataset = path6.basename(absoluteTestPath, ".jsonl") || "eval";
2340
2355
  const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
2341
- const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm_judge";
2356
+ const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-judge";
2342
2357
  const globalExecution = sidecar.execution;
2343
2358
  if (verbose) {
2344
2359
  console.log(`
@@ -2426,6 +2441,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2426
2441
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
2427
2442
  }
2428
2443
  }
2444
+ warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
2429
2445
  const userFilePaths = [];
2430
2446
  for (const segment of inputSegments) {
2431
2447
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -2818,7 +2834,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2818
2834
  const fallbackDataset = path8.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
2819
2835
  const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
2820
2836
  const rawTestcases = resolveTests(suite);
2821
- const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
2837
+ const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-judge";
2822
2838
  const evalFileDir = path8.dirname(absoluteTestPath);
2823
2839
  let expandedTestcases;
2824
2840
  if (typeof rawTestcases === "string") {
@@ -2915,6 +2931,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2915
2931
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
2916
2932
  }
2917
2933
  }
2934
+ warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
2918
2935
  const userFilePaths = [];
2919
2936
  for (const segment of inputSegments) {
2920
2937
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -8895,7 +8912,7 @@ function toCamelCaseDeep(obj) {
8895
8912
  // src/evaluation/evaluators/code-evaluator.ts
8896
8913
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
8897
8914
  var CodeEvaluator = class {
8898
- kind = "code";
8915
+ kind = "code-judge";
8899
8916
  command;
8900
8917
  cwd;
8901
8918
  agentTimeoutMs;
@@ -9103,7 +9120,7 @@ var scoreRangeEvaluationSchema = z3.object({
9103
9120
  overall_reasoning: z3.string().describe("Overall assessment summary (1-2 sentences)").optional()
9104
9121
  });
9105
9122
  var LlmJudgeEvaluator = class {
9106
- kind = "llm_judge";
9123
+ kind = "llm-judge";
9107
9124
  resolveJudgeProvider;
9108
9125
  maxOutputTokens;
9109
9126
  temperature;
@@ -9120,7 +9137,7 @@ var LlmJudgeEvaluator = class {
9120
9137
  throw new Error("No judge provider available for LLM grading");
9121
9138
  }
9122
9139
  const config = context.evaluator;
9123
- if (config?.type === "llm_judge" && config.rubrics && config.rubrics.length > 0) {
9140
+ if (config?.type === "llm-judge" && config.rubrics && config.rubrics.length > 0) {
9124
9141
  return this.evaluateWithRubrics(context, judgeProvider, config.rubrics);
9125
9142
  }
9126
9143
  return this.evaluateFreeform(context, judgeProvider);
@@ -9194,7 +9211,7 @@ ${context.fileChanges}`;
9194
9211
  async evaluateWithRubrics(context, judgeProvider, rubrics) {
9195
9212
  if (!rubrics || rubrics.length === 0) {
9196
9213
  throw new Error(
9197
- `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
9214
+ `No rubrics found for evaluator "${context.evaluator?.name ?? "llm-judge"}". Run "agentv generate rubrics" first.`
9198
9215
  );
9199
9216
  }
9200
9217
  const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
@@ -9530,9 +9547,9 @@ var CompositeEvaluator = class {
9530
9547
  async aggregate(results, context) {
9531
9548
  const aggregator = this.config.aggregator;
9532
9549
  switch (aggregator.type) {
9533
- case "code_judge":
9550
+ case "code-judge":
9534
9551
  return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
9535
- case "llm_judge":
9552
+ case "llm-judge":
9536
9553
  return this.runLlmAggregator(results, context, aggregator);
9537
9554
  case "threshold":
9538
9555
  return this.runThreshold(results, aggregator.threshold);
@@ -9675,7 +9692,7 @@ var CompositeEvaluator = class {
9675
9692
  expectedAspectCount: hits.length + misses.length || 1,
9676
9693
  reasoning,
9677
9694
  evaluatorRawRequest: {
9678
- aggregator: "code_judge",
9695
+ aggregator: "code-judge",
9679
9696
  script: scriptPath
9680
9697
  },
9681
9698
  scores
@@ -9690,7 +9707,7 @@ var CompositeEvaluator = class {
9690
9707
  expectedAspectCount: 1,
9691
9708
  reasoning: message,
9692
9709
  evaluatorRawRequest: {
9693
- aggregator: "code_judge",
9710
+ aggregator: "code-judge",
9694
9711
  script: scriptPath,
9695
9712
  error: message
9696
9713
  },
@@ -9721,7 +9738,7 @@ var CompositeEvaluator = class {
9721
9738
  const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
9722
9739
  const systemPrompt = buildOutputSchema();
9723
9740
  const evaluatorRawRequest = {
9724
- aggregator: "llm_judge",
9741
+ aggregator: "llm-judge",
9725
9742
  userPrompt,
9726
9743
  systemPrompt,
9727
9744
  target: judgeProvider.targetName
@@ -9833,7 +9850,7 @@ var CostEvaluator = class {
9833
9850
 
9834
9851
  // src/evaluation/evaluators/execution-metrics.ts
9835
9852
  var ExecutionMetricsEvaluator = class {
9836
- kind = "execution_metrics";
9853
+ kind = "execution-metrics";
9837
9854
  config;
9838
9855
  constructor(options) {
9839
9856
  this.config = options.config;
@@ -9859,7 +9876,7 @@ var ExecutionMetricsEvaluator = class {
9859
9876
  expectedAspectCount: 1,
9860
9877
  reasoning: "Execution metrics not available - no trace summary provided",
9861
9878
  evaluatorRawRequest: {
9862
- type: "execution_metrics",
9879
+ type: "execution-metrics",
9863
9880
  config: this.extractConfiguredThresholds(),
9864
9881
  actual: null
9865
9882
  }
@@ -9968,7 +9985,7 @@ var ExecutionMetricsEvaluator = class {
9968
9985
  if (actualMetrics.exploration_ratio !== void 0) {
9969
9986
  reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
9970
9987
  }
9971
- const reasoning = reasoningParts.length > 0 ? `execution_metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
9988
+ const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
9972
9989
  return {
9973
9990
  score,
9974
9991
  verdict: scoreToVerdict(score),
@@ -9977,7 +9994,7 @@ var ExecutionMetricsEvaluator = class {
9977
9994
  expectedAspectCount: totalChecks || 1,
9978
9995
  reasoning,
9979
9996
  evaluatorRawRequest: {
9980
- type: "execution_metrics",
9997
+ type: "execution-metrics",
9981
9998
  config: this.extractConfiguredThresholds(),
9982
9999
  actual: this.filterDefinedMetrics(actualMetrics)
9983
10000
  }
@@ -10065,7 +10082,7 @@ var MONTH_NAMES = {
10065
10082
  december: 11
10066
10083
  };
10067
10084
  var FieldAccuracyEvaluator = class {
10068
- kind = "field_accuracy";
10085
+ kind = "field-accuracy";
10069
10086
  config;
10070
10087
  constructor(options) {
10071
10088
  this.config = options.config;
@@ -10519,7 +10536,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
10519
10536
  ".dylib"
10520
10537
  ]);
10521
10538
  var AgentJudgeEvaluator = class {
10522
- kind = "agent_judge";
10539
+ kind = "agent-judge";
10523
10540
  resolveJudgeProvider;
10524
10541
  maxSteps;
10525
10542
  temperature;
@@ -10544,24 +10561,24 @@ var AgentJudgeEvaluator = class {
10544
10561
  async evaluateBuiltIn(context) {
10545
10562
  const judgeProvider = await this.resolveJudgeProvider(context);
10546
10563
  if (!judgeProvider) {
10547
- throw new Error("No judge provider available for agent_judge evaluation");
10564
+ throw new Error("No judge provider available for agent-judge evaluation");
10548
10565
  }
10549
10566
  const model = judgeProvider.asLanguageModel?.();
10550
10567
  if (!model) {
10551
10568
  throw new Error(
10552
- `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent_judge mode`
10569
+ `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent-judge mode`
10553
10570
  );
10554
10571
  }
10555
10572
  const workspacePath = context.workspacePath;
10556
10573
  if (!workspacePath) {
10557
10574
  throw new Error(
10558
- "agent_judge evaluator requires a workspace_template target (workspacePath is not set)"
10575
+ "agent-judge evaluator requires a workspace_template target (workspacePath is not set)"
10559
10576
  );
10560
10577
  }
10561
10578
  const systemPrompt = this.buildSystemPrompt(context);
10562
10579
  const userPrompt = this.buildUserPrompt(context);
10563
10580
  const config = context.evaluator;
10564
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
10581
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
10565
10582
  const fsTools = createFilesystemTools(workspacePath);
10566
10583
  const evaluatorRawRequest = {
10567
10584
  mode: "built-in",
@@ -10592,7 +10609,7 @@ var AgentJudgeEvaluator = class {
10592
10609
  score: 0,
10593
10610
  verdict: "fail",
10594
10611
  hits: [],
10595
- misses: [`agent_judge built-in evaluation failed: ${message}`],
10612
+ misses: [`agent-judge built-in evaluation failed: ${message}`],
10596
10613
  expectedAspectCount: 1,
10597
10614
  evaluatorRawRequest,
10598
10615
  details: { mode: "built-in", error: message }
@@ -10624,14 +10641,14 @@ var AgentJudgeEvaluator = class {
10624
10641
  score: 0,
10625
10642
  verdict: "fail",
10626
10643
  hits: [],
10627
- misses: ["agent_judge judge_target returned no assistant response"],
10644
+ misses: ["agent-judge judge_target returned no assistant response"],
10628
10645
  expectedAspectCount: 1,
10629
10646
  evaluatorRawRequest,
10630
10647
  details: { mode: "judge_target", judge_target: provider.targetName }
10631
10648
  };
10632
10649
  }
10633
10650
  const config = context.evaluator;
10634
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
10651
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
10635
10652
  const details = {
10636
10653
  mode: "judge_target",
10637
10654
  judge_target: provider.targetName
@@ -10643,7 +10660,7 @@ var AgentJudgeEvaluator = class {
10643
10660
  score: 0,
10644
10661
  verdict: "fail",
10645
10662
  hits: [],
10646
- misses: [`agent_judge judge_target evaluation failed: ${message}`],
10663
+ misses: [`agent-judge judge_target evaluation failed: ${message}`],
10647
10664
  expectedAspectCount: 1,
10648
10665
  evaluatorRawRequest,
10649
10666
  details: {
@@ -10694,7 +10711,7 @@ var AgentJudgeEvaluator = class {
10694
10711
  score: 0,
10695
10712
  verdict: "fail",
10696
10713
  hits: [],
10697
- misses: ["Failed to parse agent_judge response as valid evaluation JSON"],
10714
+ misses: ["Failed to parse agent-judge response as valid evaluation JSON"],
10698
10715
  expectedAspectCount: 1,
10699
10716
  evaluatorRawRequest,
10700
10717
  details
@@ -10707,7 +10724,7 @@ var AgentJudgeEvaluator = class {
10707
10724
  */
10708
10725
  buildSystemPrompt(context) {
10709
10726
  const config = context.evaluator;
10710
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
10727
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
10711
10728
  const parts = [
10712
10729
  "You are an expert evaluator with access to the workspace filesystem.",
10713
10730
  "Use the provided tools to investigate the workspace and verify the criteria are met.",
@@ -10738,7 +10755,7 @@ var AgentJudgeEvaluator = class {
10738
10755
  return substituteVariables(this.evaluatorTemplate, variables);
10739
10756
  }
10740
10757
  const config = context.evaluator;
10741
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
10758
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
10742
10759
  const parts = [
10743
10760
  "Evaluate the candidate answer by investigating the workspace.",
10744
10761
  "",
@@ -10781,7 +10798,7 @@ var AgentJudgeEvaluator = class {
10781
10798
  buildDelegatedPrompt(context) {
10782
10799
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
10783
10800
  const config = context.evaluator;
10784
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
10801
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
10785
10802
  if (this.evaluatorTemplate) {
10786
10803
  const variables = {
10787
10804
  [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
@@ -10863,11 +10880,11 @@ function createFilesystemTools(workspacePath) {
10863
10880
  execute: async (input) => {
10864
10881
  try {
10865
10882
  const resolved = resolveSandboxed(workspacePath, input.path);
10866
- const stat7 = await fs2.stat(resolved);
10867
- if (stat7.isDirectory()) {
10883
+ const stat8 = await fs2.stat(resolved);
10884
+ if (stat8.isDirectory()) {
10868
10885
  return { error: `'${input.path}' is a directory, not a file` };
10869
10886
  }
10870
- const buffer = Buffer.alloc(Math.min(stat7.size, MAX_FILE_SIZE));
10887
+ const buffer = Buffer.alloc(Math.min(stat8.size, MAX_FILE_SIZE));
10871
10888
  const fd = await fs2.open(resolved, "r");
10872
10889
  try {
10873
10890
  await fd.read(buffer, 0, buffer.length, 0);
@@ -10875,8 +10892,8 @@ function createFilesystemTools(workspacePath) {
10875
10892
  await fd.close();
10876
10893
  }
10877
10894
  const content = buffer.toString("utf-8");
10878
- const truncated = stat7.size > MAX_FILE_SIZE;
10879
- return { content, truncated, size: stat7.size };
10895
+ const truncated = stat8.size > MAX_FILE_SIZE;
10896
+ return { content, truncated, size: stat8.size };
10880
10897
  } catch (error) {
10881
10898
  return { error: error instanceof Error ? error.message : String(error) };
10882
10899
  }
@@ -10920,8 +10937,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
10920
10937
  const ext = path30.extname(entry.name).toLowerCase();
10921
10938
  if (BINARY_EXTENSIONS.has(ext)) continue;
10922
10939
  try {
10923
- const stat7 = await fs2.stat(fullPath);
10924
- if (stat7.size > MAX_FILE_SIZE) continue;
10940
+ const stat8 = await fs2.stat(fullPath);
10941
+ if (stat8.size > MAX_FILE_SIZE) continue;
10925
10942
  const content = await fs2.readFile(fullPath, "utf-8");
10926
10943
  const lines = content.split("\n");
10927
10944
  for (let i = 0; i < lines.length; i++) {
@@ -11083,7 +11100,7 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
11083
11100
 
11084
11101
  // src/evaluation/evaluators/token-usage.ts
11085
11102
  var TokenUsageEvaluator = class {
11086
- kind = "token_usage";
11103
+ kind = "token-usage";
11087
11104
  config;
11088
11105
  constructor(options) {
11089
11106
  this.config = options.config;
@@ -11106,7 +11123,7 @@ var TokenUsageEvaluator = class {
11106
11123
  expectedAspectCount,
11107
11124
  reasoning: "Token usage not reported by provider",
11108
11125
  evaluatorRawRequest: {
11109
- type: "token_usage",
11126
+ type: "token-usage",
11110
11127
  max_total: maxTotal ?? null,
11111
11128
  max_input: maxInput ?? null,
11112
11129
  max_output: maxOutput ?? null,
@@ -11148,9 +11165,9 @@ var TokenUsageEvaluator = class {
11148
11165
  hits,
11149
11166
  misses,
11150
11167
  expectedAspectCount,
11151
- reasoning: `token_usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
11168
+ reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
11152
11169
  evaluatorRawRequest: {
11153
- type: "token_usage",
11170
+ type: "token-usage",
11154
11171
  max_total: maxTotal ?? null,
11155
11172
  max_input: maxInput ?? null,
11156
11173
  max_output: maxOutput ?? null,
@@ -11235,7 +11252,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
11235
11252
  };
11236
11253
  }
11237
11254
  var ToolTrajectoryEvaluator = class {
11238
- kind = "tool_trajectory";
11255
+ kind = "tool-trajectory";
11239
11256
  config;
11240
11257
  constructor(options) {
11241
11258
  this.config = options.config;
@@ -11423,7 +11440,7 @@ var ToolTrajectoryEvaluator = class {
11423
11440
  }
11424
11441
  }
11425
11442
  for (const warning of warnings) {
11426
- console.warn(`[tool_trajectory] ${warning}`);
11443
+ console.warn(`[tool-trajectory] ${warning}`);
11427
11444
  }
11428
11445
  const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
11429
11446
  const totalAssertions = expected.length + effectiveLatencyAssertions;
@@ -11499,7 +11516,7 @@ var ToolTrajectoryEvaluator = class {
11499
11516
  misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
11500
11517
  }
11501
11518
  for (const warning of warnings) {
11502
- console.warn(`[tool_trajectory] ${warning}`);
11519
+ console.warn(`[tool-trajectory] ${warning}`);
11503
11520
  }
11504
11521
  const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
11505
11522
  const totalAssertions = expected.length + effectiveLatencyAssertions;
@@ -11729,7 +11746,7 @@ function runEqualsAssertion(output, value) {
11729
11746
 
11730
11747
  // src/evaluation/orchestrator.ts
11731
11748
  import { createHash as createHash2, randomUUID as randomUUID7 } from "node:crypto";
11732
- import { mkdir as mkdir12 } from "node:fs/promises";
11749
+ import { mkdir as mkdir12, stat as stat7 } from "node:fs/promises";
11733
11750
  import path37 from "node:path";
11734
11751
  import micromatch4 from "micromatch";
11735
11752
 
@@ -11989,7 +12006,7 @@ var llmJudgeFactory = (config, context) => {
11989
12006
  const c = config;
11990
12007
  const { llmJudge, agentTimeoutMs } = context;
11991
12008
  return {
11992
- kind: "llm_judge",
12009
+ kind: "llm-judge",
11993
12010
  async evaluate(evalContext) {
11994
12011
  const customPrompt = await resolveCustomPrompt(
11995
12012
  c,
@@ -12078,7 +12095,7 @@ var agentJudgeFactory = (config, context) => {
12078
12095
  customPrompt = readFileSync(c.resolvedPromptPath, "utf-8");
12079
12096
  } catch (error) {
12080
12097
  const message = error instanceof Error ? error.message : String(error);
12081
- console.warn(`Could not read agent_judge prompt at ${c.resolvedPromptPath}: ${message}`);
12098
+ console.warn(`Could not read agent-judge prompt at ${c.resolvedPromptPath}: ${message}`);
12082
12099
  }
12083
12100
  } else if (c.prompt) {
12084
12101
  customPrompt = c.prompt;
@@ -12088,7 +12105,7 @@ var agentJudgeFactory = (config, context) => {
12088
12105
  judgeTargetProvider = targetResolver(c.target);
12089
12106
  if (!judgeTargetProvider) {
12090
12107
  throw new Error(
12091
- `agent_judge evaluator '${c.name}': target '${c.target}' not found in targets`
12108
+ `agent-judge evaluator '${c.name}': target '${c.target}' not found in targets`
12092
12109
  );
12093
12110
  }
12094
12111
  }
@@ -12132,7 +12149,7 @@ var regexFactory = (config) => {
12132
12149
  });
12133
12150
  };
12134
12151
  var isJsonFactory = () => {
12135
- return new DeterministicAssertionEvaluator("is_json", (ctx) => {
12152
+ return new DeterministicAssertionEvaluator("is-json", (ctx) => {
12136
12153
  const result = runIsJsonAssertion(ctx.candidate);
12137
12154
  return {
12138
12155
  score: result.score,
@@ -12160,7 +12177,7 @@ var equalsFactory = (config) => {
12160
12177
  };
12161
12178
  var containsAnyFactory = (config) => {
12162
12179
  const c = config;
12163
- return new DeterministicAssertionEvaluator("contains_any", (ctx) => {
12180
+ return new DeterministicAssertionEvaluator("contains-any", (ctx) => {
12164
12181
  const result = runContainsAnyAssertion(ctx.candidate, c.value);
12165
12182
  return {
12166
12183
  score: result.score,
@@ -12174,7 +12191,7 @@ var containsAnyFactory = (config) => {
12174
12191
  };
12175
12192
  var containsAllFactory = (config) => {
12176
12193
  const c = config;
12177
- return new DeterministicAssertionEvaluator("contains_all", (ctx) => {
12194
+ return new DeterministicAssertionEvaluator("contains-all", (ctx) => {
12178
12195
  const result = runContainsAllAssertion(ctx.candidate, c.value);
12179
12196
  return {
12180
12197
  score: result.score,
@@ -12202,7 +12219,7 @@ var icontainsFactory = (config) => {
12202
12219
  };
12203
12220
  var icontainsAnyFactory = (config) => {
12204
12221
  const c = config;
12205
- return new DeterministicAssertionEvaluator("icontains_any", (ctx) => {
12222
+ return new DeterministicAssertionEvaluator("icontains-any", (ctx) => {
12206
12223
  const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
12207
12224
  return {
12208
12225
  score: result.score,
@@ -12216,7 +12233,7 @@ var icontainsAnyFactory = (config) => {
12216
12233
  };
12217
12234
  var icontainsAllFactory = (config) => {
12218
12235
  const c = config;
12219
- return new DeterministicAssertionEvaluator("icontains_all", (ctx) => {
12236
+ return new DeterministicAssertionEvaluator("icontains-all", (ctx) => {
12220
12237
  const result = runIcontainsAllAssertion(ctx.candidate, c.value);
12221
12238
  return {
12222
12239
  score: result.score,
@@ -12230,7 +12247,7 @@ var icontainsAllFactory = (config) => {
12230
12247
  };
12231
12248
  var startsWithFactory = (config) => {
12232
12249
  const c = config;
12233
- return new DeterministicAssertionEvaluator("starts_with", (ctx) => {
12250
+ return new DeterministicAssertionEvaluator("starts-with", (ctx) => {
12234
12251
  const result = runStartsWithAssertion(ctx.candidate, c.value);
12235
12252
  return {
12236
12253
  score: result.score,
@@ -12244,7 +12261,7 @@ var startsWithFactory = (config) => {
12244
12261
  };
12245
12262
  var endsWithFactory = (config) => {
12246
12263
  const c = config;
12247
- return new DeterministicAssertionEvaluator("ends_with", (ctx) => {
12264
+ return new DeterministicAssertionEvaluator("ends-with", (ctx) => {
12248
12265
  const result = runEndsWithAssertion(ctx.candidate, c.value);
12249
12266
  return {
12250
12267
  score: result.score,
@@ -12258,7 +12275,7 @@ var endsWithFactory = (config) => {
12258
12275
  };
12259
12276
  function createBuiltinRegistry() {
12260
12277
  const registry = new EvaluatorRegistry();
12261
- registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("contains_any", containsAnyFactory).register("contains_all", containsAllFactory).register("icontains", icontainsFactory).register("icontains_any", icontainsAnyFactory).register("icontains_all", icontainsAllFactory).register("starts_with", startsWithFactory).register("ends_with", endsWithFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
12278
+ registry.register("llm-judge", llmJudgeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("agent-judge", agentJudgeFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory);
12262
12279
  return registry;
12263
12280
  }
12264
12281
 
@@ -13004,7 +13021,7 @@ async function runEvaluation(options) {
13004
13021
  };
13005
13022
  if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
13006
13023
  throw new Error(
13007
- `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure_base).`
13024
+ `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure-base).`
13008
13025
  );
13009
13026
  }
13010
13027
  const targetResolver = (name) => {
@@ -13075,7 +13092,7 @@ async function runEvaluation(options) {
13075
13092
  const rawTemplate = suiteWorkspace?.template ?? getWorkspaceTemplate(target);
13076
13093
  const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
13077
13094
  const workspaceTemplate = resolvedTemplate?.dir;
13078
- const suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
13095
+ let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
13079
13096
  const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
13080
13097
  const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
13081
13098
  const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
@@ -13096,6 +13113,14 @@ async function runEvaluation(options) {
13096
13113
  const message = error instanceof Error ? error.message : String(error);
13097
13114
  throw new Error(`Failed to create shared workspace: ${message}`);
13098
13115
  }
13116
+ if (suiteWorkspaceFile && sharedWorkspacePath) {
13117
+ const copiedWorkspaceFile = path37.join(sharedWorkspacePath, path37.basename(suiteWorkspaceFile));
13118
+ try {
13119
+ await stat7(copiedWorkspaceFile);
13120
+ suiteWorkspaceFile = copiedWorkspaceFile;
13121
+ } catch {
13122
+ }
13123
+ }
13099
13124
  } else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
13100
13125
  sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
13101
13126
  await mkdir12(sharedWorkspacePath, { recursive: true });
@@ -13574,6 +13599,14 @@ async function runEvalCase(options) {
13574
13599
  "template_error"
13575
13600
  );
13576
13601
  }
13602
+ if (caseWorkspaceFile && workspacePath) {
13603
+ const copiedFile = path37.join(workspacePath, path37.basename(caseWorkspaceFile));
13604
+ try {
13605
+ await stat7(copiedFile);
13606
+ caseWorkspaceFile = copiedFile;
13607
+ } catch {
13608
+ }
13609
+ }
13577
13610
  }
13578
13611
  if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
13579
13612
  workspacePath = getWorkspacePath(evalRunId, evalCase.id);
@@ -14083,8 +14116,8 @@ async function runEvaluatorsForCase(options) {
14083
14116
  workspacePath
14084
14117
  });
14085
14118
  }
14086
- const evaluatorKind = evalCase.evaluator ?? "llm_judge";
14087
- const activeEvaluator = evaluators[evaluatorKind] ?? evaluators.llm_judge;
14119
+ const evaluatorKind = evalCase.evaluator ?? "llm-judge";
14120
+ const activeEvaluator = evaluators[evaluatorKind] ?? evaluators["llm-judge"];
14088
14121
  if (!activeEvaluator) {
14089
14122
  throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
14090
14123
  }
@@ -14167,25 +14200,24 @@ async function runEvaluatorList(options) {
14167
14200
  availableTargets,
14168
14201
  agentTimeoutMs,
14169
14202
  evalFileDir,
14170
- llmJudge: evaluatorRegistry.llm_judge,
14203
+ llmJudge: evaluatorRegistry["llm-judge"],
14171
14204
  registry: typeRegistry
14172
14205
  };
14173
14206
  for (const evaluatorConfig of evaluators ?? []) {
14174
14207
  try {
14175
14208
  const evaluatorInstance = await typeRegistry.create(evaluatorConfig, dispatchContext);
14176
14209
  const score2 = await evaluatorInstance.evaluate(evalContext);
14177
- const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
14178
14210
  const weight = evaluatorConfig.weight ?? 1;
14179
14211
  scored.push({
14180
14212
  score: score2,
14181
14213
  name: evaluatorConfig.name,
14182
- type: resultType,
14214
+ type: evaluatorConfig.type,
14183
14215
  weight,
14184
14216
  ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
14185
14217
  });
14186
14218
  scores.push({
14187
14219
  name: evaluatorConfig.name,
14188
- type: resultType,
14220
+ type: evaluatorConfig.type,
14189
14221
  score: score2.score,
14190
14222
  weight,
14191
14223
  verdict: score2.verdict,
@@ -14207,18 +14239,17 @@ async function runEvaluatorList(options) {
14207
14239
  expectedAspectCount: 1,
14208
14240
  reasoning: message
14209
14241
  };
14210
- const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
14211
14242
  const weight = evaluatorConfig.weight ?? 1;
14212
14243
  scored.push({
14213
14244
  score: fallbackScore,
14214
14245
  name: evaluatorConfig.name ?? "unknown",
14215
- type: resultType ?? "llm_judge",
14246
+ type: evaluatorConfig.type ?? "llm-judge",
14216
14247
  weight,
14217
14248
  ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
14218
14249
  });
14219
14250
  scores.push({
14220
14251
  name: evaluatorConfig.name ?? "unknown",
14221
- type: resultType ?? "llm_judge",
14252
+ type: evaluatorConfig.type ?? "llm-judge",
14222
14253
  score: 0,
14223
14254
  weight,
14224
14255
  verdict: "fail",
@@ -14279,7 +14310,7 @@ function filterEvalCases(evalCases, filter) {
14279
14310
  return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter));
14280
14311
  }
14281
14312
  function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
14282
- const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
14313
+ const llmJudge = overrides?.["llm-judge"] ?? new LlmJudgeEvaluator({
14283
14314
  resolveJudgeProvider: async (context) => {
14284
14315
  if (context.judgeProvider) {
14285
14316
  return context.judgeProvider;
@@ -14289,7 +14320,7 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
14289
14320
  });
14290
14321
  return {
14291
14322
  ...overrides,
14292
- llm_judge: llmJudge
14323
+ "llm-judge": llmJudge
14293
14324
  };
14294
14325
  }
14295
14326
  async function invokeProvider(provider, options) {
@@ -14549,12 +14580,7 @@ async function evaluate(config) {
14549
14580
  };
14550
14581
  }
14551
14582
  function mapAssertionType(type) {
14552
- switch (type) {
14553
- case "code_judge":
14554
- return "code";
14555
- default:
14556
- return type;
14557
- }
14583
+ return type.replace(/_/g, "-");
14558
14584
  }
14559
14585
  function computeSummary(results, durationMs) {
14560
14586
  const total = results.length;