@agentv/core 2.12.0 → 2.14.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -17,7 +17,7 @@ import {
17
17
  readTextFile,
18
18
  resolveFileReference,
19
19
  resolveTargetDefinition
20
- } from "./chunk-7HPKTRFZ.js";
20
+ } from "./chunk-N55K52OO.js";
21
21
  import {
22
22
  OtlpJsonFileExporter
23
23
  } from "./chunk-HFSYZHGF.js";
@@ -396,6 +396,11 @@ async function loadConfig(evalFilePath, repoRoot) {
396
396
  continue;
397
397
  }
398
398
  const config = parsed;
399
+ const requiredVersion = parsed.required_version;
400
+ if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
401
+ logWarning(`Invalid required_version in ${configPath}, expected string`);
402
+ continue;
403
+ }
399
404
  const guidelinePatterns = config.guideline_patterns;
400
405
  if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
401
406
  logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
@@ -419,6 +424,7 @@ async function loadConfig(evalFilePath, repoRoot) {
419
424
  configPath
420
425
  );
421
426
  return {
427
+ required_version: requiredVersion,
422
428
  guideline_patterns: guidelinePatterns,
423
429
  eval_patterns: evalPatterns,
424
430
  execution: executionDefaults
@@ -562,6 +568,22 @@ function extractTotalBudgetUsd(suite) {
562
568
  );
563
569
  return void 0;
564
570
  }
571
/**
 * Reads the optional `execution.fail_on_error` flag (camelCase alias:
 * `execution.failOnError`) from a parsed eval suite.
 *
 * @param {object} suite - Parsed suite object; only `suite.execution` is read.
 * @returns {boolean | undefined} The flag when it is a real boolean; `undefined`
 *   when `execution` is missing or not a plain object, when the flag is unset
 *   or null, or when the flag has a non-boolean value (a warning is logged in
 *   that last case).
 */
function extractFailOnError(suite) {
  const execution = suite.execution;
  if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
    return void 0;
  }
  // snake_case takes precedence; camelCase is only consulted when it is nullish.
  const raw = execution.fail_on_error ?? execution.failOnError;
  if (raw === void 0 || raw === null) {
    return void 0;
  }
  if (typeof raw === "boolean") {
    return raw;
  }
  // Render the rejected value so its type is visible: a quoted string "true"
  // must not be printed as a bare `true` (which would make the message read as
  // self-contradictory), and objects must not collapse to "[object Object]".
  let rendered;
  try {
    rendered = JSON.stringify(raw) ?? String(raw);
  } catch {
    rendered = String(raw);
  }
  logWarning(`Invalid execution.fail_on_error: ${rendered}. Must be true or false. Ignoring.`);
  return void 0;
}
565
587
  function parseExecutionDefaults(raw, configPath) {
566
588
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
567
589
  return void 0;
@@ -660,6 +682,9 @@ function validateTemplateVariables(content, source) {
660
682
  // src/evaluation/loaders/evaluator-parser.ts
661
683
  var ANSI_YELLOW4 = "\x1B[33m";
662
684
  var ANSI_RESET4 = "\x1B[0m";
685
/**
 * Converts an evaluator type name to its hyphenated spelling by replacing
 * every underscore with a hyphen (e.g. "llm_judge" becomes "llm-judge").
 *
 * @param {string} type - Evaluator type name; callers pass strings only.
 * @returns {string} The name with all "_" characters replaced by "-".
 */
function normalizeEvaluatorType(type) {
  const hyphenated = type.replace(/_/g, "-");
  return hyphenated;
}
663
688
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
664
689
  const execution = rawEvalCase.execution;
665
690
  const executionObject = isJsonObject2(execution) ? execution : void 0;
@@ -690,7 +715,8 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
690
715
  continue;
691
716
  }
692
717
  const rawName = asString(rawEvaluator.name);
693
- const typeValue = rawEvaluator.type;
718
+ const rawType = rawEvaluator.type;
719
+ const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
694
720
  const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
695
721
  if (typeof typeValue !== "string") {
696
722
  logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
@@ -723,25 +749,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
723
749
  });
724
750
  continue;
725
751
  }
726
- if (typeValue === "code_judge") {
752
+ if (typeValue === "code-judge") {
727
753
  let command;
728
754
  const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
729
755
  if (typeof rawCommand === "string") {
730
756
  const trimmed = rawCommand.trim();
731
757
  if (trimmed.length === 0) {
732
758
  throw new Error(
733
- `Invalid code_judge command for evaluator '${name}' in '${evalId}': command cannot be empty`
759
+ `Invalid code-judge command for evaluator '${name}' in '${evalId}': command cannot be empty`
734
760
  );
735
761
  }
736
762
  command = parseCommandToArgv(trimmed);
737
763
  } else {
738
764
  command = asStringArray(
739
765
  rawCommand,
740
- `code_judge command for evaluator '${name}' in '${evalId}'`
766
+ `code-judge command for evaluator '${name}' in '${evalId}'`
741
767
  );
742
768
  }
743
769
  if (!command) {
744
- logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing command`);
770
+ logWarning2(`Skipping code-judge evaluator '${name}' in '${evalId}': missing command`);
745
771
  continue;
746
772
  }
747
773
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
@@ -802,7 +828,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
802
828
  }
803
829
  evaluators.push({
804
830
  name,
805
- type: "code",
831
+ type: "code-judge",
806
832
  command,
807
833
  cwd,
808
834
  resolvedCwd,
@@ -828,7 +854,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
828
854
  continue;
829
855
  }
830
856
  const aggregatorType = asString(rawAggregator.type);
831
- if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge" && aggregatorType !== "threshold") {
857
+ if (aggregatorType !== "weighted_average" && aggregatorType !== "code-judge" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
832
858
  logWarning2(
833
859
  `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
834
860
  );
@@ -877,16 +903,16 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
877
903
  type: "weighted_average",
878
904
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
879
905
  };
880
- } else if (aggregatorType === "code_judge") {
906
+ } else if (aggregatorType === "code-judge") {
881
907
  const aggregatorPath = asString(rawAggregator.path);
882
908
  if (!aggregatorPath) {
883
909
  logWarning2(
884
- `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
910
+ `Skipping composite evaluator '${name}' in '${evalId}': code-judge aggregator missing path`
885
911
  );
886
912
  continue;
887
913
  }
888
914
  aggregator = {
889
- type: "code_judge",
915
+ type: "code-judge",
890
916
  path: aggregatorPath,
891
917
  cwd: searchRoots[0]
892
918
  };
@@ -912,7 +938,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
912
938
  }
913
939
  }
914
940
  aggregator = {
915
- type: "llm_judge",
941
+ type: "llm-judge",
916
942
  ...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
917
943
  ...promptPath2 ? { promptPath: promptPath2 } : {}
918
944
  };
@@ -930,11 +956,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
930
956
  });
931
957
  continue;
932
958
  }
933
- if (typeValue === "tool_trajectory") {
959
+ if (typeValue === "tool-trajectory") {
934
960
  const mode = asString(rawEvaluator.mode);
935
961
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact" && mode !== "subset" && mode !== "superset") {
936
962
  logWarning2(
937
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
963
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
938
964
  );
939
965
  continue;
940
966
  }
@@ -943,7 +969,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
943
969
  if (rawMinimums !== void 0) {
944
970
  if (!isJsonObject2(rawMinimums)) {
945
971
  logWarning2(
946
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
972
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
947
973
  );
948
974
  continue;
949
975
  }
@@ -969,7 +995,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
969
995
  argsMatch2 = rawArgsMatch;
970
996
  } else {
971
997
  logWarning2(
972
- `Invalid args_match '${rawArgsMatch}' for tool_trajectory evaluator '${name}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
998
+ `Invalid args_match '${rawArgsMatch}' for tool-trajectory evaluator '${name}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
973
999
  );
974
1000
  }
975
1001
  }
@@ -979,7 +1005,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
979
1005
  if (rawExpected !== void 0) {
980
1006
  if (!Array.isArray(rawExpected)) {
981
1007
  logWarning2(
982
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
1008
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': expected must be an array`
983
1009
  );
984
1010
  continue;
985
1011
  }
@@ -1025,13 +1051,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1025
1051
  }
1026
1052
  if (mode === "any_order" && !minimums) {
1027
1053
  logWarning2(
1028
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
1054
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
1029
1055
  );
1030
1056
  continue;
1031
1057
  }
1032
1058
  if ((mode === "in_order" || mode === "exact" || mode === "subset" || mode === "superset") && !expected) {
1033
1059
  logWarning2(
1034
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
1060
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
1035
1061
  );
1036
1062
  continue;
1037
1063
  }
@@ -1039,7 +1065,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1039
1065
  const required2 = parseRequired(rawEvaluator.required);
1040
1066
  const config2 = {
1041
1067
  name,
1042
- type: "tool_trajectory",
1068
+ type: "tool-trajectory",
1043
1069
  mode,
1044
1070
  ...minimums ? { minimums } : {},
1045
1071
  ...expected ? { expected } : {},
@@ -1051,17 +1077,17 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1051
1077
  evaluators.push(config2);
1052
1078
  continue;
1053
1079
  }
1054
- if (typeValue === "field_accuracy") {
1080
+ if (typeValue === "field-accuracy") {
1055
1081
  const rawFields = rawEvaluator.fields;
1056
1082
  if (!Array.isArray(rawFields)) {
1057
1083
  logWarning2(
1058
- `Skipping field_accuracy evaluator '${name}' in '${evalId}': missing fields array`
1084
+ `Skipping field-accuracy evaluator '${name}' in '${evalId}': missing fields array`
1059
1085
  );
1060
1086
  continue;
1061
1087
  }
1062
1088
  if (rawFields.length === 0) {
1063
1089
  logWarning2(
1064
- `Skipping field_accuracy evaluator '${name}' in '${evalId}': fields array is empty`
1090
+ `Skipping field-accuracy evaluator '${name}' in '${evalId}': fields array is empty`
1065
1091
  );
1066
1092
  continue;
1067
1093
  }
@@ -1069,7 +1095,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1069
1095
  for (const rawField of rawFields) {
1070
1096
  if (!isJsonObject2(rawField)) {
1071
1097
  logWarning2(
1072
- `Skipping invalid field entry in field_accuracy evaluator '${name}' (expected object)`
1098
+ `Skipping invalid field entry in field-accuracy evaluator '${name}' (expected object)`
1073
1099
  );
1074
1100
  continue;
1075
1101
  }
@@ -1077,13 +1103,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1077
1103
  const match = asString(rawField.match);
1078
1104
  if (!fieldPath) {
1079
1105
  logWarning2(
1080
- `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
1106
+ `Skipping field without path in field-accuracy evaluator '${name}' in '${evalId}'`
1081
1107
  );
1082
1108
  continue;
1083
1109
  }
1084
1110
  if (!match || !isValidFieldMatchType(match)) {
1085
1111
  logWarning2(
1086
- `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code_judge evaluator.`
1112
+ `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code-judge evaluator.`
1087
1113
  );
1088
1114
  continue;
1089
1115
  }
@@ -1100,7 +1126,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1100
1126
  }
1101
1127
  if (fields.length === 0) {
1102
1128
  logWarning2(
1103
- `Skipping field_accuracy evaluator '${name}' in '${evalId}': no valid fields found`
1129
+ `Skipping field-accuracy evaluator '${name}' in '${evalId}': no valid fields found`
1104
1130
  );
1105
1131
  continue;
1106
1132
  }
@@ -1110,7 +1136,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1110
1136
  const required2 = parseRequired(rawEvaluator.required);
1111
1137
  evaluators.push({
1112
1138
  name,
1113
- type: "field_accuracy",
1139
+ type: "field-accuracy",
1114
1140
  fields,
1115
1141
  ...validAggregation ? { aggregation: validAggregation } : {},
1116
1142
  ...weight2 !== void 0 ? { weight: weight2 } : {},
@@ -1159,7 +1185,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1159
1185
  });
1160
1186
  continue;
1161
1187
  }
1162
- if (typeValue === "token_usage") {
1188
+ if (typeValue === "token-usage") {
1163
1189
  const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
1164
1190
  const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
1165
1191
  const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
@@ -1173,7 +1199,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1173
1199
  if (raw === void 0) continue;
1174
1200
  if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
1175
1201
  logWarning2(
1176
- `Skipping token_usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
1202
+ `Skipping token-usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
1177
1203
  );
1178
1204
  continue;
1179
1205
  }
@@ -1181,7 +1207,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1181
1207
  }
1182
1208
  if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
1183
1209
  logWarning2(
1184
- `Skipping token_usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
1210
+ `Skipping token-usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
1185
1211
  );
1186
1212
  continue;
1187
1213
  }
@@ -1189,7 +1215,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1189
1215
  const required2 = parseRequired(rawEvaluator.required);
1190
1216
  evaluators.push({
1191
1217
  name,
1192
- type: "token_usage",
1218
+ type: "token-usage",
1193
1219
  ...validLimits,
1194
1220
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1195
1221
  ...required2 !== void 0 ? { required: required2 } : {},
@@ -1197,7 +1223,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1197
1223
  });
1198
1224
  continue;
1199
1225
  }
1200
- if (typeValue === "execution_metrics") {
1226
+ if (typeValue === "execution-metrics") {
1201
1227
  const maxToolCalls = rawEvaluator.max_tool_calls ?? rawEvaluator.maxToolCalls;
1202
1228
  const maxLlmCalls = rawEvaluator.max_llm_calls ?? rawEvaluator.maxLlmCalls;
1203
1229
  const maxTokens = rawEvaluator.max_tokens ?? rawEvaluator.maxTokens;
@@ -1220,7 +1246,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1220
1246
  if (raw === void 0) continue;
1221
1247
  if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
1222
1248
  logWarning2(
1223
- `Skipping execution_metrics evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
1249
+ `Skipping execution-metrics evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
1224
1250
  );
1225
1251
  hasError = true;
1226
1252
  break;
@@ -1233,7 +1259,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1233
1259
  const hasThreshold = validThresholds.max_tool_calls !== void 0 || validThresholds.max_llm_calls !== void 0 || validThresholds.max_tokens !== void 0 || validThresholds.max_cost_usd !== void 0 || validThresholds.max_duration_ms !== void 0 || validThresholds.target_exploration_ratio !== void 0;
1234
1260
  if (!hasThreshold) {
1235
1261
  logWarning2(
1236
- `Skipping execution_metrics evaluator '${name}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
1262
+ `Skipping execution-metrics evaluator '${name}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
1237
1263
  );
1238
1264
  continue;
1239
1265
  }
@@ -1241,7 +1267,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1241
1267
  const required2 = parseRequired(rawEvaluator.required);
1242
1268
  evaluators.push({
1243
1269
  name,
1244
- type: "execution_metrics",
1270
+ type: "execution-metrics",
1245
1271
  ...validThresholds,
1246
1272
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1247
1273
  ...required2 !== void 0 ? { required: required2 } : {},
@@ -1249,13 +1275,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1249
1275
  });
1250
1276
  continue;
1251
1277
  }
1252
- if (typeValue === "agent_judge") {
1278
+ if (typeValue === "agent-judge") {
1253
1279
  const rawMaxSteps = rawEvaluator.max_steps ?? rawEvaluator.maxSteps;
1254
1280
  let maxSteps;
1255
1281
  if (rawMaxSteps !== void 0) {
1256
1282
  if (typeof rawMaxSteps !== "number" || !Number.isInteger(rawMaxSteps) || rawMaxSteps < 1 || rawMaxSteps > 50) {
1257
1283
  logWarning2(
1258
- `Skipping agent_judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`
1284
+ `Skipping agent-judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`
1259
1285
  );
1260
1286
  continue;
1261
1287
  }
@@ -1266,7 +1292,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1266
1292
  if (rawTemperature !== void 0) {
1267
1293
  if (typeof rawTemperature !== "number" || rawTemperature < 0 || rawTemperature > 2) {
1268
1294
  logWarning2(
1269
- `Skipping agent_judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`
1295
+ `Skipping agent-judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`
1270
1296
  );
1271
1297
  continue;
1272
1298
  }
@@ -1289,7 +1315,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1289
1315
  const required2 = parseRequired(rawEvaluator.required);
1290
1316
  evaluators.push({
1291
1317
  name,
1292
- type: "agent_judge",
1318
+ type: "agent-judge",
1293
1319
  ...agentPrompt ? { prompt: agentPrompt } : {},
1294
1320
  ...agentPromptPath ? { promptPath: agentPromptPath, resolvedPromptPath: agentPromptPath } : {},
1295
1321
  ...agentParsedRubrics && agentParsedRubrics.length > 0 ? { rubrics: agentParsedRubrics } : {},
@@ -1320,7 +1346,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1320
1346
  });
1321
1347
  continue;
1322
1348
  }
1323
- if (typeValue === "contains_any" || typeValue === "contains_all") {
1349
+ if (typeValue === "contains-any" || typeValue === "contains-all") {
1324
1350
  const value = asStringArrayStrict(rawEvaluator.value);
1325
1351
  if (!value || value.length === 0) {
1326
1352
  logWarning2(
@@ -1358,7 +1384,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1358
1384
  });
1359
1385
  continue;
1360
1386
  }
1361
- if (typeValue === "icontains_any" || typeValue === "icontains_all") {
1387
+ if (typeValue === "icontains-any" || typeValue === "icontains-all") {
1362
1388
  const value = asStringArrayStrict(rawEvaluator.value);
1363
1389
  if (!value || value.length === 0) {
1364
1390
  logWarning2(
@@ -1378,7 +1404,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1378
1404
  });
1379
1405
  continue;
1380
1406
  }
1381
- if (typeValue === "starts_with" || typeValue === "ends_with") {
1407
+ if (typeValue === "starts-with" || typeValue === "ends-with") {
1382
1408
  const value = asString(rawEvaluator.value);
1383
1409
  if (!value) {
1384
1410
  logWarning2(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
@@ -1416,12 +1442,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1416
1442
  });
1417
1443
  continue;
1418
1444
  }
1419
- if (typeValue === "is_json") {
1445
+ if (typeValue === "is-json") {
1420
1446
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1421
1447
  const required2 = parseRequired(rawEvaluator.required);
1422
1448
  evaluators.push({
1423
1449
  name,
1424
- type: "is_json",
1450
+ type: "is-json",
1425
1451
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1426
1452
  ...required2 !== void 0 ? { required: required2 } : {},
1427
1453
  ...negate !== void 0 ? { negate } : {}
@@ -1469,7 +1495,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1469
1495
  const required2 = parseRequired(rawEvaluator.required);
1470
1496
  evaluators.push({
1471
1497
  name,
1472
- type: "llm_judge",
1498
+ type: "llm-judge",
1473
1499
  rubrics: parsedCriteria,
1474
1500
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1475
1501
  ...required2 !== void 0 ? { required: required2 } : {},
@@ -1536,7 +1562,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1536
1562
  const required2 = parseRequired(rawEvaluator.required);
1537
1563
  evaluators.push({
1538
1564
  name,
1539
- type: "llm_judge",
1565
+ type: "llm-judge",
1540
1566
  rubrics: parsedRubrics,
1541
1567
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1542
1568
  ...required2 !== void 0 ? { required: required2 } : {},
@@ -1568,7 +1594,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1568
1594
  const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
1569
1595
  evaluators.push({
1570
1596
  name,
1571
- type: "llm_judge",
1597
+ type: "llm-judge",
1572
1598
  prompt,
1573
1599
  promptPath,
1574
1600
  ...promptPath ? { resolvedPromptPath: promptPath } : {},
@@ -1584,15 +1610,15 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1584
1610
  }
1585
1611
  var ASSERTION_TYPES = /* @__PURE__ */ new Set([
1586
1612
  "contains",
1587
- "contains_any",
1588
- "contains_all",
1613
+ "contains-any",
1614
+ "contains-all",
1589
1615
  "icontains",
1590
- "icontains_any",
1591
- "icontains_all",
1592
- "starts_with",
1593
- "ends_with",
1616
+ "icontains-any",
1617
+ "icontains-all",
1618
+ "starts-with",
1619
+ "ends-with",
1594
1620
  "regex",
1595
- "is_json",
1621
+ "is-json",
1596
1622
  "equals",
1597
1623
  "rubrics"
1598
1624
  ]);
@@ -1605,24 +1631,24 @@ function generateAssertionName(typeValue, rawEvaluator) {
1605
1631
  switch (typeValue) {
1606
1632
  case "contains":
1607
1633
  return value ? `contains-${value}` : "contains";
1608
- case "contains_any":
1609
- return arrayValue ? `contains_any-${arrayValue.length}` : "contains_any";
1610
- case "contains_all":
1611
- return arrayValue ? `contains_all-${arrayValue.length}` : "contains_all";
1634
+ case "contains-any":
1635
+ return arrayValue ? `contains-any-${arrayValue.length}` : "contains-any";
1636
+ case "contains-all":
1637
+ return arrayValue ? `contains-all-${arrayValue.length}` : "contains-all";
1612
1638
  case "icontains":
1613
1639
  return value ? `icontains-${value}` : "icontains";
1614
- case "icontains_any":
1615
- return arrayValue ? `icontains_any-${arrayValue.length}` : "icontains_any";
1616
- case "icontains_all":
1617
- return arrayValue ? `icontains_all-${arrayValue.length}` : "icontains_all";
1618
- case "starts_with":
1619
- return value ? `starts_with-${value}` : "starts_with";
1620
- case "ends_with":
1621
- return value ? `ends_with-${value}` : "ends_with";
1640
+ case "icontains-any":
1641
+ return arrayValue ? `icontains-any-${arrayValue.length}` : "icontains-any";
1642
+ case "icontains-all":
1643
+ return arrayValue ? `icontains-all-${arrayValue.length}` : "icontains-all";
1644
+ case "starts-with":
1645
+ return value ? `starts-with-${value}` : "starts-with";
1646
+ case "ends-with":
1647
+ return value ? `ends-with-${value}` : "ends-with";
1622
1648
  case "regex":
1623
1649
  return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
1624
- case "is_json":
1625
- return "is_json";
1650
+ case "is-json":
1651
+ return "is-json";
1626
1652
  case "equals":
1627
1653
  return value ? `equals-${value}` : "equals";
1628
1654
  case "rubrics":
@@ -1635,8 +1661,9 @@ function coerceEvaluator(candidate, contextId) {
1635
1661
  if (typeof candidate !== "string") {
1636
1662
  return void 0;
1637
1663
  }
1638
- if (isEvaluatorKind(candidate)) {
1639
- return candidate;
1664
+ const normalized = normalizeEvaluatorType(candidate);
1665
+ if (isEvaluatorKind(normalized)) {
1666
+ return normalized;
1640
1667
  }
1641
1668
  logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
1642
1669
  return void 0;
@@ -1682,6 +1709,16 @@ function parseCommandToArgv(command) {
1682
1709
  function isJsonObject2(value) {
1683
1710
  return typeof value === "object" && value !== null && !Array.isArray(value);
1684
1711
  }
1712
// Evaluator types that count as "consuming" a test's criteria text for the
// purposes of the warning below.
var CRITERIA_CONSUMER_TYPES = /* @__PURE__ */ new Set(["llm-judge", "agent-judge", "code-judge"]);
/**
 * Logs a warning when a test defines non-blank criteria but none of its
 * evaluators has a type listed in CRITERIA_CONSUMER_TYPES.
 *
 * This helper is advisory only, so it never throws on malformed input:
 * non-string criteria and non-array evaluator lists are treated as
 * "nothing to check", and evaluator entries without a `type` are simply
 * not counted as consumers.
 *
 * @param {unknown} criteria - The test's criteria text, if any.
 * @param {unknown} evaluators - Parsed evaluator configs for the test, if any.
 * @param {string} testId - Test identifier used in the warning message.
 * @returns {void}
 */
function warnUnconsumedCriteria(criteria, evaluators, testId) {
  // Guard on the type first: calling .trim() on a number/object would throw.
  if (typeof criteria !== "string" || criteria.trim().length === 0) return;
  if (!Array.isArray(evaluators) || evaluators.length === 0) return;
  const hasConsumer = evaluators.some((e) => CRITERIA_CONSUMER_TYPES.has(e?.type));
  if (!hasConsumer) {
    logWarning2(
      `Test '${testId}': criteria is defined but no evaluator in assert will evaluate it. Add 'type: llm-judge' to assert, or remove criteria if it is documentation-only.`
    );
  }
}
1685
1722
  function logWarning2(message, details) {
1686
1723
  if (details && details.length > 0) {
1687
1724
  const detailBlock = details.join("\n");
@@ -1931,7 +1968,7 @@ function parseInlineRubrics(rawRubrics) {
1931
1968
  }
1932
1969
  return {
1933
1970
  name: "rubric",
1934
- type: "llm_judge",
1971
+ type: "llm-judge",
1935
1972
  rubrics: rubricItems
1936
1973
  };
1937
1974
  }
@@ -2316,7 +2353,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2316
2353
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
2317
2354
  const fallbackDataset = path6.basename(absoluteTestPath, ".jsonl") || "eval";
2318
2355
  const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
2319
- const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm_judge";
2356
+ const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-judge";
2320
2357
  const globalExecution = sidecar.execution;
2321
2358
  if (verbose) {
2322
2359
  console.log(`
@@ -2404,6 +2441,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2404
2441
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
2405
2442
  }
2406
2443
  }
2444
+ warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
2407
2445
  const userFilePaths = [];
2408
2446
  for (const segment of inputSegments) {
2409
2447
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -2757,13 +2795,15 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
2757
2795
  }
2758
2796
  const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
2759
2797
  const metadata = parseMetadata(parsed);
2798
+ const failOnError = extractFailOnError(parsed);
2760
2799
  return {
2761
2800
  tests,
2762
2801
  trials: extractTrialsConfig(parsed),
2763
2802
  targets: extractTargetsFromSuite(parsed),
2764
2803
  cacheConfig: extractCacheConfig(parsed),
2765
2804
  totalBudgetUsd: extractTotalBudgetUsd(parsed),
2766
- ...metadata !== void 0 && { metadata }
2805
+ ...metadata !== void 0 && { metadata },
2806
+ ...failOnError !== void 0 && { failOnError }
2767
2807
  };
2768
2808
  }
2769
2809
  var loadEvalSuite = loadTestSuite;
@@ -2794,7 +2834,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2794
2834
  const fallbackDataset = path8.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
2795
2835
  const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
2796
2836
  const rawTestcases = resolveTests(suite);
2797
- const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
2837
+ const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-judge";
2798
2838
  const evalFileDir = path8.dirname(absoluteTestPath);
2799
2839
  let expandedTestcases;
2800
2840
  if (typeof rawTestcases === "string") {
@@ -2891,6 +2931,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2891
2931
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
2892
2932
  }
2893
2933
  }
2934
+ warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
2894
2935
  const userFilePaths = [];
2895
2936
  for (const segment of inputSegments) {
2896
2937
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -8871,7 +8912,7 @@ function toCamelCaseDeep(obj) {
8871
8912
  // src/evaluation/evaluators/code-evaluator.ts
8872
8913
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
8873
8914
  var CodeEvaluator = class {
8874
- kind = "code";
8915
+ kind = "code-judge";
8875
8916
  command;
8876
8917
  cwd;
8877
8918
  agentTimeoutMs;
@@ -9079,7 +9120,7 @@ var scoreRangeEvaluationSchema = z3.object({
9079
9120
  overall_reasoning: z3.string().describe("Overall assessment summary (1-2 sentences)").optional()
9080
9121
  });
9081
9122
  var LlmJudgeEvaluator = class {
9082
- kind = "llm_judge";
9123
+ kind = "llm-judge";
9083
9124
  resolveJudgeProvider;
9084
9125
  maxOutputTokens;
9085
9126
  temperature;
@@ -9096,7 +9137,7 @@ var LlmJudgeEvaluator = class {
9096
9137
  throw new Error("No judge provider available for LLM grading");
9097
9138
  }
9098
9139
  const config = context.evaluator;
9099
- if (config?.type === "llm_judge" && config.rubrics && config.rubrics.length > 0) {
9140
+ if (config?.type === "llm-judge" && config.rubrics && config.rubrics.length > 0) {
9100
9141
  return this.evaluateWithRubrics(context, judgeProvider, config.rubrics);
9101
9142
  }
9102
9143
  return this.evaluateFreeform(context, judgeProvider);
@@ -9170,7 +9211,7 @@ ${context.fileChanges}`;
9170
9211
  async evaluateWithRubrics(context, judgeProvider, rubrics) {
9171
9212
  if (!rubrics || rubrics.length === 0) {
9172
9213
  throw new Error(
9173
- `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
9214
+ `No rubrics found for evaluator "${context.evaluator?.name ?? "llm-judge"}". Run "agentv generate rubrics" first.`
9174
9215
  );
9175
9216
  }
9176
9217
  const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
@@ -9506,9 +9547,9 @@ var CompositeEvaluator = class {
9506
9547
  async aggregate(results, context) {
9507
9548
  const aggregator = this.config.aggregator;
9508
9549
  switch (aggregator.type) {
9509
- case "code_judge":
9550
+ case "code-judge":
9510
9551
  return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
9511
- case "llm_judge":
9552
+ case "llm-judge":
9512
9553
  return this.runLlmAggregator(results, context, aggregator);
9513
9554
  case "threshold":
9514
9555
  return this.runThreshold(results, aggregator.threshold);
@@ -9651,7 +9692,7 @@ var CompositeEvaluator = class {
9651
9692
  expectedAspectCount: hits.length + misses.length || 1,
9652
9693
  reasoning,
9653
9694
  evaluatorRawRequest: {
9654
- aggregator: "code_judge",
9695
+ aggregator: "code-judge",
9655
9696
  script: scriptPath
9656
9697
  },
9657
9698
  scores
@@ -9666,7 +9707,7 @@ var CompositeEvaluator = class {
9666
9707
  expectedAspectCount: 1,
9667
9708
  reasoning: message,
9668
9709
  evaluatorRawRequest: {
9669
- aggregator: "code_judge",
9710
+ aggregator: "code-judge",
9670
9711
  script: scriptPath,
9671
9712
  error: message
9672
9713
  },
@@ -9697,7 +9738,7 @@ var CompositeEvaluator = class {
9697
9738
  const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
9698
9739
  const systemPrompt = buildOutputSchema();
9699
9740
  const evaluatorRawRequest = {
9700
- aggregator: "llm_judge",
9741
+ aggregator: "llm-judge",
9701
9742
  userPrompt,
9702
9743
  systemPrompt,
9703
9744
  target: judgeProvider.targetName
@@ -9809,7 +9850,7 @@ var CostEvaluator = class {
9809
9850
 
9810
9851
  // src/evaluation/evaluators/execution-metrics.ts
9811
9852
  var ExecutionMetricsEvaluator = class {
9812
- kind = "execution_metrics";
9853
+ kind = "execution-metrics";
9813
9854
  config;
9814
9855
  constructor(options) {
9815
9856
  this.config = options.config;
@@ -9835,7 +9876,7 @@ var ExecutionMetricsEvaluator = class {
9835
9876
  expectedAspectCount: 1,
9836
9877
  reasoning: "Execution metrics not available - no trace summary provided",
9837
9878
  evaluatorRawRequest: {
9838
- type: "execution_metrics",
9879
+ type: "execution-metrics",
9839
9880
  config: this.extractConfiguredThresholds(),
9840
9881
  actual: null
9841
9882
  }
@@ -9944,7 +9985,7 @@ var ExecutionMetricsEvaluator = class {
9944
9985
  if (actualMetrics.exploration_ratio !== void 0) {
9945
9986
  reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
9946
9987
  }
9947
- const reasoning = reasoningParts.length > 0 ? `execution_metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
9988
+ const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
9948
9989
  return {
9949
9990
  score,
9950
9991
  verdict: scoreToVerdict(score),
@@ -9953,7 +9994,7 @@ var ExecutionMetricsEvaluator = class {
9953
9994
  expectedAspectCount: totalChecks || 1,
9954
9995
  reasoning,
9955
9996
  evaluatorRawRequest: {
9956
- type: "execution_metrics",
9997
+ type: "execution-metrics",
9957
9998
  config: this.extractConfiguredThresholds(),
9958
9999
  actual: this.filterDefinedMetrics(actualMetrics)
9959
10000
  }
@@ -10041,7 +10082,7 @@ var MONTH_NAMES = {
10041
10082
  december: 11
10042
10083
  };
10043
10084
  var FieldAccuracyEvaluator = class {
10044
- kind = "field_accuracy";
10085
+ kind = "field-accuracy";
10045
10086
  config;
10046
10087
  constructor(options) {
10047
10088
  this.config = options.config;
@@ -10495,7 +10536,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
10495
10536
  ".dylib"
10496
10537
  ]);
10497
10538
  var AgentJudgeEvaluator = class {
10498
- kind = "agent_judge";
10539
+ kind = "agent-judge";
10499
10540
  resolveJudgeProvider;
10500
10541
  maxSteps;
10501
10542
  temperature;
@@ -10520,24 +10561,24 @@ var AgentJudgeEvaluator = class {
10520
10561
  async evaluateBuiltIn(context) {
10521
10562
  const judgeProvider = await this.resolveJudgeProvider(context);
10522
10563
  if (!judgeProvider) {
10523
- throw new Error("No judge provider available for agent_judge evaluation");
10564
+ throw new Error("No judge provider available for agent-judge evaluation");
10524
10565
  }
10525
10566
  const model = judgeProvider.asLanguageModel?.();
10526
10567
  if (!model) {
10527
10568
  throw new Error(
10528
- `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent_judge mode`
10569
+ `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent-judge mode`
10529
10570
  );
10530
10571
  }
10531
10572
  const workspacePath = context.workspacePath;
10532
10573
  if (!workspacePath) {
10533
10574
  throw new Error(
10534
- "agent_judge evaluator requires a workspace_template target (workspacePath is not set)"
10575
+ "agent-judge evaluator requires a workspace_template target (workspacePath is not set)"
10535
10576
  );
10536
10577
  }
10537
10578
  const systemPrompt = this.buildSystemPrompt(context);
10538
10579
  const userPrompt = this.buildUserPrompt(context);
10539
10580
  const config = context.evaluator;
10540
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
10581
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
10541
10582
  const fsTools = createFilesystemTools(workspacePath);
10542
10583
  const evaluatorRawRequest = {
10543
10584
  mode: "built-in",
@@ -10568,7 +10609,7 @@ var AgentJudgeEvaluator = class {
10568
10609
  score: 0,
10569
10610
  verdict: "fail",
10570
10611
  hits: [],
10571
- misses: [`agent_judge built-in evaluation failed: ${message}`],
10612
+ misses: [`agent-judge built-in evaluation failed: ${message}`],
10572
10613
  expectedAspectCount: 1,
10573
10614
  evaluatorRawRequest,
10574
10615
  details: { mode: "built-in", error: message }
@@ -10600,14 +10641,14 @@ var AgentJudgeEvaluator = class {
10600
10641
  score: 0,
10601
10642
  verdict: "fail",
10602
10643
  hits: [],
10603
- misses: ["agent_judge judge_target returned no assistant response"],
10644
+ misses: ["agent-judge judge_target returned no assistant response"],
10604
10645
  expectedAspectCount: 1,
10605
10646
  evaluatorRawRequest,
10606
10647
  details: { mode: "judge_target", judge_target: provider.targetName }
10607
10648
  };
10608
10649
  }
10609
10650
  const config = context.evaluator;
10610
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
10651
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
10611
10652
  const details = {
10612
10653
  mode: "judge_target",
10613
10654
  judge_target: provider.targetName
@@ -10619,7 +10660,7 @@ var AgentJudgeEvaluator = class {
10619
10660
  score: 0,
10620
10661
  verdict: "fail",
10621
10662
  hits: [],
10622
- misses: [`agent_judge judge_target evaluation failed: ${message}`],
10663
+ misses: [`agent-judge judge_target evaluation failed: ${message}`],
10623
10664
  expectedAspectCount: 1,
10624
10665
  evaluatorRawRequest,
10625
10666
  details: {
@@ -10670,7 +10711,7 @@ var AgentJudgeEvaluator = class {
10670
10711
  score: 0,
10671
10712
  verdict: "fail",
10672
10713
  hits: [],
10673
- misses: ["Failed to parse agent_judge response as valid evaluation JSON"],
10714
+ misses: ["Failed to parse agent-judge response as valid evaluation JSON"],
10674
10715
  expectedAspectCount: 1,
10675
10716
  evaluatorRawRequest,
10676
10717
  details
@@ -10683,7 +10724,7 @@ var AgentJudgeEvaluator = class {
10683
10724
  */
10684
10725
  buildSystemPrompt(context) {
10685
10726
  const config = context.evaluator;
10686
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
10727
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
10687
10728
  const parts = [
10688
10729
  "You are an expert evaluator with access to the workspace filesystem.",
10689
10730
  "Use the provided tools to investigate the workspace and verify the criteria are met.",
@@ -10714,7 +10755,7 @@ var AgentJudgeEvaluator = class {
10714
10755
  return substituteVariables(this.evaluatorTemplate, variables);
10715
10756
  }
10716
10757
  const config = context.evaluator;
10717
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
10758
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
10718
10759
  const parts = [
10719
10760
  "Evaluate the candidate answer by investigating the workspace.",
10720
10761
  "",
@@ -10757,7 +10798,7 @@ var AgentJudgeEvaluator = class {
10757
10798
  buildDelegatedPrompt(context) {
10758
10799
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
10759
10800
  const config = context.evaluator;
10760
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
10801
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
10761
10802
  if (this.evaluatorTemplate) {
10762
10803
  const variables = {
10763
10804
  [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
@@ -10839,11 +10880,11 @@ function createFilesystemTools(workspacePath) {
10839
10880
  execute: async (input) => {
10840
10881
  try {
10841
10882
  const resolved = resolveSandboxed(workspacePath, input.path);
10842
- const stat7 = await fs2.stat(resolved);
10843
- if (stat7.isDirectory()) {
10883
+ const stat8 = await fs2.stat(resolved);
10884
+ if (stat8.isDirectory()) {
10844
10885
  return { error: `'${input.path}' is a directory, not a file` };
10845
10886
  }
10846
- const buffer = Buffer.alloc(Math.min(stat7.size, MAX_FILE_SIZE));
10887
+ const buffer = Buffer.alloc(Math.min(stat8.size, MAX_FILE_SIZE));
10847
10888
  const fd = await fs2.open(resolved, "r");
10848
10889
  try {
10849
10890
  await fd.read(buffer, 0, buffer.length, 0);
@@ -10851,8 +10892,8 @@ function createFilesystemTools(workspacePath) {
10851
10892
  await fd.close();
10852
10893
  }
10853
10894
  const content = buffer.toString("utf-8");
10854
- const truncated = stat7.size > MAX_FILE_SIZE;
10855
- return { content, truncated, size: stat7.size };
10895
+ const truncated = stat8.size > MAX_FILE_SIZE;
10896
+ return { content, truncated, size: stat8.size };
10856
10897
  } catch (error) {
10857
10898
  return { error: error instanceof Error ? error.message : String(error) };
10858
10899
  }
@@ -10896,8 +10937,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
10896
10937
  const ext = path30.extname(entry.name).toLowerCase();
10897
10938
  if (BINARY_EXTENSIONS.has(ext)) continue;
10898
10939
  try {
10899
- const stat7 = await fs2.stat(fullPath);
10900
- if (stat7.size > MAX_FILE_SIZE) continue;
10940
+ const stat8 = await fs2.stat(fullPath);
10941
+ if (stat8.size > MAX_FILE_SIZE) continue;
10901
10942
  const content = await fs2.readFile(fullPath, "utf-8");
10902
10943
  const lines = content.split("\n");
10903
10944
  for (let i = 0; i < lines.length; i++) {
@@ -11059,7 +11100,7 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
11059
11100
 
11060
11101
  // src/evaluation/evaluators/token-usage.ts
11061
11102
  var TokenUsageEvaluator = class {
11062
- kind = "token_usage";
11103
+ kind = "token-usage";
11063
11104
  config;
11064
11105
  constructor(options) {
11065
11106
  this.config = options.config;
@@ -11082,7 +11123,7 @@ var TokenUsageEvaluator = class {
11082
11123
  expectedAspectCount,
11083
11124
  reasoning: "Token usage not reported by provider",
11084
11125
  evaluatorRawRequest: {
11085
- type: "token_usage",
11126
+ type: "token-usage",
11086
11127
  max_total: maxTotal ?? null,
11087
11128
  max_input: maxInput ?? null,
11088
11129
  max_output: maxOutput ?? null,
@@ -11124,9 +11165,9 @@ var TokenUsageEvaluator = class {
11124
11165
  hits,
11125
11166
  misses,
11126
11167
  expectedAspectCount,
11127
- reasoning: `token_usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
11168
+ reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
11128
11169
  evaluatorRawRequest: {
11129
- type: "token_usage",
11170
+ type: "token-usage",
11130
11171
  max_total: maxTotal ?? null,
11131
11172
  max_input: maxInput ?? null,
11132
11173
  max_output: maxOutput ?? null,
@@ -11211,7 +11252,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
11211
11252
  };
11212
11253
  }
11213
11254
  var ToolTrajectoryEvaluator = class {
11214
- kind = "tool_trajectory";
11255
+ kind = "tool-trajectory";
11215
11256
  config;
11216
11257
  constructor(options) {
11217
11258
  this.config = options.config;
@@ -11399,7 +11440,7 @@ var ToolTrajectoryEvaluator = class {
11399
11440
  }
11400
11441
  }
11401
11442
  for (const warning of warnings) {
11402
- console.warn(`[tool_trajectory] ${warning}`);
11443
+ console.warn(`[tool-trajectory] ${warning}`);
11403
11444
  }
11404
11445
  const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
11405
11446
  const totalAssertions = expected.length + effectiveLatencyAssertions;
@@ -11475,7 +11516,7 @@ var ToolTrajectoryEvaluator = class {
11475
11516
  misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
11476
11517
  }
11477
11518
  for (const warning of warnings) {
11478
- console.warn(`[tool_trajectory] ${warning}`);
11519
+ console.warn(`[tool-trajectory] ${warning}`);
11479
11520
  }
11480
11521
  const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
11481
11522
  const totalAssertions = expected.length + effectiveLatencyAssertions;
@@ -11705,7 +11746,7 @@ function runEqualsAssertion(output, value) {
11705
11746
 
11706
11747
  // src/evaluation/orchestrator.ts
11707
11748
  import { createHash as createHash2, randomUUID as randomUUID7 } from "node:crypto";
11708
- import { mkdir as mkdir12 } from "node:fs/promises";
11749
+ import { mkdir as mkdir12, stat as stat7 } from "node:fs/promises";
11709
11750
  import path37 from "node:path";
11710
11751
  import micromatch4 from "micromatch";
11711
11752
 
@@ -11965,7 +12006,7 @@ var llmJudgeFactory = (config, context) => {
11965
12006
  const c = config;
11966
12007
  const { llmJudge, agentTimeoutMs } = context;
11967
12008
  return {
11968
- kind: "llm_judge",
12009
+ kind: "llm-judge",
11969
12010
  async evaluate(evalContext) {
11970
12011
  const customPrompt = await resolveCustomPrompt(
11971
12012
  c,
@@ -12054,7 +12095,7 @@ var agentJudgeFactory = (config, context) => {
12054
12095
  customPrompt = readFileSync(c.resolvedPromptPath, "utf-8");
12055
12096
  } catch (error) {
12056
12097
  const message = error instanceof Error ? error.message : String(error);
12057
- console.warn(`Could not read agent_judge prompt at ${c.resolvedPromptPath}: ${message}`);
12098
+ console.warn(`Could not read agent-judge prompt at ${c.resolvedPromptPath}: ${message}`);
12058
12099
  }
12059
12100
  } else if (c.prompt) {
12060
12101
  customPrompt = c.prompt;
@@ -12064,7 +12105,7 @@ var agentJudgeFactory = (config, context) => {
12064
12105
  judgeTargetProvider = targetResolver(c.target);
12065
12106
  if (!judgeTargetProvider) {
12066
12107
  throw new Error(
12067
- `agent_judge evaluator '${c.name}': target '${c.target}' not found in targets`
12108
+ `agent-judge evaluator '${c.name}': target '${c.target}' not found in targets`
12068
12109
  );
12069
12110
  }
12070
12111
  }
@@ -12108,7 +12149,7 @@ var regexFactory = (config) => {
12108
12149
  });
12109
12150
  };
12110
12151
  var isJsonFactory = () => {
12111
- return new DeterministicAssertionEvaluator("is_json", (ctx) => {
12152
+ return new DeterministicAssertionEvaluator("is-json", (ctx) => {
12112
12153
  const result = runIsJsonAssertion(ctx.candidate);
12113
12154
  return {
12114
12155
  score: result.score,
@@ -12136,7 +12177,7 @@ var equalsFactory = (config) => {
12136
12177
  };
12137
12178
  var containsAnyFactory = (config) => {
12138
12179
  const c = config;
12139
- return new DeterministicAssertionEvaluator("contains_any", (ctx) => {
12180
+ return new DeterministicAssertionEvaluator("contains-any", (ctx) => {
12140
12181
  const result = runContainsAnyAssertion(ctx.candidate, c.value);
12141
12182
  return {
12142
12183
  score: result.score,
@@ -12150,7 +12191,7 @@ var containsAnyFactory = (config) => {
12150
12191
  };
12151
12192
  var containsAllFactory = (config) => {
12152
12193
  const c = config;
12153
- return new DeterministicAssertionEvaluator("contains_all", (ctx) => {
12194
+ return new DeterministicAssertionEvaluator("contains-all", (ctx) => {
12154
12195
  const result = runContainsAllAssertion(ctx.candidate, c.value);
12155
12196
  return {
12156
12197
  score: result.score,
@@ -12178,7 +12219,7 @@ var icontainsFactory = (config) => {
12178
12219
  };
12179
12220
  var icontainsAnyFactory = (config) => {
12180
12221
  const c = config;
12181
- return new DeterministicAssertionEvaluator("icontains_any", (ctx) => {
12222
+ return new DeterministicAssertionEvaluator("icontains-any", (ctx) => {
12182
12223
  const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
12183
12224
  return {
12184
12225
  score: result.score,
@@ -12192,7 +12233,7 @@ var icontainsAnyFactory = (config) => {
12192
12233
  };
12193
12234
  var icontainsAllFactory = (config) => {
12194
12235
  const c = config;
12195
- return new DeterministicAssertionEvaluator("icontains_all", (ctx) => {
12236
+ return new DeterministicAssertionEvaluator("icontains-all", (ctx) => {
12196
12237
  const result = runIcontainsAllAssertion(ctx.candidate, c.value);
12197
12238
  return {
12198
12239
  score: result.score,
@@ -12206,7 +12247,7 @@ var icontainsAllFactory = (config) => {
12206
12247
  };
12207
12248
  var startsWithFactory = (config) => {
12208
12249
  const c = config;
12209
- return new DeterministicAssertionEvaluator("starts_with", (ctx) => {
12250
+ return new DeterministicAssertionEvaluator("starts-with", (ctx) => {
12210
12251
  const result = runStartsWithAssertion(ctx.candidate, c.value);
12211
12252
  return {
12212
12253
  score: result.score,
@@ -12220,7 +12261,7 @@ var startsWithFactory = (config) => {
12220
12261
  };
12221
12262
  var endsWithFactory = (config) => {
12222
12263
  const c = config;
12223
- return new DeterministicAssertionEvaluator("ends_with", (ctx) => {
12264
+ return new DeterministicAssertionEvaluator("ends-with", (ctx) => {
12224
12265
  const result = runEndsWithAssertion(ctx.candidate, c.value);
12225
12266
  return {
12226
12267
  score: result.score,
@@ -12234,7 +12275,7 @@ var endsWithFactory = (config) => {
12234
12275
  };
12235
12276
  function createBuiltinRegistry() {
12236
12277
  const registry = new EvaluatorRegistry();
12237
- registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("contains_any", containsAnyFactory).register("contains_all", containsAllFactory).register("icontains", icontainsFactory).register("icontains_any", icontainsAnyFactory).register("icontains_all", icontainsAllFactory).register("starts_with", startsWithFactory).register("ends_with", endsWithFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
12278
+ registry.register("llm-judge", llmJudgeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("agent-judge", agentJudgeFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory);
12238
12279
  return registry;
12239
12280
  }
12240
12281
 
@@ -12921,7 +12962,8 @@ async function runEvaluation(options) {
12921
12962
  cleanupWorkspaces,
12922
12963
  trials,
12923
12964
  streamCallbacks,
12924
- totalBudgetUsd
12965
+ totalBudgetUsd,
12966
+ failOnError
12925
12967
  } = options;
12926
12968
  let useCache = options.useCache;
12927
12969
  if (trials && trials.count > 1 && useCache) {
@@ -12979,7 +13021,7 @@ async function runEvaluation(options) {
12979
13021
  };
12980
13022
  if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
12981
13023
  throw new Error(
12982
- `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure_base).`
13024
+ `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure-base).`
12983
13025
  );
12984
13026
  }
12985
13027
  const targetResolver = (name) => {
@@ -13050,7 +13092,7 @@ async function runEvaluation(options) {
13050
13092
  const rawTemplate = suiteWorkspace?.template ?? getWorkspaceTemplate(target);
13051
13093
  const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
13052
13094
  const workspaceTemplate = resolvedTemplate?.dir;
13053
- const suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
13095
+ let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
13054
13096
  const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
13055
13097
  const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
13056
13098
  const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
@@ -13071,6 +13113,14 @@ async function runEvaluation(options) {
13071
13113
  const message = error instanceof Error ? error.message : String(error);
13072
13114
  throw new Error(`Failed to create shared workspace: ${message}`);
13073
13115
  }
13116
+ if (suiteWorkspaceFile && sharedWorkspacePath) {
13117
+ const copiedWorkspaceFile = path37.join(sharedWorkspacePath, path37.basename(suiteWorkspaceFile));
13118
+ try {
13119
+ await stat7(copiedWorkspaceFile);
13120
+ suiteWorkspaceFile = copiedWorkspaceFile;
13121
+ } catch {
13122
+ }
13123
+ }
13074
13124
  } else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
13075
13125
  sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
13076
13126
  await mkdir12(sharedWorkspacePath, { recursive: true });
@@ -13117,6 +13167,7 @@ async function runEvaluation(options) {
13117
13167
  let beforeAllOutputAttached = false;
13118
13168
  let cumulativeBudgetCost = 0;
13119
13169
  let budgetExhausted = false;
13170
+ let failOnErrorTriggered = false;
13120
13171
  const promises = filteredEvalCases.map(
13121
13172
  (evalCase) => limit(async () => {
13122
13173
  const workerId = nextWorkerId++;
@@ -13155,6 +13206,37 @@ async function runEvaluation(options) {
13155
13206
  }
13156
13207
  return budgetResult;
13157
13208
  }
13209
+ if (failOnError === true && failOnErrorTriggered) {
13210
+ const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
13211
+ const haltResult = {
13212
+ timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
13213
+ testId: evalCase.id,
13214
+ dataset: evalCase.dataset,
13215
+ score: 0,
13216
+ hits: [],
13217
+ misses: [],
13218
+ answer: "",
13219
+ target: target.name,
13220
+ error: errorMsg,
13221
+ executionStatus: "execution_error",
13222
+ failureStage: "setup",
13223
+ failureReasonCode: "error_threshold_exceeded",
13224
+ executionError: { message: errorMsg, stage: "setup" }
13225
+ };
13226
+ if (onProgress) {
13227
+ await onProgress({
13228
+ workerId,
13229
+ testId: evalCase.id,
13230
+ status: "failed",
13231
+ completedAt: Date.now(),
13232
+ error: haltResult.error
13233
+ });
13234
+ }
13235
+ if (onResult) {
13236
+ await onResult(haltResult);
13237
+ }
13238
+ return haltResult;
13239
+ }
13158
13240
  if (onProgress) {
13159
13241
  await onProgress({
13160
13242
  workerId,
@@ -13207,6 +13289,9 @@ async function runEvaluation(options) {
13207
13289
  }
13208
13290
  }
13209
13291
  }
13292
+ if (failOnError === true && result.executionStatus === "execution_error") {
13293
+ failOnErrorTriggered = true;
13294
+ }
13210
13295
  if (beforeAllOutput && !beforeAllOutputAttached) {
13211
13296
  result = { ...result, beforeAllOutput };
13212
13297
  beforeAllOutputAttached = true;
@@ -13514,6 +13599,14 @@ async function runEvalCase(options) {
13514
13599
  "template_error"
13515
13600
  );
13516
13601
  }
13602
+ if (caseWorkspaceFile && workspacePath) {
13603
+ const copiedFile = path37.join(workspacePath, path37.basename(caseWorkspaceFile));
13604
+ try {
13605
+ await stat7(copiedFile);
13606
+ caseWorkspaceFile = copiedFile;
13607
+ } catch {
13608
+ }
13609
+ }
13517
13610
  }
13518
13611
  if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
13519
13612
  workspacePath = getWorkspacePath(evalRunId, evalCase.id);
@@ -14023,8 +14116,8 @@ async function runEvaluatorsForCase(options) {
14023
14116
  workspacePath
14024
14117
  });
14025
14118
  }
14026
- const evaluatorKind = evalCase.evaluator ?? "llm_judge";
14027
- const activeEvaluator = evaluators[evaluatorKind] ?? evaluators.llm_judge;
14119
+ const evaluatorKind = evalCase.evaluator ?? "llm-judge";
14120
+ const activeEvaluator = evaluators[evaluatorKind] ?? evaluators["llm-judge"];
14028
14121
  if (!activeEvaluator) {
14029
14122
  throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
14030
14123
  }
@@ -14107,25 +14200,24 @@ async function runEvaluatorList(options) {
14107
14200
  availableTargets,
14108
14201
  agentTimeoutMs,
14109
14202
  evalFileDir,
14110
- llmJudge: evaluatorRegistry.llm_judge,
14203
+ llmJudge: evaluatorRegistry["llm-judge"],
14111
14204
  registry: typeRegistry
14112
14205
  };
14113
14206
  for (const evaluatorConfig of evaluators ?? []) {
14114
14207
  try {
14115
14208
  const evaluatorInstance = await typeRegistry.create(evaluatorConfig, dispatchContext);
14116
14209
  const score2 = await evaluatorInstance.evaluate(evalContext);
14117
- const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
14118
14210
  const weight = evaluatorConfig.weight ?? 1;
14119
14211
  scored.push({
14120
14212
  score: score2,
14121
14213
  name: evaluatorConfig.name,
14122
- type: resultType,
14214
+ type: evaluatorConfig.type,
14123
14215
  weight,
14124
14216
  ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
14125
14217
  });
14126
14218
  scores.push({
14127
14219
  name: evaluatorConfig.name,
14128
- type: resultType,
14220
+ type: evaluatorConfig.type,
14129
14221
  score: score2.score,
14130
14222
  weight,
14131
14223
  verdict: score2.verdict,
@@ -14147,18 +14239,17 @@ async function runEvaluatorList(options) {
14147
14239
  expectedAspectCount: 1,
14148
14240
  reasoning: message
14149
14241
  };
14150
- const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
14151
14242
  const weight = evaluatorConfig.weight ?? 1;
14152
14243
  scored.push({
14153
14244
  score: fallbackScore,
14154
14245
  name: evaluatorConfig.name ?? "unknown",
14155
- type: resultType ?? "llm_judge",
14246
+ type: evaluatorConfig.type ?? "llm-judge",
14156
14247
  weight,
14157
14248
  ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
14158
14249
  });
14159
14250
  scores.push({
14160
14251
  name: evaluatorConfig.name ?? "unknown",
14161
- type: resultType ?? "llm_judge",
14252
+ type: evaluatorConfig.type ?? "llm-judge",
14162
14253
  score: 0,
14163
14254
  weight,
14164
14255
  verdict: "fail",
@@ -14219,7 +14310,7 @@ function filterEvalCases(evalCases, filter) {
14219
14310
  return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter));
14220
14311
  }
14221
14312
  function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
14222
- const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
14313
+ const llmJudge = overrides?.["llm-judge"] ?? new LlmJudgeEvaluator({
14223
14314
  resolveJudgeProvider: async (context) => {
14224
14315
  if (context.judgeProvider) {
14225
14316
  return context.judgeProvider;
@@ -14229,7 +14320,7 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
14229
14320
  });
14230
14321
  return {
14231
14322
  ...overrides,
14232
- llm_judge: llmJudge
14323
+ "llm-judge": llmJudge
14233
14324
  };
14234
14325
  }
14235
14326
  async function invokeProvider(provider, options) {
@@ -14489,12 +14580,7 @@ async function evaluate(config) {
14489
14580
  };
14490
14581
  }
14491
14582
  function mapAssertionType(type) {
14492
- switch (type) {
14493
- case "code_judge":
14494
- return "code";
14495
- default:
14496
- return type;
14497
- }
14583
+ return type.replace(/_/g, "-");
14498
14584
  }
14499
14585
  function computeSummary(results, durationMs) {
14500
14586
  const total = results.length;
@@ -15268,6 +15354,7 @@ export {
15268
15354
  executeWorkspaceScript,
15269
15355
  explorationRatio,
15270
15356
  extractCacheConfig,
15357
+ extractFailOnError,
15271
15358
  extractJsonBlob,
15272
15359
  extractTargetFromSuite,
15273
15360
  extractTargetsFromSuite,