agentv 2.12.0 → 2.14.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -148,7 +148,7 @@ var require_dist = __commonJS({
148
148
  }
149
149
  });
150
150
 
151
- // ../../packages/core/dist/chunk-7HPKTRFZ.js
151
+ // ../../packages/core/dist/chunk-N55K52OO.js
152
152
  import { constants } from "node:fs";
153
153
  import { access, readFile } from "node:fs/promises";
154
154
  import path from "node:path";
@@ -4195,7 +4195,7 @@ var coerce = {
4195
4195
  };
4196
4196
  var NEVER = INVALID;
4197
4197
 
4198
- // ../../packages/core/dist/chunk-7HPKTRFZ.js
4198
+ // ../../packages/core/dist/chunk-N55K52OO.js
4199
4199
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
4200
4200
  var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
4201
4201
  var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
@@ -4243,27 +4243,27 @@ function isTestMessage(value) {
4243
4243
  return false;
4244
4244
  }
4245
4245
  var EVALUATOR_KIND_VALUES = [
4246
- "code_judge",
4247
- "llm_judge",
4246
+ "code-judge",
4247
+ "llm-judge",
4248
4248
  "rubric",
4249
4249
  "composite",
4250
- "tool_trajectory",
4251
- "field_accuracy",
4250
+ "tool-trajectory",
4251
+ "field-accuracy",
4252
4252
  "latency",
4253
4253
  "cost",
4254
- "token_usage",
4255
- "execution_metrics",
4256
- "agent_judge",
4254
+ "token-usage",
4255
+ "execution-metrics",
4256
+ "agent-judge",
4257
4257
  "contains",
4258
- "contains_any",
4259
- "contains_all",
4258
+ "contains-any",
4259
+ "contains-all",
4260
4260
  "icontains",
4261
- "icontains_any",
4262
- "icontains_all",
4263
- "starts_with",
4264
- "ends_with",
4261
+ "icontains-any",
4262
+ "icontains-all",
4263
+ "starts-with",
4264
+ "ends-with",
4265
4265
  "regex",
4266
- "is_json",
4266
+ "is-json",
4267
4267
  "equals",
4268
4268
  "rubrics"
4269
4269
  ];
@@ -33960,7 +33960,7 @@ import { createServer } from "node:http";
33960
33960
  import fs2 from "node:fs/promises";
33961
33961
  import path30 from "node:path";
33962
33962
  import { createHash as createHash2, randomUUID as randomUUID7 } from "node:crypto";
33963
- import { mkdir as mkdir12 } from "node:fs/promises";
33963
+ import { mkdir as mkdir12, stat as stat7 } from "node:fs/promises";
33964
33964
  import path37 from "node:path";
33965
33965
  import micromatch4 from "micromatch";
33966
33966
  import { readFileSync } from "node:fs";
@@ -34331,6 +34331,11 @@ async function loadConfig(evalFilePath, repoRoot) {
34331
34331
  continue;
34332
34332
  }
34333
34333
  const config2 = parsed;
34334
+ const requiredVersion = parsed.required_version;
34335
+ if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
34336
+ logWarning(`Invalid required_version in ${configPath}, expected string`);
34337
+ continue;
34338
+ }
34334
34339
  const guidelinePatterns = config2.guideline_patterns;
34335
34340
  if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
34336
34341
  logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
@@ -34354,6 +34359,7 @@ async function loadConfig(evalFilePath, repoRoot) {
34354
34359
  configPath
34355
34360
  );
34356
34361
  return {
34362
+ required_version: requiredVersion,
34357
34363
  guideline_patterns: guidelinePatterns,
34358
34364
  eval_patterns: evalPatterns,
34359
34365
  execution: executionDefaults
@@ -34497,6 +34503,22 @@ function extractTotalBudgetUsd(suite) {
34497
34503
  );
34498
34504
  return void 0;
34499
34505
  }
34506
+ function extractFailOnError(suite) {
34507
+ const execution = suite.execution;
34508
+ if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
34509
+ return void 0;
34510
+ }
34511
+ const executionObj = execution;
34512
+ const raw = executionObj.fail_on_error ?? executionObj.failOnError;
34513
+ if (raw === void 0 || raw === null) {
34514
+ return void 0;
34515
+ }
34516
+ if (typeof raw === "boolean") {
34517
+ return raw;
34518
+ }
34519
+ logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
34520
+ return void 0;
34521
+ }
34500
34522
  function parseExecutionDefaults(raw, configPath) {
34501
34523
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
34502
34524
  return void 0;
@@ -34583,6 +34605,9 @@ function validateTemplateVariables(content, source) {
34583
34605
  }
34584
34606
  var ANSI_YELLOW4 = "\x1B[33m";
34585
34607
  var ANSI_RESET4 = "\x1B[0m";
34608
+ function normalizeEvaluatorType(type) {
34609
+ return type.replace(/_/g, "-");
34610
+ }
34586
34611
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
34587
34612
  const execution = rawEvalCase.execution;
34588
34613
  const executionObject = isJsonObject2(execution) ? execution : void 0;
@@ -34613,7 +34638,8 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34613
34638
  continue;
34614
34639
  }
34615
34640
  const rawName = asString(rawEvaluator.name);
34616
- const typeValue = rawEvaluator.type;
34641
+ const rawType = rawEvaluator.type;
34642
+ const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
34617
34643
  const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
34618
34644
  if (typeof typeValue !== "string") {
34619
34645
  logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
@@ -34646,25 +34672,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34646
34672
  });
34647
34673
  continue;
34648
34674
  }
34649
- if (typeValue === "code_judge") {
34675
+ if (typeValue === "code-judge") {
34650
34676
  let command;
34651
34677
  const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
34652
34678
  if (typeof rawCommand === "string") {
34653
34679
  const trimmed = rawCommand.trim();
34654
34680
  if (trimmed.length === 0) {
34655
34681
  throw new Error(
34656
- `Invalid code_judge command for evaluator '${name16}' in '${evalId}': command cannot be empty`
34682
+ `Invalid code-judge command for evaluator '${name16}' in '${evalId}': command cannot be empty`
34657
34683
  );
34658
34684
  }
34659
34685
  command = parseCommandToArgv(trimmed);
34660
34686
  } else {
34661
34687
  command = asStringArray(
34662
34688
  rawCommand,
34663
- `code_judge command for evaluator '${name16}' in '${evalId}'`
34689
+ `code-judge command for evaluator '${name16}' in '${evalId}'`
34664
34690
  );
34665
34691
  }
34666
34692
  if (!command) {
34667
- logWarning2(`Skipping code_judge evaluator '${name16}' in '${evalId}': missing command`);
34693
+ logWarning2(`Skipping code-judge evaluator '${name16}' in '${evalId}': missing command`);
34668
34694
  continue;
34669
34695
  }
34670
34696
  const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
@@ -34725,7 +34751,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34725
34751
  }
34726
34752
  evaluators.push({
34727
34753
  name: name16,
34728
- type: "code",
34754
+ type: "code-judge",
34729
34755
  command,
34730
34756
  cwd,
34731
34757
  resolvedCwd,
@@ -34751,7 +34777,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34751
34777
  continue;
34752
34778
  }
34753
34779
  const aggregatorType = asString(rawAggregator.type);
34754
- if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge" && aggregatorType !== "threshold") {
34780
+ if (aggregatorType !== "weighted_average" && aggregatorType !== "code-judge" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
34755
34781
  logWarning2(
34756
34782
  `Skipping composite evaluator '${name16}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
34757
34783
  );
@@ -34800,16 +34826,16 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34800
34826
  type: "weighted_average",
34801
34827
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
34802
34828
  };
34803
- } else if (aggregatorType === "code_judge") {
34829
+ } else if (aggregatorType === "code-judge") {
34804
34830
  const aggregatorPath = asString(rawAggregator.path);
34805
34831
  if (!aggregatorPath) {
34806
34832
  logWarning2(
34807
- `Skipping composite evaluator '${name16}' in '${evalId}': code_judge aggregator missing path`
34833
+ `Skipping composite evaluator '${name16}' in '${evalId}': code-judge aggregator missing path`
34808
34834
  );
34809
34835
  continue;
34810
34836
  }
34811
34837
  aggregator = {
34812
- type: "code_judge",
34838
+ type: "code-judge",
34813
34839
  path: aggregatorPath,
34814
34840
  cwd: searchRoots[0]
34815
34841
  };
@@ -34835,7 +34861,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34835
34861
  }
34836
34862
  }
34837
34863
  aggregator = {
34838
- type: "llm_judge",
34864
+ type: "llm-judge",
34839
34865
  ...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
34840
34866
  ...promptPath2 ? { promptPath: promptPath2 } : {}
34841
34867
  };
@@ -34853,11 +34879,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34853
34879
  });
34854
34880
  continue;
34855
34881
  }
34856
- if (typeValue === "tool_trajectory") {
34882
+ if (typeValue === "tool-trajectory") {
34857
34883
  const mode = asString(rawEvaluator.mode);
34858
34884
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact" && mode !== "subset" && mode !== "superset") {
34859
34885
  logWarning2(
34860
- `Skipping tool_trajectory evaluator '${name16}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
34886
+ `Skipping tool-trajectory evaluator '${name16}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
34861
34887
  );
34862
34888
  continue;
34863
34889
  }
@@ -34866,7 +34892,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34866
34892
  if (rawMinimums !== void 0) {
34867
34893
  if (!isJsonObject2(rawMinimums)) {
34868
34894
  logWarning2(
34869
- `Skipping tool_trajectory evaluator '${name16}' in '${evalId}': minimums must be an object`
34895
+ `Skipping tool-trajectory evaluator '${name16}' in '${evalId}': minimums must be an object`
34870
34896
  );
34871
34897
  continue;
34872
34898
  }
@@ -34892,7 +34918,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34892
34918
  argsMatch2 = rawArgsMatch;
34893
34919
  } else {
34894
34920
  logWarning2(
34895
- `Invalid args_match '${rawArgsMatch}' for tool_trajectory evaluator '${name16}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
34921
+ `Invalid args_match '${rawArgsMatch}' for tool-trajectory evaluator '${name16}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
34896
34922
  );
34897
34923
  }
34898
34924
  }
@@ -34902,7 +34928,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34902
34928
  if (rawExpected !== void 0) {
34903
34929
  if (!Array.isArray(rawExpected)) {
34904
34930
  logWarning2(
34905
- `Skipping tool_trajectory evaluator '${name16}' in '${evalId}': expected must be an array`
34931
+ `Skipping tool-trajectory evaluator '${name16}' in '${evalId}': expected must be an array`
34906
34932
  );
34907
34933
  continue;
34908
34934
  }
@@ -34948,13 +34974,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34948
34974
  }
34949
34975
  if (mode === "any_order" && !minimums) {
34950
34976
  logWarning2(
34951
- `Skipping tool_trajectory evaluator '${name16}' in '${evalId}': any_order mode requires minimums`
34977
+ `Skipping tool-trajectory evaluator '${name16}' in '${evalId}': any_order mode requires minimums`
34952
34978
  );
34953
34979
  continue;
34954
34980
  }
34955
34981
  if ((mode === "in_order" || mode === "exact" || mode === "subset" || mode === "superset") && !expected) {
34956
34982
  logWarning2(
34957
- `Skipping tool_trajectory evaluator '${name16}' in '${evalId}': ${mode} mode requires expected`
34983
+ `Skipping tool-trajectory evaluator '${name16}' in '${evalId}': ${mode} mode requires expected`
34958
34984
  );
34959
34985
  continue;
34960
34986
  }
@@ -34962,7 +34988,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34962
34988
  const required22 = parseRequired(rawEvaluator.required);
34963
34989
  const config22 = {
34964
34990
  name: name16,
34965
- type: "tool_trajectory",
34991
+ type: "tool-trajectory",
34966
34992
  mode,
34967
34993
  ...minimums ? { minimums } : {},
34968
34994
  ...expected ? { expected } : {},
@@ -34974,17 +35000,17 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34974
35000
  evaluators.push(config22);
34975
35001
  continue;
34976
35002
  }
34977
- if (typeValue === "field_accuracy") {
35003
+ if (typeValue === "field-accuracy") {
34978
35004
  const rawFields = rawEvaluator.fields;
34979
35005
  if (!Array.isArray(rawFields)) {
34980
35006
  logWarning2(
34981
- `Skipping field_accuracy evaluator '${name16}' in '${evalId}': missing fields array`
35007
+ `Skipping field-accuracy evaluator '${name16}' in '${evalId}': missing fields array`
34982
35008
  );
34983
35009
  continue;
34984
35010
  }
34985
35011
  if (rawFields.length === 0) {
34986
35012
  logWarning2(
34987
- `Skipping field_accuracy evaluator '${name16}' in '${evalId}': fields array is empty`
35013
+ `Skipping field-accuracy evaluator '${name16}' in '${evalId}': fields array is empty`
34988
35014
  );
34989
35015
  continue;
34990
35016
  }
@@ -34992,7 +35018,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34992
35018
  for (const rawField of rawFields) {
34993
35019
  if (!isJsonObject2(rawField)) {
34994
35020
  logWarning2(
34995
- `Skipping invalid field entry in field_accuracy evaluator '${name16}' (expected object)`
35021
+ `Skipping invalid field entry in field-accuracy evaluator '${name16}' (expected object)`
34996
35022
  );
34997
35023
  continue;
34998
35024
  }
@@ -35000,13 +35026,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35000
35026
  const match = asString(rawField.match);
35001
35027
  if (!fieldPath) {
35002
35028
  logWarning2(
35003
- `Skipping field without path in field_accuracy evaluator '${name16}' in '${evalId}'`
35029
+ `Skipping field without path in field-accuracy evaluator '${name16}' in '${evalId}'`
35004
35030
  );
35005
35031
  continue;
35006
35032
  }
35007
35033
  if (!match || !isValidFieldMatchType(match)) {
35008
35034
  logWarning2(
35009
- `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name16}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code_judge evaluator.`
35035
+ `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name16}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code-judge evaluator.`
35010
35036
  );
35011
35037
  continue;
35012
35038
  }
@@ -35023,7 +35049,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35023
35049
  }
35024
35050
  if (fields.length === 0) {
35025
35051
  logWarning2(
35026
- `Skipping field_accuracy evaluator '${name16}' in '${evalId}': no valid fields found`
35052
+ `Skipping field-accuracy evaluator '${name16}' in '${evalId}': no valid fields found`
35027
35053
  );
35028
35054
  continue;
35029
35055
  }
@@ -35033,7 +35059,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35033
35059
  const required22 = parseRequired(rawEvaluator.required);
35034
35060
  evaluators.push({
35035
35061
  name: name16,
35036
- type: "field_accuracy",
35062
+ type: "field-accuracy",
35037
35063
  fields,
35038
35064
  ...validAggregation ? { aggregation: validAggregation } : {},
35039
35065
  ...weight2 !== void 0 ? { weight: weight2 } : {},
@@ -35082,7 +35108,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35082
35108
  });
35083
35109
  continue;
35084
35110
  }
35085
- if (typeValue === "token_usage") {
35111
+ if (typeValue === "token-usage") {
35086
35112
  const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
35087
35113
  const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
35088
35114
  const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
@@ -35096,7 +35122,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35096
35122
  if (raw === void 0) continue;
35097
35123
  if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
35098
35124
  logWarning2(
35099
- `Skipping token_usage evaluator '${name16}' in '${evalId}': ${key} must be a non-negative finite number`
35125
+ `Skipping token-usage evaluator '${name16}' in '${evalId}': ${key} must be a non-negative finite number`
35100
35126
  );
35101
35127
  continue;
35102
35128
  }
@@ -35104,7 +35130,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35104
35130
  }
35105
35131
  if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
35106
35132
  logWarning2(
35107
- `Skipping token_usage evaluator '${name16}' in '${evalId}': must set at least one of max_total, max_input, max_output`
35133
+ `Skipping token-usage evaluator '${name16}' in '${evalId}': must set at least one of max_total, max_input, max_output`
35108
35134
  );
35109
35135
  continue;
35110
35136
  }
@@ -35112,7 +35138,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35112
35138
  const required22 = parseRequired(rawEvaluator.required);
35113
35139
  evaluators.push({
35114
35140
  name: name16,
35115
- type: "token_usage",
35141
+ type: "token-usage",
35116
35142
  ...validLimits,
35117
35143
  ...weight2 !== void 0 ? { weight: weight2 } : {},
35118
35144
  ...required22 !== void 0 ? { required: required22 } : {},
@@ -35120,7 +35146,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35120
35146
  });
35121
35147
  continue;
35122
35148
  }
35123
- if (typeValue === "execution_metrics") {
35149
+ if (typeValue === "execution-metrics") {
35124
35150
  const maxToolCalls = rawEvaluator.max_tool_calls ?? rawEvaluator.maxToolCalls;
35125
35151
  const maxLlmCalls = rawEvaluator.max_llm_calls ?? rawEvaluator.maxLlmCalls;
35126
35152
  const maxTokens = rawEvaluator.max_tokens ?? rawEvaluator.maxTokens;
@@ -35143,7 +35169,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35143
35169
  if (raw === void 0) continue;
35144
35170
  if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
35145
35171
  logWarning2(
35146
- `Skipping execution_metrics evaluator '${name16}' in '${evalId}': ${key} must be a non-negative finite number`
35172
+ `Skipping execution-metrics evaluator '${name16}' in '${evalId}': ${key} must be a non-negative finite number`
35147
35173
  );
35148
35174
  hasError = true;
35149
35175
  break;
@@ -35156,7 +35182,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35156
35182
  const hasThreshold = validThresholds.max_tool_calls !== void 0 || validThresholds.max_llm_calls !== void 0 || validThresholds.max_tokens !== void 0 || validThresholds.max_cost_usd !== void 0 || validThresholds.max_duration_ms !== void 0 || validThresholds.target_exploration_ratio !== void 0;
35157
35183
  if (!hasThreshold) {
35158
35184
  logWarning2(
35159
- `Skipping execution_metrics evaluator '${name16}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
35185
+ `Skipping execution-metrics evaluator '${name16}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
35160
35186
  );
35161
35187
  continue;
35162
35188
  }
@@ -35164,7 +35190,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35164
35190
  const required22 = parseRequired(rawEvaluator.required);
35165
35191
  evaluators.push({
35166
35192
  name: name16,
35167
- type: "execution_metrics",
35193
+ type: "execution-metrics",
35168
35194
  ...validThresholds,
35169
35195
  ...weight2 !== void 0 ? { weight: weight2 } : {},
35170
35196
  ...required22 !== void 0 ? { required: required22 } : {},
@@ -35172,13 +35198,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35172
35198
  });
35173
35199
  continue;
35174
35200
  }
35175
- if (typeValue === "agent_judge") {
35201
+ if (typeValue === "agent-judge") {
35176
35202
  const rawMaxSteps = rawEvaluator.max_steps ?? rawEvaluator.maxSteps;
35177
35203
  let maxSteps;
35178
35204
  if (rawMaxSteps !== void 0) {
35179
35205
  if (typeof rawMaxSteps !== "number" || !Number.isInteger(rawMaxSteps) || rawMaxSteps < 1 || rawMaxSteps > 50) {
35180
35206
  logWarning2(
35181
- `Skipping agent_judge evaluator '${name16}' in '${evalId}': max_steps must be an integer 1-50`
35207
+ `Skipping agent-judge evaluator '${name16}' in '${evalId}': max_steps must be an integer 1-50`
35182
35208
  );
35183
35209
  continue;
35184
35210
  }
@@ -35189,7 +35215,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35189
35215
  if (rawTemperature !== void 0) {
35190
35216
  if (typeof rawTemperature !== "number" || rawTemperature < 0 || rawTemperature > 2) {
35191
35217
  logWarning2(
35192
- `Skipping agent_judge evaluator '${name16}' in '${evalId}': temperature must be a number 0-2`
35218
+ `Skipping agent-judge evaluator '${name16}' in '${evalId}': temperature must be a number 0-2`
35193
35219
  );
35194
35220
  continue;
35195
35221
  }
@@ -35212,7 +35238,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35212
35238
  const required22 = parseRequired(rawEvaluator.required);
35213
35239
  evaluators.push({
35214
35240
  name: name16,
35215
- type: "agent_judge",
35241
+ type: "agent-judge",
35216
35242
  ...agentPrompt ? { prompt: agentPrompt } : {},
35217
35243
  ...agentPromptPath ? { promptPath: agentPromptPath, resolvedPromptPath: agentPromptPath } : {},
35218
35244
  ...agentParsedRubrics && agentParsedRubrics.length > 0 ? { rubrics: agentParsedRubrics } : {},
@@ -35243,7 +35269,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35243
35269
  });
35244
35270
  continue;
35245
35271
  }
35246
- if (typeValue === "contains_any" || typeValue === "contains_all") {
35272
+ if (typeValue === "contains-any" || typeValue === "contains-all") {
35247
35273
  const value = asStringArrayStrict(rawEvaluator.value);
35248
35274
  if (!value || value.length === 0) {
35249
35275
  logWarning2(
@@ -35281,7 +35307,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35281
35307
  });
35282
35308
  continue;
35283
35309
  }
35284
- if (typeValue === "icontains_any" || typeValue === "icontains_all") {
35310
+ if (typeValue === "icontains-any" || typeValue === "icontains-all") {
35285
35311
  const value = asStringArrayStrict(rawEvaluator.value);
35286
35312
  if (!value || value.length === 0) {
35287
35313
  logWarning2(
@@ -35301,7 +35327,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35301
35327
  });
35302
35328
  continue;
35303
35329
  }
35304
- if (typeValue === "starts_with" || typeValue === "ends_with") {
35330
+ if (typeValue === "starts-with" || typeValue === "ends-with") {
35305
35331
  const value = asString(rawEvaluator.value);
35306
35332
  if (!value) {
35307
35333
  logWarning2(`Skipping ${typeValue} evaluator '${name16}' in '${evalId}': missing value`);
@@ -35339,12 +35365,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35339
35365
  });
35340
35366
  continue;
35341
35367
  }
35342
- if (typeValue === "is_json") {
35368
+ if (typeValue === "is-json") {
35343
35369
  const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
35344
35370
  const required22 = parseRequired(rawEvaluator.required);
35345
35371
  evaluators.push({
35346
35372
  name: name16,
35347
- type: "is_json",
35373
+ type: "is-json",
35348
35374
  ...weight2 !== void 0 ? { weight: weight2 } : {},
35349
35375
  ...required22 !== void 0 ? { required: required22 } : {},
35350
35376
  ...negate !== void 0 ? { negate } : {}
@@ -35392,7 +35418,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35392
35418
  const required22 = parseRequired(rawEvaluator.required);
35393
35419
  evaluators.push({
35394
35420
  name: name16,
35395
- type: "llm_judge",
35421
+ type: "llm-judge",
35396
35422
  rubrics: parsedCriteria,
35397
35423
  ...weight2 !== void 0 ? { weight: weight2 } : {},
35398
35424
  ...required22 !== void 0 ? { required: required22 } : {},
@@ -35459,7 +35485,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35459
35485
  const required22 = parseRequired(rawEvaluator.required);
35460
35486
  evaluators.push({
35461
35487
  name: name16,
35462
- type: "llm_judge",
35488
+ type: "llm-judge",
35463
35489
  rubrics: parsedRubrics,
35464
35490
  ...weight2 !== void 0 ? { weight: weight2 } : {},
35465
35491
  ...required22 !== void 0 ? { required: required22 } : {},
@@ -35491,7 +35517,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35491
35517
  const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
35492
35518
  evaluators.push({
35493
35519
  name: name16,
35494
- type: "llm_judge",
35520
+ type: "llm-judge",
35495
35521
  prompt,
35496
35522
  promptPath,
35497
35523
  ...promptPath ? { resolvedPromptPath: promptPath } : {},
@@ -35507,15 +35533,15 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35507
35533
  }
35508
35534
  var ASSERTION_TYPES = /* @__PURE__ */ new Set([
35509
35535
  "contains",
35510
- "contains_any",
35511
- "contains_all",
35536
+ "contains-any",
35537
+ "contains-all",
35512
35538
  "icontains",
35513
- "icontains_any",
35514
- "icontains_all",
35515
- "starts_with",
35516
- "ends_with",
35539
+ "icontains-any",
35540
+ "icontains-all",
35541
+ "starts-with",
35542
+ "ends-with",
35517
35543
  "regex",
35518
- "is_json",
35544
+ "is-json",
35519
35545
  "equals",
35520
35546
  "rubrics"
35521
35547
  ]);
@@ -35528,24 +35554,24 @@ function generateAssertionName(typeValue, rawEvaluator) {
35528
35554
  switch (typeValue) {
35529
35555
  case "contains":
35530
35556
  return value ? `contains-${value}` : "contains";
35531
- case "contains_any":
35532
- return arrayValue ? `contains_any-${arrayValue.length}` : "contains_any";
35533
- case "contains_all":
35534
- return arrayValue ? `contains_all-${arrayValue.length}` : "contains_all";
35557
+ case "contains-any":
35558
+ return arrayValue ? `contains-any-${arrayValue.length}` : "contains-any";
35559
+ case "contains-all":
35560
+ return arrayValue ? `contains-all-${arrayValue.length}` : "contains-all";
35535
35561
  case "icontains":
35536
35562
  return value ? `icontains-${value}` : "icontains";
35537
- case "icontains_any":
35538
- return arrayValue ? `icontains_any-${arrayValue.length}` : "icontains_any";
35539
- case "icontains_all":
35540
- return arrayValue ? `icontains_all-${arrayValue.length}` : "icontains_all";
35541
- case "starts_with":
35542
- return value ? `starts_with-${value}` : "starts_with";
35543
- case "ends_with":
35544
- return value ? `ends_with-${value}` : "ends_with";
35563
+ case "icontains-any":
35564
+ return arrayValue ? `icontains-any-${arrayValue.length}` : "icontains-any";
35565
+ case "icontains-all":
35566
+ return arrayValue ? `icontains-all-${arrayValue.length}` : "icontains-all";
35567
+ case "starts-with":
35568
+ return value ? `starts-with-${value}` : "starts-with";
35569
+ case "ends-with":
35570
+ return value ? `ends-with-${value}` : "ends-with";
35545
35571
  case "regex":
35546
35572
  return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
35547
- case "is_json":
35548
- return "is_json";
35573
+ case "is-json":
35574
+ return "is-json";
35549
35575
  case "equals":
35550
35576
  return value ? `equals-${value}` : "equals";
35551
35577
  case "rubrics":
@@ -35558,8 +35584,9 @@ function coerceEvaluator(candidate, contextId) {
35558
35584
  if (typeof candidate !== "string") {
35559
35585
  return void 0;
35560
35586
  }
35561
- if (isEvaluatorKind(candidate)) {
35562
- return candidate;
35587
+ const normalized = normalizeEvaluatorType(candidate);
35588
+ if (isEvaluatorKind(normalized)) {
35589
+ return normalized;
35563
35590
  }
35564
35591
  logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
35565
35592
  return void 0;
@@ -35605,6 +35632,16 @@ function parseCommandToArgv(command) {
35605
35632
  function isJsonObject2(value) {
35606
35633
  return typeof value === "object" && value !== null && !Array.isArray(value);
35607
35634
  }
35635
+ var CRITERIA_CONSUMER_TYPES = /* @__PURE__ */ new Set(["llm-judge", "agent-judge", "code-judge"]);
35636
+ function warnUnconsumedCriteria(criteria, evaluators, testId) {
35637
+ if (!criteria?.trim() || !evaluators || evaluators.length === 0) return;
35638
+ const hasConsumer = evaluators.some((e) => CRITERIA_CONSUMER_TYPES.has(e.type));
35639
+ if (!hasConsumer) {
35640
+ logWarning2(
35641
+ `Test '${testId}': criteria is defined but no evaluator in assert will evaluate it. Add 'type: llm-judge' to assert, or remove criteria if it is documentation-only.`
35642
+ );
35643
+ }
35644
+ }
35608
35645
  function logWarning2(message, details) {
35609
35646
  if (details && details.length > 0) {
35610
35647
  const detailBlock = details.join("\n");
@@ -35854,7 +35891,7 @@ function parseInlineRubrics(rawRubrics) {
35854
35891
  }
35855
35892
  return {
35856
35893
  name: "rubric",
35857
- type: "llm_judge",
35894
+ type: "llm-judge",
35858
35895
  rubrics: rubricItems
35859
35896
  };
35860
35897
  }
@@ -36221,7 +36258,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
36221
36258
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
36222
36259
  const fallbackDataset = path6.basename(absoluteTestPath, ".jsonl") || "eval";
36223
36260
  const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
36224
- const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm_judge";
36261
+ const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-judge";
36225
36262
  const globalExecution = sidecar.execution;
36226
36263
  if (verbose) {
36227
36264
  console.log(`
@@ -36309,6 +36346,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
36309
36346
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
36310
36347
  }
36311
36348
  }
36349
+ warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
36312
36350
  const userFilePaths = [];
36313
36351
  for (const segment of inputSegments) {
36314
36352
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -36653,13 +36691,15 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
36653
36691
  }
36654
36692
  const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
36655
36693
  const metadata = parseMetadata(parsed);
36694
+ const failOnError = extractFailOnError(parsed);
36656
36695
  return {
36657
36696
  tests,
36658
36697
  trials: extractTrialsConfig(parsed),
36659
36698
  targets: extractTargetsFromSuite(parsed),
36660
36699
  cacheConfig: extractCacheConfig(parsed),
36661
36700
  totalBudgetUsd: extractTotalBudgetUsd(parsed),
36662
- ...metadata !== void 0 && { metadata }
36701
+ ...metadata !== void 0 && { metadata },
36702
+ ...failOnError !== void 0 && { failOnError }
36663
36703
  };
36664
36704
  }
36665
36705
  var loadEvalSuite = loadTestSuite;
@@ -36690,7 +36730,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
36690
36730
  const fallbackDataset = path8.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
36691
36731
  const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
36692
36732
  const rawTestcases = resolveTests(suite);
36693
- const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
36733
+ const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-judge";
36694
36734
  const evalFileDir = path8.dirname(absoluteTestPath);
36695
36735
  let expandedTestcases;
36696
36736
  if (typeof rawTestcases === "string") {
@@ -36787,6 +36827,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
36787
36827
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
36788
36828
  }
36789
36829
  }
36830
+ warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
36790
36831
  const userFilePaths = [];
36791
36832
  for (const segment of inputSegments) {
36792
36833
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -42588,7 +42629,7 @@ function toCamelCaseDeep(obj) {
42588
42629
  }
42589
42630
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
42590
42631
  var CodeEvaluator = class {
42591
- kind = "code";
42632
+ kind = "code-judge";
42592
42633
  command;
42593
42634
  cwd;
42594
42635
  agentTimeoutMs;
@@ -42789,7 +42830,7 @@ var scoreRangeEvaluationSchema = external_exports.object({
42789
42830
  overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)").optional()
42790
42831
  });
42791
42832
  var LlmJudgeEvaluator = class {
42792
- kind = "llm_judge";
42833
+ kind = "llm-judge";
42793
42834
  resolveJudgeProvider;
42794
42835
  maxOutputTokens;
42795
42836
  temperature;
@@ -42806,7 +42847,7 @@ var LlmJudgeEvaluator = class {
42806
42847
  throw new Error("No judge provider available for LLM grading");
42807
42848
  }
42808
42849
  const config2 = context.evaluator;
42809
- if (config2?.type === "llm_judge" && config2.rubrics && config2.rubrics.length > 0) {
42850
+ if (config2?.type === "llm-judge" && config2.rubrics && config2.rubrics.length > 0) {
42810
42851
  return this.evaluateWithRubrics(context, judgeProvider, config2.rubrics);
42811
42852
  }
42812
42853
  return this.evaluateFreeform(context, judgeProvider);
@@ -42880,7 +42921,7 @@ ${context.fileChanges}`;
42880
42921
  async evaluateWithRubrics(context, judgeProvider, rubrics) {
42881
42922
  if (!rubrics || rubrics.length === 0) {
42882
42923
  throw new Error(
42883
- `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
42924
+ `No rubrics found for evaluator "${context.evaluator?.name ?? "llm-judge"}". Run "agentv generate rubrics" first.`
42884
42925
  );
42885
42926
  }
42886
42927
  const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
@@ -43214,9 +43255,9 @@ var CompositeEvaluator = class {
43214
43255
  async aggregate(results, context) {
43215
43256
  const aggregator = this.config.aggregator;
43216
43257
  switch (aggregator.type) {
43217
- case "code_judge":
43258
+ case "code-judge":
43218
43259
  return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
43219
- case "llm_judge":
43260
+ case "llm-judge":
43220
43261
  return this.runLlmAggregator(results, context, aggregator);
43221
43262
  case "threshold":
43222
43263
  return this.runThreshold(results, aggregator.threshold);
@@ -43359,7 +43400,7 @@ var CompositeEvaluator = class {
43359
43400
  expectedAspectCount: hits.length + misses.length || 1,
43360
43401
  reasoning,
43361
43402
  evaluatorRawRequest: {
43362
- aggregator: "code_judge",
43403
+ aggregator: "code-judge",
43363
43404
  script: scriptPath
43364
43405
  },
43365
43406
  scores
@@ -43374,7 +43415,7 @@ var CompositeEvaluator = class {
43374
43415
  expectedAspectCount: 1,
43375
43416
  reasoning: message,
43376
43417
  evaluatorRawRequest: {
43377
- aggregator: "code_judge",
43418
+ aggregator: "code-judge",
43378
43419
  script: scriptPath,
43379
43420
  error: message
43380
43421
  },
@@ -43405,7 +43446,7 @@ var CompositeEvaluator = class {
43405
43446
  const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
43406
43447
  const systemPrompt = buildOutputSchema();
43407
43448
  const evaluatorRawRequest = {
43408
- aggregator: "llm_judge",
43449
+ aggregator: "llm-judge",
43409
43450
  userPrompt,
43410
43451
  systemPrompt,
43411
43452
  target: judgeProvider.targetName
@@ -43513,7 +43554,7 @@ var CostEvaluator = class {
43513
43554
  }
43514
43555
  };
43515
43556
  var ExecutionMetricsEvaluator = class {
43516
- kind = "execution_metrics";
43557
+ kind = "execution-metrics";
43517
43558
  config;
43518
43559
  constructor(options) {
43519
43560
  this.config = options.config;
@@ -43539,7 +43580,7 @@ var ExecutionMetricsEvaluator = class {
43539
43580
  expectedAspectCount: 1,
43540
43581
  reasoning: "Execution metrics not available - no trace summary provided",
43541
43582
  evaluatorRawRequest: {
43542
- type: "execution_metrics",
43583
+ type: "execution-metrics",
43543
43584
  config: this.extractConfiguredThresholds(),
43544
43585
  actual: null
43545
43586
  }
@@ -43648,7 +43689,7 @@ var ExecutionMetricsEvaluator = class {
43648
43689
  if (actualMetrics.exploration_ratio !== void 0) {
43649
43690
  reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
43650
43691
  }
43651
- const reasoning = reasoningParts.length > 0 ? `execution_metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
43692
+ const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
43652
43693
  return {
43653
43694
  score,
43654
43695
  verdict: scoreToVerdict(score),
@@ -43657,7 +43698,7 @@ var ExecutionMetricsEvaluator = class {
43657
43698
  expectedAspectCount: totalChecks || 1,
43658
43699
  reasoning,
43659
43700
  evaluatorRawRequest: {
43660
- type: "execution_metrics",
43701
+ type: "execution-metrics",
43661
43702
  config: this.extractConfiguredThresholds(),
43662
43703
  actual: this.filterDefinedMetrics(actualMetrics)
43663
43704
  }
@@ -43743,7 +43784,7 @@ var MONTH_NAMES = {
43743
43784
  december: 11
43744
43785
  };
43745
43786
  var FieldAccuracyEvaluator = class {
43746
- kind = "field_accuracy";
43787
+ kind = "field-accuracy";
43747
43788
  config;
43748
43789
  constructor(options) {
43749
43790
  this.config = options.config;
@@ -44189,7 +44230,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
44189
44230
  ".dylib"
44190
44231
  ]);
44191
44232
  var AgentJudgeEvaluator = class {
44192
- kind = "agent_judge";
44233
+ kind = "agent-judge";
44193
44234
  resolveJudgeProvider;
44194
44235
  maxSteps;
44195
44236
  temperature;
@@ -44214,24 +44255,24 @@ var AgentJudgeEvaluator = class {
44214
44255
  async evaluateBuiltIn(context) {
44215
44256
  const judgeProvider = await this.resolveJudgeProvider(context);
44216
44257
  if (!judgeProvider) {
44217
- throw new Error("No judge provider available for agent_judge evaluation");
44258
+ throw new Error("No judge provider available for agent-judge evaluation");
44218
44259
  }
44219
44260
  const model = judgeProvider.asLanguageModel?.();
44220
44261
  if (!model) {
44221
44262
  throw new Error(
44222
- `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent_judge mode`
44263
+ `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent-judge mode`
44223
44264
  );
44224
44265
  }
44225
44266
  const workspacePath = context.workspacePath;
44226
44267
  if (!workspacePath) {
44227
44268
  throw new Error(
44228
- "agent_judge evaluator requires a workspace_template target (workspacePath is not set)"
44269
+ "agent-judge evaluator requires a workspace_template target (workspacePath is not set)"
44229
44270
  );
44230
44271
  }
44231
44272
  const systemPrompt = this.buildSystemPrompt(context);
44232
44273
  const userPrompt = this.buildUserPrompt(context);
44233
44274
  const config2 = context.evaluator;
44234
- const rubrics = config2?.type === "agent_judge" ? config2.rubrics : void 0;
44275
+ const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
44235
44276
  const fsTools = createFilesystemTools(workspacePath);
44236
44277
  const evaluatorRawRequest = {
44237
44278
  mode: "built-in",
@@ -44262,7 +44303,7 @@ var AgentJudgeEvaluator = class {
44262
44303
  score: 0,
44263
44304
  verdict: "fail",
44264
44305
  hits: [],
44265
- misses: [`agent_judge built-in evaluation failed: ${message}`],
44306
+ misses: [`agent-judge built-in evaluation failed: ${message}`],
44266
44307
  expectedAspectCount: 1,
44267
44308
  evaluatorRawRequest,
44268
44309
  details: { mode: "built-in", error: message }
@@ -44294,14 +44335,14 @@ var AgentJudgeEvaluator = class {
44294
44335
  score: 0,
44295
44336
  verdict: "fail",
44296
44337
  hits: [],
44297
- misses: ["agent_judge judge_target returned no assistant response"],
44338
+ misses: ["agent-judge judge_target returned no assistant response"],
44298
44339
  expectedAspectCount: 1,
44299
44340
  evaluatorRawRequest,
44300
44341
  details: { mode: "judge_target", judge_target: provider.targetName }
44301
44342
  };
44302
44343
  }
44303
44344
  const config2 = context.evaluator;
44304
- const rubrics = config2?.type === "agent_judge" ? config2.rubrics : void 0;
44345
+ const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
44305
44346
  const details = {
44306
44347
  mode: "judge_target",
44307
44348
  judge_target: provider.targetName
@@ -44313,7 +44354,7 @@ var AgentJudgeEvaluator = class {
44313
44354
  score: 0,
44314
44355
  verdict: "fail",
44315
44356
  hits: [],
44316
- misses: [`agent_judge judge_target evaluation failed: ${message}`],
44357
+ misses: [`agent-judge judge_target evaluation failed: ${message}`],
44317
44358
  expectedAspectCount: 1,
44318
44359
  evaluatorRawRequest,
44319
44360
  details: {
@@ -44364,7 +44405,7 @@ var AgentJudgeEvaluator = class {
44364
44405
  score: 0,
44365
44406
  verdict: "fail",
44366
44407
  hits: [],
44367
- misses: ["Failed to parse agent_judge response as valid evaluation JSON"],
44408
+ misses: ["Failed to parse agent-judge response as valid evaluation JSON"],
44368
44409
  expectedAspectCount: 1,
44369
44410
  evaluatorRawRequest,
44370
44411
  details
@@ -44377,7 +44418,7 @@ var AgentJudgeEvaluator = class {
44377
44418
  */
44378
44419
  buildSystemPrompt(context) {
44379
44420
  const config2 = context.evaluator;
44380
- const rubrics = config2?.type === "agent_judge" ? config2.rubrics : void 0;
44421
+ const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
44381
44422
  const parts = [
44382
44423
  "You are an expert evaluator with access to the workspace filesystem.",
44383
44424
  "Use the provided tools to investigate the workspace and verify the criteria are met.",
@@ -44408,7 +44449,7 @@ var AgentJudgeEvaluator = class {
44408
44449
  return substituteVariables(this.evaluatorTemplate, variables);
44409
44450
  }
44410
44451
  const config2 = context.evaluator;
44411
- const rubrics = config2?.type === "agent_judge" ? config2.rubrics : void 0;
44452
+ const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
44412
44453
  const parts = [
44413
44454
  "Evaluate the candidate answer by investigating the workspace.",
44414
44455
  "",
@@ -44451,7 +44492,7 @@ var AgentJudgeEvaluator = class {
44451
44492
  buildDelegatedPrompt(context) {
44452
44493
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
44453
44494
  const config2 = context.evaluator;
44454
- const rubrics = config2?.type === "agent_judge" ? config2.rubrics : void 0;
44495
+ const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
44455
44496
  if (this.evaluatorTemplate) {
44456
44497
  const variables = {
44457
44498
  [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
@@ -44533,11 +44574,11 @@ function createFilesystemTools(workspacePath) {
44533
44574
  execute: async (input) => {
44534
44575
  try {
44535
44576
  const resolved = resolveSandboxed(workspacePath, input.path);
44536
- const stat7 = await fs2.stat(resolved);
44537
- if (stat7.isDirectory()) {
44577
+ const stat8 = await fs2.stat(resolved);
44578
+ if (stat8.isDirectory()) {
44538
44579
  return { error: `'${input.path}' is a directory, not a file` };
44539
44580
  }
44540
- const buffer = Buffer.alloc(Math.min(stat7.size, MAX_FILE_SIZE));
44581
+ const buffer = Buffer.alloc(Math.min(stat8.size, MAX_FILE_SIZE));
44541
44582
  const fd = await fs2.open(resolved, "r");
44542
44583
  try {
44543
44584
  await fd.read(buffer, 0, buffer.length, 0);
@@ -44545,8 +44586,8 @@ function createFilesystemTools(workspacePath) {
44545
44586
  await fd.close();
44546
44587
  }
44547
44588
  const content = buffer.toString("utf-8");
44548
- const truncated = stat7.size > MAX_FILE_SIZE;
44549
- return { content, truncated, size: stat7.size };
44589
+ const truncated = stat8.size > MAX_FILE_SIZE;
44590
+ return { content, truncated, size: stat8.size };
44550
44591
  } catch (error40) {
44551
44592
  return { error: error40 instanceof Error ? error40.message : String(error40) };
44552
44593
  }
@@ -44590,8 +44631,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
44590
44631
  const ext = path30.extname(entry.name).toLowerCase();
44591
44632
  if (BINARY_EXTENSIONS.has(ext)) continue;
44592
44633
  try {
44593
- const stat7 = await fs2.stat(fullPath);
44594
- if (stat7.size > MAX_FILE_SIZE) continue;
44634
+ const stat8 = await fs2.stat(fullPath);
44635
+ if (stat8.size > MAX_FILE_SIZE) continue;
44595
44636
  const content = await fs2.readFile(fullPath, "utf-8");
44596
44637
  const lines = content.split("\n");
44597
44638
  for (let i = 0; i < lines.length; i++) {
@@ -44749,7 +44790,7 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
44749
44790
  };
44750
44791
  }
44751
44792
  var TokenUsageEvaluator = class {
44752
- kind = "token_usage";
44793
+ kind = "token-usage";
44753
44794
  config;
44754
44795
  constructor(options) {
44755
44796
  this.config = options.config;
@@ -44772,7 +44813,7 @@ var TokenUsageEvaluator = class {
44772
44813
  expectedAspectCount,
44773
44814
  reasoning: "Token usage not reported by provider",
44774
44815
  evaluatorRawRequest: {
44775
- type: "token_usage",
44816
+ type: "token-usage",
44776
44817
  max_total: maxTotal ?? null,
44777
44818
  max_input: maxInput ?? null,
44778
44819
  max_output: maxOutput ?? null,
@@ -44814,9 +44855,9 @@ var TokenUsageEvaluator = class {
44814
44855
  hits,
44815
44856
  misses,
44816
44857
  expectedAspectCount,
44817
- reasoning: `token_usage input=${input}, output=${output}, cached=${cached2}, total=${total}`,
44858
+ reasoning: `token-usage input=${input}, output=${output}, cached=${cached2}, total=${total}`,
44818
44859
  evaluatorRawRequest: {
44819
- type: "token_usage",
44860
+ type: "token-usage",
44820
44861
  max_total: maxTotal ?? null,
44821
44862
  max_input: maxInput ?? null,
44822
44863
  max_output: maxOutput ?? null,
@@ -44899,7 +44940,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
44899
44940
  };
44900
44941
  }
44901
44942
  var ToolTrajectoryEvaluator = class {
44902
- kind = "tool_trajectory";
44943
+ kind = "tool-trajectory";
44903
44944
  config;
44904
44945
  constructor(options) {
44905
44946
  this.config = options.config;
@@ -45087,7 +45128,7 @@ var ToolTrajectoryEvaluator = class {
45087
45128
  }
45088
45129
  }
45089
45130
  for (const warning of warnings) {
45090
- console.warn(`[tool_trajectory] ${warning}`);
45131
+ console.warn(`[tool-trajectory] ${warning}`);
45091
45132
  }
45092
45133
  const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
45093
45134
  const totalAssertions = expected.length + effectiveLatencyAssertions;
@@ -45163,7 +45204,7 @@ var ToolTrajectoryEvaluator = class {
45163
45204
  misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
45164
45205
  }
45165
45206
  for (const warning of warnings) {
45166
- console.warn(`[tool_trajectory] ${warning}`);
45207
+ console.warn(`[tool-trajectory] ${warning}`);
45167
45208
  }
45168
45209
  const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
45169
45210
  const totalAssertions = expected.length + effectiveLatencyAssertions;
@@ -45631,7 +45672,7 @@ var llmJudgeFactory = (config2, context) => {
45631
45672
  const c = config2;
45632
45673
  const { llmJudge, agentTimeoutMs } = context;
45633
45674
  return {
45634
- kind: "llm_judge",
45675
+ kind: "llm-judge",
45635
45676
  async evaluate(evalContext) {
45636
45677
  const customPrompt = await resolveCustomPrompt(
45637
45678
  c,
@@ -45720,7 +45761,7 @@ var agentJudgeFactory = (config2, context) => {
45720
45761
  customPrompt = readFileSync(c.resolvedPromptPath, "utf-8");
45721
45762
  } catch (error40) {
45722
45763
  const message = error40 instanceof Error ? error40.message : String(error40);
45723
- console.warn(`Could not read agent_judge prompt at ${c.resolvedPromptPath}: ${message}`);
45764
+ console.warn(`Could not read agent-judge prompt at ${c.resolvedPromptPath}: ${message}`);
45724
45765
  }
45725
45766
  } else if (c.prompt) {
45726
45767
  customPrompt = c.prompt;
@@ -45730,7 +45771,7 @@ var agentJudgeFactory = (config2, context) => {
45730
45771
  judgeTargetProvider = targetResolver(c.target);
45731
45772
  if (!judgeTargetProvider) {
45732
45773
  throw new Error(
45733
- `agent_judge evaluator '${c.name}': target '${c.target}' not found in targets`
45774
+ `agent-judge evaluator '${c.name}': target '${c.target}' not found in targets`
45734
45775
  );
45735
45776
  }
45736
45777
  }
@@ -45774,7 +45815,7 @@ var regexFactory = (config2) => {
45774
45815
  });
45775
45816
  };
45776
45817
  var isJsonFactory = () => {
45777
- return new DeterministicAssertionEvaluator("is_json", (ctx) => {
45818
+ return new DeterministicAssertionEvaluator("is-json", (ctx) => {
45778
45819
  const result = runIsJsonAssertion(ctx.candidate);
45779
45820
  return {
45780
45821
  score: result.score,
@@ -45802,7 +45843,7 @@ var equalsFactory = (config2) => {
45802
45843
  };
45803
45844
  var containsAnyFactory = (config2) => {
45804
45845
  const c = config2;
45805
- return new DeterministicAssertionEvaluator("contains_any", (ctx) => {
45846
+ return new DeterministicAssertionEvaluator("contains-any", (ctx) => {
45806
45847
  const result = runContainsAnyAssertion(ctx.candidate, c.value);
45807
45848
  return {
45808
45849
  score: result.score,
@@ -45816,7 +45857,7 @@ var containsAnyFactory = (config2) => {
45816
45857
  };
45817
45858
  var containsAllFactory = (config2) => {
45818
45859
  const c = config2;
45819
- return new DeterministicAssertionEvaluator("contains_all", (ctx) => {
45860
+ return new DeterministicAssertionEvaluator("contains-all", (ctx) => {
45820
45861
  const result = runContainsAllAssertion(ctx.candidate, c.value);
45821
45862
  return {
45822
45863
  score: result.score,
@@ -45844,7 +45885,7 @@ var icontainsFactory = (config2) => {
45844
45885
  };
45845
45886
  var icontainsAnyFactory = (config2) => {
45846
45887
  const c = config2;
45847
- return new DeterministicAssertionEvaluator("icontains_any", (ctx) => {
45888
+ return new DeterministicAssertionEvaluator("icontains-any", (ctx) => {
45848
45889
  const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
45849
45890
  return {
45850
45891
  score: result.score,
@@ -45858,7 +45899,7 @@ var icontainsAnyFactory = (config2) => {
45858
45899
  };
45859
45900
  var icontainsAllFactory = (config2) => {
45860
45901
  const c = config2;
45861
- return new DeterministicAssertionEvaluator("icontains_all", (ctx) => {
45902
+ return new DeterministicAssertionEvaluator("icontains-all", (ctx) => {
45862
45903
  const result = runIcontainsAllAssertion(ctx.candidate, c.value);
45863
45904
  return {
45864
45905
  score: result.score,
@@ -45872,7 +45913,7 @@ var icontainsAllFactory = (config2) => {
45872
45913
  };
45873
45914
  var startsWithFactory = (config2) => {
45874
45915
  const c = config2;
45875
- return new DeterministicAssertionEvaluator("starts_with", (ctx) => {
45916
+ return new DeterministicAssertionEvaluator("starts-with", (ctx) => {
45876
45917
  const result = runStartsWithAssertion(ctx.candidate, c.value);
45877
45918
  return {
45878
45919
  score: result.score,
@@ -45886,7 +45927,7 @@ var startsWithFactory = (config2) => {
45886
45927
  };
45887
45928
  var endsWithFactory = (config2) => {
45888
45929
  const c = config2;
45889
- return new DeterministicAssertionEvaluator("ends_with", (ctx) => {
45930
+ return new DeterministicAssertionEvaluator("ends-with", (ctx) => {
45890
45931
  const result = runEndsWithAssertion(ctx.candidate, c.value);
45891
45932
  return {
45892
45933
  score: result.score,
@@ -45900,7 +45941,7 @@ var endsWithFactory = (config2) => {
45900
45941
  };
45901
45942
  function createBuiltinRegistry() {
45902
45943
  const registry2 = new EvaluatorRegistry();
45903
- registry2.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("contains_any", containsAnyFactory).register("contains_all", containsAllFactory).register("icontains", icontainsFactory).register("icontains_any", icontainsAnyFactory).register("icontains_all", icontainsAllFactory).register("starts_with", startsWithFactory).register("ends_with", endsWithFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
45944
+ registry2.register("llm-judge", llmJudgeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("agent-judge", agentJudgeFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory);
45904
45945
  return registry2;
45905
45946
  }
45906
45947
  async function discoverAssertions(registry2, baseDir) {
@@ -46553,7 +46594,8 @@ async function runEvaluation(options) {
46553
46594
  cleanupWorkspaces,
46554
46595
  trials,
46555
46596
  streamCallbacks,
46556
- totalBudgetUsd
46597
+ totalBudgetUsd,
46598
+ failOnError
46557
46599
  } = options;
46558
46600
  let useCache = options.useCache;
46559
46601
  if (trials && trials.count > 1 && useCache) {
@@ -46611,7 +46653,7 @@ async function runEvaluation(options) {
46611
46653
  };
46612
46654
  if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
46613
46655
  throw new Error(
46614
- `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure_base).`
46656
+ `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure-base).`
46615
46657
  );
46616
46658
  }
46617
46659
  const targetResolver = (name16) => {
@@ -46682,7 +46724,7 @@ async function runEvaluation(options) {
46682
46724
  const rawTemplate = suiteWorkspace?.template ?? getWorkspaceTemplate(target);
46683
46725
  const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
46684
46726
  const workspaceTemplate = resolvedTemplate?.dir;
46685
- const suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
46727
+ let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
46686
46728
  const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
46687
46729
  const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
46688
46730
  const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
@@ -46703,6 +46745,14 @@ async function runEvaluation(options) {
46703
46745
  const message = error40 instanceof Error ? error40.message : String(error40);
46704
46746
  throw new Error(`Failed to create shared workspace: ${message}`);
46705
46747
  }
46748
+ if (suiteWorkspaceFile && sharedWorkspacePath) {
46749
+ const copiedWorkspaceFile = path37.join(sharedWorkspacePath, path37.basename(suiteWorkspaceFile));
46750
+ try {
46751
+ await stat7(copiedWorkspaceFile);
46752
+ suiteWorkspaceFile = copiedWorkspaceFile;
46753
+ } catch {
46754
+ }
46755
+ }
46706
46756
  } else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
46707
46757
  sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
46708
46758
  await mkdir12(sharedWorkspacePath, { recursive: true });
@@ -46749,6 +46799,7 @@ async function runEvaluation(options) {
46749
46799
  let beforeAllOutputAttached = false;
46750
46800
  let cumulativeBudgetCost = 0;
46751
46801
  let budgetExhausted = false;
46802
+ let failOnErrorTriggered = false;
46752
46803
  const promises = filteredEvalCases.map(
46753
46804
  (evalCase) => limit(async () => {
46754
46805
  const workerId = nextWorkerId++;
@@ -46787,6 +46838,37 @@ async function runEvaluation(options) {
46787
46838
  }
46788
46839
  return budgetResult;
46789
46840
  }
46841
+ if (failOnError === true && failOnErrorTriggered) {
46842
+ const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
46843
+ const haltResult = {
46844
+ timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
46845
+ testId: evalCase.id,
46846
+ dataset: evalCase.dataset,
46847
+ score: 0,
46848
+ hits: [],
46849
+ misses: [],
46850
+ answer: "",
46851
+ target: target.name,
46852
+ error: errorMsg,
46853
+ executionStatus: "execution_error",
46854
+ failureStage: "setup",
46855
+ failureReasonCode: "error_threshold_exceeded",
46856
+ executionError: { message: errorMsg, stage: "setup" }
46857
+ };
46858
+ if (onProgress) {
46859
+ await onProgress({
46860
+ workerId,
46861
+ testId: evalCase.id,
46862
+ status: "failed",
46863
+ completedAt: Date.now(),
46864
+ error: haltResult.error
46865
+ });
46866
+ }
46867
+ if (onResult) {
46868
+ await onResult(haltResult);
46869
+ }
46870
+ return haltResult;
46871
+ }
46790
46872
  if (onProgress) {
46791
46873
  await onProgress({
46792
46874
  workerId,
@@ -46839,6 +46921,9 @@ async function runEvaluation(options) {
46839
46921
  }
46840
46922
  }
46841
46923
  }
46924
+ if (failOnError === true && result.executionStatus === "execution_error") {
46925
+ failOnErrorTriggered = true;
46926
+ }
46842
46927
  if (beforeAllOutput && !beforeAllOutputAttached) {
46843
46928
  result = { ...result, beforeAllOutput };
46844
46929
  beforeAllOutputAttached = true;
@@ -47146,6 +47231,14 @@ async function runEvalCase(options) {
47146
47231
  "template_error"
47147
47232
  );
47148
47233
  }
47234
+ if (caseWorkspaceFile && workspacePath) {
47235
+ const copiedFile = path37.join(workspacePath, path37.basename(caseWorkspaceFile));
47236
+ try {
47237
+ await stat7(copiedFile);
47238
+ caseWorkspaceFile = copiedFile;
47239
+ } catch {
47240
+ }
47241
+ }
47149
47242
  }
47150
47243
  if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
47151
47244
  workspacePath = getWorkspacePath(evalRunId, evalCase.id);
@@ -47655,8 +47748,8 @@ async function runEvaluatorsForCase(options) {
47655
47748
  workspacePath
47656
47749
  });
47657
47750
  }
47658
- const evaluatorKind = evalCase.evaluator ?? "llm_judge";
47659
- const activeEvaluator = evaluators[evaluatorKind] ?? evaluators.llm_judge;
47751
+ const evaluatorKind = evalCase.evaluator ?? "llm-judge";
47752
+ const activeEvaluator = evaluators[evaluatorKind] ?? evaluators["llm-judge"];
47660
47753
  if (!activeEvaluator) {
47661
47754
  throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
47662
47755
  }
@@ -47739,25 +47832,24 @@ async function runEvaluatorList(options) {
47739
47832
  availableTargets,
47740
47833
  agentTimeoutMs,
47741
47834
  evalFileDir,
47742
- llmJudge: evaluatorRegistry.llm_judge,
47835
+ llmJudge: evaluatorRegistry["llm-judge"],
47743
47836
  registry: typeRegistry
47744
47837
  };
47745
47838
  for (const evaluatorConfig of evaluators ?? []) {
47746
47839
  try {
47747
47840
  const evaluatorInstance = await typeRegistry.create(evaluatorConfig, dispatchContext);
47748
47841
  const score2 = await evaluatorInstance.evaluate(evalContext);
47749
- const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
47750
47842
  const weight = evaluatorConfig.weight ?? 1;
47751
47843
  scored.push({
47752
47844
  score: score2,
47753
47845
  name: evaluatorConfig.name,
47754
- type: resultType,
47846
+ type: evaluatorConfig.type,
47755
47847
  weight,
47756
47848
  ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
47757
47849
  });
47758
47850
  scores.push({
47759
47851
  name: evaluatorConfig.name,
47760
- type: resultType,
47852
+ type: evaluatorConfig.type,
47761
47853
  score: score2.score,
47762
47854
  weight,
47763
47855
  verdict: score2.verdict,
@@ -47779,18 +47871,17 @@ async function runEvaluatorList(options) {
47779
47871
  expectedAspectCount: 1,
47780
47872
  reasoning: message
47781
47873
  };
47782
- const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
47783
47874
  const weight = evaluatorConfig.weight ?? 1;
47784
47875
  scored.push({
47785
47876
  score: fallbackScore,
47786
47877
  name: evaluatorConfig.name ?? "unknown",
47787
- type: resultType ?? "llm_judge",
47878
+ type: evaluatorConfig.type ?? "llm-judge",
47788
47879
  weight,
47789
47880
  ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
47790
47881
  });
47791
47882
  scores.push({
47792
47883
  name: evaluatorConfig.name ?? "unknown",
47793
- type: resultType ?? "llm_judge",
47884
+ type: evaluatorConfig.type ?? "llm-judge",
47794
47885
  score: 0,
47795
47886
  weight,
47796
47887
  verdict: "fail",
@@ -47851,7 +47942,7 @@ function filterEvalCases(evalCases, filter2) {
47851
47942
  return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter2));
47852
47943
  }
47853
47944
  function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
47854
- const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
47945
+ const llmJudge = overrides?.["llm-judge"] ?? new LlmJudgeEvaluator({
47855
47946
  resolveJudgeProvider: async (context) => {
47856
47947
  if (context.judgeProvider) {
47857
47948
  return context.judgeProvider;
@@ -47861,7 +47952,7 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
47861
47952
  });
47862
47953
  return {
47863
47954
  ...overrides,
47864
- llm_judge: llmJudge
47955
+ "llm-judge": llmJudge
47865
47956
  };
47866
47957
  }
47867
47958
  async function invokeProvider(provider, options) {
@@ -48117,12 +48208,7 @@ async function evaluate(config2) {
48117
48208
  };
48118
48209
  }
48119
48210
  function mapAssertionType(type) {
48120
- switch (type) {
48121
- case "code_judge":
48122
- return "code";
48123
- default:
48124
- return type;
48125
- }
48211
+ return type.replace(/_/g, "-");
48126
48212
  }
48127
48213
  function computeSummary(results, durationMs) {
48128
48214
  const total = results.length;
@@ -48851,6 +48937,7 @@ export {
48851
48937
  extractTargetsFromTestCase,
48852
48938
  extractTrialsConfig,
48853
48939
  extractCacheConfig,
48940
+ extractFailOnError,
48854
48941
  detectFormat,
48855
48942
  buildPromptInputs,
48856
48943
  readTestSuiteMetadata,
@@ -48950,4 +49037,4 @@ export {
48950
49037
  OtelStreamingObserver,
48951
49038
  createAgentKernel
48952
49039
  };
48953
- //# sourceMappingURL=chunk-LUHCYBMD.js.map
49040
+ //# sourceMappingURL=chunk-OQN2GDEU.js.map