agentv 2.13.0 → 2.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -148,7 +148,7 @@ var require_dist = __commonJS({
148
148
  }
149
149
  });
150
150
 
151
- // ../../packages/core/dist/chunk-JHER2LQ5.js
151
+ // ../../packages/core/dist/chunk-N55K52OO.js
152
152
  import { constants } from "node:fs";
153
153
  import { access, readFile } from "node:fs/promises";
154
154
  import path from "node:path";
@@ -4195,7 +4195,7 @@ var coerce = {
4195
4195
  };
4196
4196
  var NEVER = INVALID;
4197
4197
 
4198
- // ../../packages/core/dist/chunk-JHER2LQ5.js
4198
+ // ../../packages/core/dist/chunk-N55K52OO.js
4199
4199
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
4200
4200
  var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
4201
4201
  var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
@@ -4243,27 +4243,27 @@ function isTestMessage(value) {
4243
4243
  return false;
4244
4244
  }
4245
4245
  var EVALUATOR_KIND_VALUES = [
4246
- "code_judge",
4247
- "llm_judge",
4246
+ "code-judge",
4247
+ "llm-judge",
4248
4248
  "rubric",
4249
4249
  "composite",
4250
- "tool_trajectory",
4251
- "field_accuracy",
4250
+ "tool-trajectory",
4251
+ "field-accuracy",
4252
4252
  "latency",
4253
4253
  "cost",
4254
- "token_usage",
4255
- "execution_metrics",
4256
- "agent_judge",
4254
+ "token-usage",
4255
+ "execution-metrics",
4256
+ "agent-judge",
4257
4257
  "contains",
4258
- "contains_any",
4259
- "contains_all",
4258
+ "contains-any",
4259
+ "contains-all",
4260
4260
  "icontains",
4261
- "icontains_any",
4262
- "icontains_all",
4263
- "starts_with",
4264
- "ends_with",
4261
+ "icontains-any",
4262
+ "icontains-all",
4263
+ "starts-with",
4264
+ "ends-with",
4265
4265
  "regex",
4266
- "is_json",
4266
+ "is-json",
4267
4267
  "equals",
4268
4268
  "rubrics"
4269
4269
  ];
@@ -33960,7 +33960,7 @@ import { createServer } from "node:http";
33960
33960
  import fs2 from "node:fs/promises";
33961
33961
  import path30 from "node:path";
33962
33962
  import { createHash as createHash2, randomUUID as randomUUID7 } from "node:crypto";
33963
- import { mkdir as mkdir12 } from "node:fs/promises";
33963
+ import { mkdir as mkdir12, stat as stat7 } from "node:fs/promises";
33964
33964
  import path37 from "node:path";
33965
33965
  import micromatch4 from "micromatch";
33966
33966
  import { readFileSync } from "node:fs";
@@ -34605,6 +34605,9 @@ function validateTemplateVariables(content, source) {
34605
34605
  }
34606
34606
  var ANSI_YELLOW4 = "\x1B[33m";
34607
34607
  var ANSI_RESET4 = "\x1B[0m";
34608
+ function normalizeEvaluatorType(type) {
34609
+ return type.replace(/_/g, "-");
34610
+ }
34608
34611
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
34609
34612
  const execution = rawEvalCase.execution;
34610
34613
  const executionObject = isJsonObject2(execution) ? execution : void 0;
@@ -34635,7 +34638,8 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34635
34638
  continue;
34636
34639
  }
34637
34640
  const rawName = asString(rawEvaluator.name);
34638
- const typeValue = rawEvaluator.type;
34641
+ const rawType = rawEvaluator.type;
34642
+ const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
34639
34643
  const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
34640
34644
  if (typeof typeValue !== "string") {
34641
34645
  logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
@@ -34668,25 +34672,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34668
34672
  });
34669
34673
  continue;
34670
34674
  }
34671
- if (typeValue === "code_judge") {
34675
+ if (typeValue === "code-judge") {
34672
34676
  let command;
34673
34677
  const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
34674
34678
  if (typeof rawCommand === "string") {
34675
34679
  const trimmed = rawCommand.trim();
34676
34680
  if (trimmed.length === 0) {
34677
34681
  throw new Error(
34678
- `Invalid code_judge command for evaluator '${name16}' in '${evalId}': command cannot be empty`
34682
+ `Invalid code-judge command for evaluator '${name16}' in '${evalId}': command cannot be empty`
34679
34683
  );
34680
34684
  }
34681
34685
  command = parseCommandToArgv(trimmed);
34682
34686
  } else {
34683
34687
  command = asStringArray(
34684
34688
  rawCommand,
34685
- `code_judge command for evaluator '${name16}' in '${evalId}'`
34689
+ `code-judge command for evaluator '${name16}' in '${evalId}'`
34686
34690
  );
34687
34691
  }
34688
34692
  if (!command) {
34689
- logWarning2(`Skipping code_judge evaluator '${name16}' in '${evalId}': missing command`);
34693
+ logWarning2(`Skipping code-judge evaluator '${name16}' in '${evalId}': missing command`);
34690
34694
  continue;
34691
34695
  }
34692
34696
  const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
@@ -34747,7 +34751,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34747
34751
  }
34748
34752
  evaluators.push({
34749
34753
  name: name16,
34750
- type: "code",
34754
+ type: "code-judge",
34751
34755
  command,
34752
34756
  cwd,
34753
34757
  resolvedCwd,
@@ -34773,7 +34777,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34773
34777
  continue;
34774
34778
  }
34775
34779
  const aggregatorType = asString(rawAggregator.type);
34776
- if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge" && aggregatorType !== "threshold") {
34780
+ if (aggregatorType !== "weighted_average" && aggregatorType !== "code-judge" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
34777
34781
  logWarning2(
34778
34782
  `Skipping composite evaluator '${name16}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
34779
34783
  );
@@ -34822,16 +34826,16 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34822
34826
  type: "weighted_average",
34823
34827
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
34824
34828
  };
34825
- } else if (aggregatorType === "code_judge") {
34829
+ } else if (aggregatorType === "code-judge") {
34826
34830
  const aggregatorPath = asString(rawAggregator.path);
34827
34831
  if (!aggregatorPath) {
34828
34832
  logWarning2(
34829
- `Skipping composite evaluator '${name16}' in '${evalId}': code_judge aggregator missing path`
34833
+ `Skipping composite evaluator '${name16}' in '${evalId}': code-judge aggregator missing path`
34830
34834
  );
34831
34835
  continue;
34832
34836
  }
34833
34837
  aggregator = {
34834
- type: "code_judge",
34838
+ type: "code-judge",
34835
34839
  path: aggregatorPath,
34836
34840
  cwd: searchRoots[0]
34837
34841
  };
@@ -34857,7 +34861,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34857
34861
  }
34858
34862
  }
34859
34863
  aggregator = {
34860
- type: "llm_judge",
34864
+ type: "llm-judge",
34861
34865
  ...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
34862
34866
  ...promptPath2 ? { promptPath: promptPath2 } : {}
34863
34867
  };
@@ -34875,11 +34879,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34875
34879
  });
34876
34880
  continue;
34877
34881
  }
34878
- if (typeValue === "tool_trajectory") {
34882
+ if (typeValue === "tool-trajectory") {
34879
34883
  const mode = asString(rawEvaluator.mode);
34880
34884
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact" && mode !== "subset" && mode !== "superset") {
34881
34885
  logWarning2(
34882
- `Skipping tool_trajectory evaluator '${name16}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
34886
+ `Skipping tool-trajectory evaluator '${name16}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
34883
34887
  );
34884
34888
  continue;
34885
34889
  }
@@ -34888,7 +34892,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34888
34892
  if (rawMinimums !== void 0) {
34889
34893
  if (!isJsonObject2(rawMinimums)) {
34890
34894
  logWarning2(
34891
- `Skipping tool_trajectory evaluator '${name16}' in '${evalId}': minimums must be an object`
34895
+ `Skipping tool-trajectory evaluator '${name16}' in '${evalId}': minimums must be an object`
34892
34896
  );
34893
34897
  continue;
34894
34898
  }
@@ -34914,7 +34918,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34914
34918
  argsMatch2 = rawArgsMatch;
34915
34919
  } else {
34916
34920
  logWarning2(
34917
- `Invalid args_match '${rawArgsMatch}' for tool_trajectory evaluator '${name16}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
34921
+ `Invalid args_match '${rawArgsMatch}' for tool-trajectory evaluator '${name16}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
34918
34922
  );
34919
34923
  }
34920
34924
  }
@@ -34924,7 +34928,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34924
34928
  if (rawExpected !== void 0) {
34925
34929
  if (!Array.isArray(rawExpected)) {
34926
34930
  logWarning2(
34927
- `Skipping tool_trajectory evaluator '${name16}' in '${evalId}': expected must be an array`
34931
+ `Skipping tool-trajectory evaluator '${name16}' in '${evalId}': expected must be an array`
34928
34932
  );
34929
34933
  continue;
34930
34934
  }
@@ -34970,13 +34974,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34970
34974
  }
34971
34975
  if (mode === "any_order" && !minimums) {
34972
34976
  logWarning2(
34973
- `Skipping tool_trajectory evaluator '${name16}' in '${evalId}': any_order mode requires minimums`
34977
+ `Skipping tool-trajectory evaluator '${name16}' in '${evalId}': any_order mode requires minimums`
34974
34978
  );
34975
34979
  continue;
34976
34980
  }
34977
34981
  if ((mode === "in_order" || mode === "exact" || mode === "subset" || mode === "superset") && !expected) {
34978
34982
  logWarning2(
34979
- `Skipping tool_trajectory evaluator '${name16}' in '${evalId}': ${mode} mode requires expected`
34983
+ `Skipping tool-trajectory evaluator '${name16}' in '${evalId}': ${mode} mode requires expected`
34980
34984
  );
34981
34985
  continue;
34982
34986
  }
@@ -34984,7 +34988,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34984
34988
  const required22 = parseRequired(rawEvaluator.required);
34985
34989
  const config22 = {
34986
34990
  name: name16,
34987
- type: "tool_trajectory",
34991
+ type: "tool-trajectory",
34988
34992
  mode,
34989
34993
  ...minimums ? { minimums } : {},
34990
34994
  ...expected ? { expected } : {},
@@ -34996,17 +35000,17 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
34996
35000
  evaluators.push(config22);
34997
35001
  continue;
34998
35002
  }
34999
- if (typeValue === "field_accuracy") {
35003
+ if (typeValue === "field-accuracy") {
35000
35004
  const rawFields = rawEvaluator.fields;
35001
35005
  if (!Array.isArray(rawFields)) {
35002
35006
  logWarning2(
35003
- `Skipping field_accuracy evaluator '${name16}' in '${evalId}': missing fields array`
35007
+ `Skipping field-accuracy evaluator '${name16}' in '${evalId}': missing fields array`
35004
35008
  );
35005
35009
  continue;
35006
35010
  }
35007
35011
  if (rawFields.length === 0) {
35008
35012
  logWarning2(
35009
- `Skipping field_accuracy evaluator '${name16}' in '${evalId}': fields array is empty`
35013
+ `Skipping field-accuracy evaluator '${name16}' in '${evalId}': fields array is empty`
35010
35014
  );
35011
35015
  continue;
35012
35016
  }
@@ -35014,7 +35018,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35014
35018
  for (const rawField of rawFields) {
35015
35019
  if (!isJsonObject2(rawField)) {
35016
35020
  logWarning2(
35017
- `Skipping invalid field entry in field_accuracy evaluator '${name16}' (expected object)`
35021
+ `Skipping invalid field entry in field-accuracy evaluator '${name16}' (expected object)`
35018
35022
  );
35019
35023
  continue;
35020
35024
  }
@@ -35022,13 +35026,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35022
35026
  const match = asString(rawField.match);
35023
35027
  if (!fieldPath) {
35024
35028
  logWarning2(
35025
- `Skipping field without path in field_accuracy evaluator '${name16}' in '${evalId}'`
35029
+ `Skipping field without path in field-accuracy evaluator '${name16}' in '${evalId}'`
35026
35030
  );
35027
35031
  continue;
35028
35032
  }
35029
35033
  if (!match || !isValidFieldMatchType(match)) {
35030
35034
  logWarning2(
35031
- `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name16}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code_judge evaluator.`
35035
+ `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name16}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code-judge evaluator.`
35032
35036
  );
35033
35037
  continue;
35034
35038
  }
@@ -35045,7 +35049,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35045
35049
  }
35046
35050
  if (fields.length === 0) {
35047
35051
  logWarning2(
35048
- `Skipping field_accuracy evaluator '${name16}' in '${evalId}': no valid fields found`
35052
+ `Skipping field-accuracy evaluator '${name16}' in '${evalId}': no valid fields found`
35049
35053
  );
35050
35054
  continue;
35051
35055
  }
@@ -35055,7 +35059,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35055
35059
  const required22 = parseRequired(rawEvaluator.required);
35056
35060
  evaluators.push({
35057
35061
  name: name16,
35058
- type: "field_accuracy",
35062
+ type: "field-accuracy",
35059
35063
  fields,
35060
35064
  ...validAggregation ? { aggregation: validAggregation } : {},
35061
35065
  ...weight2 !== void 0 ? { weight: weight2 } : {},
@@ -35104,7 +35108,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35104
35108
  });
35105
35109
  continue;
35106
35110
  }
35107
- if (typeValue === "token_usage") {
35111
+ if (typeValue === "token-usage") {
35108
35112
  const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
35109
35113
  const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
35110
35114
  const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
@@ -35118,7 +35122,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35118
35122
  if (raw === void 0) continue;
35119
35123
  if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
35120
35124
  logWarning2(
35121
- `Skipping token_usage evaluator '${name16}' in '${evalId}': ${key} must be a non-negative finite number`
35125
+ `Skipping token-usage evaluator '${name16}' in '${evalId}': ${key} must be a non-negative finite number`
35122
35126
  );
35123
35127
  continue;
35124
35128
  }
@@ -35126,7 +35130,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35126
35130
  }
35127
35131
  if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
35128
35132
  logWarning2(
35129
- `Skipping token_usage evaluator '${name16}' in '${evalId}': must set at least one of max_total, max_input, max_output`
35133
+ `Skipping token-usage evaluator '${name16}' in '${evalId}': must set at least one of max_total, max_input, max_output`
35130
35134
  );
35131
35135
  continue;
35132
35136
  }
@@ -35134,7 +35138,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35134
35138
  const required22 = parseRequired(rawEvaluator.required);
35135
35139
  evaluators.push({
35136
35140
  name: name16,
35137
- type: "token_usage",
35141
+ type: "token-usage",
35138
35142
  ...validLimits,
35139
35143
  ...weight2 !== void 0 ? { weight: weight2 } : {},
35140
35144
  ...required22 !== void 0 ? { required: required22 } : {},
@@ -35142,7 +35146,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35142
35146
  });
35143
35147
  continue;
35144
35148
  }
35145
- if (typeValue === "execution_metrics") {
35149
+ if (typeValue === "execution-metrics") {
35146
35150
  const maxToolCalls = rawEvaluator.max_tool_calls ?? rawEvaluator.maxToolCalls;
35147
35151
  const maxLlmCalls = rawEvaluator.max_llm_calls ?? rawEvaluator.maxLlmCalls;
35148
35152
  const maxTokens = rawEvaluator.max_tokens ?? rawEvaluator.maxTokens;
@@ -35165,7 +35169,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35165
35169
  if (raw === void 0) continue;
35166
35170
  if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
35167
35171
  logWarning2(
35168
- `Skipping execution_metrics evaluator '${name16}' in '${evalId}': ${key} must be a non-negative finite number`
35172
+ `Skipping execution-metrics evaluator '${name16}' in '${evalId}': ${key} must be a non-negative finite number`
35169
35173
  );
35170
35174
  hasError = true;
35171
35175
  break;
@@ -35178,7 +35182,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35178
35182
  const hasThreshold = validThresholds.max_tool_calls !== void 0 || validThresholds.max_llm_calls !== void 0 || validThresholds.max_tokens !== void 0 || validThresholds.max_cost_usd !== void 0 || validThresholds.max_duration_ms !== void 0 || validThresholds.target_exploration_ratio !== void 0;
35179
35183
  if (!hasThreshold) {
35180
35184
  logWarning2(
35181
- `Skipping execution_metrics evaluator '${name16}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
35185
+ `Skipping execution-metrics evaluator '${name16}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
35182
35186
  );
35183
35187
  continue;
35184
35188
  }
@@ -35186,7 +35190,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35186
35190
  const required22 = parseRequired(rawEvaluator.required);
35187
35191
  evaluators.push({
35188
35192
  name: name16,
35189
- type: "execution_metrics",
35193
+ type: "execution-metrics",
35190
35194
  ...validThresholds,
35191
35195
  ...weight2 !== void 0 ? { weight: weight2 } : {},
35192
35196
  ...required22 !== void 0 ? { required: required22 } : {},
@@ -35194,13 +35198,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35194
35198
  });
35195
35199
  continue;
35196
35200
  }
35197
- if (typeValue === "agent_judge") {
35201
+ if (typeValue === "agent-judge") {
35198
35202
  const rawMaxSteps = rawEvaluator.max_steps ?? rawEvaluator.maxSteps;
35199
35203
  let maxSteps;
35200
35204
  if (rawMaxSteps !== void 0) {
35201
35205
  if (typeof rawMaxSteps !== "number" || !Number.isInteger(rawMaxSteps) || rawMaxSteps < 1 || rawMaxSteps > 50) {
35202
35206
  logWarning2(
35203
- `Skipping agent_judge evaluator '${name16}' in '${evalId}': max_steps must be an integer 1-50`
35207
+ `Skipping agent-judge evaluator '${name16}' in '${evalId}': max_steps must be an integer 1-50`
35204
35208
  );
35205
35209
  continue;
35206
35210
  }
@@ -35211,7 +35215,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35211
35215
  if (rawTemperature !== void 0) {
35212
35216
  if (typeof rawTemperature !== "number" || rawTemperature < 0 || rawTemperature > 2) {
35213
35217
  logWarning2(
35214
- `Skipping agent_judge evaluator '${name16}' in '${evalId}': temperature must be a number 0-2`
35218
+ `Skipping agent-judge evaluator '${name16}' in '${evalId}': temperature must be a number 0-2`
35215
35219
  );
35216
35220
  continue;
35217
35221
  }
@@ -35234,7 +35238,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35234
35238
  const required22 = parseRequired(rawEvaluator.required);
35235
35239
  evaluators.push({
35236
35240
  name: name16,
35237
- type: "agent_judge",
35241
+ type: "agent-judge",
35238
35242
  ...agentPrompt ? { prompt: agentPrompt } : {},
35239
35243
  ...agentPromptPath ? { promptPath: agentPromptPath, resolvedPromptPath: agentPromptPath } : {},
35240
35244
  ...agentParsedRubrics && agentParsedRubrics.length > 0 ? { rubrics: agentParsedRubrics } : {},
@@ -35265,7 +35269,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35265
35269
  });
35266
35270
  continue;
35267
35271
  }
35268
- if (typeValue === "contains_any" || typeValue === "contains_all") {
35272
+ if (typeValue === "contains-any" || typeValue === "contains-all") {
35269
35273
  const value = asStringArrayStrict(rawEvaluator.value);
35270
35274
  if (!value || value.length === 0) {
35271
35275
  logWarning2(
@@ -35303,7 +35307,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35303
35307
  });
35304
35308
  continue;
35305
35309
  }
35306
- if (typeValue === "icontains_any" || typeValue === "icontains_all") {
35310
+ if (typeValue === "icontains-any" || typeValue === "icontains-all") {
35307
35311
  const value = asStringArrayStrict(rawEvaluator.value);
35308
35312
  if (!value || value.length === 0) {
35309
35313
  logWarning2(
@@ -35323,7 +35327,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35323
35327
  });
35324
35328
  continue;
35325
35329
  }
35326
- if (typeValue === "starts_with" || typeValue === "ends_with") {
35330
+ if (typeValue === "starts-with" || typeValue === "ends-with") {
35327
35331
  const value = asString(rawEvaluator.value);
35328
35332
  if (!value) {
35329
35333
  logWarning2(`Skipping ${typeValue} evaluator '${name16}' in '${evalId}': missing value`);
@@ -35361,12 +35365,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35361
35365
  });
35362
35366
  continue;
35363
35367
  }
35364
- if (typeValue === "is_json") {
35368
+ if (typeValue === "is-json") {
35365
35369
  const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
35366
35370
  const required22 = parseRequired(rawEvaluator.required);
35367
35371
  evaluators.push({
35368
35372
  name: name16,
35369
- type: "is_json",
35373
+ type: "is-json",
35370
35374
  ...weight2 !== void 0 ? { weight: weight2 } : {},
35371
35375
  ...required22 !== void 0 ? { required: required22 } : {},
35372
35376
  ...negate !== void 0 ? { negate } : {}
@@ -35414,7 +35418,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35414
35418
  const required22 = parseRequired(rawEvaluator.required);
35415
35419
  evaluators.push({
35416
35420
  name: name16,
35417
- type: "llm_judge",
35421
+ type: "llm-judge",
35418
35422
  rubrics: parsedCriteria,
35419
35423
  ...weight2 !== void 0 ? { weight: weight2 } : {},
35420
35424
  ...required22 !== void 0 ? { required: required22 } : {},
@@ -35481,7 +35485,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35481
35485
  const required22 = parseRequired(rawEvaluator.required);
35482
35486
  evaluators.push({
35483
35487
  name: name16,
35484
- type: "llm_judge",
35488
+ type: "llm-judge",
35485
35489
  rubrics: parsedRubrics,
35486
35490
  ...weight2 !== void 0 ? { weight: weight2 } : {},
35487
35491
  ...required22 !== void 0 ? { required: required22 } : {},
@@ -35513,7 +35517,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35513
35517
  const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
35514
35518
  evaluators.push({
35515
35519
  name: name16,
35516
- type: "llm_judge",
35520
+ type: "llm-judge",
35517
35521
  prompt,
35518
35522
  promptPath,
35519
35523
  ...promptPath ? { resolvedPromptPath: promptPath } : {},
@@ -35529,15 +35533,15 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
35529
35533
  }
35530
35534
  var ASSERTION_TYPES = /* @__PURE__ */ new Set([
35531
35535
  "contains",
35532
- "contains_any",
35533
- "contains_all",
35536
+ "contains-any",
35537
+ "contains-all",
35534
35538
  "icontains",
35535
- "icontains_any",
35536
- "icontains_all",
35537
- "starts_with",
35538
- "ends_with",
35539
+ "icontains-any",
35540
+ "icontains-all",
35541
+ "starts-with",
35542
+ "ends-with",
35539
35543
  "regex",
35540
- "is_json",
35544
+ "is-json",
35541
35545
  "equals",
35542
35546
  "rubrics"
35543
35547
  ]);
@@ -35550,24 +35554,24 @@ function generateAssertionName(typeValue, rawEvaluator) {
35550
35554
  switch (typeValue) {
35551
35555
  case "contains":
35552
35556
  return value ? `contains-${value}` : "contains";
35553
- case "contains_any":
35554
- return arrayValue ? `contains_any-${arrayValue.length}` : "contains_any";
35555
- case "contains_all":
35556
- return arrayValue ? `contains_all-${arrayValue.length}` : "contains_all";
35557
+ case "contains-any":
35558
+ return arrayValue ? `contains-any-${arrayValue.length}` : "contains-any";
35559
+ case "contains-all":
35560
+ return arrayValue ? `contains-all-${arrayValue.length}` : "contains-all";
35557
35561
  case "icontains":
35558
35562
  return value ? `icontains-${value}` : "icontains";
35559
- case "icontains_any":
35560
- return arrayValue ? `icontains_any-${arrayValue.length}` : "icontains_any";
35561
- case "icontains_all":
35562
- return arrayValue ? `icontains_all-${arrayValue.length}` : "icontains_all";
35563
- case "starts_with":
35564
- return value ? `starts_with-${value}` : "starts_with";
35565
- case "ends_with":
35566
- return value ? `ends_with-${value}` : "ends_with";
35563
+ case "icontains-any":
35564
+ return arrayValue ? `icontains-any-${arrayValue.length}` : "icontains-any";
35565
+ case "icontains-all":
35566
+ return arrayValue ? `icontains-all-${arrayValue.length}` : "icontains-all";
35567
+ case "starts-with":
35568
+ return value ? `starts-with-${value}` : "starts-with";
35569
+ case "ends-with":
35570
+ return value ? `ends-with-${value}` : "ends-with";
35567
35571
  case "regex":
35568
35572
  return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
35569
- case "is_json":
35570
- return "is_json";
35573
+ case "is-json":
35574
+ return "is-json";
35571
35575
  case "equals":
35572
35576
  return value ? `equals-${value}` : "equals";
35573
35577
  case "rubrics":
@@ -35580,8 +35584,9 @@ function coerceEvaluator(candidate, contextId) {
35580
35584
  if (typeof candidate !== "string") {
35581
35585
  return void 0;
35582
35586
  }
35583
- if (isEvaluatorKind(candidate)) {
35584
- return candidate;
35587
+ const normalized = normalizeEvaluatorType(candidate);
35588
+ if (isEvaluatorKind(normalized)) {
35589
+ return normalized;
35585
35590
  }
35586
35591
  logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
35587
35592
  return void 0;
@@ -35627,6 +35632,16 @@ function parseCommandToArgv(command) {
35627
35632
  function isJsonObject2(value) {
35628
35633
  return typeof value === "object" && value !== null && !Array.isArray(value);
35629
35634
  }
35635
+ var CRITERIA_CONSUMER_TYPES = /* @__PURE__ */ new Set(["llm-judge", "agent-judge", "code-judge"]);
35636
+ function warnUnconsumedCriteria(criteria, evaluators, testId) {
35637
+ if (!criteria?.trim() || !evaluators || evaluators.length === 0) return;
35638
+ const hasConsumer = evaluators.some((e) => CRITERIA_CONSUMER_TYPES.has(e.type));
35639
+ if (!hasConsumer) {
35640
+ logWarning2(
35641
+ `Test '${testId}': criteria is defined but no evaluator in assert will evaluate it. Add 'type: llm-judge' to assert, or remove criteria if it is documentation-only.`
35642
+ );
35643
+ }
35644
+ }
35630
35645
  function logWarning2(message, details) {
35631
35646
  if (details && details.length > 0) {
35632
35647
  const detailBlock = details.join("\n");
@@ -35876,7 +35891,7 @@ function parseInlineRubrics(rawRubrics) {
35876
35891
  }
35877
35892
  return {
35878
35893
  name: "rubric",
35879
- type: "llm_judge",
35894
+ type: "llm-judge",
35880
35895
  rubrics: rubricItems
35881
35896
  };
35882
35897
  }
@@ -36243,7 +36258,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
36243
36258
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
36244
36259
  const fallbackDataset = path6.basename(absoluteTestPath, ".jsonl") || "eval";
36245
36260
  const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
36246
- const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm_judge";
36261
+ const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-judge";
36247
36262
  const globalExecution = sidecar.execution;
36248
36263
  if (verbose) {
36249
36264
  console.log(`
@@ -36331,6 +36346,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
36331
36346
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
36332
36347
  }
36333
36348
  }
36349
+ warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
36334
36350
  const userFilePaths = [];
36335
36351
  for (const segment of inputSegments) {
36336
36352
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -36714,7 +36730,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
36714
36730
  const fallbackDataset = path8.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
36715
36731
  const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
36716
36732
  const rawTestcases = resolveTests(suite);
36717
- const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
36733
+ const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-judge";
36718
36734
  const evalFileDir = path8.dirname(absoluteTestPath);
36719
36735
  let expandedTestcases;
36720
36736
  if (typeof rawTestcases === "string") {
@@ -36811,6 +36827,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
36811
36827
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
36812
36828
  }
36813
36829
  }
36830
+ warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
36814
36831
  const userFilePaths = [];
36815
36832
  for (const segment of inputSegments) {
36816
36833
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -42612,7 +42629,7 @@ function toCamelCaseDeep(obj) {
42612
42629
  }
42613
42630
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
42614
42631
  var CodeEvaluator = class {
42615
- kind = "code";
42632
+ kind = "code-judge";
42616
42633
  command;
42617
42634
  cwd;
42618
42635
  agentTimeoutMs;
@@ -42813,7 +42830,7 @@ var scoreRangeEvaluationSchema = external_exports.object({
42813
42830
  overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)").optional()
42814
42831
  });
42815
42832
  var LlmJudgeEvaluator = class {
42816
- kind = "llm_judge";
42833
+ kind = "llm-judge";
42817
42834
  resolveJudgeProvider;
42818
42835
  maxOutputTokens;
42819
42836
  temperature;
@@ -42830,7 +42847,7 @@ var LlmJudgeEvaluator = class {
42830
42847
  throw new Error("No judge provider available for LLM grading");
42831
42848
  }
42832
42849
  const config2 = context.evaluator;
42833
- if (config2?.type === "llm_judge" && config2.rubrics && config2.rubrics.length > 0) {
42850
+ if (config2?.type === "llm-judge" && config2.rubrics && config2.rubrics.length > 0) {
42834
42851
  return this.evaluateWithRubrics(context, judgeProvider, config2.rubrics);
42835
42852
  }
42836
42853
  return this.evaluateFreeform(context, judgeProvider);
@@ -42904,7 +42921,7 @@ ${context.fileChanges}`;
42904
42921
  async evaluateWithRubrics(context, judgeProvider, rubrics) {
42905
42922
  if (!rubrics || rubrics.length === 0) {
42906
42923
  throw new Error(
42907
- `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
42924
+ `No rubrics found for evaluator "${context.evaluator?.name ?? "llm-judge"}". Run "agentv generate rubrics" first.`
42908
42925
  );
42909
42926
  }
42910
42927
  const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
@@ -43238,9 +43255,9 @@ var CompositeEvaluator = class {
43238
43255
  async aggregate(results, context) {
43239
43256
  const aggregator = this.config.aggregator;
43240
43257
  switch (aggregator.type) {
43241
- case "code_judge":
43258
+ case "code-judge":
43242
43259
  return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
43243
- case "llm_judge":
43260
+ case "llm-judge":
43244
43261
  return this.runLlmAggregator(results, context, aggregator);
43245
43262
  case "threshold":
43246
43263
  return this.runThreshold(results, aggregator.threshold);
@@ -43383,7 +43400,7 @@ var CompositeEvaluator = class {
43383
43400
  expectedAspectCount: hits.length + misses.length || 1,
43384
43401
  reasoning,
43385
43402
  evaluatorRawRequest: {
43386
- aggregator: "code_judge",
43403
+ aggregator: "code-judge",
43387
43404
  script: scriptPath
43388
43405
  },
43389
43406
  scores
@@ -43398,7 +43415,7 @@ var CompositeEvaluator = class {
43398
43415
  expectedAspectCount: 1,
43399
43416
  reasoning: message,
43400
43417
  evaluatorRawRequest: {
43401
- aggregator: "code_judge",
43418
+ aggregator: "code-judge",
43402
43419
  script: scriptPath,
43403
43420
  error: message
43404
43421
  },
@@ -43429,7 +43446,7 @@ var CompositeEvaluator = class {
43429
43446
  const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
43430
43447
  const systemPrompt = buildOutputSchema();
43431
43448
  const evaluatorRawRequest = {
43432
- aggregator: "llm_judge",
43449
+ aggregator: "llm-judge",
43433
43450
  userPrompt,
43434
43451
  systemPrompt,
43435
43452
  target: judgeProvider.targetName
@@ -43537,7 +43554,7 @@ var CostEvaluator = class {
43537
43554
  }
43538
43555
  };
43539
43556
  var ExecutionMetricsEvaluator = class {
43540
- kind = "execution_metrics";
43557
+ kind = "execution-metrics";
43541
43558
  config;
43542
43559
  constructor(options) {
43543
43560
  this.config = options.config;
@@ -43563,7 +43580,7 @@ var ExecutionMetricsEvaluator = class {
43563
43580
  expectedAspectCount: 1,
43564
43581
  reasoning: "Execution metrics not available - no trace summary provided",
43565
43582
  evaluatorRawRequest: {
43566
- type: "execution_metrics",
43583
+ type: "execution-metrics",
43567
43584
  config: this.extractConfiguredThresholds(),
43568
43585
  actual: null
43569
43586
  }
@@ -43672,7 +43689,7 @@ var ExecutionMetricsEvaluator = class {
43672
43689
  if (actualMetrics.exploration_ratio !== void 0) {
43673
43690
  reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
43674
43691
  }
43675
- const reasoning = reasoningParts.length > 0 ? `execution_metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
43692
+ const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
43676
43693
  return {
43677
43694
  score,
43678
43695
  verdict: scoreToVerdict(score),
@@ -43681,7 +43698,7 @@ var ExecutionMetricsEvaluator = class {
43681
43698
  expectedAspectCount: totalChecks || 1,
43682
43699
  reasoning,
43683
43700
  evaluatorRawRequest: {
43684
- type: "execution_metrics",
43701
+ type: "execution-metrics",
43685
43702
  config: this.extractConfiguredThresholds(),
43686
43703
  actual: this.filterDefinedMetrics(actualMetrics)
43687
43704
  }
@@ -43767,7 +43784,7 @@ var MONTH_NAMES = {
43767
43784
  december: 11
43768
43785
  };
43769
43786
  var FieldAccuracyEvaluator = class {
43770
- kind = "field_accuracy";
43787
+ kind = "field-accuracy";
43771
43788
  config;
43772
43789
  constructor(options) {
43773
43790
  this.config = options.config;
@@ -44213,7 +44230,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
44213
44230
  ".dylib"
44214
44231
  ]);
44215
44232
  var AgentJudgeEvaluator = class {
44216
- kind = "agent_judge";
44233
+ kind = "agent-judge";
44217
44234
  resolveJudgeProvider;
44218
44235
  maxSteps;
44219
44236
  temperature;
@@ -44238,24 +44255,24 @@ var AgentJudgeEvaluator = class {
44238
44255
  async evaluateBuiltIn(context) {
44239
44256
  const judgeProvider = await this.resolveJudgeProvider(context);
44240
44257
  if (!judgeProvider) {
44241
- throw new Error("No judge provider available for agent_judge evaluation");
44258
+ throw new Error("No judge provider available for agent-judge evaluation");
44242
44259
  }
44243
44260
  const model = judgeProvider.asLanguageModel?.();
44244
44261
  if (!model) {
44245
44262
  throw new Error(
44246
- `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent_judge mode`
44263
+ `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent-judge mode`
44247
44264
  );
44248
44265
  }
44249
44266
  const workspacePath = context.workspacePath;
44250
44267
  if (!workspacePath) {
44251
44268
  throw new Error(
44252
- "agent_judge evaluator requires a workspace_template target (workspacePath is not set)"
44269
+ "agent-judge evaluator requires a workspace_template target (workspacePath is not set)"
44253
44270
  );
44254
44271
  }
44255
44272
  const systemPrompt = this.buildSystemPrompt(context);
44256
44273
  const userPrompt = this.buildUserPrompt(context);
44257
44274
  const config2 = context.evaluator;
44258
- const rubrics = config2?.type === "agent_judge" ? config2.rubrics : void 0;
44275
+ const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
44259
44276
  const fsTools = createFilesystemTools(workspacePath);
44260
44277
  const evaluatorRawRequest = {
44261
44278
  mode: "built-in",
@@ -44286,7 +44303,7 @@ var AgentJudgeEvaluator = class {
44286
44303
  score: 0,
44287
44304
  verdict: "fail",
44288
44305
  hits: [],
44289
- misses: [`agent_judge built-in evaluation failed: ${message}`],
44306
+ misses: [`agent-judge built-in evaluation failed: ${message}`],
44290
44307
  expectedAspectCount: 1,
44291
44308
  evaluatorRawRequest,
44292
44309
  details: { mode: "built-in", error: message }
@@ -44318,14 +44335,14 @@ var AgentJudgeEvaluator = class {
44318
44335
  score: 0,
44319
44336
  verdict: "fail",
44320
44337
  hits: [],
44321
- misses: ["agent_judge judge_target returned no assistant response"],
44338
+ misses: ["agent-judge judge_target returned no assistant response"],
44322
44339
  expectedAspectCount: 1,
44323
44340
  evaluatorRawRequest,
44324
44341
  details: { mode: "judge_target", judge_target: provider.targetName }
44325
44342
  };
44326
44343
  }
44327
44344
  const config2 = context.evaluator;
44328
- const rubrics = config2?.type === "agent_judge" ? config2.rubrics : void 0;
44345
+ const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
44329
44346
  const details = {
44330
44347
  mode: "judge_target",
44331
44348
  judge_target: provider.targetName
@@ -44337,7 +44354,7 @@ var AgentJudgeEvaluator = class {
44337
44354
  score: 0,
44338
44355
  verdict: "fail",
44339
44356
  hits: [],
44340
- misses: [`agent_judge judge_target evaluation failed: ${message}`],
44357
+ misses: [`agent-judge judge_target evaluation failed: ${message}`],
44341
44358
  expectedAspectCount: 1,
44342
44359
  evaluatorRawRequest,
44343
44360
  details: {
@@ -44388,7 +44405,7 @@ var AgentJudgeEvaluator = class {
44388
44405
  score: 0,
44389
44406
  verdict: "fail",
44390
44407
  hits: [],
44391
- misses: ["Failed to parse agent_judge response as valid evaluation JSON"],
44408
+ misses: ["Failed to parse agent-judge response as valid evaluation JSON"],
44392
44409
  expectedAspectCount: 1,
44393
44410
  evaluatorRawRequest,
44394
44411
  details
@@ -44401,7 +44418,7 @@ var AgentJudgeEvaluator = class {
44401
44418
  */
44402
44419
  buildSystemPrompt(context) {
44403
44420
  const config2 = context.evaluator;
44404
- const rubrics = config2?.type === "agent_judge" ? config2.rubrics : void 0;
44421
+ const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
44405
44422
  const parts = [
44406
44423
  "You are an expert evaluator with access to the workspace filesystem.",
44407
44424
  "Use the provided tools to investigate the workspace and verify the criteria are met.",
@@ -44432,7 +44449,7 @@ var AgentJudgeEvaluator = class {
44432
44449
  return substituteVariables(this.evaluatorTemplate, variables);
44433
44450
  }
44434
44451
  const config2 = context.evaluator;
44435
- const rubrics = config2?.type === "agent_judge" ? config2.rubrics : void 0;
44452
+ const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
44436
44453
  const parts = [
44437
44454
  "Evaluate the candidate answer by investigating the workspace.",
44438
44455
  "",
@@ -44475,7 +44492,7 @@ var AgentJudgeEvaluator = class {
44475
44492
  buildDelegatedPrompt(context) {
44476
44493
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
44477
44494
  const config2 = context.evaluator;
44478
- const rubrics = config2?.type === "agent_judge" ? config2.rubrics : void 0;
44495
+ const rubrics = config2?.type === "agent-judge" ? config2.rubrics : void 0;
44479
44496
  if (this.evaluatorTemplate) {
44480
44497
  const variables = {
44481
44498
  [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
@@ -44557,11 +44574,11 @@ function createFilesystemTools(workspacePath) {
44557
44574
  execute: async (input) => {
44558
44575
  try {
44559
44576
  const resolved = resolveSandboxed(workspacePath, input.path);
44560
- const stat7 = await fs2.stat(resolved);
44561
- if (stat7.isDirectory()) {
44577
+ const stat8 = await fs2.stat(resolved);
44578
+ if (stat8.isDirectory()) {
44562
44579
  return { error: `'${input.path}' is a directory, not a file` };
44563
44580
  }
44564
- const buffer = Buffer.alloc(Math.min(stat7.size, MAX_FILE_SIZE));
44581
+ const buffer = Buffer.alloc(Math.min(stat8.size, MAX_FILE_SIZE));
44565
44582
  const fd = await fs2.open(resolved, "r");
44566
44583
  try {
44567
44584
  await fd.read(buffer, 0, buffer.length, 0);
@@ -44569,8 +44586,8 @@ function createFilesystemTools(workspacePath) {
44569
44586
  await fd.close();
44570
44587
  }
44571
44588
  const content = buffer.toString("utf-8");
44572
- const truncated = stat7.size > MAX_FILE_SIZE;
44573
- return { content, truncated, size: stat7.size };
44589
+ const truncated = stat8.size > MAX_FILE_SIZE;
44590
+ return { content, truncated, size: stat8.size };
44574
44591
  } catch (error40) {
44575
44592
  return { error: error40 instanceof Error ? error40.message : String(error40) };
44576
44593
  }
@@ -44614,8 +44631,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
44614
44631
  const ext = path30.extname(entry.name).toLowerCase();
44615
44632
  if (BINARY_EXTENSIONS.has(ext)) continue;
44616
44633
  try {
44617
- const stat7 = await fs2.stat(fullPath);
44618
- if (stat7.size > MAX_FILE_SIZE) continue;
44634
+ const stat8 = await fs2.stat(fullPath);
44635
+ if (stat8.size > MAX_FILE_SIZE) continue;
44619
44636
  const content = await fs2.readFile(fullPath, "utf-8");
44620
44637
  const lines = content.split("\n");
44621
44638
  for (let i = 0; i < lines.length; i++) {
@@ -44773,7 +44790,7 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
44773
44790
  };
44774
44791
  }
44775
44792
  var TokenUsageEvaluator = class {
44776
- kind = "token_usage";
44793
+ kind = "token-usage";
44777
44794
  config;
44778
44795
  constructor(options) {
44779
44796
  this.config = options.config;
@@ -44796,7 +44813,7 @@ var TokenUsageEvaluator = class {
44796
44813
  expectedAspectCount,
44797
44814
  reasoning: "Token usage not reported by provider",
44798
44815
  evaluatorRawRequest: {
44799
- type: "token_usage",
44816
+ type: "token-usage",
44800
44817
  max_total: maxTotal ?? null,
44801
44818
  max_input: maxInput ?? null,
44802
44819
  max_output: maxOutput ?? null,
@@ -44838,9 +44855,9 @@ var TokenUsageEvaluator = class {
44838
44855
  hits,
44839
44856
  misses,
44840
44857
  expectedAspectCount,
44841
- reasoning: `token_usage input=${input}, output=${output}, cached=${cached2}, total=${total}`,
44858
+ reasoning: `token-usage input=${input}, output=${output}, cached=${cached2}, total=${total}`,
44842
44859
  evaluatorRawRequest: {
44843
- type: "token_usage",
44860
+ type: "token-usage",
44844
44861
  max_total: maxTotal ?? null,
44845
44862
  max_input: maxInput ?? null,
44846
44863
  max_output: maxOutput ?? null,
@@ -44923,7 +44940,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
44923
44940
  };
44924
44941
  }
44925
44942
  var ToolTrajectoryEvaluator = class {
44926
- kind = "tool_trajectory";
44943
+ kind = "tool-trajectory";
44927
44944
  config;
44928
44945
  constructor(options) {
44929
44946
  this.config = options.config;
@@ -45111,7 +45128,7 @@ var ToolTrajectoryEvaluator = class {
45111
45128
  }
45112
45129
  }
45113
45130
  for (const warning of warnings) {
45114
- console.warn(`[tool_trajectory] ${warning}`);
45131
+ console.warn(`[tool-trajectory] ${warning}`);
45115
45132
  }
45116
45133
  const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
45117
45134
  const totalAssertions = expected.length + effectiveLatencyAssertions;
@@ -45187,7 +45204,7 @@ var ToolTrajectoryEvaluator = class {
45187
45204
  misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
45188
45205
  }
45189
45206
  for (const warning of warnings) {
45190
- console.warn(`[tool_trajectory] ${warning}`);
45207
+ console.warn(`[tool-trajectory] ${warning}`);
45191
45208
  }
45192
45209
  const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
45193
45210
  const totalAssertions = expected.length + effectiveLatencyAssertions;
@@ -45655,7 +45672,7 @@ var llmJudgeFactory = (config2, context) => {
45655
45672
  const c = config2;
45656
45673
  const { llmJudge, agentTimeoutMs } = context;
45657
45674
  return {
45658
- kind: "llm_judge",
45675
+ kind: "llm-judge",
45659
45676
  async evaluate(evalContext) {
45660
45677
  const customPrompt = await resolveCustomPrompt(
45661
45678
  c,
@@ -45744,7 +45761,7 @@ var agentJudgeFactory = (config2, context) => {
45744
45761
  customPrompt = readFileSync(c.resolvedPromptPath, "utf-8");
45745
45762
  } catch (error40) {
45746
45763
  const message = error40 instanceof Error ? error40.message : String(error40);
45747
- console.warn(`Could not read agent_judge prompt at ${c.resolvedPromptPath}: ${message}`);
45764
+ console.warn(`Could not read agent-judge prompt at ${c.resolvedPromptPath}: ${message}`);
45748
45765
  }
45749
45766
  } else if (c.prompt) {
45750
45767
  customPrompt = c.prompt;
@@ -45754,7 +45771,7 @@ var agentJudgeFactory = (config2, context) => {
45754
45771
  judgeTargetProvider = targetResolver(c.target);
45755
45772
  if (!judgeTargetProvider) {
45756
45773
  throw new Error(
45757
- `agent_judge evaluator '${c.name}': target '${c.target}' not found in targets`
45774
+ `agent-judge evaluator '${c.name}': target '${c.target}' not found in targets`
45758
45775
  );
45759
45776
  }
45760
45777
  }
@@ -45798,7 +45815,7 @@ var regexFactory = (config2) => {
45798
45815
  });
45799
45816
  };
45800
45817
  var isJsonFactory = () => {
45801
- return new DeterministicAssertionEvaluator("is_json", (ctx) => {
45818
+ return new DeterministicAssertionEvaluator("is-json", (ctx) => {
45802
45819
  const result = runIsJsonAssertion(ctx.candidate);
45803
45820
  return {
45804
45821
  score: result.score,
@@ -45826,7 +45843,7 @@ var equalsFactory = (config2) => {
45826
45843
  };
45827
45844
  var containsAnyFactory = (config2) => {
45828
45845
  const c = config2;
45829
- return new DeterministicAssertionEvaluator("contains_any", (ctx) => {
45846
+ return new DeterministicAssertionEvaluator("contains-any", (ctx) => {
45830
45847
  const result = runContainsAnyAssertion(ctx.candidate, c.value);
45831
45848
  return {
45832
45849
  score: result.score,
@@ -45840,7 +45857,7 @@ var containsAnyFactory = (config2) => {
45840
45857
  };
45841
45858
  var containsAllFactory = (config2) => {
45842
45859
  const c = config2;
45843
- return new DeterministicAssertionEvaluator("contains_all", (ctx) => {
45860
+ return new DeterministicAssertionEvaluator("contains-all", (ctx) => {
45844
45861
  const result = runContainsAllAssertion(ctx.candidate, c.value);
45845
45862
  return {
45846
45863
  score: result.score,
@@ -45868,7 +45885,7 @@ var icontainsFactory = (config2) => {
45868
45885
  };
45869
45886
  var icontainsAnyFactory = (config2) => {
45870
45887
  const c = config2;
45871
- return new DeterministicAssertionEvaluator("icontains_any", (ctx) => {
45888
+ return new DeterministicAssertionEvaluator("icontains-any", (ctx) => {
45872
45889
  const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
45873
45890
  return {
45874
45891
  score: result.score,
@@ -45882,7 +45899,7 @@ var icontainsAnyFactory = (config2) => {
45882
45899
  };
45883
45900
  var icontainsAllFactory = (config2) => {
45884
45901
  const c = config2;
45885
- return new DeterministicAssertionEvaluator("icontains_all", (ctx) => {
45902
+ return new DeterministicAssertionEvaluator("icontains-all", (ctx) => {
45886
45903
  const result = runIcontainsAllAssertion(ctx.candidate, c.value);
45887
45904
  return {
45888
45905
  score: result.score,
@@ -45896,7 +45913,7 @@ var icontainsAllFactory = (config2) => {
45896
45913
  };
45897
45914
  var startsWithFactory = (config2) => {
45898
45915
  const c = config2;
45899
- return new DeterministicAssertionEvaluator("starts_with", (ctx) => {
45916
+ return new DeterministicAssertionEvaluator("starts-with", (ctx) => {
45900
45917
  const result = runStartsWithAssertion(ctx.candidate, c.value);
45901
45918
  return {
45902
45919
  score: result.score,
@@ -45910,7 +45927,7 @@ var startsWithFactory = (config2) => {
45910
45927
  };
45911
45928
  var endsWithFactory = (config2) => {
45912
45929
  const c = config2;
45913
- return new DeterministicAssertionEvaluator("ends_with", (ctx) => {
45930
+ return new DeterministicAssertionEvaluator("ends-with", (ctx) => {
45914
45931
  const result = runEndsWithAssertion(ctx.candidate, c.value);
45915
45932
  return {
45916
45933
  score: result.score,
@@ -45924,7 +45941,7 @@ var endsWithFactory = (config2) => {
45924
45941
  };
45925
45942
  function createBuiltinRegistry() {
45926
45943
  const registry2 = new EvaluatorRegistry();
45927
- registry2.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("contains_any", containsAnyFactory).register("contains_all", containsAllFactory).register("icontains", icontainsFactory).register("icontains_any", icontainsAnyFactory).register("icontains_all", icontainsAllFactory).register("starts_with", startsWithFactory).register("ends_with", endsWithFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
45944
+ registry2.register("llm-judge", llmJudgeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("agent-judge", agentJudgeFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory);
45928
45945
  return registry2;
45929
45946
  }
45930
45947
  async function discoverAssertions(registry2, baseDir) {
@@ -46636,7 +46653,7 @@ async function runEvaluation(options) {
46636
46653
  };
46637
46654
  if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
46638
46655
  throw new Error(
46639
- `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure_base).`
46656
+ `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure-base).`
46640
46657
  );
46641
46658
  }
46642
46659
  const targetResolver = (name16) => {
@@ -46707,7 +46724,7 @@ async function runEvaluation(options) {
46707
46724
  const rawTemplate = suiteWorkspace?.template ?? getWorkspaceTemplate(target);
46708
46725
  const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
46709
46726
  const workspaceTemplate = resolvedTemplate?.dir;
46710
- const suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
46727
+ let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
46711
46728
  const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
46712
46729
  const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
46713
46730
  const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
@@ -46728,6 +46745,14 @@ async function runEvaluation(options) {
46728
46745
  const message = error40 instanceof Error ? error40.message : String(error40);
46729
46746
  throw new Error(`Failed to create shared workspace: ${message}`);
46730
46747
  }
46748
+ if (suiteWorkspaceFile && sharedWorkspacePath) {
46749
+ const copiedWorkspaceFile = path37.join(sharedWorkspacePath, path37.basename(suiteWorkspaceFile));
46750
+ try {
46751
+ await stat7(copiedWorkspaceFile);
46752
+ suiteWorkspaceFile = copiedWorkspaceFile;
46753
+ } catch {
46754
+ }
46755
+ }
46731
46756
  } else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
46732
46757
  sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
46733
46758
  await mkdir12(sharedWorkspacePath, { recursive: true });
@@ -47206,6 +47231,14 @@ async function runEvalCase(options) {
47206
47231
  "template_error"
47207
47232
  );
47208
47233
  }
47234
+ if (caseWorkspaceFile && workspacePath) {
47235
+ const copiedFile = path37.join(workspacePath, path37.basename(caseWorkspaceFile));
47236
+ try {
47237
+ await stat7(copiedFile);
47238
+ caseWorkspaceFile = copiedFile;
47239
+ } catch {
47240
+ }
47241
+ }
47209
47242
  }
47210
47243
  if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
47211
47244
  workspacePath = getWorkspacePath(evalRunId, evalCase.id);
@@ -47715,8 +47748,8 @@ async function runEvaluatorsForCase(options) {
47715
47748
  workspacePath
47716
47749
  });
47717
47750
  }
47718
- const evaluatorKind = evalCase.evaluator ?? "llm_judge";
47719
- const activeEvaluator = evaluators[evaluatorKind] ?? evaluators.llm_judge;
47751
+ const evaluatorKind = evalCase.evaluator ?? "llm-judge";
47752
+ const activeEvaluator = evaluators[evaluatorKind] ?? evaluators["llm-judge"];
47720
47753
  if (!activeEvaluator) {
47721
47754
  throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
47722
47755
  }
@@ -47799,25 +47832,24 @@ async function runEvaluatorList(options) {
47799
47832
  availableTargets,
47800
47833
  agentTimeoutMs,
47801
47834
  evalFileDir,
47802
- llmJudge: evaluatorRegistry.llm_judge,
47835
+ llmJudge: evaluatorRegistry["llm-judge"],
47803
47836
  registry: typeRegistry
47804
47837
  };
47805
47838
  for (const evaluatorConfig of evaluators ?? []) {
47806
47839
  try {
47807
47840
  const evaluatorInstance = await typeRegistry.create(evaluatorConfig, dispatchContext);
47808
47841
  const score2 = await evaluatorInstance.evaluate(evalContext);
47809
- const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
47810
47842
  const weight = evaluatorConfig.weight ?? 1;
47811
47843
  scored.push({
47812
47844
  score: score2,
47813
47845
  name: evaluatorConfig.name,
47814
- type: resultType,
47846
+ type: evaluatorConfig.type,
47815
47847
  weight,
47816
47848
  ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
47817
47849
  });
47818
47850
  scores.push({
47819
47851
  name: evaluatorConfig.name,
47820
- type: resultType,
47852
+ type: evaluatorConfig.type,
47821
47853
  score: score2.score,
47822
47854
  weight,
47823
47855
  verdict: score2.verdict,
@@ -47839,18 +47871,17 @@ async function runEvaluatorList(options) {
47839
47871
  expectedAspectCount: 1,
47840
47872
  reasoning: message
47841
47873
  };
47842
- const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
47843
47874
  const weight = evaluatorConfig.weight ?? 1;
47844
47875
  scored.push({
47845
47876
  score: fallbackScore,
47846
47877
  name: evaluatorConfig.name ?? "unknown",
47847
- type: resultType ?? "llm_judge",
47878
+ type: evaluatorConfig.type ?? "llm-judge",
47848
47879
  weight,
47849
47880
  ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
47850
47881
  });
47851
47882
  scores.push({
47852
47883
  name: evaluatorConfig.name ?? "unknown",
47853
- type: resultType ?? "llm_judge",
47884
+ type: evaluatorConfig.type ?? "llm-judge",
47854
47885
  score: 0,
47855
47886
  weight,
47856
47887
  verdict: "fail",
@@ -47911,7 +47942,7 @@ function filterEvalCases(evalCases, filter2) {
47911
47942
  return evalCases.filter((evalCase) => micromatch4.isMatch(evalCase.id, filter2));
47912
47943
  }
47913
47944
  function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
47914
- const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
47945
+ const llmJudge = overrides?.["llm-judge"] ?? new LlmJudgeEvaluator({
47915
47946
  resolveJudgeProvider: async (context) => {
47916
47947
  if (context.judgeProvider) {
47917
47948
  return context.judgeProvider;
@@ -47921,7 +47952,7 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
47921
47952
  });
47922
47953
  return {
47923
47954
  ...overrides,
47924
- llm_judge: llmJudge
47955
+ "llm-judge": llmJudge
47925
47956
  };
47926
47957
  }
47927
47958
  async function invokeProvider(provider, options) {
@@ -48177,12 +48208,7 @@ async function evaluate(config2) {
48177
48208
  };
48178
48209
  }
48179
48210
  function mapAssertionType(type) {
48180
- switch (type) {
48181
- case "code_judge":
48182
- return "code";
48183
- default:
48184
- return type;
48185
- }
48211
+ return type.replace(/_/g, "-");
48186
48212
  }
48187
48213
  function computeSummary(results, durationMs) {
48188
48214
  const total = results.length;
@@ -49011,4 +49037,4 @@ export {
49011
49037
  OtelStreamingObserver,
49012
49038
  createAgentKernel
49013
49039
  };
49014
- //# sourceMappingURL=chunk-FSBZM3HT.js.map
49040
+ //# sourceMappingURL=chunk-OQN2GDEU.js.map