@agentv/core 2.13.0 → 2.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1612,27 +1612,27 @@ function isTestMessage(value) {
1612
1612
  return false;
1613
1613
  }
1614
1614
  var EVALUATOR_KIND_VALUES = [
1615
- "code_judge",
1616
- "llm_judge",
1615
+ "code-judge",
1616
+ "llm-judge",
1617
1617
  "rubric",
1618
1618
  "composite",
1619
- "tool_trajectory",
1620
- "field_accuracy",
1619
+ "tool-trajectory",
1620
+ "field-accuracy",
1621
1621
  "latency",
1622
1622
  "cost",
1623
- "token_usage",
1624
- "execution_metrics",
1625
- "agent_judge",
1623
+ "token-usage",
1624
+ "execution-metrics",
1625
+ "agent-judge",
1626
1626
  "contains",
1627
- "contains_any",
1628
- "contains_all",
1627
+ "contains-any",
1628
+ "contains-all",
1629
1629
  "icontains",
1630
- "icontains_any",
1631
- "icontains_all",
1632
- "starts_with",
1633
- "ends_with",
1630
+ "icontains-any",
1631
+ "icontains-all",
1632
+ "starts-with",
1633
+ "ends-with",
1634
1634
  "regex",
1635
- "is_json",
1635
+ "is-json",
1636
1636
  "equals",
1637
1637
  "rubrics"
1638
1638
  ];
@@ -2301,6 +2301,9 @@ function validateTemplateVariables(content, source) {
2301
2301
  // src/evaluation/loaders/evaluator-parser.ts
2302
2302
  var ANSI_YELLOW4 = "\x1B[33m";
2303
2303
  var ANSI_RESET4 = "\x1B[0m";
2304
+ function normalizeEvaluatorType(type) {
2305
+ return type.replace(/_/g, "-");
2306
+ }
2304
2307
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
2305
2308
  const execution = rawEvalCase.execution;
2306
2309
  const executionObject = isJsonObject2(execution) ? execution : void 0;
@@ -2331,7 +2334,8 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2331
2334
  continue;
2332
2335
  }
2333
2336
  const rawName = asString(rawEvaluator.name);
2334
- const typeValue = rawEvaluator.type;
2337
+ const rawType = rawEvaluator.type;
2338
+ const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
2335
2339
  const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
2336
2340
  if (typeof typeValue !== "string") {
2337
2341
  logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
@@ -2364,25 +2368,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2364
2368
  });
2365
2369
  continue;
2366
2370
  }
2367
- if (typeValue === "code_judge") {
2371
+ if (typeValue === "code-judge") {
2368
2372
  let command;
2369
2373
  const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
2370
2374
  if (typeof rawCommand === "string") {
2371
2375
  const trimmed = rawCommand.trim();
2372
2376
  if (trimmed.length === 0) {
2373
2377
  throw new Error(
2374
- `Invalid code_judge command for evaluator '${name}' in '${evalId}': command cannot be empty`
2378
+ `Invalid code-judge command for evaluator '${name}' in '${evalId}': command cannot be empty`
2375
2379
  );
2376
2380
  }
2377
2381
  command = parseCommandToArgv(trimmed);
2378
2382
  } else {
2379
2383
  command = asStringArray(
2380
2384
  rawCommand,
2381
- `code_judge command for evaluator '${name}' in '${evalId}'`
2385
+ `code-judge command for evaluator '${name}' in '${evalId}'`
2382
2386
  );
2383
2387
  }
2384
2388
  if (!command) {
2385
- logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing command`);
2389
+ logWarning2(`Skipping code-judge evaluator '${name}' in '${evalId}': missing command`);
2386
2390
  continue;
2387
2391
  }
2388
2392
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
@@ -2443,7 +2447,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2443
2447
  }
2444
2448
  evaluators.push({
2445
2449
  name,
2446
- type: "code",
2450
+ type: "code-judge",
2447
2451
  command,
2448
2452
  cwd,
2449
2453
  resolvedCwd,
@@ -2469,7 +2473,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2469
2473
  continue;
2470
2474
  }
2471
2475
  const aggregatorType = asString(rawAggregator.type);
2472
- if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge" && aggregatorType !== "threshold") {
2476
+ if (aggregatorType !== "weighted_average" && aggregatorType !== "code-judge" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
2473
2477
  logWarning2(
2474
2478
  `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
2475
2479
  );
@@ -2518,16 +2522,16 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2518
2522
  type: "weighted_average",
2519
2523
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
2520
2524
  };
2521
- } else if (aggregatorType === "code_judge") {
2525
+ } else if (aggregatorType === "code-judge") {
2522
2526
  const aggregatorPath = asString(rawAggregator.path);
2523
2527
  if (!aggregatorPath) {
2524
2528
  logWarning2(
2525
- `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
2529
+ `Skipping composite evaluator '${name}' in '${evalId}': code-judge aggregator missing path`
2526
2530
  );
2527
2531
  continue;
2528
2532
  }
2529
2533
  aggregator = {
2530
- type: "code_judge",
2534
+ type: "code-judge",
2531
2535
  path: aggregatorPath,
2532
2536
  cwd: searchRoots[0]
2533
2537
  };
@@ -2553,7 +2557,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2553
2557
  }
2554
2558
  }
2555
2559
  aggregator = {
2556
- type: "llm_judge",
2560
+ type: "llm-judge",
2557
2561
  ...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
2558
2562
  ...promptPath2 ? { promptPath: promptPath2 } : {}
2559
2563
  };
@@ -2571,11 +2575,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2571
2575
  });
2572
2576
  continue;
2573
2577
  }
2574
- if (typeValue === "tool_trajectory") {
2578
+ if (typeValue === "tool-trajectory") {
2575
2579
  const mode = asString(rawEvaluator.mode);
2576
2580
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact" && mode !== "subset" && mode !== "superset") {
2577
2581
  logWarning2(
2578
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
2582
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
2579
2583
  );
2580
2584
  continue;
2581
2585
  }
@@ -2584,7 +2588,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2584
2588
  if (rawMinimums !== void 0) {
2585
2589
  if (!isJsonObject2(rawMinimums)) {
2586
2590
  logWarning2(
2587
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
2591
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
2588
2592
  );
2589
2593
  continue;
2590
2594
  }
@@ -2610,7 +2614,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2610
2614
  argsMatch2 = rawArgsMatch;
2611
2615
  } else {
2612
2616
  logWarning2(
2613
- `Invalid args_match '${rawArgsMatch}' for tool_trajectory evaluator '${name}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
2617
+ `Invalid args_match '${rawArgsMatch}' for tool-trajectory evaluator '${name}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
2614
2618
  );
2615
2619
  }
2616
2620
  }
@@ -2620,7 +2624,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2620
2624
  if (rawExpected !== void 0) {
2621
2625
  if (!Array.isArray(rawExpected)) {
2622
2626
  logWarning2(
2623
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
2627
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': expected must be an array`
2624
2628
  );
2625
2629
  continue;
2626
2630
  }
@@ -2666,13 +2670,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2666
2670
  }
2667
2671
  if (mode === "any_order" && !minimums) {
2668
2672
  logWarning2(
2669
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
2673
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
2670
2674
  );
2671
2675
  continue;
2672
2676
  }
2673
2677
  if ((mode === "in_order" || mode === "exact" || mode === "subset" || mode === "superset") && !expected) {
2674
2678
  logWarning2(
2675
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
2679
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
2676
2680
  );
2677
2681
  continue;
2678
2682
  }
@@ -2680,7 +2684,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2680
2684
  const required2 = parseRequired(rawEvaluator.required);
2681
2685
  const config2 = {
2682
2686
  name,
2683
- type: "tool_trajectory",
2687
+ type: "tool-trajectory",
2684
2688
  mode,
2685
2689
  ...minimums ? { minimums } : {},
2686
2690
  ...expected ? { expected } : {},
@@ -2692,17 +2696,17 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2692
2696
  evaluators.push(config2);
2693
2697
  continue;
2694
2698
  }
2695
- if (typeValue === "field_accuracy") {
2699
+ if (typeValue === "field-accuracy") {
2696
2700
  const rawFields = rawEvaluator.fields;
2697
2701
  if (!Array.isArray(rawFields)) {
2698
2702
  logWarning2(
2699
- `Skipping field_accuracy evaluator '${name}' in '${evalId}': missing fields array`
2703
+ `Skipping field-accuracy evaluator '${name}' in '${evalId}': missing fields array`
2700
2704
  );
2701
2705
  continue;
2702
2706
  }
2703
2707
  if (rawFields.length === 0) {
2704
2708
  logWarning2(
2705
- `Skipping field_accuracy evaluator '${name}' in '${evalId}': fields array is empty`
2709
+ `Skipping field-accuracy evaluator '${name}' in '${evalId}': fields array is empty`
2706
2710
  );
2707
2711
  continue;
2708
2712
  }
@@ -2710,7 +2714,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2710
2714
  for (const rawField of rawFields) {
2711
2715
  if (!isJsonObject2(rawField)) {
2712
2716
  logWarning2(
2713
- `Skipping invalid field entry in field_accuracy evaluator '${name}' (expected object)`
2717
+ `Skipping invalid field entry in field-accuracy evaluator '${name}' (expected object)`
2714
2718
  );
2715
2719
  continue;
2716
2720
  }
@@ -2718,13 +2722,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2718
2722
  const match = asString(rawField.match);
2719
2723
  if (!fieldPath) {
2720
2724
  logWarning2(
2721
- `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
2725
+ `Skipping field without path in field-accuracy evaluator '${name}' in '${evalId}'`
2722
2726
  );
2723
2727
  continue;
2724
2728
  }
2725
2729
  if (!match || !isValidFieldMatchType(match)) {
2726
2730
  logWarning2(
2727
- `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code_judge evaluator.`
2731
+ `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code-judge evaluator.`
2728
2732
  );
2729
2733
  continue;
2730
2734
  }
@@ -2741,7 +2745,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2741
2745
  }
2742
2746
  if (fields.length === 0) {
2743
2747
  logWarning2(
2744
- `Skipping field_accuracy evaluator '${name}' in '${evalId}': no valid fields found`
2748
+ `Skipping field-accuracy evaluator '${name}' in '${evalId}': no valid fields found`
2745
2749
  );
2746
2750
  continue;
2747
2751
  }
@@ -2751,7 +2755,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2751
2755
  const required2 = parseRequired(rawEvaluator.required);
2752
2756
  evaluators.push({
2753
2757
  name,
2754
- type: "field_accuracy",
2758
+ type: "field-accuracy",
2755
2759
  fields,
2756
2760
  ...validAggregation ? { aggregation: validAggregation } : {},
2757
2761
  ...weight2 !== void 0 ? { weight: weight2 } : {},
@@ -2800,7 +2804,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2800
2804
  });
2801
2805
  continue;
2802
2806
  }
2803
- if (typeValue === "token_usage") {
2807
+ if (typeValue === "token-usage") {
2804
2808
  const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
2805
2809
  const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
2806
2810
  const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
@@ -2814,7 +2818,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2814
2818
  if (raw === void 0) continue;
2815
2819
  if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
2816
2820
  logWarning2(
2817
- `Skipping token_usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
2821
+ `Skipping token-usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
2818
2822
  );
2819
2823
  continue;
2820
2824
  }
@@ -2822,7 +2826,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2822
2826
  }
2823
2827
  if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
2824
2828
  logWarning2(
2825
- `Skipping token_usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
2829
+ `Skipping token-usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
2826
2830
  );
2827
2831
  continue;
2828
2832
  }
@@ -2830,7 +2834,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2830
2834
  const required2 = parseRequired(rawEvaluator.required);
2831
2835
  evaluators.push({
2832
2836
  name,
2833
- type: "token_usage",
2837
+ type: "token-usage",
2834
2838
  ...validLimits,
2835
2839
  ...weight2 !== void 0 ? { weight: weight2 } : {},
2836
2840
  ...required2 !== void 0 ? { required: required2 } : {},
@@ -2838,7 +2842,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2838
2842
  });
2839
2843
  continue;
2840
2844
  }
2841
- if (typeValue === "execution_metrics") {
2845
+ if (typeValue === "execution-metrics") {
2842
2846
  const maxToolCalls = rawEvaluator.max_tool_calls ?? rawEvaluator.maxToolCalls;
2843
2847
  const maxLlmCalls = rawEvaluator.max_llm_calls ?? rawEvaluator.maxLlmCalls;
2844
2848
  const maxTokens = rawEvaluator.max_tokens ?? rawEvaluator.maxTokens;
@@ -2861,7 +2865,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2861
2865
  if (raw === void 0) continue;
2862
2866
  if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
2863
2867
  logWarning2(
2864
- `Skipping execution_metrics evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
2868
+ `Skipping execution-metrics evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
2865
2869
  );
2866
2870
  hasError = true;
2867
2871
  break;
@@ -2874,7 +2878,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2874
2878
  const hasThreshold = validThresholds.max_tool_calls !== void 0 || validThresholds.max_llm_calls !== void 0 || validThresholds.max_tokens !== void 0 || validThresholds.max_cost_usd !== void 0 || validThresholds.max_duration_ms !== void 0 || validThresholds.target_exploration_ratio !== void 0;
2875
2879
  if (!hasThreshold) {
2876
2880
  logWarning2(
2877
- `Skipping execution_metrics evaluator '${name}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
2881
+ `Skipping execution-metrics evaluator '${name}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
2878
2882
  );
2879
2883
  continue;
2880
2884
  }
@@ -2882,7 +2886,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2882
2886
  const required2 = parseRequired(rawEvaluator.required);
2883
2887
  evaluators.push({
2884
2888
  name,
2885
- type: "execution_metrics",
2889
+ type: "execution-metrics",
2886
2890
  ...validThresholds,
2887
2891
  ...weight2 !== void 0 ? { weight: weight2 } : {},
2888
2892
  ...required2 !== void 0 ? { required: required2 } : {},
@@ -2890,13 +2894,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2890
2894
  });
2891
2895
  continue;
2892
2896
  }
2893
- if (typeValue === "agent_judge") {
2897
+ if (typeValue === "agent-judge") {
2894
2898
  const rawMaxSteps = rawEvaluator.max_steps ?? rawEvaluator.maxSteps;
2895
2899
  let maxSteps;
2896
2900
  if (rawMaxSteps !== void 0) {
2897
2901
  if (typeof rawMaxSteps !== "number" || !Number.isInteger(rawMaxSteps) || rawMaxSteps < 1 || rawMaxSteps > 50) {
2898
2902
  logWarning2(
2899
- `Skipping agent_judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`
2903
+ `Skipping agent-judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`
2900
2904
  );
2901
2905
  continue;
2902
2906
  }
@@ -2907,7 +2911,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2907
2911
  if (rawTemperature !== void 0) {
2908
2912
  if (typeof rawTemperature !== "number" || rawTemperature < 0 || rawTemperature > 2) {
2909
2913
  logWarning2(
2910
- `Skipping agent_judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`
2914
+ `Skipping agent-judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`
2911
2915
  );
2912
2916
  continue;
2913
2917
  }
@@ -2930,7 +2934,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2930
2934
  const required2 = parseRequired(rawEvaluator.required);
2931
2935
  evaluators.push({
2932
2936
  name,
2933
- type: "agent_judge",
2937
+ type: "agent-judge",
2934
2938
  ...agentPrompt ? { prompt: agentPrompt } : {},
2935
2939
  ...agentPromptPath ? { promptPath: agentPromptPath, resolvedPromptPath: agentPromptPath } : {},
2936
2940
  ...agentParsedRubrics && agentParsedRubrics.length > 0 ? { rubrics: agentParsedRubrics } : {},
@@ -2961,7 +2965,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2961
2965
  });
2962
2966
  continue;
2963
2967
  }
2964
- if (typeValue === "contains_any" || typeValue === "contains_all") {
2968
+ if (typeValue === "contains-any" || typeValue === "contains-all") {
2965
2969
  const value = asStringArrayStrict(rawEvaluator.value);
2966
2970
  if (!value || value.length === 0) {
2967
2971
  logWarning2(
@@ -2999,7 +3003,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2999
3003
  });
3000
3004
  continue;
3001
3005
  }
3002
- if (typeValue === "icontains_any" || typeValue === "icontains_all") {
3006
+ if (typeValue === "icontains-any" || typeValue === "icontains-all") {
3003
3007
  const value = asStringArrayStrict(rawEvaluator.value);
3004
3008
  if (!value || value.length === 0) {
3005
3009
  logWarning2(
@@ -3019,7 +3023,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3019
3023
  });
3020
3024
  continue;
3021
3025
  }
3022
- if (typeValue === "starts_with" || typeValue === "ends_with") {
3026
+ if (typeValue === "starts-with" || typeValue === "ends-with") {
3023
3027
  const value = asString(rawEvaluator.value);
3024
3028
  if (!value) {
3025
3029
  logWarning2(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
@@ -3057,12 +3061,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3057
3061
  });
3058
3062
  continue;
3059
3063
  }
3060
- if (typeValue === "is_json") {
3064
+ if (typeValue === "is-json") {
3061
3065
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3062
3066
  const required2 = parseRequired(rawEvaluator.required);
3063
3067
  evaluators.push({
3064
3068
  name,
3065
- type: "is_json",
3069
+ type: "is-json",
3066
3070
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3067
3071
  ...required2 !== void 0 ? { required: required2 } : {},
3068
3072
  ...negate !== void 0 ? { negate } : {}
@@ -3110,7 +3114,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3110
3114
  const required2 = parseRequired(rawEvaluator.required);
3111
3115
  evaluators.push({
3112
3116
  name,
3113
- type: "llm_judge",
3117
+ type: "llm-judge",
3114
3118
  rubrics: parsedCriteria,
3115
3119
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3116
3120
  ...required2 !== void 0 ? { required: required2 } : {},
@@ -3177,7 +3181,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3177
3181
  const required2 = parseRequired(rawEvaluator.required);
3178
3182
  evaluators.push({
3179
3183
  name,
3180
- type: "llm_judge",
3184
+ type: "llm-judge",
3181
3185
  rubrics: parsedRubrics,
3182
3186
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3183
3187
  ...required2 !== void 0 ? { required: required2 } : {},
@@ -3209,7 +3213,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3209
3213
  const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
3210
3214
  evaluators.push({
3211
3215
  name,
3212
- type: "llm_judge",
3216
+ type: "llm-judge",
3213
3217
  prompt,
3214
3218
  promptPath,
3215
3219
  ...promptPath ? { resolvedPromptPath: promptPath } : {},
@@ -3225,15 +3229,15 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3225
3229
  }
3226
3230
  var ASSERTION_TYPES = /* @__PURE__ */ new Set([
3227
3231
  "contains",
3228
- "contains_any",
3229
- "contains_all",
3232
+ "contains-any",
3233
+ "contains-all",
3230
3234
  "icontains",
3231
- "icontains_any",
3232
- "icontains_all",
3233
- "starts_with",
3234
- "ends_with",
3235
+ "icontains-any",
3236
+ "icontains-all",
3237
+ "starts-with",
3238
+ "ends-with",
3235
3239
  "regex",
3236
- "is_json",
3240
+ "is-json",
3237
3241
  "equals",
3238
3242
  "rubrics"
3239
3243
  ]);
@@ -3246,24 +3250,24 @@ function generateAssertionName(typeValue, rawEvaluator) {
3246
3250
  switch (typeValue) {
3247
3251
  case "contains":
3248
3252
  return value ? `contains-${value}` : "contains";
3249
- case "contains_any":
3250
- return arrayValue ? `contains_any-${arrayValue.length}` : "contains_any";
3251
- case "contains_all":
3252
- return arrayValue ? `contains_all-${arrayValue.length}` : "contains_all";
3253
+ case "contains-any":
3254
+ return arrayValue ? `contains-any-${arrayValue.length}` : "contains-any";
3255
+ case "contains-all":
3256
+ return arrayValue ? `contains-all-${arrayValue.length}` : "contains-all";
3253
3257
  case "icontains":
3254
3258
  return value ? `icontains-${value}` : "icontains";
3255
- case "icontains_any":
3256
- return arrayValue ? `icontains_any-${arrayValue.length}` : "icontains_any";
3257
- case "icontains_all":
3258
- return arrayValue ? `icontains_all-${arrayValue.length}` : "icontains_all";
3259
- case "starts_with":
3260
- return value ? `starts_with-${value}` : "starts_with";
3261
- case "ends_with":
3262
- return value ? `ends_with-${value}` : "ends_with";
3259
+ case "icontains-any":
3260
+ return arrayValue ? `icontains-any-${arrayValue.length}` : "icontains-any";
3261
+ case "icontains-all":
3262
+ return arrayValue ? `icontains-all-${arrayValue.length}` : "icontains-all";
3263
+ case "starts-with":
3264
+ return value ? `starts-with-${value}` : "starts-with";
3265
+ case "ends-with":
3266
+ return value ? `ends-with-${value}` : "ends-with";
3263
3267
  case "regex":
3264
3268
  return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
3265
- case "is_json":
3266
- return "is_json";
3269
+ case "is-json":
3270
+ return "is-json";
3267
3271
  case "equals":
3268
3272
  return value ? `equals-${value}` : "equals";
3269
3273
  case "rubrics":
@@ -3276,8 +3280,9 @@ function coerceEvaluator(candidate, contextId) {
3276
3280
  if (typeof candidate !== "string") {
3277
3281
  return void 0;
3278
3282
  }
3279
- if (isEvaluatorKind(candidate)) {
3280
- return candidate;
3283
+ const normalized = normalizeEvaluatorType(candidate);
3284
+ if (isEvaluatorKind(normalized)) {
3285
+ return normalized;
3281
3286
  }
3282
3287
  logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
3283
3288
  return void 0;
@@ -3323,6 +3328,16 @@ function parseCommandToArgv(command) {
3323
3328
  function isJsonObject2(value) {
3324
3329
  return typeof value === "object" && value !== null && !Array.isArray(value);
3325
3330
  }
3331
+ var CRITERIA_CONSUMER_TYPES = /* @__PURE__ */ new Set(["llm-judge", "agent-judge", "code-judge"]);
3332
+ function warnUnconsumedCriteria(criteria, evaluators, testId) {
3333
+ if (!criteria?.trim() || !evaluators || evaluators.length === 0) return;
3334
+ const hasConsumer = evaluators.some((e) => CRITERIA_CONSUMER_TYPES.has(e.type));
3335
+ if (!hasConsumer) {
3336
+ logWarning2(
3337
+ `Test '${testId}': criteria is defined but no evaluator in assert will evaluate it. Add 'type: llm-judge' to assert, or remove criteria if it is documentation-only.`
3338
+ );
3339
+ }
3340
+ }
3326
3341
  function logWarning2(message, details) {
3327
3342
  if (details && details.length > 0) {
3328
3343
  const detailBlock = details.join("\n");
@@ -3572,7 +3587,7 @@ function parseInlineRubrics(rawRubrics) {
3572
3587
  }
3573
3588
  return {
3574
3589
  name: "rubric",
3575
- type: "llm_judge",
3590
+ type: "llm-judge",
3576
3591
  rubrics: rubricItems
3577
3592
  };
3578
3593
  }
@@ -3957,7 +3972,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
3957
3972
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
3958
3973
  const fallbackDataset = import_node_path6.default.basename(absoluteTestPath, ".jsonl") || "eval";
3959
3974
  const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
3960
- const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm_judge";
3975
+ const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-judge";
3961
3976
  const globalExecution = sidecar.execution;
3962
3977
  if (verbose) {
3963
3978
  console.log(`
@@ -4045,6 +4060,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4045
4060
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
4046
4061
  }
4047
4062
  }
4063
+ warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
4048
4064
  const userFilePaths = [];
4049
4065
  for (const segment of inputSegments) {
4050
4066
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -4437,7 +4453,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4437
4453
  const fallbackDataset = import_node_path8.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
4438
4454
  const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
4439
4455
  const rawTestcases = resolveTests(suite);
4440
- const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
4456
+ const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-judge";
4441
4457
  const evalFileDir = import_node_path8.default.dirname(absoluteTestPath);
4442
4458
  let expandedTestcases;
4443
4459
  if (typeof rawTestcases === "string") {
@@ -4534,6 +4550,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4534
4550
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
4535
4551
  }
4536
4552
  }
4553
+ warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
4537
4554
  const userFilePaths = [];
4538
4555
  for (const segment of inputSegments) {
4539
4556
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -11726,7 +11743,7 @@ function toCamelCaseDeep(obj) {
11726
11743
  // src/evaluation/evaluators/code-evaluator.ts
11727
11744
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
11728
11745
  var CodeEvaluator = class {
11729
- kind = "code";
11746
+ kind = "code-judge";
11730
11747
  command;
11731
11748
  cwd;
11732
11749
  agentTimeoutMs;
@@ -11963,7 +11980,7 @@ var scoreRangeEvaluationSchema = import_zod4.z.object({
11963
11980
  overall_reasoning: import_zod4.z.string().describe("Overall assessment summary (1-2 sentences)").optional()
11964
11981
  });
11965
11982
  var LlmJudgeEvaluator = class {
11966
- kind = "llm_judge";
11983
+ kind = "llm-judge";
11967
11984
  resolveJudgeProvider;
11968
11985
  maxOutputTokens;
11969
11986
  temperature;
@@ -11980,7 +11997,7 @@ var LlmJudgeEvaluator = class {
11980
11997
  throw new Error("No judge provider available for LLM grading");
11981
11998
  }
11982
11999
  const config = context2.evaluator;
11983
- if (config?.type === "llm_judge" && config.rubrics && config.rubrics.length > 0) {
12000
+ if (config?.type === "llm-judge" && config.rubrics && config.rubrics.length > 0) {
11984
12001
  return this.evaluateWithRubrics(context2, judgeProvider, config.rubrics);
11985
12002
  }
11986
12003
  return this.evaluateFreeform(context2, judgeProvider);
@@ -12054,7 +12071,7 @@ ${context2.fileChanges}`;
12054
12071
  async evaluateWithRubrics(context2, judgeProvider, rubrics) {
12055
12072
  if (!rubrics || rubrics.length === 0) {
12056
12073
  throw new Error(
12057
- `No rubrics found for evaluator "${context2.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
12074
+ `No rubrics found for evaluator "${context2.evaluator?.name ?? "llm-judge"}". Run "agentv generate rubrics" first.`
12058
12075
  );
12059
12076
  }
12060
12077
  const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
@@ -12390,9 +12407,9 @@ var CompositeEvaluator = class {
12390
12407
  async aggregate(results, context2) {
12391
12408
  const aggregator = this.config.aggregator;
12392
12409
  switch (aggregator.type) {
12393
- case "code_judge":
12410
+ case "code-judge":
12394
12411
  return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
12395
- case "llm_judge":
12412
+ case "llm-judge":
12396
12413
  return this.runLlmAggregator(results, context2, aggregator);
12397
12414
  case "threshold":
12398
12415
  return this.runThreshold(results, aggregator.threshold);
@@ -12535,7 +12552,7 @@ var CompositeEvaluator = class {
12535
12552
  expectedAspectCount: hits.length + misses.length || 1,
12536
12553
  reasoning,
12537
12554
  evaluatorRawRequest: {
12538
- aggregator: "code_judge",
12555
+ aggregator: "code-judge",
12539
12556
  script: scriptPath
12540
12557
  },
12541
12558
  scores
@@ -12550,7 +12567,7 @@ var CompositeEvaluator = class {
12550
12567
  expectedAspectCount: 1,
12551
12568
  reasoning: message,
12552
12569
  evaluatorRawRequest: {
12553
- aggregator: "code_judge",
12570
+ aggregator: "code-judge",
12554
12571
  script: scriptPath,
12555
12572
  error: message
12556
12573
  },
@@ -12581,7 +12598,7 @@ var CompositeEvaluator = class {
12581
12598
  const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
12582
12599
  const systemPrompt = buildOutputSchema();
12583
12600
  const evaluatorRawRequest = {
12584
- aggregator: "llm_judge",
12601
+ aggregator: "llm-judge",
12585
12602
  userPrompt,
12586
12603
  systemPrompt,
12587
12604
  target: judgeProvider.targetName
@@ -12693,7 +12710,7 @@ var CostEvaluator = class {
12693
12710
 
12694
12711
  // src/evaluation/evaluators/execution-metrics.ts
12695
12712
  var ExecutionMetricsEvaluator = class {
12696
- kind = "execution_metrics";
12713
+ kind = "execution-metrics";
12697
12714
  config;
12698
12715
  constructor(options) {
12699
12716
  this.config = options.config;
@@ -12719,7 +12736,7 @@ var ExecutionMetricsEvaluator = class {
12719
12736
  expectedAspectCount: 1,
12720
12737
  reasoning: "Execution metrics not available - no trace summary provided",
12721
12738
  evaluatorRawRequest: {
12722
- type: "execution_metrics",
12739
+ type: "execution-metrics",
12723
12740
  config: this.extractConfiguredThresholds(),
12724
12741
  actual: null
12725
12742
  }
@@ -12828,7 +12845,7 @@ var ExecutionMetricsEvaluator = class {
12828
12845
  if (actualMetrics.exploration_ratio !== void 0) {
12829
12846
  reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
12830
12847
  }
12831
- const reasoning = reasoningParts.length > 0 ? `execution_metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
12848
+ const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
12832
12849
  return {
12833
12850
  score,
12834
12851
  verdict: scoreToVerdict(score),
@@ -12837,7 +12854,7 @@ var ExecutionMetricsEvaluator = class {
12837
12854
  expectedAspectCount: totalChecks || 1,
12838
12855
  reasoning,
12839
12856
  evaluatorRawRequest: {
12840
- type: "execution_metrics",
12857
+ type: "execution-metrics",
12841
12858
  config: this.extractConfiguredThresholds(),
12842
12859
  actual: this.filterDefinedMetrics(actualMetrics)
12843
12860
  }
@@ -12925,7 +12942,7 @@ var MONTH_NAMES = {
12925
12942
  december: 11
12926
12943
  };
12927
12944
  var FieldAccuracyEvaluator = class {
12928
- kind = "field_accuracy";
12945
+ kind = "field-accuracy";
12929
12946
  config;
12930
12947
  constructor(options) {
12931
12948
  this.config = options.config;
@@ -13379,7 +13396,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
13379
13396
  ".dylib"
13380
13397
  ]);
13381
13398
  var AgentJudgeEvaluator = class {
13382
- kind = "agent_judge";
13399
+ kind = "agent-judge";
13383
13400
  resolveJudgeProvider;
13384
13401
  maxSteps;
13385
13402
  temperature;
@@ -13404,24 +13421,24 @@ var AgentJudgeEvaluator = class {
13404
13421
  async evaluateBuiltIn(context2) {
13405
13422
  const judgeProvider = await this.resolveJudgeProvider(context2);
13406
13423
  if (!judgeProvider) {
13407
- throw new Error("No judge provider available for agent_judge evaluation");
13424
+ throw new Error("No judge provider available for agent-judge evaluation");
13408
13425
  }
13409
13426
  const model = judgeProvider.asLanguageModel?.();
13410
13427
  if (!model) {
13411
13428
  throw new Error(
13412
- `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent_judge mode`
13429
+ `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent-judge mode`
13413
13430
  );
13414
13431
  }
13415
13432
  const workspacePath = context2.workspacePath;
13416
13433
  if (!workspacePath) {
13417
13434
  throw new Error(
13418
- "agent_judge evaluator requires a workspace_template target (workspacePath is not set)"
13435
+ "agent-judge evaluator requires a workspace_template target (workspacePath is not set)"
13419
13436
  );
13420
13437
  }
13421
13438
  const systemPrompt = this.buildSystemPrompt(context2);
13422
13439
  const userPrompt = this.buildUserPrompt(context2);
13423
13440
  const config = context2.evaluator;
13424
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
13441
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
13425
13442
  const fsTools = createFilesystemTools(workspacePath);
13426
13443
  const evaluatorRawRequest = {
13427
13444
  mode: "built-in",
@@ -13452,7 +13469,7 @@ var AgentJudgeEvaluator = class {
13452
13469
  score: 0,
13453
13470
  verdict: "fail",
13454
13471
  hits: [],
13455
- misses: [`agent_judge built-in evaluation failed: ${message}`],
13472
+ misses: [`agent-judge built-in evaluation failed: ${message}`],
13456
13473
  expectedAspectCount: 1,
13457
13474
  evaluatorRawRequest,
13458
13475
  details: { mode: "built-in", error: message }
@@ -13484,14 +13501,14 @@ var AgentJudgeEvaluator = class {
13484
13501
  score: 0,
13485
13502
  verdict: "fail",
13486
13503
  hits: [],
13487
- misses: ["agent_judge judge_target returned no assistant response"],
13504
+ misses: ["agent-judge judge_target returned no assistant response"],
13488
13505
  expectedAspectCount: 1,
13489
13506
  evaluatorRawRequest,
13490
13507
  details: { mode: "judge_target", judge_target: provider.targetName }
13491
13508
  };
13492
13509
  }
13493
13510
  const config = context2.evaluator;
13494
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
13511
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
13495
13512
  const details = {
13496
13513
  mode: "judge_target",
13497
13514
  judge_target: provider.targetName
@@ -13503,7 +13520,7 @@ var AgentJudgeEvaluator = class {
13503
13520
  score: 0,
13504
13521
  verdict: "fail",
13505
13522
  hits: [],
13506
- misses: [`agent_judge judge_target evaluation failed: ${message}`],
13523
+ misses: [`agent-judge judge_target evaluation failed: ${message}`],
13507
13524
  expectedAspectCount: 1,
13508
13525
  evaluatorRawRequest,
13509
13526
  details: {
@@ -13554,7 +13571,7 @@ var AgentJudgeEvaluator = class {
13554
13571
  score: 0,
13555
13572
  verdict: "fail",
13556
13573
  hits: [],
13557
- misses: ["Failed to parse agent_judge response as valid evaluation JSON"],
13574
+ misses: ["Failed to parse agent-judge response as valid evaluation JSON"],
13558
13575
  expectedAspectCount: 1,
13559
13576
  evaluatorRawRequest,
13560
13577
  details
@@ -13567,7 +13584,7 @@ var AgentJudgeEvaluator = class {
13567
13584
  */
13568
13585
  buildSystemPrompt(context2) {
13569
13586
  const config = context2.evaluator;
13570
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
13587
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
13571
13588
  const parts = [
13572
13589
  "You are an expert evaluator with access to the workspace filesystem.",
13573
13590
  "Use the provided tools to investigate the workspace and verify the criteria are met.",
@@ -13598,7 +13615,7 @@ var AgentJudgeEvaluator = class {
13598
13615
  return substituteVariables(this.evaluatorTemplate, variables);
13599
13616
  }
13600
13617
  const config = context2.evaluator;
13601
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
13618
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
13602
13619
  const parts = [
13603
13620
  "Evaluate the candidate answer by investigating the workspace.",
13604
13621
  "",
@@ -13641,7 +13658,7 @@ var AgentJudgeEvaluator = class {
13641
13658
  buildDelegatedPrompt(context2) {
13642
13659
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
13643
13660
  const config = context2.evaluator;
13644
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
13661
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
13645
13662
  if (this.evaluatorTemplate) {
13646
13663
  const variables = {
13647
13664
  [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
@@ -13723,11 +13740,11 @@ function createFilesystemTools(workspacePath) {
13723
13740
  execute: async (input) => {
13724
13741
  try {
13725
13742
  const resolved = resolveSandboxed(workspacePath, input.path);
13726
- const stat7 = await import_promises25.default.stat(resolved);
13727
- if (stat7.isDirectory()) {
13743
+ const stat8 = await import_promises25.default.stat(resolved);
13744
+ if (stat8.isDirectory()) {
13728
13745
  return { error: `'${input.path}' is a directory, not a file` };
13729
13746
  }
13730
- const buffer = Buffer.alloc(Math.min(stat7.size, MAX_FILE_SIZE));
13747
+ const buffer = Buffer.alloc(Math.min(stat8.size, MAX_FILE_SIZE));
13731
13748
  const fd = await import_promises25.default.open(resolved, "r");
13732
13749
  try {
13733
13750
  await fd.read(buffer, 0, buffer.length, 0);
@@ -13735,8 +13752,8 @@ function createFilesystemTools(workspacePath) {
13735
13752
  await fd.close();
13736
13753
  }
13737
13754
  const content = buffer.toString("utf-8");
13738
- const truncated = stat7.size > MAX_FILE_SIZE;
13739
- return { content, truncated, size: stat7.size };
13755
+ const truncated = stat8.size > MAX_FILE_SIZE;
13756
+ return { content, truncated, size: stat8.size };
13740
13757
  } catch (error) {
13741
13758
  return { error: error instanceof Error ? error.message : String(error) };
13742
13759
  }
@@ -13780,8 +13797,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
13780
13797
  const ext = import_node_path33.default.extname(entry.name).toLowerCase();
13781
13798
  if (BINARY_EXTENSIONS.has(ext)) continue;
13782
13799
  try {
13783
- const stat7 = await import_promises25.default.stat(fullPath);
13784
- if (stat7.size > MAX_FILE_SIZE) continue;
13800
+ const stat8 = await import_promises25.default.stat(fullPath);
13801
+ if (stat8.size > MAX_FILE_SIZE) continue;
13785
13802
  const content = await import_promises25.default.readFile(fullPath, "utf-8");
13786
13803
  const lines = content.split("\n");
13787
13804
  for (let i = 0; i < lines.length; i++) {
@@ -13943,7 +13960,7 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
13943
13960
 
13944
13961
  // src/evaluation/evaluators/token-usage.ts
13945
13962
  var TokenUsageEvaluator = class {
13946
- kind = "token_usage";
13963
+ kind = "token-usage";
13947
13964
  config;
13948
13965
  constructor(options) {
13949
13966
  this.config = options.config;
@@ -13966,7 +13983,7 @@ var TokenUsageEvaluator = class {
13966
13983
  expectedAspectCount,
13967
13984
  reasoning: "Token usage not reported by provider",
13968
13985
  evaluatorRawRequest: {
13969
- type: "token_usage",
13986
+ type: "token-usage",
13970
13987
  max_total: maxTotal ?? null,
13971
13988
  max_input: maxInput ?? null,
13972
13989
  max_output: maxOutput ?? null,
@@ -14008,9 +14025,9 @@ var TokenUsageEvaluator = class {
14008
14025
  hits,
14009
14026
  misses,
14010
14027
  expectedAspectCount,
14011
- reasoning: `token_usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
14028
+ reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
14012
14029
  evaluatorRawRequest: {
14013
- type: "token_usage",
14030
+ type: "token-usage",
14014
14031
  max_total: maxTotal ?? null,
14015
14032
  max_input: maxInput ?? null,
14016
14033
  max_output: maxOutput ?? null,
@@ -14095,7 +14112,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
14095
14112
  };
14096
14113
  }
14097
14114
  var ToolTrajectoryEvaluator = class {
14098
- kind = "tool_trajectory";
14115
+ kind = "tool-trajectory";
14099
14116
  config;
14100
14117
  constructor(options) {
14101
14118
  this.config = options.config;
@@ -14283,7 +14300,7 @@ var ToolTrajectoryEvaluator = class {
14283
14300
  }
14284
14301
  }
14285
14302
  for (const warning of warnings) {
14286
- console.warn(`[tool_trajectory] ${warning}`);
14303
+ console.warn(`[tool-trajectory] ${warning}`);
14287
14304
  }
14288
14305
  const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
14289
14306
  const totalAssertions = expected.length + effectiveLatencyAssertions;
@@ -14359,7 +14376,7 @@ var ToolTrajectoryEvaluator = class {
14359
14376
  misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
14360
14377
  }
14361
14378
  for (const warning of warnings) {
14362
- console.warn(`[tool_trajectory] ${warning}`);
14379
+ console.warn(`[tool-trajectory] ${warning}`);
14363
14380
  }
14364
14381
  const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
14365
14382
  const totalAssertions = expected.length + effectiveLatencyAssertions;
@@ -14849,7 +14866,7 @@ var llmJudgeFactory = (config, context2) => {
14849
14866
  const c = config;
14850
14867
  const { llmJudge, agentTimeoutMs } = context2;
14851
14868
  return {
14852
- kind: "llm_judge",
14869
+ kind: "llm-judge",
14853
14870
  async evaluate(evalContext) {
14854
14871
  const customPrompt = await resolveCustomPrompt(
14855
14872
  c,
@@ -14938,7 +14955,7 @@ var agentJudgeFactory = (config, context2) => {
14938
14955
  customPrompt = (0, import_node_fs9.readFileSync)(c.resolvedPromptPath, "utf-8");
14939
14956
  } catch (error) {
14940
14957
  const message = error instanceof Error ? error.message : String(error);
14941
- console.warn(`Could not read agent_judge prompt at ${c.resolvedPromptPath}: ${message}`);
14958
+ console.warn(`Could not read agent-judge prompt at ${c.resolvedPromptPath}: ${message}`);
14942
14959
  }
14943
14960
  } else if (c.prompt) {
14944
14961
  customPrompt = c.prompt;
@@ -14948,7 +14965,7 @@ var agentJudgeFactory = (config, context2) => {
14948
14965
  judgeTargetProvider = targetResolver(c.target);
14949
14966
  if (!judgeTargetProvider) {
14950
14967
  throw new Error(
14951
- `agent_judge evaluator '${c.name}': target '${c.target}' not found in targets`
14968
+ `agent-judge evaluator '${c.name}': target '${c.target}' not found in targets`
14952
14969
  );
14953
14970
  }
14954
14971
  }
@@ -14992,7 +15009,7 @@ var regexFactory = (config) => {
14992
15009
  });
14993
15010
  };
14994
15011
  var isJsonFactory = () => {
14995
- return new DeterministicAssertionEvaluator("is_json", (ctx) => {
15012
+ return new DeterministicAssertionEvaluator("is-json", (ctx) => {
14996
15013
  const result = runIsJsonAssertion(ctx.candidate);
14997
15014
  return {
14998
15015
  score: result.score,
@@ -15020,7 +15037,7 @@ var equalsFactory = (config) => {
15020
15037
  };
15021
15038
  var containsAnyFactory = (config) => {
15022
15039
  const c = config;
15023
- return new DeterministicAssertionEvaluator("contains_any", (ctx) => {
15040
+ return new DeterministicAssertionEvaluator("contains-any", (ctx) => {
15024
15041
  const result = runContainsAnyAssertion(ctx.candidate, c.value);
15025
15042
  return {
15026
15043
  score: result.score,
@@ -15034,7 +15051,7 @@ var containsAnyFactory = (config) => {
15034
15051
  };
15035
15052
  var containsAllFactory = (config) => {
15036
15053
  const c = config;
15037
- return new DeterministicAssertionEvaluator("contains_all", (ctx) => {
15054
+ return new DeterministicAssertionEvaluator("contains-all", (ctx) => {
15038
15055
  const result = runContainsAllAssertion(ctx.candidate, c.value);
15039
15056
  return {
15040
15057
  score: result.score,
@@ -15062,7 +15079,7 @@ var icontainsFactory = (config) => {
15062
15079
  };
15063
15080
  var icontainsAnyFactory = (config) => {
15064
15081
  const c = config;
15065
- return new DeterministicAssertionEvaluator("icontains_any", (ctx) => {
15082
+ return new DeterministicAssertionEvaluator("icontains-any", (ctx) => {
15066
15083
  const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
15067
15084
  return {
15068
15085
  score: result.score,
@@ -15076,7 +15093,7 @@ var icontainsAnyFactory = (config) => {
15076
15093
  };
15077
15094
  var icontainsAllFactory = (config) => {
15078
15095
  const c = config;
15079
- return new DeterministicAssertionEvaluator("icontains_all", (ctx) => {
15096
+ return new DeterministicAssertionEvaluator("icontains-all", (ctx) => {
15080
15097
  const result = runIcontainsAllAssertion(ctx.candidate, c.value);
15081
15098
  return {
15082
15099
  score: result.score,
@@ -15090,7 +15107,7 @@ var icontainsAllFactory = (config) => {
15090
15107
  };
15091
15108
  var startsWithFactory = (config) => {
15092
15109
  const c = config;
15093
- return new DeterministicAssertionEvaluator("starts_with", (ctx) => {
15110
+ return new DeterministicAssertionEvaluator("starts-with", (ctx) => {
15094
15111
  const result = runStartsWithAssertion(ctx.candidate, c.value);
15095
15112
  return {
15096
15113
  score: result.score,
@@ -15104,7 +15121,7 @@ var startsWithFactory = (config) => {
15104
15121
  };
15105
15122
  var endsWithFactory = (config) => {
15106
15123
  const c = config;
15107
- return new DeterministicAssertionEvaluator("ends_with", (ctx) => {
15124
+ return new DeterministicAssertionEvaluator("ends-with", (ctx) => {
15108
15125
  const result = runEndsWithAssertion(ctx.candidate, c.value);
15109
15126
  return {
15110
15127
  score: result.score,
@@ -15118,7 +15135,7 @@ var endsWithFactory = (config) => {
15118
15135
  };
15119
15136
  function createBuiltinRegistry() {
15120
15137
  const registry = new EvaluatorRegistry();
15121
- registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("contains_any", containsAnyFactory).register("contains_all", containsAllFactory).register("icontains", icontainsFactory).register("icontains_any", icontainsAnyFactory).register("icontains_all", icontainsAllFactory).register("starts_with", startsWithFactory).register("ends_with", endsWithFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
15138
+ registry.register("llm-judge", llmJudgeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("agent-judge", agentJudgeFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory);
15122
15139
  return registry;
15123
15140
  }
15124
15141
 
@@ -15864,7 +15881,7 @@ async function runEvaluation(options) {
15864
15881
  };
15865
15882
  if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
15866
15883
  throw new Error(
15867
- `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure_base).`
15884
+ `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure-base).`
15868
15885
  );
15869
15886
  }
15870
15887
  const targetResolver = (name) => {
@@ -15935,7 +15952,7 @@ async function runEvaluation(options) {
15935
15952
  const rawTemplate = suiteWorkspace?.template ?? getWorkspaceTemplate(target);
15936
15953
  const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
15937
15954
  const workspaceTemplate = resolvedTemplate?.dir;
15938
- const suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
15955
+ let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
15939
15956
  const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
15940
15957
  const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
15941
15958
  const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
@@ -15956,6 +15973,14 @@ async function runEvaluation(options) {
15956
15973
  const message = error instanceof Error ? error.message : String(error);
15957
15974
  throw new Error(`Failed to create shared workspace: ${message}`);
15958
15975
  }
15976
+ if (suiteWorkspaceFile && sharedWorkspacePath) {
15977
+ const copiedWorkspaceFile = import_node_path40.default.join(sharedWorkspacePath, import_node_path40.default.basename(suiteWorkspaceFile));
15978
+ try {
15979
+ await (0, import_promises29.stat)(copiedWorkspaceFile);
15980
+ suiteWorkspaceFile = copiedWorkspaceFile;
15981
+ } catch {
15982
+ }
15983
+ }
15959
15984
  } else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
15960
15985
  sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
15961
15986
  await (0, import_promises29.mkdir)(sharedWorkspacePath, { recursive: true });
@@ -16434,6 +16459,14 @@ async function runEvalCase(options) {
16434
16459
  "template_error"
16435
16460
  );
16436
16461
  }
16462
+ if (caseWorkspaceFile && workspacePath) {
16463
+ const copiedFile = import_node_path40.default.join(workspacePath, import_node_path40.default.basename(caseWorkspaceFile));
16464
+ try {
16465
+ await (0, import_promises29.stat)(copiedFile);
16466
+ caseWorkspaceFile = copiedFile;
16467
+ } catch {
16468
+ }
16469
+ }
16437
16470
  }
16438
16471
  if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
16439
16472
  workspacePath = getWorkspacePath(evalRunId, evalCase.id);
@@ -16943,8 +16976,8 @@ async function runEvaluatorsForCase(options) {
16943
16976
  workspacePath
16944
16977
  });
16945
16978
  }
16946
- const evaluatorKind = evalCase.evaluator ?? "llm_judge";
16947
- const activeEvaluator = evaluators[evaluatorKind] ?? evaluators.llm_judge;
16979
+ const evaluatorKind = evalCase.evaluator ?? "llm-judge";
16980
+ const activeEvaluator = evaluators[evaluatorKind] ?? evaluators["llm-judge"];
16948
16981
  if (!activeEvaluator) {
16949
16982
  throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
16950
16983
  }
@@ -17027,25 +17060,24 @@ async function runEvaluatorList(options) {
17027
17060
  availableTargets,
17028
17061
  agentTimeoutMs,
17029
17062
  evalFileDir,
17030
- llmJudge: evaluatorRegistry.llm_judge,
17063
+ llmJudge: evaluatorRegistry["llm-judge"],
17031
17064
  registry: typeRegistry
17032
17065
  };
17033
17066
  for (const evaluatorConfig of evaluators ?? []) {
17034
17067
  try {
17035
17068
  const evaluatorInstance = await typeRegistry.create(evaluatorConfig, dispatchContext);
17036
17069
  const score2 = await evaluatorInstance.evaluate(evalContext);
17037
- const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
17038
17070
  const weight = evaluatorConfig.weight ?? 1;
17039
17071
  scored.push({
17040
17072
  score: score2,
17041
17073
  name: evaluatorConfig.name,
17042
- type: resultType,
17074
+ type: evaluatorConfig.type,
17043
17075
  weight,
17044
17076
  ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
17045
17077
  });
17046
17078
  scores.push({
17047
17079
  name: evaluatorConfig.name,
17048
- type: resultType,
17080
+ type: evaluatorConfig.type,
17049
17081
  score: score2.score,
17050
17082
  weight,
17051
17083
  verdict: score2.verdict,
@@ -17067,18 +17099,17 @@ async function runEvaluatorList(options) {
17067
17099
  expectedAspectCount: 1,
17068
17100
  reasoning: message
17069
17101
  };
17070
- const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
17071
17102
  const weight = evaluatorConfig.weight ?? 1;
17072
17103
  scored.push({
17073
17104
  score: fallbackScore,
17074
17105
  name: evaluatorConfig.name ?? "unknown",
17075
- type: resultType ?? "llm_judge",
17106
+ type: evaluatorConfig.type ?? "llm-judge",
17076
17107
  weight,
17077
17108
  ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
17078
17109
  });
17079
17110
  scores.push({
17080
17111
  name: evaluatorConfig.name ?? "unknown",
17081
- type: resultType ?? "llm_judge",
17112
+ type: evaluatorConfig.type ?? "llm-judge",
17082
17113
  score: 0,
17083
17114
  weight,
17084
17115
  verdict: "fail",
@@ -17139,7 +17170,7 @@ function filterEvalCases(evalCases, filter) {
17139
17170
  return evalCases.filter((evalCase) => import_micromatch4.default.isMatch(evalCase.id, filter));
17140
17171
  }
17141
17172
  function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
17142
- const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
17173
+ const llmJudge = overrides?.["llm-judge"] ?? new LlmJudgeEvaluator({
17143
17174
  resolveJudgeProvider: async (context2) => {
17144
17175
  if (context2.judgeProvider) {
17145
17176
  return context2.judgeProvider;
@@ -17149,7 +17180,7 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
17149
17180
  });
17150
17181
  return {
17151
17182
  ...overrides,
17152
- llm_judge: llmJudge
17183
+ "llm-judge": llmJudge
17153
17184
  };
17154
17185
  }
17155
17186
  async function invokeProvider(provider, options) {
@@ -17409,12 +17440,7 @@ async function evaluate(config) {
17409
17440
  };
17410
17441
  }
17411
17442
  function mapAssertionType(type) {
17412
- switch (type) {
17413
- case "code_judge":
17414
- return "code";
17415
- default:
17416
- return type;
17417
- }
17443
+ return type.replace(/_/g, "-");
17418
17444
  }
17419
17445
  function computeSummary(results, durationMs) {
17420
17446
  const total = results.length;