@agentv/core 2.12.0 → 2.14.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1492,6 +1492,7 @@ __export(index_exports, {
1492
1492
  executeWorkspaceScript: () => executeWorkspaceScript,
1493
1493
  explorationRatio: () => explorationRatio,
1494
1494
  extractCacheConfig: () => extractCacheConfig,
1495
+ extractFailOnError: () => extractFailOnError,
1495
1496
  extractJsonBlob: () => extractJsonBlob,
1496
1497
  extractTargetFromSuite: () => extractTargetFromSuite,
1497
1498
  extractTargetsFromSuite: () => extractTargetsFromSuite,
@@ -1611,27 +1612,27 @@ function isTestMessage(value) {
1611
1612
  return false;
1612
1613
  }
1613
1614
  var EVALUATOR_KIND_VALUES = [
1614
- "code_judge",
1615
- "llm_judge",
1615
+ "code-judge",
1616
+ "llm-judge",
1616
1617
  "rubric",
1617
1618
  "composite",
1618
- "tool_trajectory",
1619
- "field_accuracy",
1619
+ "tool-trajectory",
1620
+ "field-accuracy",
1620
1621
  "latency",
1621
1622
  "cost",
1622
- "token_usage",
1623
- "execution_metrics",
1624
- "agent_judge",
1623
+ "token-usage",
1624
+ "execution-metrics",
1625
+ "agent-judge",
1625
1626
  "contains",
1626
- "contains_any",
1627
- "contains_all",
1627
+ "contains-any",
1628
+ "contains-all",
1628
1629
  "icontains",
1629
- "icontains_any",
1630
- "icontains_all",
1631
- "starts_with",
1632
- "ends_with",
1630
+ "icontains-any",
1631
+ "icontains-all",
1632
+ "starts-with",
1633
+ "ends-with",
1633
1634
  "regex",
1634
- "is_json",
1635
+ "is-json",
1635
1636
  "equals",
1636
1637
  "rubrics"
1637
1638
  ];
@@ -2014,6 +2015,11 @@ async function loadConfig(evalFilePath, repoRoot) {
2014
2015
  continue;
2015
2016
  }
2016
2017
  const config = parsed;
2018
+ const requiredVersion = parsed.required_version;
2019
+ if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
2020
+ logWarning(`Invalid required_version in ${configPath}, expected string`);
2021
+ continue;
2022
+ }
2017
2023
  const guidelinePatterns = config.guideline_patterns;
2018
2024
  if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
2019
2025
  logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
@@ -2037,6 +2043,7 @@ async function loadConfig(evalFilePath, repoRoot) {
2037
2043
  configPath
2038
2044
  );
2039
2045
  return {
2046
+ required_version: requiredVersion,
2040
2047
  guideline_patterns: guidelinePatterns,
2041
2048
  eval_patterns: evalPatterns,
2042
2049
  execution: executionDefaults
@@ -2180,6 +2187,22 @@ function extractTotalBudgetUsd(suite) {
2180
2187
  );
2181
2188
  return void 0;
2182
2189
  }
2190
+ function extractFailOnError(suite) {
2191
+ const execution = suite.execution;
2192
+ if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
2193
+ return void 0;
2194
+ }
2195
+ const executionObj = execution;
2196
+ const raw = executionObj.fail_on_error ?? executionObj.failOnError;
2197
+ if (raw === void 0 || raw === null) {
2198
+ return void 0;
2199
+ }
2200
+ if (typeof raw === "boolean") {
2201
+ return raw;
2202
+ }
2203
+ logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
2204
+ return void 0;
2205
+ }
2183
2206
  function parseExecutionDefaults(raw, configPath) {
2184
2207
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
2185
2208
  return void 0;
@@ -2278,6 +2301,9 @@ function validateTemplateVariables(content, source) {
2278
2301
  // src/evaluation/loaders/evaluator-parser.ts
2279
2302
  var ANSI_YELLOW4 = "\x1B[33m";
2280
2303
  var ANSI_RESET4 = "\x1B[0m";
2304
+ function normalizeEvaluatorType(type) {
2305
+ return type.replace(/_/g, "-");
2306
+ }
2281
2307
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
2282
2308
  const execution = rawEvalCase.execution;
2283
2309
  const executionObject = isJsonObject2(execution) ? execution : void 0;
@@ -2308,7 +2334,8 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2308
2334
  continue;
2309
2335
  }
2310
2336
  const rawName = asString(rawEvaluator.name);
2311
- const typeValue = rawEvaluator.type;
2337
+ const rawType = rawEvaluator.type;
2338
+ const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
2312
2339
  const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
2313
2340
  if (typeof typeValue !== "string") {
2314
2341
  logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
@@ -2341,25 +2368,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2341
2368
  });
2342
2369
  continue;
2343
2370
  }
2344
- if (typeValue === "code_judge") {
2371
+ if (typeValue === "code-judge") {
2345
2372
  let command;
2346
2373
  const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
2347
2374
  if (typeof rawCommand === "string") {
2348
2375
  const trimmed = rawCommand.trim();
2349
2376
  if (trimmed.length === 0) {
2350
2377
  throw new Error(
2351
- `Invalid code_judge command for evaluator '${name}' in '${evalId}': command cannot be empty`
2378
+ `Invalid code-judge command for evaluator '${name}' in '${evalId}': command cannot be empty`
2352
2379
  );
2353
2380
  }
2354
2381
  command = parseCommandToArgv(trimmed);
2355
2382
  } else {
2356
2383
  command = asStringArray(
2357
2384
  rawCommand,
2358
- `code_judge command for evaluator '${name}' in '${evalId}'`
2385
+ `code-judge command for evaluator '${name}' in '${evalId}'`
2359
2386
  );
2360
2387
  }
2361
2388
  if (!command) {
2362
- logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing command`);
2389
+ logWarning2(`Skipping code-judge evaluator '${name}' in '${evalId}': missing command`);
2363
2390
  continue;
2364
2391
  }
2365
2392
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
@@ -2420,7 +2447,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2420
2447
  }
2421
2448
  evaluators.push({
2422
2449
  name,
2423
- type: "code",
2450
+ type: "code-judge",
2424
2451
  command,
2425
2452
  cwd,
2426
2453
  resolvedCwd,
@@ -2446,7 +2473,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2446
2473
  continue;
2447
2474
  }
2448
2475
  const aggregatorType = asString(rawAggregator.type);
2449
- if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge" && aggregatorType !== "threshold") {
2476
+ if (aggregatorType !== "weighted_average" && aggregatorType !== "code-judge" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
2450
2477
  logWarning2(
2451
2478
  `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
2452
2479
  );
@@ -2495,16 +2522,16 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2495
2522
  type: "weighted_average",
2496
2523
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
2497
2524
  };
2498
- } else if (aggregatorType === "code_judge") {
2525
+ } else if (aggregatorType === "code-judge") {
2499
2526
  const aggregatorPath = asString(rawAggregator.path);
2500
2527
  if (!aggregatorPath) {
2501
2528
  logWarning2(
2502
- `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
2529
+ `Skipping composite evaluator '${name}' in '${evalId}': code-judge aggregator missing path`
2503
2530
  );
2504
2531
  continue;
2505
2532
  }
2506
2533
  aggregator = {
2507
- type: "code_judge",
2534
+ type: "code-judge",
2508
2535
  path: aggregatorPath,
2509
2536
  cwd: searchRoots[0]
2510
2537
  };
@@ -2530,7 +2557,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2530
2557
  }
2531
2558
  }
2532
2559
  aggregator = {
2533
- type: "llm_judge",
2560
+ type: "llm-judge",
2534
2561
  ...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
2535
2562
  ...promptPath2 ? { promptPath: promptPath2 } : {}
2536
2563
  };
@@ -2548,11 +2575,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2548
2575
  });
2549
2576
  continue;
2550
2577
  }
2551
- if (typeValue === "tool_trajectory") {
2578
+ if (typeValue === "tool-trajectory") {
2552
2579
  const mode = asString(rawEvaluator.mode);
2553
2580
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact" && mode !== "subset" && mode !== "superset") {
2554
2581
  logWarning2(
2555
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
2582
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
2556
2583
  );
2557
2584
  continue;
2558
2585
  }
@@ -2561,7 +2588,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2561
2588
  if (rawMinimums !== void 0) {
2562
2589
  if (!isJsonObject2(rawMinimums)) {
2563
2590
  logWarning2(
2564
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
2591
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
2565
2592
  );
2566
2593
  continue;
2567
2594
  }
@@ -2587,7 +2614,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2587
2614
  argsMatch2 = rawArgsMatch;
2588
2615
  } else {
2589
2616
  logWarning2(
2590
- `Invalid args_match '${rawArgsMatch}' for tool_trajectory evaluator '${name}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
2617
+ `Invalid args_match '${rawArgsMatch}' for tool-trajectory evaluator '${name}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
2591
2618
  );
2592
2619
  }
2593
2620
  }
@@ -2597,7 +2624,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2597
2624
  if (rawExpected !== void 0) {
2598
2625
  if (!Array.isArray(rawExpected)) {
2599
2626
  logWarning2(
2600
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
2627
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': expected must be an array`
2601
2628
  );
2602
2629
  continue;
2603
2630
  }
@@ -2643,13 +2670,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2643
2670
  }
2644
2671
  if (mode === "any_order" && !minimums) {
2645
2672
  logWarning2(
2646
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
2673
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
2647
2674
  );
2648
2675
  continue;
2649
2676
  }
2650
2677
  if ((mode === "in_order" || mode === "exact" || mode === "subset" || mode === "superset") && !expected) {
2651
2678
  logWarning2(
2652
- `Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
2679
+ `Skipping tool-trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
2653
2680
  );
2654
2681
  continue;
2655
2682
  }
@@ -2657,7 +2684,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2657
2684
  const required2 = parseRequired(rawEvaluator.required);
2658
2685
  const config2 = {
2659
2686
  name,
2660
- type: "tool_trajectory",
2687
+ type: "tool-trajectory",
2661
2688
  mode,
2662
2689
  ...minimums ? { minimums } : {},
2663
2690
  ...expected ? { expected } : {},
@@ -2669,17 +2696,17 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2669
2696
  evaluators.push(config2);
2670
2697
  continue;
2671
2698
  }
2672
- if (typeValue === "field_accuracy") {
2699
+ if (typeValue === "field-accuracy") {
2673
2700
  const rawFields = rawEvaluator.fields;
2674
2701
  if (!Array.isArray(rawFields)) {
2675
2702
  logWarning2(
2676
- `Skipping field_accuracy evaluator '${name}' in '${evalId}': missing fields array`
2703
+ `Skipping field-accuracy evaluator '${name}' in '${evalId}': missing fields array`
2677
2704
  );
2678
2705
  continue;
2679
2706
  }
2680
2707
  if (rawFields.length === 0) {
2681
2708
  logWarning2(
2682
- `Skipping field_accuracy evaluator '${name}' in '${evalId}': fields array is empty`
2709
+ `Skipping field-accuracy evaluator '${name}' in '${evalId}': fields array is empty`
2683
2710
  );
2684
2711
  continue;
2685
2712
  }
@@ -2687,7 +2714,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2687
2714
  for (const rawField of rawFields) {
2688
2715
  if (!isJsonObject2(rawField)) {
2689
2716
  logWarning2(
2690
- `Skipping invalid field entry in field_accuracy evaluator '${name}' (expected object)`
2717
+ `Skipping invalid field entry in field-accuracy evaluator '${name}' (expected object)`
2691
2718
  );
2692
2719
  continue;
2693
2720
  }
@@ -2695,13 +2722,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2695
2722
  const match = asString(rawField.match);
2696
2723
  if (!fieldPath) {
2697
2724
  logWarning2(
2698
- `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
2725
+ `Skipping field without path in field-accuracy evaluator '${name}' in '${evalId}'`
2699
2726
  );
2700
2727
  continue;
2701
2728
  }
2702
2729
  if (!match || !isValidFieldMatchType(match)) {
2703
2730
  logWarning2(
2704
- `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code_judge evaluator.`
2731
+ `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code-judge evaluator.`
2705
2732
  );
2706
2733
  continue;
2707
2734
  }
@@ -2718,7 +2745,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2718
2745
  }
2719
2746
  if (fields.length === 0) {
2720
2747
  logWarning2(
2721
- `Skipping field_accuracy evaluator '${name}' in '${evalId}': no valid fields found`
2748
+ `Skipping field-accuracy evaluator '${name}' in '${evalId}': no valid fields found`
2722
2749
  );
2723
2750
  continue;
2724
2751
  }
@@ -2728,7 +2755,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2728
2755
  const required2 = parseRequired(rawEvaluator.required);
2729
2756
  evaluators.push({
2730
2757
  name,
2731
- type: "field_accuracy",
2758
+ type: "field-accuracy",
2732
2759
  fields,
2733
2760
  ...validAggregation ? { aggregation: validAggregation } : {},
2734
2761
  ...weight2 !== void 0 ? { weight: weight2 } : {},
@@ -2777,7 +2804,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2777
2804
  });
2778
2805
  continue;
2779
2806
  }
2780
- if (typeValue === "token_usage") {
2807
+ if (typeValue === "token-usage") {
2781
2808
  const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
2782
2809
  const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
2783
2810
  const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
@@ -2791,7 +2818,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2791
2818
  if (raw === void 0) continue;
2792
2819
  if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
2793
2820
  logWarning2(
2794
- `Skipping token_usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
2821
+ `Skipping token-usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
2795
2822
  );
2796
2823
  continue;
2797
2824
  }
@@ -2799,7 +2826,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2799
2826
  }
2800
2827
  if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
2801
2828
  logWarning2(
2802
- `Skipping token_usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
2829
+ `Skipping token-usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
2803
2830
  );
2804
2831
  continue;
2805
2832
  }
@@ -2807,7 +2834,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2807
2834
  const required2 = parseRequired(rawEvaluator.required);
2808
2835
  evaluators.push({
2809
2836
  name,
2810
- type: "token_usage",
2837
+ type: "token-usage",
2811
2838
  ...validLimits,
2812
2839
  ...weight2 !== void 0 ? { weight: weight2 } : {},
2813
2840
  ...required2 !== void 0 ? { required: required2 } : {},
@@ -2815,7 +2842,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2815
2842
  });
2816
2843
  continue;
2817
2844
  }
2818
- if (typeValue === "execution_metrics") {
2845
+ if (typeValue === "execution-metrics") {
2819
2846
  const maxToolCalls = rawEvaluator.max_tool_calls ?? rawEvaluator.maxToolCalls;
2820
2847
  const maxLlmCalls = rawEvaluator.max_llm_calls ?? rawEvaluator.maxLlmCalls;
2821
2848
  const maxTokens = rawEvaluator.max_tokens ?? rawEvaluator.maxTokens;
@@ -2838,7 +2865,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2838
2865
  if (raw === void 0) continue;
2839
2866
  if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
2840
2867
  logWarning2(
2841
- `Skipping execution_metrics evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
2868
+ `Skipping execution-metrics evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
2842
2869
  );
2843
2870
  hasError = true;
2844
2871
  break;
@@ -2851,7 +2878,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2851
2878
  const hasThreshold = validThresholds.max_tool_calls !== void 0 || validThresholds.max_llm_calls !== void 0 || validThresholds.max_tokens !== void 0 || validThresholds.max_cost_usd !== void 0 || validThresholds.max_duration_ms !== void 0 || validThresholds.target_exploration_ratio !== void 0;
2852
2879
  if (!hasThreshold) {
2853
2880
  logWarning2(
2854
- `Skipping execution_metrics evaluator '${name}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
2881
+ `Skipping execution-metrics evaluator '${name}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
2855
2882
  );
2856
2883
  continue;
2857
2884
  }
@@ -2859,7 +2886,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2859
2886
  const required2 = parseRequired(rawEvaluator.required);
2860
2887
  evaluators.push({
2861
2888
  name,
2862
- type: "execution_metrics",
2889
+ type: "execution-metrics",
2863
2890
  ...validThresholds,
2864
2891
  ...weight2 !== void 0 ? { weight: weight2 } : {},
2865
2892
  ...required2 !== void 0 ? { required: required2 } : {},
@@ -2867,13 +2894,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2867
2894
  });
2868
2895
  continue;
2869
2896
  }
2870
- if (typeValue === "agent_judge") {
2897
+ if (typeValue === "agent-judge") {
2871
2898
  const rawMaxSteps = rawEvaluator.max_steps ?? rawEvaluator.maxSteps;
2872
2899
  let maxSteps;
2873
2900
  if (rawMaxSteps !== void 0) {
2874
2901
  if (typeof rawMaxSteps !== "number" || !Number.isInteger(rawMaxSteps) || rawMaxSteps < 1 || rawMaxSteps > 50) {
2875
2902
  logWarning2(
2876
- `Skipping agent_judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`
2903
+ `Skipping agent-judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`
2877
2904
  );
2878
2905
  continue;
2879
2906
  }
@@ -2884,7 +2911,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2884
2911
  if (rawTemperature !== void 0) {
2885
2912
  if (typeof rawTemperature !== "number" || rawTemperature < 0 || rawTemperature > 2) {
2886
2913
  logWarning2(
2887
- `Skipping agent_judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`
2914
+ `Skipping agent-judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`
2888
2915
  );
2889
2916
  continue;
2890
2917
  }
@@ -2907,7 +2934,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2907
2934
  const required2 = parseRequired(rawEvaluator.required);
2908
2935
  evaluators.push({
2909
2936
  name,
2910
- type: "agent_judge",
2937
+ type: "agent-judge",
2911
2938
  ...agentPrompt ? { prompt: agentPrompt } : {},
2912
2939
  ...agentPromptPath ? { promptPath: agentPromptPath, resolvedPromptPath: agentPromptPath } : {},
2913
2940
  ...agentParsedRubrics && agentParsedRubrics.length > 0 ? { rubrics: agentParsedRubrics } : {},
@@ -2938,7 +2965,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2938
2965
  });
2939
2966
  continue;
2940
2967
  }
2941
- if (typeValue === "contains_any" || typeValue === "contains_all") {
2968
+ if (typeValue === "contains-any" || typeValue === "contains-all") {
2942
2969
  const value = asStringArrayStrict(rawEvaluator.value);
2943
2970
  if (!value || value.length === 0) {
2944
2971
  logWarning2(
@@ -2976,7 +3003,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2976
3003
  });
2977
3004
  continue;
2978
3005
  }
2979
- if (typeValue === "icontains_any" || typeValue === "icontains_all") {
3006
+ if (typeValue === "icontains-any" || typeValue === "icontains-all") {
2980
3007
  const value = asStringArrayStrict(rawEvaluator.value);
2981
3008
  if (!value || value.length === 0) {
2982
3009
  logWarning2(
@@ -2996,7 +3023,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2996
3023
  });
2997
3024
  continue;
2998
3025
  }
2999
- if (typeValue === "starts_with" || typeValue === "ends_with") {
3026
+ if (typeValue === "starts-with" || typeValue === "ends-with") {
3000
3027
  const value = asString(rawEvaluator.value);
3001
3028
  if (!value) {
3002
3029
  logWarning2(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
@@ -3034,12 +3061,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3034
3061
  });
3035
3062
  continue;
3036
3063
  }
3037
- if (typeValue === "is_json") {
3064
+ if (typeValue === "is-json") {
3038
3065
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3039
3066
  const required2 = parseRequired(rawEvaluator.required);
3040
3067
  evaluators.push({
3041
3068
  name,
3042
- type: "is_json",
3069
+ type: "is-json",
3043
3070
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3044
3071
  ...required2 !== void 0 ? { required: required2 } : {},
3045
3072
  ...negate !== void 0 ? { negate } : {}
@@ -3087,7 +3114,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3087
3114
  const required2 = parseRequired(rawEvaluator.required);
3088
3115
  evaluators.push({
3089
3116
  name,
3090
- type: "llm_judge",
3117
+ type: "llm-judge",
3091
3118
  rubrics: parsedCriteria,
3092
3119
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3093
3120
  ...required2 !== void 0 ? { required: required2 } : {},
@@ -3154,7 +3181,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3154
3181
  const required2 = parseRequired(rawEvaluator.required);
3155
3182
  evaluators.push({
3156
3183
  name,
3157
- type: "llm_judge",
3184
+ type: "llm-judge",
3158
3185
  rubrics: parsedRubrics,
3159
3186
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3160
3187
  ...required2 !== void 0 ? { required: required2 } : {},
@@ -3186,7 +3213,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3186
3213
  const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
3187
3214
  evaluators.push({
3188
3215
  name,
3189
- type: "llm_judge",
3216
+ type: "llm-judge",
3190
3217
  prompt,
3191
3218
  promptPath,
3192
3219
  ...promptPath ? { resolvedPromptPath: promptPath } : {},
@@ -3202,15 +3229,15 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3202
3229
  }
3203
3230
  var ASSERTION_TYPES = /* @__PURE__ */ new Set([
3204
3231
  "contains",
3205
- "contains_any",
3206
- "contains_all",
3232
+ "contains-any",
3233
+ "contains-all",
3207
3234
  "icontains",
3208
- "icontains_any",
3209
- "icontains_all",
3210
- "starts_with",
3211
- "ends_with",
3235
+ "icontains-any",
3236
+ "icontains-all",
3237
+ "starts-with",
3238
+ "ends-with",
3212
3239
  "regex",
3213
- "is_json",
3240
+ "is-json",
3214
3241
  "equals",
3215
3242
  "rubrics"
3216
3243
  ]);
@@ -3223,24 +3250,24 @@ function generateAssertionName(typeValue, rawEvaluator) {
3223
3250
  switch (typeValue) {
3224
3251
  case "contains":
3225
3252
  return value ? `contains-${value}` : "contains";
3226
- case "contains_any":
3227
- return arrayValue ? `contains_any-${arrayValue.length}` : "contains_any";
3228
- case "contains_all":
3229
- return arrayValue ? `contains_all-${arrayValue.length}` : "contains_all";
3253
+ case "contains-any":
3254
+ return arrayValue ? `contains-any-${arrayValue.length}` : "contains-any";
3255
+ case "contains-all":
3256
+ return arrayValue ? `contains-all-${arrayValue.length}` : "contains-all";
3230
3257
  case "icontains":
3231
3258
  return value ? `icontains-${value}` : "icontains";
3232
- case "icontains_any":
3233
- return arrayValue ? `icontains_any-${arrayValue.length}` : "icontains_any";
3234
- case "icontains_all":
3235
- return arrayValue ? `icontains_all-${arrayValue.length}` : "icontains_all";
3236
- case "starts_with":
3237
- return value ? `starts_with-${value}` : "starts_with";
3238
- case "ends_with":
3239
- return value ? `ends_with-${value}` : "ends_with";
3259
+ case "icontains-any":
3260
+ return arrayValue ? `icontains-any-${arrayValue.length}` : "icontains-any";
3261
+ case "icontains-all":
3262
+ return arrayValue ? `icontains-all-${arrayValue.length}` : "icontains-all";
3263
+ case "starts-with":
3264
+ return value ? `starts-with-${value}` : "starts-with";
3265
+ case "ends-with":
3266
+ return value ? `ends-with-${value}` : "ends-with";
3240
3267
  case "regex":
3241
3268
  return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
3242
- case "is_json":
3243
- return "is_json";
3269
+ case "is-json":
3270
+ return "is-json";
3244
3271
  case "equals":
3245
3272
  return value ? `equals-${value}` : "equals";
3246
3273
  case "rubrics":
@@ -3253,8 +3280,9 @@ function coerceEvaluator(candidate, contextId) {
3253
3280
  if (typeof candidate !== "string") {
3254
3281
  return void 0;
3255
3282
  }
3256
- if (isEvaluatorKind(candidate)) {
3257
- return candidate;
3283
+ const normalized = normalizeEvaluatorType(candidate);
3284
+ if (isEvaluatorKind(normalized)) {
3285
+ return normalized;
3258
3286
  }
3259
3287
  logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
3260
3288
  return void 0;
@@ -3300,6 +3328,16 @@ function parseCommandToArgv(command) {
3300
3328
  function isJsonObject2(value) {
3301
3329
  return typeof value === "object" && value !== null && !Array.isArray(value);
3302
3330
  }
3331
+ var CRITERIA_CONSUMER_TYPES = /* @__PURE__ */ new Set(["llm-judge", "agent-judge", "code-judge"]);
3332
+ function warnUnconsumedCriteria(criteria, evaluators, testId) {
3333
+ if (!criteria?.trim() || !evaluators || evaluators.length === 0) return;
3334
+ const hasConsumer = evaluators.some((e) => CRITERIA_CONSUMER_TYPES.has(e.type));
3335
+ if (!hasConsumer) {
3336
+ logWarning2(
3337
+ `Test '${testId}': criteria is defined but no evaluator in assert will evaluate it. Add 'type: llm-judge' to assert, or remove criteria if it is documentation-only.`
3338
+ );
3339
+ }
3340
+ }
3303
3341
  function logWarning2(message, details) {
3304
3342
  if (details && details.length > 0) {
3305
3343
  const detailBlock = details.join("\n");
@@ -3549,7 +3587,7 @@ function parseInlineRubrics(rawRubrics) {
3549
3587
  }
3550
3588
  return {
3551
3589
  name: "rubric",
3552
- type: "llm_judge",
3590
+ type: "llm-judge",
3553
3591
  rubrics: rubricItems
3554
3592
  };
3555
3593
  }
@@ -3934,7 +3972,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
3934
3972
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
3935
3973
  const fallbackDataset = import_node_path6.default.basename(absoluteTestPath, ".jsonl") || "eval";
3936
3974
  const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
3937
- const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm_judge";
3975
+ const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-judge";
3938
3976
  const globalExecution = sidecar.execution;
3939
3977
  if (verbose) {
3940
3978
  console.log(`
@@ -4022,6 +4060,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4022
4060
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
4023
4061
  }
4024
4062
  }
4063
+ warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
4025
4064
  const userFilePaths = [];
4026
4065
  for (const segment of inputSegments) {
4027
4066
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -4375,13 +4414,15 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
4375
4414
  }
4376
4415
  const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
4377
4416
  const metadata = parseMetadata(parsed);
4417
+ const failOnError = extractFailOnError(parsed);
4378
4418
  return {
4379
4419
  tests,
4380
4420
  trials: extractTrialsConfig(parsed),
4381
4421
  targets: extractTargetsFromSuite(parsed),
4382
4422
  cacheConfig: extractCacheConfig(parsed),
4383
4423
  totalBudgetUsd: extractTotalBudgetUsd(parsed),
4384
- ...metadata !== void 0 && { metadata }
4424
+ ...metadata !== void 0 && { metadata },
4425
+ ...failOnError !== void 0 && { failOnError }
4385
4426
  };
4386
4427
  }
4387
4428
  var loadEvalSuite = loadTestSuite;
@@ -4412,7 +4453,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4412
4453
  const fallbackDataset = import_node_path8.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
4413
4454
  const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
4414
4455
  const rawTestcases = resolveTests(suite);
4415
- const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
4456
+ const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-judge";
4416
4457
  const evalFileDir = import_node_path8.default.dirname(absoluteTestPath);
4417
4458
  let expandedTestcases;
4418
4459
  if (typeof rawTestcases === "string") {
@@ -4509,6 +4550,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4509
4550
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
4510
4551
  }
4511
4552
  }
4553
+ warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
4512
4554
  const userFilePaths = [];
4513
4555
  for (const segment of inputSegments) {
4514
4556
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -11701,7 +11743,7 @@ function toCamelCaseDeep(obj) {
11701
11743
  // src/evaluation/evaluators/code-evaluator.ts
11702
11744
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
11703
11745
  var CodeEvaluator = class {
11704
- kind = "code";
11746
+ kind = "code-judge";
11705
11747
  command;
11706
11748
  cwd;
11707
11749
  agentTimeoutMs;
@@ -11938,7 +11980,7 @@ var scoreRangeEvaluationSchema = import_zod4.z.object({
11938
11980
  overall_reasoning: import_zod4.z.string().describe("Overall assessment summary (1-2 sentences)").optional()
11939
11981
  });
11940
11982
  var LlmJudgeEvaluator = class {
11941
- kind = "llm_judge";
11983
+ kind = "llm-judge";
11942
11984
  resolveJudgeProvider;
11943
11985
  maxOutputTokens;
11944
11986
  temperature;
@@ -11955,7 +11997,7 @@ var LlmJudgeEvaluator = class {
11955
11997
  throw new Error("No judge provider available for LLM grading");
11956
11998
  }
11957
11999
  const config = context2.evaluator;
11958
- if (config?.type === "llm_judge" && config.rubrics && config.rubrics.length > 0) {
12000
+ if (config?.type === "llm-judge" && config.rubrics && config.rubrics.length > 0) {
11959
12001
  return this.evaluateWithRubrics(context2, judgeProvider, config.rubrics);
11960
12002
  }
11961
12003
  return this.evaluateFreeform(context2, judgeProvider);
@@ -12029,7 +12071,7 @@ ${context2.fileChanges}`;
12029
12071
  async evaluateWithRubrics(context2, judgeProvider, rubrics) {
12030
12072
  if (!rubrics || rubrics.length === 0) {
12031
12073
  throw new Error(
12032
- `No rubrics found for evaluator "${context2.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
12074
+ `No rubrics found for evaluator "${context2.evaluator?.name ?? "llm-judge"}". Run "agentv generate rubrics" first.`
12033
12075
  );
12034
12076
  }
12035
12077
  const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
@@ -12365,9 +12407,9 @@ var CompositeEvaluator = class {
12365
12407
  async aggregate(results, context2) {
12366
12408
  const aggregator = this.config.aggregator;
12367
12409
  switch (aggregator.type) {
12368
- case "code_judge":
12410
+ case "code-judge":
12369
12411
  return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
12370
- case "llm_judge":
12412
+ case "llm-judge":
12371
12413
  return this.runLlmAggregator(results, context2, aggregator);
12372
12414
  case "threshold":
12373
12415
  return this.runThreshold(results, aggregator.threshold);
@@ -12510,7 +12552,7 @@ var CompositeEvaluator = class {
12510
12552
  expectedAspectCount: hits.length + misses.length || 1,
12511
12553
  reasoning,
12512
12554
  evaluatorRawRequest: {
12513
- aggregator: "code_judge",
12555
+ aggregator: "code-judge",
12514
12556
  script: scriptPath
12515
12557
  },
12516
12558
  scores
@@ -12525,7 +12567,7 @@ var CompositeEvaluator = class {
12525
12567
  expectedAspectCount: 1,
12526
12568
  reasoning: message,
12527
12569
  evaluatorRawRequest: {
12528
- aggregator: "code_judge",
12570
+ aggregator: "code-judge",
12529
12571
  script: scriptPath,
12530
12572
  error: message
12531
12573
  },
@@ -12556,7 +12598,7 @@ var CompositeEvaluator = class {
12556
12598
  const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
12557
12599
  const systemPrompt = buildOutputSchema();
12558
12600
  const evaluatorRawRequest = {
12559
- aggregator: "llm_judge",
12601
+ aggregator: "llm-judge",
12560
12602
  userPrompt,
12561
12603
  systemPrompt,
12562
12604
  target: judgeProvider.targetName
@@ -12668,7 +12710,7 @@ var CostEvaluator = class {
12668
12710
 
12669
12711
  // src/evaluation/evaluators/execution-metrics.ts
12670
12712
  var ExecutionMetricsEvaluator = class {
12671
- kind = "execution_metrics";
12713
+ kind = "execution-metrics";
12672
12714
  config;
12673
12715
  constructor(options) {
12674
12716
  this.config = options.config;
@@ -12694,7 +12736,7 @@ var ExecutionMetricsEvaluator = class {
12694
12736
  expectedAspectCount: 1,
12695
12737
  reasoning: "Execution metrics not available - no trace summary provided",
12696
12738
  evaluatorRawRequest: {
12697
- type: "execution_metrics",
12739
+ type: "execution-metrics",
12698
12740
  config: this.extractConfiguredThresholds(),
12699
12741
  actual: null
12700
12742
  }
@@ -12803,7 +12845,7 @@ var ExecutionMetricsEvaluator = class {
12803
12845
  if (actualMetrics.exploration_ratio !== void 0) {
12804
12846
  reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
12805
12847
  }
12806
- const reasoning = reasoningParts.length > 0 ? `execution_metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
12848
+ const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
12807
12849
  return {
12808
12850
  score,
12809
12851
  verdict: scoreToVerdict(score),
@@ -12812,7 +12854,7 @@ var ExecutionMetricsEvaluator = class {
12812
12854
  expectedAspectCount: totalChecks || 1,
12813
12855
  reasoning,
12814
12856
  evaluatorRawRequest: {
12815
- type: "execution_metrics",
12857
+ type: "execution-metrics",
12816
12858
  config: this.extractConfiguredThresholds(),
12817
12859
  actual: this.filterDefinedMetrics(actualMetrics)
12818
12860
  }
@@ -12900,7 +12942,7 @@ var MONTH_NAMES = {
12900
12942
  december: 11
12901
12943
  };
12902
12944
  var FieldAccuracyEvaluator = class {
12903
- kind = "field_accuracy";
12945
+ kind = "field-accuracy";
12904
12946
  config;
12905
12947
  constructor(options) {
12906
12948
  this.config = options.config;
@@ -13354,7 +13396,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
13354
13396
  ".dylib"
13355
13397
  ]);
13356
13398
  var AgentJudgeEvaluator = class {
13357
- kind = "agent_judge";
13399
+ kind = "agent-judge";
13358
13400
  resolveJudgeProvider;
13359
13401
  maxSteps;
13360
13402
  temperature;
@@ -13379,24 +13421,24 @@ var AgentJudgeEvaluator = class {
13379
13421
  async evaluateBuiltIn(context2) {
13380
13422
  const judgeProvider = await this.resolveJudgeProvider(context2);
13381
13423
  if (!judgeProvider) {
13382
- throw new Error("No judge provider available for agent_judge evaluation");
13424
+ throw new Error("No judge provider available for agent-judge evaluation");
13383
13425
  }
13384
13426
  const model = judgeProvider.asLanguageModel?.();
13385
13427
  if (!model) {
13386
13428
  throw new Error(
13387
- `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent_judge mode`
13429
+ `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent-judge mode`
13388
13430
  );
13389
13431
  }
13390
13432
  const workspacePath = context2.workspacePath;
13391
13433
  if (!workspacePath) {
13392
13434
  throw new Error(
13393
- "agent_judge evaluator requires a workspace_template target (workspacePath is not set)"
13435
+ "agent-judge evaluator requires a workspace_template target (workspacePath is not set)"
13394
13436
  );
13395
13437
  }
13396
13438
  const systemPrompt = this.buildSystemPrompt(context2);
13397
13439
  const userPrompt = this.buildUserPrompt(context2);
13398
13440
  const config = context2.evaluator;
13399
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
13441
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
13400
13442
  const fsTools = createFilesystemTools(workspacePath);
13401
13443
  const evaluatorRawRequest = {
13402
13444
  mode: "built-in",
@@ -13427,7 +13469,7 @@ var AgentJudgeEvaluator = class {
13427
13469
  score: 0,
13428
13470
  verdict: "fail",
13429
13471
  hits: [],
13430
- misses: [`agent_judge built-in evaluation failed: ${message}`],
13472
+ misses: [`agent-judge built-in evaluation failed: ${message}`],
13431
13473
  expectedAspectCount: 1,
13432
13474
  evaluatorRawRequest,
13433
13475
  details: { mode: "built-in", error: message }
@@ -13459,14 +13501,14 @@ var AgentJudgeEvaluator = class {
13459
13501
  score: 0,
13460
13502
  verdict: "fail",
13461
13503
  hits: [],
13462
- misses: ["agent_judge judge_target returned no assistant response"],
13504
+ misses: ["agent-judge judge_target returned no assistant response"],
13463
13505
  expectedAspectCount: 1,
13464
13506
  evaluatorRawRequest,
13465
13507
  details: { mode: "judge_target", judge_target: provider.targetName }
13466
13508
  };
13467
13509
  }
13468
13510
  const config = context2.evaluator;
13469
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
13511
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
13470
13512
  const details = {
13471
13513
  mode: "judge_target",
13472
13514
  judge_target: provider.targetName
@@ -13478,7 +13520,7 @@ var AgentJudgeEvaluator = class {
13478
13520
  score: 0,
13479
13521
  verdict: "fail",
13480
13522
  hits: [],
13481
- misses: [`agent_judge judge_target evaluation failed: ${message}`],
13523
+ misses: [`agent-judge judge_target evaluation failed: ${message}`],
13482
13524
  expectedAspectCount: 1,
13483
13525
  evaluatorRawRequest,
13484
13526
  details: {
@@ -13529,7 +13571,7 @@ var AgentJudgeEvaluator = class {
13529
13571
  score: 0,
13530
13572
  verdict: "fail",
13531
13573
  hits: [],
13532
- misses: ["Failed to parse agent_judge response as valid evaluation JSON"],
13574
+ misses: ["Failed to parse agent-judge response as valid evaluation JSON"],
13533
13575
  expectedAspectCount: 1,
13534
13576
  evaluatorRawRequest,
13535
13577
  details
@@ -13542,7 +13584,7 @@ var AgentJudgeEvaluator = class {
13542
13584
  */
13543
13585
  buildSystemPrompt(context2) {
13544
13586
  const config = context2.evaluator;
13545
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
13587
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
13546
13588
  const parts = [
13547
13589
  "You are an expert evaluator with access to the workspace filesystem.",
13548
13590
  "Use the provided tools to investigate the workspace and verify the criteria are met.",
@@ -13573,7 +13615,7 @@ var AgentJudgeEvaluator = class {
13573
13615
  return substituteVariables(this.evaluatorTemplate, variables);
13574
13616
  }
13575
13617
  const config = context2.evaluator;
13576
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
13618
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
13577
13619
  const parts = [
13578
13620
  "Evaluate the candidate answer by investigating the workspace.",
13579
13621
  "",
@@ -13616,7 +13658,7 @@ var AgentJudgeEvaluator = class {
13616
13658
  buildDelegatedPrompt(context2) {
13617
13659
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
13618
13660
  const config = context2.evaluator;
13619
- const rubrics = config?.type === "agent_judge" ? config.rubrics : void 0;
13661
+ const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
13620
13662
  if (this.evaluatorTemplate) {
13621
13663
  const variables = {
13622
13664
  [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
@@ -13698,11 +13740,11 @@ function createFilesystemTools(workspacePath) {
13698
13740
  execute: async (input) => {
13699
13741
  try {
13700
13742
  const resolved = resolveSandboxed(workspacePath, input.path);
13701
- const stat7 = await import_promises25.default.stat(resolved);
13702
- if (stat7.isDirectory()) {
13743
+ const stat8 = await import_promises25.default.stat(resolved);
13744
+ if (stat8.isDirectory()) {
13703
13745
  return { error: `'${input.path}' is a directory, not a file` };
13704
13746
  }
13705
- const buffer = Buffer.alloc(Math.min(stat7.size, MAX_FILE_SIZE));
13747
+ const buffer = Buffer.alloc(Math.min(stat8.size, MAX_FILE_SIZE));
13706
13748
  const fd = await import_promises25.default.open(resolved, "r");
13707
13749
  try {
13708
13750
  await fd.read(buffer, 0, buffer.length, 0);
@@ -13710,8 +13752,8 @@ function createFilesystemTools(workspacePath) {
13710
13752
  await fd.close();
13711
13753
  }
13712
13754
  const content = buffer.toString("utf-8");
13713
- const truncated = stat7.size > MAX_FILE_SIZE;
13714
- return { content, truncated, size: stat7.size };
13755
+ const truncated = stat8.size > MAX_FILE_SIZE;
13756
+ return { content, truncated, size: stat8.size };
13715
13757
  } catch (error) {
13716
13758
  return { error: error instanceof Error ? error.message : String(error) };
13717
13759
  }
@@ -13755,8 +13797,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
13755
13797
  const ext = import_node_path33.default.extname(entry.name).toLowerCase();
13756
13798
  if (BINARY_EXTENSIONS.has(ext)) continue;
13757
13799
  try {
13758
- const stat7 = await import_promises25.default.stat(fullPath);
13759
- if (stat7.size > MAX_FILE_SIZE) continue;
13800
+ const stat8 = await import_promises25.default.stat(fullPath);
13801
+ if (stat8.size > MAX_FILE_SIZE) continue;
13760
13802
  const content = await import_promises25.default.readFile(fullPath, "utf-8");
13761
13803
  const lines = content.split("\n");
13762
13804
  for (let i = 0; i < lines.length; i++) {
@@ -13918,7 +13960,7 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
13918
13960
 
13919
13961
  // src/evaluation/evaluators/token-usage.ts
13920
13962
  var TokenUsageEvaluator = class {
13921
- kind = "token_usage";
13963
+ kind = "token-usage";
13922
13964
  config;
13923
13965
  constructor(options) {
13924
13966
  this.config = options.config;
@@ -13941,7 +13983,7 @@ var TokenUsageEvaluator = class {
13941
13983
  expectedAspectCount,
13942
13984
  reasoning: "Token usage not reported by provider",
13943
13985
  evaluatorRawRequest: {
13944
- type: "token_usage",
13986
+ type: "token-usage",
13945
13987
  max_total: maxTotal ?? null,
13946
13988
  max_input: maxInput ?? null,
13947
13989
  max_output: maxOutput ?? null,
@@ -13983,9 +14025,9 @@ var TokenUsageEvaluator = class {
13983
14025
  hits,
13984
14026
  misses,
13985
14027
  expectedAspectCount,
13986
- reasoning: `token_usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
14028
+ reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
13987
14029
  evaluatorRawRequest: {
13988
- type: "token_usage",
14030
+ type: "token-usage",
13989
14031
  max_total: maxTotal ?? null,
13990
14032
  max_input: maxInput ?? null,
13991
14033
  max_output: maxOutput ?? null,
@@ -14070,7 +14112,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
14070
14112
  };
14071
14113
  }
14072
14114
  var ToolTrajectoryEvaluator = class {
14073
- kind = "tool_trajectory";
14115
+ kind = "tool-trajectory";
14074
14116
  config;
14075
14117
  constructor(options) {
14076
14118
  this.config = options.config;
@@ -14258,7 +14300,7 @@ var ToolTrajectoryEvaluator = class {
14258
14300
  }
14259
14301
  }
14260
14302
  for (const warning of warnings) {
14261
- console.warn(`[tool_trajectory] ${warning}`);
14303
+ console.warn(`[tool-trajectory] ${warning}`);
14262
14304
  }
14263
14305
  const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
14264
14306
  const totalAssertions = expected.length + effectiveLatencyAssertions;
@@ -14334,7 +14376,7 @@ var ToolTrajectoryEvaluator = class {
14334
14376
  misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
14335
14377
  }
14336
14378
  for (const warning of warnings) {
14337
- console.warn(`[tool_trajectory] ${warning}`);
14379
+ console.warn(`[tool-trajectory] ${warning}`);
14338
14380
  }
14339
14381
  const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
14340
14382
  const totalAssertions = expected.length + effectiveLatencyAssertions;
@@ -14824,7 +14866,7 @@ var llmJudgeFactory = (config, context2) => {
14824
14866
  const c = config;
14825
14867
  const { llmJudge, agentTimeoutMs } = context2;
14826
14868
  return {
14827
- kind: "llm_judge",
14869
+ kind: "llm-judge",
14828
14870
  async evaluate(evalContext) {
14829
14871
  const customPrompt = await resolveCustomPrompt(
14830
14872
  c,
@@ -14913,7 +14955,7 @@ var agentJudgeFactory = (config, context2) => {
14913
14955
  customPrompt = (0, import_node_fs9.readFileSync)(c.resolvedPromptPath, "utf-8");
14914
14956
  } catch (error) {
14915
14957
  const message = error instanceof Error ? error.message : String(error);
14916
- console.warn(`Could not read agent_judge prompt at ${c.resolvedPromptPath}: ${message}`);
14958
+ console.warn(`Could not read agent-judge prompt at ${c.resolvedPromptPath}: ${message}`);
14917
14959
  }
14918
14960
  } else if (c.prompt) {
14919
14961
  customPrompt = c.prompt;
@@ -14923,7 +14965,7 @@ var agentJudgeFactory = (config, context2) => {
14923
14965
  judgeTargetProvider = targetResolver(c.target);
14924
14966
  if (!judgeTargetProvider) {
14925
14967
  throw new Error(
14926
- `agent_judge evaluator '${c.name}': target '${c.target}' not found in targets`
14968
+ `agent-judge evaluator '${c.name}': target '${c.target}' not found in targets`
14927
14969
  );
14928
14970
  }
14929
14971
  }
@@ -14967,7 +15009,7 @@ var regexFactory = (config) => {
14967
15009
  });
14968
15010
  };
14969
15011
  var isJsonFactory = () => {
14970
- return new DeterministicAssertionEvaluator("is_json", (ctx) => {
15012
+ return new DeterministicAssertionEvaluator("is-json", (ctx) => {
14971
15013
  const result = runIsJsonAssertion(ctx.candidate);
14972
15014
  return {
14973
15015
  score: result.score,
@@ -14995,7 +15037,7 @@ var equalsFactory = (config) => {
14995
15037
  };
14996
15038
  var containsAnyFactory = (config) => {
14997
15039
  const c = config;
14998
- return new DeterministicAssertionEvaluator("contains_any", (ctx) => {
15040
+ return new DeterministicAssertionEvaluator("contains-any", (ctx) => {
14999
15041
  const result = runContainsAnyAssertion(ctx.candidate, c.value);
15000
15042
  return {
15001
15043
  score: result.score,
@@ -15009,7 +15051,7 @@ var containsAnyFactory = (config) => {
15009
15051
  };
15010
15052
  var containsAllFactory = (config) => {
15011
15053
  const c = config;
15012
- return new DeterministicAssertionEvaluator("contains_all", (ctx) => {
15054
+ return new DeterministicAssertionEvaluator("contains-all", (ctx) => {
15013
15055
  const result = runContainsAllAssertion(ctx.candidate, c.value);
15014
15056
  return {
15015
15057
  score: result.score,
@@ -15037,7 +15079,7 @@ var icontainsFactory = (config) => {
15037
15079
  };
15038
15080
  var icontainsAnyFactory = (config) => {
15039
15081
  const c = config;
15040
- return new DeterministicAssertionEvaluator("icontains_any", (ctx) => {
15082
+ return new DeterministicAssertionEvaluator("icontains-any", (ctx) => {
15041
15083
  const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
15042
15084
  return {
15043
15085
  score: result.score,
@@ -15051,7 +15093,7 @@ var icontainsAnyFactory = (config) => {
15051
15093
  };
15052
15094
  var icontainsAllFactory = (config) => {
15053
15095
  const c = config;
15054
- return new DeterministicAssertionEvaluator("icontains_all", (ctx) => {
15096
+ return new DeterministicAssertionEvaluator("icontains-all", (ctx) => {
15055
15097
  const result = runIcontainsAllAssertion(ctx.candidate, c.value);
15056
15098
  return {
15057
15099
  score: result.score,
@@ -15065,7 +15107,7 @@ var icontainsAllFactory = (config) => {
15065
15107
  };
15066
15108
  var startsWithFactory = (config) => {
15067
15109
  const c = config;
15068
- return new DeterministicAssertionEvaluator("starts_with", (ctx) => {
15110
+ return new DeterministicAssertionEvaluator("starts-with", (ctx) => {
15069
15111
  const result = runStartsWithAssertion(ctx.candidate, c.value);
15070
15112
  return {
15071
15113
  score: result.score,
@@ -15079,7 +15121,7 @@ var startsWithFactory = (config) => {
15079
15121
  };
15080
15122
  var endsWithFactory = (config) => {
15081
15123
  const c = config;
15082
- return new DeterministicAssertionEvaluator("ends_with", (ctx) => {
15124
+ return new DeterministicAssertionEvaluator("ends-with", (ctx) => {
15083
15125
  const result = runEndsWithAssertion(ctx.candidate, c.value);
15084
15126
  return {
15085
15127
  score: result.score,
@@ -15093,7 +15135,7 @@ var endsWithFactory = (config) => {
15093
15135
  };
15094
15136
  function createBuiltinRegistry() {
15095
15137
  const registry = new EvaluatorRegistry();
15096
- registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("contains_any", containsAnyFactory).register("contains_all", containsAllFactory).register("icontains", icontainsFactory).register("icontains_any", icontainsAnyFactory).register("icontains_all", icontainsAllFactory).register("starts_with", startsWithFactory).register("ends_with", endsWithFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
15138
+ registry.register("llm-judge", llmJudgeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("agent-judge", agentJudgeFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory);
15097
15139
  return registry;
15098
15140
  }
15099
15141
 
@@ -15780,7 +15822,8 @@ async function runEvaluation(options) {
15780
15822
  cleanupWorkspaces,
15781
15823
  trials,
15782
15824
  streamCallbacks,
15783
- totalBudgetUsd
15825
+ totalBudgetUsd,
15826
+ failOnError
15784
15827
  } = options;
15785
15828
  let useCache = options.useCache;
15786
15829
  if (trials && trials.count > 1 && useCache) {
@@ -15838,7 +15881,7 @@ async function runEvaluation(options) {
15838
15881
  };
15839
15882
  if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
15840
15883
  throw new Error(
15841
- `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure_base).`
15884
+ `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure-base).`
15842
15885
  );
15843
15886
  }
15844
15887
  const targetResolver = (name) => {
@@ -15909,7 +15952,7 @@ async function runEvaluation(options) {
15909
15952
  const rawTemplate = suiteWorkspace?.template ?? getWorkspaceTemplate(target);
15910
15953
  const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
15911
15954
  const workspaceTemplate = resolvedTemplate?.dir;
15912
- const suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
15955
+ let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
15913
15956
  const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
15914
15957
  const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
15915
15958
  const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
@@ -15930,6 +15973,14 @@ async function runEvaluation(options) {
15930
15973
  const message = error instanceof Error ? error.message : String(error);
15931
15974
  throw new Error(`Failed to create shared workspace: ${message}`);
15932
15975
  }
15976
+ if (suiteWorkspaceFile && sharedWorkspacePath) {
15977
+ const copiedWorkspaceFile = import_node_path40.default.join(sharedWorkspacePath, import_node_path40.default.basename(suiteWorkspaceFile));
15978
+ try {
15979
+ await (0, import_promises29.stat)(copiedWorkspaceFile);
15980
+ suiteWorkspaceFile = copiedWorkspaceFile;
15981
+ } catch {
15982
+ }
15983
+ }
15933
15984
  } else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
15934
15985
  sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
15935
15986
  await (0, import_promises29.mkdir)(sharedWorkspacePath, { recursive: true });
@@ -15976,6 +16027,7 @@ async function runEvaluation(options) {
15976
16027
  let beforeAllOutputAttached = false;
15977
16028
  let cumulativeBudgetCost = 0;
15978
16029
  let budgetExhausted = false;
16030
+ let failOnErrorTriggered = false;
15979
16031
  const promises = filteredEvalCases.map(
15980
16032
  (evalCase) => limit(async () => {
15981
16033
  const workerId = nextWorkerId++;
@@ -16014,6 +16066,37 @@ async function runEvaluation(options) {
16014
16066
  }
16015
16067
  return budgetResult;
16016
16068
  }
16069
+ if (failOnError === true && failOnErrorTriggered) {
16070
+ const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
16071
+ const haltResult = {
16072
+ timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
16073
+ testId: evalCase.id,
16074
+ dataset: evalCase.dataset,
16075
+ score: 0,
16076
+ hits: [],
16077
+ misses: [],
16078
+ answer: "",
16079
+ target: target.name,
16080
+ error: errorMsg,
16081
+ executionStatus: "execution_error",
16082
+ failureStage: "setup",
16083
+ failureReasonCode: "error_threshold_exceeded",
16084
+ executionError: { message: errorMsg, stage: "setup" }
16085
+ };
16086
+ if (onProgress) {
16087
+ await onProgress({
16088
+ workerId,
16089
+ testId: evalCase.id,
16090
+ status: "failed",
16091
+ completedAt: Date.now(),
16092
+ error: haltResult.error
16093
+ });
16094
+ }
16095
+ if (onResult) {
16096
+ await onResult(haltResult);
16097
+ }
16098
+ return haltResult;
16099
+ }
16017
16100
  if (onProgress) {
16018
16101
  await onProgress({
16019
16102
  workerId,
@@ -16066,6 +16149,9 @@ async function runEvaluation(options) {
16066
16149
  }
16067
16150
  }
16068
16151
  }
16152
+ if (failOnError === true && result.executionStatus === "execution_error") {
16153
+ failOnErrorTriggered = true;
16154
+ }
16069
16155
  if (beforeAllOutput && !beforeAllOutputAttached) {
16070
16156
  result = { ...result, beforeAllOutput };
16071
16157
  beforeAllOutputAttached = true;
@@ -16373,6 +16459,14 @@ async function runEvalCase(options) {
16373
16459
  "template_error"
16374
16460
  );
16375
16461
  }
16462
+ if (caseWorkspaceFile && workspacePath) {
16463
+ const copiedFile = import_node_path40.default.join(workspacePath, import_node_path40.default.basename(caseWorkspaceFile));
16464
+ try {
16465
+ await (0, import_promises29.stat)(copiedFile);
16466
+ caseWorkspaceFile = copiedFile;
16467
+ } catch {
16468
+ }
16469
+ }
16376
16470
  }
16377
16471
  if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
16378
16472
  workspacePath = getWorkspacePath(evalRunId, evalCase.id);
@@ -16882,8 +16976,8 @@ async function runEvaluatorsForCase(options) {
16882
16976
  workspacePath
16883
16977
  });
16884
16978
  }
16885
- const evaluatorKind = evalCase.evaluator ?? "llm_judge";
16886
- const activeEvaluator = evaluators[evaluatorKind] ?? evaluators.llm_judge;
16979
+ const evaluatorKind = evalCase.evaluator ?? "llm-judge";
16980
+ const activeEvaluator = evaluators[evaluatorKind] ?? evaluators["llm-judge"];
16887
16981
  if (!activeEvaluator) {
16888
16982
  throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
16889
16983
  }
@@ -16966,25 +17060,24 @@ async function runEvaluatorList(options) {
16966
17060
  availableTargets,
16967
17061
  agentTimeoutMs,
16968
17062
  evalFileDir,
16969
- llmJudge: evaluatorRegistry.llm_judge,
17063
+ llmJudge: evaluatorRegistry["llm-judge"],
16970
17064
  registry: typeRegistry
16971
17065
  };
16972
17066
  for (const evaluatorConfig of evaluators ?? []) {
16973
17067
  try {
16974
17068
  const evaluatorInstance = await typeRegistry.create(evaluatorConfig, dispatchContext);
16975
17069
  const score2 = await evaluatorInstance.evaluate(evalContext);
16976
- const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
16977
17070
  const weight = evaluatorConfig.weight ?? 1;
16978
17071
  scored.push({
16979
17072
  score: score2,
16980
17073
  name: evaluatorConfig.name,
16981
- type: resultType,
17074
+ type: evaluatorConfig.type,
16982
17075
  weight,
16983
17076
  ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
16984
17077
  });
16985
17078
  scores.push({
16986
17079
  name: evaluatorConfig.name,
16987
- type: resultType,
17080
+ type: evaluatorConfig.type,
16988
17081
  score: score2.score,
16989
17082
  weight,
16990
17083
  verdict: score2.verdict,
@@ -17006,18 +17099,17 @@ async function runEvaluatorList(options) {
17006
17099
  expectedAspectCount: 1,
17007
17100
  reasoning: message
17008
17101
  };
17009
- const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
17010
17102
  const weight = evaluatorConfig.weight ?? 1;
17011
17103
  scored.push({
17012
17104
  score: fallbackScore,
17013
17105
  name: evaluatorConfig.name ?? "unknown",
17014
- type: resultType ?? "llm_judge",
17106
+ type: evaluatorConfig.type ?? "llm-judge",
17015
17107
  weight,
17016
17108
  ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
17017
17109
  });
17018
17110
  scores.push({
17019
17111
  name: evaluatorConfig.name ?? "unknown",
17020
- type: resultType ?? "llm_judge",
17112
+ type: evaluatorConfig.type ?? "llm-judge",
17021
17113
  score: 0,
17022
17114
  weight,
17023
17115
  verdict: "fail",
@@ -17078,7 +17170,7 @@ function filterEvalCases(evalCases, filter) {
17078
17170
  return evalCases.filter((evalCase) => import_micromatch4.default.isMatch(evalCase.id, filter));
17079
17171
  }
17080
17172
  function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
17081
- const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
17173
+ const llmJudge = overrides?.["llm-judge"] ?? new LlmJudgeEvaluator({
17082
17174
  resolveJudgeProvider: async (context2) => {
17083
17175
  if (context2.judgeProvider) {
17084
17176
  return context2.judgeProvider;
@@ -17088,7 +17180,7 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
17088
17180
  });
17089
17181
  return {
17090
17182
  ...overrides,
17091
- llm_judge: llmJudge
17183
+ "llm-judge": llmJudge
17092
17184
  };
17093
17185
  }
17094
17186
  async function invokeProvider(provider, options) {
@@ -17348,12 +17440,7 @@ async function evaluate(config) {
17348
17440
  };
17349
17441
  }
17350
17442
  function mapAssertionType(type) {
17351
- switch (type) {
17352
- case "code_judge":
17353
- return "code";
17354
- default:
17355
- return type;
17356
- }
17443
+ return type.replace(/_/g, "-");
17357
17444
  }
17358
17445
  function computeSummary(results, durationMs) {
17359
17446
  const total = results.length;
@@ -18132,6 +18219,7 @@ function createAgentKernel() {
18132
18219
  executeWorkspaceScript,
18133
18220
  explorationRatio,
18134
18221
  extractCacheConfig,
18222
+ extractFailOnError,
18135
18223
  extractJsonBlob,
18136
18224
  extractTargetFromSuite,
18137
18225
  extractTargetsFromSuite,