@agentv/core 2.13.0 → 2.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-JHER2LQ5.js → chunk-N55K52OO.js} +15 -15
- package/dist/chunk-N55K52OO.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +25 -24
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +12 -11
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +184 -158
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +40 -40
- package/dist/index.d.ts +40 -40
- package/dist/index.js +172 -146
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-JHER2LQ5.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1612,27 +1612,27 @@ function isTestMessage(value) {
|
|
|
1612
1612
|
return false;
|
|
1613
1613
|
}
|
|
1614
1614
|
var EVALUATOR_KIND_VALUES = [
|
|
1615
|
-
"
|
|
1616
|
-
"
|
|
1615
|
+
"code-judge",
|
|
1616
|
+
"llm-judge",
|
|
1617
1617
|
"rubric",
|
|
1618
1618
|
"composite",
|
|
1619
|
-
"
|
|
1620
|
-
"
|
|
1619
|
+
"tool-trajectory",
|
|
1620
|
+
"field-accuracy",
|
|
1621
1621
|
"latency",
|
|
1622
1622
|
"cost",
|
|
1623
|
-
"
|
|
1624
|
-
"
|
|
1625
|
-
"
|
|
1623
|
+
"token-usage",
|
|
1624
|
+
"execution-metrics",
|
|
1625
|
+
"agent-judge",
|
|
1626
1626
|
"contains",
|
|
1627
|
-
"
|
|
1628
|
-
"
|
|
1627
|
+
"contains-any",
|
|
1628
|
+
"contains-all",
|
|
1629
1629
|
"icontains",
|
|
1630
|
-
"
|
|
1631
|
-
"
|
|
1632
|
-
"
|
|
1633
|
-
"
|
|
1630
|
+
"icontains-any",
|
|
1631
|
+
"icontains-all",
|
|
1632
|
+
"starts-with",
|
|
1633
|
+
"ends-with",
|
|
1634
1634
|
"regex",
|
|
1635
|
-
"
|
|
1635
|
+
"is-json",
|
|
1636
1636
|
"equals",
|
|
1637
1637
|
"rubrics"
|
|
1638
1638
|
];
|
|
@@ -2301,6 +2301,9 @@ function validateTemplateVariables(content, source) {
|
|
|
2301
2301
|
// src/evaluation/loaders/evaluator-parser.ts
|
|
2302
2302
|
var ANSI_YELLOW4 = "\x1B[33m";
|
|
2303
2303
|
var ANSI_RESET4 = "\x1B[0m";
|
|
2304
|
+
function normalizeEvaluatorType(type) {
|
|
2305
|
+
return type.replace(/_/g, "-");
|
|
2306
|
+
}
|
|
2304
2307
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
2305
2308
|
const execution = rawEvalCase.execution;
|
|
2306
2309
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
@@ -2331,7 +2334,8 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2331
2334
|
continue;
|
|
2332
2335
|
}
|
|
2333
2336
|
const rawName = asString(rawEvaluator.name);
|
|
2334
|
-
const
|
|
2337
|
+
const rawType = rawEvaluator.type;
|
|
2338
|
+
const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
|
|
2335
2339
|
const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
|
|
2336
2340
|
if (typeof typeValue !== "string") {
|
|
2337
2341
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
@@ -2364,25 +2368,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2364
2368
|
});
|
|
2365
2369
|
continue;
|
|
2366
2370
|
}
|
|
2367
|
-
if (typeValue === "
|
|
2371
|
+
if (typeValue === "code-judge") {
|
|
2368
2372
|
let command;
|
|
2369
2373
|
const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
|
|
2370
2374
|
if (typeof rawCommand === "string") {
|
|
2371
2375
|
const trimmed = rawCommand.trim();
|
|
2372
2376
|
if (trimmed.length === 0) {
|
|
2373
2377
|
throw new Error(
|
|
2374
|
-
`Invalid
|
|
2378
|
+
`Invalid code-judge command for evaluator '${name}' in '${evalId}': command cannot be empty`
|
|
2375
2379
|
);
|
|
2376
2380
|
}
|
|
2377
2381
|
command = parseCommandToArgv(trimmed);
|
|
2378
2382
|
} else {
|
|
2379
2383
|
command = asStringArray(
|
|
2380
2384
|
rawCommand,
|
|
2381
|
-
`
|
|
2385
|
+
`code-judge command for evaluator '${name}' in '${evalId}'`
|
|
2382
2386
|
);
|
|
2383
2387
|
}
|
|
2384
2388
|
if (!command) {
|
|
2385
|
-
logWarning2(`Skipping
|
|
2389
|
+
logWarning2(`Skipping code-judge evaluator '${name}' in '${evalId}': missing command`);
|
|
2386
2390
|
continue;
|
|
2387
2391
|
}
|
|
2388
2392
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
@@ -2443,7 +2447,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2443
2447
|
}
|
|
2444
2448
|
evaluators.push({
|
|
2445
2449
|
name,
|
|
2446
|
-
type: "code",
|
|
2450
|
+
type: "code-judge",
|
|
2447
2451
|
command,
|
|
2448
2452
|
cwd,
|
|
2449
2453
|
resolvedCwd,
|
|
@@ -2469,7 +2473,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2469
2473
|
continue;
|
|
2470
2474
|
}
|
|
2471
2475
|
const aggregatorType = asString(rawAggregator.type);
|
|
2472
|
-
if (aggregatorType !== "weighted_average" && aggregatorType !== "
|
|
2476
|
+
if (aggregatorType !== "weighted_average" && aggregatorType !== "code-judge" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
|
|
2473
2477
|
logWarning2(
|
|
2474
2478
|
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
2475
2479
|
);
|
|
@@ -2518,16 +2522,16 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2518
2522
|
type: "weighted_average",
|
|
2519
2523
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
2520
2524
|
};
|
|
2521
|
-
} else if (aggregatorType === "
|
|
2525
|
+
} else if (aggregatorType === "code-judge") {
|
|
2522
2526
|
const aggregatorPath = asString(rawAggregator.path);
|
|
2523
2527
|
if (!aggregatorPath) {
|
|
2524
2528
|
logWarning2(
|
|
2525
|
-
`Skipping composite evaluator '${name}' in '${evalId}':
|
|
2529
|
+
`Skipping composite evaluator '${name}' in '${evalId}': code-judge aggregator missing path`
|
|
2526
2530
|
);
|
|
2527
2531
|
continue;
|
|
2528
2532
|
}
|
|
2529
2533
|
aggregator = {
|
|
2530
|
-
type: "
|
|
2534
|
+
type: "code-judge",
|
|
2531
2535
|
path: aggregatorPath,
|
|
2532
2536
|
cwd: searchRoots[0]
|
|
2533
2537
|
};
|
|
@@ -2553,7 +2557,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2553
2557
|
}
|
|
2554
2558
|
}
|
|
2555
2559
|
aggregator = {
|
|
2556
|
-
type: "
|
|
2560
|
+
type: "llm-judge",
|
|
2557
2561
|
...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
|
|
2558
2562
|
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
2559
2563
|
};
|
|
@@ -2571,11 +2575,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2571
2575
|
});
|
|
2572
2576
|
continue;
|
|
2573
2577
|
}
|
|
2574
|
-
if (typeValue === "
|
|
2578
|
+
if (typeValue === "tool-trajectory") {
|
|
2575
2579
|
const mode = asString(rawEvaluator.mode);
|
|
2576
2580
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact" && mode !== "subset" && mode !== "superset") {
|
|
2577
2581
|
logWarning2(
|
|
2578
|
-
`Skipping
|
|
2582
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
|
|
2579
2583
|
);
|
|
2580
2584
|
continue;
|
|
2581
2585
|
}
|
|
@@ -2584,7 +2588,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2584
2588
|
if (rawMinimums !== void 0) {
|
|
2585
2589
|
if (!isJsonObject2(rawMinimums)) {
|
|
2586
2590
|
logWarning2(
|
|
2587
|
-
`Skipping
|
|
2591
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
|
|
2588
2592
|
);
|
|
2589
2593
|
continue;
|
|
2590
2594
|
}
|
|
@@ -2610,7 +2614,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2610
2614
|
argsMatch2 = rawArgsMatch;
|
|
2611
2615
|
} else {
|
|
2612
2616
|
logWarning2(
|
|
2613
|
-
`Invalid args_match '${rawArgsMatch}' for
|
|
2617
|
+
`Invalid args_match '${rawArgsMatch}' for tool-trajectory evaluator '${name}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
|
|
2614
2618
|
);
|
|
2615
2619
|
}
|
|
2616
2620
|
}
|
|
@@ -2620,7 +2624,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2620
2624
|
if (rawExpected !== void 0) {
|
|
2621
2625
|
if (!Array.isArray(rawExpected)) {
|
|
2622
2626
|
logWarning2(
|
|
2623
|
-
`Skipping
|
|
2627
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': expected must be an array`
|
|
2624
2628
|
);
|
|
2625
2629
|
continue;
|
|
2626
2630
|
}
|
|
@@ -2666,13 +2670,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2666
2670
|
}
|
|
2667
2671
|
if (mode === "any_order" && !minimums) {
|
|
2668
2672
|
logWarning2(
|
|
2669
|
-
`Skipping
|
|
2673
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
|
|
2670
2674
|
);
|
|
2671
2675
|
continue;
|
|
2672
2676
|
}
|
|
2673
2677
|
if ((mode === "in_order" || mode === "exact" || mode === "subset" || mode === "superset") && !expected) {
|
|
2674
2678
|
logWarning2(
|
|
2675
|
-
`Skipping
|
|
2679
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
|
|
2676
2680
|
);
|
|
2677
2681
|
continue;
|
|
2678
2682
|
}
|
|
@@ -2680,7 +2684,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2680
2684
|
const required2 = parseRequired(rawEvaluator.required);
|
|
2681
2685
|
const config2 = {
|
|
2682
2686
|
name,
|
|
2683
|
-
type: "
|
|
2687
|
+
type: "tool-trajectory",
|
|
2684
2688
|
mode,
|
|
2685
2689
|
...minimums ? { minimums } : {},
|
|
2686
2690
|
...expected ? { expected } : {},
|
|
@@ -2692,17 +2696,17 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2692
2696
|
evaluators.push(config2);
|
|
2693
2697
|
continue;
|
|
2694
2698
|
}
|
|
2695
|
-
if (typeValue === "
|
|
2699
|
+
if (typeValue === "field-accuracy") {
|
|
2696
2700
|
const rawFields = rawEvaluator.fields;
|
|
2697
2701
|
if (!Array.isArray(rawFields)) {
|
|
2698
2702
|
logWarning2(
|
|
2699
|
-
`Skipping
|
|
2703
|
+
`Skipping field-accuracy evaluator '${name}' in '${evalId}': missing fields array`
|
|
2700
2704
|
);
|
|
2701
2705
|
continue;
|
|
2702
2706
|
}
|
|
2703
2707
|
if (rawFields.length === 0) {
|
|
2704
2708
|
logWarning2(
|
|
2705
|
-
`Skipping
|
|
2709
|
+
`Skipping field-accuracy evaluator '${name}' in '${evalId}': fields array is empty`
|
|
2706
2710
|
);
|
|
2707
2711
|
continue;
|
|
2708
2712
|
}
|
|
@@ -2710,7 +2714,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2710
2714
|
for (const rawField of rawFields) {
|
|
2711
2715
|
if (!isJsonObject2(rawField)) {
|
|
2712
2716
|
logWarning2(
|
|
2713
|
-
`Skipping invalid field entry in
|
|
2717
|
+
`Skipping invalid field entry in field-accuracy evaluator '${name}' (expected object)`
|
|
2714
2718
|
);
|
|
2715
2719
|
continue;
|
|
2716
2720
|
}
|
|
@@ -2718,13 +2722,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2718
2722
|
const match = asString(rawField.match);
|
|
2719
2723
|
if (!fieldPath) {
|
|
2720
2724
|
logWarning2(
|
|
2721
|
-
`Skipping field without path in
|
|
2725
|
+
`Skipping field without path in field-accuracy evaluator '${name}' in '${evalId}'`
|
|
2722
2726
|
);
|
|
2723
2727
|
continue;
|
|
2724
2728
|
}
|
|
2725
2729
|
if (!match || !isValidFieldMatchType(match)) {
|
|
2726
2730
|
logWarning2(
|
|
2727
|
-
`Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a
|
|
2731
|
+
`Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code-judge evaluator.`
|
|
2728
2732
|
);
|
|
2729
2733
|
continue;
|
|
2730
2734
|
}
|
|
@@ -2741,7 +2745,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2741
2745
|
}
|
|
2742
2746
|
if (fields.length === 0) {
|
|
2743
2747
|
logWarning2(
|
|
2744
|
-
`Skipping
|
|
2748
|
+
`Skipping field-accuracy evaluator '${name}' in '${evalId}': no valid fields found`
|
|
2745
2749
|
);
|
|
2746
2750
|
continue;
|
|
2747
2751
|
}
|
|
@@ -2751,7 +2755,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2751
2755
|
const required2 = parseRequired(rawEvaluator.required);
|
|
2752
2756
|
evaluators.push({
|
|
2753
2757
|
name,
|
|
2754
|
-
type: "
|
|
2758
|
+
type: "field-accuracy",
|
|
2755
2759
|
fields,
|
|
2756
2760
|
...validAggregation ? { aggregation: validAggregation } : {},
|
|
2757
2761
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
@@ -2800,7 +2804,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2800
2804
|
});
|
|
2801
2805
|
continue;
|
|
2802
2806
|
}
|
|
2803
|
-
if (typeValue === "
|
|
2807
|
+
if (typeValue === "token-usage") {
|
|
2804
2808
|
const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
|
|
2805
2809
|
const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
|
|
2806
2810
|
const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
|
|
@@ -2814,7 +2818,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2814
2818
|
if (raw === void 0) continue;
|
|
2815
2819
|
if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
|
|
2816
2820
|
logWarning2(
|
|
2817
|
-
`Skipping
|
|
2821
|
+
`Skipping token-usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
|
|
2818
2822
|
);
|
|
2819
2823
|
continue;
|
|
2820
2824
|
}
|
|
@@ -2822,7 +2826,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2822
2826
|
}
|
|
2823
2827
|
if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
|
|
2824
2828
|
logWarning2(
|
|
2825
|
-
`Skipping
|
|
2829
|
+
`Skipping token-usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
|
|
2826
2830
|
);
|
|
2827
2831
|
continue;
|
|
2828
2832
|
}
|
|
@@ -2830,7 +2834,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2830
2834
|
const required2 = parseRequired(rawEvaluator.required);
|
|
2831
2835
|
evaluators.push({
|
|
2832
2836
|
name,
|
|
2833
|
-
type: "
|
|
2837
|
+
type: "token-usage",
|
|
2834
2838
|
...validLimits,
|
|
2835
2839
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2836
2840
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
@@ -2838,7 +2842,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2838
2842
|
});
|
|
2839
2843
|
continue;
|
|
2840
2844
|
}
|
|
2841
|
-
if (typeValue === "
|
|
2845
|
+
if (typeValue === "execution-metrics") {
|
|
2842
2846
|
const maxToolCalls = rawEvaluator.max_tool_calls ?? rawEvaluator.maxToolCalls;
|
|
2843
2847
|
const maxLlmCalls = rawEvaluator.max_llm_calls ?? rawEvaluator.maxLlmCalls;
|
|
2844
2848
|
const maxTokens = rawEvaluator.max_tokens ?? rawEvaluator.maxTokens;
|
|
@@ -2861,7 +2865,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2861
2865
|
if (raw === void 0) continue;
|
|
2862
2866
|
if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
|
|
2863
2867
|
logWarning2(
|
|
2864
|
-
`Skipping
|
|
2868
|
+
`Skipping execution-metrics evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
|
|
2865
2869
|
);
|
|
2866
2870
|
hasError = true;
|
|
2867
2871
|
break;
|
|
@@ -2874,7 +2878,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2874
2878
|
const hasThreshold = validThresholds.max_tool_calls !== void 0 || validThresholds.max_llm_calls !== void 0 || validThresholds.max_tokens !== void 0 || validThresholds.max_cost_usd !== void 0 || validThresholds.max_duration_ms !== void 0 || validThresholds.target_exploration_ratio !== void 0;
|
|
2875
2879
|
if (!hasThreshold) {
|
|
2876
2880
|
logWarning2(
|
|
2877
|
-
`Skipping
|
|
2881
|
+
`Skipping execution-metrics evaluator '${name}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
|
|
2878
2882
|
);
|
|
2879
2883
|
continue;
|
|
2880
2884
|
}
|
|
@@ -2882,7 +2886,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2882
2886
|
const required2 = parseRequired(rawEvaluator.required);
|
|
2883
2887
|
evaluators.push({
|
|
2884
2888
|
name,
|
|
2885
|
-
type: "
|
|
2889
|
+
type: "execution-metrics",
|
|
2886
2890
|
...validThresholds,
|
|
2887
2891
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2888
2892
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
@@ -2890,13 +2894,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2890
2894
|
});
|
|
2891
2895
|
continue;
|
|
2892
2896
|
}
|
|
2893
|
-
if (typeValue === "
|
|
2897
|
+
if (typeValue === "agent-judge") {
|
|
2894
2898
|
const rawMaxSteps = rawEvaluator.max_steps ?? rawEvaluator.maxSteps;
|
|
2895
2899
|
let maxSteps;
|
|
2896
2900
|
if (rawMaxSteps !== void 0) {
|
|
2897
2901
|
if (typeof rawMaxSteps !== "number" || !Number.isInteger(rawMaxSteps) || rawMaxSteps < 1 || rawMaxSteps > 50) {
|
|
2898
2902
|
logWarning2(
|
|
2899
|
-
`Skipping
|
|
2903
|
+
`Skipping agent-judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`
|
|
2900
2904
|
);
|
|
2901
2905
|
continue;
|
|
2902
2906
|
}
|
|
@@ -2907,7 +2911,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2907
2911
|
if (rawTemperature !== void 0) {
|
|
2908
2912
|
if (typeof rawTemperature !== "number" || rawTemperature < 0 || rawTemperature > 2) {
|
|
2909
2913
|
logWarning2(
|
|
2910
|
-
`Skipping
|
|
2914
|
+
`Skipping agent-judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`
|
|
2911
2915
|
);
|
|
2912
2916
|
continue;
|
|
2913
2917
|
}
|
|
@@ -2930,7 +2934,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2930
2934
|
const required2 = parseRequired(rawEvaluator.required);
|
|
2931
2935
|
evaluators.push({
|
|
2932
2936
|
name,
|
|
2933
|
-
type: "
|
|
2937
|
+
type: "agent-judge",
|
|
2934
2938
|
...agentPrompt ? { prompt: agentPrompt } : {},
|
|
2935
2939
|
...agentPromptPath ? { promptPath: agentPromptPath, resolvedPromptPath: agentPromptPath } : {},
|
|
2936
2940
|
...agentParsedRubrics && agentParsedRubrics.length > 0 ? { rubrics: agentParsedRubrics } : {},
|
|
@@ -2961,7 +2965,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2961
2965
|
});
|
|
2962
2966
|
continue;
|
|
2963
2967
|
}
|
|
2964
|
-
if (typeValue === "
|
|
2968
|
+
if (typeValue === "contains-any" || typeValue === "contains-all") {
|
|
2965
2969
|
const value = asStringArrayStrict(rawEvaluator.value);
|
|
2966
2970
|
if (!value || value.length === 0) {
|
|
2967
2971
|
logWarning2(
|
|
@@ -2999,7 +3003,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2999
3003
|
});
|
|
3000
3004
|
continue;
|
|
3001
3005
|
}
|
|
3002
|
-
if (typeValue === "
|
|
3006
|
+
if (typeValue === "icontains-any" || typeValue === "icontains-all") {
|
|
3003
3007
|
const value = asStringArrayStrict(rawEvaluator.value);
|
|
3004
3008
|
if (!value || value.length === 0) {
|
|
3005
3009
|
logWarning2(
|
|
@@ -3019,7 +3023,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3019
3023
|
});
|
|
3020
3024
|
continue;
|
|
3021
3025
|
}
|
|
3022
|
-
if (typeValue === "
|
|
3026
|
+
if (typeValue === "starts-with" || typeValue === "ends-with") {
|
|
3023
3027
|
const value = asString(rawEvaluator.value);
|
|
3024
3028
|
if (!value) {
|
|
3025
3029
|
logWarning2(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
|
|
@@ -3057,12 +3061,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3057
3061
|
});
|
|
3058
3062
|
continue;
|
|
3059
3063
|
}
|
|
3060
|
-
if (typeValue === "
|
|
3064
|
+
if (typeValue === "is-json") {
|
|
3061
3065
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3062
3066
|
const required2 = parseRequired(rawEvaluator.required);
|
|
3063
3067
|
evaluators.push({
|
|
3064
3068
|
name,
|
|
3065
|
-
type: "
|
|
3069
|
+
type: "is-json",
|
|
3066
3070
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3067
3071
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3068
3072
|
...negate !== void 0 ? { negate } : {}
|
|
@@ -3110,7 +3114,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3110
3114
|
const required2 = parseRequired(rawEvaluator.required);
|
|
3111
3115
|
evaluators.push({
|
|
3112
3116
|
name,
|
|
3113
|
-
type: "
|
|
3117
|
+
type: "llm-judge",
|
|
3114
3118
|
rubrics: parsedCriteria,
|
|
3115
3119
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3116
3120
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
@@ -3177,7 +3181,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3177
3181
|
const required2 = parseRequired(rawEvaluator.required);
|
|
3178
3182
|
evaluators.push({
|
|
3179
3183
|
name,
|
|
3180
|
-
type: "
|
|
3184
|
+
type: "llm-judge",
|
|
3181
3185
|
rubrics: parsedRubrics,
|
|
3182
3186
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3183
3187
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
@@ -3209,7 +3213,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3209
3213
|
const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
|
|
3210
3214
|
evaluators.push({
|
|
3211
3215
|
name,
|
|
3212
|
-
type: "
|
|
3216
|
+
type: "llm-judge",
|
|
3213
3217
|
prompt,
|
|
3214
3218
|
promptPath,
|
|
3215
3219
|
...promptPath ? { resolvedPromptPath: promptPath } : {},
|
|
@@ -3225,15 +3229,15 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3225
3229
|
}
|
|
3226
3230
|
var ASSERTION_TYPES = /* @__PURE__ */ new Set([
|
|
3227
3231
|
"contains",
|
|
3228
|
-
"
|
|
3229
|
-
"
|
|
3232
|
+
"contains-any",
|
|
3233
|
+
"contains-all",
|
|
3230
3234
|
"icontains",
|
|
3231
|
-
"
|
|
3232
|
-
"
|
|
3233
|
-
"
|
|
3234
|
-
"
|
|
3235
|
+
"icontains-any",
|
|
3236
|
+
"icontains-all",
|
|
3237
|
+
"starts-with",
|
|
3238
|
+
"ends-with",
|
|
3235
3239
|
"regex",
|
|
3236
|
-
"
|
|
3240
|
+
"is-json",
|
|
3237
3241
|
"equals",
|
|
3238
3242
|
"rubrics"
|
|
3239
3243
|
]);
|
|
@@ -3246,24 +3250,24 @@ function generateAssertionName(typeValue, rawEvaluator) {
|
|
|
3246
3250
|
switch (typeValue) {
|
|
3247
3251
|
case "contains":
|
|
3248
3252
|
return value ? `contains-${value}` : "contains";
|
|
3249
|
-
case "
|
|
3250
|
-
return arrayValue ? `
|
|
3251
|
-
case "
|
|
3252
|
-
return arrayValue ? `
|
|
3253
|
+
case "contains-any":
|
|
3254
|
+
return arrayValue ? `contains-any-${arrayValue.length}` : "contains-any";
|
|
3255
|
+
case "contains-all":
|
|
3256
|
+
return arrayValue ? `contains-all-${arrayValue.length}` : "contains-all";
|
|
3253
3257
|
case "icontains":
|
|
3254
3258
|
return value ? `icontains-${value}` : "icontains";
|
|
3255
|
-
case "
|
|
3256
|
-
return arrayValue ? `
|
|
3257
|
-
case "
|
|
3258
|
-
return arrayValue ? `
|
|
3259
|
-
case "
|
|
3260
|
-
return value ? `
|
|
3261
|
-
case "
|
|
3262
|
-
return value ? `
|
|
3259
|
+
case "icontains-any":
|
|
3260
|
+
return arrayValue ? `icontains-any-${arrayValue.length}` : "icontains-any";
|
|
3261
|
+
case "icontains-all":
|
|
3262
|
+
return arrayValue ? `icontains-all-${arrayValue.length}` : "icontains-all";
|
|
3263
|
+
case "starts-with":
|
|
3264
|
+
return value ? `starts-with-${value}` : "starts-with";
|
|
3265
|
+
case "ends-with":
|
|
3266
|
+
return value ? `ends-with-${value}` : "ends-with";
|
|
3263
3267
|
case "regex":
|
|
3264
3268
|
return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
|
|
3265
|
-
case "
|
|
3266
|
-
return "
|
|
3269
|
+
case "is-json":
|
|
3270
|
+
return "is-json";
|
|
3267
3271
|
case "equals":
|
|
3268
3272
|
return value ? `equals-${value}` : "equals";
|
|
3269
3273
|
case "rubrics":
|
|
@@ -3276,8 +3280,9 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
3276
3280
|
if (typeof candidate !== "string") {
|
|
3277
3281
|
return void 0;
|
|
3278
3282
|
}
|
|
3279
|
-
|
|
3280
|
-
|
|
3283
|
+
const normalized = normalizeEvaluatorType(candidate);
|
|
3284
|
+
if (isEvaluatorKind(normalized)) {
|
|
3285
|
+
return normalized;
|
|
3281
3286
|
}
|
|
3282
3287
|
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
3283
3288
|
return void 0;
|
|
@@ -3323,6 +3328,16 @@ function parseCommandToArgv(command) {
|
|
|
3323
3328
|
function isJsonObject2(value) {
|
|
3324
3329
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
3325
3330
|
}
|
|
3331
|
+
var CRITERIA_CONSUMER_TYPES = /* @__PURE__ */ new Set(["llm-judge", "agent-judge", "code-judge"]);
|
|
3332
|
+
function warnUnconsumedCriteria(criteria, evaluators, testId) {
|
|
3333
|
+
if (!criteria?.trim() || !evaluators || evaluators.length === 0) return;
|
|
3334
|
+
const hasConsumer = evaluators.some((e) => CRITERIA_CONSUMER_TYPES.has(e.type));
|
|
3335
|
+
if (!hasConsumer) {
|
|
3336
|
+
logWarning2(
|
|
3337
|
+
`Test '${testId}': criteria is defined but no evaluator in assert will evaluate it. Add 'type: llm-judge' to assert, or remove criteria if it is documentation-only.`
|
|
3338
|
+
);
|
|
3339
|
+
}
|
|
3340
|
+
}
|
|
3326
3341
|
function logWarning2(message, details) {
|
|
3327
3342
|
if (details && details.length > 0) {
|
|
3328
3343
|
const detailBlock = details.join("\n");
|
|
@@ -3572,7 +3587,7 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
3572
3587
|
}
|
|
3573
3588
|
return {
|
|
3574
3589
|
name: "rubric",
|
|
3575
|
-
type: "
|
|
3590
|
+
type: "llm-judge",
|
|
3576
3591
|
rubrics: rubricItems
|
|
3577
3592
|
};
|
|
3578
3593
|
}
|
|
@@ -3957,7 +3972,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
3957
3972
|
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
3958
3973
|
const fallbackDataset = import_node_path6.default.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
3959
3974
|
const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
|
|
3960
|
-
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "
|
|
3975
|
+
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-judge";
|
|
3961
3976
|
const globalExecution = sidecar.execution;
|
|
3962
3977
|
if (verbose) {
|
|
3963
3978
|
console.log(`
|
|
@@ -4045,6 +4060,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4045
4060
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
4046
4061
|
}
|
|
4047
4062
|
}
|
|
4063
|
+
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
4048
4064
|
const userFilePaths = [];
|
|
4049
4065
|
for (const segment of inputSegments) {
|
|
4050
4066
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -4437,7 +4453,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4437
4453
|
const fallbackDataset = import_node_path8.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
4438
4454
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
4439
4455
|
const rawTestcases = resolveTests(suite);
|
|
4440
|
-
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "
|
|
4456
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-judge";
|
|
4441
4457
|
const evalFileDir = import_node_path8.default.dirname(absoluteTestPath);
|
|
4442
4458
|
let expandedTestcases;
|
|
4443
4459
|
if (typeof rawTestcases === "string") {
|
|
@@ -4534,6 +4550,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4534
4550
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
4535
4551
|
}
|
|
4536
4552
|
}
|
|
4553
|
+
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
4537
4554
|
const userFilePaths = [];
|
|
4538
4555
|
for (const segment of inputSegments) {
|
|
4539
4556
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -11726,7 +11743,7 @@ function toCamelCaseDeep(obj) {
|
|
|
11726
11743
|
// src/evaluation/evaluators/code-evaluator.ts
|
|
11727
11744
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
11728
11745
|
var CodeEvaluator = class {
|
|
11729
|
-
kind = "code";
|
|
11746
|
+
kind = "code-judge";
|
|
11730
11747
|
command;
|
|
11731
11748
|
cwd;
|
|
11732
11749
|
agentTimeoutMs;
|
|
@@ -11963,7 +11980,7 @@ var scoreRangeEvaluationSchema = import_zod4.z.object({
|
|
|
11963
11980
|
overall_reasoning: import_zod4.z.string().describe("Overall assessment summary (1-2 sentences)").optional()
|
|
11964
11981
|
});
|
|
11965
11982
|
var LlmJudgeEvaluator = class {
|
|
11966
|
-
kind = "
|
|
11983
|
+
kind = "llm-judge";
|
|
11967
11984
|
resolveJudgeProvider;
|
|
11968
11985
|
maxOutputTokens;
|
|
11969
11986
|
temperature;
|
|
@@ -11980,7 +11997,7 @@ var LlmJudgeEvaluator = class {
|
|
|
11980
11997
|
throw new Error("No judge provider available for LLM grading");
|
|
11981
11998
|
}
|
|
11982
11999
|
const config = context2.evaluator;
|
|
11983
|
-
if (config?.type === "
|
|
12000
|
+
if (config?.type === "llm-judge" && config.rubrics && config.rubrics.length > 0) {
|
|
11984
12001
|
return this.evaluateWithRubrics(context2, judgeProvider, config.rubrics);
|
|
11985
12002
|
}
|
|
11986
12003
|
return this.evaluateFreeform(context2, judgeProvider);
|
|
@@ -12054,7 +12071,7 @@ ${context2.fileChanges}`;
|
|
|
12054
12071
|
async evaluateWithRubrics(context2, judgeProvider, rubrics) {
|
|
12055
12072
|
if (!rubrics || rubrics.length === 0) {
|
|
12056
12073
|
throw new Error(
|
|
12057
|
-
`No rubrics found for evaluator "${context2.evaluator?.name ?? "
|
|
12074
|
+
`No rubrics found for evaluator "${context2.evaluator?.name ?? "llm-judge"}". Run "agentv generate rubrics" first.`
|
|
12058
12075
|
);
|
|
12059
12076
|
}
|
|
12060
12077
|
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
@@ -12390,9 +12407,9 @@ var CompositeEvaluator = class {
|
|
|
12390
12407
|
async aggregate(results, context2) {
|
|
12391
12408
|
const aggregator = this.config.aggregator;
|
|
12392
12409
|
switch (aggregator.type) {
|
|
12393
|
-
case "
|
|
12410
|
+
case "code-judge":
|
|
12394
12411
|
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
12395
|
-
case "
|
|
12412
|
+
case "llm-judge":
|
|
12396
12413
|
return this.runLlmAggregator(results, context2, aggregator);
|
|
12397
12414
|
case "threshold":
|
|
12398
12415
|
return this.runThreshold(results, aggregator.threshold);
|
|
@@ -12535,7 +12552,7 @@ var CompositeEvaluator = class {
|
|
|
12535
12552
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
12536
12553
|
reasoning,
|
|
12537
12554
|
evaluatorRawRequest: {
|
|
12538
|
-
aggregator: "
|
|
12555
|
+
aggregator: "code-judge",
|
|
12539
12556
|
script: scriptPath
|
|
12540
12557
|
},
|
|
12541
12558
|
scores
|
|
@@ -12550,7 +12567,7 @@ var CompositeEvaluator = class {
|
|
|
12550
12567
|
expectedAspectCount: 1,
|
|
12551
12568
|
reasoning: message,
|
|
12552
12569
|
evaluatorRawRequest: {
|
|
12553
|
-
aggregator: "
|
|
12570
|
+
aggregator: "code-judge",
|
|
12554
12571
|
script: scriptPath,
|
|
12555
12572
|
error: message
|
|
12556
12573
|
},
|
|
@@ -12581,7 +12598,7 @@ var CompositeEvaluator = class {
|
|
|
12581
12598
|
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
12582
12599
|
const systemPrompt = buildOutputSchema();
|
|
12583
12600
|
const evaluatorRawRequest = {
|
|
12584
|
-
aggregator: "
|
|
12601
|
+
aggregator: "llm-judge",
|
|
12585
12602
|
userPrompt,
|
|
12586
12603
|
systemPrompt,
|
|
12587
12604
|
target: judgeProvider.targetName
|
|
@@ -12693,7 +12710,7 @@ var CostEvaluator = class {
|
|
|
12693
12710
|
|
|
12694
12711
|
// src/evaluation/evaluators/execution-metrics.ts
|
|
12695
12712
|
var ExecutionMetricsEvaluator = class {
|
|
12696
|
-
kind = "
|
|
12713
|
+
kind = "execution-metrics";
|
|
12697
12714
|
config;
|
|
12698
12715
|
constructor(options) {
|
|
12699
12716
|
this.config = options.config;
|
|
@@ -12719,7 +12736,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12719
12736
|
expectedAspectCount: 1,
|
|
12720
12737
|
reasoning: "Execution metrics not available - no trace summary provided",
|
|
12721
12738
|
evaluatorRawRequest: {
|
|
12722
|
-
type: "
|
|
12739
|
+
type: "execution-metrics",
|
|
12723
12740
|
config: this.extractConfiguredThresholds(),
|
|
12724
12741
|
actual: null
|
|
12725
12742
|
}
|
|
@@ -12828,7 +12845,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12828
12845
|
if (actualMetrics.exploration_ratio !== void 0) {
|
|
12829
12846
|
reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
|
|
12830
12847
|
}
|
|
12831
|
-
const reasoning = reasoningParts.length > 0 ? `
|
|
12848
|
+
const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
|
|
12832
12849
|
return {
|
|
12833
12850
|
score,
|
|
12834
12851
|
verdict: scoreToVerdict(score),
|
|
@@ -12837,7 +12854,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12837
12854
|
expectedAspectCount: totalChecks || 1,
|
|
12838
12855
|
reasoning,
|
|
12839
12856
|
evaluatorRawRequest: {
|
|
12840
|
-
type: "
|
|
12857
|
+
type: "execution-metrics",
|
|
12841
12858
|
config: this.extractConfiguredThresholds(),
|
|
12842
12859
|
actual: this.filterDefinedMetrics(actualMetrics)
|
|
12843
12860
|
}
|
|
@@ -12925,7 +12942,7 @@ var MONTH_NAMES = {
|
|
|
12925
12942
|
december: 11
|
|
12926
12943
|
};
|
|
12927
12944
|
var FieldAccuracyEvaluator = class {
|
|
12928
|
-
kind = "
|
|
12945
|
+
kind = "field-accuracy";
|
|
12929
12946
|
config;
|
|
12930
12947
|
constructor(options) {
|
|
12931
12948
|
this.config = options.config;
|
|
@@ -13379,7 +13396,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
|
13379
13396
|
".dylib"
|
|
13380
13397
|
]);
|
|
13381
13398
|
var AgentJudgeEvaluator = class {
|
|
13382
|
-
kind = "
|
|
13399
|
+
kind = "agent-judge";
|
|
13383
13400
|
resolveJudgeProvider;
|
|
13384
13401
|
maxSteps;
|
|
13385
13402
|
temperature;
|
|
@@ -13404,24 +13421,24 @@ var AgentJudgeEvaluator = class {
|
|
|
13404
13421
|
async evaluateBuiltIn(context2) {
|
|
13405
13422
|
const judgeProvider = await this.resolveJudgeProvider(context2);
|
|
13406
13423
|
if (!judgeProvider) {
|
|
13407
|
-
throw new Error("No judge provider available for
|
|
13424
|
+
throw new Error("No judge provider available for agent-judge evaluation");
|
|
13408
13425
|
}
|
|
13409
13426
|
const model = judgeProvider.asLanguageModel?.();
|
|
13410
13427
|
if (!model) {
|
|
13411
13428
|
throw new Error(
|
|
13412
|
-
`Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in
|
|
13429
|
+
`Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent-judge mode`
|
|
13413
13430
|
);
|
|
13414
13431
|
}
|
|
13415
13432
|
const workspacePath = context2.workspacePath;
|
|
13416
13433
|
if (!workspacePath) {
|
|
13417
13434
|
throw new Error(
|
|
13418
|
-
"
|
|
13435
|
+
"agent-judge evaluator requires a workspace_template target (workspacePath is not set)"
|
|
13419
13436
|
);
|
|
13420
13437
|
}
|
|
13421
13438
|
const systemPrompt = this.buildSystemPrompt(context2);
|
|
13422
13439
|
const userPrompt = this.buildUserPrompt(context2);
|
|
13423
13440
|
const config = context2.evaluator;
|
|
13424
|
-
const rubrics = config?.type === "
|
|
13441
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
13425
13442
|
const fsTools = createFilesystemTools(workspacePath);
|
|
13426
13443
|
const evaluatorRawRequest = {
|
|
13427
13444
|
mode: "built-in",
|
|
@@ -13452,7 +13469,7 @@ var AgentJudgeEvaluator = class {
|
|
|
13452
13469
|
score: 0,
|
|
13453
13470
|
verdict: "fail",
|
|
13454
13471
|
hits: [],
|
|
13455
|
-
misses: [`
|
|
13472
|
+
misses: [`agent-judge built-in evaluation failed: ${message}`],
|
|
13456
13473
|
expectedAspectCount: 1,
|
|
13457
13474
|
evaluatorRawRequest,
|
|
13458
13475
|
details: { mode: "built-in", error: message }
|
|
@@ -13484,14 +13501,14 @@ var AgentJudgeEvaluator = class {
|
|
|
13484
13501
|
score: 0,
|
|
13485
13502
|
verdict: "fail",
|
|
13486
13503
|
hits: [],
|
|
13487
|
-
misses: ["
|
|
13504
|
+
misses: ["agent-judge judge_target returned no assistant response"],
|
|
13488
13505
|
expectedAspectCount: 1,
|
|
13489
13506
|
evaluatorRawRequest,
|
|
13490
13507
|
details: { mode: "judge_target", judge_target: provider.targetName }
|
|
13491
13508
|
};
|
|
13492
13509
|
}
|
|
13493
13510
|
const config = context2.evaluator;
|
|
13494
|
-
const rubrics = config?.type === "
|
|
13511
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
13495
13512
|
const details = {
|
|
13496
13513
|
mode: "judge_target",
|
|
13497
13514
|
judge_target: provider.targetName
|
|
@@ -13503,7 +13520,7 @@ var AgentJudgeEvaluator = class {
|
|
|
13503
13520
|
score: 0,
|
|
13504
13521
|
verdict: "fail",
|
|
13505
13522
|
hits: [],
|
|
13506
|
-
misses: [`
|
|
13523
|
+
misses: [`agent-judge judge_target evaluation failed: ${message}`],
|
|
13507
13524
|
expectedAspectCount: 1,
|
|
13508
13525
|
evaluatorRawRequest,
|
|
13509
13526
|
details: {
|
|
@@ -13554,7 +13571,7 @@ var AgentJudgeEvaluator = class {
|
|
|
13554
13571
|
score: 0,
|
|
13555
13572
|
verdict: "fail",
|
|
13556
13573
|
hits: [],
|
|
13557
|
-
misses: ["Failed to parse
|
|
13574
|
+
misses: ["Failed to parse agent-judge response as valid evaluation JSON"],
|
|
13558
13575
|
expectedAspectCount: 1,
|
|
13559
13576
|
evaluatorRawRequest,
|
|
13560
13577
|
details
|
|
@@ -13567,7 +13584,7 @@ var AgentJudgeEvaluator = class {
|
|
|
13567
13584
|
*/
|
|
13568
13585
|
buildSystemPrompt(context2) {
|
|
13569
13586
|
const config = context2.evaluator;
|
|
13570
|
-
const rubrics = config?.type === "
|
|
13587
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
13571
13588
|
const parts = [
|
|
13572
13589
|
"You are an expert evaluator with access to the workspace filesystem.",
|
|
13573
13590
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
@@ -13598,7 +13615,7 @@ var AgentJudgeEvaluator = class {
|
|
|
13598
13615
|
return substituteVariables(this.evaluatorTemplate, variables);
|
|
13599
13616
|
}
|
|
13600
13617
|
const config = context2.evaluator;
|
|
13601
|
-
const rubrics = config?.type === "
|
|
13618
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
13602
13619
|
const parts = [
|
|
13603
13620
|
"Evaluate the candidate answer by investigating the workspace.",
|
|
13604
13621
|
"",
|
|
@@ -13641,7 +13658,7 @@ var AgentJudgeEvaluator = class {
|
|
|
13641
13658
|
buildDelegatedPrompt(context2) {
|
|
13642
13659
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
13643
13660
|
const config = context2.evaluator;
|
|
13644
|
-
const rubrics = config?.type === "
|
|
13661
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
13645
13662
|
if (this.evaluatorTemplate) {
|
|
13646
13663
|
const variables = {
|
|
13647
13664
|
[TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
|
|
@@ -13723,11 +13740,11 @@ function createFilesystemTools(workspacePath) {
|
|
|
13723
13740
|
execute: async (input) => {
|
|
13724
13741
|
try {
|
|
13725
13742
|
const resolved = resolveSandboxed(workspacePath, input.path);
|
|
13726
|
-
const
|
|
13727
|
-
if (
|
|
13743
|
+
const stat8 = await import_promises25.default.stat(resolved);
|
|
13744
|
+
if (stat8.isDirectory()) {
|
|
13728
13745
|
return { error: `'${input.path}' is a directory, not a file` };
|
|
13729
13746
|
}
|
|
13730
|
-
const buffer = Buffer.alloc(Math.min(
|
|
13747
|
+
const buffer = Buffer.alloc(Math.min(stat8.size, MAX_FILE_SIZE));
|
|
13731
13748
|
const fd = await import_promises25.default.open(resolved, "r");
|
|
13732
13749
|
try {
|
|
13733
13750
|
await fd.read(buffer, 0, buffer.length, 0);
|
|
@@ -13735,8 +13752,8 @@ function createFilesystemTools(workspacePath) {
|
|
|
13735
13752
|
await fd.close();
|
|
13736
13753
|
}
|
|
13737
13754
|
const content = buffer.toString("utf-8");
|
|
13738
|
-
const truncated =
|
|
13739
|
-
return { content, truncated, size:
|
|
13755
|
+
const truncated = stat8.size > MAX_FILE_SIZE;
|
|
13756
|
+
return { content, truncated, size: stat8.size };
|
|
13740
13757
|
} catch (error) {
|
|
13741
13758
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
13742
13759
|
}
|
|
@@ -13780,8 +13797,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
13780
13797
|
const ext = import_node_path33.default.extname(entry.name).toLowerCase();
|
|
13781
13798
|
if (BINARY_EXTENSIONS.has(ext)) continue;
|
|
13782
13799
|
try {
|
|
13783
|
-
const
|
|
13784
|
-
if (
|
|
13800
|
+
const stat8 = await import_promises25.default.stat(fullPath);
|
|
13801
|
+
if (stat8.size > MAX_FILE_SIZE) continue;
|
|
13785
13802
|
const content = await import_promises25.default.readFile(fullPath, "utf-8");
|
|
13786
13803
|
const lines = content.split("\n");
|
|
13787
13804
|
for (let i = 0; i < lines.length; i++) {
|
|
@@ -13943,7 +13960,7 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
13943
13960
|
|
|
13944
13961
|
// src/evaluation/evaluators/token-usage.ts
|
|
13945
13962
|
var TokenUsageEvaluator = class {
|
|
13946
|
-
kind = "
|
|
13963
|
+
kind = "token-usage";
|
|
13947
13964
|
config;
|
|
13948
13965
|
constructor(options) {
|
|
13949
13966
|
this.config = options.config;
|
|
@@ -13966,7 +13983,7 @@ var TokenUsageEvaluator = class {
|
|
|
13966
13983
|
expectedAspectCount,
|
|
13967
13984
|
reasoning: "Token usage not reported by provider",
|
|
13968
13985
|
evaluatorRawRequest: {
|
|
13969
|
-
type: "
|
|
13986
|
+
type: "token-usage",
|
|
13970
13987
|
max_total: maxTotal ?? null,
|
|
13971
13988
|
max_input: maxInput ?? null,
|
|
13972
13989
|
max_output: maxOutput ?? null,
|
|
@@ -14008,9 +14025,9 @@ var TokenUsageEvaluator = class {
|
|
|
14008
14025
|
hits,
|
|
14009
14026
|
misses,
|
|
14010
14027
|
expectedAspectCount,
|
|
14011
|
-
reasoning: `
|
|
14028
|
+
reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
|
|
14012
14029
|
evaluatorRawRequest: {
|
|
14013
|
-
type: "
|
|
14030
|
+
type: "token-usage",
|
|
14014
14031
|
max_total: maxTotal ?? null,
|
|
14015
14032
|
max_input: maxInput ?? null,
|
|
14016
14033
|
max_output: maxOutput ?? null,
|
|
@@ -14095,7 +14112,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
|
|
|
14095
14112
|
};
|
|
14096
14113
|
}
|
|
14097
14114
|
var ToolTrajectoryEvaluator = class {
|
|
14098
|
-
kind = "
|
|
14115
|
+
kind = "tool-trajectory";
|
|
14099
14116
|
config;
|
|
14100
14117
|
constructor(options) {
|
|
14101
14118
|
this.config = options.config;
|
|
@@ -14283,7 +14300,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
14283
14300
|
}
|
|
14284
14301
|
}
|
|
14285
14302
|
for (const warning of warnings) {
|
|
14286
|
-
console.warn(`[
|
|
14303
|
+
console.warn(`[tool-trajectory] ${warning}`);
|
|
14287
14304
|
}
|
|
14288
14305
|
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
14289
14306
|
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
@@ -14359,7 +14376,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
14359
14376
|
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
14360
14377
|
}
|
|
14361
14378
|
for (const warning of warnings) {
|
|
14362
|
-
console.warn(`[
|
|
14379
|
+
console.warn(`[tool-trajectory] ${warning}`);
|
|
14363
14380
|
}
|
|
14364
14381
|
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
14365
14382
|
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
@@ -14849,7 +14866,7 @@ var llmJudgeFactory = (config, context2) => {
|
|
|
14849
14866
|
const c = config;
|
|
14850
14867
|
const { llmJudge, agentTimeoutMs } = context2;
|
|
14851
14868
|
return {
|
|
14852
|
-
kind: "
|
|
14869
|
+
kind: "llm-judge",
|
|
14853
14870
|
async evaluate(evalContext) {
|
|
14854
14871
|
const customPrompt = await resolveCustomPrompt(
|
|
14855
14872
|
c,
|
|
@@ -14938,7 +14955,7 @@ var agentJudgeFactory = (config, context2) => {
|
|
|
14938
14955
|
customPrompt = (0, import_node_fs9.readFileSync)(c.resolvedPromptPath, "utf-8");
|
|
14939
14956
|
} catch (error) {
|
|
14940
14957
|
const message = error instanceof Error ? error.message : String(error);
|
|
14941
|
-
console.warn(`Could not read
|
|
14958
|
+
console.warn(`Could not read agent-judge prompt at ${c.resolvedPromptPath}: ${message}`);
|
|
14942
14959
|
}
|
|
14943
14960
|
} else if (c.prompt) {
|
|
14944
14961
|
customPrompt = c.prompt;
|
|
@@ -14948,7 +14965,7 @@ var agentJudgeFactory = (config, context2) => {
|
|
|
14948
14965
|
judgeTargetProvider = targetResolver(c.target);
|
|
14949
14966
|
if (!judgeTargetProvider) {
|
|
14950
14967
|
throw new Error(
|
|
14951
|
-
`
|
|
14968
|
+
`agent-judge evaluator '${c.name}': target '${c.target}' not found in targets`
|
|
14952
14969
|
);
|
|
14953
14970
|
}
|
|
14954
14971
|
}
|
|
@@ -14992,7 +15009,7 @@ var regexFactory = (config) => {
|
|
|
14992
15009
|
});
|
|
14993
15010
|
};
|
|
14994
15011
|
var isJsonFactory = () => {
|
|
14995
|
-
return new DeterministicAssertionEvaluator("
|
|
15012
|
+
return new DeterministicAssertionEvaluator("is-json", (ctx) => {
|
|
14996
15013
|
const result = runIsJsonAssertion(ctx.candidate);
|
|
14997
15014
|
return {
|
|
14998
15015
|
score: result.score,
|
|
@@ -15020,7 +15037,7 @@ var equalsFactory = (config) => {
|
|
|
15020
15037
|
};
|
|
15021
15038
|
var containsAnyFactory = (config) => {
|
|
15022
15039
|
const c = config;
|
|
15023
|
-
return new DeterministicAssertionEvaluator("
|
|
15040
|
+
return new DeterministicAssertionEvaluator("contains-any", (ctx) => {
|
|
15024
15041
|
const result = runContainsAnyAssertion(ctx.candidate, c.value);
|
|
15025
15042
|
return {
|
|
15026
15043
|
score: result.score,
|
|
@@ -15034,7 +15051,7 @@ var containsAnyFactory = (config) => {
|
|
|
15034
15051
|
};
|
|
15035
15052
|
var containsAllFactory = (config) => {
|
|
15036
15053
|
const c = config;
|
|
15037
|
-
return new DeterministicAssertionEvaluator("
|
|
15054
|
+
return new DeterministicAssertionEvaluator("contains-all", (ctx) => {
|
|
15038
15055
|
const result = runContainsAllAssertion(ctx.candidate, c.value);
|
|
15039
15056
|
return {
|
|
15040
15057
|
score: result.score,
|
|
@@ -15062,7 +15079,7 @@ var icontainsFactory = (config) => {
|
|
|
15062
15079
|
};
|
|
15063
15080
|
var icontainsAnyFactory = (config) => {
|
|
15064
15081
|
const c = config;
|
|
15065
|
-
return new DeterministicAssertionEvaluator("
|
|
15082
|
+
return new DeterministicAssertionEvaluator("icontains-any", (ctx) => {
|
|
15066
15083
|
const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
|
|
15067
15084
|
return {
|
|
15068
15085
|
score: result.score,
|
|
@@ -15076,7 +15093,7 @@ var icontainsAnyFactory = (config) => {
|
|
|
15076
15093
|
};
|
|
15077
15094
|
var icontainsAllFactory = (config) => {
|
|
15078
15095
|
const c = config;
|
|
15079
|
-
return new DeterministicAssertionEvaluator("
|
|
15096
|
+
return new DeterministicAssertionEvaluator("icontains-all", (ctx) => {
|
|
15080
15097
|
const result = runIcontainsAllAssertion(ctx.candidate, c.value);
|
|
15081
15098
|
return {
|
|
15082
15099
|
score: result.score,
|
|
@@ -15090,7 +15107,7 @@ var icontainsAllFactory = (config) => {
|
|
|
15090
15107
|
};
|
|
15091
15108
|
var startsWithFactory = (config) => {
|
|
15092
15109
|
const c = config;
|
|
15093
|
-
return new DeterministicAssertionEvaluator("
|
|
15110
|
+
return new DeterministicAssertionEvaluator("starts-with", (ctx) => {
|
|
15094
15111
|
const result = runStartsWithAssertion(ctx.candidate, c.value);
|
|
15095
15112
|
return {
|
|
15096
15113
|
score: result.score,
|
|
@@ -15104,7 +15121,7 @@ var startsWithFactory = (config) => {
|
|
|
15104
15121
|
};
|
|
15105
15122
|
var endsWithFactory = (config) => {
|
|
15106
15123
|
const c = config;
|
|
15107
|
-
return new DeterministicAssertionEvaluator("
|
|
15124
|
+
return new DeterministicAssertionEvaluator("ends-with", (ctx) => {
|
|
15108
15125
|
const result = runEndsWithAssertion(ctx.candidate, c.value);
|
|
15109
15126
|
return {
|
|
15110
15127
|
score: result.score,
|
|
@@ -15118,7 +15135,7 @@ var endsWithFactory = (config) => {
|
|
|
15118
15135
|
};
|
|
15119
15136
|
function createBuiltinRegistry() {
|
|
15120
15137
|
const registry = new EvaluatorRegistry();
|
|
15121
|
-
registry.register("
|
|
15138
|
+
registry.register("llm-judge", llmJudgeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("agent-judge", agentJudgeFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory);
|
|
15122
15139
|
return registry;
|
|
15123
15140
|
}
|
|
15124
15141
|
|
|
@@ -15864,7 +15881,7 @@ async function runEvaluation(options) {
|
|
|
15864
15881
|
};
|
|
15865
15882
|
if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
|
|
15866
15883
|
throw new Error(
|
|
15867
|
-
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g.,
|
|
15884
|
+
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure-base).`
|
|
15868
15885
|
);
|
|
15869
15886
|
}
|
|
15870
15887
|
const targetResolver = (name) => {
|
|
@@ -15935,7 +15952,7 @@ async function runEvaluation(options) {
|
|
|
15935
15952
|
const rawTemplate = suiteWorkspace?.template ?? getWorkspaceTemplate(target);
|
|
15936
15953
|
const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
|
|
15937
15954
|
const workspaceTemplate = resolvedTemplate?.dir;
|
|
15938
|
-
|
|
15955
|
+
let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
|
|
15939
15956
|
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
15940
15957
|
const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
15941
15958
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
@@ -15956,6 +15973,14 @@ async function runEvaluation(options) {
|
|
|
15956
15973
|
const message = error instanceof Error ? error.message : String(error);
|
|
15957
15974
|
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
15958
15975
|
}
|
|
15976
|
+
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
15977
|
+
const copiedWorkspaceFile = import_node_path40.default.join(sharedWorkspacePath, import_node_path40.default.basename(suiteWorkspaceFile));
|
|
15978
|
+
try {
|
|
15979
|
+
await (0, import_promises29.stat)(copiedWorkspaceFile);
|
|
15980
|
+
suiteWorkspaceFile = copiedWorkspaceFile;
|
|
15981
|
+
} catch {
|
|
15982
|
+
}
|
|
15983
|
+
}
|
|
15959
15984
|
} else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
|
|
15960
15985
|
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
15961
15986
|
await (0, import_promises29.mkdir)(sharedWorkspacePath, { recursive: true });
|
|
@@ -16434,6 +16459,14 @@ async function runEvalCase(options) {
|
|
|
16434
16459
|
"template_error"
|
|
16435
16460
|
);
|
|
16436
16461
|
}
|
|
16462
|
+
if (caseWorkspaceFile && workspacePath) {
|
|
16463
|
+
const copiedFile = import_node_path40.default.join(workspacePath, import_node_path40.default.basename(caseWorkspaceFile));
|
|
16464
|
+
try {
|
|
16465
|
+
await (0, import_promises29.stat)(copiedFile);
|
|
16466
|
+
caseWorkspaceFile = copiedFile;
|
|
16467
|
+
} catch {
|
|
16468
|
+
}
|
|
16469
|
+
}
|
|
16437
16470
|
}
|
|
16438
16471
|
if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
16439
16472
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
@@ -16943,8 +16976,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
16943
16976
|
workspacePath
|
|
16944
16977
|
});
|
|
16945
16978
|
}
|
|
16946
|
-
const evaluatorKind = evalCase.evaluator ?? "
|
|
16947
|
-
const activeEvaluator = evaluators[evaluatorKind] ?? evaluators
|
|
16979
|
+
const evaluatorKind = evalCase.evaluator ?? "llm-judge";
|
|
16980
|
+
const activeEvaluator = evaluators[evaluatorKind] ?? evaluators["llm-judge"];
|
|
16948
16981
|
if (!activeEvaluator) {
|
|
16949
16982
|
throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
|
|
16950
16983
|
}
|
|
@@ -17027,25 +17060,24 @@ async function runEvaluatorList(options) {
|
|
|
17027
17060
|
availableTargets,
|
|
17028
17061
|
agentTimeoutMs,
|
|
17029
17062
|
evalFileDir,
|
|
17030
|
-
llmJudge: evaluatorRegistry
|
|
17063
|
+
llmJudge: evaluatorRegistry["llm-judge"],
|
|
17031
17064
|
registry: typeRegistry
|
|
17032
17065
|
};
|
|
17033
17066
|
for (const evaluatorConfig of evaluators ?? []) {
|
|
17034
17067
|
try {
|
|
17035
17068
|
const evaluatorInstance = await typeRegistry.create(evaluatorConfig, dispatchContext);
|
|
17036
17069
|
const score2 = await evaluatorInstance.evaluate(evalContext);
|
|
17037
|
-
const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
|
|
17038
17070
|
const weight = evaluatorConfig.weight ?? 1;
|
|
17039
17071
|
scored.push({
|
|
17040
17072
|
score: score2,
|
|
17041
17073
|
name: evaluatorConfig.name,
|
|
17042
|
-
type:
|
|
17074
|
+
type: evaluatorConfig.type,
|
|
17043
17075
|
weight,
|
|
17044
17076
|
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
17045
17077
|
});
|
|
17046
17078
|
scores.push({
|
|
17047
17079
|
name: evaluatorConfig.name,
|
|
17048
|
-
type:
|
|
17080
|
+
type: evaluatorConfig.type,
|
|
17049
17081
|
score: score2.score,
|
|
17050
17082
|
weight,
|
|
17051
17083
|
verdict: score2.verdict,
|
|
@@ -17067,18 +17099,17 @@ async function runEvaluatorList(options) {
|
|
|
17067
17099
|
expectedAspectCount: 1,
|
|
17068
17100
|
reasoning: message
|
|
17069
17101
|
};
|
|
17070
|
-
const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
|
|
17071
17102
|
const weight = evaluatorConfig.weight ?? 1;
|
|
17072
17103
|
scored.push({
|
|
17073
17104
|
score: fallbackScore,
|
|
17074
17105
|
name: evaluatorConfig.name ?? "unknown",
|
|
17075
|
-
type:
|
|
17106
|
+
type: evaluatorConfig.type ?? "llm-judge",
|
|
17076
17107
|
weight,
|
|
17077
17108
|
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
17078
17109
|
});
|
|
17079
17110
|
scores.push({
|
|
17080
17111
|
name: evaluatorConfig.name ?? "unknown",
|
|
17081
|
-
type:
|
|
17112
|
+
type: evaluatorConfig.type ?? "llm-judge",
|
|
17082
17113
|
score: 0,
|
|
17083
17114
|
weight,
|
|
17084
17115
|
verdict: "fail",
|
|
@@ -17139,7 +17170,7 @@ function filterEvalCases(evalCases, filter) {
|
|
|
17139
17170
|
return evalCases.filter((evalCase) => import_micromatch4.default.isMatch(evalCase.id, filter));
|
|
17140
17171
|
}
|
|
17141
17172
|
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
17142
|
-
const llmJudge = overrides?.
|
|
17173
|
+
const llmJudge = overrides?.["llm-judge"] ?? new LlmJudgeEvaluator({
|
|
17143
17174
|
resolveJudgeProvider: async (context2) => {
|
|
17144
17175
|
if (context2.judgeProvider) {
|
|
17145
17176
|
return context2.judgeProvider;
|
|
@@ -17149,7 +17180,7 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
17149
17180
|
});
|
|
17150
17181
|
return {
|
|
17151
17182
|
...overrides,
|
|
17152
|
-
|
|
17183
|
+
"llm-judge": llmJudge
|
|
17153
17184
|
};
|
|
17154
17185
|
}
|
|
17155
17186
|
async function invokeProvider(provider, options) {
|
|
@@ -17409,12 +17440,7 @@ async function evaluate(config) {
|
|
|
17409
17440
|
};
|
|
17410
17441
|
}
|
|
17411
17442
|
function mapAssertionType(type) {
|
|
17412
|
-
|
|
17413
|
-
case "code_judge":
|
|
17414
|
-
return "code";
|
|
17415
|
-
default:
|
|
17416
|
-
return type;
|
|
17417
|
-
}
|
|
17443
|
+
return type.replace(/_/g, "-");
|
|
17418
17444
|
}
|
|
17419
17445
|
function computeSummary(results, durationMs) {
|
|
17420
17446
|
const total = results.length;
|