@agentv/core 2.12.0 → 2.14.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-7HPKTRFZ.js → chunk-N55K52OO.js} +15 -15
- package/dist/chunk-N55K52OO.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +25 -24
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +12 -11
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +248 -160
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +58 -41
- package/dist/index.d.ts +58 -41
- package/dist/index.js +235 -148
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-7HPKTRFZ.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1492,6 +1492,7 @@ __export(index_exports, {
|
|
|
1492
1492
|
executeWorkspaceScript: () => executeWorkspaceScript,
|
|
1493
1493
|
explorationRatio: () => explorationRatio,
|
|
1494
1494
|
extractCacheConfig: () => extractCacheConfig,
|
|
1495
|
+
extractFailOnError: () => extractFailOnError,
|
|
1495
1496
|
extractJsonBlob: () => extractJsonBlob,
|
|
1496
1497
|
extractTargetFromSuite: () => extractTargetFromSuite,
|
|
1497
1498
|
extractTargetsFromSuite: () => extractTargetsFromSuite,
|
|
@@ -1611,27 +1612,27 @@ function isTestMessage(value) {
|
|
|
1611
1612
|
return false;
|
|
1612
1613
|
}
|
|
1613
1614
|
var EVALUATOR_KIND_VALUES = [
|
|
1614
|
-
"
|
|
1615
|
-
"
|
|
1615
|
+
"code-judge",
|
|
1616
|
+
"llm-judge",
|
|
1616
1617
|
"rubric",
|
|
1617
1618
|
"composite",
|
|
1618
|
-
"
|
|
1619
|
-
"
|
|
1619
|
+
"tool-trajectory",
|
|
1620
|
+
"field-accuracy",
|
|
1620
1621
|
"latency",
|
|
1621
1622
|
"cost",
|
|
1622
|
-
"
|
|
1623
|
-
"
|
|
1624
|
-
"
|
|
1623
|
+
"token-usage",
|
|
1624
|
+
"execution-metrics",
|
|
1625
|
+
"agent-judge",
|
|
1625
1626
|
"contains",
|
|
1626
|
-
"
|
|
1627
|
-
"
|
|
1627
|
+
"contains-any",
|
|
1628
|
+
"contains-all",
|
|
1628
1629
|
"icontains",
|
|
1629
|
-
"
|
|
1630
|
-
"
|
|
1631
|
-
"
|
|
1632
|
-
"
|
|
1630
|
+
"icontains-any",
|
|
1631
|
+
"icontains-all",
|
|
1632
|
+
"starts-with",
|
|
1633
|
+
"ends-with",
|
|
1633
1634
|
"regex",
|
|
1634
|
-
"
|
|
1635
|
+
"is-json",
|
|
1635
1636
|
"equals",
|
|
1636
1637
|
"rubrics"
|
|
1637
1638
|
];
|
|
@@ -2014,6 +2015,11 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
2014
2015
|
continue;
|
|
2015
2016
|
}
|
|
2016
2017
|
const config = parsed;
|
|
2018
|
+
const requiredVersion = parsed.required_version;
|
|
2019
|
+
if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
|
|
2020
|
+
logWarning(`Invalid required_version in ${configPath}, expected string`);
|
|
2021
|
+
continue;
|
|
2022
|
+
}
|
|
2017
2023
|
const guidelinePatterns = config.guideline_patterns;
|
|
2018
2024
|
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
2019
2025
|
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
@@ -2037,6 +2043,7 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
2037
2043
|
configPath
|
|
2038
2044
|
);
|
|
2039
2045
|
return {
|
|
2046
|
+
required_version: requiredVersion,
|
|
2040
2047
|
guideline_patterns: guidelinePatterns,
|
|
2041
2048
|
eval_patterns: evalPatterns,
|
|
2042
2049
|
execution: executionDefaults
|
|
@@ -2180,6 +2187,22 @@ function extractTotalBudgetUsd(suite) {
|
|
|
2180
2187
|
);
|
|
2181
2188
|
return void 0;
|
|
2182
2189
|
}
|
|
2190
|
+
function extractFailOnError(suite) {
|
|
2191
|
+
const execution = suite.execution;
|
|
2192
|
+
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
2193
|
+
return void 0;
|
|
2194
|
+
}
|
|
2195
|
+
const executionObj = execution;
|
|
2196
|
+
const raw = executionObj.fail_on_error ?? executionObj.failOnError;
|
|
2197
|
+
if (raw === void 0 || raw === null) {
|
|
2198
|
+
return void 0;
|
|
2199
|
+
}
|
|
2200
|
+
if (typeof raw === "boolean") {
|
|
2201
|
+
return raw;
|
|
2202
|
+
}
|
|
2203
|
+
logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
|
|
2204
|
+
return void 0;
|
|
2205
|
+
}
|
|
2183
2206
|
function parseExecutionDefaults(raw, configPath) {
|
|
2184
2207
|
if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
|
|
2185
2208
|
return void 0;
|
|
@@ -2278,6 +2301,9 @@ function validateTemplateVariables(content, source) {
|
|
|
2278
2301
|
// src/evaluation/loaders/evaluator-parser.ts
|
|
2279
2302
|
var ANSI_YELLOW4 = "\x1B[33m";
|
|
2280
2303
|
var ANSI_RESET4 = "\x1B[0m";
|
|
2304
|
+
function normalizeEvaluatorType(type) {
|
|
2305
|
+
return type.replace(/_/g, "-");
|
|
2306
|
+
}
|
|
2281
2307
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
2282
2308
|
const execution = rawEvalCase.execution;
|
|
2283
2309
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
@@ -2308,7 +2334,8 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2308
2334
|
continue;
|
|
2309
2335
|
}
|
|
2310
2336
|
const rawName = asString(rawEvaluator.name);
|
|
2311
|
-
const
|
|
2337
|
+
const rawType = rawEvaluator.type;
|
|
2338
|
+
const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
|
|
2312
2339
|
const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
|
|
2313
2340
|
if (typeof typeValue !== "string") {
|
|
2314
2341
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
@@ -2341,25 +2368,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2341
2368
|
});
|
|
2342
2369
|
continue;
|
|
2343
2370
|
}
|
|
2344
|
-
if (typeValue === "
|
|
2371
|
+
if (typeValue === "code-judge") {
|
|
2345
2372
|
let command;
|
|
2346
2373
|
const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
|
|
2347
2374
|
if (typeof rawCommand === "string") {
|
|
2348
2375
|
const trimmed = rawCommand.trim();
|
|
2349
2376
|
if (trimmed.length === 0) {
|
|
2350
2377
|
throw new Error(
|
|
2351
|
-
`Invalid
|
|
2378
|
+
`Invalid code-judge command for evaluator '${name}' in '${evalId}': command cannot be empty`
|
|
2352
2379
|
);
|
|
2353
2380
|
}
|
|
2354
2381
|
command = parseCommandToArgv(trimmed);
|
|
2355
2382
|
} else {
|
|
2356
2383
|
command = asStringArray(
|
|
2357
2384
|
rawCommand,
|
|
2358
|
-
`
|
|
2385
|
+
`code-judge command for evaluator '${name}' in '${evalId}'`
|
|
2359
2386
|
);
|
|
2360
2387
|
}
|
|
2361
2388
|
if (!command) {
|
|
2362
|
-
logWarning2(`Skipping
|
|
2389
|
+
logWarning2(`Skipping code-judge evaluator '${name}' in '${evalId}': missing command`);
|
|
2363
2390
|
continue;
|
|
2364
2391
|
}
|
|
2365
2392
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
@@ -2420,7 +2447,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2420
2447
|
}
|
|
2421
2448
|
evaluators.push({
|
|
2422
2449
|
name,
|
|
2423
|
-
type: "code",
|
|
2450
|
+
type: "code-judge",
|
|
2424
2451
|
command,
|
|
2425
2452
|
cwd,
|
|
2426
2453
|
resolvedCwd,
|
|
@@ -2446,7 +2473,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2446
2473
|
continue;
|
|
2447
2474
|
}
|
|
2448
2475
|
const aggregatorType = asString(rawAggregator.type);
|
|
2449
|
-
if (aggregatorType !== "weighted_average" && aggregatorType !== "
|
|
2476
|
+
if (aggregatorType !== "weighted_average" && aggregatorType !== "code-judge" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
|
|
2450
2477
|
logWarning2(
|
|
2451
2478
|
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
2452
2479
|
);
|
|
@@ -2495,16 +2522,16 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2495
2522
|
type: "weighted_average",
|
|
2496
2523
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
2497
2524
|
};
|
|
2498
|
-
} else if (aggregatorType === "
|
|
2525
|
+
} else if (aggregatorType === "code-judge") {
|
|
2499
2526
|
const aggregatorPath = asString(rawAggregator.path);
|
|
2500
2527
|
if (!aggregatorPath) {
|
|
2501
2528
|
logWarning2(
|
|
2502
|
-
`Skipping composite evaluator '${name}' in '${evalId}':
|
|
2529
|
+
`Skipping composite evaluator '${name}' in '${evalId}': code-judge aggregator missing path`
|
|
2503
2530
|
);
|
|
2504
2531
|
continue;
|
|
2505
2532
|
}
|
|
2506
2533
|
aggregator = {
|
|
2507
|
-
type: "
|
|
2534
|
+
type: "code-judge",
|
|
2508
2535
|
path: aggregatorPath,
|
|
2509
2536
|
cwd: searchRoots[0]
|
|
2510
2537
|
};
|
|
@@ -2530,7 +2557,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2530
2557
|
}
|
|
2531
2558
|
}
|
|
2532
2559
|
aggregator = {
|
|
2533
|
-
type: "
|
|
2560
|
+
type: "llm-judge",
|
|
2534
2561
|
...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
|
|
2535
2562
|
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
2536
2563
|
};
|
|
@@ -2548,11 +2575,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2548
2575
|
});
|
|
2549
2576
|
continue;
|
|
2550
2577
|
}
|
|
2551
|
-
if (typeValue === "
|
|
2578
|
+
if (typeValue === "tool-trajectory") {
|
|
2552
2579
|
const mode = asString(rawEvaluator.mode);
|
|
2553
2580
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact" && mode !== "subset" && mode !== "superset") {
|
|
2554
2581
|
logWarning2(
|
|
2555
|
-
`Skipping
|
|
2582
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, exact, subset, or superset)`
|
|
2556
2583
|
);
|
|
2557
2584
|
continue;
|
|
2558
2585
|
}
|
|
@@ -2561,7 +2588,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2561
2588
|
if (rawMinimums !== void 0) {
|
|
2562
2589
|
if (!isJsonObject2(rawMinimums)) {
|
|
2563
2590
|
logWarning2(
|
|
2564
|
-
`Skipping
|
|
2591
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
|
|
2565
2592
|
);
|
|
2566
2593
|
continue;
|
|
2567
2594
|
}
|
|
@@ -2587,7 +2614,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2587
2614
|
argsMatch2 = rawArgsMatch;
|
|
2588
2615
|
} else {
|
|
2589
2616
|
logWarning2(
|
|
2590
|
-
`Invalid args_match '${rawArgsMatch}' for
|
|
2617
|
+
`Invalid args_match '${rawArgsMatch}' for tool-trajectory evaluator '${name}' in '${evalId}': must be exact, superset, subset, ignore, or a string array`
|
|
2591
2618
|
);
|
|
2592
2619
|
}
|
|
2593
2620
|
}
|
|
@@ -2597,7 +2624,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2597
2624
|
if (rawExpected !== void 0) {
|
|
2598
2625
|
if (!Array.isArray(rawExpected)) {
|
|
2599
2626
|
logWarning2(
|
|
2600
|
-
`Skipping
|
|
2627
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': expected must be an array`
|
|
2601
2628
|
);
|
|
2602
2629
|
continue;
|
|
2603
2630
|
}
|
|
@@ -2643,13 +2670,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2643
2670
|
}
|
|
2644
2671
|
if (mode === "any_order" && !minimums) {
|
|
2645
2672
|
logWarning2(
|
|
2646
|
-
`Skipping
|
|
2673
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
|
|
2647
2674
|
);
|
|
2648
2675
|
continue;
|
|
2649
2676
|
}
|
|
2650
2677
|
if ((mode === "in_order" || mode === "exact" || mode === "subset" || mode === "superset") && !expected) {
|
|
2651
2678
|
logWarning2(
|
|
2652
|
-
`Skipping
|
|
2679
|
+
`Skipping tool-trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
|
|
2653
2680
|
);
|
|
2654
2681
|
continue;
|
|
2655
2682
|
}
|
|
@@ -2657,7 +2684,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2657
2684
|
const required2 = parseRequired(rawEvaluator.required);
|
|
2658
2685
|
const config2 = {
|
|
2659
2686
|
name,
|
|
2660
|
-
type: "
|
|
2687
|
+
type: "tool-trajectory",
|
|
2661
2688
|
mode,
|
|
2662
2689
|
...minimums ? { minimums } : {},
|
|
2663
2690
|
...expected ? { expected } : {},
|
|
@@ -2669,17 +2696,17 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2669
2696
|
evaluators.push(config2);
|
|
2670
2697
|
continue;
|
|
2671
2698
|
}
|
|
2672
|
-
if (typeValue === "
|
|
2699
|
+
if (typeValue === "field-accuracy") {
|
|
2673
2700
|
const rawFields = rawEvaluator.fields;
|
|
2674
2701
|
if (!Array.isArray(rawFields)) {
|
|
2675
2702
|
logWarning2(
|
|
2676
|
-
`Skipping
|
|
2703
|
+
`Skipping field-accuracy evaluator '${name}' in '${evalId}': missing fields array`
|
|
2677
2704
|
);
|
|
2678
2705
|
continue;
|
|
2679
2706
|
}
|
|
2680
2707
|
if (rawFields.length === 0) {
|
|
2681
2708
|
logWarning2(
|
|
2682
|
-
`Skipping
|
|
2709
|
+
`Skipping field-accuracy evaluator '${name}' in '${evalId}': fields array is empty`
|
|
2683
2710
|
);
|
|
2684
2711
|
continue;
|
|
2685
2712
|
}
|
|
@@ -2687,7 +2714,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2687
2714
|
for (const rawField of rawFields) {
|
|
2688
2715
|
if (!isJsonObject2(rawField)) {
|
|
2689
2716
|
logWarning2(
|
|
2690
|
-
`Skipping invalid field entry in
|
|
2717
|
+
`Skipping invalid field entry in field-accuracy evaluator '${name}' (expected object)`
|
|
2691
2718
|
);
|
|
2692
2719
|
continue;
|
|
2693
2720
|
}
|
|
@@ -2695,13 +2722,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2695
2722
|
const match = asString(rawField.match);
|
|
2696
2723
|
if (!fieldPath) {
|
|
2697
2724
|
logWarning2(
|
|
2698
|
-
`Skipping field without path in
|
|
2725
|
+
`Skipping field without path in field-accuracy evaluator '${name}' in '${evalId}'`
|
|
2699
2726
|
);
|
|
2700
2727
|
continue;
|
|
2701
2728
|
}
|
|
2702
2729
|
if (!match || !isValidFieldMatchType(match)) {
|
|
2703
2730
|
logWarning2(
|
|
2704
|
-
`Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a
|
|
2731
|
+
`Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code-judge evaluator.`
|
|
2705
2732
|
);
|
|
2706
2733
|
continue;
|
|
2707
2734
|
}
|
|
@@ -2718,7 +2745,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2718
2745
|
}
|
|
2719
2746
|
if (fields.length === 0) {
|
|
2720
2747
|
logWarning2(
|
|
2721
|
-
`Skipping
|
|
2748
|
+
`Skipping field-accuracy evaluator '${name}' in '${evalId}': no valid fields found`
|
|
2722
2749
|
);
|
|
2723
2750
|
continue;
|
|
2724
2751
|
}
|
|
@@ -2728,7 +2755,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2728
2755
|
const required2 = parseRequired(rawEvaluator.required);
|
|
2729
2756
|
evaluators.push({
|
|
2730
2757
|
name,
|
|
2731
|
-
type: "
|
|
2758
|
+
type: "field-accuracy",
|
|
2732
2759
|
fields,
|
|
2733
2760
|
...validAggregation ? { aggregation: validAggregation } : {},
|
|
2734
2761
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
@@ -2777,7 +2804,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2777
2804
|
});
|
|
2778
2805
|
continue;
|
|
2779
2806
|
}
|
|
2780
|
-
if (typeValue === "
|
|
2807
|
+
if (typeValue === "token-usage") {
|
|
2781
2808
|
const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
|
|
2782
2809
|
const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
|
|
2783
2810
|
const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
|
|
@@ -2791,7 +2818,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2791
2818
|
if (raw === void 0) continue;
|
|
2792
2819
|
if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
|
|
2793
2820
|
logWarning2(
|
|
2794
|
-
`Skipping
|
|
2821
|
+
`Skipping token-usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
|
|
2795
2822
|
);
|
|
2796
2823
|
continue;
|
|
2797
2824
|
}
|
|
@@ -2799,7 +2826,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2799
2826
|
}
|
|
2800
2827
|
if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
|
|
2801
2828
|
logWarning2(
|
|
2802
|
-
`Skipping
|
|
2829
|
+
`Skipping token-usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
|
|
2803
2830
|
);
|
|
2804
2831
|
continue;
|
|
2805
2832
|
}
|
|
@@ -2807,7 +2834,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2807
2834
|
const required2 = parseRequired(rawEvaluator.required);
|
|
2808
2835
|
evaluators.push({
|
|
2809
2836
|
name,
|
|
2810
|
-
type: "
|
|
2837
|
+
type: "token-usage",
|
|
2811
2838
|
...validLimits,
|
|
2812
2839
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2813
2840
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
@@ -2815,7 +2842,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2815
2842
|
});
|
|
2816
2843
|
continue;
|
|
2817
2844
|
}
|
|
2818
|
-
if (typeValue === "
|
|
2845
|
+
if (typeValue === "execution-metrics") {
|
|
2819
2846
|
const maxToolCalls = rawEvaluator.max_tool_calls ?? rawEvaluator.maxToolCalls;
|
|
2820
2847
|
const maxLlmCalls = rawEvaluator.max_llm_calls ?? rawEvaluator.maxLlmCalls;
|
|
2821
2848
|
const maxTokens = rawEvaluator.max_tokens ?? rawEvaluator.maxTokens;
|
|
@@ -2838,7 +2865,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2838
2865
|
if (raw === void 0) continue;
|
|
2839
2866
|
if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
|
|
2840
2867
|
logWarning2(
|
|
2841
|
-
`Skipping
|
|
2868
|
+
`Skipping execution-metrics evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
|
|
2842
2869
|
);
|
|
2843
2870
|
hasError = true;
|
|
2844
2871
|
break;
|
|
@@ -2851,7 +2878,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2851
2878
|
const hasThreshold = validThresholds.max_tool_calls !== void 0 || validThresholds.max_llm_calls !== void 0 || validThresholds.max_tokens !== void 0 || validThresholds.max_cost_usd !== void 0 || validThresholds.max_duration_ms !== void 0 || validThresholds.target_exploration_ratio !== void 0;
|
|
2852
2879
|
if (!hasThreshold) {
|
|
2853
2880
|
logWarning2(
|
|
2854
|
-
`Skipping
|
|
2881
|
+
`Skipping execution-metrics evaluator '${name}' in '${evalId}': must set at least one threshold (max_tool_calls, max_llm_calls, max_tokens, max_cost_usd, max_duration_ms, or target_exploration_ratio)`
|
|
2855
2882
|
);
|
|
2856
2883
|
continue;
|
|
2857
2884
|
}
|
|
@@ -2859,7 +2886,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2859
2886
|
const required2 = parseRequired(rawEvaluator.required);
|
|
2860
2887
|
evaluators.push({
|
|
2861
2888
|
name,
|
|
2862
|
-
type: "
|
|
2889
|
+
type: "execution-metrics",
|
|
2863
2890
|
...validThresholds,
|
|
2864
2891
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2865
2892
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
@@ -2867,13 +2894,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2867
2894
|
});
|
|
2868
2895
|
continue;
|
|
2869
2896
|
}
|
|
2870
|
-
if (typeValue === "
|
|
2897
|
+
if (typeValue === "agent-judge") {
|
|
2871
2898
|
const rawMaxSteps = rawEvaluator.max_steps ?? rawEvaluator.maxSteps;
|
|
2872
2899
|
let maxSteps;
|
|
2873
2900
|
if (rawMaxSteps !== void 0) {
|
|
2874
2901
|
if (typeof rawMaxSteps !== "number" || !Number.isInteger(rawMaxSteps) || rawMaxSteps < 1 || rawMaxSteps > 50) {
|
|
2875
2902
|
logWarning2(
|
|
2876
|
-
`Skipping
|
|
2903
|
+
`Skipping agent-judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`
|
|
2877
2904
|
);
|
|
2878
2905
|
continue;
|
|
2879
2906
|
}
|
|
@@ -2884,7 +2911,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2884
2911
|
if (rawTemperature !== void 0) {
|
|
2885
2912
|
if (typeof rawTemperature !== "number" || rawTemperature < 0 || rawTemperature > 2) {
|
|
2886
2913
|
logWarning2(
|
|
2887
|
-
`Skipping
|
|
2914
|
+
`Skipping agent-judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`
|
|
2888
2915
|
);
|
|
2889
2916
|
continue;
|
|
2890
2917
|
}
|
|
@@ -2907,7 +2934,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2907
2934
|
const required2 = parseRequired(rawEvaluator.required);
|
|
2908
2935
|
evaluators.push({
|
|
2909
2936
|
name,
|
|
2910
|
-
type: "
|
|
2937
|
+
type: "agent-judge",
|
|
2911
2938
|
...agentPrompt ? { prompt: agentPrompt } : {},
|
|
2912
2939
|
...agentPromptPath ? { promptPath: agentPromptPath, resolvedPromptPath: agentPromptPath } : {},
|
|
2913
2940
|
...agentParsedRubrics && agentParsedRubrics.length > 0 ? { rubrics: agentParsedRubrics } : {},
|
|
@@ -2938,7 +2965,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2938
2965
|
});
|
|
2939
2966
|
continue;
|
|
2940
2967
|
}
|
|
2941
|
-
if (typeValue === "
|
|
2968
|
+
if (typeValue === "contains-any" || typeValue === "contains-all") {
|
|
2942
2969
|
const value = asStringArrayStrict(rawEvaluator.value);
|
|
2943
2970
|
if (!value || value.length === 0) {
|
|
2944
2971
|
logWarning2(
|
|
@@ -2976,7 +3003,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2976
3003
|
});
|
|
2977
3004
|
continue;
|
|
2978
3005
|
}
|
|
2979
|
-
if (typeValue === "
|
|
3006
|
+
if (typeValue === "icontains-any" || typeValue === "icontains-all") {
|
|
2980
3007
|
const value = asStringArrayStrict(rawEvaluator.value);
|
|
2981
3008
|
if (!value || value.length === 0) {
|
|
2982
3009
|
logWarning2(
|
|
@@ -2996,7 +3023,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2996
3023
|
});
|
|
2997
3024
|
continue;
|
|
2998
3025
|
}
|
|
2999
|
-
if (typeValue === "
|
|
3026
|
+
if (typeValue === "starts-with" || typeValue === "ends-with") {
|
|
3000
3027
|
const value = asString(rawEvaluator.value);
|
|
3001
3028
|
if (!value) {
|
|
3002
3029
|
logWarning2(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
|
|
@@ -3034,12 +3061,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3034
3061
|
});
|
|
3035
3062
|
continue;
|
|
3036
3063
|
}
|
|
3037
|
-
if (typeValue === "
|
|
3064
|
+
if (typeValue === "is-json") {
|
|
3038
3065
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3039
3066
|
const required2 = parseRequired(rawEvaluator.required);
|
|
3040
3067
|
evaluators.push({
|
|
3041
3068
|
name,
|
|
3042
|
-
type: "
|
|
3069
|
+
type: "is-json",
|
|
3043
3070
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3044
3071
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3045
3072
|
...negate !== void 0 ? { negate } : {}
|
|
@@ -3087,7 +3114,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3087
3114
|
const required2 = parseRequired(rawEvaluator.required);
|
|
3088
3115
|
evaluators.push({
|
|
3089
3116
|
name,
|
|
3090
|
-
type: "
|
|
3117
|
+
type: "llm-judge",
|
|
3091
3118
|
rubrics: parsedCriteria,
|
|
3092
3119
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3093
3120
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
@@ -3154,7 +3181,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3154
3181
|
const required2 = parseRequired(rawEvaluator.required);
|
|
3155
3182
|
evaluators.push({
|
|
3156
3183
|
name,
|
|
3157
|
-
type: "
|
|
3184
|
+
type: "llm-judge",
|
|
3158
3185
|
rubrics: parsedRubrics,
|
|
3159
3186
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3160
3187
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
@@ -3186,7 +3213,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3186
3213
|
const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : void 0);
|
|
3187
3214
|
evaluators.push({
|
|
3188
3215
|
name,
|
|
3189
|
-
type: "
|
|
3216
|
+
type: "llm-judge",
|
|
3190
3217
|
prompt,
|
|
3191
3218
|
promptPath,
|
|
3192
3219
|
...promptPath ? { resolvedPromptPath: promptPath } : {},
|
|
@@ -3202,15 +3229,15 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3202
3229
|
}
|
|
3203
3230
|
var ASSERTION_TYPES = /* @__PURE__ */ new Set([
|
|
3204
3231
|
"contains",
|
|
3205
|
-
"
|
|
3206
|
-
"
|
|
3232
|
+
"contains-any",
|
|
3233
|
+
"contains-all",
|
|
3207
3234
|
"icontains",
|
|
3208
|
-
"
|
|
3209
|
-
"
|
|
3210
|
-
"
|
|
3211
|
-
"
|
|
3235
|
+
"icontains-any",
|
|
3236
|
+
"icontains-all",
|
|
3237
|
+
"starts-with",
|
|
3238
|
+
"ends-with",
|
|
3212
3239
|
"regex",
|
|
3213
|
-
"
|
|
3240
|
+
"is-json",
|
|
3214
3241
|
"equals",
|
|
3215
3242
|
"rubrics"
|
|
3216
3243
|
]);
|
|
@@ -3223,24 +3250,24 @@ function generateAssertionName(typeValue, rawEvaluator) {
|
|
|
3223
3250
|
switch (typeValue) {
|
|
3224
3251
|
case "contains":
|
|
3225
3252
|
return value ? `contains-${value}` : "contains";
|
|
3226
|
-
case "
|
|
3227
|
-
return arrayValue ? `
|
|
3228
|
-
case "
|
|
3229
|
-
return arrayValue ? `
|
|
3253
|
+
case "contains-any":
|
|
3254
|
+
return arrayValue ? `contains-any-${arrayValue.length}` : "contains-any";
|
|
3255
|
+
case "contains-all":
|
|
3256
|
+
return arrayValue ? `contains-all-${arrayValue.length}` : "contains-all";
|
|
3230
3257
|
case "icontains":
|
|
3231
3258
|
return value ? `icontains-${value}` : "icontains";
|
|
3232
|
-
case "
|
|
3233
|
-
return arrayValue ? `
|
|
3234
|
-
case "
|
|
3235
|
-
return arrayValue ? `
|
|
3236
|
-
case "
|
|
3237
|
-
return value ? `
|
|
3238
|
-
case "
|
|
3239
|
-
return value ? `
|
|
3259
|
+
case "icontains-any":
|
|
3260
|
+
return arrayValue ? `icontains-any-${arrayValue.length}` : "icontains-any";
|
|
3261
|
+
case "icontains-all":
|
|
3262
|
+
return arrayValue ? `icontains-all-${arrayValue.length}` : "icontains-all";
|
|
3263
|
+
case "starts-with":
|
|
3264
|
+
return value ? `starts-with-${value}` : "starts-with";
|
|
3265
|
+
case "ends-with":
|
|
3266
|
+
return value ? `ends-with-${value}` : "ends-with";
|
|
3240
3267
|
case "regex":
|
|
3241
3268
|
return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
|
|
3242
|
-
case "
|
|
3243
|
-
return "
|
|
3269
|
+
case "is-json":
|
|
3270
|
+
return "is-json";
|
|
3244
3271
|
case "equals":
|
|
3245
3272
|
return value ? `equals-${value}` : "equals";
|
|
3246
3273
|
case "rubrics":
|
|
@@ -3253,8 +3280,9 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
3253
3280
|
if (typeof candidate !== "string") {
|
|
3254
3281
|
return void 0;
|
|
3255
3282
|
}
|
|
3256
|
-
|
|
3257
|
-
|
|
3283
|
+
const normalized = normalizeEvaluatorType(candidate);
|
|
3284
|
+
if (isEvaluatorKind(normalized)) {
|
|
3285
|
+
return normalized;
|
|
3258
3286
|
}
|
|
3259
3287
|
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
3260
3288
|
return void 0;
|
|
@@ -3300,6 +3328,16 @@ function parseCommandToArgv(command) {
|
|
|
3300
3328
|
function isJsonObject2(value) {
|
|
3301
3329
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
3302
3330
|
}
|
|
3331
|
+
var CRITERIA_CONSUMER_TYPES = /* @__PURE__ */ new Set(["llm-judge", "agent-judge", "code-judge"]);
|
|
3332
|
+
function warnUnconsumedCriteria(criteria, evaluators, testId) {
|
|
3333
|
+
if (!criteria?.trim() || !evaluators || evaluators.length === 0) return;
|
|
3334
|
+
const hasConsumer = evaluators.some((e) => CRITERIA_CONSUMER_TYPES.has(e.type));
|
|
3335
|
+
if (!hasConsumer) {
|
|
3336
|
+
logWarning2(
|
|
3337
|
+
`Test '${testId}': criteria is defined but no evaluator in assert will evaluate it. Add 'type: llm-judge' to assert, or remove criteria if it is documentation-only.`
|
|
3338
|
+
);
|
|
3339
|
+
}
|
|
3340
|
+
}
|
|
3303
3341
|
function logWarning2(message, details) {
|
|
3304
3342
|
if (details && details.length > 0) {
|
|
3305
3343
|
const detailBlock = details.join("\n");
|
|
@@ -3549,7 +3587,7 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
3549
3587
|
}
|
|
3550
3588
|
return {
|
|
3551
3589
|
name: "rubric",
|
|
3552
|
-
type: "
|
|
3590
|
+
type: "llm-judge",
|
|
3553
3591
|
rubrics: rubricItems
|
|
3554
3592
|
};
|
|
3555
3593
|
}
|
|
@@ -3934,7 +3972,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
3934
3972
|
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
3935
3973
|
const fallbackDataset = import_node_path6.default.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
3936
3974
|
const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
|
|
3937
|
-
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "
|
|
3975
|
+
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-judge";
|
|
3938
3976
|
const globalExecution = sidecar.execution;
|
|
3939
3977
|
if (verbose) {
|
|
3940
3978
|
console.log(`
|
|
@@ -4022,6 +4060,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4022
4060
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
4023
4061
|
}
|
|
4024
4062
|
}
|
|
4063
|
+
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
4025
4064
|
const userFilePaths = [];
|
|
4026
4065
|
for (const segment of inputSegments) {
|
|
4027
4066
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -4375,13 +4414,15 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
4375
4414
|
}
|
|
4376
4415
|
const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
|
|
4377
4416
|
const metadata = parseMetadata(parsed);
|
|
4417
|
+
const failOnError = extractFailOnError(parsed);
|
|
4378
4418
|
return {
|
|
4379
4419
|
tests,
|
|
4380
4420
|
trials: extractTrialsConfig(parsed),
|
|
4381
4421
|
targets: extractTargetsFromSuite(parsed),
|
|
4382
4422
|
cacheConfig: extractCacheConfig(parsed),
|
|
4383
4423
|
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
4384
|
-
...metadata !== void 0 && { metadata }
|
|
4424
|
+
...metadata !== void 0 && { metadata },
|
|
4425
|
+
...failOnError !== void 0 && { failOnError }
|
|
4385
4426
|
};
|
|
4386
4427
|
}
|
|
4387
4428
|
var loadEvalSuite = loadTestSuite;
|
|
@@ -4412,7 +4453,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4412
4453
|
const fallbackDataset = import_node_path8.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
4413
4454
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
4414
4455
|
const rawTestcases = resolveTests(suite);
|
|
4415
|
-
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "
|
|
4456
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-judge";
|
|
4416
4457
|
const evalFileDir = import_node_path8.default.dirname(absoluteTestPath);
|
|
4417
4458
|
let expandedTestcases;
|
|
4418
4459
|
if (typeof rawTestcases === "string") {
|
|
@@ -4509,6 +4550,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4509
4550
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
4510
4551
|
}
|
|
4511
4552
|
}
|
|
4553
|
+
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
4512
4554
|
const userFilePaths = [];
|
|
4513
4555
|
for (const segment of inputSegments) {
|
|
4514
4556
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -11701,7 +11743,7 @@ function toCamelCaseDeep(obj) {
|
|
|
11701
11743
|
// src/evaluation/evaluators/code-evaluator.ts
|
|
11702
11744
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
11703
11745
|
var CodeEvaluator = class {
|
|
11704
|
-
kind = "code";
|
|
11746
|
+
kind = "code-judge";
|
|
11705
11747
|
command;
|
|
11706
11748
|
cwd;
|
|
11707
11749
|
agentTimeoutMs;
|
|
@@ -11938,7 +11980,7 @@ var scoreRangeEvaluationSchema = import_zod4.z.object({
|
|
|
11938
11980
|
overall_reasoning: import_zod4.z.string().describe("Overall assessment summary (1-2 sentences)").optional()
|
|
11939
11981
|
});
|
|
11940
11982
|
var LlmJudgeEvaluator = class {
|
|
11941
|
-
kind = "
|
|
11983
|
+
kind = "llm-judge";
|
|
11942
11984
|
resolveJudgeProvider;
|
|
11943
11985
|
maxOutputTokens;
|
|
11944
11986
|
temperature;
|
|
@@ -11955,7 +11997,7 @@ var LlmJudgeEvaluator = class {
|
|
|
11955
11997
|
throw new Error("No judge provider available for LLM grading");
|
|
11956
11998
|
}
|
|
11957
11999
|
const config = context2.evaluator;
|
|
11958
|
-
if (config?.type === "
|
|
12000
|
+
if (config?.type === "llm-judge" && config.rubrics && config.rubrics.length > 0) {
|
|
11959
12001
|
return this.evaluateWithRubrics(context2, judgeProvider, config.rubrics);
|
|
11960
12002
|
}
|
|
11961
12003
|
return this.evaluateFreeform(context2, judgeProvider);
|
|
@@ -12029,7 +12071,7 @@ ${context2.fileChanges}`;
|
|
|
12029
12071
|
async evaluateWithRubrics(context2, judgeProvider, rubrics) {
|
|
12030
12072
|
if (!rubrics || rubrics.length === 0) {
|
|
12031
12073
|
throw new Error(
|
|
12032
|
-
`No rubrics found for evaluator "${context2.evaluator?.name ?? "
|
|
12074
|
+
`No rubrics found for evaluator "${context2.evaluator?.name ?? "llm-judge"}". Run "agentv generate rubrics" first.`
|
|
12033
12075
|
);
|
|
12034
12076
|
}
|
|
12035
12077
|
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
@@ -12365,9 +12407,9 @@ var CompositeEvaluator = class {
|
|
|
12365
12407
|
async aggregate(results, context2) {
|
|
12366
12408
|
const aggregator = this.config.aggregator;
|
|
12367
12409
|
switch (aggregator.type) {
|
|
12368
|
-
case "
|
|
12410
|
+
case "code-judge":
|
|
12369
12411
|
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
12370
|
-
case "
|
|
12412
|
+
case "llm-judge":
|
|
12371
12413
|
return this.runLlmAggregator(results, context2, aggregator);
|
|
12372
12414
|
case "threshold":
|
|
12373
12415
|
return this.runThreshold(results, aggregator.threshold);
|
|
@@ -12510,7 +12552,7 @@ var CompositeEvaluator = class {
|
|
|
12510
12552
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
12511
12553
|
reasoning,
|
|
12512
12554
|
evaluatorRawRequest: {
|
|
12513
|
-
aggregator: "
|
|
12555
|
+
aggregator: "code-judge",
|
|
12514
12556
|
script: scriptPath
|
|
12515
12557
|
},
|
|
12516
12558
|
scores
|
|
@@ -12525,7 +12567,7 @@ var CompositeEvaluator = class {
|
|
|
12525
12567
|
expectedAspectCount: 1,
|
|
12526
12568
|
reasoning: message,
|
|
12527
12569
|
evaluatorRawRequest: {
|
|
12528
|
-
aggregator: "
|
|
12570
|
+
aggregator: "code-judge",
|
|
12529
12571
|
script: scriptPath,
|
|
12530
12572
|
error: message
|
|
12531
12573
|
},
|
|
@@ -12556,7 +12598,7 @@ var CompositeEvaluator = class {
|
|
|
12556
12598
|
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
12557
12599
|
const systemPrompt = buildOutputSchema();
|
|
12558
12600
|
const evaluatorRawRequest = {
|
|
12559
|
-
aggregator: "
|
|
12601
|
+
aggregator: "llm-judge",
|
|
12560
12602
|
userPrompt,
|
|
12561
12603
|
systemPrompt,
|
|
12562
12604
|
target: judgeProvider.targetName
|
|
@@ -12668,7 +12710,7 @@ var CostEvaluator = class {
|
|
|
12668
12710
|
|
|
12669
12711
|
// src/evaluation/evaluators/execution-metrics.ts
|
|
12670
12712
|
var ExecutionMetricsEvaluator = class {
|
|
12671
|
-
kind = "
|
|
12713
|
+
kind = "execution-metrics";
|
|
12672
12714
|
config;
|
|
12673
12715
|
constructor(options) {
|
|
12674
12716
|
this.config = options.config;
|
|
@@ -12694,7 +12736,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12694
12736
|
expectedAspectCount: 1,
|
|
12695
12737
|
reasoning: "Execution metrics not available - no trace summary provided",
|
|
12696
12738
|
evaluatorRawRequest: {
|
|
12697
|
-
type: "
|
|
12739
|
+
type: "execution-metrics",
|
|
12698
12740
|
config: this.extractConfiguredThresholds(),
|
|
12699
12741
|
actual: null
|
|
12700
12742
|
}
|
|
@@ -12803,7 +12845,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12803
12845
|
if (actualMetrics.exploration_ratio !== void 0) {
|
|
12804
12846
|
reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
|
|
12805
12847
|
}
|
|
12806
|
-
const reasoning = reasoningParts.length > 0 ? `
|
|
12848
|
+
const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
|
|
12807
12849
|
return {
|
|
12808
12850
|
score,
|
|
12809
12851
|
verdict: scoreToVerdict(score),
|
|
@@ -12812,7 +12854,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12812
12854
|
expectedAspectCount: totalChecks || 1,
|
|
12813
12855
|
reasoning,
|
|
12814
12856
|
evaluatorRawRequest: {
|
|
12815
|
-
type: "
|
|
12857
|
+
type: "execution-metrics",
|
|
12816
12858
|
config: this.extractConfiguredThresholds(),
|
|
12817
12859
|
actual: this.filterDefinedMetrics(actualMetrics)
|
|
12818
12860
|
}
|
|
@@ -12900,7 +12942,7 @@ var MONTH_NAMES = {
|
|
|
12900
12942
|
december: 11
|
|
12901
12943
|
};
|
|
12902
12944
|
var FieldAccuracyEvaluator = class {
|
|
12903
|
-
kind = "
|
|
12945
|
+
kind = "field-accuracy";
|
|
12904
12946
|
config;
|
|
12905
12947
|
constructor(options) {
|
|
12906
12948
|
this.config = options.config;
|
|
@@ -13354,7 +13396,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
|
13354
13396
|
".dylib"
|
|
13355
13397
|
]);
|
|
13356
13398
|
var AgentJudgeEvaluator = class {
|
|
13357
|
-
kind = "
|
|
13399
|
+
kind = "agent-judge";
|
|
13358
13400
|
resolveJudgeProvider;
|
|
13359
13401
|
maxSteps;
|
|
13360
13402
|
temperature;
|
|
@@ -13379,24 +13421,24 @@ var AgentJudgeEvaluator = class {
|
|
|
13379
13421
|
async evaluateBuiltIn(context2) {
|
|
13380
13422
|
const judgeProvider = await this.resolveJudgeProvider(context2);
|
|
13381
13423
|
if (!judgeProvider) {
|
|
13382
|
-
throw new Error("No judge provider available for
|
|
13424
|
+
throw new Error("No judge provider available for agent-judge evaluation");
|
|
13383
13425
|
}
|
|
13384
13426
|
const model = judgeProvider.asLanguageModel?.();
|
|
13385
13427
|
if (!model) {
|
|
13386
13428
|
throw new Error(
|
|
13387
|
-
`Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in
|
|
13429
|
+
`Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent-judge mode`
|
|
13388
13430
|
);
|
|
13389
13431
|
}
|
|
13390
13432
|
const workspacePath = context2.workspacePath;
|
|
13391
13433
|
if (!workspacePath) {
|
|
13392
13434
|
throw new Error(
|
|
13393
|
-
"
|
|
13435
|
+
"agent-judge evaluator requires a workspace_template target (workspacePath is not set)"
|
|
13394
13436
|
);
|
|
13395
13437
|
}
|
|
13396
13438
|
const systemPrompt = this.buildSystemPrompt(context2);
|
|
13397
13439
|
const userPrompt = this.buildUserPrompt(context2);
|
|
13398
13440
|
const config = context2.evaluator;
|
|
13399
|
-
const rubrics = config?.type === "
|
|
13441
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
13400
13442
|
const fsTools = createFilesystemTools(workspacePath);
|
|
13401
13443
|
const evaluatorRawRequest = {
|
|
13402
13444
|
mode: "built-in",
|
|
@@ -13427,7 +13469,7 @@ var AgentJudgeEvaluator = class {
|
|
|
13427
13469
|
score: 0,
|
|
13428
13470
|
verdict: "fail",
|
|
13429
13471
|
hits: [],
|
|
13430
|
-
misses: [`
|
|
13472
|
+
misses: [`agent-judge built-in evaluation failed: ${message}`],
|
|
13431
13473
|
expectedAspectCount: 1,
|
|
13432
13474
|
evaluatorRawRequest,
|
|
13433
13475
|
details: { mode: "built-in", error: message }
|
|
@@ -13459,14 +13501,14 @@ var AgentJudgeEvaluator = class {
|
|
|
13459
13501
|
score: 0,
|
|
13460
13502
|
verdict: "fail",
|
|
13461
13503
|
hits: [],
|
|
13462
|
-
misses: ["
|
|
13504
|
+
misses: ["agent-judge judge_target returned no assistant response"],
|
|
13463
13505
|
expectedAspectCount: 1,
|
|
13464
13506
|
evaluatorRawRequest,
|
|
13465
13507
|
details: { mode: "judge_target", judge_target: provider.targetName }
|
|
13466
13508
|
};
|
|
13467
13509
|
}
|
|
13468
13510
|
const config = context2.evaluator;
|
|
13469
|
-
const rubrics = config?.type === "
|
|
13511
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
13470
13512
|
const details = {
|
|
13471
13513
|
mode: "judge_target",
|
|
13472
13514
|
judge_target: provider.targetName
|
|
@@ -13478,7 +13520,7 @@ var AgentJudgeEvaluator = class {
|
|
|
13478
13520
|
score: 0,
|
|
13479
13521
|
verdict: "fail",
|
|
13480
13522
|
hits: [],
|
|
13481
|
-
misses: [`
|
|
13523
|
+
misses: [`agent-judge judge_target evaluation failed: ${message}`],
|
|
13482
13524
|
expectedAspectCount: 1,
|
|
13483
13525
|
evaluatorRawRequest,
|
|
13484
13526
|
details: {
|
|
@@ -13529,7 +13571,7 @@ var AgentJudgeEvaluator = class {
|
|
|
13529
13571
|
score: 0,
|
|
13530
13572
|
verdict: "fail",
|
|
13531
13573
|
hits: [],
|
|
13532
|
-
misses: ["Failed to parse
|
|
13574
|
+
misses: ["Failed to parse agent-judge response as valid evaluation JSON"],
|
|
13533
13575
|
expectedAspectCount: 1,
|
|
13534
13576
|
evaluatorRawRequest,
|
|
13535
13577
|
details
|
|
@@ -13542,7 +13584,7 @@ var AgentJudgeEvaluator = class {
|
|
|
13542
13584
|
*/
|
|
13543
13585
|
buildSystemPrompt(context2) {
|
|
13544
13586
|
const config = context2.evaluator;
|
|
13545
|
-
const rubrics = config?.type === "
|
|
13587
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
13546
13588
|
const parts = [
|
|
13547
13589
|
"You are an expert evaluator with access to the workspace filesystem.",
|
|
13548
13590
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
@@ -13573,7 +13615,7 @@ var AgentJudgeEvaluator = class {
|
|
|
13573
13615
|
return substituteVariables(this.evaluatorTemplate, variables);
|
|
13574
13616
|
}
|
|
13575
13617
|
const config = context2.evaluator;
|
|
13576
|
-
const rubrics = config?.type === "
|
|
13618
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
13577
13619
|
const parts = [
|
|
13578
13620
|
"Evaluate the candidate answer by investigating the workspace.",
|
|
13579
13621
|
"",
|
|
@@ -13616,7 +13658,7 @@ var AgentJudgeEvaluator = class {
|
|
|
13616
13658
|
buildDelegatedPrompt(context2) {
|
|
13617
13659
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
13618
13660
|
const config = context2.evaluator;
|
|
13619
|
-
const rubrics = config?.type === "
|
|
13661
|
+
const rubrics = config?.type === "agent-judge" ? config.rubrics : void 0;
|
|
13620
13662
|
if (this.evaluatorTemplate) {
|
|
13621
13663
|
const variables = {
|
|
13622
13664
|
[TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
|
|
@@ -13698,11 +13740,11 @@ function createFilesystemTools(workspacePath) {
|
|
|
13698
13740
|
execute: async (input) => {
|
|
13699
13741
|
try {
|
|
13700
13742
|
const resolved = resolveSandboxed(workspacePath, input.path);
|
|
13701
|
-
const
|
|
13702
|
-
if (
|
|
13743
|
+
const stat8 = await import_promises25.default.stat(resolved);
|
|
13744
|
+
if (stat8.isDirectory()) {
|
|
13703
13745
|
return { error: `'${input.path}' is a directory, not a file` };
|
|
13704
13746
|
}
|
|
13705
|
-
const buffer = Buffer.alloc(Math.min(
|
|
13747
|
+
const buffer = Buffer.alloc(Math.min(stat8.size, MAX_FILE_SIZE));
|
|
13706
13748
|
const fd = await import_promises25.default.open(resolved, "r");
|
|
13707
13749
|
try {
|
|
13708
13750
|
await fd.read(buffer, 0, buffer.length, 0);
|
|
@@ -13710,8 +13752,8 @@ function createFilesystemTools(workspacePath) {
|
|
|
13710
13752
|
await fd.close();
|
|
13711
13753
|
}
|
|
13712
13754
|
const content = buffer.toString("utf-8");
|
|
13713
|
-
const truncated =
|
|
13714
|
-
return { content, truncated, size:
|
|
13755
|
+
const truncated = stat8.size > MAX_FILE_SIZE;
|
|
13756
|
+
return { content, truncated, size: stat8.size };
|
|
13715
13757
|
} catch (error) {
|
|
13716
13758
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
13717
13759
|
}
|
|
@@ -13755,8 +13797,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
13755
13797
|
const ext = import_node_path33.default.extname(entry.name).toLowerCase();
|
|
13756
13798
|
if (BINARY_EXTENSIONS.has(ext)) continue;
|
|
13757
13799
|
try {
|
|
13758
|
-
const
|
|
13759
|
-
if (
|
|
13800
|
+
const stat8 = await import_promises25.default.stat(fullPath);
|
|
13801
|
+
if (stat8.size > MAX_FILE_SIZE) continue;
|
|
13760
13802
|
const content = await import_promises25.default.readFile(fullPath, "utf-8");
|
|
13761
13803
|
const lines = content.split("\n");
|
|
13762
13804
|
for (let i = 0; i < lines.length; i++) {
|
|
@@ -13918,7 +13960,7 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
13918
13960
|
|
|
13919
13961
|
// src/evaluation/evaluators/token-usage.ts
|
|
13920
13962
|
var TokenUsageEvaluator = class {
|
|
13921
|
-
kind = "
|
|
13963
|
+
kind = "token-usage";
|
|
13922
13964
|
config;
|
|
13923
13965
|
constructor(options) {
|
|
13924
13966
|
this.config = options.config;
|
|
@@ -13941,7 +13983,7 @@ var TokenUsageEvaluator = class {
|
|
|
13941
13983
|
expectedAspectCount,
|
|
13942
13984
|
reasoning: "Token usage not reported by provider",
|
|
13943
13985
|
evaluatorRawRequest: {
|
|
13944
|
-
type: "
|
|
13986
|
+
type: "token-usage",
|
|
13945
13987
|
max_total: maxTotal ?? null,
|
|
13946
13988
|
max_input: maxInput ?? null,
|
|
13947
13989
|
max_output: maxOutput ?? null,
|
|
@@ -13983,9 +14025,9 @@ var TokenUsageEvaluator = class {
|
|
|
13983
14025
|
hits,
|
|
13984
14026
|
misses,
|
|
13985
14027
|
expectedAspectCount,
|
|
13986
|
-
reasoning: `
|
|
14028
|
+
reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
|
|
13987
14029
|
evaluatorRawRequest: {
|
|
13988
|
-
type: "
|
|
14030
|
+
type: "token-usage",
|
|
13989
14031
|
max_total: maxTotal ?? null,
|
|
13990
14032
|
max_input: maxInput ?? null,
|
|
13991
14033
|
max_output: maxOutput ?? null,
|
|
@@ -14070,7 +14112,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
|
|
|
14070
14112
|
};
|
|
14071
14113
|
}
|
|
14072
14114
|
var ToolTrajectoryEvaluator = class {
|
|
14073
|
-
kind = "
|
|
14115
|
+
kind = "tool-trajectory";
|
|
14074
14116
|
config;
|
|
14075
14117
|
constructor(options) {
|
|
14076
14118
|
this.config = options.config;
|
|
@@ -14258,7 +14300,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
14258
14300
|
}
|
|
14259
14301
|
}
|
|
14260
14302
|
for (const warning of warnings) {
|
|
14261
|
-
console.warn(`[
|
|
14303
|
+
console.warn(`[tool-trajectory] ${warning}`);
|
|
14262
14304
|
}
|
|
14263
14305
|
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
14264
14306
|
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
@@ -14334,7 +14376,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
14334
14376
|
misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
|
|
14335
14377
|
}
|
|
14336
14378
|
for (const warning of warnings) {
|
|
14337
|
-
console.warn(`[
|
|
14379
|
+
console.warn(`[tool-trajectory] ${warning}`);
|
|
14338
14380
|
}
|
|
14339
14381
|
const effectiveLatencyAssertions = latencyAssertionCount - latencySkips;
|
|
14340
14382
|
const totalAssertions = expected.length + effectiveLatencyAssertions;
|
|
@@ -14824,7 +14866,7 @@ var llmJudgeFactory = (config, context2) => {
|
|
|
14824
14866
|
const c = config;
|
|
14825
14867
|
const { llmJudge, agentTimeoutMs } = context2;
|
|
14826
14868
|
return {
|
|
14827
|
-
kind: "
|
|
14869
|
+
kind: "llm-judge",
|
|
14828
14870
|
async evaluate(evalContext) {
|
|
14829
14871
|
const customPrompt = await resolveCustomPrompt(
|
|
14830
14872
|
c,
|
|
@@ -14913,7 +14955,7 @@ var agentJudgeFactory = (config, context2) => {
|
|
|
14913
14955
|
customPrompt = (0, import_node_fs9.readFileSync)(c.resolvedPromptPath, "utf-8");
|
|
14914
14956
|
} catch (error) {
|
|
14915
14957
|
const message = error instanceof Error ? error.message : String(error);
|
|
14916
|
-
console.warn(`Could not read
|
|
14958
|
+
console.warn(`Could not read agent-judge prompt at ${c.resolvedPromptPath}: ${message}`);
|
|
14917
14959
|
}
|
|
14918
14960
|
} else if (c.prompt) {
|
|
14919
14961
|
customPrompt = c.prompt;
|
|
@@ -14923,7 +14965,7 @@ var agentJudgeFactory = (config, context2) => {
|
|
|
14923
14965
|
judgeTargetProvider = targetResolver(c.target);
|
|
14924
14966
|
if (!judgeTargetProvider) {
|
|
14925
14967
|
throw new Error(
|
|
14926
|
-
`
|
|
14968
|
+
`agent-judge evaluator '${c.name}': target '${c.target}' not found in targets`
|
|
14927
14969
|
);
|
|
14928
14970
|
}
|
|
14929
14971
|
}
|
|
@@ -14967,7 +15009,7 @@ var regexFactory = (config) => {
|
|
|
14967
15009
|
});
|
|
14968
15010
|
};
|
|
14969
15011
|
var isJsonFactory = () => {
|
|
14970
|
-
return new DeterministicAssertionEvaluator("
|
|
15012
|
+
return new DeterministicAssertionEvaluator("is-json", (ctx) => {
|
|
14971
15013
|
const result = runIsJsonAssertion(ctx.candidate);
|
|
14972
15014
|
return {
|
|
14973
15015
|
score: result.score,
|
|
@@ -14995,7 +15037,7 @@ var equalsFactory = (config) => {
|
|
|
14995
15037
|
};
|
|
14996
15038
|
var containsAnyFactory = (config) => {
|
|
14997
15039
|
const c = config;
|
|
14998
|
-
return new DeterministicAssertionEvaluator("
|
|
15040
|
+
return new DeterministicAssertionEvaluator("contains-any", (ctx) => {
|
|
14999
15041
|
const result = runContainsAnyAssertion(ctx.candidate, c.value);
|
|
15000
15042
|
return {
|
|
15001
15043
|
score: result.score,
|
|
@@ -15009,7 +15051,7 @@ var containsAnyFactory = (config) => {
|
|
|
15009
15051
|
};
|
|
15010
15052
|
var containsAllFactory = (config) => {
|
|
15011
15053
|
const c = config;
|
|
15012
|
-
return new DeterministicAssertionEvaluator("
|
|
15054
|
+
return new DeterministicAssertionEvaluator("contains-all", (ctx) => {
|
|
15013
15055
|
const result = runContainsAllAssertion(ctx.candidate, c.value);
|
|
15014
15056
|
return {
|
|
15015
15057
|
score: result.score,
|
|
@@ -15037,7 +15079,7 @@ var icontainsFactory = (config) => {
|
|
|
15037
15079
|
};
|
|
15038
15080
|
var icontainsAnyFactory = (config) => {
|
|
15039
15081
|
const c = config;
|
|
15040
|
-
return new DeterministicAssertionEvaluator("
|
|
15082
|
+
return new DeterministicAssertionEvaluator("icontains-any", (ctx) => {
|
|
15041
15083
|
const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
|
|
15042
15084
|
return {
|
|
15043
15085
|
score: result.score,
|
|
@@ -15051,7 +15093,7 @@ var icontainsAnyFactory = (config) => {
|
|
|
15051
15093
|
};
|
|
15052
15094
|
var icontainsAllFactory = (config) => {
|
|
15053
15095
|
const c = config;
|
|
15054
|
-
return new DeterministicAssertionEvaluator("
|
|
15096
|
+
return new DeterministicAssertionEvaluator("icontains-all", (ctx) => {
|
|
15055
15097
|
const result = runIcontainsAllAssertion(ctx.candidate, c.value);
|
|
15056
15098
|
return {
|
|
15057
15099
|
score: result.score,
|
|
@@ -15065,7 +15107,7 @@ var icontainsAllFactory = (config) => {
|
|
|
15065
15107
|
};
|
|
15066
15108
|
var startsWithFactory = (config) => {
|
|
15067
15109
|
const c = config;
|
|
15068
|
-
return new DeterministicAssertionEvaluator("
|
|
15110
|
+
return new DeterministicAssertionEvaluator("starts-with", (ctx) => {
|
|
15069
15111
|
const result = runStartsWithAssertion(ctx.candidate, c.value);
|
|
15070
15112
|
return {
|
|
15071
15113
|
score: result.score,
|
|
@@ -15079,7 +15121,7 @@ var startsWithFactory = (config) => {
|
|
|
15079
15121
|
};
|
|
15080
15122
|
var endsWithFactory = (config) => {
|
|
15081
15123
|
const c = config;
|
|
15082
|
-
return new DeterministicAssertionEvaluator("
|
|
15124
|
+
return new DeterministicAssertionEvaluator("ends-with", (ctx) => {
|
|
15083
15125
|
const result = runEndsWithAssertion(ctx.candidate, c.value);
|
|
15084
15126
|
return {
|
|
15085
15127
|
score: result.score,
|
|
@@ -15093,7 +15135,7 @@ var endsWithFactory = (config) => {
|
|
|
15093
15135
|
};
|
|
15094
15136
|
function createBuiltinRegistry() {
|
|
15095
15137
|
const registry = new EvaluatorRegistry();
|
|
15096
|
-
registry.register("
|
|
15138
|
+
registry.register("llm-judge", llmJudgeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("agent-judge", agentJudgeFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory);
|
|
15097
15139
|
return registry;
|
|
15098
15140
|
}
|
|
15099
15141
|
|
|
@@ -15780,7 +15822,8 @@ async function runEvaluation(options) {
|
|
|
15780
15822
|
cleanupWorkspaces,
|
|
15781
15823
|
trials,
|
|
15782
15824
|
streamCallbacks,
|
|
15783
|
-
totalBudgetUsd
|
|
15825
|
+
totalBudgetUsd,
|
|
15826
|
+
failOnError
|
|
15784
15827
|
} = options;
|
|
15785
15828
|
let useCache = options.useCache;
|
|
15786
15829
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -15838,7 +15881,7 @@ async function runEvaluation(options) {
|
|
|
15838
15881
|
};
|
|
15839
15882
|
if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
|
|
15840
15883
|
throw new Error(
|
|
15841
|
-
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g.,
|
|
15884
|
+
`Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure-base).`
|
|
15842
15885
|
);
|
|
15843
15886
|
}
|
|
15844
15887
|
const targetResolver = (name) => {
|
|
@@ -15909,7 +15952,7 @@ async function runEvaluation(options) {
|
|
|
15909
15952
|
const rawTemplate = suiteWorkspace?.template ?? getWorkspaceTemplate(target);
|
|
15910
15953
|
const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
|
|
15911
15954
|
const workspaceTemplate = resolvedTemplate?.dir;
|
|
15912
|
-
|
|
15955
|
+
let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
|
|
15913
15956
|
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
15914
15957
|
const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
15915
15958
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
@@ -15930,6 +15973,14 @@ async function runEvaluation(options) {
|
|
|
15930
15973
|
const message = error instanceof Error ? error.message : String(error);
|
|
15931
15974
|
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
15932
15975
|
}
|
|
15976
|
+
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
15977
|
+
const copiedWorkspaceFile = import_node_path40.default.join(sharedWorkspacePath, import_node_path40.default.basename(suiteWorkspaceFile));
|
|
15978
|
+
try {
|
|
15979
|
+
await (0, import_promises29.stat)(copiedWorkspaceFile);
|
|
15980
|
+
suiteWorkspaceFile = copiedWorkspaceFile;
|
|
15981
|
+
} catch {
|
|
15982
|
+
}
|
|
15983
|
+
}
|
|
15933
15984
|
} else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
|
|
15934
15985
|
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
15935
15986
|
await (0, import_promises29.mkdir)(sharedWorkspacePath, { recursive: true });
|
|
@@ -15976,6 +16027,7 @@ async function runEvaluation(options) {
|
|
|
15976
16027
|
let beforeAllOutputAttached = false;
|
|
15977
16028
|
let cumulativeBudgetCost = 0;
|
|
15978
16029
|
let budgetExhausted = false;
|
|
16030
|
+
let failOnErrorTriggered = false;
|
|
15979
16031
|
const promises = filteredEvalCases.map(
|
|
15980
16032
|
(evalCase) => limit(async () => {
|
|
15981
16033
|
const workerId = nextWorkerId++;
|
|
@@ -16014,6 +16066,37 @@ async function runEvaluation(options) {
|
|
|
16014
16066
|
}
|
|
16015
16067
|
return budgetResult;
|
|
16016
16068
|
}
|
|
16069
|
+
if (failOnError === true && failOnErrorTriggered) {
|
|
16070
|
+
const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
|
|
16071
|
+
const haltResult = {
|
|
16072
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
16073
|
+
testId: evalCase.id,
|
|
16074
|
+
dataset: evalCase.dataset,
|
|
16075
|
+
score: 0,
|
|
16076
|
+
hits: [],
|
|
16077
|
+
misses: [],
|
|
16078
|
+
answer: "",
|
|
16079
|
+
target: target.name,
|
|
16080
|
+
error: errorMsg,
|
|
16081
|
+
executionStatus: "execution_error",
|
|
16082
|
+
failureStage: "setup",
|
|
16083
|
+
failureReasonCode: "error_threshold_exceeded",
|
|
16084
|
+
executionError: { message: errorMsg, stage: "setup" }
|
|
16085
|
+
};
|
|
16086
|
+
if (onProgress) {
|
|
16087
|
+
await onProgress({
|
|
16088
|
+
workerId,
|
|
16089
|
+
testId: evalCase.id,
|
|
16090
|
+
status: "failed",
|
|
16091
|
+
completedAt: Date.now(),
|
|
16092
|
+
error: haltResult.error
|
|
16093
|
+
});
|
|
16094
|
+
}
|
|
16095
|
+
if (onResult) {
|
|
16096
|
+
await onResult(haltResult);
|
|
16097
|
+
}
|
|
16098
|
+
return haltResult;
|
|
16099
|
+
}
|
|
16017
16100
|
if (onProgress) {
|
|
16018
16101
|
await onProgress({
|
|
16019
16102
|
workerId,
|
|
@@ -16066,6 +16149,9 @@ async function runEvaluation(options) {
|
|
|
16066
16149
|
}
|
|
16067
16150
|
}
|
|
16068
16151
|
}
|
|
16152
|
+
if (failOnError === true && result.executionStatus === "execution_error") {
|
|
16153
|
+
failOnErrorTriggered = true;
|
|
16154
|
+
}
|
|
16069
16155
|
if (beforeAllOutput && !beforeAllOutputAttached) {
|
|
16070
16156
|
result = { ...result, beforeAllOutput };
|
|
16071
16157
|
beforeAllOutputAttached = true;
|
|
@@ -16373,6 +16459,14 @@ async function runEvalCase(options) {
|
|
|
16373
16459
|
"template_error"
|
|
16374
16460
|
);
|
|
16375
16461
|
}
|
|
16462
|
+
if (caseWorkspaceFile && workspacePath) {
|
|
16463
|
+
const copiedFile = import_node_path40.default.join(workspacePath, import_node_path40.default.basename(caseWorkspaceFile));
|
|
16464
|
+
try {
|
|
16465
|
+
await (0, import_promises29.stat)(copiedFile);
|
|
16466
|
+
caseWorkspaceFile = copiedFile;
|
|
16467
|
+
} catch {
|
|
16468
|
+
}
|
|
16469
|
+
}
|
|
16376
16470
|
}
|
|
16377
16471
|
if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
16378
16472
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
@@ -16882,8 +16976,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
16882
16976
|
workspacePath
|
|
16883
16977
|
});
|
|
16884
16978
|
}
|
|
16885
|
-
const evaluatorKind = evalCase.evaluator ?? "
|
|
16886
|
-
const activeEvaluator = evaluators[evaluatorKind] ?? evaluators
|
|
16979
|
+
const evaluatorKind = evalCase.evaluator ?? "llm-judge";
|
|
16980
|
+
const activeEvaluator = evaluators[evaluatorKind] ?? evaluators["llm-judge"];
|
|
16887
16981
|
if (!activeEvaluator) {
|
|
16888
16982
|
throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
|
|
16889
16983
|
}
|
|
@@ -16966,25 +17060,24 @@ async function runEvaluatorList(options) {
|
|
|
16966
17060
|
availableTargets,
|
|
16967
17061
|
agentTimeoutMs,
|
|
16968
17062
|
evalFileDir,
|
|
16969
|
-
llmJudge: evaluatorRegistry
|
|
17063
|
+
llmJudge: evaluatorRegistry["llm-judge"],
|
|
16970
17064
|
registry: typeRegistry
|
|
16971
17065
|
};
|
|
16972
17066
|
for (const evaluatorConfig of evaluators ?? []) {
|
|
16973
17067
|
try {
|
|
16974
17068
|
const evaluatorInstance = await typeRegistry.create(evaluatorConfig, dispatchContext);
|
|
16975
17069
|
const score2 = await evaluatorInstance.evaluate(evalContext);
|
|
16976
|
-
const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
|
|
16977
17070
|
const weight = evaluatorConfig.weight ?? 1;
|
|
16978
17071
|
scored.push({
|
|
16979
17072
|
score: score2,
|
|
16980
17073
|
name: evaluatorConfig.name,
|
|
16981
|
-
type:
|
|
17074
|
+
type: evaluatorConfig.type,
|
|
16982
17075
|
weight,
|
|
16983
17076
|
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
16984
17077
|
});
|
|
16985
17078
|
scores.push({
|
|
16986
17079
|
name: evaluatorConfig.name,
|
|
16987
|
-
type:
|
|
17080
|
+
type: evaluatorConfig.type,
|
|
16988
17081
|
score: score2.score,
|
|
16989
17082
|
weight,
|
|
16990
17083
|
verdict: score2.verdict,
|
|
@@ -17006,18 +17099,17 @@ async function runEvaluatorList(options) {
|
|
|
17006
17099
|
expectedAspectCount: 1,
|
|
17007
17100
|
reasoning: message
|
|
17008
17101
|
};
|
|
17009
|
-
const resultType = evaluatorConfig.type === "code" ? "code_judge" : evaluatorConfig.type;
|
|
17010
17102
|
const weight = evaluatorConfig.weight ?? 1;
|
|
17011
17103
|
scored.push({
|
|
17012
17104
|
score: fallbackScore,
|
|
17013
17105
|
name: evaluatorConfig.name ?? "unknown",
|
|
17014
|
-
type:
|
|
17106
|
+
type: evaluatorConfig.type ?? "llm-judge",
|
|
17015
17107
|
weight,
|
|
17016
17108
|
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
17017
17109
|
});
|
|
17018
17110
|
scores.push({
|
|
17019
17111
|
name: evaluatorConfig.name ?? "unknown",
|
|
17020
|
-
type:
|
|
17112
|
+
type: evaluatorConfig.type ?? "llm-judge",
|
|
17021
17113
|
score: 0,
|
|
17022
17114
|
weight,
|
|
17023
17115
|
verdict: "fail",
|
|
@@ -17078,7 +17170,7 @@ function filterEvalCases(evalCases, filter) {
|
|
|
17078
17170
|
return evalCases.filter((evalCase) => import_micromatch4.default.isMatch(evalCase.id, filter));
|
|
17079
17171
|
}
|
|
17080
17172
|
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
17081
|
-
const llmJudge = overrides?.
|
|
17173
|
+
const llmJudge = overrides?.["llm-judge"] ?? new LlmJudgeEvaluator({
|
|
17082
17174
|
resolveJudgeProvider: async (context2) => {
|
|
17083
17175
|
if (context2.judgeProvider) {
|
|
17084
17176
|
return context2.judgeProvider;
|
|
@@ -17088,7 +17180,7 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
17088
17180
|
});
|
|
17089
17181
|
return {
|
|
17090
17182
|
...overrides,
|
|
17091
|
-
|
|
17183
|
+
"llm-judge": llmJudge
|
|
17092
17184
|
};
|
|
17093
17185
|
}
|
|
17094
17186
|
async function invokeProvider(provider, options) {
|
|
@@ -17348,12 +17440,7 @@ async function evaluate(config) {
|
|
|
17348
17440
|
};
|
|
17349
17441
|
}
|
|
17350
17442
|
function mapAssertionType(type) {
|
|
17351
|
-
|
|
17352
|
-
case "code_judge":
|
|
17353
|
-
return "code";
|
|
17354
|
-
default:
|
|
17355
|
-
return type;
|
|
17356
|
-
}
|
|
17443
|
+
return type.replace(/_/g, "-");
|
|
17357
17444
|
}
|
|
17358
17445
|
function computeSummary(results, durationMs) {
|
|
17359
17446
|
const total = results.length;
|
|
@@ -18132,6 +18219,7 @@ function createAgentKernel() {
|
|
|
18132
18219
|
executeWorkspaceScript,
|
|
18133
18220
|
explorationRatio,
|
|
18134
18221
|
extractCacheConfig,
|
|
18222
|
+
extractFailOnError,
|
|
18135
18223
|
extractJsonBlob,
|
|
18136
18224
|
extractTargetFromSuite,
|
|
18137
18225
|
extractTargetsFromSuite,
|