@agentv/core 4.6.0 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-AIQ5FO4G.js → chunk-75RFVESM.js} +273 -125
- package/dist/chunk-75RFVESM.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +110 -95
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +30 -72
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1488 -517
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +291 -74
- package/dist/index.d.ts +291 -74
- package/dist/index.js +1187 -369
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-AIQ5FO4G.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -31,12 +31,9 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
31
31
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
32
32
|
|
|
33
33
|
// ../../node_modules/.bun/tsup@8.3.5+19811ebab77a7b1c/node_modules/tsup/assets/cjs_shims.js
|
|
34
|
-
var getImportMetaUrl, importMetaUrl;
|
|
35
34
|
var init_cjs_shims = __esm({
|
|
36
35
|
"../../node_modules/.bun/tsup@8.3.5+19811ebab77a7b1c/node_modules/tsup/assets/cjs_shims.js"() {
|
|
37
36
|
"use strict";
|
|
38
|
-
getImportMetaUrl = () => typeof document === "undefined" ? new URL(`file:${__filename}`).href : document.currentScript && document.currentScript.src || new URL("main.js", document.baseURI).href;
|
|
39
|
-
importMetaUrl = /* @__PURE__ */ getImportMetaUrl();
|
|
40
37
|
}
|
|
41
38
|
});
|
|
42
39
|
|
|
@@ -1435,6 +1432,7 @@ __export(index_exports, {
|
|
|
1435
1432
|
DEFAULT_EVALUATOR_TEMPLATE: () => DEFAULT_EVALUATOR_TEMPLATE,
|
|
1436
1433
|
DEFAULT_EVAL_PATTERNS: () => DEFAULT_EVAL_PATTERNS,
|
|
1437
1434
|
DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
|
|
1435
|
+
DEFAULT_THRESHOLD: () => DEFAULT_THRESHOLD,
|
|
1438
1436
|
DeterministicAssertionEvaluator: () => DeterministicAssertionEvaluator,
|
|
1439
1437
|
EvaluatorRegistry: () => EvaluatorRegistry,
|
|
1440
1438
|
ExecutionMetricsEvaluator: () => ExecutionMetricsEvaluator,
|
|
@@ -1456,6 +1454,7 @@ __export(index_exports, {
|
|
|
1456
1454
|
TemplateNotFoundError: () => TemplateNotFoundError,
|
|
1457
1455
|
TokenUsageEvaluator: () => TokenUsageEvaluator,
|
|
1458
1456
|
ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
|
|
1457
|
+
TranscriptProvider: () => TranscriptProvider,
|
|
1459
1458
|
WorkspaceCreationError: () => WorkspaceCreationError,
|
|
1460
1459
|
WorkspacePoolManager: () => WorkspacePoolManager,
|
|
1461
1460
|
addProject: () => addProject,
|
|
@@ -1492,6 +1491,7 @@ __export(index_exports, {
|
|
|
1492
1491
|
detectFormat: () => detectFormat,
|
|
1493
1492
|
discoverAssertions: () => discoverAssertions,
|
|
1494
1493
|
discoverClaudeSessions: () => discoverClaudeSessions,
|
|
1494
|
+
discoverCodexSessions: () => discoverCodexSessions,
|
|
1495
1495
|
discoverCopilotSessions: () => discoverCopilotSessions,
|
|
1496
1496
|
discoverGraders: () => discoverGraders,
|
|
1497
1497
|
discoverJudges: () => discoverGraders,
|
|
@@ -1552,6 +1552,8 @@ __export(index_exports, {
|
|
|
1552
1552
|
normalizeLineEndings: () => normalizeLineEndings,
|
|
1553
1553
|
parseAgentSkillsEvals: () => parseAgentSkillsEvals,
|
|
1554
1554
|
parseClaudeSession: () => parseClaudeSession,
|
|
1555
|
+
parseCodexSession: () => parseCodexSession,
|
|
1556
|
+
parseCopilotEvents: () => parseCopilotEvents,
|
|
1555
1557
|
parseJsonFromText: () => parseJsonFromText,
|
|
1556
1558
|
parseJsonSafe: () => parseJsonSafe,
|
|
1557
1559
|
readJsonFile: () => readJsonFile,
|
|
@@ -1559,8 +1561,10 @@ __export(index_exports, {
|
|
|
1559
1561
|
readTestSuiteMetadata: () => readTestSuiteMetadata,
|
|
1560
1562
|
readTextFile: () => readTextFile,
|
|
1561
1563
|
readTranscriptFile: () => readTranscriptFile,
|
|
1564
|
+
readTranscriptJsonl: () => readTranscriptJsonl,
|
|
1562
1565
|
removeProject: () => removeProject,
|
|
1563
1566
|
resolveAndCreateProvider: () => resolveAndCreateProvider,
|
|
1567
|
+
resolveDelegatedTargetDefinition: () => resolveDelegatedTargetDefinition,
|
|
1564
1568
|
resolveFileReference: () => resolveFileReference3,
|
|
1565
1569
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
1566
1570
|
resolveWorkspaceTemplate: () => resolveWorkspaceTemplate,
|
|
@@ -1590,6 +1594,7 @@ __export(index_exports, {
|
|
|
1590
1594
|
substituteVariables: () => substituteVariables,
|
|
1591
1595
|
toCamelCaseDeep: () => toCamelCaseDeep,
|
|
1592
1596
|
toSnakeCaseDeep: () => toSnakeCaseDeep,
|
|
1597
|
+
toTranscriptJsonLine: () => toTranscriptJsonLine,
|
|
1593
1598
|
tokensPerTool: () => tokensPerTool,
|
|
1594
1599
|
touchProject: () => touchProject,
|
|
1595
1600
|
transpileEvalYaml: () => transpileEvalYaml,
|
|
@@ -2674,8 +2679,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2674
2679
|
const negate = rawEvaluator.negate === true ? true : void 0;
|
|
2675
2680
|
if (isCustomType) {
|
|
2676
2681
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
2677
|
-
const required2 =
|
|
2678
|
-
|
|
2682
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
2683
|
+
rawEvaluator.required,
|
|
2684
|
+
rawEvaluator.min_score,
|
|
2685
|
+
name,
|
|
2686
|
+
evalId
|
|
2687
|
+
);
|
|
2688
|
+
const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "min_score", "negate"]);
|
|
2679
2689
|
const config2 = {};
|
|
2680
2690
|
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
2681
2691
|
if (!knownProps2.has(key) && value !== void 0) {
|
|
@@ -2687,6 +2697,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2687
2697
|
type: customTypeName,
|
|
2688
2698
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2689
2699
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
2700
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
2690
2701
|
...negate !== void 0 ? { negate } : {},
|
|
2691
2702
|
...Object.keys(config2).length > 0 ? { config: config2 } : {}
|
|
2692
2703
|
});
|
|
@@ -2756,7 +2767,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2756
2767
|
);
|
|
2757
2768
|
}
|
|
2758
2769
|
}
|
|
2759
|
-
const required2 =
|
|
2770
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
2771
|
+
rawEvaluator.required,
|
|
2772
|
+
rawEvaluator.min_score,
|
|
2773
|
+
name,
|
|
2774
|
+
evalId
|
|
2775
|
+
);
|
|
2760
2776
|
const knownProps2 = /* @__PURE__ */ new Set([
|
|
2761
2777
|
"name",
|
|
2762
2778
|
"type",
|
|
@@ -2782,6 +2798,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2782
2798
|
resolvedCwd,
|
|
2783
2799
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2784
2800
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
2801
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
2785
2802
|
...negate !== void 0 ? { negate } : {},
|
|
2786
2803
|
...Object.keys(config2).length > 0 ? { config: config2 } : {},
|
|
2787
2804
|
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
@@ -2910,7 +2927,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2910
2927
|
};
|
|
2911
2928
|
}
|
|
2912
2929
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
2913
|
-
const required2 =
|
|
2930
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
2931
|
+
rawEvaluator.required,
|
|
2932
|
+
rawEvaluator.min_score,
|
|
2933
|
+
name,
|
|
2934
|
+
evalId
|
|
2935
|
+
);
|
|
2914
2936
|
evaluators.push({
|
|
2915
2937
|
name,
|
|
2916
2938
|
type: "composite",
|
|
@@ -2918,6 +2940,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2918
2940
|
aggregator,
|
|
2919
2941
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2920
2942
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
2943
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
2921
2944
|
...negate !== void 0 ? { negate } : {}
|
|
2922
2945
|
});
|
|
2923
2946
|
continue;
|
|
@@ -3028,7 +3051,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3028
3051
|
continue;
|
|
3029
3052
|
}
|
|
3030
3053
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3031
|
-
const required2 =
|
|
3054
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3055
|
+
rawEvaluator.required,
|
|
3056
|
+
rawEvaluator.min_score,
|
|
3057
|
+
name,
|
|
3058
|
+
evalId
|
|
3059
|
+
);
|
|
3032
3060
|
const config2 = {
|
|
3033
3061
|
name,
|
|
3034
3062
|
type: "tool-trajectory",
|
|
@@ -3037,6 +3065,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3037
3065
|
...expected ? { expected } : {},
|
|
3038
3066
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3039
3067
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3068
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3040
3069
|
...negate !== void 0 ? { negate } : {},
|
|
3041
3070
|
...argsMatch2 !== void 0 ? { argsMatch: argsMatch2 } : {}
|
|
3042
3071
|
};
|
|
@@ -3099,7 +3128,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3099
3128
|
const aggregation = asString(rawEvaluator.aggregation);
|
|
3100
3129
|
const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
|
|
3101
3130
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3102
|
-
const required2 =
|
|
3131
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3132
|
+
rawEvaluator.required,
|
|
3133
|
+
rawEvaluator.min_score,
|
|
3134
|
+
name,
|
|
3135
|
+
evalId
|
|
3136
|
+
);
|
|
3103
3137
|
evaluators.push({
|
|
3104
3138
|
name,
|
|
3105
3139
|
type: "field-accuracy",
|
|
@@ -3107,6 +3141,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3107
3141
|
...validAggregation ? { aggregation: validAggregation } : {},
|
|
3108
3142
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3109
3143
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3144
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3110
3145
|
...negate !== void 0 ? { negate } : {}
|
|
3111
3146
|
});
|
|
3112
3147
|
continue;
|
|
@@ -3120,13 +3155,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3120
3155
|
continue;
|
|
3121
3156
|
}
|
|
3122
3157
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3123
|
-
const required2 =
|
|
3158
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3159
|
+
rawEvaluator.required,
|
|
3160
|
+
rawEvaluator.min_score,
|
|
3161
|
+
name,
|
|
3162
|
+
evalId
|
|
3163
|
+
);
|
|
3124
3164
|
evaluators.push({
|
|
3125
3165
|
name,
|
|
3126
3166
|
type: "latency",
|
|
3127
3167
|
threshold,
|
|
3128
3168
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3129
3169
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3170
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3130
3171
|
...negate !== void 0 ? { negate } : {}
|
|
3131
3172
|
});
|
|
3132
3173
|
continue;
|
|
@@ -3140,13 +3181,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3140
3181
|
continue;
|
|
3141
3182
|
}
|
|
3142
3183
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3143
|
-
const required2 =
|
|
3184
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3185
|
+
rawEvaluator.required,
|
|
3186
|
+
rawEvaluator.min_score,
|
|
3187
|
+
name,
|
|
3188
|
+
evalId
|
|
3189
|
+
);
|
|
3144
3190
|
evaluators.push({
|
|
3145
3191
|
name,
|
|
3146
3192
|
type: "cost",
|
|
3147
3193
|
budget,
|
|
3148
3194
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3149
3195
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3196
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3150
3197
|
...negate !== void 0 ? { negate } : {}
|
|
3151
3198
|
});
|
|
3152
3199
|
continue;
|
|
@@ -3178,13 +3225,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3178
3225
|
continue;
|
|
3179
3226
|
}
|
|
3180
3227
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3181
|
-
const required2 =
|
|
3228
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3229
|
+
rawEvaluator.required,
|
|
3230
|
+
rawEvaluator.min_score,
|
|
3231
|
+
name,
|
|
3232
|
+
evalId
|
|
3233
|
+
);
|
|
3182
3234
|
evaluators.push({
|
|
3183
3235
|
name,
|
|
3184
3236
|
type: "token-usage",
|
|
3185
3237
|
...validLimits,
|
|
3186
3238
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3187
3239
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3240
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3188
3241
|
...negate !== void 0 ? { negate } : {}
|
|
3189
3242
|
});
|
|
3190
3243
|
continue;
|
|
@@ -3230,13 +3283,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3230
3283
|
continue;
|
|
3231
3284
|
}
|
|
3232
3285
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3233
|
-
const required2 =
|
|
3286
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3287
|
+
rawEvaluator.required,
|
|
3288
|
+
rawEvaluator.min_score,
|
|
3289
|
+
name,
|
|
3290
|
+
evalId
|
|
3291
|
+
);
|
|
3234
3292
|
evaluators.push({
|
|
3235
3293
|
name,
|
|
3236
3294
|
type: "execution-metrics",
|
|
3237
3295
|
...validThresholds,
|
|
3238
3296
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3239
3297
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3298
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3240
3299
|
...negate !== void 0 ? { negate } : {}
|
|
3241
3300
|
});
|
|
3242
3301
|
continue;
|
|
@@ -3250,7 +3309,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3250
3309
|
const rawShouldTrigger = rawEvaluator.should_trigger;
|
|
3251
3310
|
const shouldTrigger = typeof rawShouldTrigger === "boolean" ? rawShouldTrigger : void 0;
|
|
3252
3311
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3253
|
-
const required2 =
|
|
3312
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3313
|
+
rawEvaluator.required,
|
|
3314
|
+
rawEvaluator.min_score,
|
|
3315
|
+
name,
|
|
3316
|
+
evalId
|
|
3317
|
+
);
|
|
3254
3318
|
evaluators.push({
|
|
3255
3319
|
name,
|
|
3256
3320
|
type: "skill-trigger",
|
|
@@ -3258,6 +3322,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3258
3322
|
...shouldTrigger !== void 0 ? { should_trigger: shouldTrigger } : {},
|
|
3259
3323
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3260
3324
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3325
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3261
3326
|
...negate !== void 0 ? { negate } : {}
|
|
3262
3327
|
});
|
|
3263
3328
|
continue;
|
|
@@ -3269,13 +3334,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3269
3334
|
continue;
|
|
3270
3335
|
}
|
|
3271
3336
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3272
|
-
const required2 =
|
|
3337
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3338
|
+
rawEvaluator.required,
|
|
3339
|
+
rawEvaluator.min_score,
|
|
3340
|
+
name,
|
|
3341
|
+
evalId
|
|
3342
|
+
);
|
|
3273
3343
|
evaluators.push({
|
|
3274
3344
|
name,
|
|
3275
3345
|
type: "contains",
|
|
3276
3346
|
value,
|
|
3277
3347
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3278
3348
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3349
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3279
3350
|
...negate !== void 0 ? { negate } : {}
|
|
3280
3351
|
});
|
|
3281
3352
|
continue;
|
|
@@ -3289,13 +3360,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3289
3360
|
continue;
|
|
3290
3361
|
}
|
|
3291
3362
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3292
|
-
const required2 =
|
|
3363
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3364
|
+
rawEvaluator.required,
|
|
3365
|
+
rawEvaluator.min_score,
|
|
3366
|
+
name,
|
|
3367
|
+
evalId
|
|
3368
|
+
);
|
|
3293
3369
|
evaluators.push({
|
|
3294
3370
|
name,
|
|
3295
3371
|
type: typeValue,
|
|
3296
3372
|
value,
|
|
3297
3373
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3298
3374
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3375
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3299
3376
|
...negate !== void 0 ? { negate } : {}
|
|
3300
3377
|
});
|
|
3301
3378
|
continue;
|
|
@@ -3307,13 +3384,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3307
3384
|
continue;
|
|
3308
3385
|
}
|
|
3309
3386
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3310
|
-
const required2 =
|
|
3387
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3388
|
+
rawEvaluator.required,
|
|
3389
|
+
rawEvaluator.min_score,
|
|
3390
|
+
name,
|
|
3391
|
+
evalId
|
|
3392
|
+
);
|
|
3311
3393
|
evaluators.push({
|
|
3312
3394
|
name,
|
|
3313
3395
|
type: "icontains",
|
|
3314
3396
|
value,
|
|
3315
3397
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3316
3398
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3399
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3317
3400
|
...negate !== void 0 ? { negate } : {}
|
|
3318
3401
|
});
|
|
3319
3402
|
continue;
|
|
@@ -3327,13 +3410,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3327
3410
|
continue;
|
|
3328
3411
|
}
|
|
3329
3412
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3330
|
-
const required2 =
|
|
3413
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3414
|
+
rawEvaluator.required,
|
|
3415
|
+
rawEvaluator.min_score,
|
|
3416
|
+
name,
|
|
3417
|
+
evalId
|
|
3418
|
+
);
|
|
3331
3419
|
evaluators.push({
|
|
3332
3420
|
name,
|
|
3333
3421
|
type: typeValue,
|
|
3334
3422
|
value,
|
|
3335
3423
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3336
3424
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3425
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3337
3426
|
...negate !== void 0 ? { negate } : {}
|
|
3338
3427
|
});
|
|
3339
3428
|
continue;
|
|
@@ -3345,13 +3434,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3345
3434
|
continue;
|
|
3346
3435
|
}
|
|
3347
3436
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3348
|
-
const required2 =
|
|
3437
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3438
|
+
rawEvaluator.required,
|
|
3439
|
+
rawEvaluator.min_score,
|
|
3440
|
+
name,
|
|
3441
|
+
evalId
|
|
3442
|
+
);
|
|
3349
3443
|
evaluators.push({
|
|
3350
3444
|
name,
|
|
3351
3445
|
type: typeValue,
|
|
3352
3446
|
value,
|
|
3353
3447
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3354
3448
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3449
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3355
3450
|
...negate !== void 0 ? { negate } : {}
|
|
3356
3451
|
});
|
|
3357
3452
|
continue;
|
|
@@ -3364,7 +3459,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3364
3459
|
}
|
|
3365
3460
|
const flags = asString(rawEvaluator.flags);
|
|
3366
3461
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3367
|
-
const required2 =
|
|
3462
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3463
|
+
rawEvaluator.required,
|
|
3464
|
+
rawEvaluator.min_score,
|
|
3465
|
+
name,
|
|
3466
|
+
evalId
|
|
3467
|
+
);
|
|
3368
3468
|
evaluators.push({
|
|
3369
3469
|
name,
|
|
3370
3470
|
type: "regex",
|
|
@@ -3372,18 +3472,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3372
3472
|
...flags !== void 0 ? { flags } : {},
|
|
3373
3473
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3374
3474
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3475
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3375
3476
|
...negate !== void 0 ? { negate } : {}
|
|
3376
3477
|
});
|
|
3377
3478
|
continue;
|
|
3378
3479
|
}
|
|
3379
3480
|
if (typeValue === "is-json") {
|
|
3380
3481
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3381
|
-
const required2 =
|
|
3482
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3483
|
+
rawEvaluator.required,
|
|
3484
|
+
rawEvaluator.min_score,
|
|
3485
|
+
name,
|
|
3486
|
+
evalId
|
|
3487
|
+
);
|
|
3382
3488
|
evaluators.push({
|
|
3383
3489
|
name,
|
|
3384
3490
|
type: "is-json",
|
|
3385
3491
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3386
3492
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3493
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3387
3494
|
...negate !== void 0 ? { negate } : {}
|
|
3388
3495
|
});
|
|
3389
3496
|
continue;
|
|
@@ -3395,13 +3502,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3395
3502
|
continue;
|
|
3396
3503
|
}
|
|
3397
3504
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3398
|
-
const required2 =
|
|
3505
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3506
|
+
rawEvaluator.required,
|
|
3507
|
+
rawEvaluator.min_score,
|
|
3508
|
+
name,
|
|
3509
|
+
evalId
|
|
3510
|
+
);
|
|
3399
3511
|
evaluators.push({
|
|
3400
3512
|
name,
|
|
3401
3513
|
type: "equals",
|
|
3402
3514
|
value,
|
|
3403
3515
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3404
3516
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3517
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3405
3518
|
...negate !== void 0 ? { negate } : {}
|
|
3406
3519
|
});
|
|
3407
3520
|
continue;
|
|
@@ -3437,7 +3550,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3437
3550
|
continue;
|
|
3438
3551
|
}
|
|
3439
3552
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3440
|
-
const required2 =
|
|
3553
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3554
|
+
rawEvaluator.required,
|
|
3555
|
+
rawEvaluator.min_score,
|
|
3556
|
+
name,
|
|
3557
|
+
evalId
|
|
3558
|
+
);
|
|
3441
3559
|
evaluators.push({
|
|
3442
3560
|
name,
|
|
3443
3561
|
type: "llm-grader",
|
|
@@ -3445,6 +3563,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3445
3563
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
3446
3564
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3447
3565
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3566
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3448
3567
|
...negate !== void 0 ? { negate } : {}
|
|
3449
3568
|
});
|
|
3450
3569
|
continue;
|
|
@@ -3514,7 +3633,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3514
3633
|
continue;
|
|
3515
3634
|
}
|
|
3516
3635
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3517
|
-
const required2 =
|
|
3636
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3637
|
+
rawEvaluator.required,
|
|
3638
|
+
rawEvaluator.min_score,
|
|
3639
|
+
name,
|
|
3640
|
+
evalId
|
|
3641
|
+
);
|
|
3518
3642
|
evaluators.push({
|
|
3519
3643
|
name,
|
|
3520
3644
|
type: "llm-grader",
|
|
@@ -3522,12 +3646,18 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3522
3646
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
3523
3647
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3524
3648
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3649
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3525
3650
|
...negate !== void 0 ? { negate } : {}
|
|
3526
3651
|
});
|
|
3527
3652
|
continue;
|
|
3528
3653
|
}
|
|
3529
3654
|
const weight = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3530
|
-
const required =
|
|
3655
|
+
const { required, min_score } = parseRequiredAndMinScore(
|
|
3656
|
+
rawEvaluator.required,
|
|
3657
|
+
rawEvaluator.min_score,
|
|
3658
|
+
name,
|
|
3659
|
+
evalId
|
|
3660
|
+
);
|
|
3531
3661
|
const knownProps = /* @__PURE__ */ new Set([
|
|
3532
3662
|
"name",
|
|
3533
3663
|
"type",
|
|
@@ -3538,6 +3668,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3538
3668
|
"weight",
|
|
3539
3669
|
"config",
|
|
3540
3670
|
"required",
|
|
3671
|
+
"min_score",
|
|
3541
3672
|
"negate",
|
|
3542
3673
|
"max_steps",
|
|
3543
3674
|
"maxSteps",
|
|
@@ -3567,6 +3698,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3567
3698
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
3568
3699
|
...weight !== void 0 ? { weight } : {},
|
|
3569
3700
|
...required !== void 0 ? { required } : {},
|
|
3701
|
+
...min_score !== void 0 ? { min_score } : {},
|
|
3570
3702
|
...negate !== void 0 ? { negate } : {},
|
|
3571
3703
|
...finalConfig ? { config: finalConfig } : {},
|
|
3572
3704
|
...llmMaxSteps !== void 0 ? { max_steps: llmMaxSteps } : {},
|
|
@@ -3698,10 +3830,23 @@ ${detailBlock}${ANSI_RESET5}`);
|
|
|
3698
3830
|
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET5}`);
|
|
3699
3831
|
}
|
|
3700
3832
|
}
|
|
3701
|
-
function
|
|
3702
|
-
|
|
3703
|
-
if (typeof
|
|
3704
|
-
|
|
3833
|
+
function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
|
|
3834
|
+
const result = {};
|
|
3835
|
+
if (typeof rawMinScore === "number" && rawMinScore > 0 && rawMinScore <= 1) {
|
|
3836
|
+
result.min_score = rawMinScore;
|
|
3837
|
+
}
|
|
3838
|
+
if (rawRequired === true) {
|
|
3839
|
+
result.required = true;
|
|
3840
|
+
} else if (typeof rawRequired === "number" && rawRequired > 0 && rawRequired <= 1) {
|
|
3841
|
+
if (result.min_score === void 0) {
|
|
3842
|
+
result.min_score = rawRequired;
|
|
3843
|
+
}
|
|
3844
|
+
result.required = rawRequired;
|
|
3845
|
+
logWarning2(
|
|
3846
|
+
`Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
|
|
3847
|
+
);
|
|
3848
|
+
}
|
|
3849
|
+
return result;
|
|
3705
3850
|
}
|
|
3706
3851
|
function validateWeight(rawWeight, evaluatorName, evalId) {
|
|
3707
3852
|
if (rawWeight === void 0) {
|
|
@@ -3744,16 +3889,30 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
3744
3889
|
const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
|
|
3745
3890
|
const expectedOutcome = asString(rawRubric.outcome) ?? "";
|
|
3746
3891
|
const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
|
|
3892
|
+
let minScore;
|
|
3747
3893
|
let requiredMinScore;
|
|
3748
3894
|
let required;
|
|
3749
|
-
if (typeof rawRubric.
|
|
3750
|
-
const
|
|
3751
|
-
if (
|
|
3895
|
+
if (typeof rawRubric.min_score === "number") {
|
|
3896
|
+
const ms = rawRubric.min_score;
|
|
3897
|
+
if (ms <= 0 || ms > 1) {
|
|
3752
3898
|
throw new Error(
|
|
3753
|
-
`Invalid
|
|
3899
|
+
`Invalid min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be in (0, 1] (got ${ms})`
|
|
3754
3900
|
);
|
|
3755
3901
|
}
|
|
3756
|
-
|
|
3902
|
+
minScore = ms;
|
|
3903
|
+
requiredMinScore = Math.round(ms * 10);
|
|
3904
|
+
} else if (typeof rawRubric.required_min_score === "number") {
|
|
3905
|
+
const rms = rawRubric.required_min_score;
|
|
3906
|
+
if (!Number.isInteger(rms) || rms < 0 || rms > 10) {
|
|
3907
|
+
throw new Error(
|
|
3908
|
+
`Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`
|
|
3909
|
+
);
|
|
3910
|
+
}
|
|
3911
|
+
requiredMinScore = rms;
|
|
3912
|
+
minScore = rms / 10;
|
|
3913
|
+
logWarning2(
|
|
3914
|
+
`Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. Use 'min_score: ${rms / 10}' (0-1 scale) instead.`
|
|
3915
|
+
);
|
|
3757
3916
|
}
|
|
3758
3917
|
if (typeof rawRubric.required === "boolean") {
|
|
3759
3918
|
required = rawRubric.required;
|
|
@@ -3773,6 +3932,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
3773
3932
|
weight,
|
|
3774
3933
|
...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
|
|
3775
3934
|
...required !== void 0 ? { required } : {},
|
|
3935
|
+
...minScore !== void 0 ? { min_score: minScore } : {},
|
|
3776
3936
|
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
|
|
3777
3937
|
score_ranges: scoreRanges
|
|
3778
3938
|
});
|
|
@@ -3789,6 +3949,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
3789
3949
|
weight,
|
|
3790
3950
|
// Default to required: true if not specified (backward compatibility)
|
|
3791
3951
|
required: required ?? true,
|
|
3952
|
+
...minScore !== void 0 ? { min_score: minScore } : {},
|
|
3792
3953
|
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
|
|
3793
3954
|
});
|
|
3794
3955
|
}
|
|
@@ -3917,12 +4078,22 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
3917
4078
|
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
3918
4079
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1
|
|
3919
4080
|
};
|
|
4081
|
+
let inlineMinScore;
|
|
4082
|
+
let inlineRequiredMinScore;
|
|
4083
|
+
if (typeof rubric.min_score === "number") {
|
|
4084
|
+
inlineMinScore = rubric.min_score;
|
|
4085
|
+
inlineRequiredMinScore = Math.round(inlineMinScore * 10);
|
|
4086
|
+
} else if (typeof rubric.required_min_score === "number") {
|
|
4087
|
+
inlineRequiredMinScore = rubric.required_min_score;
|
|
4088
|
+
inlineMinScore = inlineRequiredMinScore / 10;
|
|
4089
|
+
}
|
|
3920
4090
|
if (scoreRanges && scoreRanges.length > 0) {
|
|
3921
4091
|
return {
|
|
3922
4092
|
...baseRubric,
|
|
3923
4093
|
...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
|
|
3924
4094
|
...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
|
|
3925
|
-
...
|
|
4095
|
+
...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
|
|
4096
|
+
...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {},
|
|
3926
4097
|
score_ranges: scoreRanges
|
|
3927
4098
|
};
|
|
3928
4099
|
}
|
|
@@ -3930,7 +4101,8 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
3930
4101
|
...baseRubric,
|
|
3931
4102
|
outcome: expectedOutcome,
|
|
3932
4103
|
required: typeof rubric.required === "boolean" ? rubric.required : true,
|
|
3933
|
-
...
|
|
4104
|
+
...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
|
|
4105
|
+
...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {}
|
|
3934
4106
|
};
|
|
3935
4107
|
}).filter((r) => r.outcome && r.outcome.length > 0 || "score_ranges" in r && r.score_ranges);
|
|
3936
4108
|
if (rubricItems.length === 0) {
|
|
@@ -4334,6 +4506,9 @@ function resolveExpectedMessages(raw) {
|
|
|
4334
4506
|
var ANSI_YELLOW6 = "\x1B[33m";
|
|
4335
4507
|
var ANSI_RED2 = "\x1B[31m";
|
|
4336
4508
|
var ANSI_RESET7 = "\x1B[0m";
|
|
4509
|
+
function matchesFilter(id, filter) {
|
|
4510
|
+
return typeof filter === "string" ? import_micromatch.default.isMatch(id, filter) : filter.some((pattern) => import_micromatch.default.isMatch(id, pattern));
|
|
4511
|
+
}
|
|
4337
4512
|
function detectFormat(filePath) {
|
|
4338
4513
|
const ext = import_node_path7.default.extname(filePath).toLowerCase();
|
|
4339
4514
|
if (ext === ".jsonl") return "jsonl";
|
|
@@ -4401,40 +4576,40 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4401
4576
|
const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
|
|
4402
4577
|
const rawFile = await (0, import_promises7.readFile)(absoluteTestPath, "utf8");
|
|
4403
4578
|
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
4404
|
-
const
|
|
4405
|
-
const
|
|
4579
|
+
const fallbackSuiteName = import_node_path7.default.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
4580
|
+
const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
|
|
4406
4581
|
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
|
|
4407
4582
|
const globalExecution = sidecar.execution;
|
|
4408
4583
|
if (verbose) {
|
|
4409
4584
|
console.log(`
|
|
4410
|
-
[JSONL
|
|
4585
|
+
[JSONL Suite: ${evalFilePath}]`);
|
|
4411
4586
|
console.log(` Cases: ${rawCases.length}`);
|
|
4412
|
-
console.log(`
|
|
4587
|
+
console.log(` Suite: ${suiteName}`);
|
|
4413
4588
|
if (sidecar.description) {
|
|
4414
4589
|
console.log(` Description: ${sidecar.description}`);
|
|
4415
4590
|
}
|
|
4416
4591
|
}
|
|
4417
4592
|
const results = [];
|
|
4418
4593
|
for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
|
|
4419
|
-
const
|
|
4594
|
+
const testCaseConfig = rawCases[lineIndex];
|
|
4420
4595
|
const lineNumber = lineIndex + 1;
|
|
4421
|
-
const id = asString4(
|
|
4422
|
-
if (filterPattern && (!id || !
|
|
4596
|
+
const id = asString4(testCaseConfig.id);
|
|
4597
|
+
if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
|
|
4423
4598
|
continue;
|
|
4424
4599
|
}
|
|
4425
|
-
const conversationId = asString4(
|
|
4426
|
-
let outcome = asString4(
|
|
4427
|
-
if (!outcome &&
|
|
4428
|
-
outcome = asString4(
|
|
4600
|
+
const conversationId = asString4(testCaseConfig.conversation_id);
|
|
4601
|
+
let outcome = asString4(testCaseConfig.criteria);
|
|
4602
|
+
if (!outcome && testCaseConfig.expected_outcome !== void 0) {
|
|
4603
|
+
outcome = asString4(testCaseConfig.expected_outcome);
|
|
4429
4604
|
if (outcome) {
|
|
4430
4605
|
logWarning4(
|
|
4431
|
-
`Test '${asString4(
|
|
4606
|
+
`Test '${asString4(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
|
|
4432
4607
|
);
|
|
4433
4608
|
}
|
|
4434
4609
|
}
|
|
4435
|
-
const rawInputMessages = resolveInputMessages(
|
|
4436
|
-
const expectedMessages = resolveExpectedMessages(
|
|
4437
|
-
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 ||
|
|
4610
|
+
const rawInputMessages = resolveInputMessages(testCaseConfig);
|
|
4611
|
+
const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
|
|
4612
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== void 0;
|
|
4438
4613
|
if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
|
|
4439
4614
|
logError2(
|
|
4440
4615
|
`Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
|
|
@@ -4471,18 +4646,23 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4471
4646
|
}
|
|
4472
4647
|
}
|
|
4473
4648
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
4474
|
-
const caseExecution = isJsonObject(
|
|
4649
|
+
const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
|
|
4475
4650
|
const mergedExecution = caseExecution ?? globalExecution;
|
|
4476
|
-
const
|
|
4651
|
+
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
4477
4652
|
let evaluators;
|
|
4478
4653
|
try {
|
|
4479
|
-
evaluators = await parseEvaluators(
|
|
4654
|
+
evaluators = await parseEvaluators(
|
|
4655
|
+
testCaseConfig,
|
|
4656
|
+
mergedExecution,
|
|
4657
|
+
searchRoots,
|
|
4658
|
+
id ?? "unknown"
|
|
4659
|
+
);
|
|
4480
4660
|
} catch (error) {
|
|
4481
4661
|
const message = error instanceof Error ? error.message : String(error);
|
|
4482
4662
|
logError2(`Skipping test '${id}' at line ${lineNumber}: ${message}`);
|
|
4483
4663
|
continue;
|
|
4484
4664
|
}
|
|
4485
|
-
const inlineRubrics =
|
|
4665
|
+
const inlineRubrics = testCaseConfig.rubrics;
|
|
4486
4666
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
4487
4667
|
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
4488
4668
|
if (rubricEvaluator) {
|
|
@@ -4493,7 +4673,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4493
4673
|
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
4494
4674
|
const testCase = {
|
|
4495
4675
|
id,
|
|
4496
|
-
|
|
4676
|
+
suite: suiteName,
|
|
4497
4677
|
conversation_id: conversationId,
|
|
4498
4678
|
question,
|
|
4499
4679
|
input: inputMessages,
|
|
@@ -4501,7 +4681,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4501
4681
|
reference_answer: referenceAnswer,
|
|
4502
4682
|
file_paths: userFilePaths,
|
|
4503
4683
|
criteria: outcome ?? "",
|
|
4504
|
-
evaluator:
|
|
4684
|
+
evaluator: testCaseEvaluatorKind,
|
|
4505
4685
|
assertions: evaluators
|
|
4506
4686
|
};
|
|
4507
4687
|
results.push(testCase);
|
|
@@ -4686,6 +4866,9 @@ function buildChatPromptFromSegments(options) {
|
|
|
4686
4866
|
var ANSI_YELLOW7 = "\x1B[33m";
|
|
4687
4867
|
var ANSI_RED3 = "\x1B[31m";
|
|
4688
4868
|
var ANSI_RESET8 = "\x1B[0m";
|
|
4869
|
+
function matchesFilter2(id, filter) {
|
|
4870
|
+
return typeof filter === "string" ? import_micromatch2.default.isMatch(id, filter) : filter.some((pattern) => import_micromatch2.default.isMatch(id, pattern));
|
|
4871
|
+
}
|
|
4689
4872
|
function resolveTests(suite) {
|
|
4690
4873
|
if (suite.tests !== void 0) return suite.tests;
|
|
4691
4874
|
if (suite.eval_cases !== void 0) {
|
|
@@ -4765,18 +4948,18 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4765
4948
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
4766
4949
|
}
|
|
4767
4950
|
const suite = interpolated;
|
|
4768
|
-
const
|
|
4769
|
-
const
|
|
4770
|
-
const
|
|
4771
|
-
const
|
|
4951
|
+
const suiteNameFromFile = asString5(suite.name)?.trim();
|
|
4952
|
+
const fallbackSuiteName = import_node_path8.default.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
|
|
4953
|
+
const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
|
|
4954
|
+
const rawTestCases = resolveTests(suite);
|
|
4772
4955
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
|
|
4773
4956
|
const evalFileDir = import_node_path8.default.dirname(absoluteTestPath);
|
|
4774
|
-
let
|
|
4775
|
-
if (typeof
|
|
4776
|
-
const externalPath = import_node_path8.default.resolve(evalFileDir,
|
|
4777
|
-
|
|
4778
|
-
} else if (Array.isArray(
|
|
4779
|
-
|
|
4957
|
+
let expandedTestCases;
|
|
4958
|
+
if (typeof rawTestCases === "string") {
|
|
4959
|
+
const externalPath = import_node_path8.default.resolve(evalFileDir, rawTestCases);
|
|
4960
|
+
expandedTestCases = await loadCasesFromFile(externalPath);
|
|
4961
|
+
} else if (Array.isArray(rawTestCases)) {
|
|
4962
|
+
expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
|
|
4780
4963
|
} else {
|
|
4781
4964
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
|
|
4782
4965
|
}
|
|
@@ -4791,32 +4974,33 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4791
4974
|
}
|
|
4792
4975
|
const globalExecution = suiteAssertions !== void 0 ? { ...rawGlobalExecution ?? {}, assertions: suiteAssertions } : rawGlobalExecution;
|
|
4793
4976
|
const results = [];
|
|
4794
|
-
for (const
|
|
4795
|
-
if (!isJsonObject(
|
|
4977
|
+
for (const rawTestCase of expandedTestCases) {
|
|
4978
|
+
if (!isJsonObject(rawTestCase)) {
|
|
4796
4979
|
logWarning5("Skipping invalid test entry (expected object)");
|
|
4797
4980
|
continue;
|
|
4798
4981
|
}
|
|
4799
|
-
const
|
|
4800
|
-
const id = asString5(
|
|
4801
|
-
if (filterPattern && (!id || !
|
|
4982
|
+
const testCaseConfig = rawTestCase;
|
|
4983
|
+
const id = asString5(testCaseConfig.id);
|
|
4984
|
+
if (filterPattern && (!id || !matchesFilter2(id, filterPattern))) {
|
|
4802
4985
|
continue;
|
|
4803
4986
|
}
|
|
4804
|
-
const conversationId = asString5(
|
|
4805
|
-
let outcome = asString5(
|
|
4806
|
-
if (!outcome &&
|
|
4807
|
-
outcome = asString5(
|
|
4987
|
+
const conversationId = asString5(testCaseConfig.conversation_id);
|
|
4988
|
+
let outcome = asString5(testCaseConfig.criteria);
|
|
4989
|
+
if (!outcome && testCaseConfig.expected_outcome !== void 0) {
|
|
4990
|
+
outcome = asString5(testCaseConfig.expected_outcome);
|
|
4808
4991
|
if (outcome) {
|
|
4809
4992
|
logWarning5(
|
|
4810
|
-
`Test '${asString5(
|
|
4993
|
+
`Test '${asString5(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
|
|
4811
4994
|
);
|
|
4812
4995
|
}
|
|
4813
4996
|
}
|
|
4814
|
-
const caseExecution = isJsonObject(
|
|
4997
|
+
const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
|
|
4815
4998
|
const skipDefaults = caseExecution?.skip_defaults === true;
|
|
4999
|
+
const caseThreshold = typeof caseExecution?.threshold === "number" && caseExecution.threshold >= 0 && caseExecution.threshold <= 1 ? caseExecution.threshold : void 0;
|
|
4816
5000
|
const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
|
|
4817
|
-
const testInputMessages = resolveInputMessages(
|
|
4818
|
-
const expectedMessages = resolveExpectedMessages(
|
|
4819
|
-
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 ||
|
|
5001
|
+
const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
|
|
5002
|
+
const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
|
|
5003
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
|
|
4820
5004
|
if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
|
|
4821
5005
|
logError3(
|
|
4822
5006
|
`Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
|
|
@@ -4863,16 +5047,21 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4863
5047
|
}
|
|
4864
5048
|
}
|
|
4865
5049
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
4866
|
-
const
|
|
5050
|
+
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
4867
5051
|
let evaluators;
|
|
4868
5052
|
try {
|
|
4869
|
-
evaluators = await parseEvaluators(
|
|
5053
|
+
evaluators = await parseEvaluators(
|
|
5054
|
+
testCaseConfig,
|
|
5055
|
+
globalExecution,
|
|
5056
|
+
searchRoots,
|
|
5057
|
+
id ?? "unknown"
|
|
5058
|
+
);
|
|
4870
5059
|
} catch (error) {
|
|
4871
5060
|
const message = error instanceof Error ? error.message : String(error);
|
|
4872
5061
|
logError3(`Skipping test '${id}': ${message}`);
|
|
4873
5062
|
continue;
|
|
4874
5063
|
}
|
|
4875
|
-
const inlineRubrics =
|
|
5064
|
+
const inlineRubrics = testCaseConfig.rubrics;
|
|
4876
5065
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
4877
5066
|
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
4878
5067
|
if (rubricEvaluator) {
|
|
@@ -4881,13 +5070,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4881
5070
|
}
|
|
4882
5071
|
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
4883
5072
|
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
4884
|
-
const caseWorkspace = await resolveWorkspaceConfig(
|
|
5073
|
+
const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir);
|
|
4885
5074
|
const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
|
|
4886
|
-
const metadata = isJsonObject(
|
|
4887
|
-
const caseTargets = extractTargetsFromTestCase(
|
|
5075
|
+
const metadata = isJsonObject(testCaseConfig.metadata) ? testCaseConfig.metadata : void 0;
|
|
5076
|
+
const caseTargets = extractTargetsFromTestCase(testCaseConfig);
|
|
4888
5077
|
const testCase = {
|
|
4889
5078
|
id,
|
|
4890
|
-
|
|
5079
|
+
suite: suiteName,
|
|
4891
5080
|
category: options?.category,
|
|
4892
5081
|
conversation_id: conversationId,
|
|
4893
5082
|
question,
|
|
@@ -4896,11 +5085,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4896
5085
|
reference_answer: referenceAnswer,
|
|
4897
5086
|
file_paths: userFilePaths,
|
|
4898
5087
|
criteria: outcome ?? "",
|
|
4899
|
-
evaluator:
|
|
5088
|
+
evaluator: testCaseEvaluatorKind,
|
|
4900
5089
|
assertions: evaluators,
|
|
4901
5090
|
workspace: mergedWorkspace,
|
|
4902
5091
|
metadata,
|
|
4903
|
-
targets: caseTargets
|
|
5092
|
+
targets: caseTargets,
|
|
5093
|
+
...caseThreshold !== void 0 ? { threshold: caseThreshold } : {}
|
|
4904
5094
|
};
|
|
4905
5095
|
results.push(testCase);
|
|
4906
5096
|
}
|
|
@@ -5566,7 +5756,7 @@ var AzureProvider = class {
|
|
|
5566
5756
|
};
|
|
5567
5757
|
this.retryConfig = config.retry;
|
|
5568
5758
|
const azure = (0, import_azure2.createAzure)(buildAzureOptions(config));
|
|
5569
|
-
this.model = azure.chat(config.deploymentName);
|
|
5759
|
+
this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
|
|
5570
5760
|
}
|
|
5571
5761
|
id;
|
|
5572
5762
|
kind = "azure";
|
|
@@ -5692,7 +5882,9 @@ function buildAzureOptions(config) {
|
|
|
5692
5882
|
const options = {
|
|
5693
5883
|
apiKey: config.apiKey,
|
|
5694
5884
|
apiVersion: config.version,
|
|
5695
|
-
|
|
5885
|
+
// Chat completions still use deployment-scoped Azure URLs for compatibility
|
|
5886
|
+
// with existing deployments. Responses API should use the SDK's v1 path.
|
|
5887
|
+
useDeploymentBasedUrls: config.apiFormat !== "responses"
|
|
5696
5888
|
};
|
|
5697
5889
|
const baseURL = normalizeAzureBaseUrl(config.resourceName);
|
|
5698
5890
|
if (baseURL) {
|
|
@@ -7169,15 +7361,16 @@ var CliProvider = class {
|
|
|
7169
7361
|
outputFilePath
|
|
7170
7362
|
);
|
|
7171
7363
|
const renderedCommand = renderTemplate(this.config.command, templateValues);
|
|
7364
|
+
const effectiveCwd = requests[0]?.cwd ?? this.config.cwd;
|
|
7172
7365
|
if (this.verbose) {
|
|
7173
7366
|
console.log(
|
|
7174
|
-
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${
|
|
7367
|
+
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${effectiveCwd ?? ""} command=${renderedCommand}`
|
|
7175
7368
|
);
|
|
7176
7369
|
}
|
|
7177
7370
|
try {
|
|
7178
7371
|
const startTime = Date.now();
|
|
7179
7372
|
const result = await this.runCommand(renderedCommand, {
|
|
7180
|
-
cwd:
|
|
7373
|
+
cwd: effectiveCwd,
|
|
7181
7374
|
env: process.env,
|
|
7182
7375
|
timeoutMs: this.config.timeoutMs,
|
|
7183
7376
|
signal: controller.signal
|
|
@@ -7210,7 +7403,7 @@ var CliProvider = class {
|
|
|
7210
7403
|
command: renderedCommand,
|
|
7211
7404
|
stderr: result.stderr,
|
|
7212
7405
|
exitCode: result.exitCode ?? 0,
|
|
7213
|
-
cwd:
|
|
7406
|
+
cwd: effectiveCwd,
|
|
7214
7407
|
outputFile: outputFilePath
|
|
7215
7408
|
}
|
|
7216
7409
|
};
|
|
@@ -7228,7 +7421,7 @@ var CliProvider = class {
|
|
|
7228
7421
|
command: renderedCommand,
|
|
7229
7422
|
stderr: result.stderr,
|
|
7230
7423
|
exitCode: result.exitCode ?? 0,
|
|
7231
|
-
cwd:
|
|
7424
|
+
cwd: effectiveCwd,
|
|
7232
7425
|
outputFile: outputFilePath,
|
|
7233
7426
|
error: errorMessage
|
|
7234
7427
|
}
|
|
@@ -7243,7 +7436,7 @@ var CliProvider = class {
|
|
|
7243
7436
|
command: renderedCommand,
|
|
7244
7437
|
stderr: result.stderr,
|
|
7245
7438
|
exitCode: result.exitCode ?? 0,
|
|
7246
|
-
cwd:
|
|
7439
|
+
cwd: effectiveCwd,
|
|
7247
7440
|
outputFile: outputFilePath,
|
|
7248
7441
|
recordId: evalCaseId
|
|
7249
7442
|
}
|
|
@@ -9267,6 +9460,76 @@ function subscribeToPiLogEntries(listener) {
|
|
|
9267
9460
|
};
|
|
9268
9461
|
}
|
|
9269
9462
|
|
|
9463
|
+
// src/evaluation/providers/pi-provider-aliases.ts
|
|
9464
|
+
init_cjs_shims();
|
|
9465
|
+
var SUBPROVIDER_ALIASES = {
|
|
9466
|
+
azure: "azure-openai-responses"
|
|
9467
|
+
};
|
|
9468
|
+
var SUBPROVIDER_ALIASES_WITH_BASE_URL = {
|
|
9469
|
+
// Azure v1 endpoints are OpenAI-compatible; use the standard client
|
|
9470
|
+
// to avoid AzureOpenAI adding api-version query params.
|
|
9471
|
+
azure: "openai-responses"
|
|
9472
|
+
};
|
|
9473
|
+
var ENV_KEY_MAP = {
|
|
9474
|
+
google: "GEMINI_API_KEY",
|
|
9475
|
+
gemini: "GEMINI_API_KEY",
|
|
9476
|
+
anthropic: "ANTHROPIC_API_KEY",
|
|
9477
|
+
openai: "OPENAI_API_KEY",
|
|
9478
|
+
groq: "GROQ_API_KEY",
|
|
9479
|
+
xai: "XAI_API_KEY",
|
|
9480
|
+
openrouter: "OPENROUTER_API_KEY",
|
|
9481
|
+
azure: "AZURE_OPENAI_API_KEY"
|
|
9482
|
+
};
|
|
9483
|
+
var ENV_BASE_URL_MAP = {
|
|
9484
|
+
openai: "OPENAI_BASE_URL",
|
|
9485
|
+
azure: "AZURE_OPENAI_BASE_URL",
|
|
9486
|
+
openrouter: "OPENROUTER_BASE_URL"
|
|
9487
|
+
};
|
|
9488
|
+
function resolveSubprovider(name, hasBaseUrl = false) {
|
|
9489
|
+
const lower = name.toLowerCase();
|
|
9490
|
+
if (hasBaseUrl) {
|
|
9491
|
+
const alias = SUBPROVIDER_ALIASES_WITH_BASE_URL[lower];
|
|
9492
|
+
if (alias) return alias;
|
|
9493
|
+
}
|
|
9494
|
+
return SUBPROVIDER_ALIASES[lower] ?? name;
|
|
9495
|
+
}
|
|
9496
|
+
function resolveCliProvider(name) {
|
|
9497
|
+
const lower = name.toLowerCase();
|
|
9498
|
+
if (lower === "azure") return "azure-openai-responses";
|
|
9499
|
+
return name;
|
|
9500
|
+
}
|
|
9501
|
+
function resolveEnvKeyName(provider, hasBaseUrl = false) {
|
|
9502
|
+
const lower = provider.toLowerCase();
|
|
9503
|
+
if (hasBaseUrl && lower === "azure") return "OPENAI_API_KEY";
|
|
9504
|
+
return ENV_KEY_MAP[lower];
|
|
9505
|
+
}
|
|
9506
|
+
function resolveEnvBaseUrlName(provider, hasBaseUrl = false) {
|
|
9507
|
+
const lower = provider.toLowerCase();
|
|
9508
|
+
if (hasBaseUrl && lower === "azure") return "OPENAI_BASE_URL";
|
|
9509
|
+
return ENV_BASE_URL_MAP[lower];
|
|
9510
|
+
}
|
|
9511
|
+
function extractAzureResourceName(baseUrl) {
|
|
9512
|
+
const urlMatch = baseUrl.match(/^https?:\/\/([^./]+)/);
|
|
9513
|
+
if (urlMatch) return urlMatch[1];
|
|
9514
|
+
return baseUrl;
|
|
9515
|
+
}
|
|
9516
|
+
function normalizeAzureSdkBaseUrl(baseUrl) {
|
|
9517
|
+
const trimmed = baseUrl.trim().replace(/\/+$/, "");
|
|
9518
|
+
if (!trimmed) {
|
|
9519
|
+
return trimmed;
|
|
9520
|
+
}
|
|
9521
|
+
if (!/^https?:\/\//i.test(trimmed)) {
|
|
9522
|
+
return `https://${trimmed}.openai.azure.com/openai/v1`;
|
|
9523
|
+
}
|
|
9524
|
+
if (/\/openai\/v1$/i.test(trimmed)) {
|
|
9525
|
+
return trimmed;
|
|
9526
|
+
}
|
|
9527
|
+
if (/\/openai$/i.test(trimmed)) {
|
|
9528
|
+
return `${trimmed}/v1`;
|
|
9529
|
+
}
|
|
9530
|
+
return `${trimmed}/openai/v1`;
|
|
9531
|
+
}
|
|
9532
|
+
|
|
9270
9533
|
// src/evaluation/providers/pi-utils.ts
|
|
9271
9534
|
init_cjs_shims();
|
|
9272
9535
|
function extractPiTextContent(content) {
|
|
@@ -9426,12 +9689,12 @@ var PiCliProvider = class {
|
|
|
9426
9689
|
buildPiArgs(prompt, inputFiles) {
|
|
9427
9690
|
const args = [];
|
|
9428
9691
|
if (this.config.subprovider) {
|
|
9429
|
-
args.push("--provider", this.config.subprovider);
|
|
9692
|
+
args.push("--provider", resolveCliProvider(this.config.subprovider));
|
|
9430
9693
|
}
|
|
9431
9694
|
if (this.config.model) {
|
|
9432
9695
|
args.push("--model", this.config.model);
|
|
9433
9696
|
}
|
|
9434
|
-
if (this.config.apiKey) {
|
|
9697
|
+
if (this.config.apiKey && this.config.subprovider?.toLowerCase() !== "azure") {
|
|
9435
9698
|
args.push("--api-key", this.config.apiKey);
|
|
9436
9699
|
}
|
|
9437
9700
|
args.push("--mode", "json");
|
|
@@ -9483,35 +9746,35 @@ ${prompt}` : prompt;
|
|
|
9483
9746
|
}
|
|
9484
9747
|
buildEnv() {
|
|
9485
9748
|
const env = { ...process.env };
|
|
9486
|
-
|
|
9487
|
-
|
|
9488
|
-
|
|
9489
|
-
|
|
9490
|
-
|
|
9491
|
-
|
|
9492
|
-
|
|
9493
|
-
|
|
9494
|
-
|
|
9495
|
-
|
|
9496
|
-
|
|
9497
|
-
|
|
9498
|
-
|
|
9499
|
-
|
|
9749
|
+
const provider = this.config.subprovider?.toLowerCase() ?? "google";
|
|
9750
|
+
if (provider === "azure") {
|
|
9751
|
+
if (this.config.apiKey) {
|
|
9752
|
+
env.AZURE_OPENAI_API_KEY = this.config.apiKey;
|
|
9753
|
+
}
|
|
9754
|
+
if (this.config.baseUrl) {
|
|
9755
|
+
env.AZURE_OPENAI_RESOURCE_NAME = extractAzureResourceName(this.config.baseUrl);
|
|
9756
|
+
}
|
|
9757
|
+
} else {
|
|
9758
|
+
if (this.config.apiKey) {
|
|
9759
|
+
const envKey = resolveEnvKeyName(provider);
|
|
9760
|
+
if (envKey) {
|
|
9761
|
+
env[envKey] = this.config.apiKey;
|
|
9762
|
+
}
|
|
9500
9763
|
}
|
|
9501
9764
|
}
|
|
9502
9765
|
if (this.config.subprovider) {
|
|
9503
|
-
const
|
|
9766
|
+
const resolvedProvider = resolveCliProvider(this.config.subprovider);
|
|
9504
9767
|
const PROVIDER_OWN_PREFIXES = {
|
|
9505
9768
|
openrouter: ["OPENROUTER_"],
|
|
9506
9769
|
anthropic: ["ANTHROPIC_"],
|
|
9507
9770
|
openai: ["OPENAI_"],
|
|
9508
|
-
azure: ["AZURE_OPENAI_"],
|
|
9771
|
+
"azure-openai-responses": ["AZURE_OPENAI_"],
|
|
9509
9772
|
google: ["GEMINI_", "GOOGLE_GENERATIVE_AI_"],
|
|
9510
9773
|
gemini: ["GEMINI_", "GOOGLE_GENERATIVE_AI_"],
|
|
9511
9774
|
groq: ["GROQ_"],
|
|
9512
9775
|
xai: ["XAI_"]
|
|
9513
9776
|
};
|
|
9514
|
-
const ownPrefixes = PROVIDER_OWN_PREFIXES[
|
|
9777
|
+
const ownPrefixes = PROVIDER_OWN_PREFIXES[resolvedProvider] ?? [];
|
|
9515
9778
|
const allOtherPrefixes = Object.entries(PROVIDER_OWN_PREFIXES).filter(([key]) => key !== provider).flatMap(([, prefixes]) => prefixes);
|
|
9516
9779
|
for (const key of Object.keys(env)) {
|
|
9517
9780
|
if (allOtherPrefixes.some((prefix) => key.startsWith(prefix)) && !ownPrefixes.some((prefix) => key.startsWith(prefix))) {
|
|
@@ -9802,6 +10065,24 @@ function extractMessages(events) {
|
|
|
9802
10065
|
}
|
|
9803
10066
|
}
|
|
9804
10067
|
}
|
|
10068
|
+
if (messages) {
|
|
10069
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
10070
|
+
if (messages[i].role === "assistant" && !messages[i].content) {
|
|
10071
|
+
for (let j = events.length - 1; j >= 0; j--) {
|
|
10072
|
+
const evt = events[j];
|
|
10073
|
+
if (!evt || evt.type !== "message_end") continue;
|
|
10074
|
+
const msg = evt.message;
|
|
10075
|
+
if (msg?.role !== "assistant") continue;
|
|
10076
|
+
const text = extractPiTextContent(msg.content);
|
|
10077
|
+
if (text) {
|
|
10078
|
+
messages[i] = { ...messages[i], content: text };
|
|
10079
|
+
break;
|
|
10080
|
+
}
|
|
10081
|
+
}
|
|
10082
|
+
break;
|
|
10083
|
+
}
|
|
10084
|
+
}
|
|
10085
|
+
}
|
|
9805
10086
|
const eventToolCalls = extractToolCallsFromEvents(events);
|
|
9806
10087
|
if (eventToolCalls.length > 0) {
|
|
9807
10088
|
injectEventToolCalls(messages, eventToolCalls);
|
|
@@ -9986,17 +10267,43 @@ function formatTimeoutSuffix3(timeoutMs) {
|
|
|
9986
10267
|
if (!timeoutMs || timeoutMs <= 0) return "";
|
|
9987
10268
|
return ` after ${Math.ceil(timeoutMs / 1e3)}s`;
|
|
9988
10269
|
}
|
|
10270
|
+
function resolveWindowsCmd(executable) {
|
|
10271
|
+
if (process.platform !== "win32") return [executable, []];
|
|
10272
|
+
const lower = executable.toLowerCase();
|
|
10273
|
+
if (lower.endsWith(".js") || lower.endsWith(".exe")) return [executable, []];
|
|
10274
|
+
let fullPath;
|
|
10275
|
+
try {
|
|
10276
|
+
fullPath = (0, import_node_child_process4.execSync)(`where ${executable}`, { encoding: "utf-8" }).trim().split(/\r?\n/)[0].trim();
|
|
10277
|
+
} catch {
|
|
10278
|
+
return [executable, []];
|
|
10279
|
+
}
|
|
10280
|
+
const cmdPath = fullPath.endsWith(".cmd") ? fullPath : `${fullPath}.cmd`;
|
|
10281
|
+
try {
|
|
10282
|
+
const content = (0, import_node_fs9.readFileSync)(cmdPath, "utf-8");
|
|
10283
|
+
const match = content.match(/"?%_prog%"?\s+"([^"]+\.js)"/);
|
|
10284
|
+
if (match) {
|
|
10285
|
+
const dp0 = import_node_path21.default.dirname(import_node_path21.default.resolve(cmdPath));
|
|
10286
|
+
const scriptPath = match[1].replace(/%dp0%[/\\]?/gi, `${dp0}${import_node_path21.default.sep}`);
|
|
10287
|
+
try {
|
|
10288
|
+
(0, import_node_fs9.accessSync)(scriptPath);
|
|
10289
|
+
return ["node", [scriptPath]];
|
|
10290
|
+
} catch {
|
|
10291
|
+
}
|
|
10292
|
+
}
|
|
10293
|
+
} catch {
|
|
10294
|
+
}
|
|
10295
|
+
return [executable, []];
|
|
10296
|
+
}
|
|
9989
10297
|
async function defaultPiRunner(options) {
|
|
9990
10298
|
return await new Promise((resolve, reject) => {
|
|
9991
10299
|
const parts = options.executable.split(/\s+/);
|
|
9992
|
-
const
|
|
9993
|
-
const executableArgs = parts.slice(1);
|
|
10300
|
+
const [resolvedExe, prefixArgs] = resolveWindowsCmd(parts[0]);
|
|
10301
|
+
const executableArgs = [...prefixArgs, ...parts.slice(1)];
|
|
9994
10302
|
const allArgs = [...executableArgs, ...options.args];
|
|
9995
|
-
const child = (0, import_node_child_process4.spawn)(
|
|
10303
|
+
const child = (0, import_node_child_process4.spawn)(resolvedExe, allArgs, {
|
|
9996
10304
|
cwd: options.cwd,
|
|
9997
10305
|
env: options.env,
|
|
9998
|
-
stdio: ["pipe", "pipe", "pipe"]
|
|
9999
|
-
shell: false
|
|
10306
|
+
stdio: ["pipe", "pipe", "pipe"]
|
|
10000
10307
|
});
|
|
10001
10308
|
let stdout = "";
|
|
10002
10309
|
let stderr = "";
|
|
@@ -10056,9 +10363,40 @@ var import_node_child_process5 = require("child_process");
|
|
|
10056
10363
|
var import_node_crypto8 = require("crypto");
|
|
10057
10364
|
var import_node_fs10 = require("fs");
|
|
10058
10365
|
var import_promises19 = require("fs/promises");
|
|
10059
|
-
var
|
|
10366
|
+
var import_node_path23 = __toESM(require("path"), 1);
|
|
10060
10367
|
var import_node_readline = require("readline");
|
|
10061
10368
|
var import_node_url3 = require("url");
|
|
10369
|
+
|
|
10370
|
+
// src/paths.ts
|
|
10371
|
+
init_cjs_shims();
|
|
10372
|
+
var import_node_os6 = __toESM(require("os"), 1);
|
|
10373
|
+
var import_node_path22 = __toESM(require("path"), 1);
|
|
10374
|
+
var logged = false;
|
|
10375
|
+
function getAgentvHome() {
|
|
10376
|
+
const envHome = process.env.AGENTV_HOME;
|
|
10377
|
+
if (envHome && envHome !== "undefined") {
|
|
10378
|
+
if (!logged) {
|
|
10379
|
+
logged = true;
|
|
10380
|
+
console.warn(`Using AGENTV_HOME: ${envHome}`);
|
|
10381
|
+
}
|
|
10382
|
+
return envHome;
|
|
10383
|
+
}
|
|
10384
|
+
return import_node_path22.default.join(import_node_os6.default.homedir(), ".agentv");
|
|
10385
|
+
}
|
|
10386
|
+
function getWorkspacesRoot() {
|
|
10387
|
+
return import_node_path22.default.join(getAgentvHome(), "workspaces");
|
|
10388
|
+
}
|
|
10389
|
+
function getSubagentsRoot() {
|
|
10390
|
+
return import_node_path22.default.join(getAgentvHome(), "subagents");
|
|
10391
|
+
}
|
|
10392
|
+
function getTraceStateRoot() {
|
|
10393
|
+
return import_node_path22.default.join(getAgentvHome(), "trace-state");
|
|
10394
|
+
}
|
|
10395
|
+
function getWorkspacePoolRoot() {
|
|
10396
|
+
return import_node_path22.default.join(getAgentvHome(), "workspace-pool");
|
|
10397
|
+
}
|
|
10398
|
+
|
|
10399
|
+
// src/evaluation/providers/pi-coding-agent.ts
|
|
10062
10400
|
var piCodingAgentModule = null;
|
|
10063
10401
|
var piAiModule = null;
|
|
10064
10402
|
var loadingPromise = null;
|
|
@@ -10076,46 +10414,126 @@ async function promptInstall() {
|
|
|
10076
10414
|
rl.close();
|
|
10077
10415
|
}
|
|
10078
10416
|
}
|
|
10079
|
-
function
|
|
10080
|
-
|
|
10081
|
-
|
|
10082
|
-
|
|
10417
|
+
function findManagedSdkInstallRoot() {
|
|
10418
|
+
return import_node_path23.default.join(getAgentvHome(), "deps", "pi-sdk");
|
|
10419
|
+
}
|
|
10420
|
+
function resolveGlobalNpmRoot() {
|
|
10421
|
+
try {
|
|
10422
|
+
const root = (0, import_node_child_process5.execSync)("npm root -g", {
|
|
10423
|
+
encoding: "utf-8",
|
|
10424
|
+
stdio: ["ignore", "pipe", "ignore"]
|
|
10425
|
+
}).trim();
|
|
10426
|
+
return root.length > 0 ? root : void 0;
|
|
10427
|
+
} catch {
|
|
10428
|
+
return void 0;
|
|
10429
|
+
}
|
|
10430
|
+
}
|
|
10431
|
+
function buildGlobalModuleEntry(moduleName, globalNpmRoot) {
|
|
10432
|
+
return import_node_path23.default.join(globalNpmRoot, ...moduleName.split("/"), "dist", "index.js");
|
|
10433
|
+
}
|
|
10434
|
+
function findAccessiblePath(paths) {
|
|
10435
|
+
for (const candidate of paths) {
|
|
10083
10436
|
try {
|
|
10084
|
-
|
|
10085
|
-
|
|
10086
|
-
return dir;
|
|
10437
|
+
(0, import_node_fs10.accessSync)(candidate);
|
|
10438
|
+
return candidate;
|
|
10087
10439
|
} catch {
|
|
10088
|
-
const parent = import_node_path22.default.dirname(dir);
|
|
10089
|
-
if (parent === dir) break;
|
|
10090
|
-
dir = parent;
|
|
10091
10440
|
}
|
|
10092
10441
|
}
|
|
10093
|
-
return
|
|
10442
|
+
return void 0;
|
|
10094
10443
|
}
|
|
10095
|
-
async function
|
|
10444
|
+
async function tryImportLocalSdkModules() {
|
|
10096
10445
|
try {
|
|
10097
10446
|
[piCodingAgentModule, piAiModule] = await Promise.all([
|
|
10098
10447
|
import("@mariozechner/pi-coding-agent"),
|
|
10099
10448
|
import("@mariozechner/pi-ai")
|
|
10100
10449
|
]);
|
|
10450
|
+
return true;
|
|
10101
10451
|
} catch {
|
|
10102
|
-
|
|
10103
|
-
|
|
10104
|
-
|
|
10105
|
-
|
|
10106
|
-
|
|
10107
|
-
|
|
10108
|
-
|
|
10109
|
-
|
|
10110
|
-
|
|
10111
|
-
|
|
10112
|
-
|
|
10113
|
-
|
|
10114
|
-
|
|
10115
|
-
|
|
10116
|
-
|
|
10452
|
+
return false;
|
|
10453
|
+
}
|
|
10454
|
+
}
|
|
10455
|
+
async function tryImportManagedSdkModules() {
|
|
10456
|
+
const managedRoot = findManagedSdkInstallRoot();
|
|
10457
|
+
const piCodingAgentEntry = findAccessiblePath([
|
|
10458
|
+
import_node_path23.default.join(managedRoot, "node_modules", "@mariozechner", "pi-coding-agent", "dist", "index.js")
|
|
10459
|
+
]);
|
|
10460
|
+
const piAiEntry = findAccessiblePath([
|
|
10461
|
+
import_node_path23.default.join(managedRoot, "node_modules", "@mariozechner", "pi-ai", "dist", "index.js"),
|
|
10462
|
+
import_node_path23.default.join(
|
|
10463
|
+
managedRoot,
|
|
10464
|
+
"node_modules",
|
|
10465
|
+
"@mariozechner",
|
|
10466
|
+
"pi-coding-agent",
|
|
10467
|
+
"node_modules",
|
|
10468
|
+
"@mariozechner",
|
|
10469
|
+
"pi-ai",
|
|
10470
|
+
"dist",
|
|
10471
|
+
"index.js"
|
|
10472
|
+
)
|
|
10473
|
+
]);
|
|
10474
|
+
if (!piCodingAgentEntry || !piAiEntry) return false;
|
|
10475
|
+
try {
|
|
10476
|
+
[piCodingAgentModule, piAiModule] = await Promise.all([
|
|
10477
|
+
import((0, import_node_url3.pathToFileURL)(piCodingAgentEntry).href),
|
|
10478
|
+
import((0, import_node_url3.pathToFileURL)(piAiEntry).href)
|
|
10479
|
+
]);
|
|
10480
|
+
return true;
|
|
10481
|
+
} catch {
|
|
10482
|
+
return false;
|
|
10483
|
+
}
|
|
10484
|
+
}
|
|
10485
|
+
async function tryImportGlobalSdkModules() {
|
|
10486
|
+
const globalNpmRoot = resolveGlobalNpmRoot();
|
|
10487
|
+
if (!globalNpmRoot) return false;
|
|
10488
|
+
const piCodingAgentEntry = findAccessiblePath([
|
|
10489
|
+
buildGlobalModuleEntry("@mariozechner/pi-coding-agent", globalNpmRoot)
|
|
10490
|
+
]);
|
|
10491
|
+
const piAiEntry = findAccessiblePath([
|
|
10492
|
+
buildGlobalModuleEntry("@mariozechner/pi-ai", globalNpmRoot),
|
|
10493
|
+
import_node_path23.default.join(
|
|
10494
|
+
globalNpmRoot,
|
|
10495
|
+
"@mariozechner",
|
|
10496
|
+
"pi-coding-agent",
|
|
10497
|
+
"node_modules",
|
|
10498
|
+
"@mariozechner",
|
|
10499
|
+
"pi-ai",
|
|
10500
|
+
"dist",
|
|
10501
|
+
"index.js"
|
|
10502
|
+
)
|
|
10503
|
+
]);
|
|
10504
|
+
if (!piCodingAgentEntry || !piAiEntry) return false;
|
|
10505
|
+
try {
|
|
10506
|
+
[piCodingAgentModule, piAiModule] = await Promise.all([
|
|
10507
|
+
import((0, import_node_url3.pathToFileURL)(piCodingAgentEntry).href),
|
|
10508
|
+
import((0, import_node_url3.pathToFileURL)(piAiEntry).href)
|
|
10509
|
+
]);
|
|
10510
|
+
return true;
|
|
10511
|
+
} catch {
|
|
10512
|
+
return false;
|
|
10513
|
+
}
|
|
10514
|
+
}
|
|
10515
|
+
function installSdkModules(installDir) {
|
|
10516
|
+
console.error(`Installing @mariozechner/pi-coding-agent into ${installDir} via npm...`);
|
|
10517
|
+
(0, import_node_fs10.mkdirSync)(installDir, { recursive: true });
|
|
10518
|
+
(0, import_node_child_process5.execSync)("npm install --no-save --no-package-lock @mariozechner/pi-coding-agent", {
|
|
10519
|
+
cwd: installDir,
|
|
10520
|
+
stdio: "inherit"
|
|
10521
|
+
});
|
|
10522
|
+
}
|
|
10523
|
+
async function doLoadSdkModules() {
|
|
10524
|
+
if (await tryImportLocalSdkModules() || await tryImportManagedSdkModules() || await tryImportGlobalSdkModules()) {
|
|
10525
|
+
return;
|
|
10526
|
+
}
|
|
10527
|
+
if (await promptInstall()) {
|
|
10528
|
+
const installDir = findManagedSdkInstallRoot();
|
|
10529
|
+
installSdkModules(installDir);
|
|
10530
|
+
if (await tryImportManagedSdkModules()) {
|
|
10531
|
+
return;
|
|
10117
10532
|
}
|
|
10118
10533
|
}
|
|
10534
|
+
throw new Error(
|
|
10535
|
+
"pi-coding-agent SDK is not installed. Install it with:\n npm install @mariozechner/pi-coding-agent"
|
|
10536
|
+
);
|
|
10119
10537
|
}
|
|
10120
10538
|
async function loadSdkModules() {
|
|
10121
10539
|
if (!piCodingAgentModule || !piAiModule) {
|
|
@@ -10143,7 +10561,9 @@ async function loadSdkModules() {
|
|
|
10143
10561
|
codingTools: piSdk.codingTools,
|
|
10144
10562
|
toolMap,
|
|
10145
10563
|
SessionManager: piSdk.SessionManager,
|
|
10146
|
-
getModel: piAi.getModel
|
|
10564
|
+
getModel: piAi.getModel,
|
|
10565
|
+
// biome-ignore lint/suspicious/noExplicitAny: registerBuiltInApiProviders exists at runtime but not in type defs
|
|
10566
|
+
registerBuiltInApiProviders: piAi.registerBuiltInApiProviders
|
|
10147
10567
|
};
|
|
10148
10568
|
}
|
|
10149
10569
|
var PiCodingAgentProvider = class {
|
|
@@ -10165,17 +10585,35 @@ var PiCodingAgentProvider = class {
|
|
|
10165
10585
|
const startTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
10166
10586
|
const startMs = Date.now();
|
|
10167
10587
|
const sdk = await loadSdkModules();
|
|
10588
|
+
sdk.registerBuiltInApiProviders();
|
|
10168
10589
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
10169
10590
|
try {
|
|
10170
10591
|
const cwd = this.resolveCwd(request.cwd);
|
|
10171
|
-
const
|
|
10592
|
+
const rawProvider = this.config.subprovider ?? "google";
|
|
10593
|
+
const normalizedBaseUrl = this.normalizeSdkBaseUrl(rawProvider, this.config.baseUrl);
|
|
10594
|
+
const hasBaseUrl = !!normalizedBaseUrl;
|
|
10595
|
+
const providerName = resolveSubprovider(rawProvider, hasBaseUrl);
|
|
10172
10596
|
const modelId = this.config.model ?? "gemini-2.5-flash";
|
|
10173
|
-
this.setApiKeyEnv(
|
|
10174
|
-
|
|
10597
|
+
this.setApiKeyEnv(rawProvider, hasBaseUrl);
|
|
10598
|
+
this.setBaseUrlEnv(rawProvider, normalizedBaseUrl, hasBaseUrl);
|
|
10599
|
+
let model = sdk.getModel(providerName, modelId);
|
|
10600
|
+
if (model && normalizedBaseUrl) {
|
|
10601
|
+
model = { ...model, baseUrl: normalizedBaseUrl };
|
|
10602
|
+
}
|
|
10175
10603
|
if (!model) {
|
|
10176
|
-
|
|
10177
|
-
|
|
10178
|
-
|
|
10604
|
+
const envProvider = providerName.replace(/-responses$/, "");
|
|
10605
|
+
model = {
|
|
10606
|
+
id: modelId,
|
|
10607
|
+
name: modelId,
|
|
10608
|
+
api: providerName,
|
|
10609
|
+
provider: envProvider,
|
|
10610
|
+
baseUrl: normalizedBaseUrl ?? "",
|
|
10611
|
+
reasoning: false,
|
|
10612
|
+
input: ["text"],
|
|
10613
|
+
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
|
10614
|
+
contextWindow: 128e3,
|
|
10615
|
+
maxTokens: 16384
|
|
10616
|
+
};
|
|
10179
10617
|
}
|
|
10180
10618
|
const tools = this.resolveTools(sdk);
|
|
10181
10619
|
const { session } = await sdk.createAgentSession({
|
|
@@ -10328,28 +10766,35 @@ ${fileList}`;
|
|
|
10328
10766
|
}
|
|
10329
10767
|
}
|
|
10330
10768
|
/** Maps config apiKey to the provider-specific env var the SDK reads. */
|
|
10331
|
-
setApiKeyEnv(providerName) {
|
|
10769
|
+
setApiKeyEnv(providerName, hasBaseUrl = false) {
|
|
10332
10770
|
if (!this.config.apiKey) return;
|
|
10333
|
-
const
|
|
10334
|
-
google: "GEMINI_API_KEY",
|
|
10335
|
-
gemini: "GEMINI_API_KEY",
|
|
10336
|
-
anthropic: "ANTHROPIC_API_KEY",
|
|
10337
|
-
openai: "OPENAI_API_KEY",
|
|
10338
|
-
groq: "GROQ_API_KEY",
|
|
10339
|
-
xai: "XAI_API_KEY",
|
|
10340
|
-
openrouter: "OPENROUTER_API_KEY"
|
|
10341
|
-
};
|
|
10342
|
-
const envKey = ENV_KEY_MAP[providerName.toLowerCase()];
|
|
10771
|
+
const envKey = resolveEnvKeyName(providerName, hasBaseUrl);
|
|
10343
10772
|
if (envKey) {
|
|
10344
10773
|
process.env[envKey] = this.config.apiKey;
|
|
10345
10774
|
}
|
|
10346
10775
|
}
|
|
10776
|
+
/** Maps config baseUrl to the provider-specific env var the SDK reads. */
|
|
10777
|
+
setBaseUrlEnv(providerName, baseUrl = this.config.baseUrl, hasBaseUrl = false) {
|
|
10778
|
+
const normalizedBaseUrl = this.normalizeSdkBaseUrl(providerName, baseUrl);
|
|
10779
|
+
if (!normalizedBaseUrl) return;
|
|
10780
|
+
const envKey = resolveEnvBaseUrlName(providerName, hasBaseUrl);
|
|
10781
|
+
if (envKey) {
|
|
10782
|
+
process.env[envKey] = normalizedBaseUrl;
|
|
10783
|
+
}
|
|
10784
|
+
}
|
|
10785
|
+
normalizeSdkBaseUrl(providerName, baseUrl) {
|
|
10786
|
+
if (!baseUrl) return void 0;
|
|
10787
|
+
if (providerName.toLowerCase() === "azure") {
|
|
10788
|
+
return normalizeAzureSdkBaseUrl(baseUrl);
|
|
10789
|
+
}
|
|
10790
|
+
return baseUrl;
|
|
10791
|
+
}
|
|
10347
10792
|
resolveCwd(cwdOverride) {
|
|
10348
10793
|
if (cwdOverride) {
|
|
10349
|
-
return
|
|
10794
|
+
return import_node_path23.default.resolve(cwdOverride);
|
|
10350
10795
|
}
|
|
10351
10796
|
if (this.config.cwd) {
|
|
10352
|
-
return
|
|
10797
|
+
return import_node_path23.default.resolve(this.config.cwd);
|
|
10353
10798
|
}
|
|
10354
10799
|
return process.cwd();
|
|
10355
10800
|
}
|
|
@@ -10368,9 +10813,9 @@ ${fileList}`;
|
|
|
10368
10813
|
}
|
|
10369
10814
|
resolveLogDirectory() {
|
|
10370
10815
|
if (this.config.logDir) {
|
|
10371
|
-
return
|
|
10816
|
+
return import_node_path23.default.resolve(this.config.logDir);
|
|
10372
10817
|
}
|
|
10373
|
-
return
|
|
10818
|
+
return import_node_path23.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
10374
10819
|
}
|
|
10375
10820
|
async createStreamLogger(request) {
|
|
10376
10821
|
const logDir = this.resolveLogDirectory();
|
|
@@ -10384,7 +10829,7 @@ ${fileList}`;
|
|
|
10384
10829
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
10385
10830
|
return void 0;
|
|
10386
10831
|
}
|
|
10387
|
-
const filePath =
|
|
10832
|
+
const filePath = import_node_path23.default.join(logDir, buildLogFilename6(request, this.targetName));
|
|
10388
10833
|
try {
|
|
10389
10834
|
const logger = await PiStreamLogger2.create({
|
|
10390
10835
|
filePath,
|
|
@@ -10599,19 +11044,17 @@ var ProviderRegistry = class {
|
|
|
10599
11044
|
|
|
10600
11045
|
// src/evaluation/providers/targets.ts
|
|
10601
11046
|
init_cjs_shims();
|
|
10602
|
-
var
|
|
11047
|
+
var import_node_path24 = __toESM(require("path"), 1);
|
|
10603
11048
|
var import_zod3 = require("zod");
|
|
10604
11049
|
var CliHealthcheckHttpInputSchema = import_zod3.z.object({
|
|
10605
11050
|
url: import_zod3.z.string().min(1, "healthcheck URL is required"),
|
|
10606
|
-
timeout_seconds: import_zod3.z.number().positive().optional()
|
|
10607
|
-
|
|
10608
|
-
});
|
|
11051
|
+
timeout_seconds: import_zod3.z.number().positive().optional()
|
|
11052
|
+
}).passthrough();
|
|
10609
11053
|
var CliHealthcheckCommandInputSchema = import_zod3.z.object({
|
|
10610
11054
|
command: import_zod3.z.string().min(1, "healthcheck command is required"),
|
|
10611
11055
|
cwd: import_zod3.z.string().optional(),
|
|
10612
|
-
timeout_seconds: import_zod3.z.number().positive().optional()
|
|
10613
|
-
|
|
10614
|
-
});
|
|
11056
|
+
timeout_seconds: import_zod3.z.number().positive().optional()
|
|
11057
|
+
}).passthrough();
|
|
10615
11058
|
var CliHealthcheckInputSchema = import_zod3.z.union([
|
|
10616
11059
|
CliHealthcheckHttpInputSchema,
|
|
10617
11060
|
CliHealthcheckCommandInputSchema
|
|
@@ -10623,36 +11066,28 @@ var CliTargetInputSchema = import_zod3.z.object({
|
|
|
10623
11066
|
command: import_zod3.z.string(),
|
|
10624
11067
|
// Files format - optional
|
|
10625
11068
|
files_format: import_zod3.z.string().optional(),
|
|
10626
|
-
filesFormat: import_zod3.z.string().optional(),
|
|
10627
11069
|
attachments_format: import_zod3.z.string().optional(),
|
|
10628
|
-
attachmentsFormat: import_zod3.z.string().optional(),
|
|
10629
11070
|
// Working directory - optional
|
|
10630
11071
|
cwd: import_zod3.z.string().optional(),
|
|
10631
11072
|
// Workspace template directory - optional (mutually exclusive with cwd)
|
|
10632
11073
|
workspace_template: import_zod3.z.string().optional(),
|
|
10633
|
-
workspaceTemplate: import_zod3.z.string().optional(),
|
|
10634
11074
|
// Timeout in seconds - optional
|
|
10635
11075
|
timeout_seconds: import_zod3.z.number().positive().optional(),
|
|
10636
|
-
timeoutSeconds: import_zod3.z.number().positive().optional(),
|
|
10637
11076
|
// Healthcheck configuration - optional
|
|
10638
11077
|
healthcheck: CliHealthcheckInputSchema.optional(),
|
|
10639
11078
|
// Verbose mode - optional
|
|
10640
11079
|
verbose: import_zod3.z.boolean().optional(),
|
|
10641
11080
|
cli_verbose: import_zod3.z.boolean().optional(),
|
|
10642
|
-
cliVerbose: import_zod3.z.boolean().optional(),
|
|
10643
11081
|
// Keep temp files - optional
|
|
10644
11082
|
keep_temp_files: import_zod3.z.boolean().optional(),
|
|
10645
|
-
keepTempFiles: import_zod3.z.boolean().optional(),
|
|
10646
11083
|
keep_output_files: import_zod3.z.boolean().optional(),
|
|
10647
|
-
keepOutputFiles: import_zod3.z.boolean().optional(),
|
|
10648
11084
|
// Common target fields
|
|
10649
11085
|
grader_target: import_zod3.z.string().optional(),
|
|
10650
11086
|
judge_target: import_zod3.z.string().optional(),
|
|
10651
11087
|
// backward compat
|
|
10652
11088
|
workers: import_zod3.z.number().int().min(1).optional(),
|
|
10653
|
-
provider_batching: import_zod3.z.boolean().optional()
|
|
10654
|
-
|
|
10655
|
-
});
|
|
11089
|
+
provider_batching: import_zod3.z.boolean().optional()
|
|
11090
|
+
}).passthrough();
|
|
10656
11091
|
var CliHealthcheckHttpSchema = import_zod3.z.object({
|
|
10657
11092
|
url: import_zod3.z.string().min(1),
|
|
10658
11093
|
timeoutMs: import_zod3.z.number().positive().optional()
|
|
@@ -10677,7 +11112,7 @@ var CliTargetConfigSchema = import_zod3.z.object({
|
|
|
10677
11112
|
keepTempFiles: import_zod3.z.boolean().optional()
|
|
10678
11113
|
}).strict();
|
|
10679
11114
|
function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
10680
|
-
const timeoutSeconds = input.timeout_seconds
|
|
11115
|
+
const timeoutSeconds = input.timeout_seconds;
|
|
10681
11116
|
const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
|
|
10682
11117
|
if ("url" in input && input.url) {
|
|
10683
11118
|
const url = resolveString(input.url, env, `${targetName} healthcheck URL`);
|
|
@@ -10696,11 +11131,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
|
10696
11131
|
allowLiteral: true,
|
|
10697
11132
|
optionalEnv: true
|
|
10698
11133
|
});
|
|
10699
|
-
if (cwd && evalFilePath && !
|
|
10700
|
-
cwd =
|
|
11134
|
+
if (cwd && evalFilePath && !import_node_path24.default.isAbsolute(cwd)) {
|
|
11135
|
+
cwd = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), cwd);
|
|
10701
11136
|
}
|
|
10702
11137
|
if (!cwd && evalFilePath) {
|
|
10703
|
-
cwd =
|
|
11138
|
+
cwd = import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath));
|
|
10704
11139
|
}
|
|
10705
11140
|
return {
|
|
10706
11141
|
command,
|
|
@@ -10711,9 +11146,9 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
|
10711
11146
|
function normalizeCliTargetInput(input, env, evalFilePath) {
|
|
10712
11147
|
const targetName = input.name;
|
|
10713
11148
|
const command = resolveString(input.command, env, `${targetName} CLI command`, true);
|
|
10714
|
-
const filesFormatSource = input.files_format ?? input.
|
|
11149
|
+
const filesFormatSource = input.files_format ?? input.attachments_format;
|
|
10715
11150
|
const filesFormat = resolveOptionalLiteralString(filesFormatSource);
|
|
10716
|
-
const workspaceTemplateSource = input.workspace_template
|
|
11151
|
+
const workspaceTemplateSource = input.workspace_template;
|
|
10717
11152
|
let workspaceTemplate = resolveOptionalString(
|
|
10718
11153
|
workspaceTemplateSource,
|
|
10719
11154
|
env,
|
|
@@ -10723,15 +11158,15 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
|
|
|
10723
11158
|
optionalEnv: true
|
|
10724
11159
|
}
|
|
10725
11160
|
);
|
|
10726
|
-
if (workspaceTemplate && evalFilePath && !
|
|
10727
|
-
workspaceTemplate =
|
|
11161
|
+
if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
|
|
11162
|
+
workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
|
|
10728
11163
|
}
|
|
10729
11164
|
let cwd = resolveOptionalString(input.cwd, env, `${targetName} working directory`, {
|
|
10730
11165
|
allowLiteral: true,
|
|
10731
11166
|
optionalEnv: true
|
|
10732
11167
|
});
|
|
10733
|
-
if (cwd && evalFilePath && !
|
|
10734
|
-
cwd =
|
|
11168
|
+
if (cwd && evalFilePath && !import_node_path24.default.isAbsolute(cwd)) {
|
|
11169
|
+
cwd = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), cwd);
|
|
10735
11170
|
}
|
|
10736
11171
|
if (cwd && workspaceTemplate) {
|
|
10737
11172
|
throw new Error(
|
|
@@ -10739,14 +11174,12 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
|
|
|
10739
11174
|
);
|
|
10740
11175
|
}
|
|
10741
11176
|
if (!cwd && !workspaceTemplate && evalFilePath) {
|
|
10742
|
-
cwd =
|
|
11177
|
+
cwd = import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath));
|
|
10743
11178
|
}
|
|
10744
|
-
const timeoutSeconds = input.timeout_seconds
|
|
11179
|
+
const timeoutSeconds = input.timeout_seconds;
|
|
10745
11180
|
const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
|
|
10746
|
-
const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose
|
|
10747
|
-
const keepTempFiles = resolveOptionalBoolean(
|
|
10748
|
-
input.keep_temp_files ?? input.keepTempFiles ?? input.keep_output_files ?? input.keepOutputFiles
|
|
10749
|
-
);
|
|
11181
|
+
const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose);
|
|
11182
|
+
const keepTempFiles = resolveOptionalBoolean(input.keep_temp_files ?? input.keep_output_files);
|
|
10750
11183
|
const healthcheck = input.healthcheck ? normalizeCliHealthcheck(input.healthcheck, env, targetName, evalFilePath) : void 0;
|
|
10751
11184
|
return {
|
|
10752
11185
|
command,
|
|
@@ -10767,15 +11200,106 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
|
10767
11200
|
"FILES",
|
|
10768
11201
|
"OUTPUT_FILE"
|
|
10769
11202
|
]);
|
|
11203
|
+
var DEPRECATED_TARGET_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
|
|
11204
|
+
["providerBatching", "provider_batching"],
|
|
11205
|
+
["subagentModeAllowed", "subagent_mode_allowed"],
|
|
11206
|
+
["fallbackTargets", "fallback_targets"],
|
|
11207
|
+
["resourceName", "endpoint"],
|
|
11208
|
+
["baseUrl", "base_url"],
|
|
11209
|
+
["apiKey", "api_key"],
|
|
11210
|
+
["deploymentName", "model"],
|
|
11211
|
+
["thinkingBudget", "thinking_budget"],
|
|
11212
|
+
["maxTokens", "max_output_tokens"],
|
|
11213
|
+
["apiFormat", "api_format"],
|
|
11214
|
+
["timeoutSeconds", "timeout_seconds"],
|
|
11215
|
+
["logDir", "log_dir"],
|
|
11216
|
+
["logDirectory", "log_directory"],
|
|
11217
|
+
["logFormat", "log_format"],
|
|
11218
|
+
["logOutputFormat", "log_output_format"],
|
|
11219
|
+
["systemPrompt", "system_prompt"],
|
|
11220
|
+
["maxTurns", "max_turns"],
|
|
11221
|
+
["maxBudgetUsd", "max_budget_usd"],
|
|
11222
|
+
["dryRun", "dry_run"],
|
|
11223
|
+
["subagentRoot", "subagent_root"],
|
|
11224
|
+
["filesFormat", "files_format"],
|
|
11225
|
+
["attachmentsFormat", "attachments_format"],
|
|
11226
|
+
["cliUrl", "cli_url"],
|
|
11227
|
+
["cliPath", "cli_path"],
|
|
11228
|
+
["githubToken", "github_token"],
|
|
11229
|
+
["sessionDir", "session_dir"],
|
|
11230
|
+
["sessionId", "session_id"],
|
|
11231
|
+
["sessionStateDir", "session_state_dir"],
|
|
11232
|
+
["maxRetries", "max_retries"],
|
|
11233
|
+
["retryInitialDelayMs", "retry_initial_delay_ms"],
|
|
11234
|
+
["retryMaxDelayMs", "retry_max_delay_ms"],
|
|
11235
|
+
["retryBackoffFactor", "retry_backoff_factor"],
|
|
11236
|
+
["retryStatusCodes", "retry_status_codes"]
|
|
11237
|
+
]);
|
|
11238
|
+
var DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
|
|
11239
|
+
["timeoutSeconds", "timeout_seconds"]
|
|
11240
|
+
]);
|
|
11241
|
+
function collectDeprecatedCamelCaseWarnings(value, location, aliases) {
|
|
11242
|
+
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
11243
|
+
return [];
|
|
11244
|
+
}
|
|
11245
|
+
const warnings = [];
|
|
11246
|
+
for (const [camelCaseField, snakeCaseField] of aliases) {
|
|
11247
|
+
if (Object.prototype.hasOwnProperty.call(value, camelCaseField)) {
|
|
11248
|
+
warnings.push({
|
|
11249
|
+
location: `${location}.${camelCaseField}`,
|
|
11250
|
+
message: `camelCase field '${camelCaseField}' is no longer supported in targets.yaml. Use '${snakeCaseField}' instead.`
|
|
11251
|
+
});
|
|
11252
|
+
}
|
|
11253
|
+
}
|
|
11254
|
+
return warnings;
|
|
11255
|
+
}
|
|
11256
|
+
function assertNoDeprecatedCamelCaseTargetFields(definition) {
|
|
11257
|
+
if (Object.prototype.hasOwnProperty.call(definition, "workspaceTemplate")) {
|
|
11258
|
+
throw new Error(
|
|
11259
|
+
`${definition.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
|
|
11260
|
+
);
|
|
11261
|
+
}
|
|
11262
|
+
const warning = findDeprecatedCamelCaseTargetWarnings(
|
|
11263
|
+
definition,
|
|
11264
|
+
`target "${definition.name}"`
|
|
11265
|
+
)[0];
|
|
11266
|
+
if (!warning) {
|
|
11267
|
+
return;
|
|
11268
|
+
}
|
|
11269
|
+
const fieldMatch = warning.message.match(/field '([^']+)'/);
|
|
11270
|
+
const replacementMatch = warning.message.match(/Use '([^']+)' instead/);
|
|
11271
|
+
const field = fieldMatch?.[1] ?? "unknown";
|
|
11272
|
+
const replacement = replacementMatch?.[1] ?? "snake_case";
|
|
11273
|
+
throw new Error(
|
|
11274
|
+
`${warning.location}: camelCase field '${field}' is no longer supported in targets.yaml. Use '${replacement}' instead.`
|
|
11275
|
+
);
|
|
11276
|
+
}
|
|
11277
|
+
function findDeprecatedCamelCaseTargetWarnings(target, location) {
|
|
11278
|
+
const warnings = collectDeprecatedCamelCaseWarnings(
|
|
11279
|
+
target,
|
|
11280
|
+
location,
|
|
11281
|
+
DEPRECATED_TARGET_CAMEL_CASE_FIELDS
|
|
11282
|
+
);
|
|
11283
|
+
if (typeof target !== "object" || target === null || Array.isArray(target)) {
|
|
11284
|
+
return warnings;
|
|
11285
|
+
}
|
|
11286
|
+
const healthcheck = target.healthcheck;
|
|
11287
|
+
warnings.push(
|
|
11288
|
+
...collectDeprecatedCamelCaseWarnings(
|
|
11289
|
+
healthcheck,
|
|
11290
|
+
`${location}.healthcheck`,
|
|
11291
|
+
DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS
|
|
11292
|
+
)
|
|
11293
|
+
);
|
|
11294
|
+
return warnings;
|
|
11295
|
+
}
|
|
10770
11296
|
var COMMON_TARGET_SETTINGS = [
|
|
10771
11297
|
"use_target",
|
|
10772
11298
|
"provider_batching",
|
|
10773
|
-
"providerBatching",
|
|
10774
11299
|
"subagent_mode_allowed",
|
|
10775
|
-
"
|
|
10776
|
-
"fallback_targets",
|
|
10777
|
-
"fallbackTargets"
|
|
11300
|
+
"fallback_targets"
|
|
10778
11301
|
];
|
|
11302
|
+
var USE_TARGET_ENV_PATTERN = /^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i;
|
|
10779
11303
|
var BASE_TARGET_SCHEMA = import_zod3.z.object({
|
|
10780
11304
|
name: import_zod3.z.string().min(1, "target name is required"),
|
|
10781
11305
|
provider: import_zod3.z.string().optional(),
|
|
@@ -10785,43 +11309,40 @@ var BASE_TARGET_SCHEMA = import_zod3.z.object({
|
|
|
10785
11309
|
// backward compat
|
|
10786
11310
|
workers: import_zod3.z.number().int().min(1).optional(),
|
|
10787
11311
|
workspace_template: import_zod3.z.string().optional(),
|
|
10788
|
-
workspaceTemplate: import_zod3.z.string().optional(),
|
|
10789
11312
|
subagent_mode_allowed: import_zod3.z.boolean().optional(),
|
|
10790
|
-
fallback_targets: import_zod3.z.array(import_zod3.z.string().min(1)).optional()
|
|
10791
|
-
fallbackTargets: import_zod3.z.array(import_zod3.z.string().min(1)).optional()
|
|
11313
|
+
fallback_targets: import_zod3.z.array(import_zod3.z.string().min(1)).optional()
|
|
10792
11314
|
}).passthrough();
|
|
10793
11315
|
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
|
|
11316
|
+
var DEFAULT_AZURE_RESPONSES_API_VERSION = "v1";
|
|
10794
11317
|
var DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
|
|
10795
|
-
function normalizeAzureApiVersion(value) {
|
|
11318
|
+
function normalizeAzureApiVersion(value, apiFormat) {
|
|
11319
|
+
const defaultVersion = apiFormat === "responses" ? DEFAULT_AZURE_RESPONSES_API_VERSION : DEFAULT_AZURE_API_VERSION;
|
|
10796
11320
|
if (!value) {
|
|
10797
|
-
return
|
|
11321
|
+
return defaultVersion;
|
|
10798
11322
|
}
|
|
10799
11323
|
const trimmed = value.trim();
|
|
10800
11324
|
if (trimmed.length === 0) {
|
|
10801
|
-
return
|
|
11325
|
+
return defaultVersion;
|
|
10802
11326
|
}
|
|
10803
11327
|
const withoutPrefix = trimmed.replace(/^api[-_]?version\s*=\s*/i, "").trim();
|
|
10804
|
-
return withoutPrefix.length > 0 ? withoutPrefix :
|
|
11328
|
+
return withoutPrefix.length > 0 ? withoutPrefix : defaultVersion;
|
|
10805
11329
|
}
|
|
10806
11330
|
function resolveRetryConfig(target) {
|
|
10807
|
-
const maxRetries = resolveOptionalNumber(
|
|
10808
|
-
target.max_retries ?? target.maxRetries,
|
|
10809
|
-
`${target.name} max retries`
|
|
10810
|
-
);
|
|
11331
|
+
const maxRetries = resolveOptionalNumber(target.max_retries, `${target.name} max retries`);
|
|
10811
11332
|
const initialDelayMs = resolveOptionalNumber(
|
|
10812
|
-
target.retry_initial_delay_ms
|
|
11333
|
+
target.retry_initial_delay_ms,
|
|
10813
11334
|
`${target.name} retry initial delay`
|
|
10814
11335
|
);
|
|
10815
11336
|
const maxDelayMs = resolveOptionalNumber(
|
|
10816
|
-
target.retry_max_delay_ms
|
|
11337
|
+
target.retry_max_delay_ms,
|
|
10817
11338
|
`${target.name} retry max delay`
|
|
10818
11339
|
);
|
|
10819
11340
|
const backoffFactor = resolveOptionalNumber(
|
|
10820
|
-
target.retry_backoff_factor
|
|
11341
|
+
target.retry_backoff_factor,
|
|
10821
11342
|
`${target.name} retry backoff factor`
|
|
10822
11343
|
);
|
|
10823
11344
|
const retryableStatusCodes = resolveOptionalNumberArray(
|
|
10824
|
-
target.retry_status_codes
|
|
11345
|
+
target.retry_status_codes,
|
|
10825
11346
|
`${target.name} retry status codes`
|
|
10826
11347
|
);
|
|
10827
11348
|
if (maxRetries === void 0 && initialDelayMs === void 0 && maxDelayMs === void 0 && backoffFactor === void 0 && retryableStatusCodes === void 0) {
|
|
@@ -10835,9 +11356,56 @@ function resolveRetryConfig(target) {
|
|
|
10835
11356
|
retryableStatusCodes
|
|
10836
11357
|
};
|
|
10837
11358
|
}
|
|
10838
|
-
function
|
|
11359
|
+
function resolveDelegatedTargetDefinition(name, definitions, env = process.env) {
|
|
11360
|
+
let definition = definitions.get(name);
|
|
11361
|
+
if (!definition) {
|
|
11362
|
+
return void 0;
|
|
11363
|
+
}
|
|
11364
|
+
const visited = [definition.name];
|
|
11365
|
+
for (let depth = 0; depth < 10; depth++) {
|
|
11366
|
+
const rawUseTarget = typeof definition.use_target === "string" ? definition.use_target.trim() : void 0;
|
|
11367
|
+
if (!rawUseTarget) {
|
|
11368
|
+
return definition;
|
|
11369
|
+
}
|
|
11370
|
+
const envMatch = rawUseTarget.match(USE_TARGET_ENV_PATTERN);
|
|
11371
|
+
const envVarName = envMatch?.[1];
|
|
11372
|
+
const resolvedName = envVarName ? env[envVarName]?.trim() ?? "" : rawUseTarget;
|
|
11373
|
+
if (resolvedName.length === 0) {
|
|
11374
|
+
if (envVarName) {
|
|
11375
|
+
throw new Error(
|
|
11376
|
+
`Target "${definition.name}" uses use_target: \${{ ${envVarName} }}, but ${envVarName} is not set. Set ${envVarName} to the name of a concrete target (for example, "azure") before running the eval.`
|
|
11377
|
+
);
|
|
11378
|
+
}
|
|
11379
|
+
throw new Error(
|
|
11380
|
+
`Target "${definition.name}" has an empty use_target value. Point it at a concrete target name before running the eval.`
|
|
11381
|
+
);
|
|
11382
|
+
}
|
|
11383
|
+
const next = definitions.get(resolvedName);
|
|
11384
|
+
if (!next) {
|
|
11385
|
+
if (envVarName) {
|
|
11386
|
+
throw new Error(
|
|
11387
|
+
`Target "${definition.name}" uses use_target: \${{ ${envVarName} }}, which resolved to "${resolvedName}", but no target named "${resolvedName}" exists.`
|
|
11388
|
+
);
|
|
11389
|
+
}
|
|
11390
|
+
throw new Error(
|
|
11391
|
+
`Target "${definition.name}" uses use_target: "${resolvedName}", but no target named "${resolvedName}" exists.`
|
|
11392
|
+
);
|
|
11393
|
+
}
|
|
11394
|
+
if (visited.includes(next.name)) {
|
|
11395
|
+
const chain = [...visited, next.name].join(" -> ");
|
|
11396
|
+
throw new Error(`Circular use_target reference detected: ${chain}`);
|
|
11397
|
+
}
|
|
11398
|
+
definition = next;
|
|
11399
|
+
visited.push(definition.name);
|
|
11400
|
+
}
|
|
11401
|
+
throw new Error(
|
|
11402
|
+
`Target "${name}" exceeded the maximum use_target resolution depth (10). Check for a delegation loop or overly deep alias chain.`
|
|
11403
|
+
);
|
|
11404
|
+
}
|
|
11405
|
+
function resolveTargetDefinition(definition, env = process.env, evalFilePath, options) {
|
|
11406
|
+
assertNoDeprecatedCamelCaseTargetFields(definition);
|
|
10839
11407
|
const parsed = BASE_TARGET_SCHEMA.parse(definition);
|
|
10840
|
-
if (parsed.workspace_template !== void 0
|
|
11408
|
+
if (parsed.workspace_template !== void 0) {
|
|
10841
11409
|
throw new Error(
|
|
10842
11410
|
`${parsed.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
|
|
10843
11411
|
);
|
|
@@ -10853,13 +11421,9 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
10853
11421
|
`${parsed.name} provider`,
|
|
10854
11422
|
true
|
|
10855
11423
|
).toLowerCase();
|
|
10856
|
-
const providerBatching = resolveOptionalBoolean(
|
|
10857
|
-
|
|
10858
|
-
|
|
10859
|
-
const subagentModeAllowed = resolveOptionalBoolean(
|
|
10860
|
-
parsed.subagent_mode_allowed ?? parsed.subagentModeAllowed
|
|
10861
|
-
);
|
|
10862
|
-
const fallbackTargets = parsed.fallback_targets ?? parsed.fallbackTargets;
|
|
11424
|
+
const providerBatching = resolveOptionalBoolean(parsed.provider_batching);
|
|
11425
|
+
const subagentModeAllowed = resolveOptionalBoolean(parsed.subagent_mode_allowed);
|
|
11426
|
+
const fallbackTargets = parsed.fallback_targets;
|
|
10863
11427
|
const base = {
|
|
10864
11428
|
name: parsed.name,
|
|
10865
11429
|
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
@@ -11009,20 +11573,22 @@ function normalizeOpenAIBaseUrl(value) {
|
|
|
11009
11573
|
return trimmed.endsWith("/v1") ? trimmed : `${trimmed}/v1`;
|
|
11010
11574
|
}
|
|
11011
11575
|
function resolveAzureConfig(target, env) {
|
|
11012
|
-
const endpointSource = target.endpoint ?? target.resource
|
|
11013
|
-
const apiKeySource = target.api_key
|
|
11014
|
-
const deploymentSource = target.deployment ?? target.
|
|
11576
|
+
const endpointSource = target.endpoint ?? target.resource;
|
|
11577
|
+
const apiKeySource = target.api_key;
|
|
11578
|
+
const deploymentSource = target.deployment ?? target.model;
|
|
11015
11579
|
const versionSource = target.version ?? target.api_version;
|
|
11016
11580
|
const temperatureSource = target.temperature;
|
|
11017
|
-
const maxTokensSource = target.max_output_tokens
|
|
11581
|
+
const maxTokensSource = target.max_output_tokens;
|
|
11018
11582
|
const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
|
|
11019
11583
|
const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
|
|
11020
11584
|
const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
|
|
11585
|
+
const apiFormat = resolveApiFormat(target, env, target.name);
|
|
11021
11586
|
const version = normalizeAzureApiVersion(
|
|
11022
11587
|
resolveOptionalString(versionSource, env, `${target.name} api version`, {
|
|
11023
11588
|
allowLiteral: true,
|
|
11024
11589
|
optionalEnv: true
|
|
11025
|
-
})
|
|
11590
|
+
}),
|
|
11591
|
+
apiFormat
|
|
11026
11592
|
);
|
|
11027
11593
|
const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
|
|
11028
11594
|
const maxOutputTokens = resolveOptionalNumber(
|
|
@@ -11035,13 +11601,17 @@ function resolveAzureConfig(target, env) {
|
|
|
11035
11601
|
deploymentName,
|
|
11036
11602
|
apiKey,
|
|
11037
11603
|
version,
|
|
11604
|
+
apiFormat,
|
|
11038
11605
|
temperature,
|
|
11039
11606
|
maxOutputTokens,
|
|
11040
11607
|
retry
|
|
11041
11608
|
};
|
|
11042
11609
|
}
|
|
11043
|
-
function resolveApiFormat(target, targetName) {
|
|
11044
|
-
const raw = target.api_format
|
|
11610
|
+
function resolveApiFormat(target, env, targetName) {
|
|
11611
|
+
const raw = resolveOptionalString(target.api_format, env, `${targetName} api format`, {
|
|
11612
|
+
allowLiteral: true,
|
|
11613
|
+
optionalEnv: true
|
|
11614
|
+
});
|
|
11045
11615
|
if (raw === void 0) return void 0;
|
|
11046
11616
|
if (raw === "chat" || raw === "responses") return raw;
|
|
11047
11617
|
throw new Error(
|
|
@@ -11049,11 +11619,11 @@ function resolveApiFormat(target, targetName) {
|
|
|
11049
11619
|
);
|
|
11050
11620
|
}
|
|
11051
11621
|
function resolveOpenAIConfig(target, env) {
|
|
11052
|
-
const endpointSource = target.endpoint ?? target.base_url
|
|
11053
|
-
const apiKeySource = target.api_key
|
|
11622
|
+
const endpointSource = target.endpoint ?? target.base_url;
|
|
11623
|
+
const apiKeySource = target.api_key;
|
|
11054
11624
|
const modelSource = target.model ?? target.deployment ?? target.variant;
|
|
11055
11625
|
const temperatureSource = target.temperature;
|
|
11056
|
-
const maxTokensSource = target.max_output_tokens
|
|
11626
|
+
const maxTokensSource = target.max_output_tokens;
|
|
11057
11627
|
const baseURL = normalizeOpenAIBaseUrl(
|
|
11058
11628
|
resolveOptionalString(endpointSource, env, `${target.name} endpoint`, {
|
|
11059
11629
|
allowLiteral: true,
|
|
@@ -11067,17 +11637,17 @@ function resolveOpenAIConfig(target, env) {
|
|
|
11067
11637
|
baseURL,
|
|
11068
11638
|
apiKey,
|
|
11069
11639
|
model,
|
|
11070
|
-
apiFormat: resolveApiFormat(target, target.name),
|
|
11640
|
+
apiFormat: resolveApiFormat(target, env, target.name),
|
|
11071
11641
|
temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
|
|
11072
11642
|
maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
|
|
11073
11643
|
retry
|
|
11074
11644
|
};
|
|
11075
11645
|
}
|
|
11076
11646
|
function resolveOpenRouterConfig(target, env) {
|
|
11077
|
-
const apiKeySource = target.api_key
|
|
11647
|
+
const apiKeySource = target.api_key;
|
|
11078
11648
|
const modelSource = target.model ?? target.deployment ?? target.variant;
|
|
11079
11649
|
const temperatureSource = target.temperature;
|
|
11080
|
-
const maxTokensSource = target.max_output_tokens
|
|
11650
|
+
const maxTokensSource = target.max_output_tokens;
|
|
11081
11651
|
const retry = resolveRetryConfig(target);
|
|
11082
11652
|
return {
|
|
11083
11653
|
apiKey: resolveString(apiKeySource, env, `${target.name} OpenRouter api key`),
|
|
@@ -11088,11 +11658,11 @@ function resolveOpenRouterConfig(target, env) {
|
|
|
11088
11658
|
};
|
|
11089
11659
|
}
|
|
11090
11660
|
function resolveAnthropicConfig(target, env) {
|
|
11091
|
-
const apiKeySource = target.api_key
|
|
11661
|
+
const apiKeySource = target.api_key;
|
|
11092
11662
|
const modelSource = target.model ?? target.deployment ?? target.variant;
|
|
11093
11663
|
const temperatureSource = target.temperature;
|
|
11094
|
-
const maxTokensSource = target.max_output_tokens
|
|
11095
|
-
const thinkingBudgetSource = target.thinking_budget
|
|
11664
|
+
const maxTokensSource = target.max_output_tokens;
|
|
11665
|
+
const thinkingBudgetSource = target.thinking_budget;
|
|
11096
11666
|
const apiKey = resolveString(apiKeySource, env, `${target.name} Anthropic api key`);
|
|
11097
11667
|
const model = resolveString(modelSource, env, `${target.name} Anthropic model`);
|
|
11098
11668
|
const retry = resolveRetryConfig(target);
|
|
@@ -11106,10 +11676,10 @@ function resolveAnthropicConfig(target, env) {
|
|
|
11106
11676
|
};
|
|
11107
11677
|
}
|
|
11108
11678
|
function resolveGeminiConfig(target, env) {
|
|
11109
|
-
const apiKeySource = target.api_key
|
|
11679
|
+
const apiKeySource = target.api_key;
|
|
11110
11680
|
const modelSource = target.model ?? target.deployment ?? target.variant;
|
|
11111
11681
|
const temperatureSource = target.temperature;
|
|
11112
|
-
const maxTokensSource = target.max_output_tokens
|
|
11682
|
+
const maxTokensSource = target.max_output_tokens;
|
|
11113
11683
|
const apiKey = resolveString(apiKeySource, env, `${target.name} Google API key`);
|
|
11114
11684
|
const model = resolveOptionalString(modelSource, env, `${target.name} Gemini model`, {
|
|
11115
11685
|
allowLiteral: true,
|
|
@@ -11129,11 +11699,11 @@ function resolveCodexConfig(target, env, evalFilePath) {
|
|
|
11129
11699
|
const executableSource = target.executable ?? target.command ?? target.binary;
|
|
11130
11700
|
const argsSource = target.args ?? target.arguments;
|
|
11131
11701
|
const cwdSource = target.cwd;
|
|
11132
|
-
const workspaceTemplateSource = target.workspace_template
|
|
11133
|
-
const timeoutSource = target.timeout_seconds
|
|
11134
|
-
const logDirSource = target.log_dir ?? target.
|
|
11135
|
-
const logFormatSource = target.log_format ?? target.
|
|
11136
|
-
const systemPromptSource = target.system_prompt
|
|
11702
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
11703
|
+
const timeoutSource = target.timeout_seconds;
|
|
11704
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
11705
|
+
const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CODEX_LOG_FORMAT;
|
|
11706
|
+
const systemPromptSource = target.system_prompt;
|
|
11137
11707
|
const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, {
|
|
11138
11708
|
allowLiteral: true,
|
|
11139
11709
|
optionalEnv: true
|
|
@@ -11156,8 +11726,8 @@ function resolveCodexConfig(target, env, evalFilePath) {
|
|
|
11156
11726
|
optionalEnv: true
|
|
11157
11727
|
}
|
|
11158
11728
|
);
|
|
11159
|
-
if (workspaceTemplate && evalFilePath && !
|
|
11160
|
-
workspaceTemplate =
|
|
11729
|
+
if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
|
|
11730
|
+
workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
|
|
11161
11731
|
}
|
|
11162
11732
|
if (cwd && workspaceTemplate) {
|
|
11163
11733
|
throw new Error(
|
|
@@ -11197,16 +11767,16 @@ function normalizeCodexLogFormat(value) {
|
|
|
11197
11767
|
throw new Error("codex log format must be 'summary' or 'json'");
|
|
11198
11768
|
}
|
|
11199
11769
|
function resolveCopilotSdkConfig(target, env, evalFilePath) {
|
|
11200
|
-
const cliUrlSource = target.cli_url
|
|
11201
|
-
const cliPathSource = target.cli_path
|
|
11202
|
-
const githubTokenSource = target.github_token
|
|
11770
|
+
const cliUrlSource = target.cli_url;
|
|
11771
|
+
const cliPathSource = target.cli_path;
|
|
11772
|
+
const githubTokenSource = target.github_token;
|
|
11203
11773
|
const modelSource = target.model;
|
|
11204
11774
|
const cwdSource = target.cwd;
|
|
11205
|
-
const workspaceTemplateSource = target.workspace_template
|
|
11206
|
-
const timeoutSource = target.timeout_seconds
|
|
11207
|
-
const logDirSource = target.log_dir ?? target.
|
|
11208
|
-
const logFormatSource = target.log_format
|
|
11209
|
-
const systemPromptSource = target.system_prompt
|
|
11775
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
11776
|
+
const timeoutSource = target.timeout_seconds;
|
|
11777
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
11778
|
+
const logFormatSource = target.log_format;
|
|
11779
|
+
const systemPromptSource = target.system_prompt;
|
|
11210
11780
|
const cliUrl = resolveOptionalString(cliUrlSource, env, `${target.name} copilot-sdk cli URL`, {
|
|
11211
11781
|
allowLiteral: true,
|
|
11212
11782
|
optionalEnv: true
|
|
@@ -11241,8 +11811,8 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
|
|
|
11241
11811
|
optionalEnv: true
|
|
11242
11812
|
}
|
|
11243
11813
|
);
|
|
11244
|
-
if (workspaceTemplate && evalFilePath && !
|
|
11245
|
-
workspaceTemplate =
|
|
11814
|
+
if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
|
|
11815
|
+
workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
|
|
11246
11816
|
}
|
|
11247
11817
|
if (cwd && workspaceTemplate) {
|
|
11248
11818
|
throw new Error(
|
|
@@ -11279,11 +11849,11 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
|
|
|
11279
11849
|
const modelSource = target.model;
|
|
11280
11850
|
const argsSource = target.args ?? target.arguments;
|
|
11281
11851
|
const cwdSource = target.cwd;
|
|
11282
|
-
const workspaceTemplateSource = target.workspace_template
|
|
11283
|
-
const timeoutSource = target.timeout_seconds
|
|
11284
|
-
const logDirSource = target.log_dir ?? target.
|
|
11285
|
-
const logFormatSource = target.log_format
|
|
11286
|
-
const systemPromptSource = target.system_prompt
|
|
11852
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
11853
|
+
const timeoutSource = target.timeout_seconds;
|
|
11854
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
11855
|
+
const logFormatSource = target.log_format;
|
|
11856
|
+
const systemPromptSource = target.system_prompt;
|
|
11287
11857
|
const executable = resolveOptionalString(executableSource, env, `${target.name} copilot-cli executable`, {
|
|
11288
11858
|
allowLiteral: true,
|
|
11289
11859
|
optionalEnv: true
|
|
@@ -11306,8 +11876,8 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
|
|
|
11306
11876
|
optionalEnv: true
|
|
11307
11877
|
}
|
|
11308
11878
|
);
|
|
11309
|
-
if (workspaceTemplate && evalFilePath && !
|
|
11310
|
-
workspaceTemplate =
|
|
11879
|
+
if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
|
|
11880
|
+
workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
|
|
11311
11881
|
}
|
|
11312
11882
|
if (cwd && workspaceTemplate) {
|
|
11313
11883
|
throw new Error(
|
|
@@ -11347,16 +11917,16 @@ function normalizeCopilotLogFormat(value) {
|
|
|
11347
11917
|
}
|
|
11348
11918
|
function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
11349
11919
|
const subproviderSource = target.subprovider;
|
|
11350
|
-
const modelSource = target.model ?? target.pi_model
|
|
11351
|
-
const apiKeySource = target.api_key
|
|
11352
|
-
const toolsSource = target.tools ?? target.pi_tools
|
|
11353
|
-
const thinkingSource = target.thinking ?? target.pi_thinking
|
|
11920
|
+
const modelSource = target.model ?? target.pi_model;
|
|
11921
|
+
const apiKeySource = target.api_key;
|
|
11922
|
+
const toolsSource = target.tools ?? target.pi_tools;
|
|
11923
|
+
const thinkingSource = target.thinking ?? target.pi_thinking;
|
|
11354
11924
|
const cwdSource = target.cwd;
|
|
11355
|
-
const workspaceTemplateSource = target.workspace_template
|
|
11356
|
-
const timeoutSource = target.timeout_seconds
|
|
11357
|
-
const logDirSource = target.log_dir ?? target.
|
|
11358
|
-
const logFormatSource = target.log_format
|
|
11359
|
-
const systemPromptSource = target.system_prompt
|
|
11925
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
11926
|
+
const timeoutSource = target.timeout_seconds;
|
|
11927
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
11928
|
+
const logFormatSource = target.log_format;
|
|
11929
|
+
const systemPromptSource = target.system_prompt;
|
|
11360
11930
|
const subprovider = resolveOptionalString(
|
|
11361
11931
|
subproviderSource,
|
|
11362
11932
|
env,
|
|
@@ -11374,6 +11944,11 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
11374
11944
|
allowLiteral: false,
|
|
11375
11945
|
optionalEnv: true
|
|
11376
11946
|
});
|
|
11947
|
+
const baseUrlSource = target.base_url ?? target.endpoint;
|
|
11948
|
+
const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} pi base url`, {
|
|
11949
|
+
allowLiteral: true,
|
|
11950
|
+
optionalEnv: true
|
|
11951
|
+
});
|
|
11377
11952
|
const tools = resolveOptionalString(toolsSource, env, `${target.name} pi tools`, {
|
|
11378
11953
|
allowLiteral: true,
|
|
11379
11954
|
optionalEnv: true
|
|
@@ -11395,8 +11970,8 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
11395
11970
|
optionalEnv: true
|
|
11396
11971
|
}
|
|
11397
11972
|
);
|
|
11398
|
-
if (workspaceTemplate && evalFilePath && !
|
|
11399
|
-
workspaceTemplate =
|
|
11973
|
+
if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
|
|
11974
|
+
workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
|
|
11400
11975
|
}
|
|
11401
11976
|
if (cwd && workspaceTemplate) {
|
|
11402
11977
|
throw new Error(
|
|
@@ -11414,6 +11989,7 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
11414
11989
|
subprovider,
|
|
11415
11990
|
model,
|
|
11416
11991
|
apiKey,
|
|
11992
|
+
baseUrl,
|
|
11417
11993
|
tools,
|
|
11418
11994
|
thinking,
|
|
11419
11995
|
cwd,
|
|
@@ -11427,16 +12003,16 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
11427
12003
|
function resolvePiCliConfig(target, env, evalFilePath) {
|
|
11428
12004
|
const executableSource = target.executable ?? target.command ?? target.binary;
|
|
11429
12005
|
const subproviderSource = target.subprovider;
|
|
11430
|
-
const modelSource = target.model ?? target.pi_model
|
|
11431
|
-
const apiKeySource = target.api_key
|
|
11432
|
-
const toolsSource = target.tools ?? target.pi_tools
|
|
11433
|
-
const thinkingSource = target.thinking ?? target.pi_thinking
|
|
12006
|
+
const modelSource = target.model ?? target.pi_model;
|
|
12007
|
+
const apiKeySource = target.api_key;
|
|
12008
|
+
const toolsSource = target.tools ?? target.pi_tools;
|
|
12009
|
+
const thinkingSource = target.thinking ?? target.pi_thinking;
|
|
11434
12010
|
const cwdSource = target.cwd;
|
|
11435
|
-
const workspaceTemplateSource = target.workspace_template
|
|
11436
|
-
const timeoutSource = target.timeout_seconds
|
|
11437
|
-
const logDirSource = target.log_dir ?? target.
|
|
11438
|
-
const logFormatSource = target.log_format
|
|
11439
|
-
const systemPromptSource = target.system_prompt
|
|
12011
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
12012
|
+
const timeoutSource = target.timeout_seconds;
|
|
12013
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
12014
|
+
const logFormatSource = target.log_format;
|
|
12015
|
+
const systemPromptSource = target.system_prompt;
|
|
11440
12016
|
const executable = resolveOptionalString(executableSource, env, `${target.name} pi-cli executable`, {
|
|
11441
12017
|
allowLiteral: true,
|
|
11442
12018
|
optionalEnv: true
|
|
@@ -11455,6 +12031,11 @@ function resolvePiCliConfig(target, env, evalFilePath) {
|
|
|
11455
12031
|
allowLiteral: false,
|
|
11456
12032
|
optionalEnv: true
|
|
11457
12033
|
});
|
|
12034
|
+
const baseUrlSource = target.base_url ?? target.endpoint;
|
|
12035
|
+
const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} pi-cli base url`, {
|
|
12036
|
+
allowLiteral: true,
|
|
12037
|
+
optionalEnv: true
|
|
12038
|
+
});
|
|
11458
12039
|
const tools = resolveOptionalString(toolsSource, env, `${target.name} pi-cli tools`, {
|
|
11459
12040
|
allowLiteral: true,
|
|
11460
12041
|
optionalEnv: true
|
|
@@ -11475,8 +12056,8 @@ function resolvePiCliConfig(target, env, evalFilePath) {
|
|
|
11475
12056
|
`${target.name} pi-cli workspace template`,
|
|
11476
12057
|
{ allowLiteral: true, optionalEnv: true }
|
|
11477
12058
|
);
|
|
11478
|
-
if (workspaceTemplate && evalFilePath && !
|
|
11479
|
-
workspaceTemplate =
|
|
12059
|
+
if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
|
|
12060
|
+
workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
|
|
11480
12061
|
}
|
|
11481
12062
|
if (cwd && workspaceTemplate) {
|
|
11482
12063
|
throw new Error(`${target.name}: 'cwd' and 'workspace_template' are mutually exclusive.`);
|
|
@@ -11493,6 +12074,7 @@ function resolvePiCliConfig(target, env, evalFilePath) {
|
|
|
11493
12074
|
subprovider,
|
|
11494
12075
|
model,
|
|
11495
12076
|
apiKey,
|
|
12077
|
+
baseUrl,
|
|
11496
12078
|
tools,
|
|
11497
12079
|
thinking,
|
|
11498
12080
|
args,
|
|
@@ -11507,11 +12089,11 @@ function resolvePiCliConfig(target, env, evalFilePath) {
|
|
|
11507
12089
|
function resolveClaudeConfig(target, env, evalFilePath) {
|
|
11508
12090
|
const modelSource = target.model;
|
|
11509
12091
|
const cwdSource = target.cwd;
|
|
11510
|
-
const workspaceTemplateSource = target.workspace_template
|
|
11511
|
-
const timeoutSource = target.timeout_seconds
|
|
11512
|
-
const logDirSource = target.log_dir ?? target.
|
|
11513
|
-
const logFormatSource = target.log_format ?? target.
|
|
11514
|
-
const systemPromptSource = target.system_prompt
|
|
12092
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
12093
|
+
const timeoutSource = target.timeout_seconds;
|
|
12094
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
12095
|
+
const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CLAUDE_LOG_FORMAT;
|
|
12096
|
+
const systemPromptSource = target.system_prompt;
|
|
11515
12097
|
const model = resolveOptionalString(modelSource, env, `${target.name} claude model`, {
|
|
11516
12098
|
allowLiteral: true,
|
|
11517
12099
|
optionalEnv: true
|
|
@@ -11529,8 +12111,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
|
|
|
11529
12111
|
optionalEnv: true
|
|
11530
12112
|
}
|
|
11531
12113
|
);
|
|
11532
|
-
if (workspaceTemplate && evalFilePath && !
|
|
11533
|
-
workspaceTemplate =
|
|
12114
|
+
if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
|
|
12115
|
+
workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
|
|
11534
12116
|
}
|
|
11535
12117
|
if (cwd && workspaceTemplate) {
|
|
11536
12118
|
throw new Error(
|
|
@@ -11544,8 +12126,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
|
|
|
11544
12126
|
});
|
|
11545
12127
|
const logFormat = normalizeClaudeLogFormat(logFormatSource);
|
|
11546
12128
|
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
11547
|
-
const maxTurns = typeof target.max_turns === "number" ? target.max_turns :
|
|
11548
|
-
const maxBudgetUsd = typeof target.max_budget_usd === "number" ? target.max_budget_usd :
|
|
12129
|
+
const maxTurns = typeof target.max_turns === "number" ? target.max_turns : void 0;
|
|
12130
|
+
const maxBudgetUsd = typeof target.max_budget_usd === "number" ? target.max_budget_usd : void 0;
|
|
11549
12131
|
return {
|
|
11550
12132
|
model,
|
|
11551
12133
|
systemPrompt,
|
|
@@ -11576,9 +12158,7 @@ function resolveMockConfig(target) {
|
|
|
11576
12158
|
return { response };
|
|
11577
12159
|
}
|
|
11578
12160
|
function resolveVSCodeConfig(target, env, insiders, evalFilePath) {
|
|
11579
|
-
const workspaceTemplateEnvVar = resolveOptionalLiteralString(
|
|
11580
|
-
target.workspace_template ?? target.workspaceTemplate
|
|
11581
|
-
);
|
|
12161
|
+
const workspaceTemplateEnvVar = resolveOptionalLiteralString(target.workspace_template);
|
|
11582
12162
|
let workspaceTemplate = workspaceTemplateEnvVar ? resolveOptionalString(
|
|
11583
12163
|
workspaceTemplateEnvVar,
|
|
11584
12164
|
env,
|
|
@@ -11588,14 +12168,14 @@ function resolveVSCodeConfig(target, env, insiders, evalFilePath) {
|
|
|
11588
12168
|
optionalEnv: true
|
|
11589
12169
|
}
|
|
11590
12170
|
) : void 0;
|
|
11591
|
-
if (workspaceTemplate && evalFilePath && !
|
|
11592
|
-
workspaceTemplate =
|
|
12171
|
+
if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
|
|
12172
|
+
workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
|
|
11593
12173
|
}
|
|
11594
12174
|
const executableSource = target.executable;
|
|
11595
12175
|
const waitSource = target.wait;
|
|
11596
|
-
const dryRunSource = target.dry_run
|
|
11597
|
-
const subagentRootSource = target.subagent_root
|
|
11598
|
-
const timeoutSource = target.timeout_seconds
|
|
12176
|
+
const dryRunSource = target.dry_run;
|
|
12177
|
+
const subagentRootSource = target.subagent_root;
|
|
12178
|
+
const timeoutSource = target.timeout_seconds;
|
|
11599
12179
|
const defaultCommand = insiders ? "code-insiders" : "code";
|
|
11600
12180
|
const executable = resolveOptionalString(executableSource, env, `${target.name} vscode executable`, {
|
|
11601
12181
|
allowLiteral: true,
|
|
@@ -11630,8 +12210,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
11630
12210
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
11631
12211
|
if (!parseResult.success) {
|
|
11632
12212
|
const firstError = parseResult.error.errors[0];
|
|
11633
|
-
const
|
|
11634
|
-
const prefix =
|
|
12213
|
+
const path53 = firstError?.path.join(".") || "";
|
|
12214
|
+
const prefix = path53 ? `${target.name} ${path53}: ` : `${target.name}: `;
|
|
11635
12215
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
11636
12216
|
}
|
|
11637
12217
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -11646,17 +12226,17 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
11646
12226
|
}
|
|
11647
12227
|
function resolveDiscoveredProviderConfig(target, providerKind, env, evalFilePath) {
|
|
11648
12228
|
const command = target.command ? resolveString(target.command, env, `${target.name} command`, true) : `bun run .agentv/providers/${providerKind}.ts {PROMPT}`;
|
|
11649
|
-
const timeoutSeconds = target.timeout_seconds
|
|
12229
|
+
const timeoutSeconds = target.timeout_seconds;
|
|
11650
12230
|
const timeoutMs = resolveTimeoutMs(timeoutSeconds, `${target.name} timeout`);
|
|
11651
12231
|
let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
11652
12232
|
allowLiteral: true,
|
|
11653
12233
|
optionalEnv: true
|
|
11654
12234
|
});
|
|
11655
|
-
if (cwd && evalFilePath && !
|
|
11656
|
-
cwd =
|
|
12235
|
+
if (cwd && evalFilePath && !import_node_path24.default.isAbsolute(cwd)) {
|
|
12236
|
+
cwd = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), cwd);
|
|
11657
12237
|
}
|
|
11658
12238
|
if (!cwd && evalFilePath) {
|
|
11659
|
-
cwd =
|
|
12239
|
+
cwd = import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath));
|
|
11660
12240
|
}
|
|
11661
12241
|
return {
|
|
11662
12242
|
command,
|
|
@@ -11710,10 +12290,10 @@ function resolveDiscover(value, targetName) {
|
|
|
11710
12290
|
throw new Error(`Target "${targetName}": discover must be "latest" (got "${String(value)}")`);
|
|
11711
12291
|
}
|
|
11712
12292
|
function resolveCopilotLogConfig(target, env) {
|
|
11713
|
-
const sessionDirSource = target.session_dir
|
|
11714
|
-
const sessionIdSource = target.session_id
|
|
12293
|
+
const sessionDirSource = target.session_dir;
|
|
12294
|
+
const sessionIdSource = target.session_id;
|
|
11715
12295
|
const discoverSource = target.discover;
|
|
11716
|
-
const sessionStateDirSource = target.session_state_dir
|
|
12296
|
+
const sessionStateDirSource = target.session_state_dir;
|
|
11717
12297
|
const cwdSource = target.cwd;
|
|
11718
12298
|
return {
|
|
11719
12299
|
sessionDir: resolveOptionalString(
|
|
@@ -11894,7 +12474,7 @@ var import_node_path33 = __toESM(require("path"), 1);
|
|
|
11894
12474
|
init_cjs_shims();
|
|
11895
12475
|
var import_node_fs11 = require("fs");
|
|
11896
12476
|
var import_promises20 = require("fs/promises");
|
|
11897
|
-
var
|
|
12477
|
+
var import_node_path25 = __toESM(require("path"), 1);
|
|
11898
12478
|
async function pathExists(target) {
|
|
11899
12479
|
try {
|
|
11900
12480
|
await (0, import_promises20.access)(target, import_node_fs11.constants.F_OK);
|
|
@@ -11910,7 +12490,7 @@ async function readDirEntries(target) {
|
|
|
11910
12490
|
const entries = await (0, import_promises20.readdir)(target, { withFileTypes: true });
|
|
11911
12491
|
return entries.map((entry) => ({
|
|
11912
12492
|
name: entry.name,
|
|
11913
|
-
absolutePath:
|
|
12493
|
+
absolutePath: import_node_path25.default.join(target, entry.name),
|
|
11914
12494
|
isDirectory: entry.isDirectory()
|
|
11915
12495
|
}));
|
|
11916
12496
|
}
|
|
@@ -11926,9 +12506,9 @@ async function removeIfExists(target) {
|
|
|
11926
12506
|
|
|
11927
12507
|
// src/evaluation/providers/vscode/utils/path.ts
|
|
11928
12508
|
init_cjs_shims();
|
|
11929
|
-
var
|
|
12509
|
+
var import_node_path26 = __toESM(require("path"), 1);
|
|
11930
12510
|
function pathToFileUri2(filePath) {
|
|
11931
|
-
const absolutePath =
|
|
12511
|
+
const absolutePath = import_node_path26.default.isAbsolute(filePath) ? filePath : import_node_path26.default.resolve(filePath);
|
|
11932
12512
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
11933
12513
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
11934
12514
|
return `file:///${normalizedPath}`;
|
|
@@ -11938,7 +12518,7 @@ function pathToFileUri2(filePath) {
|
|
|
11938
12518
|
|
|
11939
12519
|
// src/evaluation/providers/vscode/dispatch/promptBuilder.ts
|
|
11940
12520
|
init_cjs_shims();
|
|
11941
|
-
var
|
|
12521
|
+
var import_node_path27 = __toESM(require("path"), 1);
|
|
11942
12522
|
|
|
11943
12523
|
// src/evaluation/providers/vscode/utils/template.ts
|
|
11944
12524
|
init_cjs_shims();
|
|
@@ -12032,8 +12612,8 @@ function createBatchRequestPrompt(userQuery, responseFileTmp, responseFileFinal,
|
|
|
12032
12612
|
});
|
|
12033
12613
|
}
|
|
12034
12614
|
function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateContent) {
|
|
12035
|
-
const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${
|
|
12036
|
-
const responseList = responseFiles.map((file) => `"${
|
|
12615
|
+
const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${import_node_path27.default.basename(file)}`).join("\n");
|
|
12616
|
+
const responseList = responseFiles.map((file) => `"${import_node_path27.default.basename(file)}"`).join(", ");
|
|
12037
12617
|
return renderTemplate2(templateContent, {
|
|
12038
12618
|
requestFiles: requestLines,
|
|
12039
12619
|
responseList
|
|
@@ -12043,7 +12623,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
|
|
|
12043
12623
|
// src/evaluation/providers/vscode/dispatch/responseWaiter.ts
|
|
12044
12624
|
init_cjs_shims();
|
|
12045
12625
|
var import_promises21 = require("fs/promises");
|
|
12046
|
-
var
|
|
12626
|
+
var import_node_path28 = __toESM(require("path"), 1);
|
|
12047
12627
|
|
|
12048
12628
|
// src/evaluation/providers/vscode/utils/time.ts
|
|
12049
12629
|
init_cjs_shims();
|
|
@@ -12103,7 +12683,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
|
|
|
12103
12683
|
}
|
|
12104
12684
|
async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, silent = false, timeoutMs = DEFAULT_TIMEOUT_MS) {
|
|
12105
12685
|
if (!silent) {
|
|
12106
|
-
const fileList = responseFilesFinal.map((file) =>
|
|
12686
|
+
const fileList = responseFilesFinal.map((file) => import_node_path28.default.basename(file)).join(", ");
|
|
12107
12687
|
console.error(`waiting for ${responseFilesFinal.length} batch response(s): ${fileList}`);
|
|
12108
12688
|
}
|
|
12109
12689
|
const deadline = Date.now() + timeoutMs;
|
|
@@ -12112,7 +12692,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
12112
12692
|
while (pending.size > 0) {
|
|
12113
12693
|
if (Date.now() >= deadline) {
|
|
12114
12694
|
if (!silent) {
|
|
12115
|
-
const remaining = [...pending].map((f) =>
|
|
12695
|
+
const remaining = [...pending].map((f) => import_node_path28.default.basename(f)).join(", ");
|
|
12116
12696
|
console.error(
|
|
12117
12697
|
`error: timed out after ${Math.round(timeoutMs / 1e3)}s waiting for batch responses. Still pending: ${remaining}`
|
|
12118
12698
|
);
|
|
@@ -12170,37 +12750,6 @@ var import_node_util2 = require("util");
|
|
|
12170
12750
|
// src/evaluation/providers/vscode/dispatch/constants.ts
|
|
12171
12751
|
init_cjs_shims();
|
|
12172
12752
|
var import_node_path29 = __toESM(require("path"), 1);
|
|
12173
|
-
|
|
12174
|
-
// src/paths.ts
|
|
12175
|
-
init_cjs_shims();
|
|
12176
|
-
var import_node_os6 = __toESM(require("os"), 1);
|
|
12177
|
-
var import_node_path28 = __toESM(require("path"), 1);
|
|
12178
|
-
var logged = false;
|
|
12179
|
-
function getAgentvHome() {
|
|
12180
|
-
const envHome = process.env.AGENTV_HOME;
|
|
12181
|
-
if (envHome && envHome !== "undefined") {
|
|
12182
|
-
if (!logged) {
|
|
12183
|
-
logged = true;
|
|
12184
|
-
console.warn(`Using AGENTV_HOME: ${envHome}`);
|
|
12185
|
-
}
|
|
12186
|
-
return envHome;
|
|
12187
|
-
}
|
|
12188
|
-
return import_node_path28.default.join(import_node_os6.default.homedir(), ".agentv");
|
|
12189
|
-
}
|
|
12190
|
-
function getWorkspacesRoot() {
|
|
12191
|
-
return import_node_path28.default.join(getAgentvHome(), "workspaces");
|
|
12192
|
-
}
|
|
12193
|
-
function getSubagentsRoot() {
|
|
12194
|
-
return import_node_path28.default.join(getAgentvHome(), "subagents");
|
|
12195
|
-
}
|
|
12196
|
-
function getTraceStateRoot() {
|
|
12197
|
-
return import_node_path28.default.join(getAgentvHome(), "trace-state");
|
|
12198
|
-
}
|
|
12199
|
-
function getWorkspacePoolRoot() {
|
|
12200
|
-
return import_node_path28.default.join(getAgentvHome(), "workspace-pool");
|
|
12201
|
-
}
|
|
12202
|
-
|
|
12203
|
-
// src/evaluation/providers/vscode/dispatch/constants.ts
|
|
12204
12753
|
var DEFAULT_LOCK_NAME = "subagent.lock";
|
|
12205
12754
|
var DEFAULT_ALIVE_FILENAME = ".alive";
|
|
12206
12755
|
function getDefaultSubagentRoot(vscodeCmd = "code") {
|
|
@@ -13353,6 +13902,15 @@ var AGENT_PROVIDER_KINDS = [
|
|
|
13353
13902
|
"vscode",
|
|
13354
13903
|
"vscode-insiders"
|
|
13355
13904
|
];
|
|
13905
|
+
var LLM_GRADER_CAPABLE_KINDS = [
|
|
13906
|
+
"openai",
|
|
13907
|
+
"openrouter",
|
|
13908
|
+
"azure",
|
|
13909
|
+
"anthropic",
|
|
13910
|
+
"gemini",
|
|
13911
|
+
"agentv",
|
|
13912
|
+
"mock"
|
|
13913
|
+
];
|
|
13356
13914
|
function extractLastAssistantContent(messages) {
|
|
13357
13915
|
if (!messages || messages.length === 0) {
|
|
13358
13916
|
return "";
|
|
@@ -13506,9 +14064,10 @@ init_cjs_shims();
|
|
|
13506
14064
|
|
|
13507
14065
|
// src/evaluation/evaluators/scoring.ts
|
|
13508
14066
|
init_cjs_shims();
|
|
13509
|
-
var
|
|
13510
|
-
|
|
13511
|
-
|
|
14067
|
+
var DEFAULT_THRESHOLD = 0.8;
|
|
14068
|
+
var PASS_THRESHOLD = DEFAULT_THRESHOLD;
|
|
14069
|
+
function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
|
|
14070
|
+
return score >= threshold ? "pass" : "fail";
|
|
13512
14071
|
}
|
|
13513
14072
|
function clampScore(value) {
|
|
13514
14073
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
@@ -13699,13 +14258,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
13699
14258
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
13700
14259
|
const { mkdir: mkdir17, readFile: readFile17, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
|
|
13701
14260
|
const { tmpdir: tmpdir3 } = await import("os");
|
|
13702
|
-
const
|
|
14261
|
+
const path53 = await import("path");
|
|
13703
14262
|
const { randomUUID: randomUUID10 } = await import("crypto");
|
|
13704
|
-
const dir =
|
|
14263
|
+
const dir = path53.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
|
|
13705
14264
|
await mkdir17(dir, { recursive: true });
|
|
13706
|
-
const stdinPath =
|
|
13707
|
-
const stdoutPath =
|
|
13708
|
-
const stderrPath =
|
|
14265
|
+
const stdinPath = path53.join(dir, "stdin.txt");
|
|
14266
|
+
const stdoutPath = path53.join(dir, "stdout.txt");
|
|
14267
|
+
const stderrPath = path53.join(dir, "stderr.txt");
|
|
13709
14268
|
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
13710
14269
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
13711
14270
|
const { spawn: spawn5 } = await import("child_process");
|
|
@@ -14907,7 +15466,7 @@ ${outputSchema}`;
|
|
|
14907
15466
|
parts.push("[[ ## scoring_criteria ## ]]");
|
|
14908
15467
|
for (const rubric of rubrics) {
|
|
14909
15468
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
14910
|
-
const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
15469
|
+
const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
14911
15470
|
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
14912
15471
|
if (rubric.outcome) {
|
|
14913
15472
|
parts.push(`Description: ${rubric.outcome}`);
|
|
@@ -14961,54 +15520,106 @@ ${outputSchema}`;
|
|
|
14961
15520
|
async runWithRetry(options) {
|
|
14962
15521
|
const { context: context2, graderProvider, systemPrompt, userPrompt, schema, images } = options;
|
|
14963
15522
|
let lastError;
|
|
15523
|
+
let lastInvalidResponse;
|
|
15524
|
+
let shouldAttemptStructureFix = false;
|
|
14964
15525
|
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
14965
15526
|
try {
|
|
14966
|
-
const
|
|
14967
|
-
|
|
14968
|
-
|
|
14969
|
-
|
|
14970
|
-
|
|
14971
|
-
|
|
14972
|
-
|
|
14973
|
-
|
|
14974
|
-
|
|
14975
|
-
|
|
14976
|
-
|
|
14977
|
-
|
|
14978
|
-
|
|
14979
|
-
|
|
14980
|
-
|
|
14981
|
-
|
|
14982
|
-
]
|
|
14983
|
-
}
|
|
14984
|
-
],
|
|
14985
|
-
...modelOptions
|
|
14986
|
-
}) : await (0, import_ai2.generateText)({
|
|
14987
|
-
model,
|
|
14988
|
-
system: systemPrompt,
|
|
14989
|
-
prompt: userPrompt,
|
|
14990
|
-
...modelOptions
|
|
14991
|
-
});
|
|
14992
|
-
const data2 = schema.parse(parseJsonFromText(result.text));
|
|
14993
|
-
const rawUsage = result.usage;
|
|
14994
|
-
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
14995
|
-
return { data: data2, tokenUsage };
|
|
15527
|
+
const result = await this.generateStructuredResponse({
|
|
15528
|
+
context: context2,
|
|
15529
|
+
graderProvider,
|
|
15530
|
+
systemPrompt,
|
|
15531
|
+
userPrompt,
|
|
15532
|
+
images
|
|
15533
|
+
});
|
|
15534
|
+
const canRepairResponse = result.text.trim().length > 0;
|
|
15535
|
+
lastInvalidResponse = canRepairResponse ? result : void 0;
|
|
15536
|
+
let data;
|
|
15537
|
+
try {
|
|
15538
|
+
data = schema.parse(parseJsonFromText(result.text));
|
|
15539
|
+
} catch (e) {
|
|
15540
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
15541
|
+
shouldAttemptStructureFix = canRepairResponse;
|
|
15542
|
+
continue;
|
|
14996
15543
|
}
|
|
14997
|
-
|
|
14998
|
-
|
|
15544
|
+
return {
|
|
15545
|
+
data,
|
|
15546
|
+
providerResponse: result.providerResponse,
|
|
15547
|
+
tokenUsage: result.tokenUsage
|
|
15548
|
+
};
|
|
15549
|
+
} catch (e) {
|
|
15550
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
15551
|
+
}
|
|
15552
|
+
}
|
|
15553
|
+
if (shouldAttemptStructureFix && lastInvalidResponse) {
|
|
15554
|
+
try {
|
|
15555
|
+
const repaired = await this.generateStructuredResponse({
|
|
15556
|
+
context: context2,
|
|
15557
|
+
graderProvider,
|
|
14999
15558
|
systemPrompt,
|
|
15000
|
-
|
|
15001
|
-
|
|
15002
|
-
|
|
15003
|
-
|
|
15559
|
+
userPrompt: buildStructureRepairPrompt({
|
|
15560
|
+
validationError: lastError?.message ?? "Schema validation failed",
|
|
15561
|
+
invalidResponse: lastInvalidResponse.text
|
|
15562
|
+
})
|
|
15004
15563
|
});
|
|
15005
|
-
const data = schema.parse(parseJsonFromText(
|
|
15006
|
-
return {
|
|
15564
|
+
const data = schema.parse(parseJsonFromText(repaired.text));
|
|
15565
|
+
return {
|
|
15566
|
+
data,
|
|
15567
|
+
providerResponse: repaired.providerResponse,
|
|
15568
|
+
tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage)
|
|
15569
|
+
};
|
|
15007
15570
|
} catch (e) {
|
|
15008
15571
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
15009
15572
|
}
|
|
15010
15573
|
}
|
|
15011
|
-
throw new Error(
|
|
15574
|
+
throw new Error(
|
|
15575
|
+
`Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`
|
|
15576
|
+
);
|
|
15577
|
+
}
|
|
15578
|
+
async generateStructuredResponse(options) {
|
|
15579
|
+
const { context: context2, graderProvider, systemPrompt, userPrompt, images } = options;
|
|
15580
|
+
const model = graderProvider.asLanguageModel?.();
|
|
15581
|
+
if (model) {
|
|
15582
|
+
const modelOptions = {
|
|
15583
|
+
...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
|
|
15584
|
+
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
15585
|
+
};
|
|
15586
|
+
const hasImages = images && images.length > 0;
|
|
15587
|
+
const result = hasImages ? await (0, import_ai2.generateText)({
|
|
15588
|
+
model,
|
|
15589
|
+
system: systemPrompt,
|
|
15590
|
+
messages: [
|
|
15591
|
+
{
|
|
15592
|
+
role: "user",
|
|
15593
|
+
content: [
|
|
15594
|
+
{ type: "text", text: userPrompt },
|
|
15595
|
+
...toAiSdkImageParts(images)
|
|
15596
|
+
]
|
|
15597
|
+
}
|
|
15598
|
+
],
|
|
15599
|
+
...modelOptions
|
|
15600
|
+
}) : await (0, import_ai2.generateText)({
|
|
15601
|
+
model,
|
|
15602
|
+
system: systemPrompt,
|
|
15603
|
+
prompt: userPrompt,
|
|
15604
|
+
...modelOptions
|
|
15605
|
+
});
|
|
15606
|
+
const rawUsage = result.usage;
|
|
15607
|
+
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
15608
|
+
return { text: result.text, tokenUsage };
|
|
15609
|
+
}
|
|
15610
|
+
const response = await graderProvider.invoke({
|
|
15611
|
+
question: userPrompt,
|
|
15612
|
+
systemPrompt,
|
|
15613
|
+
evalCaseId: context2.evalCase.id,
|
|
15614
|
+
attempt: context2.attempt,
|
|
15615
|
+
maxOutputTokens: this.maxOutputTokens,
|
|
15616
|
+
temperature: this.temperature
|
|
15617
|
+
});
|
|
15618
|
+
return {
|
|
15619
|
+
text: extractLastAssistantContent(response.output),
|
|
15620
|
+
providerResponse: response,
|
|
15621
|
+
tokenUsage: response.tokenUsage
|
|
15622
|
+
};
|
|
15012
15623
|
}
|
|
15013
15624
|
};
|
|
15014
15625
|
function buildOutputSchema() {
|
|
@@ -15028,6 +15639,29 @@ function buildOutputSchema() {
|
|
|
15028
15639
|
"}"
|
|
15029
15640
|
].join("\n");
|
|
15030
15641
|
}
|
|
15642
|
+
function buildStructureRepairPrompt(options) {
|
|
15643
|
+
const { validationError, invalidResponse } = options;
|
|
15644
|
+
return [
|
|
15645
|
+
"The following evaluation response has useful grading content but invalid JSON structure.",
|
|
15646
|
+
"Repair it to satisfy the schema in the system prompt.",
|
|
15647
|
+
"Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object.",
|
|
15648
|
+
"",
|
|
15649
|
+
"Validation error:",
|
|
15650
|
+
validationError,
|
|
15651
|
+
"",
|
|
15652
|
+
"Invalid response:",
|
|
15653
|
+
invalidResponse
|
|
15654
|
+
].join("\n");
|
|
15655
|
+
}
|
|
15656
|
+
function sumTokenUsage(first, second) {
|
|
15657
|
+
if (!first && !second) {
|
|
15658
|
+
return void 0;
|
|
15659
|
+
}
|
|
15660
|
+
return {
|
|
15661
|
+
input: (first?.input ?? 0) + (second?.input ?? 0),
|
|
15662
|
+
output: (first?.output ?? 0) + (second?.output ?? 0)
|
|
15663
|
+
};
|
|
15664
|
+
}
|
|
15031
15665
|
function buildRubricOutputSchema() {
|
|
15032
15666
|
return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
|
|
15033
15667
|
You must return a valid JSON object matching this schema:
|
|
@@ -15127,19 +15761,21 @@ function calculateScoreRangeResult(result, rubrics) {
|
|
|
15127
15761
|
rawScores[rubric.id] = rawScore;
|
|
15128
15762
|
totalWeight += rubric.weight;
|
|
15129
15763
|
weightedScoreSum += normalizedScore * rubric.weight;
|
|
15130
|
-
let
|
|
15131
|
-
if (rubric.
|
|
15132
|
-
|
|
15764
|
+
let minScoreThreshold;
|
|
15765
|
+
if (rubric.min_score !== void 0) {
|
|
15766
|
+
minScoreThreshold = rubric.min_score;
|
|
15767
|
+
} else if (rubric.required_min_score !== void 0) {
|
|
15768
|
+
minScoreThreshold = rubric.required_min_score / 10;
|
|
15133
15769
|
} else if (rubric.required === true) {
|
|
15134
|
-
|
|
15770
|
+
minScoreThreshold = 1;
|
|
15135
15771
|
}
|
|
15136
15772
|
const matchingRange = rubric.score_ranges?.find(
|
|
15137
15773
|
(r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
|
|
15138
15774
|
);
|
|
15139
15775
|
const rangeDescription = matchingRange?.outcome ?? "";
|
|
15140
15776
|
const criterionLabel = rubric.outcome ?? rubric.id;
|
|
15141
|
-
const passed = !(
|
|
15142
|
-
if (
|
|
15777
|
+
const passed = !(minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) && rawScore >= 7;
|
|
15778
|
+
if (minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) {
|
|
15143
15779
|
failedRequired = true;
|
|
15144
15780
|
}
|
|
15145
15781
|
assertions.push({
|
|
@@ -15216,11 +15852,11 @@ function createFilesystemTools(workspacePath) {
|
|
|
15216
15852
|
execute: async (input) => {
|
|
15217
15853
|
try {
|
|
15218
15854
|
const resolved = resolveSandboxed(workspacePath, input.path);
|
|
15219
|
-
const
|
|
15220
|
-
if (
|
|
15855
|
+
const stat11 = await import_promises29.default.stat(resolved);
|
|
15856
|
+
if (stat11.isDirectory()) {
|
|
15221
15857
|
return { error: `'${input.path}' is a directory, not a file` };
|
|
15222
15858
|
}
|
|
15223
|
-
const buffer = Buffer.alloc(Math.min(
|
|
15859
|
+
const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
|
|
15224
15860
|
const fd = await import_promises29.default.open(resolved, "r");
|
|
15225
15861
|
try {
|
|
15226
15862
|
await fd.read(buffer, 0, buffer.length, 0);
|
|
@@ -15228,8 +15864,8 @@ function createFilesystemTools(workspacePath) {
|
|
|
15228
15864
|
await fd.close();
|
|
15229
15865
|
}
|
|
15230
15866
|
const content = buffer.toString("utf-8");
|
|
15231
|
-
const truncated =
|
|
15232
|
-
return { content, truncated, size:
|
|
15867
|
+
const truncated = stat11.size > MAX_FILE_SIZE;
|
|
15868
|
+
return { content, truncated, size: stat11.size };
|
|
15233
15869
|
} catch (error) {
|
|
15234
15870
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
15235
15871
|
}
|
|
@@ -15280,8 +15916,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
15280
15916
|
const ext = import_node_path39.default.extname(entry.name).toLowerCase();
|
|
15281
15917
|
if (BINARY_EXTENSIONS.has(ext)) continue;
|
|
15282
15918
|
try {
|
|
15283
|
-
const
|
|
15284
|
-
if (
|
|
15919
|
+
const stat11 = await import_promises29.default.stat(fullPath);
|
|
15920
|
+
if (stat11.size > MAX_FILE_SIZE) continue;
|
|
15285
15921
|
const content = await import_promises29.default.readFile(fullPath, "utf-8");
|
|
15286
15922
|
const lines = content.split("\n");
|
|
15287
15923
|
for (let i = 0; i < lines.length; i++) {
|
|
@@ -15925,115 +16561,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
15925
16561
|
* Evaluate a single field against the expected value.
|
|
15926
16562
|
*/
|
|
15927
16563
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
15928
|
-
const { path:
|
|
15929
|
-
const candidateValue = resolvePath(candidateData,
|
|
15930
|
-
const expectedValue = resolvePath(expectedData,
|
|
16564
|
+
const { path: path53, match, required = true, weight = 1 } = fieldConfig;
|
|
16565
|
+
const candidateValue = resolvePath(candidateData, path53);
|
|
16566
|
+
const expectedValue = resolvePath(expectedData, path53);
|
|
15931
16567
|
if (expectedValue === void 0) {
|
|
15932
16568
|
return {
|
|
15933
|
-
path:
|
|
16569
|
+
path: path53,
|
|
15934
16570
|
score: 1,
|
|
15935
16571
|
// No expected value means no comparison needed
|
|
15936
16572
|
weight,
|
|
15937
16573
|
hit: true,
|
|
15938
|
-
message: `${
|
|
16574
|
+
message: `${path53}: no expected value`
|
|
15939
16575
|
};
|
|
15940
16576
|
}
|
|
15941
16577
|
if (candidateValue === void 0) {
|
|
15942
16578
|
if (required) {
|
|
15943
16579
|
return {
|
|
15944
|
-
path:
|
|
16580
|
+
path: path53,
|
|
15945
16581
|
score: 0,
|
|
15946
16582
|
weight,
|
|
15947
16583
|
hit: false,
|
|
15948
|
-
message: `${
|
|
16584
|
+
message: `${path53} (required, missing)`
|
|
15949
16585
|
};
|
|
15950
16586
|
}
|
|
15951
16587
|
return {
|
|
15952
|
-
path:
|
|
16588
|
+
path: path53,
|
|
15953
16589
|
score: 1,
|
|
15954
16590
|
// Don't penalize missing optional fields
|
|
15955
16591
|
weight: 0,
|
|
15956
16592
|
// Zero weight means it won't affect the score
|
|
15957
16593
|
hit: true,
|
|
15958
|
-
message: `${
|
|
16594
|
+
message: `${path53}: optional field missing`
|
|
15959
16595
|
};
|
|
15960
16596
|
}
|
|
15961
16597
|
switch (match) {
|
|
15962
16598
|
case "exact":
|
|
15963
|
-
return this.compareExact(
|
|
16599
|
+
return this.compareExact(path53, candidateValue, expectedValue, weight);
|
|
15964
16600
|
case "numeric_tolerance":
|
|
15965
16601
|
return this.compareNumericTolerance(
|
|
15966
|
-
|
|
16602
|
+
path53,
|
|
15967
16603
|
candidateValue,
|
|
15968
16604
|
expectedValue,
|
|
15969
16605
|
fieldConfig,
|
|
15970
16606
|
weight
|
|
15971
16607
|
);
|
|
15972
16608
|
case "date":
|
|
15973
|
-
return this.compareDate(
|
|
16609
|
+
return this.compareDate(path53, candidateValue, expectedValue, fieldConfig, weight);
|
|
15974
16610
|
default:
|
|
15975
16611
|
return {
|
|
15976
|
-
path:
|
|
16612
|
+
path: path53,
|
|
15977
16613
|
score: 0,
|
|
15978
16614
|
weight,
|
|
15979
16615
|
hit: false,
|
|
15980
|
-
message: `${
|
|
16616
|
+
message: `${path53}: unknown match type "${match}"`
|
|
15981
16617
|
};
|
|
15982
16618
|
}
|
|
15983
16619
|
}
|
|
15984
16620
|
/**
|
|
15985
16621
|
* Exact equality comparison.
|
|
15986
16622
|
*/
|
|
15987
|
-
compareExact(
|
|
16623
|
+
compareExact(path53, candidateValue, expectedValue, weight) {
|
|
15988
16624
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
15989
16625
|
return {
|
|
15990
|
-
path:
|
|
16626
|
+
path: path53,
|
|
15991
16627
|
score: 1,
|
|
15992
16628
|
weight,
|
|
15993
16629
|
hit: true,
|
|
15994
|
-
message:
|
|
16630
|
+
message: path53
|
|
15995
16631
|
};
|
|
15996
16632
|
}
|
|
15997
16633
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
15998
16634
|
return {
|
|
15999
|
-
path:
|
|
16635
|
+
path: path53,
|
|
16000
16636
|
score: 0,
|
|
16001
16637
|
weight,
|
|
16002
16638
|
hit: false,
|
|
16003
|
-
message: `${
|
|
16639
|
+
message: `${path53} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
16004
16640
|
};
|
|
16005
16641
|
}
|
|
16006
16642
|
return {
|
|
16007
|
-
path:
|
|
16643
|
+
path: path53,
|
|
16008
16644
|
score: 0,
|
|
16009
16645
|
weight,
|
|
16010
16646
|
hit: false,
|
|
16011
|
-
message: `${
|
|
16647
|
+
message: `${path53} (value mismatch)`
|
|
16012
16648
|
};
|
|
16013
16649
|
}
|
|
16014
16650
|
/**
|
|
16015
16651
|
* Numeric comparison with absolute or relative tolerance.
|
|
16016
16652
|
*/
|
|
16017
|
-
compareNumericTolerance(
|
|
16653
|
+
compareNumericTolerance(path53, candidateValue, expectedValue, fieldConfig, weight) {
|
|
16018
16654
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
16019
16655
|
const candidateNum = toNumber(candidateValue);
|
|
16020
16656
|
const expectedNum = toNumber(expectedValue);
|
|
16021
16657
|
if (candidateNum === null || expectedNum === null) {
|
|
16022
16658
|
return {
|
|
16023
|
-
path:
|
|
16659
|
+
path: path53,
|
|
16024
16660
|
score: 0,
|
|
16025
16661
|
weight,
|
|
16026
16662
|
hit: false,
|
|
16027
|
-
message: `${
|
|
16663
|
+
message: `${path53} (non-numeric value)`
|
|
16028
16664
|
};
|
|
16029
16665
|
}
|
|
16030
16666
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
16031
16667
|
return {
|
|
16032
|
-
path:
|
|
16668
|
+
path: path53,
|
|
16033
16669
|
score: 0,
|
|
16034
16670
|
weight,
|
|
16035
16671
|
hit: false,
|
|
16036
|
-
message: `${
|
|
16672
|
+
message: `${path53} (invalid numeric value)`
|
|
16037
16673
|
};
|
|
16038
16674
|
}
|
|
16039
16675
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -16046,61 +16682,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
16046
16682
|
}
|
|
16047
16683
|
if (withinTolerance) {
|
|
16048
16684
|
return {
|
|
16049
|
-
path:
|
|
16685
|
+
path: path53,
|
|
16050
16686
|
score: 1,
|
|
16051
16687
|
weight,
|
|
16052
16688
|
hit: true,
|
|
16053
|
-
message: `${
|
|
16689
|
+
message: `${path53} (within tolerance: diff=${diff.toFixed(2)})`
|
|
16054
16690
|
};
|
|
16055
16691
|
}
|
|
16056
16692
|
return {
|
|
16057
|
-
path:
|
|
16693
|
+
path: path53,
|
|
16058
16694
|
score: 0,
|
|
16059
16695
|
weight,
|
|
16060
16696
|
hit: false,
|
|
16061
|
-
message: `${
|
|
16697
|
+
message: `${path53} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
16062
16698
|
};
|
|
16063
16699
|
}
|
|
16064
16700
|
/**
|
|
16065
16701
|
* Date comparison with format normalization.
|
|
16066
16702
|
*/
|
|
16067
|
-
compareDate(
|
|
16703
|
+
compareDate(path53, candidateValue, expectedValue, fieldConfig, weight) {
|
|
16068
16704
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
16069
16705
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
16070
16706
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
16071
16707
|
if (candidateDate === null) {
|
|
16072
16708
|
return {
|
|
16073
|
-
path:
|
|
16709
|
+
path: path53,
|
|
16074
16710
|
score: 0,
|
|
16075
16711
|
weight,
|
|
16076
16712
|
hit: false,
|
|
16077
|
-
message: `${
|
|
16713
|
+
message: `${path53} (unparseable candidate date)`
|
|
16078
16714
|
};
|
|
16079
16715
|
}
|
|
16080
16716
|
if (expectedDate === null) {
|
|
16081
16717
|
return {
|
|
16082
|
-
path:
|
|
16718
|
+
path: path53,
|
|
16083
16719
|
score: 0,
|
|
16084
16720
|
weight,
|
|
16085
16721
|
hit: false,
|
|
16086
|
-
message: `${
|
|
16722
|
+
message: `${path53} (unparseable expected date)`
|
|
16087
16723
|
};
|
|
16088
16724
|
}
|
|
16089
16725
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
16090
16726
|
return {
|
|
16091
|
-
path:
|
|
16727
|
+
path: path53,
|
|
16092
16728
|
score: 1,
|
|
16093
16729
|
weight,
|
|
16094
16730
|
hit: true,
|
|
16095
|
-
message:
|
|
16731
|
+
message: path53
|
|
16096
16732
|
};
|
|
16097
16733
|
}
|
|
16098
16734
|
return {
|
|
16099
|
-
path:
|
|
16735
|
+
path: path53,
|
|
16100
16736
|
score: 0,
|
|
16101
16737
|
weight,
|
|
16102
16738
|
hit: false,
|
|
16103
|
-
message: `${
|
|
16739
|
+
message: `${path53} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
16104
16740
|
};
|
|
16105
16741
|
}
|
|
16106
16742
|
/**
|
|
@@ -16133,11 +16769,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
16133
16769
|
};
|
|
16134
16770
|
}
|
|
16135
16771
|
};
|
|
16136
|
-
function resolvePath(obj,
|
|
16137
|
-
if (!
|
|
16772
|
+
function resolvePath(obj, path53) {
|
|
16773
|
+
if (!path53 || !obj) {
|
|
16138
16774
|
return void 0;
|
|
16139
16775
|
}
|
|
16140
|
-
const parts =
|
|
16776
|
+
const parts = path53.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
16141
16777
|
let current = obj;
|
|
16142
16778
|
for (const part of parts) {
|
|
16143
16779
|
if (current === null || current === void 0) {
|
|
@@ -16634,8 +17270,8 @@ var TokenUsageEvaluator = class {
|
|
|
16634
17270
|
|
|
16635
17271
|
// src/evaluation/evaluators/tool-trajectory.ts
|
|
16636
17272
|
init_cjs_shims();
|
|
16637
|
-
function getNestedValue(obj,
|
|
16638
|
-
const parts =
|
|
17273
|
+
function getNestedValue(obj, path53) {
|
|
17274
|
+
const parts = path53.split(".");
|
|
16639
17275
|
let current = obj;
|
|
16640
17276
|
for (const part of parts) {
|
|
16641
17277
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -18428,7 +19064,7 @@ var WorkspacePoolManager = class {
|
|
|
18428
19064
|
}
|
|
18429
19065
|
/**
|
|
18430
19066
|
* Reset an existing slot for reuse:
|
|
18431
|
-
* 1. Reset repos (git reset --hard
|
|
19067
|
+
* 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
|
|
18432
19068
|
* 2. Re-copy template files (skip repo directories)
|
|
18433
19069
|
*/
|
|
18434
19070
|
async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
|
|
@@ -18441,7 +19077,17 @@ var WorkspacePoolManager = class {
|
|
|
18441
19077
|
continue;
|
|
18442
19078
|
}
|
|
18443
19079
|
const ref = repo.checkout?.ref ?? "HEAD";
|
|
18444
|
-
|
|
19080
|
+
const resolve = repo.checkout?.resolve ?? "remote";
|
|
19081
|
+
if (resolve === "remote") {
|
|
19082
|
+
const fetchArgs = ["fetch", "origin", ref];
|
|
19083
|
+
if (repo.clone?.depth) {
|
|
19084
|
+
fetchArgs.splice(1, 0, "--depth", String(repo.clone.depth));
|
|
19085
|
+
}
|
|
19086
|
+
await git(fetchArgs, { cwd: repoDir });
|
|
19087
|
+
await git(["reset", "--hard", "FETCH_HEAD"], { cwd: repoDir });
|
|
19088
|
+
} else {
|
|
19089
|
+
await git(["reset", "--hard", ref], { cwd: repoDir });
|
|
19090
|
+
}
|
|
18445
19091
|
const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
|
|
18446
19092
|
await git(["clean", cleanFlag], { cwd: repoDir });
|
|
18447
19093
|
}
|
|
@@ -18741,7 +19387,7 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
|
|
|
18741
19387
|
}
|
|
18742
19388
|
|
|
18743
19389
|
// src/evaluation/orchestrator.ts
|
|
18744
|
-
function classifyQualityStatus(score, threshold =
|
|
19390
|
+
function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
|
|
18745
19391
|
return score >= threshold ? "ok" : "quality_failure";
|
|
18746
19392
|
}
|
|
18747
19393
|
function buildSkippedEvaluatorError(scores) {
|
|
@@ -18833,7 +19479,7 @@ async function runEvaluation(options) {
|
|
|
18833
19479
|
const filteredEvalCases = filterEvalCases(evalCases, filter);
|
|
18834
19480
|
if (filteredEvalCases.length === 0) {
|
|
18835
19481
|
if (filter) {
|
|
18836
|
-
throw new Error(`No tests matched filter '${filter}' in ${evalFilePath}`);
|
|
19482
|
+
throw new Error(`No tests matched filter '${formatFilter(filter)}' in ${evalFilePath}`);
|
|
18837
19483
|
}
|
|
18838
19484
|
return [];
|
|
18839
19485
|
}
|
|
@@ -18859,20 +19505,10 @@ async function runEvaluation(options) {
|
|
|
18859
19505
|
if (resolvedTargetsByName.has(name)) {
|
|
18860
19506
|
return resolvedTargetsByName.get(name);
|
|
18861
19507
|
}
|
|
18862
|
-
|
|
19508
|
+
const definition = resolveDelegatedTargetDefinition(name, targetDefinitions, envLookup);
|
|
18863
19509
|
if (!definition) {
|
|
18864
19510
|
return void 0;
|
|
18865
19511
|
}
|
|
18866
|
-
for (let depth = 0; depth < 5; depth++) {
|
|
18867
|
-
const useTarget = definition.use_target;
|
|
18868
|
-
if (typeof useTarget !== "string" || useTarget.trim().length === 0) break;
|
|
18869
|
-
const envMatch = useTarget.trim().match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
|
|
18870
|
-
const resolvedName = envMatch ? envLookup[envMatch[1]] ?? "" : useTarget.trim();
|
|
18871
|
-
if (resolvedName.length === 0) break;
|
|
18872
|
-
const next = targetDefinitions.get(resolvedName);
|
|
18873
|
-
if (!next) break;
|
|
18874
|
-
definition = next;
|
|
18875
|
-
}
|
|
18876
19512
|
const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
|
|
18877
19513
|
resolvedTargetsByName.set(name, resolved);
|
|
18878
19514
|
return resolved;
|
|
@@ -18895,6 +19531,9 @@ async function runEvaluation(options) {
|
|
|
18895
19531
|
const graderName = targetContext.graderTarget ?? targetContext.name;
|
|
18896
19532
|
const resolvedGrader = resolveTargetByName(graderName);
|
|
18897
19533
|
if (!resolvedGrader) {
|
|
19534
|
+
if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
|
|
19535
|
+
return void 0;
|
|
19536
|
+
}
|
|
18898
19537
|
return getOrCreateProvider(targetContext);
|
|
18899
19538
|
}
|
|
18900
19539
|
return getOrCreateProvider(resolvedGrader);
|
|
@@ -19225,7 +19864,7 @@ async function runEvaluation(options) {
|
|
|
19225
19864
|
const budgetResult = {
|
|
19226
19865
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
19227
19866
|
testId: evalCase.id,
|
|
19228
|
-
|
|
19867
|
+
suite: evalCase.suite,
|
|
19229
19868
|
category: evalCase.category,
|
|
19230
19869
|
score: 0,
|
|
19231
19870
|
assertions: [],
|
|
@@ -19262,7 +19901,7 @@ async function runEvaluation(options) {
|
|
|
19262
19901
|
const haltResult = {
|
|
19263
19902
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
19264
19903
|
testId: evalCase.id,
|
|
19265
|
-
|
|
19904
|
+
suite: evalCase.suite,
|
|
19266
19905
|
category: evalCase.category,
|
|
19267
19906
|
score: 0,
|
|
19268
19907
|
assertions: [],
|
|
@@ -19574,7 +20213,7 @@ async function runBatchEvaluation(options) {
|
|
|
19574
20213
|
targetResolver,
|
|
19575
20214
|
availableTargets,
|
|
19576
20215
|
verbose,
|
|
19577
|
-
threshold: batchThreshold
|
|
20216
|
+
threshold: evalCase.threshold ?? batchThreshold
|
|
19578
20217
|
});
|
|
19579
20218
|
if (providerError) {
|
|
19580
20219
|
result = {
|
|
@@ -20036,8 +20675,9 @@ async function runEvalCase(options) {
|
|
|
20036
20675
|
fileChanges,
|
|
20037
20676
|
workspacePath,
|
|
20038
20677
|
verbose,
|
|
20039
|
-
threshold: caseThreshold
|
|
20678
|
+
threshold: evalCase.threshold ?? caseThreshold
|
|
20040
20679
|
});
|
|
20680
|
+
const effectiveThreshold = evalCase.threshold ?? caseThreshold;
|
|
20041
20681
|
const totalDurationMs = Date.now() - caseStartMs;
|
|
20042
20682
|
const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
|
|
20043
20683
|
const evalRunTokenUsage = tokenUsage || graderTokens ? {
|
|
@@ -20051,7 +20691,7 @@ async function runEvalCase(options) {
|
|
|
20051
20691
|
...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
|
|
20052
20692
|
};
|
|
20053
20693
|
const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
|
|
20054
|
-
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score,
|
|
20694
|
+
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, effectiveThreshold);
|
|
20055
20695
|
const targetUsedField = targetUsed ? { targetUsed } : {};
|
|
20056
20696
|
const finalResult = providerError ? {
|
|
20057
20697
|
...result,
|
|
@@ -20252,7 +20892,8 @@ async function evaluateCandidate(options) {
|
|
|
20252
20892
|
targetResolver,
|
|
20253
20893
|
availableTargets,
|
|
20254
20894
|
fileChanges,
|
|
20255
|
-
workspacePath
|
|
20895
|
+
workspacePath,
|
|
20896
|
+
threshold: evalThreshold
|
|
20256
20897
|
});
|
|
20257
20898
|
const completedAt = nowFn();
|
|
20258
20899
|
let agentRequest;
|
|
@@ -20283,7 +20924,7 @@ async function evaluateCandidate(options) {
|
|
|
20283
20924
|
return {
|
|
20284
20925
|
timestamp: completedAt.toISOString(),
|
|
20285
20926
|
testId: evalCase.id,
|
|
20286
|
-
|
|
20927
|
+
suite: evalCase.suite,
|
|
20287
20928
|
category: evalCase.category,
|
|
20288
20929
|
conversationId: evalCase.conversation_id,
|
|
20289
20930
|
score: score.score,
|
|
@@ -20326,7 +20967,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
20326
20967
|
targetResolver,
|
|
20327
20968
|
availableTargets,
|
|
20328
20969
|
fileChanges,
|
|
20329
|
-
workspacePath
|
|
20970
|
+
workspacePath,
|
|
20971
|
+
threshold
|
|
20330
20972
|
} = options;
|
|
20331
20973
|
if (evalCase.assertions && evalCase.assertions.length > 0) {
|
|
20332
20974
|
return runEvaluatorList({
|
|
@@ -20352,7 +20994,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
20352
20994
|
targetResolver,
|
|
20353
20995
|
availableTargets,
|
|
20354
20996
|
fileChanges,
|
|
20355
|
-
workspacePath
|
|
20997
|
+
workspacePath,
|
|
20998
|
+
threshold
|
|
20356
20999
|
});
|
|
20357
21000
|
}
|
|
20358
21001
|
const evaluatorKind = evalCase.evaluator ?? "llm-grader";
|
|
@@ -20454,7 +21097,8 @@ async function runEvaluatorList(options) {
|
|
|
20454
21097
|
name: evaluatorConfig.name,
|
|
20455
21098
|
type: evaluatorConfig.type,
|
|
20456
21099
|
weight,
|
|
20457
|
-
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
21100
|
+
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
|
|
21101
|
+
...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
|
|
20458
21102
|
});
|
|
20459
21103
|
scores.push({
|
|
20460
21104
|
name: evaluatorConfig.name,
|
|
@@ -20489,7 +21133,8 @@ async function runEvaluatorList(options) {
|
|
|
20489
21133
|
name: evaluatorConfig.name ?? "unknown",
|
|
20490
21134
|
type: evaluatorConfig.type ?? "llm-grader",
|
|
20491
21135
|
weight,
|
|
20492
|
-
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
21136
|
+
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
|
|
21137
|
+
...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
|
|
20493
21138
|
});
|
|
20494
21139
|
scores.push({
|
|
20495
21140
|
name: evaluatorConfig.name ?? "unknown",
|
|
@@ -20523,9 +21168,10 @@ async function runEvaluatorList(options) {
|
|
|
20523
21168
|
}
|
|
20524
21169
|
}
|
|
20525
21170
|
}
|
|
21171
|
+
const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD;
|
|
20526
21172
|
const hasRequiredFailure = scored.some((entry) => {
|
|
20527
21173
|
if (!entry.required) return false;
|
|
20528
|
-
const minScore = typeof entry.required === "number" ? entry.required :
|
|
21174
|
+
const minScore = entry.min_score ?? (typeof entry.required === "number" ? entry.required : effectiveThreshold);
|
|
20529
21175
|
return entry.score.score < minScore;
|
|
20530
21176
|
});
|
|
20531
21177
|
const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
|
|
@@ -20536,17 +21182,23 @@ async function runEvaluatorList(options) {
|
|
|
20536
21182
|
const expectedAspectCount = assertions.length || 1;
|
|
20537
21183
|
const score = {
|
|
20538
21184
|
score: aggregateScore,
|
|
20539
|
-
verdict: scoreToVerdict(aggregateScore),
|
|
21185
|
+
verdict: scoreToVerdict(aggregateScore, effectiveThreshold),
|
|
20540
21186
|
assertions,
|
|
20541
21187
|
expectedAspectCount
|
|
20542
21188
|
};
|
|
20543
21189
|
return { score, scores };
|
|
20544
21190
|
}
|
|
21191
|
+
function formatFilter(filter) {
|
|
21192
|
+
return typeof filter === "string" ? filter : filter.join(", ");
|
|
21193
|
+
}
|
|
21194
|
+
function matchesFilter3(id, filter) {
|
|
21195
|
+
return typeof filter === "string" ? import_micromatch3.default.isMatch(id, filter) : filter.some((pattern) => import_micromatch3.default.isMatch(id, pattern));
|
|
21196
|
+
}
|
|
20545
21197
|
function filterEvalCases(evalCases, filter) {
|
|
20546
21198
|
if (!filter) {
|
|
20547
21199
|
return evalCases;
|
|
20548
21200
|
}
|
|
20549
|
-
return evalCases.filter((evalCase) =>
|
|
21201
|
+
return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter));
|
|
20550
21202
|
}
|
|
20551
21203
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
20552
21204
|
const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
|
|
@@ -20633,7 +21285,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
20633
21285
|
return {
|
|
20634
21286
|
timestamp: timestamp.toISOString(),
|
|
20635
21287
|
testId: evalCase.id,
|
|
20636
|
-
|
|
21288
|
+
suite: evalCase.suite,
|
|
20637
21289
|
category: evalCase.category,
|
|
20638
21290
|
conversationId: evalCase.conversation_id,
|
|
20639
21291
|
score: 0,
|
|
@@ -20907,6 +21559,7 @@ async function evaluate(config) {
|
|
|
20907
21559
|
verbose: config.verbose,
|
|
20908
21560
|
maxConcurrency: config.workers ?? 3,
|
|
20909
21561
|
filter: config.filter,
|
|
21562
|
+
threshold: config.threshold,
|
|
20910
21563
|
evalCases,
|
|
20911
21564
|
onResult: async (result) => {
|
|
20912
21565
|
collectedResults.push(result);
|
|
@@ -20917,19 +21570,19 @@ async function evaluate(config) {
|
|
|
20917
21570
|
const durationMs = Date.now() - startTime;
|
|
20918
21571
|
return {
|
|
20919
21572
|
results: allResults,
|
|
20920
|
-
summary: computeSummary(allResults, durationMs)
|
|
21573
|
+
summary: computeSummary(allResults, durationMs, config.threshold)
|
|
20921
21574
|
};
|
|
20922
21575
|
}
|
|
20923
21576
|
function mapAssertionType(type) {
|
|
20924
21577
|
return type.replace(/_/g, "-");
|
|
20925
21578
|
}
|
|
20926
|
-
function computeSummary(results, durationMs) {
|
|
21579
|
+
function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
|
|
20927
21580
|
const total = results.length;
|
|
20928
21581
|
let passed = 0;
|
|
20929
21582
|
let scoreSum = 0;
|
|
20930
21583
|
for (const r of results) {
|
|
20931
21584
|
scoreSum += r.score;
|
|
20932
|
-
if (r.score >=
|
|
21585
|
+
if (r.score >= threshold) {
|
|
20933
21586
|
passed++;
|
|
20934
21587
|
}
|
|
20935
21588
|
}
|
|
@@ -20960,7 +21613,7 @@ async function discoverDefaultTarget(repoRoot) {
|
|
|
20960
21613
|
return null;
|
|
20961
21614
|
}
|
|
20962
21615
|
async function loadEnvHierarchy(repoRoot, startPath) {
|
|
20963
|
-
const { readFileSync:
|
|
21616
|
+
const { readFileSync: readFileSync4 } = await import("fs");
|
|
20964
21617
|
const chain = buildDirectoryChain2(startPath, repoRoot);
|
|
20965
21618
|
const envFiles = [];
|
|
20966
21619
|
for (const dir of chain) {
|
|
@@ -20969,7 +21622,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
|
|
|
20969
21622
|
}
|
|
20970
21623
|
for (let i = 0; i < envFiles.length; i++) {
|
|
20971
21624
|
try {
|
|
20972
|
-
const content =
|
|
21625
|
+
const content = readFileSync4(envFiles[i], "utf8");
|
|
20973
21626
|
for (const line of content.split("\n")) {
|
|
20974
21627
|
const trimmed = line.trim();
|
|
20975
21628
|
if (!trimmed || trimmed.startsWith("#")) continue;
|
|
@@ -21043,7 +21696,7 @@ var CONFIG_FILE_NAMES = [
|
|
|
21043
21696
|
];
|
|
21044
21697
|
async function loadTsConfig(projectRoot) {
|
|
21045
21698
|
const { existsSync: existsSync7 } = await import("fs");
|
|
21046
|
-
const { pathToFileURL } = await import("url");
|
|
21699
|
+
const { pathToFileURL: pathToFileURL2 } = await import("url");
|
|
21047
21700
|
const { join: join2 } = await import("path");
|
|
21048
21701
|
for (const fileName of CONFIG_FILE_NAMES) {
|
|
21049
21702
|
const filePath = join2(projectRoot, fileName);
|
|
@@ -21051,7 +21704,7 @@ async function loadTsConfig(projectRoot) {
|
|
|
21051
21704
|
continue;
|
|
21052
21705
|
}
|
|
21053
21706
|
try {
|
|
21054
|
-
const fileUrl =
|
|
21707
|
+
const fileUrl = pathToFileURL2(filePath).href;
|
|
21055
21708
|
const mod = await import(fileUrl);
|
|
21056
21709
|
const config = mod.default ?? mod;
|
|
21057
21710
|
return AgentVConfigSchema.parse(config);
|
|
@@ -21492,7 +22145,7 @@ var OtelTraceExporter = class {
|
|
|
21492
22145
|
rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
21493
22146
|
rootSpan.setAttribute("agentv.test_id", result.testId);
|
|
21494
22147
|
rootSpan.setAttribute("agentv.target", result.target);
|
|
21495
|
-
if (result.
|
|
22148
|
+
if (result.suite) rootSpan.setAttribute("agentv.suite", result.suite);
|
|
21496
22149
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
21497
22150
|
if (captureContent && result.output.length > 0) {
|
|
21498
22151
|
const lastMsg = result.output[result.output.length - 1];
|
|
@@ -21701,7 +22354,7 @@ var OtelStreamingObserver = class {
|
|
|
21701
22354
|
this.rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
21702
22355
|
this.rootSpan.setAttribute("agentv.test_id", testId);
|
|
21703
22356
|
this.rootSpan.setAttribute("agentv.target", target);
|
|
21704
|
-
if (evalSet) this.rootSpan.setAttribute("agentv.
|
|
22357
|
+
if (evalSet) this.rootSpan.setAttribute("agentv.suite", evalSet);
|
|
21705
22358
|
this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
|
|
21706
22359
|
}
|
|
21707
22360
|
/** Create and immediately export a tool span */
|
|
@@ -22057,12 +22710,244 @@ function extractToolResultContent(content) {
|
|
|
22057
22710
|
return parts.length > 0 ? parts.join("") : void 0;
|
|
22058
22711
|
}
|
|
22059
22712
|
|
|
22060
|
-
// src/import/
|
|
22713
|
+
// src/import/codex-parser.ts
|
|
22714
|
+
init_cjs_shims();
|
|
22715
|
+
function parseCodexSession(jsonl) {
|
|
22716
|
+
const messages = [];
|
|
22717
|
+
let sessionId = "";
|
|
22718
|
+
let cwd;
|
|
22719
|
+
let model;
|
|
22720
|
+
let version;
|
|
22721
|
+
let startTimestamp;
|
|
22722
|
+
let endTimestamp;
|
|
22723
|
+
const pendingCalls = /* @__PURE__ */ new Map();
|
|
22724
|
+
const lines = jsonl.split("\n").filter((l) => l.trim().length > 0);
|
|
22725
|
+
for (const line of lines) {
|
|
22726
|
+
let entry;
|
|
22727
|
+
try {
|
|
22728
|
+
entry = JSON.parse(line);
|
|
22729
|
+
} catch {
|
|
22730
|
+
continue;
|
|
22731
|
+
}
|
|
22732
|
+
if (!entry.type) continue;
|
|
22733
|
+
if (entry.timestamp) {
|
|
22734
|
+
if (!startTimestamp) startTimestamp = entry.timestamp;
|
|
22735
|
+
endTimestamp = entry.timestamp;
|
|
22736
|
+
}
|
|
22737
|
+
const payload = entry.payload ?? {};
|
|
22738
|
+
switch (entry.type) {
|
|
22739
|
+
case "session_meta": {
|
|
22740
|
+
sessionId = String(payload.id ?? "");
|
|
22741
|
+
cwd = payload.cwd ? String(payload.cwd) : void 0;
|
|
22742
|
+
version = payload.cli_version ? String(payload.cli_version) : void 0;
|
|
22743
|
+
if (payload.model && !model) {
|
|
22744
|
+
model = String(payload.model);
|
|
22745
|
+
}
|
|
22746
|
+
break;
|
|
22747
|
+
}
|
|
22748
|
+
case "turn_context": {
|
|
22749
|
+
if (payload.model && !model) {
|
|
22750
|
+
model = String(payload.model);
|
|
22751
|
+
}
|
|
22752
|
+
if (payload.cwd && !cwd) {
|
|
22753
|
+
cwd = String(payload.cwd);
|
|
22754
|
+
}
|
|
22755
|
+
break;
|
|
22756
|
+
}
|
|
22757
|
+
case "response_item": {
|
|
22758
|
+
const itemType = String(payload.type ?? "");
|
|
22759
|
+
const role = String(payload.role ?? "");
|
|
22760
|
+
switch (itemType) {
|
|
22761
|
+
case "message": {
|
|
22762
|
+
if (role === "developer") break;
|
|
22763
|
+
const content = extractResponseItemContent(payload.content);
|
|
22764
|
+
if (role === "user" && content) {
|
|
22765
|
+
messages.push({ role: "user", content });
|
|
22766
|
+
} else if (role === "assistant" && content) {
|
|
22767
|
+
messages.push({ role: "assistant", content });
|
|
22768
|
+
}
|
|
22769
|
+
break;
|
|
22770
|
+
}
|
|
22771
|
+
case "function_call": {
|
|
22772
|
+
const toolName = String(payload.name ?? "");
|
|
22773
|
+
const callId = String(payload.call_id ?? "");
|
|
22774
|
+
let input;
|
|
22775
|
+
if (typeof payload.arguments === "string") {
|
|
22776
|
+
try {
|
|
22777
|
+
input = JSON.parse(payload.arguments);
|
|
22778
|
+
} catch {
|
|
22779
|
+
input = payload.arguments;
|
|
22780
|
+
}
|
|
22781
|
+
} else {
|
|
22782
|
+
input = payload.arguments;
|
|
22783
|
+
}
|
|
22784
|
+
const toolCall = { tool: toolName, input, id: callId };
|
|
22785
|
+
const msgIdx = messages.length;
|
|
22786
|
+
messages.push({
|
|
22787
|
+
role: "assistant",
|
|
22788
|
+
toolCalls: [toolCall]
|
|
22789
|
+
});
|
|
22790
|
+
if (callId) {
|
|
22791
|
+
pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
|
|
22792
|
+
}
|
|
22793
|
+
break;
|
|
22794
|
+
}
|
|
22795
|
+
case "custom_tool_call": {
|
|
22796
|
+
const toolName = String(payload.name ?? "");
|
|
22797
|
+
const callId = String(payload.call_id ?? "");
|
|
22798
|
+
let input;
|
|
22799
|
+
if (typeof payload.arguments === "string") {
|
|
22800
|
+
try {
|
|
22801
|
+
input = JSON.parse(payload.arguments);
|
|
22802
|
+
} catch {
|
|
22803
|
+
input = payload.arguments;
|
|
22804
|
+
}
|
|
22805
|
+
} else {
|
|
22806
|
+
input = payload.arguments;
|
|
22807
|
+
}
|
|
22808
|
+
const toolCall = { tool: toolName, input, id: callId };
|
|
22809
|
+
const msgIdx = messages.length;
|
|
22810
|
+
messages.push({
|
|
22811
|
+
role: "assistant",
|
|
22812
|
+
toolCalls: [toolCall]
|
|
22813
|
+
});
|
|
22814
|
+
if (callId) {
|
|
22815
|
+
pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
|
|
22816
|
+
}
|
|
22817
|
+
break;
|
|
22818
|
+
}
|
|
22819
|
+
case "function_call_output":
|
|
22820
|
+
case "custom_tool_call_output": {
|
|
22821
|
+
const callId = String(payload.call_id ?? "");
|
|
22822
|
+
const pending = pendingCalls.get(callId);
|
|
22823
|
+
if (pending) {
|
|
22824
|
+
const existingMsg = messages[pending.msgIdx];
|
|
22825
|
+
const existingCalls = [...existingMsg.toolCalls ?? []];
|
|
22826
|
+
existingCalls[pending.toolIdx] = {
|
|
22827
|
+
...existingCalls[pending.toolIdx],
|
|
22828
|
+
output: payload.output
|
|
22829
|
+
};
|
|
22830
|
+
messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls };
|
|
22831
|
+
pendingCalls.delete(callId);
|
|
22832
|
+
}
|
|
22833
|
+
break;
|
|
22834
|
+
}
|
|
22835
|
+
// Skip reasoning blocks (thinking tokens)
|
|
22836
|
+
case "reasoning":
|
|
22837
|
+
break;
|
|
22838
|
+
}
|
|
22839
|
+
break;
|
|
22840
|
+
}
|
|
22841
|
+
}
|
|
22842
|
+
}
|
|
22843
|
+
let durationMs;
|
|
22844
|
+
if (startTimestamp && endTimestamp) {
|
|
22845
|
+
durationMs = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
|
|
22846
|
+
}
|
|
22847
|
+
const source = {
|
|
22848
|
+
provider: "codex",
|
|
22849
|
+
sessionId,
|
|
22850
|
+
cwd,
|
|
22851
|
+
startedAt: startTimestamp,
|
|
22852
|
+
model,
|
|
22853
|
+
version
|
|
22854
|
+
};
|
|
22855
|
+
return {
|
|
22856
|
+
messages,
|
|
22857
|
+
source,
|
|
22858
|
+
// Codex rollout files don't include token counts (only rate limit info)
|
|
22859
|
+
tokenUsage: void 0,
|
|
22860
|
+
durationMs,
|
|
22861
|
+
costUsd: null
|
|
22862
|
+
};
|
|
22863
|
+
}
|
|
22864
|
+
function extractResponseItemContent(content) {
|
|
22865
|
+
if (typeof content === "string") return content;
|
|
22866
|
+
if (!Array.isArray(content)) return void 0;
|
|
22867
|
+
const parts = [];
|
|
22868
|
+
for (const block of content) {
|
|
22869
|
+
if (typeof block === "object" && block !== null) {
|
|
22870
|
+
const b = block;
|
|
22871
|
+
if (typeof b.text === "string") {
|
|
22872
|
+
parts.push(b.text);
|
|
22873
|
+
}
|
|
22874
|
+
}
|
|
22875
|
+
}
|
|
22876
|
+
return parts.length > 0 ? parts.join("") : void 0;
|
|
22877
|
+
}
|
|
22878
|
+
|
|
22879
|
+
// src/import/codex-session-discovery.ts
|
|
22061
22880
|
init_cjs_shims();
|
|
22062
22881
|
var import_promises36 = require("fs/promises");
|
|
22063
22882
|
var import_node_os8 = require("os");
|
|
22064
22883
|
var import_node_path53 = __toESM(require("path"), 1);
|
|
22065
|
-
var
|
|
22884
|
+
var DEFAULT_SESSIONS_DIR = () => import_node_path53.default.join((0, import_node_os8.homedir)(), ".codex", "sessions");
|
|
22885
|
+
async function discoverCodexSessions(opts) {
|
|
22886
|
+
const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
|
|
22887
|
+
const limit = opts?.latest ? 1 : opts?.limit ?? 10;
|
|
22888
|
+
const sessions = [];
|
|
22889
|
+
let yearDirs;
|
|
22890
|
+
try {
|
|
22891
|
+
yearDirs = await (0, import_promises36.readdir)(sessionsDir);
|
|
22892
|
+
} catch {
|
|
22893
|
+
return [];
|
|
22894
|
+
}
|
|
22895
|
+
for (const year of yearDirs) {
|
|
22896
|
+
const yearPath = import_node_path53.default.join(sessionsDir, year);
|
|
22897
|
+
let monthDirs;
|
|
22898
|
+
try {
|
|
22899
|
+
monthDirs = await (0, import_promises36.readdir)(yearPath);
|
|
22900
|
+
} catch {
|
|
22901
|
+
continue;
|
|
22902
|
+
}
|
|
22903
|
+
for (const month of monthDirs) {
|
|
22904
|
+
const monthPath = import_node_path53.default.join(yearPath, month);
|
|
22905
|
+
let dayDirs;
|
|
22906
|
+
try {
|
|
22907
|
+
dayDirs = await (0, import_promises36.readdir)(monthPath);
|
|
22908
|
+
} catch {
|
|
22909
|
+
continue;
|
|
22910
|
+
}
|
|
22911
|
+
for (const day of dayDirs) {
|
|
22912
|
+
if (opts?.date) {
|
|
22913
|
+
const dirDate = `${year}-${month}-${day}`;
|
|
22914
|
+
if (dirDate !== opts.date) continue;
|
|
22915
|
+
}
|
|
22916
|
+
const dayPath = import_node_path53.default.join(monthPath, day);
|
|
22917
|
+
let files;
|
|
22918
|
+
try {
|
|
22919
|
+
files = await (0, import_promises36.readdir)(dayPath);
|
|
22920
|
+
} catch {
|
|
22921
|
+
continue;
|
|
22922
|
+
}
|
|
22923
|
+
for (const file of files) {
|
|
22924
|
+
if (!file.startsWith("rollout-") || !file.endsWith(".jsonl")) continue;
|
|
22925
|
+
const filePath = import_node_path53.default.join(dayPath, file);
|
|
22926
|
+
const nameWithoutExt = file.replace(/\.jsonl$/, "");
|
|
22927
|
+
const parts = nameWithoutExt.split("-");
|
|
22928
|
+
const sessionId = parts.length >= 6 ? parts.slice(-5).join("-") : nameWithoutExt;
|
|
22929
|
+
let updatedAt;
|
|
22930
|
+
try {
|
|
22931
|
+
const fileStat = await (0, import_promises36.stat)(filePath);
|
|
22932
|
+
updatedAt = fileStat.mtime;
|
|
22933
|
+
} catch {
|
|
22934
|
+
updatedAt = /* @__PURE__ */ new Date(0);
|
|
22935
|
+
}
|
|
22936
|
+
sessions.push({ sessionId, filePath, filename: file, updatedAt });
|
|
22937
|
+
}
|
|
22938
|
+
}
|
|
22939
|
+
}
|
|
22940
|
+
}
|
|
22941
|
+
sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
|
|
22942
|
+
return sessions.slice(0, limit);
|
|
22943
|
+
}
|
|
22944
|
+
|
|
22945
|
+
// src/import/session-discovery.ts
|
|
22946
|
+
init_cjs_shims();
|
|
22947
|
+
var import_promises37 = require("fs/promises");
|
|
22948
|
+
var import_node_os9 = require("os");
|
|
22949
|
+
var import_node_path54 = __toESM(require("path"), 1);
|
|
22950
|
+
var DEFAULT_PROJECTS_DIR = () => import_node_path54.default.join((0, import_node_os9.homedir)(), ".claude", "projects");
|
|
22066
22951
|
function encodeProjectPath(projectPath) {
|
|
22067
22952
|
return projectPath.replace(/\//g, "-");
|
|
22068
22953
|
}
|
|
@@ -22071,7 +22956,7 @@ async function discoverClaudeSessions(opts) {
|
|
|
22071
22956
|
const limit = opts?.latest ? 1 : opts?.limit ?? 10;
|
|
22072
22957
|
let projectDirs;
|
|
22073
22958
|
try {
|
|
22074
|
-
projectDirs = await (0,
|
|
22959
|
+
projectDirs = await (0, import_promises37.readdir)(projectsDir);
|
|
22075
22960
|
} catch {
|
|
22076
22961
|
return [];
|
|
22077
22962
|
}
|
|
@@ -22081,10 +22966,10 @@ async function discoverClaudeSessions(opts) {
|
|
|
22081
22966
|
}
|
|
22082
22967
|
const sessions = [];
|
|
22083
22968
|
for (const projectDir of projectDirs) {
|
|
22084
|
-
const dirPath =
|
|
22969
|
+
const dirPath = import_node_path54.default.join(projectsDir, projectDir);
|
|
22085
22970
|
let entries;
|
|
22086
22971
|
try {
|
|
22087
|
-
entries = await (0,
|
|
22972
|
+
entries = await (0, import_promises37.readdir)(dirPath);
|
|
22088
22973
|
} catch {
|
|
22089
22974
|
continue;
|
|
22090
22975
|
}
|
|
@@ -22092,10 +22977,10 @@ async function discoverClaudeSessions(opts) {
|
|
|
22092
22977
|
if (!entry.endsWith(".jsonl")) continue;
|
|
22093
22978
|
const sessionId = entry.replace(/\.jsonl$/, "");
|
|
22094
22979
|
if (opts?.sessionId && sessionId !== opts.sessionId) continue;
|
|
22095
|
-
const filePath =
|
|
22980
|
+
const filePath = import_node_path54.default.join(dirPath, entry);
|
|
22096
22981
|
let updatedAt;
|
|
22097
22982
|
try {
|
|
22098
|
-
const fileStat = await (0,
|
|
22983
|
+
const fileStat = await (0, import_promises37.stat)(filePath);
|
|
22099
22984
|
updatedAt = fileStat.mtime;
|
|
22100
22985
|
} catch {
|
|
22101
22986
|
updatedAt = /* @__PURE__ */ new Date(0);
|
|
@@ -22112,13 +22997,91 @@ async function discoverClaudeSessions(opts) {
|
|
|
22112
22997
|
return sessions.slice(0, limit);
|
|
22113
22998
|
}
|
|
22114
22999
|
|
|
23000
|
+
// src/import/transcript-provider.ts
|
|
23001
|
+
init_cjs_shims();
|
|
23002
|
+
|
|
22115
23003
|
// src/import/types.ts
|
|
22116
23004
|
init_cjs_shims();
|
|
22117
|
-
var
|
|
23005
|
+
var import_promises38 = require("fs/promises");
|
|
23006
|
+
function toTranscriptJsonLine(entry) {
|
|
23007
|
+
const firstUserMessage = entry.messages.find((m) => m.role === "user");
|
|
23008
|
+
const input = typeof firstUserMessage?.content === "string" ? firstUserMessage.content : "";
|
|
23009
|
+
return {
|
|
23010
|
+
input,
|
|
23011
|
+
output: entry.messages,
|
|
23012
|
+
token_usage: entry.tokenUsage ? {
|
|
23013
|
+
input: entry.tokenUsage.input,
|
|
23014
|
+
output: entry.tokenUsage.output,
|
|
23015
|
+
cached: entry.tokenUsage.cached
|
|
23016
|
+
} : void 0,
|
|
23017
|
+
duration_ms: entry.durationMs,
|
|
23018
|
+
cost_usd: entry.costUsd,
|
|
23019
|
+
source: {
|
|
23020
|
+
provider: entry.source.provider,
|
|
23021
|
+
session_id: entry.source.sessionId,
|
|
23022
|
+
model: entry.source.model,
|
|
23023
|
+
timestamp: entry.source.startedAt,
|
|
23024
|
+
git_branch: entry.source.gitBranch,
|
|
23025
|
+
cwd: entry.source.cwd ?? entry.source.projectPath,
|
|
23026
|
+
version: entry.source.version
|
|
23027
|
+
}
|
|
23028
|
+
};
|
|
23029
|
+
}
|
|
23030
|
+
async function readTranscriptJsonl(filePath) {
|
|
23031
|
+
const text = await (0, import_promises38.readFile)(filePath, "utf8");
|
|
23032
|
+
return text.split("\n").filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
23033
|
+
}
|
|
22118
23034
|
async function readTranscriptFile(filePath) {
|
|
22119
|
-
return (0,
|
|
23035
|
+
return (0, import_promises38.readFile)(filePath, "utf8");
|
|
22120
23036
|
}
|
|
22121
23037
|
|
|
23038
|
+
// src/import/transcript-provider.ts
|
|
23039
|
+
var TranscriptProvider = class _TranscriptProvider {
|
|
23040
|
+
id;
|
|
23041
|
+
kind = "transcript";
|
|
23042
|
+
targetName;
|
|
23043
|
+
lines;
|
|
23044
|
+
cursor = 0;
|
|
23045
|
+
constructor(targetName, lines) {
|
|
23046
|
+
this.targetName = targetName;
|
|
23047
|
+
this.id = `transcript:${targetName}`;
|
|
23048
|
+
this.lines = lines;
|
|
23049
|
+
}
|
|
23050
|
+
/**
|
|
23051
|
+
* Create a TranscriptProvider from a JSONL file path.
|
|
23052
|
+
*/
|
|
23053
|
+
static async fromFile(filePath) {
|
|
23054
|
+
const lines = await readTranscriptJsonl(filePath);
|
|
23055
|
+
if (lines.length === 0) {
|
|
23056
|
+
throw new Error(`Transcript file is empty: ${filePath}`);
|
|
23057
|
+
}
|
|
23058
|
+
const providerName = lines[0].source.provider ?? "transcript";
|
|
23059
|
+
return new _TranscriptProvider(providerName, lines);
|
|
23060
|
+
}
|
|
23061
|
+
get lineCount() {
|
|
23062
|
+
return this.lines.length;
|
|
23063
|
+
}
|
|
23064
|
+
async invoke(_request) {
|
|
23065
|
+
if (this.cursor >= this.lines.length) {
|
|
23066
|
+
throw new Error(
|
|
23067
|
+
`Transcript exhausted: ${this.lines.length} line(s) available but ${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`
|
|
23068
|
+
);
|
|
23069
|
+
}
|
|
23070
|
+
const line = this.lines[this.cursor++];
|
|
23071
|
+
return {
|
|
23072
|
+
output: line.output,
|
|
23073
|
+
tokenUsage: line.token_usage ? {
|
|
23074
|
+
input: line.token_usage.input,
|
|
23075
|
+
output: line.token_usage.output,
|
|
23076
|
+
cached: line.token_usage.cached
|
|
23077
|
+
} : void 0,
|
|
23078
|
+
durationMs: line.duration_ms,
|
|
23079
|
+
costUsd: line.cost_usd ?? void 0,
|
|
23080
|
+
startTime: line.source.timestamp
|
|
23081
|
+
};
|
|
23082
|
+
}
|
|
23083
|
+
};
|
|
23084
|
+
|
|
22122
23085
|
// src/index.ts
|
|
22123
23086
|
function createAgentKernel() {
|
|
22124
23087
|
return { status: "stub" };
|
|
@@ -22133,6 +23096,7 @@ function createAgentKernel() {
|
|
|
22133
23096
|
DEFAULT_EVALUATOR_TEMPLATE,
|
|
22134
23097
|
DEFAULT_EVAL_PATTERNS,
|
|
22135
23098
|
DEFAULT_EXPLORATION_TOOLS,
|
|
23099
|
+
DEFAULT_THRESHOLD,
|
|
22136
23100
|
DeterministicAssertionEvaluator,
|
|
22137
23101
|
EvaluatorRegistry,
|
|
22138
23102
|
ExecutionMetricsEvaluator,
|
|
@@ -22154,6 +23118,7 @@ function createAgentKernel() {
|
|
|
22154
23118
|
TemplateNotFoundError,
|
|
22155
23119
|
TokenUsageEvaluator,
|
|
22156
23120
|
ToolTrajectoryEvaluator,
|
|
23121
|
+
TranscriptProvider,
|
|
22157
23122
|
WorkspaceCreationError,
|
|
22158
23123
|
WorkspacePoolManager,
|
|
22159
23124
|
addProject,
|
|
@@ -22190,6 +23155,7 @@ function createAgentKernel() {
|
|
|
22190
23155
|
detectFormat,
|
|
22191
23156
|
discoverAssertions,
|
|
22192
23157
|
discoverClaudeSessions,
|
|
23158
|
+
discoverCodexSessions,
|
|
22193
23159
|
discoverCopilotSessions,
|
|
22194
23160
|
discoverGraders,
|
|
22195
23161
|
discoverJudges,
|
|
@@ -22250,6 +23216,8 @@ function createAgentKernel() {
|
|
|
22250
23216
|
normalizeLineEndings,
|
|
22251
23217
|
parseAgentSkillsEvals,
|
|
22252
23218
|
parseClaudeSession,
|
|
23219
|
+
parseCodexSession,
|
|
23220
|
+
parseCopilotEvents,
|
|
22253
23221
|
parseJsonFromText,
|
|
22254
23222
|
parseJsonSafe,
|
|
22255
23223
|
readJsonFile,
|
|
@@ -22257,8 +23225,10 @@ function createAgentKernel() {
|
|
|
22257
23225
|
readTestSuiteMetadata,
|
|
22258
23226
|
readTextFile,
|
|
22259
23227
|
readTranscriptFile,
|
|
23228
|
+
readTranscriptJsonl,
|
|
22260
23229
|
removeProject,
|
|
22261
23230
|
resolveAndCreateProvider,
|
|
23231
|
+
resolveDelegatedTargetDefinition,
|
|
22262
23232
|
resolveFileReference,
|
|
22263
23233
|
resolveTargetDefinition,
|
|
22264
23234
|
resolveWorkspaceTemplate,
|
|
@@ -22288,6 +23258,7 @@ function createAgentKernel() {
|
|
|
22288
23258
|
substituteVariables,
|
|
22289
23259
|
toCamelCaseDeep,
|
|
22290
23260
|
toSnakeCaseDeep,
|
|
23261
|
+
toTranscriptJsonLine,
|
|
22291
23262
|
tokensPerTool,
|
|
22292
23263
|
touchProject,
|
|
22293
23264
|
transpileEvalYaml,
|