@agentv/core 4.6.1 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-ZK4GG7PR.js → chunk-75RFVESM.js} +215 -127
- package/dist/chunk-75RFVESM.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +110 -95
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +30 -72
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1271 -465
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +288 -74
- package/dist/index.d.ts +288 -74
- package/dist/index.js +1024 -311
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-ZK4GG7PR.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -31,12 +31,9 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
31
31
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
32
32
|
|
|
33
33
|
// ../../node_modules/.bun/tsup@8.3.5+19811ebab77a7b1c/node_modules/tsup/assets/cjs_shims.js
|
|
34
|
-
var getImportMetaUrl, importMetaUrl;
|
|
35
34
|
var init_cjs_shims = __esm({
|
|
36
35
|
"../../node_modules/.bun/tsup@8.3.5+19811ebab77a7b1c/node_modules/tsup/assets/cjs_shims.js"() {
|
|
37
36
|
"use strict";
|
|
38
|
-
getImportMetaUrl = () => typeof document === "undefined" ? new URL(`file:${__filename}`).href : document.currentScript && document.currentScript.src || new URL("main.js", document.baseURI).href;
|
|
39
|
-
importMetaUrl = /* @__PURE__ */ getImportMetaUrl();
|
|
40
37
|
}
|
|
41
38
|
});
|
|
42
39
|
|
|
@@ -1435,6 +1432,7 @@ __export(index_exports, {
|
|
|
1435
1432
|
DEFAULT_EVALUATOR_TEMPLATE: () => DEFAULT_EVALUATOR_TEMPLATE,
|
|
1436
1433
|
DEFAULT_EVAL_PATTERNS: () => DEFAULT_EVAL_PATTERNS,
|
|
1437
1434
|
DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
|
|
1435
|
+
DEFAULT_THRESHOLD: () => DEFAULT_THRESHOLD,
|
|
1438
1436
|
DeterministicAssertionEvaluator: () => DeterministicAssertionEvaluator,
|
|
1439
1437
|
EvaluatorRegistry: () => EvaluatorRegistry,
|
|
1440
1438
|
ExecutionMetricsEvaluator: () => ExecutionMetricsEvaluator,
|
|
@@ -1456,6 +1454,7 @@ __export(index_exports, {
|
|
|
1456
1454
|
TemplateNotFoundError: () => TemplateNotFoundError,
|
|
1457
1455
|
TokenUsageEvaluator: () => TokenUsageEvaluator,
|
|
1458
1456
|
ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
|
|
1457
|
+
TranscriptProvider: () => TranscriptProvider,
|
|
1459
1458
|
WorkspaceCreationError: () => WorkspaceCreationError,
|
|
1460
1459
|
WorkspacePoolManager: () => WorkspacePoolManager,
|
|
1461
1460
|
addProject: () => addProject,
|
|
@@ -1492,6 +1491,7 @@ __export(index_exports, {
|
|
|
1492
1491
|
detectFormat: () => detectFormat,
|
|
1493
1492
|
discoverAssertions: () => discoverAssertions,
|
|
1494
1493
|
discoverClaudeSessions: () => discoverClaudeSessions,
|
|
1494
|
+
discoverCodexSessions: () => discoverCodexSessions,
|
|
1495
1495
|
discoverCopilotSessions: () => discoverCopilotSessions,
|
|
1496
1496
|
discoverGraders: () => discoverGraders,
|
|
1497
1497
|
discoverJudges: () => discoverGraders,
|
|
@@ -1552,6 +1552,8 @@ __export(index_exports, {
|
|
|
1552
1552
|
normalizeLineEndings: () => normalizeLineEndings,
|
|
1553
1553
|
parseAgentSkillsEvals: () => parseAgentSkillsEvals,
|
|
1554
1554
|
parseClaudeSession: () => parseClaudeSession,
|
|
1555
|
+
parseCodexSession: () => parseCodexSession,
|
|
1556
|
+
parseCopilotEvents: () => parseCopilotEvents,
|
|
1555
1557
|
parseJsonFromText: () => parseJsonFromText,
|
|
1556
1558
|
parseJsonSafe: () => parseJsonSafe,
|
|
1557
1559
|
readJsonFile: () => readJsonFile,
|
|
@@ -1559,6 +1561,7 @@ __export(index_exports, {
|
|
|
1559
1561
|
readTestSuiteMetadata: () => readTestSuiteMetadata,
|
|
1560
1562
|
readTextFile: () => readTextFile,
|
|
1561
1563
|
readTranscriptFile: () => readTranscriptFile,
|
|
1564
|
+
readTranscriptJsonl: () => readTranscriptJsonl,
|
|
1562
1565
|
removeProject: () => removeProject,
|
|
1563
1566
|
resolveAndCreateProvider: () => resolveAndCreateProvider,
|
|
1564
1567
|
resolveDelegatedTargetDefinition: () => resolveDelegatedTargetDefinition,
|
|
@@ -1591,6 +1594,7 @@ __export(index_exports, {
|
|
|
1591
1594
|
substituteVariables: () => substituteVariables,
|
|
1592
1595
|
toCamelCaseDeep: () => toCamelCaseDeep,
|
|
1593
1596
|
toSnakeCaseDeep: () => toSnakeCaseDeep,
|
|
1597
|
+
toTranscriptJsonLine: () => toTranscriptJsonLine,
|
|
1594
1598
|
tokensPerTool: () => tokensPerTool,
|
|
1595
1599
|
touchProject: () => touchProject,
|
|
1596
1600
|
transpileEvalYaml: () => transpileEvalYaml,
|
|
@@ -2675,8 +2679,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2675
2679
|
const negate = rawEvaluator.negate === true ? true : void 0;
|
|
2676
2680
|
if (isCustomType) {
|
|
2677
2681
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
2678
|
-
const required2 =
|
|
2679
|
-
|
|
2682
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
2683
|
+
rawEvaluator.required,
|
|
2684
|
+
rawEvaluator.min_score,
|
|
2685
|
+
name,
|
|
2686
|
+
evalId
|
|
2687
|
+
);
|
|
2688
|
+
const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "min_score", "negate"]);
|
|
2680
2689
|
const config2 = {};
|
|
2681
2690
|
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
2682
2691
|
if (!knownProps2.has(key) && value !== void 0) {
|
|
@@ -2688,6 +2697,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2688
2697
|
type: customTypeName,
|
|
2689
2698
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2690
2699
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
2700
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
2691
2701
|
...negate !== void 0 ? { negate } : {},
|
|
2692
2702
|
...Object.keys(config2).length > 0 ? { config: config2 } : {}
|
|
2693
2703
|
});
|
|
@@ -2757,7 +2767,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2757
2767
|
);
|
|
2758
2768
|
}
|
|
2759
2769
|
}
|
|
2760
|
-
const required2 =
|
|
2770
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
2771
|
+
rawEvaluator.required,
|
|
2772
|
+
rawEvaluator.min_score,
|
|
2773
|
+
name,
|
|
2774
|
+
evalId
|
|
2775
|
+
);
|
|
2761
2776
|
const knownProps2 = /* @__PURE__ */ new Set([
|
|
2762
2777
|
"name",
|
|
2763
2778
|
"type",
|
|
@@ -2783,6 +2798,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2783
2798
|
resolvedCwd,
|
|
2784
2799
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2785
2800
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
2801
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
2786
2802
|
...negate !== void 0 ? { negate } : {},
|
|
2787
2803
|
...Object.keys(config2).length > 0 ? { config: config2 } : {},
|
|
2788
2804
|
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
@@ -2911,7 +2927,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2911
2927
|
};
|
|
2912
2928
|
}
|
|
2913
2929
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
2914
|
-
const required2 =
|
|
2930
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
2931
|
+
rawEvaluator.required,
|
|
2932
|
+
rawEvaluator.min_score,
|
|
2933
|
+
name,
|
|
2934
|
+
evalId
|
|
2935
|
+
);
|
|
2915
2936
|
evaluators.push({
|
|
2916
2937
|
name,
|
|
2917
2938
|
type: "composite",
|
|
@@ -2919,6 +2940,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2919
2940
|
aggregator,
|
|
2920
2941
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
2921
2942
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
2943
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
2922
2944
|
...negate !== void 0 ? { negate } : {}
|
|
2923
2945
|
});
|
|
2924
2946
|
continue;
|
|
@@ -3029,7 +3051,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3029
3051
|
continue;
|
|
3030
3052
|
}
|
|
3031
3053
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3032
|
-
const required2 =
|
|
3054
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3055
|
+
rawEvaluator.required,
|
|
3056
|
+
rawEvaluator.min_score,
|
|
3057
|
+
name,
|
|
3058
|
+
evalId
|
|
3059
|
+
);
|
|
3033
3060
|
const config2 = {
|
|
3034
3061
|
name,
|
|
3035
3062
|
type: "tool-trajectory",
|
|
@@ -3038,6 +3065,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3038
3065
|
...expected ? { expected } : {},
|
|
3039
3066
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3040
3067
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3068
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3041
3069
|
...negate !== void 0 ? { negate } : {},
|
|
3042
3070
|
...argsMatch2 !== void 0 ? { argsMatch: argsMatch2 } : {}
|
|
3043
3071
|
};
|
|
@@ -3100,7 +3128,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3100
3128
|
const aggregation = asString(rawEvaluator.aggregation);
|
|
3101
3129
|
const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
|
|
3102
3130
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3103
|
-
const required2 =
|
|
3131
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3132
|
+
rawEvaluator.required,
|
|
3133
|
+
rawEvaluator.min_score,
|
|
3134
|
+
name,
|
|
3135
|
+
evalId
|
|
3136
|
+
);
|
|
3104
3137
|
evaluators.push({
|
|
3105
3138
|
name,
|
|
3106
3139
|
type: "field-accuracy",
|
|
@@ -3108,6 +3141,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3108
3141
|
...validAggregation ? { aggregation: validAggregation } : {},
|
|
3109
3142
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3110
3143
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3144
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3111
3145
|
...negate !== void 0 ? { negate } : {}
|
|
3112
3146
|
});
|
|
3113
3147
|
continue;
|
|
@@ -3121,13 +3155,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3121
3155
|
continue;
|
|
3122
3156
|
}
|
|
3123
3157
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3124
|
-
const required2 =
|
|
3158
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3159
|
+
rawEvaluator.required,
|
|
3160
|
+
rawEvaluator.min_score,
|
|
3161
|
+
name,
|
|
3162
|
+
evalId
|
|
3163
|
+
);
|
|
3125
3164
|
evaluators.push({
|
|
3126
3165
|
name,
|
|
3127
3166
|
type: "latency",
|
|
3128
3167
|
threshold,
|
|
3129
3168
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3130
3169
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3170
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3131
3171
|
...negate !== void 0 ? { negate } : {}
|
|
3132
3172
|
});
|
|
3133
3173
|
continue;
|
|
@@ -3141,13 +3181,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3141
3181
|
continue;
|
|
3142
3182
|
}
|
|
3143
3183
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3144
|
-
const required2 =
|
|
3184
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3185
|
+
rawEvaluator.required,
|
|
3186
|
+
rawEvaluator.min_score,
|
|
3187
|
+
name,
|
|
3188
|
+
evalId
|
|
3189
|
+
);
|
|
3145
3190
|
evaluators.push({
|
|
3146
3191
|
name,
|
|
3147
3192
|
type: "cost",
|
|
3148
3193
|
budget,
|
|
3149
3194
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3150
3195
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3196
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3151
3197
|
...negate !== void 0 ? { negate } : {}
|
|
3152
3198
|
});
|
|
3153
3199
|
continue;
|
|
@@ -3179,13 +3225,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3179
3225
|
continue;
|
|
3180
3226
|
}
|
|
3181
3227
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3182
|
-
const required2 =
|
|
3228
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3229
|
+
rawEvaluator.required,
|
|
3230
|
+
rawEvaluator.min_score,
|
|
3231
|
+
name,
|
|
3232
|
+
evalId
|
|
3233
|
+
);
|
|
3183
3234
|
evaluators.push({
|
|
3184
3235
|
name,
|
|
3185
3236
|
type: "token-usage",
|
|
3186
3237
|
...validLimits,
|
|
3187
3238
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3188
3239
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3240
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3189
3241
|
...negate !== void 0 ? { negate } : {}
|
|
3190
3242
|
});
|
|
3191
3243
|
continue;
|
|
@@ -3231,13 +3283,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3231
3283
|
continue;
|
|
3232
3284
|
}
|
|
3233
3285
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3234
|
-
const required2 =
|
|
3286
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3287
|
+
rawEvaluator.required,
|
|
3288
|
+
rawEvaluator.min_score,
|
|
3289
|
+
name,
|
|
3290
|
+
evalId
|
|
3291
|
+
);
|
|
3235
3292
|
evaluators.push({
|
|
3236
3293
|
name,
|
|
3237
3294
|
type: "execution-metrics",
|
|
3238
3295
|
...validThresholds,
|
|
3239
3296
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3240
3297
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3298
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3241
3299
|
...negate !== void 0 ? { negate } : {}
|
|
3242
3300
|
});
|
|
3243
3301
|
continue;
|
|
@@ -3251,7 +3309,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3251
3309
|
const rawShouldTrigger = rawEvaluator.should_trigger;
|
|
3252
3310
|
const shouldTrigger = typeof rawShouldTrigger === "boolean" ? rawShouldTrigger : void 0;
|
|
3253
3311
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3254
|
-
const required2 =
|
|
3312
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3313
|
+
rawEvaluator.required,
|
|
3314
|
+
rawEvaluator.min_score,
|
|
3315
|
+
name,
|
|
3316
|
+
evalId
|
|
3317
|
+
);
|
|
3255
3318
|
evaluators.push({
|
|
3256
3319
|
name,
|
|
3257
3320
|
type: "skill-trigger",
|
|
@@ -3259,6 +3322,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3259
3322
|
...shouldTrigger !== void 0 ? { should_trigger: shouldTrigger } : {},
|
|
3260
3323
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3261
3324
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3325
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3262
3326
|
...negate !== void 0 ? { negate } : {}
|
|
3263
3327
|
});
|
|
3264
3328
|
continue;
|
|
@@ -3270,13 +3334,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3270
3334
|
continue;
|
|
3271
3335
|
}
|
|
3272
3336
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3273
|
-
const required2 =
|
|
3337
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3338
|
+
rawEvaluator.required,
|
|
3339
|
+
rawEvaluator.min_score,
|
|
3340
|
+
name,
|
|
3341
|
+
evalId
|
|
3342
|
+
);
|
|
3274
3343
|
evaluators.push({
|
|
3275
3344
|
name,
|
|
3276
3345
|
type: "contains",
|
|
3277
3346
|
value,
|
|
3278
3347
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3279
3348
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3349
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3280
3350
|
...negate !== void 0 ? { negate } : {}
|
|
3281
3351
|
});
|
|
3282
3352
|
continue;
|
|
@@ -3290,13 +3360,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3290
3360
|
continue;
|
|
3291
3361
|
}
|
|
3292
3362
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3293
|
-
const required2 =
|
|
3363
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3364
|
+
rawEvaluator.required,
|
|
3365
|
+
rawEvaluator.min_score,
|
|
3366
|
+
name,
|
|
3367
|
+
evalId
|
|
3368
|
+
);
|
|
3294
3369
|
evaluators.push({
|
|
3295
3370
|
name,
|
|
3296
3371
|
type: typeValue,
|
|
3297
3372
|
value,
|
|
3298
3373
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3299
3374
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3375
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3300
3376
|
...negate !== void 0 ? { negate } : {}
|
|
3301
3377
|
});
|
|
3302
3378
|
continue;
|
|
@@ -3308,13 +3384,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3308
3384
|
continue;
|
|
3309
3385
|
}
|
|
3310
3386
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3311
|
-
const required2 =
|
|
3387
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3388
|
+
rawEvaluator.required,
|
|
3389
|
+
rawEvaluator.min_score,
|
|
3390
|
+
name,
|
|
3391
|
+
evalId
|
|
3392
|
+
);
|
|
3312
3393
|
evaluators.push({
|
|
3313
3394
|
name,
|
|
3314
3395
|
type: "icontains",
|
|
3315
3396
|
value,
|
|
3316
3397
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3317
3398
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3399
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3318
3400
|
...negate !== void 0 ? { negate } : {}
|
|
3319
3401
|
});
|
|
3320
3402
|
continue;
|
|
@@ -3328,13 +3410,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3328
3410
|
continue;
|
|
3329
3411
|
}
|
|
3330
3412
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3331
|
-
const required2 =
|
|
3413
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3414
|
+
rawEvaluator.required,
|
|
3415
|
+
rawEvaluator.min_score,
|
|
3416
|
+
name,
|
|
3417
|
+
evalId
|
|
3418
|
+
);
|
|
3332
3419
|
evaluators.push({
|
|
3333
3420
|
name,
|
|
3334
3421
|
type: typeValue,
|
|
3335
3422
|
value,
|
|
3336
3423
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3337
3424
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3425
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3338
3426
|
...negate !== void 0 ? { negate } : {}
|
|
3339
3427
|
});
|
|
3340
3428
|
continue;
|
|
@@ -3346,13 +3434,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3346
3434
|
continue;
|
|
3347
3435
|
}
|
|
3348
3436
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3349
|
-
const required2 =
|
|
3437
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3438
|
+
rawEvaluator.required,
|
|
3439
|
+
rawEvaluator.min_score,
|
|
3440
|
+
name,
|
|
3441
|
+
evalId
|
|
3442
|
+
);
|
|
3350
3443
|
evaluators.push({
|
|
3351
3444
|
name,
|
|
3352
3445
|
type: typeValue,
|
|
3353
3446
|
value,
|
|
3354
3447
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3355
3448
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3449
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3356
3450
|
...negate !== void 0 ? { negate } : {}
|
|
3357
3451
|
});
|
|
3358
3452
|
continue;
|
|
@@ -3365,7 +3459,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3365
3459
|
}
|
|
3366
3460
|
const flags = asString(rawEvaluator.flags);
|
|
3367
3461
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3368
|
-
const required2 =
|
|
3462
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3463
|
+
rawEvaluator.required,
|
|
3464
|
+
rawEvaluator.min_score,
|
|
3465
|
+
name,
|
|
3466
|
+
evalId
|
|
3467
|
+
);
|
|
3369
3468
|
evaluators.push({
|
|
3370
3469
|
name,
|
|
3371
3470
|
type: "regex",
|
|
@@ -3373,18 +3472,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3373
3472
|
...flags !== void 0 ? { flags } : {},
|
|
3374
3473
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3375
3474
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3475
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3376
3476
|
...negate !== void 0 ? { negate } : {}
|
|
3377
3477
|
});
|
|
3378
3478
|
continue;
|
|
3379
3479
|
}
|
|
3380
3480
|
if (typeValue === "is-json") {
|
|
3381
3481
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3382
|
-
const required2 =
|
|
3482
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3483
|
+
rawEvaluator.required,
|
|
3484
|
+
rawEvaluator.min_score,
|
|
3485
|
+
name,
|
|
3486
|
+
evalId
|
|
3487
|
+
);
|
|
3383
3488
|
evaluators.push({
|
|
3384
3489
|
name,
|
|
3385
3490
|
type: "is-json",
|
|
3386
3491
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3387
3492
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3493
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3388
3494
|
...negate !== void 0 ? { negate } : {}
|
|
3389
3495
|
});
|
|
3390
3496
|
continue;
|
|
@@ -3396,13 +3502,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3396
3502
|
continue;
|
|
3397
3503
|
}
|
|
3398
3504
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3399
|
-
const required2 =
|
|
3505
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3506
|
+
rawEvaluator.required,
|
|
3507
|
+
rawEvaluator.min_score,
|
|
3508
|
+
name,
|
|
3509
|
+
evalId
|
|
3510
|
+
);
|
|
3400
3511
|
evaluators.push({
|
|
3401
3512
|
name,
|
|
3402
3513
|
type: "equals",
|
|
3403
3514
|
value,
|
|
3404
3515
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3405
3516
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3517
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3406
3518
|
...negate !== void 0 ? { negate } : {}
|
|
3407
3519
|
});
|
|
3408
3520
|
continue;
|
|
@@ -3438,7 +3550,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3438
3550
|
continue;
|
|
3439
3551
|
}
|
|
3440
3552
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3441
|
-
const required2 =
|
|
3553
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3554
|
+
rawEvaluator.required,
|
|
3555
|
+
rawEvaluator.min_score,
|
|
3556
|
+
name,
|
|
3557
|
+
evalId
|
|
3558
|
+
);
|
|
3442
3559
|
evaluators.push({
|
|
3443
3560
|
name,
|
|
3444
3561
|
type: "llm-grader",
|
|
@@ -3446,6 +3563,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3446
3563
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
3447
3564
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3448
3565
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3566
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3449
3567
|
...negate !== void 0 ? { negate } : {}
|
|
3450
3568
|
});
|
|
3451
3569
|
continue;
|
|
@@ -3515,7 +3633,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3515
3633
|
continue;
|
|
3516
3634
|
}
|
|
3517
3635
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3518
|
-
const required2 =
|
|
3636
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
3637
|
+
rawEvaluator.required,
|
|
3638
|
+
rawEvaluator.min_score,
|
|
3639
|
+
name,
|
|
3640
|
+
evalId
|
|
3641
|
+
);
|
|
3519
3642
|
evaluators.push({
|
|
3520
3643
|
name,
|
|
3521
3644
|
type: "llm-grader",
|
|
@@ -3523,12 +3646,18 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3523
3646
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
3524
3647
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
3525
3648
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
3649
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
3526
3650
|
...negate !== void 0 ? { negate } : {}
|
|
3527
3651
|
});
|
|
3528
3652
|
continue;
|
|
3529
3653
|
}
|
|
3530
3654
|
const weight = validateWeight(rawEvaluator.weight, name, evalId);
|
|
3531
|
-
const required =
|
|
3655
|
+
const { required, min_score } = parseRequiredAndMinScore(
|
|
3656
|
+
rawEvaluator.required,
|
|
3657
|
+
rawEvaluator.min_score,
|
|
3658
|
+
name,
|
|
3659
|
+
evalId
|
|
3660
|
+
);
|
|
3532
3661
|
const knownProps = /* @__PURE__ */ new Set([
|
|
3533
3662
|
"name",
|
|
3534
3663
|
"type",
|
|
@@ -3539,6 +3668,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3539
3668
|
"weight",
|
|
3540
3669
|
"config",
|
|
3541
3670
|
"required",
|
|
3671
|
+
"min_score",
|
|
3542
3672
|
"negate",
|
|
3543
3673
|
"max_steps",
|
|
3544
3674
|
"maxSteps",
|
|
@@ -3568,6 +3698,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
3568
3698
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
3569
3699
|
...weight !== void 0 ? { weight } : {},
|
|
3570
3700
|
...required !== void 0 ? { required } : {},
|
|
3701
|
+
...min_score !== void 0 ? { min_score } : {},
|
|
3571
3702
|
...negate !== void 0 ? { negate } : {},
|
|
3572
3703
|
...finalConfig ? { config: finalConfig } : {},
|
|
3573
3704
|
...llmMaxSteps !== void 0 ? { max_steps: llmMaxSteps } : {},
|
|
@@ -3699,10 +3830,23 @@ ${detailBlock}${ANSI_RESET5}`);
|
|
|
3699
3830
|
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET5}`);
|
|
3700
3831
|
}
|
|
3701
3832
|
}
|
|
3702
|
-
function
|
|
3703
|
-
|
|
3704
|
-
if (typeof
|
|
3705
|
-
|
|
3833
|
+
function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
|
|
3834
|
+
const result = {};
|
|
3835
|
+
if (typeof rawMinScore === "number" && rawMinScore > 0 && rawMinScore <= 1) {
|
|
3836
|
+
result.min_score = rawMinScore;
|
|
3837
|
+
}
|
|
3838
|
+
if (rawRequired === true) {
|
|
3839
|
+
result.required = true;
|
|
3840
|
+
} else if (typeof rawRequired === "number" && rawRequired > 0 && rawRequired <= 1) {
|
|
3841
|
+
if (result.min_score === void 0) {
|
|
3842
|
+
result.min_score = rawRequired;
|
|
3843
|
+
}
|
|
3844
|
+
result.required = rawRequired;
|
|
3845
|
+
logWarning2(
|
|
3846
|
+
`Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
|
|
3847
|
+
);
|
|
3848
|
+
}
|
|
3849
|
+
return result;
|
|
3706
3850
|
}
|
|
3707
3851
|
function validateWeight(rawWeight, evaluatorName, evalId) {
|
|
3708
3852
|
if (rawWeight === void 0) {
|
|
@@ -3745,16 +3889,30 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
3745
3889
|
const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
|
|
3746
3890
|
const expectedOutcome = asString(rawRubric.outcome) ?? "";
|
|
3747
3891
|
const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
|
|
3892
|
+
let minScore;
|
|
3748
3893
|
let requiredMinScore;
|
|
3749
3894
|
let required;
|
|
3750
|
-
if (typeof rawRubric.
|
|
3751
|
-
const
|
|
3752
|
-
if (
|
|
3895
|
+
if (typeof rawRubric.min_score === "number") {
|
|
3896
|
+
const ms = rawRubric.min_score;
|
|
3897
|
+
if (ms <= 0 || ms > 1) {
|
|
3753
3898
|
throw new Error(
|
|
3754
|
-
`Invalid
|
|
3899
|
+
`Invalid min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be in (0, 1] (got ${ms})`
|
|
3755
3900
|
);
|
|
3756
3901
|
}
|
|
3757
|
-
|
|
3902
|
+
minScore = ms;
|
|
3903
|
+
requiredMinScore = Math.round(ms * 10);
|
|
3904
|
+
} else if (typeof rawRubric.required_min_score === "number") {
|
|
3905
|
+
const rms = rawRubric.required_min_score;
|
|
3906
|
+
if (!Number.isInteger(rms) || rms < 0 || rms > 10) {
|
|
3907
|
+
throw new Error(
|
|
3908
|
+
`Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`
|
|
3909
|
+
);
|
|
3910
|
+
}
|
|
3911
|
+
requiredMinScore = rms;
|
|
3912
|
+
minScore = rms / 10;
|
|
3913
|
+
logWarning2(
|
|
3914
|
+
`Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. Use 'min_score: ${rms / 10}' (0-1 scale) instead.`
|
|
3915
|
+
);
|
|
3758
3916
|
}
|
|
3759
3917
|
if (typeof rawRubric.required === "boolean") {
|
|
3760
3918
|
required = rawRubric.required;
|
|
@@ -3774,6 +3932,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
3774
3932
|
weight,
|
|
3775
3933
|
...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
|
|
3776
3934
|
...required !== void 0 ? { required } : {},
|
|
3935
|
+
...minScore !== void 0 ? { min_score: minScore } : {},
|
|
3777
3936
|
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
|
|
3778
3937
|
score_ranges: scoreRanges
|
|
3779
3938
|
});
|
|
@@ -3790,6 +3949,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
3790
3949
|
weight,
|
|
3791
3950
|
// Default to required: true if not specified (backward compatibility)
|
|
3792
3951
|
required: required ?? true,
|
|
3952
|
+
...minScore !== void 0 ? { min_score: minScore } : {},
|
|
3793
3953
|
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
|
|
3794
3954
|
});
|
|
3795
3955
|
}
|
|
@@ -3918,12 +4078,22 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
3918
4078
|
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
3919
4079
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1
|
|
3920
4080
|
};
|
|
4081
|
+
let inlineMinScore;
|
|
4082
|
+
let inlineRequiredMinScore;
|
|
4083
|
+
if (typeof rubric.min_score === "number") {
|
|
4084
|
+
inlineMinScore = rubric.min_score;
|
|
4085
|
+
inlineRequiredMinScore = Math.round(inlineMinScore * 10);
|
|
4086
|
+
} else if (typeof rubric.required_min_score === "number") {
|
|
4087
|
+
inlineRequiredMinScore = rubric.required_min_score;
|
|
4088
|
+
inlineMinScore = inlineRequiredMinScore / 10;
|
|
4089
|
+
}
|
|
3921
4090
|
if (scoreRanges && scoreRanges.length > 0) {
|
|
3922
4091
|
return {
|
|
3923
4092
|
...baseRubric,
|
|
3924
4093
|
...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
|
|
3925
4094
|
...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
|
|
3926
|
-
...
|
|
4095
|
+
...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
|
|
4096
|
+
...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {},
|
|
3927
4097
|
score_ranges: scoreRanges
|
|
3928
4098
|
};
|
|
3929
4099
|
}
|
|
@@ -3931,7 +4101,8 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
3931
4101
|
...baseRubric,
|
|
3932
4102
|
outcome: expectedOutcome,
|
|
3933
4103
|
required: typeof rubric.required === "boolean" ? rubric.required : true,
|
|
3934
|
-
...
|
|
4104
|
+
...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
|
|
4105
|
+
...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {}
|
|
3935
4106
|
};
|
|
3936
4107
|
}).filter((r) => r.outcome && r.outcome.length > 0 || "score_ranges" in r && r.score_ranges);
|
|
3937
4108
|
if (rubricItems.length === 0) {
|
|
@@ -4335,6 +4506,9 @@ function resolveExpectedMessages(raw) {
|
|
|
4335
4506
|
var ANSI_YELLOW6 = "\x1B[33m";
|
|
4336
4507
|
var ANSI_RED2 = "\x1B[31m";
|
|
4337
4508
|
var ANSI_RESET7 = "\x1B[0m";
|
|
4509
|
+
function matchesFilter(id, filter) {
|
|
4510
|
+
return typeof filter === "string" ? import_micromatch.default.isMatch(id, filter) : filter.some((pattern) => import_micromatch.default.isMatch(id, pattern));
|
|
4511
|
+
}
|
|
4338
4512
|
function detectFormat(filePath) {
|
|
4339
4513
|
const ext = import_node_path7.default.extname(filePath).toLowerCase();
|
|
4340
4514
|
if (ext === ".jsonl") return "jsonl";
|
|
@@ -4402,40 +4576,40 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4402
4576
|
const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
|
|
4403
4577
|
const rawFile = await (0, import_promises7.readFile)(absoluteTestPath, "utf8");
|
|
4404
4578
|
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
4405
|
-
const
|
|
4406
|
-
const
|
|
4579
|
+
const fallbackSuiteName = import_node_path7.default.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
4580
|
+
const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
|
|
4407
4581
|
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
|
|
4408
4582
|
const globalExecution = sidecar.execution;
|
|
4409
4583
|
if (verbose) {
|
|
4410
4584
|
console.log(`
|
|
4411
|
-
[JSONL
|
|
4585
|
+
[JSONL Suite: ${evalFilePath}]`);
|
|
4412
4586
|
console.log(` Cases: ${rawCases.length}`);
|
|
4413
|
-
console.log(`
|
|
4587
|
+
console.log(` Suite: ${suiteName}`);
|
|
4414
4588
|
if (sidecar.description) {
|
|
4415
4589
|
console.log(` Description: ${sidecar.description}`);
|
|
4416
4590
|
}
|
|
4417
4591
|
}
|
|
4418
4592
|
const results = [];
|
|
4419
4593
|
for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
|
|
4420
|
-
const
|
|
4594
|
+
const testCaseConfig = rawCases[lineIndex];
|
|
4421
4595
|
const lineNumber = lineIndex + 1;
|
|
4422
|
-
const id = asString4(
|
|
4423
|
-
if (filterPattern && (!id || !
|
|
4596
|
+
const id = asString4(testCaseConfig.id);
|
|
4597
|
+
if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
|
|
4424
4598
|
continue;
|
|
4425
4599
|
}
|
|
4426
|
-
const conversationId = asString4(
|
|
4427
|
-
let outcome = asString4(
|
|
4428
|
-
if (!outcome &&
|
|
4429
|
-
outcome = asString4(
|
|
4600
|
+
const conversationId = asString4(testCaseConfig.conversation_id);
|
|
4601
|
+
let outcome = asString4(testCaseConfig.criteria);
|
|
4602
|
+
if (!outcome && testCaseConfig.expected_outcome !== void 0) {
|
|
4603
|
+
outcome = asString4(testCaseConfig.expected_outcome);
|
|
4430
4604
|
if (outcome) {
|
|
4431
4605
|
logWarning4(
|
|
4432
|
-
`Test '${asString4(
|
|
4606
|
+
`Test '${asString4(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
|
|
4433
4607
|
);
|
|
4434
4608
|
}
|
|
4435
4609
|
}
|
|
4436
|
-
const rawInputMessages = resolveInputMessages(
|
|
4437
|
-
const expectedMessages = resolveExpectedMessages(
|
|
4438
|
-
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 ||
|
|
4610
|
+
const rawInputMessages = resolveInputMessages(testCaseConfig);
|
|
4611
|
+
const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
|
|
4612
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== void 0;
|
|
4439
4613
|
if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
|
|
4440
4614
|
logError2(
|
|
4441
4615
|
`Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
|
|
@@ -4472,18 +4646,23 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4472
4646
|
}
|
|
4473
4647
|
}
|
|
4474
4648
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
4475
|
-
const caseExecution = isJsonObject(
|
|
4649
|
+
const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
|
|
4476
4650
|
const mergedExecution = caseExecution ?? globalExecution;
|
|
4477
|
-
const
|
|
4651
|
+
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
4478
4652
|
let evaluators;
|
|
4479
4653
|
try {
|
|
4480
|
-
evaluators = await parseEvaluators(
|
|
4654
|
+
evaluators = await parseEvaluators(
|
|
4655
|
+
testCaseConfig,
|
|
4656
|
+
mergedExecution,
|
|
4657
|
+
searchRoots,
|
|
4658
|
+
id ?? "unknown"
|
|
4659
|
+
);
|
|
4481
4660
|
} catch (error) {
|
|
4482
4661
|
const message = error instanceof Error ? error.message : String(error);
|
|
4483
4662
|
logError2(`Skipping test '${id}' at line ${lineNumber}: ${message}`);
|
|
4484
4663
|
continue;
|
|
4485
4664
|
}
|
|
4486
|
-
const inlineRubrics =
|
|
4665
|
+
const inlineRubrics = testCaseConfig.rubrics;
|
|
4487
4666
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
4488
4667
|
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
4489
4668
|
if (rubricEvaluator) {
|
|
@@ -4494,7 +4673,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4494
4673
|
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
4495
4674
|
const testCase = {
|
|
4496
4675
|
id,
|
|
4497
|
-
|
|
4676
|
+
suite: suiteName,
|
|
4498
4677
|
conversation_id: conversationId,
|
|
4499
4678
|
question,
|
|
4500
4679
|
input: inputMessages,
|
|
@@ -4502,7 +4681,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4502
4681
|
reference_answer: referenceAnswer,
|
|
4503
4682
|
file_paths: userFilePaths,
|
|
4504
4683
|
criteria: outcome ?? "",
|
|
4505
|
-
evaluator:
|
|
4684
|
+
evaluator: testCaseEvaluatorKind,
|
|
4506
4685
|
assertions: evaluators
|
|
4507
4686
|
};
|
|
4508
4687
|
results.push(testCase);
|
|
@@ -4687,6 +4866,9 @@ function buildChatPromptFromSegments(options) {
|
|
|
4687
4866
|
var ANSI_YELLOW7 = "\x1B[33m";
|
|
4688
4867
|
var ANSI_RED3 = "\x1B[31m";
|
|
4689
4868
|
var ANSI_RESET8 = "\x1B[0m";
|
|
4869
|
+
function matchesFilter2(id, filter) {
|
|
4870
|
+
return typeof filter === "string" ? import_micromatch2.default.isMatch(id, filter) : filter.some((pattern) => import_micromatch2.default.isMatch(id, pattern));
|
|
4871
|
+
}
|
|
4690
4872
|
function resolveTests(suite) {
|
|
4691
4873
|
if (suite.tests !== void 0) return suite.tests;
|
|
4692
4874
|
if (suite.eval_cases !== void 0) {
|
|
@@ -4766,18 +4948,18 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4766
4948
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
4767
4949
|
}
|
|
4768
4950
|
const suite = interpolated;
|
|
4769
|
-
const
|
|
4770
|
-
const
|
|
4771
|
-
const
|
|
4772
|
-
const
|
|
4951
|
+
const suiteNameFromFile = asString5(suite.name)?.trim();
|
|
4952
|
+
const fallbackSuiteName = import_node_path8.default.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
|
|
4953
|
+
const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
|
|
4954
|
+
const rawTestCases = resolveTests(suite);
|
|
4773
4955
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
|
|
4774
4956
|
const evalFileDir = import_node_path8.default.dirname(absoluteTestPath);
|
|
4775
|
-
let
|
|
4776
|
-
if (typeof
|
|
4777
|
-
const externalPath = import_node_path8.default.resolve(evalFileDir,
|
|
4778
|
-
|
|
4779
|
-
} else if (Array.isArray(
|
|
4780
|
-
|
|
4957
|
+
let expandedTestCases;
|
|
4958
|
+
if (typeof rawTestCases === "string") {
|
|
4959
|
+
const externalPath = import_node_path8.default.resolve(evalFileDir, rawTestCases);
|
|
4960
|
+
expandedTestCases = await loadCasesFromFile(externalPath);
|
|
4961
|
+
} else if (Array.isArray(rawTestCases)) {
|
|
4962
|
+
expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
|
|
4781
4963
|
} else {
|
|
4782
4964
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
|
|
4783
4965
|
}
|
|
@@ -4792,32 +4974,33 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4792
4974
|
}
|
|
4793
4975
|
const globalExecution = suiteAssertions !== void 0 ? { ...rawGlobalExecution ?? {}, assertions: suiteAssertions } : rawGlobalExecution;
|
|
4794
4976
|
const results = [];
|
|
4795
|
-
for (const
|
|
4796
|
-
if (!isJsonObject(
|
|
4977
|
+
for (const rawTestCase of expandedTestCases) {
|
|
4978
|
+
if (!isJsonObject(rawTestCase)) {
|
|
4797
4979
|
logWarning5("Skipping invalid test entry (expected object)");
|
|
4798
4980
|
continue;
|
|
4799
4981
|
}
|
|
4800
|
-
const
|
|
4801
|
-
const id = asString5(
|
|
4802
|
-
if (filterPattern && (!id || !
|
|
4982
|
+
const testCaseConfig = rawTestCase;
|
|
4983
|
+
const id = asString5(testCaseConfig.id);
|
|
4984
|
+
if (filterPattern && (!id || !matchesFilter2(id, filterPattern))) {
|
|
4803
4985
|
continue;
|
|
4804
4986
|
}
|
|
4805
|
-
const conversationId = asString5(
|
|
4806
|
-
let outcome = asString5(
|
|
4807
|
-
if (!outcome &&
|
|
4808
|
-
outcome = asString5(
|
|
4987
|
+
const conversationId = asString5(testCaseConfig.conversation_id);
|
|
4988
|
+
let outcome = asString5(testCaseConfig.criteria);
|
|
4989
|
+
if (!outcome && testCaseConfig.expected_outcome !== void 0) {
|
|
4990
|
+
outcome = asString5(testCaseConfig.expected_outcome);
|
|
4809
4991
|
if (outcome) {
|
|
4810
4992
|
logWarning5(
|
|
4811
|
-
`Test '${asString5(
|
|
4993
|
+
`Test '${asString5(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
|
|
4812
4994
|
);
|
|
4813
4995
|
}
|
|
4814
4996
|
}
|
|
4815
|
-
const caseExecution = isJsonObject(
|
|
4997
|
+
const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
|
|
4816
4998
|
const skipDefaults = caseExecution?.skip_defaults === true;
|
|
4999
|
+
const caseThreshold = typeof caseExecution?.threshold === "number" && caseExecution.threshold >= 0 && caseExecution.threshold <= 1 ? caseExecution.threshold : void 0;
|
|
4817
5000
|
const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
|
|
4818
|
-
const testInputMessages = resolveInputMessages(
|
|
4819
|
-
const expectedMessages = resolveExpectedMessages(
|
|
4820
|
-
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 ||
|
|
5001
|
+
const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
|
|
5002
|
+
const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
|
|
5003
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
|
|
4821
5004
|
if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
|
|
4822
5005
|
logError3(
|
|
4823
5006
|
`Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
|
|
@@ -4864,16 +5047,21 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4864
5047
|
}
|
|
4865
5048
|
}
|
|
4866
5049
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
4867
|
-
const
|
|
5050
|
+
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
4868
5051
|
let evaluators;
|
|
4869
5052
|
try {
|
|
4870
|
-
evaluators = await parseEvaluators(
|
|
5053
|
+
evaluators = await parseEvaluators(
|
|
5054
|
+
testCaseConfig,
|
|
5055
|
+
globalExecution,
|
|
5056
|
+
searchRoots,
|
|
5057
|
+
id ?? "unknown"
|
|
5058
|
+
);
|
|
4871
5059
|
} catch (error) {
|
|
4872
5060
|
const message = error instanceof Error ? error.message : String(error);
|
|
4873
5061
|
logError3(`Skipping test '${id}': ${message}`);
|
|
4874
5062
|
continue;
|
|
4875
5063
|
}
|
|
4876
|
-
const inlineRubrics =
|
|
5064
|
+
const inlineRubrics = testCaseConfig.rubrics;
|
|
4877
5065
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
4878
5066
|
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
4879
5067
|
if (rubricEvaluator) {
|
|
@@ -4882,13 +5070,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4882
5070
|
}
|
|
4883
5071
|
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
4884
5072
|
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
4885
|
-
const caseWorkspace = await resolveWorkspaceConfig(
|
|
5073
|
+
const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir);
|
|
4886
5074
|
const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
|
|
4887
|
-
const metadata = isJsonObject(
|
|
4888
|
-
const caseTargets = extractTargetsFromTestCase(
|
|
5075
|
+
const metadata = isJsonObject(testCaseConfig.metadata) ? testCaseConfig.metadata : void 0;
|
|
5076
|
+
const caseTargets = extractTargetsFromTestCase(testCaseConfig);
|
|
4889
5077
|
const testCase = {
|
|
4890
5078
|
id,
|
|
4891
|
-
|
|
5079
|
+
suite: suiteName,
|
|
4892
5080
|
category: options?.category,
|
|
4893
5081
|
conversation_id: conversationId,
|
|
4894
5082
|
question,
|
|
@@ -4897,11 +5085,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4897
5085
|
reference_answer: referenceAnswer,
|
|
4898
5086
|
file_paths: userFilePaths,
|
|
4899
5087
|
criteria: outcome ?? "",
|
|
4900
|
-
evaluator:
|
|
5088
|
+
evaluator: testCaseEvaluatorKind,
|
|
4901
5089
|
assertions: evaluators,
|
|
4902
5090
|
workspace: mergedWorkspace,
|
|
4903
5091
|
metadata,
|
|
4904
|
-
targets: caseTargets
|
|
5092
|
+
targets: caseTargets,
|
|
5093
|
+
...caseThreshold !== void 0 ? { threshold: caseThreshold } : {}
|
|
4905
5094
|
};
|
|
4906
5095
|
results.push(testCase);
|
|
4907
5096
|
}
|
|
@@ -5567,7 +5756,7 @@ var AzureProvider = class {
|
|
|
5567
5756
|
};
|
|
5568
5757
|
this.retryConfig = config.retry;
|
|
5569
5758
|
const azure = (0, import_azure2.createAzure)(buildAzureOptions(config));
|
|
5570
|
-
this.model = azure.chat(config.deploymentName);
|
|
5759
|
+
this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
|
|
5571
5760
|
}
|
|
5572
5761
|
id;
|
|
5573
5762
|
kind = "azure";
|
|
@@ -5693,7 +5882,9 @@ function buildAzureOptions(config) {
|
|
|
5693
5882
|
const options = {
|
|
5694
5883
|
apiKey: config.apiKey,
|
|
5695
5884
|
apiVersion: config.version,
|
|
5696
|
-
|
|
5885
|
+
// Chat completions still use deployment-scoped Azure URLs for compatibility
|
|
5886
|
+
// with existing deployments. Responses API should use the SDK's v1 path.
|
|
5887
|
+
useDeploymentBasedUrls: config.apiFormat !== "responses"
|
|
5697
5888
|
};
|
|
5698
5889
|
const baseURL = normalizeAzureBaseUrl(config.resourceName);
|
|
5699
5890
|
if (baseURL) {
|
|
@@ -9322,6 +9513,22 @@ function extractAzureResourceName(baseUrl) {
|
|
|
9322
9513
|
if (urlMatch) return urlMatch[1];
|
|
9323
9514
|
return baseUrl;
|
|
9324
9515
|
}
|
|
9516
|
+
function normalizeAzureSdkBaseUrl(baseUrl) {
|
|
9517
|
+
const trimmed = baseUrl.trim().replace(/\/+$/, "");
|
|
9518
|
+
if (!trimmed) {
|
|
9519
|
+
return trimmed;
|
|
9520
|
+
}
|
|
9521
|
+
if (!/^https?:\/\//i.test(trimmed)) {
|
|
9522
|
+
return `https://${trimmed}.openai.azure.com/openai/v1`;
|
|
9523
|
+
}
|
|
9524
|
+
if (/\/openai\/v1$/i.test(trimmed)) {
|
|
9525
|
+
return trimmed;
|
|
9526
|
+
}
|
|
9527
|
+
if (/\/openai$/i.test(trimmed)) {
|
|
9528
|
+
return `${trimmed}/v1`;
|
|
9529
|
+
}
|
|
9530
|
+
return `${trimmed}/openai/v1`;
|
|
9531
|
+
}
|
|
9325
9532
|
|
|
9326
9533
|
// src/evaluation/providers/pi-utils.ts
|
|
9327
9534
|
init_cjs_shims();
|
|
@@ -10156,9 +10363,40 @@ var import_node_child_process5 = require("child_process");
|
|
|
10156
10363
|
var import_node_crypto8 = require("crypto");
|
|
10157
10364
|
var import_node_fs10 = require("fs");
|
|
10158
10365
|
var import_promises19 = require("fs/promises");
|
|
10159
|
-
var
|
|
10366
|
+
var import_node_path23 = __toESM(require("path"), 1);
|
|
10160
10367
|
var import_node_readline = require("readline");
|
|
10161
10368
|
var import_node_url3 = require("url");
|
|
10369
|
+
|
|
10370
|
+
// src/paths.ts
|
|
10371
|
+
init_cjs_shims();
|
|
10372
|
+
var import_node_os6 = __toESM(require("os"), 1);
|
|
10373
|
+
var import_node_path22 = __toESM(require("path"), 1);
|
|
10374
|
+
var logged = false;
|
|
10375
|
+
function getAgentvHome() {
|
|
10376
|
+
const envHome = process.env.AGENTV_HOME;
|
|
10377
|
+
if (envHome && envHome !== "undefined") {
|
|
10378
|
+
if (!logged) {
|
|
10379
|
+
logged = true;
|
|
10380
|
+
console.warn(`Using AGENTV_HOME: ${envHome}`);
|
|
10381
|
+
}
|
|
10382
|
+
return envHome;
|
|
10383
|
+
}
|
|
10384
|
+
return import_node_path22.default.join(import_node_os6.default.homedir(), ".agentv");
|
|
10385
|
+
}
|
|
10386
|
+
function getWorkspacesRoot() {
|
|
10387
|
+
return import_node_path22.default.join(getAgentvHome(), "workspaces");
|
|
10388
|
+
}
|
|
10389
|
+
function getSubagentsRoot() {
|
|
10390
|
+
return import_node_path22.default.join(getAgentvHome(), "subagents");
|
|
10391
|
+
}
|
|
10392
|
+
function getTraceStateRoot() {
|
|
10393
|
+
return import_node_path22.default.join(getAgentvHome(), "trace-state");
|
|
10394
|
+
}
|
|
10395
|
+
function getWorkspacePoolRoot() {
|
|
10396
|
+
return import_node_path22.default.join(getAgentvHome(), "workspace-pool");
|
|
10397
|
+
}
|
|
10398
|
+
|
|
10399
|
+
// src/evaluation/providers/pi-coding-agent.ts
|
|
10162
10400
|
var piCodingAgentModule = null;
|
|
10163
10401
|
var piAiModule = null;
|
|
10164
10402
|
var loadingPromise = null;
|
|
@@ -10176,46 +10414,126 @@ async function promptInstall() {
|
|
|
10176
10414
|
rl.close();
|
|
10177
10415
|
}
|
|
10178
10416
|
}
|
|
10179
|
-
function
|
|
10180
|
-
|
|
10181
|
-
|
|
10182
|
-
|
|
10417
|
+
function findManagedSdkInstallRoot() {
|
|
10418
|
+
return import_node_path23.default.join(getAgentvHome(), "deps", "pi-sdk");
|
|
10419
|
+
}
|
|
10420
|
+
function resolveGlobalNpmRoot() {
|
|
10421
|
+
try {
|
|
10422
|
+
const root = (0, import_node_child_process5.execSync)("npm root -g", {
|
|
10423
|
+
encoding: "utf-8",
|
|
10424
|
+
stdio: ["ignore", "pipe", "ignore"]
|
|
10425
|
+
}).trim();
|
|
10426
|
+
return root.length > 0 ? root : void 0;
|
|
10427
|
+
} catch {
|
|
10428
|
+
return void 0;
|
|
10429
|
+
}
|
|
10430
|
+
}
|
|
10431
|
+
function buildGlobalModuleEntry(moduleName, globalNpmRoot) {
|
|
10432
|
+
return import_node_path23.default.join(globalNpmRoot, ...moduleName.split("/"), "dist", "index.js");
|
|
10433
|
+
}
|
|
10434
|
+
function findAccessiblePath(paths) {
|
|
10435
|
+
for (const candidate of paths) {
|
|
10183
10436
|
try {
|
|
10184
|
-
|
|
10185
|
-
|
|
10186
|
-
return dir;
|
|
10437
|
+
(0, import_node_fs10.accessSync)(candidate);
|
|
10438
|
+
return candidate;
|
|
10187
10439
|
} catch {
|
|
10188
|
-
const parent = import_node_path22.default.dirname(dir);
|
|
10189
|
-
if (parent === dir) break;
|
|
10190
|
-
dir = parent;
|
|
10191
10440
|
}
|
|
10192
10441
|
}
|
|
10193
|
-
return
|
|
10442
|
+
return void 0;
|
|
10194
10443
|
}
|
|
10195
|
-
async function
|
|
10444
|
+
async function tryImportLocalSdkModules() {
|
|
10196
10445
|
try {
|
|
10197
10446
|
[piCodingAgentModule, piAiModule] = await Promise.all([
|
|
10198
10447
|
import("@mariozechner/pi-coding-agent"),
|
|
10199
10448
|
import("@mariozechner/pi-ai")
|
|
10200
10449
|
]);
|
|
10450
|
+
return true;
|
|
10201
10451
|
} catch {
|
|
10202
|
-
|
|
10203
|
-
|
|
10204
|
-
|
|
10205
|
-
|
|
10206
|
-
|
|
10207
|
-
|
|
10208
|
-
|
|
10209
|
-
|
|
10210
|
-
|
|
10211
|
-
|
|
10212
|
-
|
|
10213
|
-
|
|
10214
|
-
|
|
10215
|
-
|
|
10216
|
-
|
|
10452
|
+
return false;
|
|
10453
|
+
}
|
|
10454
|
+
}
|
|
10455
|
+
async function tryImportManagedSdkModules() {
|
|
10456
|
+
const managedRoot = findManagedSdkInstallRoot();
|
|
10457
|
+
const piCodingAgentEntry = findAccessiblePath([
|
|
10458
|
+
import_node_path23.default.join(managedRoot, "node_modules", "@mariozechner", "pi-coding-agent", "dist", "index.js")
|
|
10459
|
+
]);
|
|
10460
|
+
const piAiEntry = findAccessiblePath([
|
|
10461
|
+
import_node_path23.default.join(managedRoot, "node_modules", "@mariozechner", "pi-ai", "dist", "index.js"),
|
|
10462
|
+
import_node_path23.default.join(
|
|
10463
|
+
managedRoot,
|
|
10464
|
+
"node_modules",
|
|
10465
|
+
"@mariozechner",
|
|
10466
|
+
"pi-coding-agent",
|
|
10467
|
+
"node_modules",
|
|
10468
|
+
"@mariozechner",
|
|
10469
|
+
"pi-ai",
|
|
10470
|
+
"dist",
|
|
10471
|
+
"index.js"
|
|
10472
|
+
)
|
|
10473
|
+
]);
|
|
10474
|
+
if (!piCodingAgentEntry || !piAiEntry) return false;
|
|
10475
|
+
try {
|
|
10476
|
+
[piCodingAgentModule, piAiModule] = await Promise.all([
|
|
10477
|
+
import((0, import_node_url3.pathToFileURL)(piCodingAgentEntry).href),
|
|
10478
|
+
import((0, import_node_url3.pathToFileURL)(piAiEntry).href)
|
|
10479
|
+
]);
|
|
10480
|
+
return true;
|
|
10481
|
+
} catch {
|
|
10482
|
+
return false;
|
|
10483
|
+
}
|
|
10484
|
+
}
|
|
10485
|
+
async function tryImportGlobalSdkModules() {
|
|
10486
|
+
const globalNpmRoot = resolveGlobalNpmRoot();
|
|
10487
|
+
if (!globalNpmRoot) return false;
|
|
10488
|
+
const piCodingAgentEntry = findAccessiblePath([
|
|
10489
|
+
buildGlobalModuleEntry("@mariozechner/pi-coding-agent", globalNpmRoot)
|
|
10490
|
+
]);
|
|
10491
|
+
const piAiEntry = findAccessiblePath([
|
|
10492
|
+
buildGlobalModuleEntry("@mariozechner/pi-ai", globalNpmRoot),
|
|
10493
|
+
import_node_path23.default.join(
|
|
10494
|
+
globalNpmRoot,
|
|
10495
|
+
"@mariozechner",
|
|
10496
|
+
"pi-coding-agent",
|
|
10497
|
+
"node_modules",
|
|
10498
|
+
"@mariozechner",
|
|
10499
|
+
"pi-ai",
|
|
10500
|
+
"dist",
|
|
10501
|
+
"index.js"
|
|
10502
|
+
)
|
|
10503
|
+
]);
|
|
10504
|
+
if (!piCodingAgentEntry || !piAiEntry) return false;
|
|
10505
|
+
try {
|
|
10506
|
+
[piCodingAgentModule, piAiModule] = await Promise.all([
|
|
10507
|
+
import((0, import_node_url3.pathToFileURL)(piCodingAgentEntry).href),
|
|
10508
|
+
import((0, import_node_url3.pathToFileURL)(piAiEntry).href)
|
|
10509
|
+
]);
|
|
10510
|
+
return true;
|
|
10511
|
+
} catch {
|
|
10512
|
+
return false;
|
|
10513
|
+
}
|
|
10514
|
+
}
|
|
10515
|
+
function installSdkModules(installDir) {
|
|
10516
|
+
console.error(`Installing @mariozechner/pi-coding-agent into ${installDir} via npm...`);
|
|
10517
|
+
(0, import_node_fs10.mkdirSync)(installDir, { recursive: true });
|
|
10518
|
+
(0, import_node_child_process5.execSync)("npm install --no-save --no-package-lock @mariozechner/pi-coding-agent", {
|
|
10519
|
+
cwd: installDir,
|
|
10520
|
+
stdio: "inherit"
|
|
10521
|
+
});
|
|
10522
|
+
}
|
|
10523
|
+
async function doLoadSdkModules() {
|
|
10524
|
+
if (await tryImportLocalSdkModules() || await tryImportManagedSdkModules() || await tryImportGlobalSdkModules()) {
|
|
10525
|
+
return;
|
|
10526
|
+
}
|
|
10527
|
+
if (await promptInstall()) {
|
|
10528
|
+
const installDir = findManagedSdkInstallRoot();
|
|
10529
|
+
installSdkModules(installDir);
|
|
10530
|
+
if (await tryImportManagedSdkModules()) {
|
|
10531
|
+
return;
|
|
10217
10532
|
}
|
|
10218
10533
|
}
|
|
10534
|
+
throw new Error(
|
|
10535
|
+
"pi-coding-agent SDK is not installed. Install it with:\n npm install @mariozechner/pi-coding-agent"
|
|
10536
|
+
);
|
|
10219
10537
|
}
|
|
10220
10538
|
async function loadSdkModules() {
|
|
10221
10539
|
if (!piCodingAgentModule || !piAiModule) {
|
|
@@ -10272,12 +10590,16 @@ var PiCodingAgentProvider = class {
|
|
|
10272
10590
|
try {
|
|
10273
10591
|
const cwd = this.resolveCwd(request.cwd);
|
|
10274
10592
|
const rawProvider = this.config.subprovider ?? "google";
|
|
10275
|
-
const
|
|
10593
|
+
const normalizedBaseUrl = this.normalizeSdkBaseUrl(rawProvider, this.config.baseUrl);
|
|
10594
|
+
const hasBaseUrl = !!normalizedBaseUrl;
|
|
10276
10595
|
const providerName = resolveSubprovider(rawProvider, hasBaseUrl);
|
|
10277
10596
|
const modelId = this.config.model ?? "gemini-2.5-flash";
|
|
10278
10597
|
this.setApiKeyEnv(rawProvider, hasBaseUrl);
|
|
10279
|
-
this.setBaseUrlEnv(rawProvider, hasBaseUrl);
|
|
10598
|
+
this.setBaseUrlEnv(rawProvider, normalizedBaseUrl, hasBaseUrl);
|
|
10280
10599
|
let model = sdk.getModel(providerName, modelId);
|
|
10600
|
+
if (model && normalizedBaseUrl) {
|
|
10601
|
+
model = { ...model, baseUrl: normalizedBaseUrl };
|
|
10602
|
+
}
|
|
10281
10603
|
if (!model) {
|
|
10282
10604
|
const envProvider = providerName.replace(/-responses$/, "");
|
|
10283
10605
|
model = {
|
|
@@ -10285,7 +10607,7 @@ var PiCodingAgentProvider = class {
|
|
|
10285
10607
|
name: modelId,
|
|
10286
10608
|
api: providerName,
|
|
10287
10609
|
provider: envProvider,
|
|
10288
|
-
baseUrl:
|
|
10610
|
+
baseUrl: normalizedBaseUrl ?? "",
|
|
10289
10611
|
reasoning: false,
|
|
10290
10612
|
input: ["text"],
|
|
10291
10613
|
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
|
@@ -10452,19 +10774,27 @@ ${fileList}`;
|
|
|
10452
10774
|
}
|
|
10453
10775
|
}
|
|
10454
10776
|
/** Maps config baseUrl to the provider-specific env var the SDK reads. */
|
|
10455
|
-
setBaseUrlEnv(providerName, hasBaseUrl = false) {
|
|
10456
|
-
|
|
10777
|
+
setBaseUrlEnv(providerName, baseUrl = this.config.baseUrl, hasBaseUrl = false) {
|
|
10778
|
+
const normalizedBaseUrl = this.normalizeSdkBaseUrl(providerName, baseUrl);
|
|
10779
|
+
if (!normalizedBaseUrl) return;
|
|
10457
10780
|
const envKey = resolveEnvBaseUrlName(providerName, hasBaseUrl);
|
|
10458
10781
|
if (envKey) {
|
|
10459
|
-
process.env[envKey] =
|
|
10782
|
+
process.env[envKey] = normalizedBaseUrl;
|
|
10460
10783
|
}
|
|
10461
10784
|
}
|
|
10785
|
+
normalizeSdkBaseUrl(providerName, baseUrl) {
|
|
10786
|
+
if (!baseUrl) return void 0;
|
|
10787
|
+
if (providerName.toLowerCase() === "azure") {
|
|
10788
|
+
return normalizeAzureSdkBaseUrl(baseUrl);
|
|
10789
|
+
}
|
|
10790
|
+
return baseUrl;
|
|
10791
|
+
}
|
|
10462
10792
|
resolveCwd(cwdOverride) {
|
|
10463
10793
|
if (cwdOverride) {
|
|
10464
|
-
return
|
|
10794
|
+
return import_node_path23.default.resolve(cwdOverride);
|
|
10465
10795
|
}
|
|
10466
10796
|
if (this.config.cwd) {
|
|
10467
|
-
return
|
|
10797
|
+
return import_node_path23.default.resolve(this.config.cwd);
|
|
10468
10798
|
}
|
|
10469
10799
|
return process.cwd();
|
|
10470
10800
|
}
|
|
@@ -10483,9 +10813,9 @@ ${fileList}`;
|
|
|
10483
10813
|
}
|
|
10484
10814
|
resolveLogDirectory() {
|
|
10485
10815
|
if (this.config.logDir) {
|
|
10486
|
-
return
|
|
10816
|
+
return import_node_path23.default.resolve(this.config.logDir);
|
|
10487
10817
|
}
|
|
10488
|
-
return
|
|
10818
|
+
return import_node_path23.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
10489
10819
|
}
|
|
10490
10820
|
async createStreamLogger(request) {
|
|
10491
10821
|
const logDir = this.resolveLogDirectory();
|
|
@@ -10499,7 +10829,7 @@ ${fileList}`;
|
|
|
10499
10829
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
10500
10830
|
return void 0;
|
|
10501
10831
|
}
|
|
10502
|
-
const filePath =
|
|
10832
|
+
const filePath = import_node_path23.default.join(logDir, buildLogFilename6(request, this.targetName));
|
|
10503
10833
|
try {
|
|
10504
10834
|
const logger = await PiStreamLogger2.create({
|
|
10505
10835
|
filePath,
|
|
@@ -10714,19 +11044,17 @@ var ProviderRegistry = class {
|
|
|
10714
11044
|
|
|
10715
11045
|
// src/evaluation/providers/targets.ts
|
|
10716
11046
|
init_cjs_shims();
|
|
10717
|
-
var
|
|
11047
|
+
var import_node_path24 = __toESM(require("path"), 1);
|
|
10718
11048
|
var import_zod3 = require("zod");
|
|
10719
11049
|
var CliHealthcheckHttpInputSchema = import_zod3.z.object({
|
|
10720
11050
|
url: import_zod3.z.string().min(1, "healthcheck URL is required"),
|
|
10721
|
-
timeout_seconds: import_zod3.z.number().positive().optional()
|
|
10722
|
-
|
|
10723
|
-
});
|
|
11051
|
+
timeout_seconds: import_zod3.z.number().positive().optional()
|
|
11052
|
+
}).passthrough();
|
|
10724
11053
|
var CliHealthcheckCommandInputSchema = import_zod3.z.object({
|
|
10725
11054
|
command: import_zod3.z.string().min(1, "healthcheck command is required"),
|
|
10726
11055
|
cwd: import_zod3.z.string().optional(),
|
|
10727
|
-
timeout_seconds: import_zod3.z.number().positive().optional()
|
|
10728
|
-
|
|
10729
|
-
});
|
|
11056
|
+
timeout_seconds: import_zod3.z.number().positive().optional()
|
|
11057
|
+
}).passthrough();
|
|
10730
11058
|
var CliHealthcheckInputSchema = import_zod3.z.union([
|
|
10731
11059
|
CliHealthcheckHttpInputSchema,
|
|
10732
11060
|
CliHealthcheckCommandInputSchema
|
|
@@ -10738,36 +11066,28 @@ var CliTargetInputSchema = import_zod3.z.object({
|
|
|
10738
11066
|
command: import_zod3.z.string(),
|
|
10739
11067
|
// Files format - optional
|
|
10740
11068
|
files_format: import_zod3.z.string().optional(),
|
|
10741
|
-
filesFormat: import_zod3.z.string().optional(),
|
|
10742
11069
|
attachments_format: import_zod3.z.string().optional(),
|
|
10743
|
-
attachmentsFormat: import_zod3.z.string().optional(),
|
|
10744
11070
|
// Working directory - optional
|
|
10745
11071
|
cwd: import_zod3.z.string().optional(),
|
|
10746
11072
|
// Workspace template directory - optional (mutually exclusive with cwd)
|
|
10747
11073
|
workspace_template: import_zod3.z.string().optional(),
|
|
10748
|
-
workspaceTemplate: import_zod3.z.string().optional(),
|
|
10749
11074
|
// Timeout in seconds - optional
|
|
10750
11075
|
timeout_seconds: import_zod3.z.number().positive().optional(),
|
|
10751
|
-
timeoutSeconds: import_zod3.z.number().positive().optional(),
|
|
10752
11076
|
// Healthcheck configuration - optional
|
|
10753
11077
|
healthcheck: CliHealthcheckInputSchema.optional(),
|
|
10754
11078
|
// Verbose mode - optional
|
|
10755
11079
|
verbose: import_zod3.z.boolean().optional(),
|
|
10756
11080
|
cli_verbose: import_zod3.z.boolean().optional(),
|
|
10757
|
-
cliVerbose: import_zod3.z.boolean().optional(),
|
|
10758
11081
|
// Keep temp files - optional
|
|
10759
11082
|
keep_temp_files: import_zod3.z.boolean().optional(),
|
|
10760
|
-
keepTempFiles: import_zod3.z.boolean().optional(),
|
|
10761
11083
|
keep_output_files: import_zod3.z.boolean().optional(),
|
|
10762
|
-
keepOutputFiles: import_zod3.z.boolean().optional(),
|
|
10763
11084
|
// Common target fields
|
|
10764
11085
|
grader_target: import_zod3.z.string().optional(),
|
|
10765
11086
|
judge_target: import_zod3.z.string().optional(),
|
|
10766
11087
|
// backward compat
|
|
10767
11088
|
workers: import_zod3.z.number().int().min(1).optional(),
|
|
10768
|
-
provider_batching: import_zod3.z.boolean().optional()
|
|
10769
|
-
|
|
10770
|
-
});
|
|
11089
|
+
provider_batching: import_zod3.z.boolean().optional()
|
|
11090
|
+
}).passthrough();
|
|
10771
11091
|
var CliHealthcheckHttpSchema = import_zod3.z.object({
|
|
10772
11092
|
url: import_zod3.z.string().min(1),
|
|
10773
11093
|
timeoutMs: import_zod3.z.number().positive().optional()
|
|
@@ -10792,7 +11112,7 @@ var CliTargetConfigSchema = import_zod3.z.object({
|
|
|
10792
11112
|
keepTempFiles: import_zod3.z.boolean().optional()
|
|
10793
11113
|
}).strict();
|
|
10794
11114
|
function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
10795
|
-
const timeoutSeconds = input.timeout_seconds
|
|
11115
|
+
const timeoutSeconds = input.timeout_seconds;
|
|
10796
11116
|
const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
|
|
10797
11117
|
if ("url" in input && input.url) {
|
|
10798
11118
|
const url = resolveString(input.url, env, `${targetName} healthcheck URL`);
|
|
@@ -10811,11 +11131,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
|
10811
11131
|
allowLiteral: true,
|
|
10812
11132
|
optionalEnv: true
|
|
10813
11133
|
});
|
|
10814
|
-
if (cwd && evalFilePath && !
|
|
10815
|
-
cwd =
|
|
11134
|
+
if (cwd && evalFilePath && !import_node_path24.default.isAbsolute(cwd)) {
|
|
11135
|
+
cwd = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), cwd);
|
|
10816
11136
|
}
|
|
10817
11137
|
if (!cwd && evalFilePath) {
|
|
10818
|
-
cwd =
|
|
11138
|
+
cwd = import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath));
|
|
10819
11139
|
}
|
|
10820
11140
|
return {
|
|
10821
11141
|
command,
|
|
@@ -10826,9 +11146,9 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
|
10826
11146
|
function normalizeCliTargetInput(input, env, evalFilePath) {
|
|
10827
11147
|
const targetName = input.name;
|
|
10828
11148
|
const command = resolveString(input.command, env, `${targetName} CLI command`, true);
|
|
10829
|
-
const filesFormatSource = input.files_format ?? input.
|
|
11149
|
+
const filesFormatSource = input.files_format ?? input.attachments_format;
|
|
10830
11150
|
const filesFormat = resolveOptionalLiteralString(filesFormatSource);
|
|
10831
|
-
const workspaceTemplateSource = input.workspace_template
|
|
11151
|
+
const workspaceTemplateSource = input.workspace_template;
|
|
10832
11152
|
let workspaceTemplate = resolveOptionalString(
|
|
10833
11153
|
workspaceTemplateSource,
|
|
10834
11154
|
env,
|
|
@@ -10838,15 +11158,15 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
|
|
|
10838
11158
|
optionalEnv: true
|
|
10839
11159
|
}
|
|
10840
11160
|
);
|
|
10841
|
-
if (workspaceTemplate && evalFilePath && !
|
|
10842
|
-
workspaceTemplate =
|
|
11161
|
+
if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
|
|
11162
|
+
workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
|
|
10843
11163
|
}
|
|
10844
11164
|
let cwd = resolveOptionalString(input.cwd, env, `${targetName} working directory`, {
|
|
10845
11165
|
allowLiteral: true,
|
|
10846
11166
|
optionalEnv: true
|
|
10847
11167
|
});
|
|
10848
|
-
if (cwd && evalFilePath && !
|
|
10849
|
-
cwd =
|
|
11168
|
+
if (cwd && evalFilePath && !import_node_path24.default.isAbsolute(cwd)) {
|
|
11169
|
+
cwd = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), cwd);
|
|
10850
11170
|
}
|
|
10851
11171
|
if (cwd && workspaceTemplate) {
|
|
10852
11172
|
throw new Error(
|
|
@@ -10854,14 +11174,12 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
|
|
|
10854
11174
|
);
|
|
10855
11175
|
}
|
|
10856
11176
|
if (!cwd && !workspaceTemplate && evalFilePath) {
|
|
10857
|
-
cwd =
|
|
11177
|
+
cwd = import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath));
|
|
10858
11178
|
}
|
|
10859
|
-
const timeoutSeconds = input.timeout_seconds
|
|
11179
|
+
const timeoutSeconds = input.timeout_seconds;
|
|
10860
11180
|
const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
|
|
10861
|
-
const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose
|
|
10862
|
-
const keepTempFiles = resolveOptionalBoolean(
|
|
10863
|
-
input.keep_temp_files ?? input.keepTempFiles ?? input.keep_output_files ?? input.keepOutputFiles
|
|
10864
|
-
);
|
|
11181
|
+
const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose);
|
|
11182
|
+
const keepTempFiles = resolveOptionalBoolean(input.keep_temp_files ?? input.keep_output_files);
|
|
10865
11183
|
const healthcheck = input.healthcheck ? normalizeCliHealthcheck(input.healthcheck, env, targetName, evalFilePath) : void 0;
|
|
10866
11184
|
return {
|
|
10867
11185
|
command,
|
|
@@ -10882,14 +11200,104 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
|
10882
11200
|
"FILES",
|
|
10883
11201
|
"OUTPUT_FILE"
|
|
10884
11202
|
]);
|
|
11203
|
+
var DEPRECATED_TARGET_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
|
|
11204
|
+
["providerBatching", "provider_batching"],
|
|
11205
|
+
["subagentModeAllowed", "subagent_mode_allowed"],
|
|
11206
|
+
["fallbackTargets", "fallback_targets"],
|
|
11207
|
+
["resourceName", "endpoint"],
|
|
11208
|
+
["baseUrl", "base_url"],
|
|
11209
|
+
["apiKey", "api_key"],
|
|
11210
|
+
["deploymentName", "model"],
|
|
11211
|
+
["thinkingBudget", "thinking_budget"],
|
|
11212
|
+
["maxTokens", "max_output_tokens"],
|
|
11213
|
+
["apiFormat", "api_format"],
|
|
11214
|
+
["timeoutSeconds", "timeout_seconds"],
|
|
11215
|
+
["logDir", "log_dir"],
|
|
11216
|
+
["logDirectory", "log_directory"],
|
|
11217
|
+
["logFormat", "log_format"],
|
|
11218
|
+
["logOutputFormat", "log_output_format"],
|
|
11219
|
+
["systemPrompt", "system_prompt"],
|
|
11220
|
+
["maxTurns", "max_turns"],
|
|
11221
|
+
["maxBudgetUsd", "max_budget_usd"],
|
|
11222
|
+
["dryRun", "dry_run"],
|
|
11223
|
+
["subagentRoot", "subagent_root"],
|
|
11224
|
+
["filesFormat", "files_format"],
|
|
11225
|
+
["attachmentsFormat", "attachments_format"],
|
|
11226
|
+
["cliUrl", "cli_url"],
|
|
11227
|
+
["cliPath", "cli_path"],
|
|
11228
|
+
["githubToken", "github_token"],
|
|
11229
|
+
["sessionDir", "session_dir"],
|
|
11230
|
+
["sessionId", "session_id"],
|
|
11231
|
+
["sessionStateDir", "session_state_dir"],
|
|
11232
|
+
["maxRetries", "max_retries"],
|
|
11233
|
+
["retryInitialDelayMs", "retry_initial_delay_ms"],
|
|
11234
|
+
["retryMaxDelayMs", "retry_max_delay_ms"],
|
|
11235
|
+
["retryBackoffFactor", "retry_backoff_factor"],
|
|
11236
|
+
["retryStatusCodes", "retry_status_codes"]
|
|
11237
|
+
]);
|
|
11238
|
+
var DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
|
|
11239
|
+
["timeoutSeconds", "timeout_seconds"]
|
|
11240
|
+
]);
|
|
11241
|
+
function collectDeprecatedCamelCaseWarnings(value, location, aliases) {
|
|
11242
|
+
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
11243
|
+
return [];
|
|
11244
|
+
}
|
|
11245
|
+
const warnings = [];
|
|
11246
|
+
for (const [camelCaseField, snakeCaseField] of aliases) {
|
|
11247
|
+
if (Object.prototype.hasOwnProperty.call(value, camelCaseField)) {
|
|
11248
|
+
warnings.push({
|
|
11249
|
+
location: `${location}.${camelCaseField}`,
|
|
11250
|
+
message: `camelCase field '${camelCaseField}' is no longer supported in targets.yaml. Use '${snakeCaseField}' instead.`
|
|
11251
|
+
});
|
|
11252
|
+
}
|
|
11253
|
+
}
|
|
11254
|
+
return warnings;
|
|
11255
|
+
}
|
|
11256
|
+
function assertNoDeprecatedCamelCaseTargetFields(definition) {
|
|
11257
|
+
if (Object.prototype.hasOwnProperty.call(definition, "workspaceTemplate")) {
|
|
11258
|
+
throw new Error(
|
|
11259
|
+
`${definition.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
|
|
11260
|
+
);
|
|
11261
|
+
}
|
|
11262
|
+
const warning = findDeprecatedCamelCaseTargetWarnings(
|
|
11263
|
+
definition,
|
|
11264
|
+
`target "${definition.name}"`
|
|
11265
|
+
)[0];
|
|
11266
|
+
if (!warning) {
|
|
11267
|
+
return;
|
|
11268
|
+
}
|
|
11269
|
+
const fieldMatch = warning.message.match(/field '([^']+)'/);
|
|
11270
|
+
const replacementMatch = warning.message.match(/Use '([^']+)' instead/);
|
|
11271
|
+
const field = fieldMatch?.[1] ?? "unknown";
|
|
11272
|
+
const replacement = replacementMatch?.[1] ?? "snake_case";
|
|
11273
|
+
throw new Error(
|
|
11274
|
+
`${warning.location}: camelCase field '${field}' is no longer supported in targets.yaml. Use '${replacement}' instead.`
|
|
11275
|
+
);
|
|
11276
|
+
}
|
|
11277
|
+
function findDeprecatedCamelCaseTargetWarnings(target, location) {
|
|
11278
|
+
const warnings = collectDeprecatedCamelCaseWarnings(
|
|
11279
|
+
target,
|
|
11280
|
+
location,
|
|
11281
|
+
DEPRECATED_TARGET_CAMEL_CASE_FIELDS
|
|
11282
|
+
);
|
|
11283
|
+
if (typeof target !== "object" || target === null || Array.isArray(target)) {
|
|
11284
|
+
return warnings;
|
|
11285
|
+
}
|
|
11286
|
+
const healthcheck = target.healthcheck;
|
|
11287
|
+
warnings.push(
|
|
11288
|
+
...collectDeprecatedCamelCaseWarnings(
|
|
11289
|
+
healthcheck,
|
|
11290
|
+
`${location}.healthcheck`,
|
|
11291
|
+
DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS
|
|
11292
|
+
)
|
|
11293
|
+
);
|
|
11294
|
+
return warnings;
|
|
11295
|
+
}
|
|
10885
11296
|
var COMMON_TARGET_SETTINGS = [
|
|
10886
11297
|
"use_target",
|
|
10887
11298
|
"provider_batching",
|
|
10888
|
-
"providerBatching",
|
|
10889
11299
|
"subagent_mode_allowed",
|
|
10890
|
-
"
|
|
10891
|
-
"fallback_targets",
|
|
10892
|
-
"fallbackTargets"
|
|
11300
|
+
"fallback_targets"
|
|
10893
11301
|
];
|
|
10894
11302
|
var USE_TARGET_ENV_PATTERN = /^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i;
|
|
10895
11303
|
var BASE_TARGET_SCHEMA = import_zod3.z.object({
|
|
@@ -10901,43 +11309,40 @@ var BASE_TARGET_SCHEMA = import_zod3.z.object({
|
|
|
10901
11309
|
// backward compat
|
|
10902
11310
|
workers: import_zod3.z.number().int().min(1).optional(),
|
|
10903
11311
|
workspace_template: import_zod3.z.string().optional(),
|
|
10904
|
-
workspaceTemplate: import_zod3.z.string().optional(),
|
|
10905
11312
|
subagent_mode_allowed: import_zod3.z.boolean().optional(),
|
|
10906
|
-
fallback_targets: import_zod3.z.array(import_zod3.z.string().min(1)).optional()
|
|
10907
|
-
fallbackTargets: import_zod3.z.array(import_zod3.z.string().min(1)).optional()
|
|
11313
|
+
fallback_targets: import_zod3.z.array(import_zod3.z.string().min(1)).optional()
|
|
10908
11314
|
}).passthrough();
|
|
10909
11315
|
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
|
|
11316
|
+
var DEFAULT_AZURE_RESPONSES_API_VERSION = "v1";
|
|
10910
11317
|
var DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
|
|
10911
|
-
function normalizeAzureApiVersion(value) {
|
|
11318
|
+
function normalizeAzureApiVersion(value, apiFormat) {
|
|
11319
|
+
const defaultVersion = apiFormat === "responses" ? DEFAULT_AZURE_RESPONSES_API_VERSION : DEFAULT_AZURE_API_VERSION;
|
|
10912
11320
|
if (!value) {
|
|
10913
|
-
return
|
|
11321
|
+
return defaultVersion;
|
|
10914
11322
|
}
|
|
10915
11323
|
const trimmed = value.trim();
|
|
10916
11324
|
if (trimmed.length === 0) {
|
|
10917
|
-
return
|
|
11325
|
+
return defaultVersion;
|
|
10918
11326
|
}
|
|
10919
11327
|
const withoutPrefix = trimmed.replace(/^api[-_]?version\s*=\s*/i, "").trim();
|
|
10920
|
-
return withoutPrefix.length > 0 ? withoutPrefix :
|
|
11328
|
+
return withoutPrefix.length > 0 ? withoutPrefix : defaultVersion;
|
|
10921
11329
|
}
|
|
10922
11330
|
function resolveRetryConfig(target) {
|
|
10923
|
-
const maxRetries = resolveOptionalNumber(
|
|
10924
|
-
target.max_retries ?? target.maxRetries,
|
|
10925
|
-
`${target.name} max retries`
|
|
10926
|
-
);
|
|
11331
|
+
const maxRetries = resolveOptionalNumber(target.max_retries, `${target.name} max retries`);
|
|
10927
11332
|
const initialDelayMs = resolveOptionalNumber(
|
|
10928
|
-
target.retry_initial_delay_ms
|
|
11333
|
+
target.retry_initial_delay_ms,
|
|
10929
11334
|
`${target.name} retry initial delay`
|
|
10930
11335
|
);
|
|
10931
11336
|
const maxDelayMs = resolveOptionalNumber(
|
|
10932
|
-
target.retry_max_delay_ms
|
|
11337
|
+
target.retry_max_delay_ms,
|
|
10933
11338
|
`${target.name} retry max delay`
|
|
10934
11339
|
);
|
|
10935
11340
|
const backoffFactor = resolveOptionalNumber(
|
|
10936
|
-
target.retry_backoff_factor
|
|
11341
|
+
target.retry_backoff_factor,
|
|
10937
11342
|
`${target.name} retry backoff factor`
|
|
10938
11343
|
);
|
|
10939
11344
|
const retryableStatusCodes = resolveOptionalNumberArray(
|
|
10940
|
-
target.retry_status_codes
|
|
11345
|
+
target.retry_status_codes,
|
|
10941
11346
|
`${target.name} retry status codes`
|
|
10942
11347
|
);
|
|
10943
11348
|
if (maxRetries === void 0 && initialDelayMs === void 0 && maxDelayMs === void 0 && backoffFactor === void 0 && retryableStatusCodes === void 0) {
|
|
@@ -10997,9 +11402,10 @@ function resolveDelegatedTargetDefinition(name, definitions, env = process.env)
|
|
|
10997
11402
|
`Target "${name}" exceeded the maximum use_target resolution depth (10). Check for a delegation loop or overly deep alias chain.`
|
|
10998
11403
|
);
|
|
10999
11404
|
}
|
|
11000
|
-
function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
11405
|
+
function resolveTargetDefinition(definition, env = process.env, evalFilePath, options) {
|
|
11406
|
+
assertNoDeprecatedCamelCaseTargetFields(definition);
|
|
11001
11407
|
const parsed = BASE_TARGET_SCHEMA.parse(definition);
|
|
11002
|
-
if (parsed.workspace_template !== void 0
|
|
11408
|
+
if (parsed.workspace_template !== void 0) {
|
|
11003
11409
|
throw new Error(
|
|
11004
11410
|
`${parsed.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
|
|
11005
11411
|
);
|
|
@@ -11015,13 +11421,9 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
11015
11421
|
`${parsed.name} provider`,
|
|
11016
11422
|
true
|
|
11017
11423
|
).toLowerCase();
|
|
11018
|
-
const providerBatching = resolveOptionalBoolean(
|
|
11019
|
-
|
|
11020
|
-
|
|
11021
|
-
const subagentModeAllowed = resolveOptionalBoolean(
|
|
11022
|
-
parsed.subagent_mode_allowed ?? parsed.subagentModeAllowed
|
|
11023
|
-
);
|
|
11024
|
-
const fallbackTargets = parsed.fallback_targets ?? parsed.fallbackTargets;
|
|
11424
|
+
const providerBatching = resolveOptionalBoolean(parsed.provider_batching);
|
|
11425
|
+
const subagentModeAllowed = resolveOptionalBoolean(parsed.subagent_mode_allowed);
|
|
11426
|
+
const fallbackTargets = parsed.fallback_targets;
|
|
11025
11427
|
const base = {
|
|
11026
11428
|
name: parsed.name,
|
|
11027
11429
|
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
@@ -11171,20 +11573,22 @@ function normalizeOpenAIBaseUrl(value) {
|
|
|
11171
11573
|
return trimmed.endsWith("/v1") ? trimmed : `${trimmed}/v1`;
|
|
11172
11574
|
}
|
|
11173
11575
|
function resolveAzureConfig(target, env) {
|
|
11174
|
-
const endpointSource = target.endpoint ?? target.resource
|
|
11175
|
-
const apiKeySource = target.api_key
|
|
11176
|
-
const deploymentSource = target.deployment ?? target.
|
|
11576
|
+
const endpointSource = target.endpoint ?? target.resource;
|
|
11577
|
+
const apiKeySource = target.api_key;
|
|
11578
|
+
const deploymentSource = target.deployment ?? target.model;
|
|
11177
11579
|
const versionSource = target.version ?? target.api_version;
|
|
11178
11580
|
const temperatureSource = target.temperature;
|
|
11179
|
-
const maxTokensSource = target.max_output_tokens
|
|
11581
|
+
const maxTokensSource = target.max_output_tokens;
|
|
11180
11582
|
const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
|
|
11181
11583
|
const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
|
|
11182
11584
|
const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
|
|
11585
|
+
const apiFormat = resolveApiFormat(target, env, target.name);
|
|
11183
11586
|
const version = normalizeAzureApiVersion(
|
|
11184
11587
|
resolveOptionalString(versionSource, env, `${target.name} api version`, {
|
|
11185
11588
|
allowLiteral: true,
|
|
11186
11589
|
optionalEnv: true
|
|
11187
|
-
})
|
|
11590
|
+
}),
|
|
11591
|
+
apiFormat
|
|
11188
11592
|
);
|
|
11189
11593
|
const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
|
|
11190
11594
|
const maxOutputTokens = resolveOptionalNumber(
|
|
@@ -11197,13 +11601,17 @@ function resolveAzureConfig(target, env) {
|
|
|
11197
11601
|
deploymentName,
|
|
11198
11602
|
apiKey,
|
|
11199
11603
|
version,
|
|
11604
|
+
apiFormat,
|
|
11200
11605
|
temperature,
|
|
11201
11606
|
maxOutputTokens,
|
|
11202
11607
|
retry
|
|
11203
11608
|
};
|
|
11204
11609
|
}
|
|
11205
|
-
function resolveApiFormat(target, targetName) {
|
|
11206
|
-
const raw = target.api_format
|
|
11610
|
+
function resolveApiFormat(target, env, targetName) {
|
|
11611
|
+
const raw = resolveOptionalString(target.api_format, env, `${targetName} api format`, {
|
|
11612
|
+
allowLiteral: true,
|
|
11613
|
+
optionalEnv: true
|
|
11614
|
+
});
|
|
11207
11615
|
if (raw === void 0) return void 0;
|
|
11208
11616
|
if (raw === "chat" || raw === "responses") return raw;
|
|
11209
11617
|
throw new Error(
|
|
@@ -11211,11 +11619,11 @@ function resolveApiFormat(target, targetName) {
|
|
|
11211
11619
|
);
|
|
11212
11620
|
}
|
|
11213
11621
|
function resolveOpenAIConfig(target, env) {
|
|
11214
|
-
const endpointSource = target.endpoint ?? target.base_url
|
|
11215
|
-
const apiKeySource = target.api_key
|
|
11622
|
+
const endpointSource = target.endpoint ?? target.base_url;
|
|
11623
|
+
const apiKeySource = target.api_key;
|
|
11216
11624
|
const modelSource = target.model ?? target.deployment ?? target.variant;
|
|
11217
11625
|
const temperatureSource = target.temperature;
|
|
11218
|
-
const maxTokensSource = target.max_output_tokens
|
|
11626
|
+
const maxTokensSource = target.max_output_tokens;
|
|
11219
11627
|
const baseURL = normalizeOpenAIBaseUrl(
|
|
11220
11628
|
resolveOptionalString(endpointSource, env, `${target.name} endpoint`, {
|
|
11221
11629
|
allowLiteral: true,
|
|
@@ -11229,17 +11637,17 @@ function resolveOpenAIConfig(target, env) {
|
|
|
11229
11637
|
baseURL,
|
|
11230
11638
|
apiKey,
|
|
11231
11639
|
model,
|
|
11232
|
-
apiFormat: resolveApiFormat(target, target.name),
|
|
11640
|
+
apiFormat: resolveApiFormat(target, env, target.name),
|
|
11233
11641
|
temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
|
|
11234
11642
|
maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
|
|
11235
11643
|
retry
|
|
11236
11644
|
};
|
|
11237
11645
|
}
|
|
11238
11646
|
function resolveOpenRouterConfig(target, env) {
|
|
11239
|
-
const apiKeySource = target.api_key
|
|
11647
|
+
const apiKeySource = target.api_key;
|
|
11240
11648
|
const modelSource = target.model ?? target.deployment ?? target.variant;
|
|
11241
11649
|
const temperatureSource = target.temperature;
|
|
11242
|
-
const maxTokensSource = target.max_output_tokens
|
|
11650
|
+
const maxTokensSource = target.max_output_tokens;
|
|
11243
11651
|
const retry = resolveRetryConfig(target);
|
|
11244
11652
|
return {
|
|
11245
11653
|
apiKey: resolveString(apiKeySource, env, `${target.name} OpenRouter api key`),
|
|
@@ -11250,11 +11658,11 @@ function resolveOpenRouterConfig(target, env) {
|
|
|
11250
11658
|
};
|
|
11251
11659
|
}
|
|
11252
11660
|
function resolveAnthropicConfig(target, env) {
|
|
11253
|
-
const apiKeySource = target.api_key
|
|
11661
|
+
const apiKeySource = target.api_key;
|
|
11254
11662
|
const modelSource = target.model ?? target.deployment ?? target.variant;
|
|
11255
11663
|
const temperatureSource = target.temperature;
|
|
11256
|
-
const maxTokensSource = target.max_output_tokens
|
|
11257
|
-
const thinkingBudgetSource = target.thinking_budget
|
|
11664
|
+
const maxTokensSource = target.max_output_tokens;
|
|
11665
|
+
const thinkingBudgetSource = target.thinking_budget;
|
|
11258
11666
|
const apiKey = resolveString(apiKeySource, env, `${target.name} Anthropic api key`);
|
|
11259
11667
|
const model = resolveString(modelSource, env, `${target.name} Anthropic model`);
|
|
11260
11668
|
const retry = resolveRetryConfig(target);
|
|
@@ -11268,10 +11676,10 @@ function resolveAnthropicConfig(target, env) {
|
|
|
11268
11676
|
};
|
|
11269
11677
|
}
|
|
11270
11678
|
function resolveGeminiConfig(target, env) {
|
|
11271
|
-
const apiKeySource = target.api_key
|
|
11679
|
+
const apiKeySource = target.api_key;
|
|
11272
11680
|
const modelSource = target.model ?? target.deployment ?? target.variant;
|
|
11273
11681
|
const temperatureSource = target.temperature;
|
|
11274
|
-
const maxTokensSource = target.max_output_tokens
|
|
11682
|
+
const maxTokensSource = target.max_output_tokens;
|
|
11275
11683
|
const apiKey = resolveString(apiKeySource, env, `${target.name} Google API key`);
|
|
11276
11684
|
const model = resolveOptionalString(modelSource, env, `${target.name} Gemini model`, {
|
|
11277
11685
|
allowLiteral: true,
|
|
@@ -11291,11 +11699,11 @@ function resolveCodexConfig(target, env, evalFilePath) {
|
|
|
11291
11699
|
const executableSource = target.executable ?? target.command ?? target.binary;
|
|
11292
11700
|
const argsSource = target.args ?? target.arguments;
|
|
11293
11701
|
const cwdSource = target.cwd;
|
|
11294
|
-
const workspaceTemplateSource = target.workspace_template
|
|
11295
|
-
const timeoutSource = target.timeout_seconds
|
|
11296
|
-
const logDirSource = target.log_dir ?? target.
|
|
11297
|
-
const logFormatSource = target.log_format ?? target.
|
|
11298
|
-
const systemPromptSource = target.system_prompt
|
|
11702
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
11703
|
+
const timeoutSource = target.timeout_seconds;
|
|
11704
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
11705
|
+
const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CODEX_LOG_FORMAT;
|
|
11706
|
+
const systemPromptSource = target.system_prompt;
|
|
11299
11707
|
const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, {
|
|
11300
11708
|
allowLiteral: true,
|
|
11301
11709
|
optionalEnv: true
|
|
@@ -11318,8 +11726,8 @@ function resolveCodexConfig(target, env, evalFilePath) {
|
|
|
11318
11726
|
optionalEnv: true
|
|
11319
11727
|
}
|
|
11320
11728
|
);
|
|
11321
|
-
if (workspaceTemplate && evalFilePath && !
|
|
11322
|
-
workspaceTemplate =
|
|
11729
|
+
if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
|
|
11730
|
+
workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
|
|
11323
11731
|
}
|
|
11324
11732
|
if (cwd && workspaceTemplate) {
|
|
11325
11733
|
throw new Error(
|
|
@@ -11359,16 +11767,16 @@ function normalizeCodexLogFormat(value) {
|
|
|
11359
11767
|
throw new Error("codex log format must be 'summary' or 'json'");
|
|
11360
11768
|
}
|
|
11361
11769
|
function resolveCopilotSdkConfig(target, env, evalFilePath) {
|
|
11362
|
-
const cliUrlSource = target.cli_url
|
|
11363
|
-
const cliPathSource = target.cli_path
|
|
11364
|
-
const githubTokenSource = target.github_token
|
|
11770
|
+
const cliUrlSource = target.cli_url;
|
|
11771
|
+
const cliPathSource = target.cli_path;
|
|
11772
|
+
const githubTokenSource = target.github_token;
|
|
11365
11773
|
const modelSource = target.model;
|
|
11366
11774
|
const cwdSource = target.cwd;
|
|
11367
|
-
const workspaceTemplateSource = target.workspace_template
|
|
11368
|
-
const timeoutSource = target.timeout_seconds
|
|
11369
|
-
const logDirSource = target.log_dir ?? target.
|
|
11370
|
-
const logFormatSource = target.log_format
|
|
11371
|
-
const systemPromptSource = target.system_prompt
|
|
11775
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
11776
|
+
const timeoutSource = target.timeout_seconds;
|
|
11777
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
11778
|
+
const logFormatSource = target.log_format;
|
|
11779
|
+
const systemPromptSource = target.system_prompt;
|
|
11372
11780
|
const cliUrl = resolveOptionalString(cliUrlSource, env, `${target.name} copilot-sdk cli URL`, {
|
|
11373
11781
|
allowLiteral: true,
|
|
11374
11782
|
optionalEnv: true
|
|
@@ -11403,8 +11811,8 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
|
|
|
11403
11811
|
optionalEnv: true
|
|
11404
11812
|
}
|
|
11405
11813
|
);
|
|
11406
|
-
if (workspaceTemplate && evalFilePath && !
|
|
11407
|
-
workspaceTemplate =
|
|
11814
|
+
if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
|
|
11815
|
+
workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
|
|
11408
11816
|
}
|
|
11409
11817
|
if (cwd && workspaceTemplate) {
|
|
11410
11818
|
throw new Error(
|
|
@@ -11441,11 +11849,11 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
|
|
|
11441
11849
|
const modelSource = target.model;
|
|
11442
11850
|
const argsSource = target.args ?? target.arguments;
|
|
11443
11851
|
const cwdSource = target.cwd;
|
|
11444
|
-
const workspaceTemplateSource = target.workspace_template
|
|
11445
|
-
const timeoutSource = target.timeout_seconds
|
|
11446
|
-
const logDirSource = target.log_dir ?? target.
|
|
11447
|
-
const logFormatSource = target.log_format
|
|
11448
|
-
const systemPromptSource = target.system_prompt
|
|
11852
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
11853
|
+
const timeoutSource = target.timeout_seconds;
|
|
11854
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
11855
|
+
const logFormatSource = target.log_format;
|
|
11856
|
+
const systemPromptSource = target.system_prompt;
|
|
11449
11857
|
const executable = resolveOptionalString(executableSource, env, `${target.name} copilot-cli executable`, {
|
|
11450
11858
|
allowLiteral: true,
|
|
11451
11859
|
optionalEnv: true
|
|
@@ -11468,8 +11876,8 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
|
|
|
11468
11876
|
optionalEnv: true
|
|
11469
11877
|
}
|
|
11470
11878
|
);
|
|
11471
|
-
if (workspaceTemplate && evalFilePath && !
|
|
11472
|
-
workspaceTemplate =
|
|
11879
|
+
if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
|
|
11880
|
+
workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
|
|
11473
11881
|
}
|
|
11474
11882
|
if (cwd && workspaceTemplate) {
|
|
11475
11883
|
throw new Error(
|
|
@@ -11509,16 +11917,16 @@ function normalizeCopilotLogFormat(value) {
|
|
|
11509
11917
|
}
|
|
11510
11918
|
function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
11511
11919
|
const subproviderSource = target.subprovider;
|
|
11512
|
-
const modelSource = target.model ?? target.pi_model
|
|
11513
|
-
const apiKeySource = target.api_key
|
|
11514
|
-
const toolsSource = target.tools ?? target.pi_tools
|
|
11515
|
-
const thinkingSource = target.thinking ?? target.pi_thinking
|
|
11920
|
+
const modelSource = target.model ?? target.pi_model;
|
|
11921
|
+
const apiKeySource = target.api_key;
|
|
11922
|
+
const toolsSource = target.tools ?? target.pi_tools;
|
|
11923
|
+
const thinkingSource = target.thinking ?? target.pi_thinking;
|
|
11516
11924
|
const cwdSource = target.cwd;
|
|
11517
|
-
const workspaceTemplateSource = target.workspace_template
|
|
11518
|
-
const timeoutSource = target.timeout_seconds
|
|
11519
|
-
const logDirSource = target.log_dir ?? target.
|
|
11520
|
-
const logFormatSource = target.log_format
|
|
11521
|
-
const systemPromptSource = target.system_prompt
|
|
11925
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
11926
|
+
const timeoutSource = target.timeout_seconds;
|
|
11927
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
11928
|
+
const logFormatSource = target.log_format;
|
|
11929
|
+
const systemPromptSource = target.system_prompt;
|
|
11522
11930
|
const subprovider = resolveOptionalString(
|
|
11523
11931
|
subproviderSource,
|
|
11524
11932
|
env,
|
|
@@ -11536,7 +11944,7 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
11536
11944
|
allowLiteral: false,
|
|
11537
11945
|
optionalEnv: true
|
|
11538
11946
|
});
|
|
11539
|
-
const baseUrlSource = target.base_url ?? target.
|
|
11947
|
+
const baseUrlSource = target.base_url ?? target.endpoint;
|
|
11540
11948
|
const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} pi base url`, {
|
|
11541
11949
|
allowLiteral: true,
|
|
11542
11950
|
optionalEnv: true
|
|
@@ -11562,8 +11970,8 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
11562
11970
|
optionalEnv: true
|
|
11563
11971
|
}
|
|
11564
11972
|
);
|
|
11565
|
-
if (workspaceTemplate && evalFilePath && !
|
|
11566
|
-
workspaceTemplate =
|
|
11973
|
+
if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
|
|
11974
|
+
workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
|
|
11567
11975
|
}
|
|
11568
11976
|
if (cwd && workspaceTemplate) {
|
|
11569
11977
|
throw new Error(
|
|
@@ -11595,16 +12003,16 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
11595
12003
|
function resolvePiCliConfig(target, env, evalFilePath) {
|
|
11596
12004
|
const executableSource = target.executable ?? target.command ?? target.binary;
|
|
11597
12005
|
const subproviderSource = target.subprovider;
|
|
11598
|
-
const modelSource = target.model ?? target.pi_model
|
|
11599
|
-
const apiKeySource = target.api_key
|
|
11600
|
-
const toolsSource = target.tools ?? target.pi_tools
|
|
11601
|
-
const thinkingSource = target.thinking ?? target.pi_thinking
|
|
12006
|
+
const modelSource = target.model ?? target.pi_model;
|
|
12007
|
+
const apiKeySource = target.api_key;
|
|
12008
|
+
const toolsSource = target.tools ?? target.pi_tools;
|
|
12009
|
+
const thinkingSource = target.thinking ?? target.pi_thinking;
|
|
11602
12010
|
const cwdSource = target.cwd;
|
|
11603
|
-
const workspaceTemplateSource = target.workspace_template
|
|
11604
|
-
const timeoutSource = target.timeout_seconds
|
|
11605
|
-
const logDirSource = target.log_dir ?? target.
|
|
11606
|
-
const logFormatSource = target.log_format
|
|
11607
|
-
const systemPromptSource = target.system_prompt
|
|
12011
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
12012
|
+
const timeoutSource = target.timeout_seconds;
|
|
12013
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
12014
|
+
const logFormatSource = target.log_format;
|
|
12015
|
+
const systemPromptSource = target.system_prompt;
|
|
11608
12016
|
const executable = resolveOptionalString(executableSource, env, `${target.name} pi-cli executable`, {
|
|
11609
12017
|
allowLiteral: true,
|
|
11610
12018
|
optionalEnv: true
|
|
@@ -11623,7 +12031,7 @@ function resolvePiCliConfig(target, env, evalFilePath) {
|
|
|
11623
12031
|
allowLiteral: false,
|
|
11624
12032
|
optionalEnv: true
|
|
11625
12033
|
});
|
|
11626
|
-
const baseUrlSource = target.base_url ?? target.
|
|
12034
|
+
const baseUrlSource = target.base_url ?? target.endpoint;
|
|
11627
12035
|
const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} pi-cli base url`, {
|
|
11628
12036
|
allowLiteral: true,
|
|
11629
12037
|
optionalEnv: true
|
|
@@ -11648,8 +12056,8 @@ function resolvePiCliConfig(target, env, evalFilePath) {
|
|
|
11648
12056
|
`${target.name} pi-cli workspace template`,
|
|
11649
12057
|
{ allowLiteral: true, optionalEnv: true }
|
|
11650
12058
|
);
|
|
11651
|
-
if (workspaceTemplate && evalFilePath && !
|
|
11652
|
-
workspaceTemplate =
|
|
12059
|
+
if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
|
|
12060
|
+
workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
|
|
11653
12061
|
}
|
|
11654
12062
|
if (cwd && workspaceTemplate) {
|
|
11655
12063
|
throw new Error(`${target.name}: 'cwd' and 'workspace_template' are mutually exclusive.`);
|
|
@@ -11681,11 +12089,11 @@ function resolvePiCliConfig(target, env, evalFilePath) {
|
|
|
11681
12089
|
function resolveClaudeConfig(target, env, evalFilePath) {
|
|
11682
12090
|
const modelSource = target.model;
|
|
11683
12091
|
const cwdSource = target.cwd;
|
|
11684
|
-
const workspaceTemplateSource = target.workspace_template
|
|
11685
|
-
const timeoutSource = target.timeout_seconds
|
|
11686
|
-
const logDirSource = target.log_dir ?? target.
|
|
11687
|
-
const logFormatSource = target.log_format ?? target.
|
|
11688
|
-
const systemPromptSource = target.system_prompt
|
|
12092
|
+
const workspaceTemplateSource = target.workspace_template;
|
|
12093
|
+
const timeoutSource = target.timeout_seconds;
|
|
12094
|
+
const logDirSource = target.log_dir ?? target.log_directory;
|
|
12095
|
+
const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CLAUDE_LOG_FORMAT;
|
|
12096
|
+
const systemPromptSource = target.system_prompt;
|
|
11689
12097
|
const model = resolveOptionalString(modelSource, env, `${target.name} claude model`, {
|
|
11690
12098
|
allowLiteral: true,
|
|
11691
12099
|
optionalEnv: true
|
|
@@ -11703,8 +12111,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
|
|
|
11703
12111
|
optionalEnv: true
|
|
11704
12112
|
}
|
|
11705
12113
|
);
|
|
11706
|
-
if (workspaceTemplate && evalFilePath && !
|
|
11707
|
-
workspaceTemplate =
|
|
12114
|
+
if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
|
|
12115
|
+
workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
|
|
11708
12116
|
}
|
|
11709
12117
|
if (cwd && workspaceTemplate) {
|
|
11710
12118
|
throw new Error(
|
|
@@ -11718,8 +12126,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
|
|
|
11718
12126
|
});
|
|
11719
12127
|
const logFormat = normalizeClaudeLogFormat(logFormatSource);
|
|
11720
12128
|
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
11721
|
-
const maxTurns = typeof target.max_turns === "number" ? target.max_turns :
|
|
11722
|
-
const maxBudgetUsd = typeof target.max_budget_usd === "number" ? target.max_budget_usd :
|
|
12129
|
+
const maxTurns = typeof target.max_turns === "number" ? target.max_turns : void 0;
|
|
12130
|
+
const maxBudgetUsd = typeof target.max_budget_usd === "number" ? target.max_budget_usd : void 0;
|
|
11723
12131
|
return {
|
|
11724
12132
|
model,
|
|
11725
12133
|
systemPrompt,
|
|
@@ -11750,9 +12158,7 @@ function resolveMockConfig(target) {
|
|
|
11750
12158
|
return { response };
|
|
11751
12159
|
}
|
|
11752
12160
|
function resolveVSCodeConfig(target, env, insiders, evalFilePath) {
|
|
11753
|
-
const workspaceTemplateEnvVar = resolveOptionalLiteralString(
|
|
11754
|
-
target.workspace_template ?? target.workspaceTemplate
|
|
11755
|
-
);
|
|
12161
|
+
const workspaceTemplateEnvVar = resolveOptionalLiteralString(target.workspace_template);
|
|
11756
12162
|
let workspaceTemplate = workspaceTemplateEnvVar ? resolveOptionalString(
|
|
11757
12163
|
workspaceTemplateEnvVar,
|
|
11758
12164
|
env,
|
|
@@ -11762,14 +12168,14 @@ function resolveVSCodeConfig(target, env, insiders, evalFilePath) {
|
|
|
11762
12168
|
optionalEnv: true
|
|
11763
12169
|
}
|
|
11764
12170
|
) : void 0;
|
|
11765
|
-
if (workspaceTemplate && evalFilePath && !
|
|
11766
|
-
workspaceTemplate =
|
|
12171
|
+
if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
|
|
12172
|
+
workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
|
|
11767
12173
|
}
|
|
11768
12174
|
const executableSource = target.executable;
|
|
11769
12175
|
const waitSource = target.wait;
|
|
11770
|
-
const dryRunSource = target.dry_run
|
|
11771
|
-
const subagentRootSource = target.subagent_root
|
|
11772
|
-
const timeoutSource = target.timeout_seconds
|
|
12176
|
+
const dryRunSource = target.dry_run;
|
|
12177
|
+
const subagentRootSource = target.subagent_root;
|
|
12178
|
+
const timeoutSource = target.timeout_seconds;
|
|
11773
12179
|
const defaultCommand = insiders ? "code-insiders" : "code";
|
|
11774
12180
|
const executable = resolveOptionalString(executableSource, env, `${target.name} vscode executable`, {
|
|
11775
12181
|
allowLiteral: true,
|
|
@@ -11804,8 +12210,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
11804
12210
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
11805
12211
|
if (!parseResult.success) {
|
|
11806
12212
|
const firstError = parseResult.error.errors[0];
|
|
11807
|
-
const
|
|
11808
|
-
const prefix =
|
|
12213
|
+
const path53 = firstError?.path.join(".") || "";
|
|
12214
|
+
const prefix = path53 ? `${target.name} ${path53}: ` : `${target.name}: `;
|
|
11809
12215
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
11810
12216
|
}
|
|
11811
12217
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -11820,17 +12226,17 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
11820
12226
|
}
|
|
11821
12227
|
function resolveDiscoveredProviderConfig(target, providerKind, env, evalFilePath) {
|
|
11822
12228
|
const command = target.command ? resolveString(target.command, env, `${target.name} command`, true) : `bun run .agentv/providers/${providerKind}.ts {PROMPT}`;
|
|
11823
|
-
const timeoutSeconds = target.timeout_seconds
|
|
12229
|
+
const timeoutSeconds = target.timeout_seconds;
|
|
11824
12230
|
const timeoutMs = resolveTimeoutMs(timeoutSeconds, `${target.name} timeout`);
|
|
11825
12231
|
let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
11826
12232
|
allowLiteral: true,
|
|
11827
12233
|
optionalEnv: true
|
|
11828
12234
|
});
|
|
11829
|
-
if (cwd && evalFilePath && !
|
|
11830
|
-
cwd =
|
|
12235
|
+
if (cwd && evalFilePath && !import_node_path24.default.isAbsolute(cwd)) {
|
|
12236
|
+
cwd = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), cwd);
|
|
11831
12237
|
}
|
|
11832
12238
|
if (!cwd && evalFilePath) {
|
|
11833
|
-
cwd =
|
|
12239
|
+
cwd = import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath));
|
|
11834
12240
|
}
|
|
11835
12241
|
return {
|
|
11836
12242
|
command,
|
|
@@ -11884,10 +12290,10 @@ function resolveDiscover(value, targetName) {
|
|
|
11884
12290
|
throw new Error(`Target "${targetName}": discover must be "latest" (got "${String(value)}")`);
|
|
11885
12291
|
}
|
|
11886
12292
|
function resolveCopilotLogConfig(target, env) {
|
|
11887
|
-
const sessionDirSource = target.session_dir
|
|
11888
|
-
const sessionIdSource = target.session_id
|
|
12293
|
+
const sessionDirSource = target.session_dir;
|
|
12294
|
+
const sessionIdSource = target.session_id;
|
|
11889
12295
|
const discoverSource = target.discover;
|
|
11890
|
-
const sessionStateDirSource = target.session_state_dir
|
|
12296
|
+
const sessionStateDirSource = target.session_state_dir;
|
|
11891
12297
|
const cwdSource = target.cwd;
|
|
11892
12298
|
return {
|
|
11893
12299
|
sessionDir: resolveOptionalString(
|
|
@@ -12068,7 +12474,7 @@ var import_node_path33 = __toESM(require("path"), 1);
|
|
|
12068
12474
|
init_cjs_shims();
|
|
12069
12475
|
var import_node_fs11 = require("fs");
|
|
12070
12476
|
var import_promises20 = require("fs/promises");
|
|
12071
|
-
var
|
|
12477
|
+
var import_node_path25 = __toESM(require("path"), 1);
|
|
12072
12478
|
async function pathExists(target) {
|
|
12073
12479
|
try {
|
|
12074
12480
|
await (0, import_promises20.access)(target, import_node_fs11.constants.F_OK);
|
|
@@ -12084,7 +12490,7 @@ async function readDirEntries(target) {
|
|
|
12084
12490
|
const entries = await (0, import_promises20.readdir)(target, { withFileTypes: true });
|
|
12085
12491
|
return entries.map((entry) => ({
|
|
12086
12492
|
name: entry.name,
|
|
12087
|
-
absolutePath:
|
|
12493
|
+
absolutePath: import_node_path25.default.join(target, entry.name),
|
|
12088
12494
|
isDirectory: entry.isDirectory()
|
|
12089
12495
|
}));
|
|
12090
12496
|
}
|
|
@@ -12100,9 +12506,9 @@ async function removeIfExists(target) {
|
|
|
12100
12506
|
|
|
12101
12507
|
// src/evaluation/providers/vscode/utils/path.ts
|
|
12102
12508
|
init_cjs_shims();
|
|
12103
|
-
var
|
|
12509
|
+
var import_node_path26 = __toESM(require("path"), 1);
|
|
12104
12510
|
function pathToFileUri2(filePath) {
|
|
12105
|
-
const absolutePath =
|
|
12511
|
+
const absolutePath = import_node_path26.default.isAbsolute(filePath) ? filePath : import_node_path26.default.resolve(filePath);
|
|
12106
12512
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
12107
12513
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
12108
12514
|
return `file:///${normalizedPath}`;
|
|
@@ -12112,7 +12518,7 @@ function pathToFileUri2(filePath) {
|
|
|
12112
12518
|
|
|
12113
12519
|
// src/evaluation/providers/vscode/dispatch/promptBuilder.ts
|
|
12114
12520
|
init_cjs_shims();
|
|
12115
|
-
var
|
|
12521
|
+
var import_node_path27 = __toESM(require("path"), 1);
|
|
12116
12522
|
|
|
12117
12523
|
// src/evaluation/providers/vscode/utils/template.ts
|
|
12118
12524
|
init_cjs_shims();
|
|
@@ -12206,8 +12612,8 @@ function createBatchRequestPrompt(userQuery, responseFileTmp, responseFileFinal,
|
|
|
12206
12612
|
});
|
|
12207
12613
|
}
|
|
12208
12614
|
function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateContent) {
|
|
12209
|
-
const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${
|
|
12210
|
-
const responseList = responseFiles.map((file) => `"${
|
|
12615
|
+
const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${import_node_path27.default.basename(file)}`).join("\n");
|
|
12616
|
+
const responseList = responseFiles.map((file) => `"${import_node_path27.default.basename(file)}"`).join(", ");
|
|
12211
12617
|
return renderTemplate2(templateContent, {
|
|
12212
12618
|
requestFiles: requestLines,
|
|
12213
12619
|
responseList
|
|
@@ -12217,7 +12623,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
|
|
|
12217
12623
|
// src/evaluation/providers/vscode/dispatch/responseWaiter.ts
|
|
12218
12624
|
init_cjs_shims();
|
|
12219
12625
|
var import_promises21 = require("fs/promises");
|
|
12220
|
-
var
|
|
12626
|
+
var import_node_path28 = __toESM(require("path"), 1);
|
|
12221
12627
|
|
|
12222
12628
|
// src/evaluation/providers/vscode/utils/time.ts
|
|
12223
12629
|
init_cjs_shims();
|
|
@@ -12277,7 +12683,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
|
|
|
12277
12683
|
}
|
|
12278
12684
|
async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, silent = false, timeoutMs = DEFAULT_TIMEOUT_MS) {
|
|
12279
12685
|
if (!silent) {
|
|
12280
|
-
const fileList = responseFilesFinal.map((file) =>
|
|
12686
|
+
const fileList = responseFilesFinal.map((file) => import_node_path28.default.basename(file)).join(", ");
|
|
12281
12687
|
console.error(`waiting for ${responseFilesFinal.length} batch response(s): ${fileList}`);
|
|
12282
12688
|
}
|
|
12283
12689
|
const deadline = Date.now() + timeoutMs;
|
|
@@ -12286,7 +12692,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
12286
12692
|
while (pending.size > 0) {
|
|
12287
12693
|
if (Date.now() >= deadline) {
|
|
12288
12694
|
if (!silent) {
|
|
12289
|
-
const remaining = [...pending].map((f) =>
|
|
12695
|
+
const remaining = [...pending].map((f) => import_node_path28.default.basename(f)).join(", ");
|
|
12290
12696
|
console.error(
|
|
12291
12697
|
`error: timed out after ${Math.round(timeoutMs / 1e3)}s waiting for batch responses. Still pending: ${remaining}`
|
|
12292
12698
|
);
|
|
@@ -12344,37 +12750,6 @@ var import_node_util2 = require("util");
|
|
|
12344
12750
|
// src/evaluation/providers/vscode/dispatch/constants.ts
|
|
12345
12751
|
init_cjs_shims();
|
|
12346
12752
|
var import_node_path29 = __toESM(require("path"), 1);
|
|
12347
|
-
|
|
12348
|
-
// src/paths.ts
|
|
12349
|
-
init_cjs_shims();
|
|
12350
|
-
var import_node_os6 = __toESM(require("os"), 1);
|
|
12351
|
-
var import_node_path28 = __toESM(require("path"), 1);
|
|
12352
|
-
var logged = false;
|
|
12353
|
-
function getAgentvHome() {
|
|
12354
|
-
const envHome = process.env.AGENTV_HOME;
|
|
12355
|
-
if (envHome && envHome !== "undefined") {
|
|
12356
|
-
if (!logged) {
|
|
12357
|
-
logged = true;
|
|
12358
|
-
console.warn(`Using AGENTV_HOME: ${envHome}`);
|
|
12359
|
-
}
|
|
12360
|
-
return envHome;
|
|
12361
|
-
}
|
|
12362
|
-
return import_node_path28.default.join(import_node_os6.default.homedir(), ".agentv");
|
|
12363
|
-
}
|
|
12364
|
-
function getWorkspacesRoot() {
|
|
12365
|
-
return import_node_path28.default.join(getAgentvHome(), "workspaces");
|
|
12366
|
-
}
|
|
12367
|
-
function getSubagentsRoot() {
|
|
12368
|
-
return import_node_path28.default.join(getAgentvHome(), "subagents");
|
|
12369
|
-
}
|
|
12370
|
-
function getTraceStateRoot() {
|
|
12371
|
-
return import_node_path28.default.join(getAgentvHome(), "trace-state");
|
|
12372
|
-
}
|
|
12373
|
-
function getWorkspacePoolRoot() {
|
|
12374
|
-
return import_node_path28.default.join(getAgentvHome(), "workspace-pool");
|
|
12375
|
-
}
|
|
12376
|
-
|
|
12377
|
-
// src/evaluation/providers/vscode/dispatch/constants.ts
|
|
12378
12753
|
var DEFAULT_LOCK_NAME = "subagent.lock";
|
|
12379
12754
|
var DEFAULT_ALIVE_FILENAME = ".alive";
|
|
12380
12755
|
function getDefaultSubagentRoot(vscodeCmd = "code") {
|
|
@@ -13527,6 +13902,15 @@ var AGENT_PROVIDER_KINDS = [
|
|
|
13527
13902
|
"vscode",
|
|
13528
13903
|
"vscode-insiders"
|
|
13529
13904
|
];
|
|
13905
|
+
var LLM_GRADER_CAPABLE_KINDS = [
|
|
13906
|
+
"openai",
|
|
13907
|
+
"openrouter",
|
|
13908
|
+
"azure",
|
|
13909
|
+
"anthropic",
|
|
13910
|
+
"gemini",
|
|
13911
|
+
"agentv",
|
|
13912
|
+
"mock"
|
|
13913
|
+
];
|
|
13530
13914
|
function extractLastAssistantContent(messages) {
|
|
13531
13915
|
if (!messages || messages.length === 0) {
|
|
13532
13916
|
return "";
|
|
@@ -13680,9 +14064,10 @@ init_cjs_shims();
|
|
|
13680
14064
|
|
|
13681
14065
|
// src/evaluation/evaluators/scoring.ts
|
|
13682
14066
|
init_cjs_shims();
|
|
13683
|
-
var
|
|
13684
|
-
|
|
13685
|
-
|
|
14067
|
+
var DEFAULT_THRESHOLD = 0.8;
|
|
14068
|
+
var PASS_THRESHOLD = DEFAULT_THRESHOLD;
|
|
14069
|
+
function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
|
|
14070
|
+
return score >= threshold ? "pass" : "fail";
|
|
13686
14071
|
}
|
|
13687
14072
|
function clampScore(value) {
|
|
13688
14073
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
@@ -13873,13 +14258,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
13873
14258
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
13874
14259
|
const { mkdir: mkdir17, readFile: readFile17, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
|
|
13875
14260
|
const { tmpdir: tmpdir3 } = await import("os");
|
|
13876
|
-
const
|
|
14261
|
+
const path53 = await import("path");
|
|
13877
14262
|
const { randomUUID: randomUUID10 } = await import("crypto");
|
|
13878
|
-
const dir =
|
|
14263
|
+
const dir = path53.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
|
|
13879
14264
|
await mkdir17(dir, { recursive: true });
|
|
13880
|
-
const stdinPath =
|
|
13881
|
-
const stdoutPath =
|
|
13882
|
-
const stderrPath =
|
|
14265
|
+
const stdinPath = path53.join(dir, "stdin.txt");
|
|
14266
|
+
const stdoutPath = path53.join(dir, "stdout.txt");
|
|
14267
|
+
const stderrPath = path53.join(dir, "stderr.txt");
|
|
13883
14268
|
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
13884
14269
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
13885
14270
|
const { spawn: spawn5 } = await import("child_process");
|
|
@@ -15081,7 +15466,7 @@ ${outputSchema}`;
|
|
|
15081
15466
|
parts.push("[[ ## scoring_criteria ## ]]");
|
|
15082
15467
|
for (const rubric of rubrics) {
|
|
15083
15468
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
15084
|
-
const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
15469
|
+
const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
15085
15470
|
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
15086
15471
|
if (rubric.outcome) {
|
|
15087
15472
|
parts.push(`Description: ${rubric.outcome}`);
|
|
@@ -15135,54 +15520,106 @@ ${outputSchema}`;
|
|
|
15135
15520
|
async runWithRetry(options) {
|
|
15136
15521
|
const { context: context2, graderProvider, systemPrompt, userPrompt, schema, images } = options;
|
|
15137
15522
|
let lastError;
|
|
15523
|
+
let lastInvalidResponse;
|
|
15524
|
+
let shouldAttemptStructureFix = false;
|
|
15138
15525
|
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
15139
15526
|
try {
|
|
15140
|
-
const
|
|
15141
|
-
|
|
15142
|
-
|
|
15143
|
-
|
|
15144
|
-
|
|
15145
|
-
|
|
15146
|
-
|
|
15147
|
-
|
|
15148
|
-
|
|
15149
|
-
|
|
15150
|
-
|
|
15151
|
-
|
|
15152
|
-
|
|
15153
|
-
|
|
15154
|
-
|
|
15155
|
-
|
|
15156
|
-
]
|
|
15157
|
-
}
|
|
15158
|
-
],
|
|
15159
|
-
...modelOptions
|
|
15160
|
-
}) : await (0, import_ai2.generateText)({
|
|
15161
|
-
model,
|
|
15162
|
-
system: systemPrompt,
|
|
15163
|
-
prompt: userPrompt,
|
|
15164
|
-
...modelOptions
|
|
15165
|
-
});
|
|
15166
|
-
const data2 = schema.parse(parseJsonFromText(result.text));
|
|
15167
|
-
const rawUsage = result.usage;
|
|
15168
|
-
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
15169
|
-
return { data: data2, tokenUsage };
|
|
15527
|
+
const result = await this.generateStructuredResponse({
|
|
15528
|
+
context: context2,
|
|
15529
|
+
graderProvider,
|
|
15530
|
+
systemPrompt,
|
|
15531
|
+
userPrompt,
|
|
15532
|
+
images
|
|
15533
|
+
});
|
|
15534
|
+
const canRepairResponse = result.text.trim().length > 0;
|
|
15535
|
+
lastInvalidResponse = canRepairResponse ? result : void 0;
|
|
15536
|
+
let data;
|
|
15537
|
+
try {
|
|
15538
|
+
data = schema.parse(parseJsonFromText(result.text));
|
|
15539
|
+
} catch (e) {
|
|
15540
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
15541
|
+
shouldAttemptStructureFix = canRepairResponse;
|
|
15542
|
+
continue;
|
|
15170
15543
|
}
|
|
15171
|
-
|
|
15172
|
-
|
|
15544
|
+
return {
|
|
15545
|
+
data,
|
|
15546
|
+
providerResponse: result.providerResponse,
|
|
15547
|
+
tokenUsage: result.tokenUsage
|
|
15548
|
+
};
|
|
15549
|
+
} catch (e) {
|
|
15550
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
15551
|
+
}
|
|
15552
|
+
}
|
|
15553
|
+
if (shouldAttemptStructureFix && lastInvalidResponse) {
|
|
15554
|
+
try {
|
|
15555
|
+
const repaired = await this.generateStructuredResponse({
|
|
15556
|
+
context: context2,
|
|
15557
|
+
graderProvider,
|
|
15173
15558
|
systemPrompt,
|
|
15174
|
-
|
|
15175
|
-
|
|
15176
|
-
|
|
15177
|
-
|
|
15559
|
+
userPrompt: buildStructureRepairPrompt({
|
|
15560
|
+
validationError: lastError?.message ?? "Schema validation failed",
|
|
15561
|
+
invalidResponse: lastInvalidResponse.text
|
|
15562
|
+
})
|
|
15178
15563
|
});
|
|
15179
|
-
const data = schema.parse(parseJsonFromText(
|
|
15180
|
-
return {
|
|
15564
|
+
const data = schema.parse(parseJsonFromText(repaired.text));
|
|
15565
|
+
return {
|
|
15566
|
+
data,
|
|
15567
|
+
providerResponse: repaired.providerResponse,
|
|
15568
|
+
tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage)
|
|
15569
|
+
};
|
|
15181
15570
|
} catch (e) {
|
|
15182
15571
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
15183
15572
|
}
|
|
15184
15573
|
}
|
|
15185
|
-
throw new Error(
|
|
15574
|
+
throw new Error(
|
|
15575
|
+
`Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`
|
|
15576
|
+
);
|
|
15577
|
+
}
|
|
15578
|
+
async generateStructuredResponse(options) {
|
|
15579
|
+
const { context: context2, graderProvider, systemPrompt, userPrompt, images } = options;
|
|
15580
|
+
const model = graderProvider.asLanguageModel?.();
|
|
15581
|
+
if (model) {
|
|
15582
|
+
const modelOptions = {
|
|
15583
|
+
...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
|
|
15584
|
+
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
15585
|
+
};
|
|
15586
|
+
const hasImages = images && images.length > 0;
|
|
15587
|
+
const result = hasImages ? await (0, import_ai2.generateText)({
|
|
15588
|
+
model,
|
|
15589
|
+
system: systemPrompt,
|
|
15590
|
+
messages: [
|
|
15591
|
+
{
|
|
15592
|
+
role: "user",
|
|
15593
|
+
content: [
|
|
15594
|
+
{ type: "text", text: userPrompt },
|
|
15595
|
+
...toAiSdkImageParts(images)
|
|
15596
|
+
]
|
|
15597
|
+
}
|
|
15598
|
+
],
|
|
15599
|
+
...modelOptions
|
|
15600
|
+
}) : await (0, import_ai2.generateText)({
|
|
15601
|
+
model,
|
|
15602
|
+
system: systemPrompt,
|
|
15603
|
+
prompt: userPrompt,
|
|
15604
|
+
...modelOptions
|
|
15605
|
+
});
|
|
15606
|
+
const rawUsage = result.usage;
|
|
15607
|
+
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
15608
|
+
return { text: result.text, tokenUsage };
|
|
15609
|
+
}
|
|
15610
|
+
const response = await graderProvider.invoke({
|
|
15611
|
+
question: userPrompt,
|
|
15612
|
+
systemPrompt,
|
|
15613
|
+
evalCaseId: context2.evalCase.id,
|
|
15614
|
+
attempt: context2.attempt,
|
|
15615
|
+
maxOutputTokens: this.maxOutputTokens,
|
|
15616
|
+
temperature: this.temperature
|
|
15617
|
+
});
|
|
15618
|
+
return {
|
|
15619
|
+
text: extractLastAssistantContent(response.output),
|
|
15620
|
+
providerResponse: response,
|
|
15621
|
+
tokenUsage: response.tokenUsage
|
|
15622
|
+
};
|
|
15186
15623
|
}
|
|
15187
15624
|
};
|
|
15188
15625
|
function buildOutputSchema() {
|
|
@@ -15202,6 +15639,29 @@ function buildOutputSchema() {
|
|
|
15202
15639
|
"}"
|
|
15203
15640
|
].join("\n");
|
|
15204
15641
|
}
|
|
15642
|
+
function buildStructureRepairPrompt(options) {
|
|
15643
|
+
const { validationError, invalidResponse } = options;
|
|
15644
|
+
return [
|
|
15645
|
+
"The following evaluation response has useful grading content but invalid JSON structure.",
|
|
15646
|
+
"Repair it to satisfy the schema in the system prompt.",
|
|
15647
|
+
"Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object.",
|
|
15648
|
+
"",
|
|
15649
|
+
"Validation error:",
|
|
15650
|
+
validationError,
|
|
15651
|
+
"",
|
|
15652
|
+
"Invalid response:",
|
|
15653
|
+
invalidResponse
|
|
15654
|
+
].join("\n");
|
|
15655
|
+
}
|
|
15656
|
+
function sumTokenUsage(first, second) {
|
|
15657
|
+
if (!first && !second) {
|
|
15658
|
+
return void 0;
|
|
15659
|
+
}
|
|
15660
|
+
return {
|
|
15661
|
+
input: (first?.input ?? 0) + (second?.input ?? 0),
|
|
15662
|
+
output: (first?.output ?? 0) + (second?.output ?? 0)
|
|
15663
|
+
};
|
|
15664
|
+
}
|
|
15205
15665
|
function buildRubricOutputSchema() {
|
|
15206
15666
|
return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
|
|
15207
15667
|
You must return a valid JSON object matching this schema:
|
|
@@ -15301,19 +15761,21 @@ function calculateScoreRangeResult(result, rubrics) {
|
|
|
15301
15761
|
rawScores[rubric.id] = rawScore;
|
|
15302
15762
|
totalWeight += rubric.weight;
|
|
15303
15763
|
weightedScoreSum += normalizedScore * rubric.weight;
|
|
15304
|
-
let
|
|
15305
|
-
if (rubric.
|
|
15306
|
-
|
|
15764
|
+
let minScoreThreshold;
|
|
15765
|
+
if (rubric.min_score !== void 0) {
|
|
15766
|
+
minScoreThreshold = rubric.min_score;
|
|
15767
|
+
} else if (rubric.required_min_score !== void 0) {
|
|
15768
|
+
minScoreThreshold = rubric.required_min_score / 10;
|
|
15307
15769
|
} else if (rubric.required === true) {
|
|
15308
|
-
|
|
15770
|
+
minScoreThreshold = 1;
|
|
15309
15771
|
}
|
|
15310
15772
|
const matchingRange = rubric.score_ranges?.find(
|
|
15311
15773
|
(r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
|
|
15312
15774
|
);
|
|
15313
15775
|
const rangeDescription = matchingRange?.outcome ?? "";
|
|
15314
15776
|
const criterionLabel = rubric.outcome ?? rubric.id;
|
|
15315
|
-
const passed = !(
|
|
15316
|
-
if (
|
|
15777
|
+
const passed = !(minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) && rawScore >= 7;
|
|
15778
|
+
if (minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) {
|
|
15317
15779
|
failedRequired = true;
|
|
15318
15780
|
}
|
|
15319
15781
|
assertions.push({
|
|
@@ -15390,11 +15852,11 @@ function createFilesystemTools(workspacePath) {
|
|
|
15390
15852
|
execute: async (input) => {
|
|
15391
15853
|
try {
|
|
15392
15854
|
const resolved = resolveSandboxed(workspacePath, input.path);
|
|
15393
|
-
const
|
|
15394
|
-
if (
|
|
15855
|
+
const stat11 = await import_promises29.default.stat(resolved);
|
|
15856
|
+
if (stat11.isDirectory()) {
|
|
15395
15857
|
return { error: `'${input.path}' is a directory, not a file` };
|
|
15396
15858
|
}
|
|
15397
|
-
const buffer = Buffer.alloc(Math.min(
|
|
15859
|
+
const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
|
|
15398
15860
|
const fd = await import_promises29.default.open(resolved, "r");
|
|
15399
15861
|
try {
|
|
15400
15862
|
await fd.read(buffer, 0, buffer.length, 0);
|
|
@@ -15402,8 +15864,8 @@ function createFilesystemTools(workspacePath) {
|
|
|
15402
15864
|
await fd.close();
|
|
15403
15865
|
}
|
|
15404
15866
|
const content = buffer.toString("utf-8");
|
|
15405
|
-
const truncated =
|
|
15406
|
-
return { content, truncated, size:
|
|
15867
|
+
const truncated = stat11.size > MAX_FILE_SIZE;
|
|
15868
|
+
return { content, truncated, size: stat11.size };
|
|
15407
15869
|
} catch (error) {
|
|
15408
15870
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
15409
15871
|
}
|
|
@@ -15454,8 +15916,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
15454
15916
|
const ext = import_node_path39.default.extname(entry.name).toLowerCase();
|
|
15455
15917
|
if (BINARY_EXTENSIONS.has(ext)) continue;
|
|
15456
15918
|
try {
|
|
15457
|
-
const
|
|
15458
|
-
if (
|
|
15919
|
+
const stat11 = await import_promises29.default.stat(fullPath);
|
|
15920
|
+
if (stat11.size > MAX_FILE_SIZE) continue;
|
|
15459
15921
|
const content = await import_promises29.default.readFile(fullPath, "utf-8");
|
|
15460
15922
|
const lines = content.split("\n");
|
|
15461
15923
|
for (let i = 0; i < lines.length; i++) {
|
|
@@ -16099,115 +16561,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
16099
16561
|
* Evaluate a single field against the expected value.
|
|
16100
16562
|
*/
|
|
16101
16563
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
16102
|
-
const { path:
|
|
16103
|
-
const candidateValue = resolvePath(candidateData,
|
|
16104
|
-
const expectedValue = resolvePath(expectedData,
|
|
16564
|
+
const { path: path53, match, required = true, weight = 1 } = fieldConfig;
|
|
16565
|
+
const candidateValue = resolvePath(candidateData, path53);
|
|
16566
|
+
const expectedValue = resolvePath(expectedData, path53);
|
|
16105
16567
|
if (expectedValue === void 0) {
|
|
16106
16568
|
return {
|
|
16107
|
-
path:
|
|
16569
|
+
path: path53,
|
|
16108
16570
|
score: 1,
|
|
16109
16571
|
// No expected value means no comparison needed
|
|
16110
16572
|
weight,
|
|
16111
16573
|
hit: true,
|
|
16112
|
-
message: `${
|
|
16574
|
+
message: `${path53}: no expected value`
|
|
16113
16575
|
};
|
|
16114
16576
|
}
|
|
16115
16577
|
if (candidateValue === void 0) {
|
|
16116
16578
|
if (required) {
|
|
16117
16579
|
return {
|
|
16118
|
-
path:
|
|
16580
|
+
path: path53,
|
|
16119
16581
|
score: 0,
|
|
16120
16582
|
weight,
|
|
16121
16583
|
hit: false,
|
|
16122
|
-
message: `${
|
|
16584
|
+
message: `${path53} (required, missing)`
|
|
16123
16585
|
};
|
|
16124
16586
|
}
|
|
16125
16587
|
return {
|
|
16126
|
-
path:
|
|
16588
|
+
path: path53,
|
|
16127
16589
|
score: 1,
|
|
16128
16590
|
// Don't penalize missing optional fields
|
|
16129
16591
|
weight: 0,
|
|
16130
16592
|
// Zero weight means it won't affect the score
|
|
16131
16593
|
hit: true,
|
|
16132
|
-
message: `${
|
|
16594
|
+
message: `${path53}: optional field missing`
|
|
16133
16595
|
};
|
|
16134
16596
|
}
|
|
16135
16597
|
switch (match) {
|
|
16136
16598
|
case "exact":
|
|
16137
|
-
return this.compareExact(
|
|
16599
|
+
return this.compareExact(path53, candidateValue, expectedValue, weight);
|
|
16138
16600
|
case "numeric_tolerance":
|
|
16139
16601
|
return this.compareNumericTolerance(
|
|
16140
|
-
|
|
16602
|
+
path53,
|
|
16141
16603
|
candidateValue,
|
|
16142
16604
|
expectedValue,
|
|
16143
16605
|
fieldConfig,
|
|
16144
16606
|
weight
|
|
16145
16607
|
);
|
|
16146
16608
|
case "date":
|
|
16147
|
-
return this.compareDate(
|
|
16609
|
+
return this.compareDate(path53, candidateValue, expectedValue, fieldConfig, weight);
|
|
16148
16610
|
default:
|
|
16149
16611
|
return {
|
|
16150
|
-
path:
|
|
16612
|
+
path: path53,
|
|
16151
16613
|
score: 0,
|
|
16152
16614
|
weight,
|
|
16153
16615
|
hit: false,
|
|
16154
|
-
message: `${
|
|
16616
|
+
message: `${path53}: unknown match type "${match}"`
|
|
16155
16617
|
};
|
|
16156
16618
|
}
|
|
16157
16619
|
}
|
|
16158
16620
|
/**
|
|
16159
16621
|
* Exact equality comparison.
|
|
16160
16622
|
*/
|
|
16161
|
-
compareExact(
|
|
16623
|
+
compareExact(path53, candidateValue, expectedValue, weight) {
|
|
16162
16624
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
16163
16625
|
return {
|
|
16164
|
-
path:
|
|
16626
|
+
path: path53,
|
|
16165
16627
|
score: 1,
|
|
16166
16628
|
weight,
|
|
16167
16629
|
hit: true,
|
|
16168
|
-
message:
|
|
16630
|
+
message: path53
|
|
16169
16631
|
};
|
|
16170
16632
|
}
|
|
16171
16633
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
16172
16634
|
return {
|
|
16173
|
-
path:
|
|
16635
|
+
path: path53,
|
|
16174
16636
|
score: 0,
|
|
16175
16637
|
weight,
|
|
16176
16638
|
hit: false,
|
|
16177
|
-
message: `${
|
|
16639
|
+
message: `${path53} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
16178
16640
|
};
|
|
16179
16641
|
}
|
|
16180
16642
|
return {
|
|
16181
|
-
path:
|
|
16643
|
+
path: path53,
|
|
16182
16644
|
score: 0,
|
|
16183
16645
|
weight,
|
|
16184
16646
|
hit: false,
|
|
16185
|
-
message: `${
|
|
16647
|
+
message: `${path53} (value mismatch)`
|
|
16186
16648
|
};
|
|
16187
16649
|
}
|
|
16188
16650
|
/**
|
|
16189
16651
|
* Numeric comparison with absolute or relative tolerance.
|
|
16190
16652
|
*/
|
|
16191
|
-
compareNumericTolerance(
|
|
16653
|
+
compareNumericTolerance(path53, candidateValue, expectedValue, fieldConfig, weight) {
|
|
16192
16654
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
16193
16655
|
const candidateNum = toNumber(candidateValue);
|
|
16194
16656
|
const expectedNum = toNumber(expectedValue);
|
|
16195
16657
|
if (candidateNum === null || expectedNum === null) {
|
|
16196
16658
|
return {
|
|
16197
|
-
path:
|
|
16659
|
+
path: path53,
|
|
16198
16660
|
score: 0,
|
|
16199
16661
|
weight,
|
|
16200
16662
|
hit: false,
|
|
16201
|
-
message: `${
|
|
16663
|
+
message: `${path53} (non-numeric value)`
|
|
16202
16664
|
};
|
|
16203
16665
|
}
|
|
16204
16666
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
16205
16667
|
return {
|
|
16206
|
-
path:
|
|
16668
|
+
path: path53,
|
|
16207
16669
|
score: 0,
|
|
16208
16670
|
weight,
|
|
16209
16671
|
hit: false,
|
|
16210
|
-
message: `${
|
|
16672
|
+
message: `${path53} (invalid numeric value)`
|
|
16211
16673
|
};
|
|
16212
16674
|
}
|
|
16213
16675
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -16220,61 +16682,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
16220
16682
|
}
|
|
16221
16683
|
if (withinTolerance) {
|
|
16222
16684
|
return {
|
|
16223
|
-
path:
|
|
16685
|
+
path: path53,
|
|
16224
16686
|
score: 1,
|
|
16225
16687
|
weight,
|
|
16226
16688
|
hit: true,
|
|
16227
|
-
message: `${
|
|
16689
|
+
message: `${path53} (within tolerance: diff=${diff.toFixed(2)})`
|
|
16228
16690
|
};
|
|
16229
16691
|
}
|
|
16230
16692
|
return {
|
|
16231
|
-
path:
|
|
16693
|
+
path: path53,
|
|
16232
16694
|
score: 0,
|
|
16233
16695
|
weight,
|
|
16234
16696
|
hit: false,
|
|
16235
|
-
message: `${
|
|
16697
|
+
message: `${path53} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
16236
16698
|
};
|
|
16237
16699
|
}
|
|
16238
16700
|
/**
|
|
16239
16701
|
* Date comparison with format normalization.
|
|
16240
16702
|
*/
|
|
16241
|
-
compareDate(
|
|
16703
|
+
compareDate(path53, candidateValue, expectedValue, fieldConfig, weight) {
|
|
16242
16704
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
16243
16705
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
16244
16706
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
16245
16707
|
if (candidateDate === null) {
|
|
16246
16708
|
return {
|
|
16247
|
-
path:
|
|
16709
|
+
path: path53,
|
|
16248
16710
|
score: 0,
|
|
16249
16711
|
weight,
|
|
16250
16712
|
hit: false,
|
|
16251
|
-
message: `${
|
|
16713
|
+
message: `${path53} (unparseable candidate date)`
|
|
16252
16714
|
};
|
|
16253
16715
|
}
|
|
16254
16716
|
if (expectedDate === null) {
|
|
16255
16717
|
return {
|
|
16256
|
-
path:
|
|
16718
|
+
path: path53,
|
|
16257
16719
|
score: 0,
|
|
16258
16720
|
weight,
|
|
16259
16721
|
hit: false,
|
|
16260
|
-
message: `${
|
|
16722
|
+
message: `${path53} (unparseable expected date)`
|
|
16261
16723
|
};
|
|
16262
16724
|
}
|
|
16263
16725
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
16264
16726
|
return {
|
|
16265
|
-
path:
|
|
16727
|
+
path: path53,
|
|
16266
16728
|
score: 1,
|
|
16267
16729
|
weight,
|
|
16268
16730
|
hit: true,
|
|
16269
|
-
message:
|
|
16731
|
+
message: path53
|
|
16270
16732
|
};
|
|
16271
16733
|
}
|
|
16272
16734
|
return {
|
|
16273
|
-
path:
|
|
16735
|
+
path: path53,
|
|
16274
16736
|
score: 0,
|
|
16275
16737
|
weight,
|
|
16276
16738
|
hit: false,
|
|
16277
|
-
message: `${
|
|
16739
|
+
message: `${path53} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
16278
16740
|
};
|
|
16279
16741
|
}
|
|
16280
16742
|
/**
|
|
@@ -16307,11 +16769,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
16307
16769
|
};
|
|
16308
16770
|
}
|
|
16309
16771
|
};
|
|
16310
|
-
function resolvePath(obj,
|
|
16311
|
-
if (!
|
|
16772
|
+
function resolvePath(obj, path53) {
|
|
16773
|
+
if (!path53 || !obj) {
|
|
16312
16774
|
return void 0;
|
|
16313
16775
|
}
|
|
16314
|
-
const parts =
|
|
16776
|
+
const parts = path53.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
16315
16777
|
let current = obj;
|
|
16316
16778
|
for (const part of parts) {
|
|
16317
16779
|
if (current === null || current === void 0) {
|
|
@@ -16808,8 +17270,8 @@ var TokenUsageEvaluator = class {
|
|
|
16808
17270
|
|
|
16809
17271
|
// src/evaluation/evaluators/tool-trajectory.ts
|
|
16810
17272
|
init_cjs_shims();
|
|
16811
|
-
function getNestedValue(obj,
|
|
16812
|
-
const parts =
|
|
17273
|
+
function getNestedValue(obj, path53) {
|
|
17274
|
+
const parts = path53.split(".");
|
|
16813
17275
|
let current = obj;
|
|
16814
17276
|
for (const part of parts) {
|
|
16815
17277
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -18602,7 +19064,7 @@ var WorkspacePoolManager = class {
|
|
|
18602
19064
|
}
|
|
18603
19065
|
/**
|
|
18604
19066
|
* Reset an existing slot for reuse:
|
|
18605
|
-
* 1. Reset repos (git reset --hard
|
|
19067
|
+
* 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
|
|
18606
19068
|
* 2. Re-copy template files (skip repo directories)
|
|
18607
19069
|
*/
|
|
18608
19070
|
async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
|
|
@@ -18615,7 +19077,17 @@ var WorkspacePoolManager = class {
|
|
|
18615
19077
|
continue;
|
|
18616
19078
|
}
|
|
18617
19079
|
const ref = repo.checkout?.ref ?? "HEAD";
|
|
18618
|
-
|
|
19080
|
+
const resolve = repo.checkout?.resolve ?? "remote";
|
|
19081
|
+
if (resolve === "remote") {
|
|
19082
|
+
const fetchArgs = ["fetch", "origin", ref];
|
|
19083
|
+
if (repo.clone?.depth) {
|
|
19084
|
+
fetchArgs.splice(1, 0, "--depth", String(repo.clone.depth));
|
|
19085
|
+
}
|
|
19086
|
+
await git(fetchArgs, { cwd: repoDir });
|
|
19087
|
+
await git(["reset", "--hard", "FETCH_HEAD"], { cwd: repoDir });
|
|
19088
|
+
} else {
|
|
19089
|
+
await git(["reset", "--hard", ref], { cwd: repoDir });
|
|
19090
|
+
}
|
|
18619
19091
|
const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
|
|
18620
19092
|
await git(["clean", cleanFlag], { cwd: repoDir });
|
|
18621
19093
|
}
|
|
@@ -18915,7 +19387,7 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
|
|
|
18915
19387
|
}
|
|
18916
19388
|
|
|
18917
19389
|
// src/evaluation/orchestrator.ts
|
|
18918
|
-
function classifyQualityStatus(score, threshold =
|
|
19390
|
+
function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
|
|
18919
19391
|
return score >= threshold ? "ok" : "quality_failure";
|
|
18920
19392
|
}
|
|
18921
19393
|
function buildSkippedEvaluatorError(scores) {
|
|
@@ -19007,7 +19479,7 @@ async function runEvaluation(options) {
|
|
|
19007
19479
|
const filteredEvalCases = filterEvalCases(evalCases, filter);
|
|
19008
19480
|
if (filteredEvalCases.length === 0) {
|
|
19009
19481
|
if (filter) {
|
|
19010
|
-
throw new Error(`No tests matched filter '${filter}' in ${evalFilePath}`);
|
|
19482
|
+
throw new Error(`No tests matched filter '${formatFilter(filter)}' in ${evalFilePath}`);
|
|
19011
19483
|
}
|
|
19012
19484
|
return [];
|
|
19013
19485
|
}
|
|
@@ -19059,6 +19531,9 @@ async function runEvaluation(options) {
|
|
|
19059
19531
|
const graderName = targetContext.graderTarget ?? targetContext.name;
|
|
19060
19532
|
const resolvedGrader = resolveTargetByName(graderName);
|
|
19061
19533
|
if (!resolvedGrader) {
|
|
19534
|
+
if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
|
|
19535
|
+
return void 0;
|
|
19536
|
+
}
|
|
19062
19537
|
return getOrCreateProvider(targetContext);
|
|
19063
19538
|
}
|
|
19064
19539
|
return getOrCreateProvider(resolvedGrader);
|
|
@@ -19389,7 +19864,7 @@ async function runEvaluation(options) {
|
|
|
19389
19864
|
const budgetResult = {
|
|
19390
19865
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
19391
19866
|
testId: evalCase.id,
|
|
19392
|
-
|
|
19867
|
+
suite: evalCase.suite,
|
|
19393
19868
|
category: evalCase.category,
|
|
19394
19869
|
score: 0,
|
|
19395
19870
|
assertions: [],
|
|
@@ -19426,7 +19901,7 @@ async function runEvaluation(options) {
|
|
|
19426
19901
|
const haltResult = {
|
|
19427
19902
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
19428
19903
|
testId: evalCase.id,
|
|
19429
|
-
|
|
19904
|
+
suite: evalCase.suite,
|
|
19430
19905
|
category: evalCase.category,
|
|
19431
19906
|
score: 0,
|
|
19432
19907
|
assertions: [],
|
|
@@ -19738,7 +20213,7 @@ async function runBatchEvaluation(options) {
|
|
|
19738
20213
|
targetResolver,
|
|
19739
20214
|
availableTargets,
|
|
19740
20215
|
verbose,
|
|
19741
|
-
threshold: batchThreshold
|
|
20216
|
+
threshold: evalCase.threshold ?? batchThreshold
|
|
19742
20217
|
});
|
|
19743
20218
|
if (providerError) {
|
|
19744
20219
|
result = {
|
|
@@ -20200,8 +20675,9 @@ async function runEvalCase(options) {
|
|
|
20200
20675
|
fileChanges,
|
|
20201
20676
|
workspacePath,
|
|
20202
20677
|
verbose,
|
|
20203
|
-
threshold: caseThreshold
|
|
20678
|
+
threshold: evalCase.threshold ?? caseThreshold
|
|
20204
20679
|
});
|
|
20680
|
+
const effectiveThreshold = evalCase.threshold ?? caseThreshold;
|
|
20205
20681
|
const totalDurationMs = Date.now() - caseStartMs;
|
|
20206
20682
|
const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
|
|
20207
20683
|
const evalRunTokenUsage = tokenUsage || graderTokens ? {
|
|
@@ -20215,7 +20691,7 @@ async function runEvalCase(options) {
|
|
|
20215
20691
|
...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
|
|
20216
20692
|
};
|
|
20217
20693
|
const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
|
|
20218
|
-
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score,
|
|
20694
|
+
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, effectiveThreshold);
|
|
20219
20695
|
const targetUsedField = targetUsed ? { targetUsed } : {};
|
|
20220
20696
|
const finalResult = providerError ? {
|
|
20221
20697
|
...result,
|
|
@@ -20416,7 +20892,8 @@ async function evaluateCandidate(options) {
|
|
|
20416
20892
|
targetResolver,
|
|
20417
20893
|
availableTargets,
|
|
20418
20894
|
fileChanges,
|
|
20419
|
-
workspacePath
|
|
20895
|
+
workspacePath,
|
|
20896
|
+
threshold: evalThreshold
|
|
20420
20897
|
});
|
|
20421
20898
|
const completedAt = nowFn();
|
|
20422
20899
|
let agentRequest;
|
|
@@ -20447,7 +20924,7 @@ async function evaluateCandidate(options) {
|
|
|
20447
20924
|
return {
|
|
20448
20925
|
timestamp: completedAt.toISOString(),
|
|
20449
20926
|
testId: evalCase.id,
|
|
20450
|
-
|
|
20927
|
+
suite: evalCase.suite,
|
|
20451
20928
|
category: evalCase.category,
|
|
20452
20929
|
conversationId: evalCase.conversation_id,
|
|
20453
20930
|
score: score.score,
|
|
@@ -20490,7 +20967,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
20490
20967
|
targetResolver,
|
|
20491
20968
|
availableTargets,
|
|
20492
20969
|
fileChanges,
|
|
20493
|
-
workspacePath
|
|
20970
|
+
workspacePath,
|
|
20971
|
+
threshold
|
|
20494
20972
|
} = options;
|
|
20495
20973
|
if (evalCase.assertions && evalCase.assertions.length > 0) {
|
|
20496
20974
|
return runEvaluatorList({
|
|
@@ -20516,7 +20994,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
20516
20994
|
targetResolver,
|
|
20517
20995
|
availableTargets,
|
|
20518
20996
|
fileChanges,
|
|
20519
|
-
workspacePath
|
|
20997
|
+
workspacePath,
|
|
20998
|
+
threshold
|
|
20520
20999
|
});
|
|
20521
21000
|
}
|
|
20522
21001
|
const evaluatorKind = evalCase.evaluator ?? "llm-grader";
|
|
@@ -20618,7 +21097,8 @@ async function runEvaluatorList(options) {
|
|
|
20618
21097
|
name: evaluatorConfig.name,
|
|
20619
21098
|
type: evaluatorConfig.type,
|
|
20620
21099
|
weight,
|
|
20621
|
-
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
21100
|
+
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
|
|
21101
|
+
...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
|
|
20622
21102
|
});
|
|
20623
21103
|
scores.push({
|
|
20624
21104
|
name: evaluatorConfig.name,
|
|
@@ -20653,7 +21133,8 @@ async function runEvaluatorList(options) {
|
|
|
20653
21133
|
name: evaluatorConfig.name ?? "unknown",
|
|
20654
21134
|
type: evaluatorConfig.type ?? "llm-grader",
|
|
20655
21135
|
weight,
|
|
20656
|
-
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
21136
|
+
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
|
|
21137
|
+
...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
|
|
20657
21138
|
});
|
|
20658
21139
|
scores.push({
|
|
20659
21140
|
name: evaluatorConfig.name ?? "unknown",
|
|
@@ -20687,9 +21168,10 @@ async function runEvaluatorList(options) {
|
|
|
20687
21168
|
}
|
|
20688
21169
|
}
|
|
20689
21170
|
}
|
|
21171
|
+
const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD;
|
|
20690
21172
|
const hasRequiredFailure = scored.some((entry) => {
|
|
20691
21173
|
if (!entry.required) return false;
|
|
20692
|
-
const minScore = typeof entry.required === "number" ? entry.required :
|
|
21174
|
+
const minScore = entry.min_score ?? (typeof entry.required === "number" ? entry.required : effectiveThreshold);
|
|
20693
21175
|
return entry.score.score < minScore;
|
|
20694
21176
|
});
|
|
20695
21177
|
const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
|
|
@@ -20700,17 +21182,23 @@ async function runEvaluatorList(options) {
|
|
|
20700
21182
|
const expectedAspectCount = assertions.length || 1;
|
|
20701
21183
|
const score = {
|
|
20702
21184
|
score: aggregateScore,
|
|
20703
|
-
verdict: scoreToVerdict(aggregateScore),
|
|
21185
|
+
verdict: scoreToVerdict(aggregateScore, effectiveThreshold),
|
|
20704
21186
|
assertions,
|
|
20705
21187
|
expectedAspectCount
|
|
20706
21188
|
};
|
|
20707
21189
|
return { score, scores };
|
|
20708
21190
|
}
|
|
21191
|
+
/**
 * Render a test filter for display in messages.
 *
 * @param {string | string[]} filter - A single pattern or a list of patterns.
 * @returns {string} The pattern itself, or the patterns joined with ", ".
 */
function formatFilter(filter) {
  if (typeof filter === "string") {
    return filter;
  }
  return filter.join(", ");
}
|
|
21194
|
+
/**
 * Test whether an id matches a filter.
 *
 * @param {string} id - Identifier to test.
 * @param {string | string[]} filter - One glob pattern, or a list of patterns
 *   of which at least one must match.
 * @returns {boolean} True when the id matches the pattern (or any pattern in the list).
 */
function matchesFilter3(id, filter) {
  // Normalise to a list so a single code path handles both shapes.
  const patterns = typeof filter === "string" ? [filter] : filter;
  return patterns.some((glob) => import_micromatch3.default.isMatch(id, glob));
}
|
|
20709
21197
|
function filterEvalCases(evalCases, filter) {
|
|
20710
21198
|
if (!filter) {
|
|
20711
21199
|
return evalCases;
|
|
20712
21200
|
}
|
|
20713
|
-
return evalCases.filter((evalCase) =>
|
|
21201
|
+
return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter));
|
|
20714
21202
|
}
|
|
20715
21203
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
20716
21204
|
const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
|
|
@@ -20797,7 +21285,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
20797
21285
|
return {
|
|
20798
21286
|
timestamp: timestamp.toISOString(),
|
|
20799
21287
|
testId: evalCase.id,
|
|
20800
|
-
|
|
21288
|
+
suite: evalCase.suite,
|
|
20801
21289
|
category: evalCase.category,
|
|
20802
21290
|
conversationId: evalCase.conversation_id,
|
|
20803
21291
|
score: 0,
|
|
@@ -21071,6 +21559,7 @@ async function evaluate(config) {
|
|
|
21071
21559
|
verbose: config.verbose,
|
|
21072
21560
|
maxConcurrency: config.workers ?? 3,
|
|
21073
21561
|
filter: config.filter,
|
|
21562
|
+
threshold: config.threshold,
|
|
21074
21563
|
evalCases,
|
|
21075
21564
|
onResult: async (result) => {
|
|
21076
21565
|
collectedResults.push(result);
|
|
@@ -21081,19 +21570,19 @@ async function evaluate(config) {
|
|
|
21081
21570
|
const durationMs = Date.now() - startTime;
|
|
21082
21571
|
return {
|
|
21083
21572
|
results: allResults,
|
|
21084
|
-
summary: computeSummary(allResults, durationMs)
|
|
21573
|
+
summary: computeSummary(allResults, durationMs, config.threshold)
|
|
21085
21574
|
};
|
|
21086
21575
|
}
|
|
21087
21576
|
function mapAssertionType(type) {
|
|
21088
21577
|
return type.replace(/_/g, "-");
|
|
21089
21578
|
}
|
|
21090
|
-
function computeSummary(results, durationMs) {
|
|
21579
|
+
function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
|
|
21091
21580
|
const total = results.length;
|
|
21092
21581
|
let passed = 0;
|
|
21093
21582
|
let scoreSum = 0;
|
|
21094
21583
|
for (const r of results) {
|
|
21095
21584
|
scoreSum += r.score;
|
|
21096
|
-
if (r.score >=
|
|
21585
|
+
if (r.score >= threshold) {
|
|
21097
21586
|
passed++;
|
|
21098
21587
|
}
|
|
21099
21588
|
}
|
|
@@ -21207,7 +21696,7 @@ var CONFIG_FILE_NAMES = [
|
|
|
21207
21696
|
];
|
|
21208
21697
|
async function loadTsConfig(projectRoot) {
|
|
21209
21698
|
const { existsSync: existsSync7 } = await import("fs");
|
|
21210
|
-
const { pathToFileURL } = await import("url");
|
|
21699
|
+
const { pathToFileURL: pathToFileURL2 } = await import("url");
|
|
21211
21700
|
const { join: join2 } = await import("path");
|
|
21212
21701
|
for (const fileName of CONFIG_FILE_NAMES) {
|
|
21213
21702
|
const filePath = join2(projectRoot, fileName);
|
|
@@ -21215,7 +21704,7 @@ async function loadTsConfig(projectRoot) {
|
|
|
21215
21704
|
continue;
|
|
21216
21705
|
}
|
|
21217
21706
|
try {
|
|
21218
|
-
const fileUrl =
|
|
21707
|
+
const fileUrl = pathToFileURL2(filePath).href;
|
|
21219
21708
|
const mod = await import(fileUrl);
|
|
21220
21709
|
const config = mod.default ?? mod;
|
|
21221
21710
|
return AgentVConfigSchema.parse(config);
|
|
@@ -21656,7 +22145,7 @@ var OtelTraceExporter = class {
|
|
|
21656
22145
|
rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
21657
22146
|
rootSpan.setAttribute("agentv.test_id", result.testId);
|
|
21658
22147
|
rootSpan.setAttribute("agentv.target", result.target);
|
|
21659
|
-
if (result.
|
|
22148
|
+
if (result.suite) rootSpan.setAttribute("agentv.suite", result.suite);
|
|
21660
22149
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
21661
22150
|
if (captureContent && result.output.length > 0) {
|
|
21662
22151
|
const lastMsg = result.output[result.output.length - 1];
|
|
@@ -21865,7 +22354,7 @@ var OtelStreamingObserver = class {
|
|
|
21865
22354
|
this.rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
21866
22355
|
this.rootSpan.setAttribute("agentv.test_id", testId);
|
|
21867
22356
|
this.rootSpan.setAttribute("agentv.target", target);
|
|
21868
|
-
if (evalSet) this.rootSpan.setAttribute("agentv.
|
|
22357
|
+
if (evalSet) this.rootSpan.setAttribute("agentv.suite", evalSet);
|
|
21869
22358
|
this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
|
|
21870
22359
|
}
|
|
21871
22360
|
/** Create and immediately export a tool span */
|
|
@@ -22221,12 +22710,244 @@ function extractToolResultContent(content) {
|
|
|
22221
22710
|
return parts.length > 0 ? parts.join("") : void 0;
|
|
22222
22711
|
}
|
|
22223
22712
|
|
|
22224
|
-
// src/import/
|
|
22713
|
+
// src/import/codex-parser.ts
|
|
22714
|
+
init_cjs_shims();
|
|
22715
|
+
/**
 * Parse the JSONL text of a Codex session rollout into a transcript entry.
 *
 * Each non-blank line is parsed as JSON; lines that are not valid JSON, are not
 * objects, or carry no `type` are skipped. Recognised entry types:
 *  - `session_meta`: session id, cwd, CLI version and (if not yet known) model.
 *  - `turn_context`: fills in model / cwd when still unknown.
 *  - `response_item`: user/assistant messages, tool calls and their outputs.
 *    `developer` messages and `reasoning` items are ignored.
 *
 * Tool-call outputs are attached to the assistant message that issued the call,
 * matched by `call_id`.
 *
 * @param {string} jsonl - Raw contents of the rollout file.
 * @returns {{messages: object[], source: object, tokenUsage: undefined,
 *   durationMs: (number|undefined), costUsd: null}} Parsed transcript entry.
 *   `durationMs` is the span between the first and last entry timestamps, or
 *   undefined when timestamps are missing or unparseable.
 */
function parseCodexSession(jsonl) {
  // Tool-call payloads arrive as a JSON-encoded string; fall back to the raw
  // value when it is not a string or not valid JSON.
  const decodeToolInput = (raw) => {
    if (typeof raw !== "string") return raw;
    try {
      return JSON.parse(raw);
    } catch {
      return raw;
    }
  };
  const messages = [];
  let sessionId = "";
  let cwd;
  let model;
  let version;
  let startTimestamp;
  let endTimestamp;
  // call_id -> location of the tool call awaiting its output.
  const pendingCalls = /* @__PURE__ */ new Map();
  const lines = jsonl.split("\n").filter((l) => l.trim().length > 0);
  for (const line of lines) {
    let entry;
    try {
      entry = JSON.parse(line);
    } catch {
      continue;
    }
    // A line such as `null` is valid JSON but has no properties to read.
    if (entry === null || typeof entry !== "object" || !entry.type) continue;
    if (entry.timestamp) {
      if (!startTimestamp) startTimestamp = entry.timestamp;
      endTimestamp = entry.timestamp;
    }
    const payload = entry.payload ?? {};
    switch (entry.type) {
      case "session_meta": {
        sessionId = String(payload.id ?? "");
        cwd = payload.cwd ? String(payload.cwd) : void 0;
        version = payload.cli_version ? String(payload.cli_version) : void 0;
        if (payload.model && !model) {
          model = String(payload.model);
        }
        break;
      }
      case "turn_context": {
        if (payload.model && !model) {
          model = String(payload.model);
        }
        if (payload.cwd && !cwd) {
          cwd = String(payload.cwd);
        }
        break;
      }
      case "response_item": {
        const itemType = String(payload.type ?? "");
        const role = String(payload.role ?? "");
        switch (itemType) {
          case "message": {
            if (role === "developer") break;
            const content = extractResponseItemContent(payload.content);
            if (role === "user" && content) {
              messages.push({ role: "user", content });
            } else if (role === "assistant" && content) {
              messages.push({ role: "assistant", content });
            }
            break;
          }
          case "function_call":
          case "custom_tool_call": {
            const toolName = String(payload.name ?? "");
            const callId = String(payload.call_id ?? "");
            // NOTE(review): custom tool calls appear to carry their payload in
            // `input` rather than `arguments` -- confirm against the Codex
            // rollout schema. `arguments` still takes precedence when present.
            const rawInput = itemType === "custom_tool_call" ? (payload.arguments ?? payload.input) : payload.arguments;
            const toolCall = { tool: toolName, input: decodeToolInput(rawInput), id: callId };
            const msgIdx = messages.length;
            messages.push({
              role: "assistant",
              toolCalls: [toolCall]
            });
            if (callId) {
              pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
            }
            break;
          }
          case "function_call_output":
          case "custom_tool_call_output": {
            const callId = String(payload.call_id ?? "");
            const pending = pendingCalls.get(callId);
            if (pending) {
              // Rebuild the message immutably with the output attached.
              const existingMsg = messages[pending.msgIdx];
              const existingCalls = [...existingMsg.toolCalls ?? []];
              existingCalls[pending.toolIdx] = {
                ...existingCalls[pending.toolIdx],
                output: payload.output
              };
              messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls };
              pendingCalls.delete(callId);
            }
            break;
          }
          // Skip reasoning blocks (thinking tokens)
          case "reasoning":
            break;
        }
        break;
      }
    }
  }
  let durationMs;
  if (startTimestamp && endTimestamp) {
    const elapsed = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
    // Unparseable timestamps yield NaN; report "unknown" instead.
    if (Number.isFinite(elapsed)) {
      durationMs = elapsed;
    }
  }
  const source = {
    provider: "codex",
    sessionId,
    cwd,
    startedAt: startTimestamp,
    model,
    version
  };
  return {
    messages,
    source,
    // Codex rollout files don't include token counts (only rate limit info)
    tokenUsage: void 0,
    durationMs,
    costUsd: null
  };
}
|
|
22864
|
+
/**
 * Flatten the `content` of a response item into plain text.
 *
 * @param {unknown} content - Either a string, or an array of blocks where text
 *   blocks expose a string `text` property.
 * @returns {string | undefined} The string as-is; the concatenated `text` of all
 *   qualifying blocks; or undefined when the input is neither a string nor an
 *   array, or no block carries a string `text`.
 */
function extractResponseItemContent(content) {
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return void 0;
  const texts = content
    .filter((block) => typeof block === "object" && block !== null && typeof block.text === "string")
    .map((block) => block.text);
  if (texts.length === 0) return void 0;
  return texts.join("");
}
|
|
22878
|
+
|
|
22879
|
+
// src/import/codex-session-discovery.ts
|
|
22225
22880
|
init_cjs_shims();
|
|
22226
22881
|
var import_promises36 = require("fs/promises");
|
|
22227
22882
|
var import_node_os8 = require("os");
|
|
22228
22883
|
var import_node_path53 = __toESM(require("path"), 1);
|
|
22229
|
-
var
|
|
22884
|
+
var DEFAULT_SESSIONS_DIR = () => import_node_path53.default.join((0, import_node_os8.homedir)(), ".codex", "sessions");
|
|
22885
|
+
/**
 * Find Codex rollout files under a `<year>/<month>/<day>/` directory tree.
 *
 * Only files named `rollout-*.jsonl` are considered. Directories that cannot be
 * listed are skipped; if the root itself cannot be listed the result is empty.
 *
 * @param {object} [opts]
 * @param {string} [opts.sessionsDir] - Root directory; defaults to DEFAULT_SESSIONS_DIR().
 * @param {boolean} [opts.latest] - When truthy, return at most one session.
 * @param {number} [opts.limit] - Maximum sessions to return (default 10).
 * @param {string} [opts.date] - Keep only the day directory whose
 *   `<year>-<month>-<day>` equals this string.
 * @returns {Promise<Array<{sessionId: string, filePath: string, filename: string,
 *   updatedAt: Date}>>} Sessions ordered by modification time, newest first.
 */
async function discoverCodexSessions(opts) {
  const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;
  // List a directory, reporting failure as null instead of throwing.
  const listEntries = async (dir) => {
    try {
      return await (0, import_promises36.readdir)(dir);
    } catch {
      return null;
    }
  };
  const yearDirs = await listEntries(sessionsDir);
  if (yearDirs === null) {
    return [];
  }
  const sessions = [];
  for (const year of yearDirs) {
    const yearPath = import_node_path53.default.join(sessionsDir, year);
    const monthDirs = await listEntries(yearPath);
    if (monthDirs === null) continue;
    for (const month of monthDirs) {
      const monthPath = import_node_path53.default.join(yearPath, month);
      const dayDirs = await listEntries(monthPath);
      if (dayDirs === null) continue;
      for (const day of dayDirs) {
        if (opts?.date && `${year}-${month}-${day}` !== opts.date) continue;
        const dayPath = import_node_path53.default.join(monthPath, day);
        const files = await listEntries(dayPath);
        if (files === null) continue;
        for (const file of files) {
          const isRollout = file.startsWith("rollout-") && file.endsWith(".jsonl");
          if (!isRollout) continue;
          const filePath = import_node_path53.default.join(dayPath, file);
          const stem = file.replace(/\.jsonl$/, "");
          const segments = stem.split("-");
          // The session id is the last five dash-separated segments of the
          // name when there are at least six; otherwise the whole stem.
          const sessionId = segments.length >= 6 ? segments.slice(-5).join("-") : stem;
          let updatedAt;
          try {
            updatedAt = (await (0, import_promises36.stat)(filePath)).mtime;
          } catch {
            // File could not be stat'ed: sort it last.
            updatedAt = /* @__PURE__ */ new Date(0);
          }
          sessions.push({ sessionId, filePath, filename: file, updatedAt });
        }
      }
    }
  }
  sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
  return sessions.slice(0, limit);
}
|
|
22944
|
+
|
|
22945
|
+
// src/import/session-discovery.ts
|
|
22946
|
+
init_cjs_shims();
|
|
22947
|
+
var import_promises37 = require("fs/promises");
|
|
22948
|
+
var import_node_os9 = require("os");
|
|
22949
|
+
var import_node_path54 = __toESM(require("path"), 1);
|
|
22950
|
+
var DEFAULT_PROJECTS_DIR = () => import_node_path54.default.join((0, import_node_os9.homedir)(), ".claude", "projects");
|
|
22230
22951
|
function encodeProjectPath(projectPath) {
|
|
22231
22952
|
return projectPath.replace(/\//g, "-");
|
|
22232
22953
|
}
|
|
@@ -22235,7 +22956,7 @@ async function discoverClaudeSessions(opts) {
|
|
|
22235
22956
|
const limit = opts?.latest ? 1 : opts?.limit ?? 10;
|
|
22236
22957
|
let projectDirs;
|
|
22237
22958
|
try {
|
|
22238
|
-
projectDirs = await (0,
|
|
22959
|
+
projectDirs = await (0, import_promises37.readdir)(projectsDir);
|
|
22239
22960
|
} catch {
|
|
22240
22961
|
return [];
|
|
22241
22962
|
}
|
|
@@ -22245,10 +22966,10 @@ async function discoverClaudeSessions(opts) {
|
|
|
22245
22966
|
}
|
|
22246
22967
|
const sessions = [];
|
|
22247
22968
|
for (const projectDir of projectDirs) {
|
|
22248
|
-
const dirPath =
|
|
22969
|
+
const dirPath = import_node_path54.default.join(projectsDir, projectDir);
|
|
22249
22970
|
let entries;
|
|
22250
22971
|
try {
|
|
22251
|
-
entries = await (0,
|
|
22972
|
+
entries = await (0, import_promises37.readdir)(dirPath);
|
|
22252
22973
|
} catch {
|
|
22253
22974
|
continue;
|
|
22254
22975
|
}
|
|
@@ -22256,10 +22977,10 @@ async function discoverClaudeSessions(opts) {
|
|
|
22256
22977
|
if (!entry.endsWith(".jsonl")) continue;
|
|
22257
22978
|
const sessionId = entry.replace(/\.jsonl$/, "");
|
|
22258
22979
|
if (opts?.sessionId && sessionId !== opts.sessionId) continue;
|
|
22259
|
-
const filePath =
|
|
22980
|
+
const filePath = import_node_path54.default.join(dirPath, entry);
|
|
22260
22981
|
let updatedAt;
|
|
22261
22982
|
try {
|
|
22262
|
-
const fileStat = await (0,
|
|
22983
|
+
const fileStat = await (0, import_promises37.stat)(filePath);
|
|
22263
22984
|
updatedAt = fileStat.mtime;
|
|
22264
22985
|
} catch {
|
|
22265
22986
|
updatedAt = /* @__PURE__ */ new Date(0);
|
|
@@ -22276,13 +22997,91 @@ async function discoverClaudeSessions(opts) {
|
|
|
22276
22997
|
return sessions.slice(0, limit);
|
|
22277
22998
|
}
|
|
22278
22999
|
|
|
23000
|
+
// src/import/transcript-provider.ts
|
|
23001
|
+
init_cjs_shims();
|
|
23002
|
+
|
|
22279
23003
|
// src/import/types.ts
|
|
22280
23004
|
init_cjs_shims();
|
|
22281
|
-
var
|
|
23005
|
+
var import_promises38 = require("fs/promises");
|
|
23006
|
+
/**
 * Convert a parsed transcript entry into its snake_case JSONL line shape.
 *
 * @param {object} entry - Transcript entry with `messages`, `source`, and
 *   optional `tokenUsage`, `durationMs`, `costUsd`.
 * @returns {object} Line object. `input` is the content of the first message
 *   whose role is "user" when that content is a string, otherwise "".
 *   `source.cwd` falls back to `source.projectPath` when `source.cwd` is nullish.
 */
function toTranscriptJsonLine(entry) {
  const { messages, tokenUsage, source } = entry;
  const firstUser = messages.find((message) => message.role === "user");
  let input = "";
  if (typeof firstUser?.content === "string") {
    input = firstUser.content;
  }
  let tokenUsageLine;
  if (tokenUsage) {
    tokenUsageLine = {
      input: tokenUsage.input,
      output: tokenUsage.output,
      cached: tokenUsage.cached
    };
  }
  const sourceLine = {
    provider: source.provider,
    session_id: source.sessionId,
    model: source.model,
    timestamp: source.startedAt,
    git_branch: source.gitBranch,
    cwd: source.cwd ?? source.projectPath,
    version: source.version
  };
  return {
    input,
    output: messages,
    token_usage: tokenUsageLine,
    duration_ms: entry.durationMs,
    cost_usd: entry.costUsd,
    source: sourceLine
  };
}
|
|
23030
|
+
/**
 * Read a transcript JSONL file and parse every non-blank line as JSON.
 *
 * @param {string} filePath - Path of the JSONL file to read (UTF-8).
 * @returns {Promise<object[]>} One parsed value per non-blank line, in file order.
 * @throws {SyntaxError} When a line is not valid JSON. The message names the
 *   file and the 1-based line number; the original parser error is kept as `cause`.
 */
async function readTranscriptJsonl(filePath) {
  const text = await (0, import_promises38.readFile)(filePath, "utf8");
  const records = [];
  const rows = text.split("\n");
  for (let index = 0; index < rows.length; index++) {
    const row = rows[index];
    if (row.trim().length === 0) continue;
    try {
      records.push(JSON.parse(row));
    } catch (error) {
      // Keep the SyntaxError type callers may rely on, but say where it happened.
      const reason = error instanceof Error ? error.message : String(error);
      throw new SyntaxError(`Invalid JSON on line ${index + 1} of ${filePath}: ${reason}`, { cause: error });
    }
  }
  return records;
}
|
|
22282
23034
|
async function readTranscriptFile(filePath) {
|
|
22283
|
-
return (0,
|
|
23035
|
+
return (0, import_promises38.readFile)(filePath, "utf8");
|
|
22284
23036
|
}
|
|
22285
23037
|
|
|
23038
|
+
// src/import/transcript-provider.ts
|
|
23039
|
+
var TranscriptProvider = class _TranscriptProvider {
|
|
23040
|
+
id;
|
|
23041
|
+
kind = "transcript";
|
|
23042
|
+
targetName;
|
|
23043
|
+
lines;
|
|
23044
|
+
cursor = 0;
|
|
23045
|
+
constructor(targetName, lines) {
|
|
23046
|
+
this.targetName = targetName;
|
|
23047
|
+
this.id = `transcript:${targetName}`;
|
|
23048
|
+
this.lines = lines;
|
|
23049
|
+
}
|
|
23050
|
+
/**
|
|
23051
|
+
* Create a TranscriptProvider from a JSONL file path.
|
|
23052
|
+
*/
|
|
23053
|
+
static async fromFile(filePath) {
|
|
23054
|
+
const lines = await readTranscriptJsonl(filePath);
|
|
23055
|
+
if (lines.length === 0) {
|
|
23056
|
+
throw new Error(`Transcript file is empty: ${filePath}`);
|
|
23057
|
+
}
|
|
23058
|
+
const providerName = lines[0].source.provider ?? "transcript";
|
|
23059
|
+
return new _TranscriptProvider(providerName, lines);
|
|
23060
|
+
}
|
|
23061
|
+
get lineCount() {
|
|
23062
|
+
return this.lines.length;
|
|
23063
|
+
}
|
|
23064
|
+
async invoke(_request) {
|
|
23065
|
+
if (this.cursor >= this.lines.length) {
|
|
23066
|
+
throw new Error(
|
|
23067
|
+
`Transcript exhausted: ${this.lines.length} line(s) available but ${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`
|
|
23068
|
+
);
|
|
23069
|
+
}
|
|
23070
|
+
const line = this.lines[this.cursor++];
|
|
23071
|
+
return {
|
|
23072
|
+
output: line.output,
|
|
23073
|
+
tokenUsage: line.token_usage ? {
|
|
23074
|
+
input: line.token_usage.input,
|
|
23075
|
+
output: line.token_usage.output,
|
|
23076
|
+
cached: line.token_usage.cached
|
|
23077
|
+
} : void 0,
|
|
23078
|
+
durationMs: line.duration_ms,
|
|
23079
|
+
costUsd: line.cost_usd ?? void 0,
|
|
23080
|
+
startTime: line.source.timestamp
|
|
23081
|
+
};
|
|
23082
|
+
}
|
|
23083
|
+
};
|
|
23084
|
+
|
|
22286
23085
|
// src/index.ts
|
|
22287
23086
|
function createAgentKernel() {
|
|
22288
23087
|
return { status: "stub" };
|
|
@@ -22297,6 +23096,7 @@ function createAgentKernel() {
|
|
|
22297
23096
|
DEFAULT_EVALUATOR_TEMPLATE,
|
|
22298
23097
|
DEFAULT_EVAL_PATTERNS,
|
|
22299
23098
|
DEFAULT_EXPLORATION_TOOLS,
|
|
23099
|
+
DEFAULT_THRESHOLD,
|
|
22300
23100
|
DeterministicAssertionEvaluator,
|
|
22301
23101
|
EvaluatorRegistry,
|
|
22302
23102
|
ExecutionMetricsEvaluator,
|
|
@@ -22318,6 +23118,7 @@ function createAgentKernel() {
|
|
|
22318
23118
|
TemplateNotFoundError,
|
|
22319
23119
|
TokenUsageEvaluator,
|
|
22320
23120
|
ToolTrajectoryEvaluator,
|
|
23121
|
+
TranscriptProvider,
|
|
22321
23122
|
WorkspaceCreationError,
|
|
22322
23123
|
WorkspacePoolManager,
|
|
22323
23124
|
addProject,
|
|
@@ -22354,6 +23155,7 @@ function createAgentKernel() {
|
|
|
22354
23155
|
detectFormat,
|
|
22355
23156
|
discoverAssertions,
|
|
22356
23157
|
discoverClaudeSessions,
|
|
23158
|
+
discoverCodexSessions,
|
|
22357
23159
|
discoverCopilotSessions,
|
|
22358
23160
|
discoverGraders,
|
|
22359
23161
|
discoverJudges,
|
|
@@ -22414,6 +23216,8 @@ function createAgentKernel() {
|
|
|
22414
23216
|
normalizeLineEndings,
|
|
22415
23217
|
parseAgentSkillsEvals,
|
|
22416
23218
|
parseClaudeSession,
|
|
23219
|
+
parseCodexSession,
|
|
23220
|
+
parseCopilotEvents,
|
|
22417
23221
|
parseJsonFromText,
|
|
22418
23222
|
parseJsonSafe,
|
|
22419
23223
|
readJsonFile,
|
|
@@ -22421,6 +23225,7 @@ function createAgentKernel() {
|
|
|
22421
23225
|
readTestSuiteMetadata,
|
|
22422
23226
|
readTextFile,
|
|
22423
23227
|
readTranscriptFile,
|
|
23228
|
+
readTranscriptJsonl,
|
|
22424
23229
|
removeProject,
|
|
22425
23230
|
resolveAndCreateProvider,
|
|
22426
23231
|
resolveDelegatedTargetDefinition,
|
|
@@ -22453,6 +23258,7 @@ function createAgentKernel() {
|
|
|
22453
23258
|
substituteVariables,
|
|
22454
23259
|
toCamelCaseDeep,
|
|
22455
23260
|
toSnakeCaseDeep,
|
|
23261
|
+
toTranscriptJsonLine,
|
|
22456
23262
|
tokensPerTool,
|
|
22457
23263
|
touchProject,
|
|
22458
23264
|
transpileEvalYaml,
|