@agentv/core 4.6.1 → 4.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -31,12 +31,9 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
31
31
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
32
32
 
33
33
  // ../../node_modules/.bun/tsup@8.3.5+19811ebab77a7b1c/node_modules/tsup/assets/cjs_shims.js
34
- var getImportMetaUrl, importMetaUrl;
35
34
  var init_cjs_shims = __esm({
36
35
  "../../node_modules/.bun/tsup@8.3.5+19811ebab77a7b1c/node_modules/tsup/assets/cjs_shims.js"() {
37
36
  "use strict";
38
- getImportMetaUrl = () => typeof document === "undefined" ? new URL(`file:${__filename}`).href : document.currentScript && document.currentScript.src || new URL("main.js", document.baseURI).href;
39
- importMetaUrl = /* @__PURE__ */ getImportMetaUrl();
40
37
  }
41
38
  });
42
39
 
@@ -1435,6 +1432,7 @@ __export(index_exports, {
1435
1432
  DEFAULT_EVALUATOR_TEMPLATE: () => DEFAULT_EVALUATOR_TEMPLATE,
1436
1433
  DEFAULT_EVAL_PATTERNS: () => DEFAULT_EVAL_PATTERNS,
1437
1434
  DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
1435
+ DEFAULT_THRESHOLD: () => DEFAULT_THRESHOLD,
1438
1436
  DeterministicAssertionEvaluator: () => DeterministicAssertionEvaluator,
1439
1437
  EvaluatorRegistry: () => EvaluatorRegistry,
1440
1438
  ExecutionMetricsEvaluator: () => ExecutionMetricsEvaluator,
@@ -1456,6 +1454,7 @@ __export(index_exports, {
1456
1454
  TemplateNotFoundError: () => TemplateNotFoundError,
1457
1455
  TokenUsageEvaluator: () => TokenUsageEvaluator,
1458
1456
  ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
1457
+ TranscriptProvider: () => TranscriptProvider,
1459
1458
  WorkspaceCreationError: () => WorkspaceCreationError,
1460
1459
  WorkspacePoolManager: () => WorkspacePoolManager,
1461
1460
  addProject: () => addProject,
@@ -1492,6 +1491,7 @@ __export(index_exports, {
1492
1491
  detectFormat: () => detectFormat,
1493
1492
  discoverAssertions: () => discoverAssertions,
1494
1493
  discoverClaudeSessions: () => discoverClaudeSessions,
1494
+ discoverCodexSessions: () => discoverCodexSessions,
1495
1495
  discoverCopilotSessions: () => discoverCopilotSessions,
1496
1496
  discoverGraders: () => discoverGraders,
1497
1497
  discoverJudges: () => discoverGraders,
@@ -1552,6 +1552,8 @@ __export(index_exports, {
1552
1552
  normalizeLineEndings: () => normalizeLineEndings,
1553
1553
  parseAgentSkillsEvals: () => parseAgentSkillsEvals,
1554
1554
  parseClaudeSession: () => parseClaudeSession,
1555
+ parseCodexSession: () => parseCodexSession,
1556
+ parseCopilotEvents: () => parseCopilotEvents,
1555
1557
  parseJsonFromText: () => parseJsonFromText,
1556
1558
  parseJsonSafe: () => parseJsonSafe,
1557
1559
  readJsonFile: () => readJsonFile,
@@ -1559,6 +1561,7 @@ __export(index_exports, {
1559
1561
  readTestSuiteMetadata: () => readTestSuiteMetadata,
1560
1562
  readTextFile: () => readTextFile,
1561
1563
  readTranscriptFile: () => readTranscriptFile,
1564
+ readTranscriptJsonl: () => readTranscriptJsonl,
1562
1565
  removeProject: () => removeProject,
1563
1566
  resolveAndCreateProvider: () => resolveAndCreateProvider,
1564
1567
  resolveDelegatedTargetDefinition: () => resolveDelegatedTargetDefinition,
@@ -1591,6 +1594,7 @@ __export(index_exports, {
1591
1594
  substituteVariables: () => substituteVariables,
1592
1595
  toCamelCaseDeep: () => toCamelCaseDeep,
1593
1596
  toSnakeCaseDeep: () => toSnakeCaseDeep,
1597
+ toTranscriptJsonLine: () => toTranscriptJsonLine,
1594
1598
  tokensPerTool: () => tokensPerTool,
1595
1599
  touchProject: () => touchProject,
1596
1600
  transpileEvalYaml: () => transpileEvalYaml,
@@ -2675,8 +2679,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2675
2679
  const negate = rawEvaluator.negate === true ? true : void 0;
2676
2680
  if (isCustomType) {
2677
2681
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
2678
- const required2 = parseRequired(rawEvaluator.required);
2679
- const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "negate"]);
2682
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
2683
+ rawEvaluator.required,
2684
+ rawEvaluator.min_score,
2685
+ name,
2686
+ evalId
2687
+ );
2688
+ const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "min_score", "negate"]);
2680
2689
  const config2 = {};
2681
2690
  for (const [key, value] of Object.entries(rawEvaluator)) {
2682
2691
  if (!knownProps2.has(key) && value !== void 0) {
@@ -2688,6 +2697,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2688
2697
  type: customTypeName,
2689
2698
  ...weight2 !== void 0 ? { weight: weight2 } : {},
2690
2699
  ...required2 !== void 0 ? { required: required2 } : {},
2700
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
2691
2701
  ...negate !== void 0 ? { negate } : {},
2692
2702
  ...Object.keys(config2).length > 0 ? { config: config2 } : {}
2693
2703
  });
@@ -2757,7 +2767,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2757
2767
  );
2758
2768
  }
2759
2769
  }
2760
- const required2 = parseRequired(rawEvaluator.required);
2770
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
2771
+ rawEvaluator.required,
2772
+ rawEvaluator.min_score,
2773
+ name,
2774
+ evalId
2775
+ );
2761
2776
  const knownProps2 = /* @__PURE__ */ new Set([
2762
2777
  "name",
2763
2778
  "type",
@@ -2783,6 +2798,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2783
2798
  resolvedCwd,
2784
2799
  ...weight2 !== void 0 ? { weight: weight2 } : {},
2785
2800
  ...required2 !== void 0 ? { required: required2 } : {},
2801
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
2786
2802
  ...negate !== void 0 ? { negate } : {},
2787
2803
  ...Object.keys(config2).length > 0 ? { config: config2 } : {},
2788
2804
  ...targetConfig !== void 0 ? { target: targetConfig } : {}
@@ -2911,7 +2927,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2911
2927
  };
2912
2928
  }
2913
2929
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
2914
- const required2 = parseRequired(rawEvaluator.required);
2930
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
2931
+ rawEvaluator.required,
2932
+ rawEvaluator.min_score,
2933
+ name,
2934
+ evalId
2935
+ );
2915
2936
  evaluators.push({
2916
2937
  name,
2917
2938
  type: "composite",
@@ -2919,6 +2940,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2919
2940
  aggregator,
2920
2941
  ...weight2 !== void 0 ? { weight: weight2 } : {},
2921
2942
  ...required2 !== void 0 ? { required: required2 } : {},
2943
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
2922
2944
  ...negate !== void 0 ? { negate } : {}
2923
2945
  });
2924
2946
  continue;
@@ -3029,7 +3051,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3029
3051
  continue;
3030
3052
  }
3031
3053
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3032
- const required2 = parseRequired(rawEvaluator.required);
3054
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3055
+ rawEvaluator.required,
3056
+ rawEvaluator.min_score,
3057
+ name,
3058
+ evalId
3059
+ );
3033
3060
  const config2 = {
3034
3061
  name,
3035
3062
  type: "tool-trajectory",
@@ -3038,6 +3065,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3038
3065
  ...expected ? { expected } : {},
3039
3066
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3040
3067
  ...required2 !== void 0 ? { required: required2 } : {},
3068
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3041
3069
  ...negate !== void 0 ? { negate } : {},
3042
3070
  ...argsMatch2 !== void 0 ? { argsMatch: argsMatch2 } : {}
3043
3071
  };
@@ -3100,7 +3128,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3100
3128
  const aggregation = asString(rawEvaluator.aggregation);
3101
3129
  const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
3102
3130
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3103
- const required2 = parseRequired(rawEvaluator.required);
3131
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3132
+ rawEvaluator.required,
3133
+ rawEvaluator.min_score,
3134
+ name,
3135
+ evalId
3136
+ );
3104
3137
  evaluators.push({
3105
3138
  name,
3106
3139
  type: "field-accuracy",
@@ -3108,6 +3141,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3108
3141
  ...validAggregation ? { aggregation: validAggregation } : {},
3109
3142
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3110
3143
  ...required2 !== void 0 ? { required: required2 } : {},
3144
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3111
3145
  ...negate !== void 0 ? { negate } : {}
3112
3146
  });
3113
3147
  continue;
@@ -3121,13 +3155,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3121
3155
  continue;
3122
3156
  }
3123
3157
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3124
- const required2 = parseRequired(rawEvaluator.required);
3158
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3159
+ rawEvaluator.required,
3160
+ rawEvaluator.min_score,
3161
+ name,
3162
+ evalId
3163
+ );
3125
3164
  evaluators.push({
3126
3165
  name,
3127
3166
  type: "latency",
3128
3167
  threshold,
3129
3168
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3130
3169
  ...required2 !== void 0 ? { required: required2 } : {},
3170
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3131
3171
  ...negate !== void 0 ? { negate } : {}
3132
3172
  });
3133
3173
  continue;
@@ -3141,13 +3181,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3141
3181
  continue;
3142
3182
  }
3143
3183
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3144
- const required2 = parseRequired(rawEvaluator.required);
3184
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3185
+ rawEvaluator.required,
3186
+ rawEvaluator.min_score,
3187
+ name,
3188
+ evalId
3189
+ );
3145
3190
  evaluators.push({
3146
3191
  name,
3147
3192
  type: "cost",
3148
3193
  budget,
3149
3194
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3150
3195
  ...required2 !== void 0 ? { required: required2 } : {},
3196
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3151
3197
  ...negate !== void 0 ? { negate } : {}
3152
3198
  });
3153
3199
  continue;
@@ -3179,13 +3225,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3179
3225
  continue;
3180
3226
  }
3181
3227
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3182
- const required2 = parseRequired(rawEvaluator.required);
3228
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3229
+ rawEvaluator.required,
3230
+ rawEvaluator.min_score,
3231
+ name,
3232
+ evalId
3233
+ );
3183
3234
  evaluators.push({
3184
3235
  name,
3185
3236
  type: "token-usage",
3186
3237
  ...validLimits,
3187
3238
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3188
3239
  ...required2 !== void 0 ? { required: required2 } : {},
3240
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3189
3241
  ...negate !== void 0 ? { negate } : {}
3190
3242
  });
3191
3243
  continue;
@@ -3231,13 +3283,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3231
3283
  continue;
3232
3284
  }
3233
3285
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3234
- const required2 = parseRequired(rawEvaluator.required);
3286
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3287
+ rawEvaluator.required,
3288
+ rawEvaluator.min_score,
3289
+ name,
3290
+ evalId
3291
+ );
3235
3292
  evaluators.push({
3236
3293
  name,
3237
3294
  type: "execution-metrics",
3238
3295
  ...validThresholds,
3239
3296
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3240
3297
  ...required2 !== void 0 ? { required: required2 } : {},
3298
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3241
3299
  ...negate !== void 0 ? { negate } : {}
3242
3300
  });
3243
3301
  continue;
@@ -3251,7 +3309,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3251
3309
  const rawShouldTrigger = rawEvaluator.should_trigger;
3252
3310
  const shouldTrigger = typeof rawShouldTrigger === "boolean" ? rawShouldTrigger : void 0;
3253
3311
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3254
- const required2 = parseRequired(rawEvaluator.required);
3312
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3313
+ rawEvaluator.required,
3314
+ rawEvaluator.min_score,
3315
+ name,
3316
+ evalId
3317
+ );
3255
3318
  evaluators.push({
3256
3319
  name,
3257
3320
  type: "skill-trigger",
@@ -3259,6 +3322,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3259
3322
  ...shouldTrigger !== void 0 ? { should_trigger: shouldTrigger } : {},
3260
3323
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3261
3324
  ...required2 !== void 0 ? { required: required2 } : {},
3325
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3262
3326
  ...negate !== void 0 ? { negate } : {}
3263
3327
  });
3264
3328
  continue;
@@ -3270,13 +3334,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3270
3334
  continue;
3271
3335
  }
3272
3336
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3273
- const required2 = parseRequired(rawEvaluator.required);
3337
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3338
+ rawEvaluator.required,
3339
+ rawEvaluator.min_score,
3340
+ name,
3341
+ evalId
3342
+ );
3274
3343
  evaluators.push({
3275
3344
  name,
3276
3345
  type: "contains",
3277
3346
  value,
3278
3347
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3279
3348
  ...required2 !== void 0 ? { required: required2 } : {},
3349
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3280
3350
  ...negate !== void 0 ? { negate } : {}
3281
3351
  });
3282
3352
  continue;
@@ -3290,13 +3360,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3290
3360
  continue;
3291
3361
  }
3292
3362
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3293
- const required2 = parseRequired(rawEvaluator.required);
3363
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3364
+ rawEvaluator.required,
3365
+ rawEvaluator.min_score,
3366
+ name,
3367
+ evalId
3368
+ );
3294
3369
  evaluators.push({
3295
3370
  name,
3296
3371
  type: typeValue,
3297
3372
  value,
3298
3373
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3299
3374
  ...required2 !== void 0 ? { required: required2 } : {},
3375
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3300
3376
  ...negate !== void 0 ? { negate } : {}
3301
3377
  });
3302
3378
  continue;
@@ -3308,13 +3384,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3308
3384
  continue;
3309
3385
  }
3310
3386
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3311
- const required2 = parseRequired(rawEvaluator.required);
3387
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3388
+ rawEvaluator.required,
3389
+ rawEvaluator.min_score,
3390
+ name,
3391
+ evalId
3392
+ );
3312
3393
  evaluators.push({
3313
3394
  name,
3314
3395
  type: "icontains",
3315
3396
  value,
3316
3397
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3317
3398
  ...required2 !== void 0 ? { required: required2 } : {},
3399
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3318
3400
  ...negate !== void 0 ? { negate } : {}
3319
3401
  });
3320
3402
  continue;
@@ -3328,13 +3410,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3328
3410
  continue;
3329
3411
  }
3330
3412
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3331
- const required2 = parseRequired(rawEvaluator.required);
3413
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3414
+ rawEvaluator.required,
3415
+ rawEvaluator.min_score,
3416
+ name,
3417
+ evalId
3418
+ );
3332
3419
  evaluators.push({
3333
3420
  name,
3334
3421
  type: typeValue,
3335
3422
  value,
3336
3423
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3337
3424
  ...required2 !== void 0 ? { required: required2 } : {},
3425
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3338
3426
  ...negate !== void 0 ? { negate } : {}
3339
3427
  });
3340
3428
  continue;
@@ -3346,13 +3434,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3346
3434
  continue;
3347
3435
  }
3348
3436
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3349
- const required2 = parseRequired(rawEvaluator.required);
3437
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3438
+ rawEvaluator.required,
3439
+ rawEvaluator.min_score,
3440
+ name,
3441
+ evalId
3442
+ );
3350
3443
  evaluators.push({
3351
3444
  name,
3352
3445
  type: typeValue,
3353
3446
  value,
3354
3447
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3355
3448
  ...required2 !== void 0 ? { required: required2 } : {},
3449
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3356
3450
  ...negate !== void 0 ? { negate } : {}
3357
3451
  });
3358
3452
  continue;
@@ -3365,7 +3459,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3365
3459
  }
3366
3460
  const flags = asString(rawEvaluator.flags);
3367
3461
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3368
- const required2 = parseRequired(rawEvaluator.required);
3462
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3463
+ rawEvaluator.required,
3464
+ rawEvaluator.min_score,
3465
+ name,
3466
+ evalId
3467
+ );
3369
3468
  evaluators.push({
3370
3469
  name,
3371
3470
  type: "regex",
@@ -3373,18 +3472,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3373
3472
  ...flags !== void 0 ? { flags } : {},
3374
3473
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3375
3474
  ...required2 !== void 0 ? { required: required2 } : {},
3475
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3376
3476
  ...negate !== void 0 ? { negate } : {}
3377
3477
  });
3378
3478
  continue;
3379
3479
  }
3380
3480
  if (typeValue === "is-json") {
3381
3481
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3382
- const required2 = parseRequired(rawEvaluator.required);
3482
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3483
+ rawEvaluator.required,
3484
+ rawEvaluator.min_score,
3485
+ name,
3486
+ evalId
3487
+ );
3383
3488
  evaluators.push({
3384
3489
  name,
3385
3490
  type: "is-json",
3386
3491
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3387
3492
  ...required2 !== void 0 ? { required: required2 } : {},
3493
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3388
3494
  ...negate !== void 0 ? { negate } : {}
3389
3495
  });
3390
3496
  continue;
@@ -3396,13 +3502,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3396
3502
  continue;
3397
3503
  }
3398
3504
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3399
- const required2 = parseRequired(rawEvaluator.required);
3505
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3506
+ rawEvaluator.required,
3507
+ rawEvaluator.min_score,
3508
+ name,
3509
+ evalId
3510
+ );
3400
3511
  evaluators.push({
3401
3512
  name,
3402
3513
  type: "equals",
3403
3514
  value,
3404
3515
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3405
3516
  ...required2 !== void 0 ? { required: required2 } : {},
3517
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3406
3518
  ...negate !== void 0 ? { negate } : {}
3407
3519
  });
3408
3520
  continue;
@@ -3438,7 +3550,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3438
3550
  continue;
3439
3551
  }
3440
3552
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3441
- const required2 = parseRequired(rawEvaluator.required);
3553
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3554
+ rawEvaluator.required,
3555
+ rawEvaluator.min_score,
3556
+ name,
3557
+ evalId
3558
+ );
3442
3559
  evaluators.push({
3443
3560
  name,
3444
3561
  type: "llm-grader",
@@ -3446,6 +3563,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3446
3563
  ...graderTargetName ? { target: graderTargetName } : {},
3447
3564
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3448
3565
  ...required2 !== void 0 ? { required: required2 } : {},
3566
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3449
3567
  ...negate !== void 0 ? { negate } : {}
3450
3568
  });
3451
3569
  continue;
@@ -3515,7 +3633,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3515
3633
  continue;
3516
3634
  }
3517
3635
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3518
- const required2 = parseRequired(rawEvaluator.required);
3636
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3637
+ rawEvaluator.required,
3638
+ rawEvaluator.min_score,
3639
+ name,
3640
+ evalId
3641
+ );
3519
3642
  evaluators.push({
3520
3643
  name,
3521
3644
  type: "llm-grader",
@@ -3523,12 +3646,18 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3523
3646
  ...graderTargetName ? { target: graderTargetName } : {},
3524
3647
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3525
3648
  ...required2 !== void 0 ? { required: required2 } : {},
3649
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3526
3650
  ...negate !== void 0 ? { negate } : {}
3527
3651
  });
3528
3652
  continue;
3529
3653
  }
3530
3654
  const weight = validateWeight(rawEvaluator.weight, name, evalId);
3531
- const required = parseRequired(rawEvaluator.required);
3655
+ const { required, min_score } = parseRequiredAndMinScore(
3656
+ rawEvaluator.required,
3657
+ rawEvaluator.min_score,
3658
+ name,
3659
+ evalId
3660
+ );
3532
3661
  const knownProps = /* @__PURE__ */ new Set([
3533
3662
  "name",
3534
3663
  "type",
@@ -3539,6 +3668,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3539
3668
  "weight",
3540
3669
  "config",
3541
3670
  "required",
3671
+ "min_score",
3542
3672
  "negate",
3543
3673
  "max_steps",
3544
3674
  "maxSteps",
@@ -3568,6 +3698,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3568
3698
  ...graderTargetName ? { target: graderTargetName } : {},
3569
3699
  ...weight !== void 0 ? { weight } : {},
3570
3700
  ...required !== void 0 ? { required } : {},
3701
+ ...min_score !== void 0 ? { min_score } : {},
3571
3702
  ...negate !== void 0 ? { negate } : {},
3572
3703
  ...finalConfig ? { config: finalConfig } : {},
3573
3704
  ...llmMaxSteps !== void 0 ? { max_steps: llmMaxSteps } : {},
@@ -3699,10 +3830,23 @@ ${detailBlock}${ANSI_RESET5}`);
3699
3830
  console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET5}`);
3700
3831
  }
3701
3832
  }
3702
- function parseRequired(value) {
3703
- if (value === true) return true;
3704
- if (typeof value === "number" && value > 0 && value <= 1) return value;
3705
- return void 0;
3833
+ function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
3834
+ const result = {};
3835
+ if (typeof rawMinScore === "number" && rawMinScore > 0 && rawMinScore <= 1) {
3836
+ result.min_score = rawMinScore;
3837
+ }
3838
+ if (rawRequired === true) {
3839
+ result.required = true;
3840
+ } else if (typeof rawRequired === "number" && rawRequired > 0 && rawRequired <= 1) {
3841
+ if (result.min_score === void 0) {
3842
+ result.min_score = rawRequired;
3843
+ }
3844
+ result.required = rawRequired;
3845
+ logWarning2(
3846
+ `Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
3847
+ );
3848
+ }
3849
+ return result;
3706
3850
  }
3707
3851
  function validateWeight(rawWeight, evaluatorName, evalId) {
3708
3852
  if (rawWeight === void 0) {
@@ -3745,16 +3889,30 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
3745
3889
  const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
3746
3890
  const expectedOutcome = asString(rawRubric.outcome) ?? "";
3747
3891
  const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
3892
+ let minScore;
3748
3893
  let requiredMinScore;
3749
3894
  let required;
3750
- if (typeof rawRubric.required_min_score === "number") {
3751
- const minScore = rawRubric.required_min_score;
3752
- if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
3895
+ if (typeof rawRubric.min_score === "number") {
3896
+ const ms = rawRubric.min_score;
3897
+ if (ms <= 0 || ms > 1) {
3753
3898
  throw new Error(
3754
- `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
3899
+ `Invalid min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be in (0, 1] (got ${ms})`
3755
3900
  );
3756
3901
  }
3757
- requiredMinScore = minScore;
3902
+ minScore = ms;
3903
+ requiredMinScore = Math.round(ms * 10);
3904
+ } else if (typeof rawRubric.required_min_score === "number") {
3905
+ const rms = rawRubric.required_min_score;
3906
+ if (!Number.isInteger(rms) || rms < 0 || rms > 10) {
3907
+ throw new Error(
3908
+ `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`
3909
+ );
3910
+ }
3911
+ requiredMinScore = rms;
3912
+ minScore = rms / 10;
3913
+ logWarning2(
3914
+ `Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. Use 'min_score: ${rms / 10}' (0-1 scale) instead.`
3915
+ );
3758
3916
  }
3759
3917
  if (typeof rawRubric.required === "boolean") {
3760
3918
  required = rawRubric.required;
@@ -3774,6 +3932,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
3774
3932
  weight,
3775
3933
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
3776
3934
  ...required !== void 0 ? { required } : {},
3935
+ ...minScore !== void 0 ? { min_score: minScore } : {},
3777
3936
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
3778
3937
  score_ranges: scoreRanges
3779
3938
  });
@@ -3790,6 +3949,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
3790
3949
  weight,
3791
3950
  // Default to required: true if not specified (backward compatibility)
3792
3951
  required: required ?? true,
3952
+ ...minScore !== void 0 ? { min_score: minScore } : {},
3793
3953
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
3794
3954
  });
3795
3955
  }
@@ -3918,12 +4078,22 @@ function parseInlineRubrics(rawRubrics) {
3918
4078
  id: asString(rubric.id) ?? `rubric-${index + 1}`,
3919
4079
  weight: typeof rubric.weight === "number" ? rubric.weight : 1
3920
4080
  };
4081
+ let inlineMinScore;
4082
+ let inlineRequiredMinScore;
4083
+ if (typeof rubric.min_score === "number") {
4084
+ inlineMinScore = rubric.min_score;
4085
+ inlineRequiredMinScore = Math.round(inlineMinScore * 10);
4086
+ } else if (typeof rubric.required_min_score === "number") {
4087
+ inlineRequiredMinScore = rubric.required_min_score;
4088
+ inlineMinScore = inlineRequiredMinScore / 10;
4089
+ }
3921
4090
  if (scoreRanges && scoreRanges.length > 0) {
3922
4091
  return {
3923
4092
  ...baseRubric,
3924
4093
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
3925
4094
  ...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
3926
- ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
4095
+ ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
4096
+ ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {},
3927
4097
  score_ranges: scoreRanges
3928
4098
  };
3929
4099
  }
@@ -3931,7 +4101,8 @@ function parseInlineRubrics(rawRubrics) {
3931
4101
  ...baseRubric,
3932
4102
  outcome: expectedOutcome,
3933
4103
  required: typeof rubric.required === "boolean" ? rubric.required : true,
3934
- ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
4104
+ ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
4105
+ ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {}
3935
4106
  };
3936
4107
  }).filter((r) => r.outcome && r.outcome.length > 0 || "score_ranges" in r && r.score_ranges);
3937
4108
  if (rubricItems.length === 0) {
@@ -4335,6 +4506,9 @@ function resolveExpectedMessages(raw) {
4335
4506
  var ANSI_YELLOW6 = "\x1B[33m";
4336
4507
  var ANSI_RED2 = "\x1B[31m";
4337
4508
  var ANSI_RESET7 = "\x1B[0m";
4509
+ function matchesFilter(id, filter) {
4510
+ return typeof filter === "string" ? import_micromatch.default.isMatch(id, filter) : filter.some((pattern) => import_micromatch.default.isMatch(id, pattern));
4511
+ }
4338
4512
  function detectFormat(filePath) {
4339
4513
  const ext = import_node_path7.default.extname(filePath).toLowerCase();
4340
4514
  if (ext === ".jsonl") return "jsonl";
@@ -4402,40 +4576,40 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4402
4576
  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
4403
4577
  const rawFile = await (0, import_promises7.readFile)(absoluteTestPath, "utf8");
4404
4578
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
4405
- const fallbackEvalSet = import_node_path7.default.basename(absoluteTestPath, ".jsonl") || "eval";
4406
- const evalSetName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackEvalSet;
4579
+ const fallbackSuiteName = import_node_path7.default.basename(absoluteTestPath, ".jsonl") || "eval";
4580
+ const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
4407
4581
  const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
4408
4582
  const globalExecution = sidecar.execution;
4409
4583
  if (verbose) {
4410
4584
  console.log(`
4411
- [JSONL Dataset: ${evalFilePath}]`);
4585
+ [JSONL Suite: ${evalFilePath}]`);
4412
4586
  console.log(` Cases: ${rawCases.length}`);
4413
- console.log(` Eval set: ${evalSetName}`);
4587
+ console.log(` Suite: ${suiteName}`);
4414
4588
  if (sidecar.description) {
4415
4589
  console.log(` Description: ${sidecar.description}`);
4416
4590
  }
4417
4591
  }
4418
4592
  const results = [];
4419
4593
  for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
4420
- const evalcase = rawCases[lineIndex];
4594
+ const testCaseConfig = rawCases[lineIndex];
4421
4595
  const lineNumber = lineIndex + 1;
4422
- const id = asString4(evalcase.id);
4423
- if (filterPattern && (!id || !import_micromatch.default.isMatch(id, filterPattern))) {
4596
+ const id = asString4(testCaseConfig.id);
4597
+ if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
4424
4598
  continue;
4425
4599
  }
4426
- const conversationId = asString4(evalcase.conversation_id);
4427
- let outcome = asString4(evalcase.criteria);
4428
- if (!outcome && evalcase.expected_outcome !== void 0) {
4429
- outcome = asString4(evalcase.expected_outcome);
4600
+ const conversationId = asString4(testCaseConfig.conversation_id);
4601
+ let outcome = asString4(testCaseConfig.criteria);
4602
+ if (!outcome && testCaseConfig.expected_outcome !== void 0) {
4603
+ outcome = asString4(testCaseConfig.expected_outcome);
4430
4604
  if (outcome) {
4431
4605
  logWarning4(
4432
- `Test '${asString4(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
4606
+ `Test '${asString4(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
4433
4607
  );
4434
4608
  }
4435
4609
  }
4436
- const rawInputMessages = resolveInputMessages(evalcase);
4437
- const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
4438
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
4610
+ const rawInputMessages = resolveInputMessages(testCaseConfig);
4611
+ const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
4612
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== void 0;
4439
4613
  if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
4440
4614
  logError2(
4441
4615
  `Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
@@ -4472,18 +4646,23 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4472
4646
  }
4473
4647
  }
4474
4648
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
4475
- const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
4649
+ const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
4476
4650
  const mergedExecution = caseExecution ?? globalExecution;
4477
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
4651
+ const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
4478
4652
  let evaluators;
4479
4653
  try {
4480
- evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
4654
+ evaluators = await parseEvaluators(
4655
+ testCaseConfig,
4656
+ mergedExecution,
4657
+ searchRoots,
4658
+ id ?? "unknown"
4659
+ );
4481
4660
  } catch (error) {
4482
4661
  const message = error instanceof Error ? error.message : String(error);
4483
4662
  logError2(`Skipping test '${id}' at line ${lineNumber}: ${message}`);
4484
4663
  continue;
4485
4664
  }
4486
- const inlineRubrics = evalcase.rubrics;
4665
+ const inlineRubrics = testCaseConfig.rubrics;
4487
4666
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
4488
4667
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
4489
4668
  if (rubricEvaluator) {
@@ -4494,7 +4673,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4494
4673
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
4495
4674
  const testCase = {
4496
4675
  id,
4497
- dataset: evalSetName,
4676
+ suite: suiteName,
4498
4677
  conversation_id: conversationId,
4499
4678
  question,
4500
4679
  input: inputMessages,
@@ -4502,7 +4681,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4502
4681
  reference_answer: referenceAnswer,
4503
4682
  file_paths: userFilePaths,
4504
4683
  criteria: outcome ?? "",
4505
- evaluator: evalCaseEvaluatorKind,
4684
+ evaluator: testCaseEvaluatorKind,
4506
4685
  assertions: evaluators
4507
4686
  };
4508
4687
  results.push(testCase);
@@ -4687,6 +4866,9 @@ function buildChatPromptFromSegments(options) {
4687
4866
  var ANSI_YELLOW7 = "\x1B[33m";
4688
4867
  var ANSI_RED3 = "\x1B[31m";
4689
4868
  var ANSI_RESET8 = "\x1B[0m";
4869
/**
 * Checks whether a test id is selected by a filter.
 *
 * @param {string} id - Test id to check.
 * @param {string | string[]} filter - A single glob pattern, or a list of glob
 *   patterns of which at least one must match.
 * @returns {boolean} True when the id matches the pattern (or any pattern of the list).
 */
function matchesFilter2(id, filter) {
  const isMatch = (pattern) => {
    return import_micromatch2.default.isMatch(id, pattern);
  };
  if (typeof filter === "string") {
    return isMatch(filter);
  }
  // Non-string filters are treated as a list: one matching pattern is enough.
  return filter.some((pattern) => isMatch(pattern));
}
4690
4872
  function resolveTests(suite) {
4691
4873
  if (suite.tests !== void 0) return suite.tests;
4692
4874
  if (suite.eval_cases !== void 0) {
@@ -4766,18 +4948,18 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4766
4948
  throw new Error(`Invalid test file format: ${evalFilePath}`);
4767
4949
  }
4768
4950
  const suite = interpolated;
4769
- const evalSetNameFromSuite = asString5(suite.name)?.trim();
4770
- const fallbackEvalSet = import_node_path8.default.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
4771
- const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
4772
- const rawTestcases = resolveTests(suite);
4951
+ const suiteNameFromFile = asString5(suite.name)?.trim();
4952
+ const fallbackSuiteName = import_node_path8.default.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
4953
+ const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
4954
+ const rawTestCases = resolveTests(suite);
4773
4955
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
4774
4956
  const evalFileDir = import_node_path8.default.dirname(absoluteTestPath);
4775
- let expandedTestcases;
4776
- if (typeof rawTestcases === "string") {
4777
- const externalPath = import_node_path8.default.resolve(evalFileDir, rawTestcases);
4778
- expandedTestcases = await loadCasesFromFile(externalPath);
4779
- } else if (Array.isArray(rawTestcases)) {
4780
- expandedTestcases = await expandFileReferences(rawTestcases, evalFileDir);
4957
+ let expandedTestCases;
4958
+ if (typeof rawTestCases === "string") {
4959
+ const externalPath = import_node_path8.default.resolve(evalFileDir, rawTestCases);
4960
+ expandedTestCases = await loadCasesFromFile(externalPath);
4961
+ } else if (Array.isArray(rawTestCases)) {
4962
+ expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
4781
4963
  } else {
4782
4964
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
4783
4965
  }
@@ -4792,32 +4974,33 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4792
4974
  }
4793
4975
  const globalExecution = suiteAssertions !== void 0 ? { ...rawGlobalExecution ?? {}, assertions: suiteAssertions } : rawGlobalExecution;
4794
4976
  const results = [];
4795
- for (const rawEvalcase of expandedTestcases) {
4796
- if (!isJsonObject(rawEvalcase)) {
4977
+ for (const rawTestCase of expandedTestCases) {
4978
+ if (!isJsonObject(rawTestCase)) {
4797
4979
  logWarning5("Skipping invalid test entry (expected object)");
4798
4980
  continue;
4799
4981
  }
4800
- const evalcase = rawEvalcase;
4801
- const id = asString5(evalcase.id);
4802
- if (filterPattern && (!id || !import_micromatch2.default.isMatch(id, filterPattern))) {
4982
+ const testCaseConfig = rawTestCase;
4983
+ const id = asString5(testCaseConfig.id);
4984
+ if (filterPattern && (!id || !matchesFilter2(id, filterPattern))) {
4803
4985
  continue;
4804
4986
  }
4805
- const conversationId = asString5(evalcase.conversation_id);
4806
- let outcome = asString5(evalcase.criteria);
4807
- if (!outcome && evalcase.expected_outcome !== void 0) {
4808
- outcome = asString5(evalcase.expected_outcome);
4987
+ const conversationId = asString5(testCaseConfig.conversation_id);
4988
+ let outcome = asString5(testCaseConfig.criteria);
4989
+ if (!outcome && testCaseConfig.expected_outcome !== void 0) {
4990
+ outcome = asString5(testCaseConfig.expected_outcome);
4809
4991
  if (outcome) {
4810
4992
  logWarning5(
4811
- `Test '${asString5(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
4993
+ `Test '${asString5(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
4812
4994
  );
4813
4995
  }
4814
4996
  }
4815
- const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
4997
+ const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
4816
4998
  const skipDefaults = caseExecution?.skip_defaults === true;
4999
+ const caseThreshold = typeof caseExecution?.threshold === "number" && caseExecution.threshold >= 0 && caseExecution.threshold <= 1 ? caseExecution.threshold : void 0;
4817
5000
  const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
4818
- const testInputMessages = resolveInputMessages(evalcase, effectiveSuiteInputFiles);
4819
- const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
4820
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assertions !== void 0 || evalcase.assert !== void 0;
5001
+ const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
5002
+ const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
5003
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
4821
5004
  if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
4822
5005
  logError3(
4823
5006
  `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
@@ -4864,16 +5047,21 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4864
5047
  }
4865
5048
  }
4866
5049
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
4867
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
5050
+ const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
4868
5051
  let evaluators;
4869
5052
  try {
4870
- evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
5053
+ evaluators = await parseEvaluators(
5054
+ testCaseConfig,
5055
+ globalExecution,
5056
+ searchRoots,
5057
+ id ?? "unknown"
5058
+ );
4871
5059
  } catch (error) {
4872
5060
  const message = error instanceof Error ? error.message : String(error);
4873
5061
  logError3(`Skipping test '${id}': ${message}`);
4874
5062
  continue;
4875
5063
  }
4876
- const inlineRubrics = evalcase.rubrics;
5064
+ const inlineRubrics = testCaseConfig.rubrics;
4877
5065
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
4878
5066
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
4879
5067
  if (rubricEvaluator) {
@@ -4882,13 +5070,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4882
5070
  }
4883
5071
  warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
4884
5072
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
4885
- const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
5073
+ const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir);
4886
5074
  const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
4887
- const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
4888
- const caseTargets = extractTargetsFromTestCase(evalcase);
5075
+ const metadata = isJsonObject(testCaseConfig.metadata) ? testCaseConfig.metadata : void 0;
5076
+ const caseTargets = extractTargetsFromTestCase(testCaseConfig);
4889
5077
  const testCase = {
4890
5078
  id,
4891
- dataset: evalSetName,
5079
+ suite: suiteName,
4892
5080
  category: options?.category,
4893
5081
  conversation_id: conversationId,
4894
5082
  question,
@@ -4897,11 +5085,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4897
5085
  reference_answer: referenceAnswer,
4898
5086
  file_paths: userFilePaths,
4899
5087
  criteria: outcome ?? "",
4900
- evaluator: evalCaseEvaluatorKind,
5088
+ evaluator: testCaseEvaluatorKind,
4901
5089
  assertions: evaluators,
4902
5090
  workspace: mergedWorkspace,
4903
5091
  metadata,
4904
- targets: caseTargets
5092
+ targets: caseTargets,
5093
+ ...caseThreshold !== void 0 ? { threshold: caseThreshold } : {}
4905
5094
  };
4906
5095
  results.push(testCase);
4907
5096
  }
@@ -5567,7 +5756,7 @@ var AzureProvider = class {
5567
5756
  };
5568
5757
  this.retryConfig = config.retry;
5569
5758
  const azure = (0, import_azure2.createAzure)(buildAzureOptions(config));
5570
- this.model = azure.chat(config.deploymentName);
5759
+ this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
5571
5760
  }
5572
5761
  id;
5573
5762
  kind = "azure";
@@ -5693,7 +5882,9 @@ function buildAzureOptions(config) {
5693
5882
  const options = {
5694
5883
  apiKey: config.apiKey,
5695
5884
  apiVersion: config.version,
5696
- useDeploymentBasedUrls: true
5885
+ // Chat completions still use deployment-scoped Azure URLs for compatibility
5886
+ // with existing deployments. Responses API should use the SDK's v1 path.
5887
+ useDeploymentBasedUrls: config.apiFormat !== "responses"
5697
5888
  };
5698
5889
  const baseURL = normalizeAzureBaseUrl(config.resourceName);
5699
5890
  if (baseURL) {
@@ -8910,6 +9101,25 @@ var CopilotSdkProvider = class {
8910
9101
  content: systemPrompt
8911
9102
  };
8912
9103
  }
9104
+ if (this.config.byokBaseUrl) {
9105
+ const byokType = this.config.byokType ?? "openai";
9106
+ const provider = {
9107
+ type: byokType,
9108
+ baseUrl: normalizeByokBaseUrl(this.config.byokBaseUrl, byokType)
9109
+ };
9110
+ if (this.config.byokBearerToken) {
9111
+ provider.bearerToken = this.config.byokBearerToken;
9112
+ } else if (this.config.byokApiKey) {
9113
+ provider.apiKey = this.config.byokApiKey;
9114
+ }
9115
+ if (this.config.byokWireApi) {
9116
+ provider.wireApi = this.config.byokWireApi;
9117
+ }
9118
+ if (this.config.byokType === "azure" && this.config.byokApiVersion) {
9119
+ provider.azure = { apiVersion: this.config.byokApiVersion };
9120
+ }
9121
+ sessionOptions.provider = provider;
9122
+ }
8913
9123
  let session;
8914
9124
  try {
8915
9125
  session = await client.createSession(sessionOptions);
@@ -9141,6 +9351,16 @@ function resolveSkillDirectories(cwd) {
9141
9351
  ];
9142
9352
  return candidates.filter((dir) => (0, import_node_fs8.existsSync)(dir));
9143
9353
  }
9354
/**
 * Normalizes the base URL of a bring-your-own-key provider.
 *
 * Surrounding whitespace and trailing slashes are always removed. Values that
 * already carry an http(s) scheme are otherwise returned untouched. For the
 * "azure" type a scheme-less value is expanded to a full endpoint:
 *   - a bare resource name ("my-res") becomes "https://my-res.openai.azure.com"
 *   - a host name ("my-res.openai.azure.com") only gets the "https://" scheme
 *
 * @param {string} baseUrl - Raw base URL, host name, or Azure resource name.
 * @param {string} type - Provider type; only "azure" triggers expansion.
 * @returns {string} The normalized URL; an empty string when the input is blank.
 */
function normalizeByokBaseUrl(baseUrl, type) {
  const trimmed = baseUrl.trim().replace(/\/+$/, "");
  // Blank input stays blank instead of turning into "https://.openai.azure.com".
  if (trimmed.length === 0 || /^https?:\/\//i.test(trimmed)) {
    return trimmed;
  }
  if (type === "azure") {
    // A dot means the caller already supplied a host name, so appending the
    // Azure suffix again would produce an unreachable, doubled domain.
    return trimmed.includes(".") ? `https://${trimmed}` : `https://${trimmed}.openai.azure.com`;
  }
  return trimmed;
}
9144
9364
  function summarizeSdkEvent(eventType, data) {
9145
9365
  if (!data || typeof data !== "object") {
9146
9366
  return eventType;
@@ -9322,6 +9542,22 @@ function extractAzureResourceName(baseUrl) {
9322
9542
  if (urlMatch) return urlMatch[1];
9323
9543
  return baseUrl;
9324
9544
  }
9545
/**
 * Turns an Azure resource name, host name, or endpoint URL into a base URL
 * ending in "/openai/v1".
 *
 * Surrounding whitespace and trailing slashes are removed first. Then:
 *   - blank input is returned as an empty string
 *   - a bare resource name ("my-res") expands to
 *     "https://my-res.openai.azure.com/openai/v1"
 *   - a scheme-less host name ("my-res.openai.azure.com") gets "https://"
 *   - a URL already ending in "/openai/v1" is kept, one ending in "/openai"
 *     gets "/v1" appended, anything else gets "/openai/v1" appended
 *
 * @param {string} baseUrl - Raw resource name, host name, or URL.
 * @returns {string} The normalized base URL, or "" for blank input.
 */
function normalizeAzureSdkBaseUrl(baseUrl) {
  const trimmed = baseUrl.trim().replace(/\/+$/, "");
  if (!trimmed) {
    return trimmed;
  }
  let endpoint = trimmed;
  if (!/^https?:\/\//i.test(trimmed)) {
    if (!trimmed.includes(".")) {
      // No scheme and no dot: treat the value as a bare resource name.
      return `https://${trimmed}.openai.azure.com/openai/v1`;
    }
    // A dot means a host name was supplied; appending the Azure suffix again
    // would yield a doubled, unreachable domain, so only add the scheme.
    endpoint = `https://${trimmed}`;
  }
  if (/\/openai\/v1$/i.test(endpoint)) {
    return endpoint;
  }
  if (/\/openai$/i.test(endpoint)) {
    return `${endpoint}/v1`;
  }
  return `${endpoint}/openai/v1`;
}
9325
9561
 
9326
9562
  // src/evaluation/providers/pi-utils.ts
9327
9563
  init_cjs_shims();
@@ -10156,9 +10392,40 @@ var import_node_child_process5 = require("child_process");
10156
10392
  var import_node_crypto8 = require("crypto");
10157
10393
  var import_node_fs10 = require("fs");
10158
10394
  var import_promises19 = require("fs/promises");
10159
- var import_node_path22 = __toESM(require("path"), 1);
10395
+ var import_node_path23 = __toESM(require("path"), 1);
10160
10396
  var import_node_readline = require("readline");
10161
10397
  var import_node_url3 = require("url");
10398
+
10399
+ // src/paths.ts
10400
+ init_cjs_shims();
10401
+ var import_node_os6 = __toESM(require("os"), 1);
10402
+ var import_node_path22 = __toESM(require("path"), 1);
10403
+ var logged = false;
10404
/**
 * Resolves the AgentV home directory.
 *
 * The AGENTV_HOME environment variable wins when it is set to a non-empty
 * value other than the literal string "undefined" (presumably guarding against
 * a variable that was assigned from an undefined value; confirm with callers).
 * The first time the override is used, a warning naming it is printed once per
 * process. Without an override the directory is ".agentv" under the user's
 * home directory.
 *
 * @returns {string} Path of the AgentV home directory.
 */
function getAgentvHome() {
  const override = process.env.AGENTV_HOME;
  const hasOverride = Boolean(override) && override !== "undefined";
  if (!hasOverride) {
    return import_node_path22.default.join(import_node_os6.default.homedir(), ".agentv");
  }
  // Module-level flag keeps the notice from repeating on every lookup.
  if (!logged) {
    logged = true;
    console.warn(`Using AGENTV_HOME: ${override}`);
  }
  return override;
}
10415
/**
 * @returns {string} The "workspaces" directory inside the AgentV home directory.
 */
function getWorkspacesRoot() {
  const agentvHome = getAgentvHome();
  return import_node_path22.default.join(agentvHome, "workspaces");
}
10418
/**
 * @returns {string} The "subagents" directory inside the AgentV home directory.
 */
function getSubagentsRoot() {
  const agentvHome = getAgentvHome();
  return import_node_path22.default.join(agentvHome, "subagents");
}
10421
/**
 * @returns {string} The "trace-state" directory inside the AgentV home directory.
 */
function getTraceStateRoot() {
  const agentvHome = getAgentvHome();
  return import_node_path22.default.join(agentvHome, "trace-state");
}
10424
/**
 * @returns {string} The "workspace-pool" directory inside the AgentV home directory.
 */
function getWorkspacePoolRoot() {
  const agentvHome = getAgentvHome();
  return import_node_path22.default.join(agentvHome, "workspace-pool");
}
10427
+
10428
+ // src/evaluation/providers/pi-coding-agent.ts
10162
10429
  var piCodingAgentModule = null;
10163
10430
  var piAiModule = null;
10164
10431
  var loadingPromise = null;
@@ -10176,46 +10443,126 @@ async function promptInstall() {
10176
10443
  rl.close();
10177
10444
  }
10178
10445
  }
10179
- function findAgentvRoot() {
10180
- const thisFile = (0, import_node_url3.fileURLToPath)(importMetaUrl);
10181
- let dir = import_node_path22.default.dirname(thisFile);
10182
- for (let i = 0; i < 10; i++) {
10446
/**
 * @returns {string} The "deps/pi-sdk" directory inside the AgentV home
 *   directory, used as the install location for the pi SDK packages.
 */
function findManagedSdkInstallRoot() {
  const segments = [getAgentvHome(), "deps", "pi-sdk"];
  return import_node_path23.default.join(...segments);
}
10449
/**
 * Asks npm for its global node_modules directory by running "npm root -g".
 *
 * stdin and stderr of the child are ignored; only stdout is captured.
 *
 * @returns {string | undefined} The trimmed command output, or undefined when
 *   the command fails or prints nothing.
 */
function resolveGlobalNpmRoot() {
  const execOptions = {
    encoding: "utf-8",
    stdio: ["ignore", "pipe", "ignore"]
  };
  try {
    const stdout = (0, import_node_child_process5.execSync)("npm root -g", execOptions);
    const root = stdout.trim();
    if (root.length > 0) {
      return root;
    }
    return void 0;
  } catch {
    // Any failure of the command is reported as "no global root available".
    return void 0;
  }
}
10460
/**
 * Builds the path of a package's "dist/index.js" under a global npm root.
 *
 * @param {string} moduleName - Package name; a scoped name such as
 *   "@scope/pkg" is split on "/" into separate path segments.
 * @param {string} globalNpmRoot - Global node_modules directory.
 * @returns {string} Path to the package's dist/index.js file.
 */
function buildGlobalModuleEntry(moduleName, globalNpmRoot) {
  const packageSegments = moduleName.split("/");
  return import_node_path23.default.join(globalNpmRoot, ...packageSegments, "dist", "index.js");
}
10463
+ function findAccessiblePath(paths) {
10464
+ for (const candidate of paths) {
10183
10465
  try {
10184
- const pkg = import_node_path22.default.join(dir, "package.json");
10185
- (0, import_node_fs10.accessSync)(pkg);
10186
- return dir;
10466
+ (0, import_node_fs10.accessSync)(candidate);
10467
+ return candidate;
10187
10468
  } catch {
10188
- const parent = import_node_path22.default.dirname(dir);
10189
- if (parent === dir) break;
10190
- dir = parent;
10191
10469
  }
10192
10470
  }
10193
- return import_node_path22.default.dirname(thisFile);
10471
+ return void 0;
10194
10472
  }
10195
- async function doLoadSdkModules() {
10473
+ async function tryImportLocalSdkModules() {
10196
10474
  try {
10197
10475
  [piCodingAgentModule, piAiModule] = await Promise.all([
10198
10476
  import("@mariozechner/pi-coding-agent"),
10199
10477
  import("@mariozechner/pi-ai")
10200
10478
  ]);
10479
+ return true;
10201
10480
  } catch {
10202
- if (await promptInstall()) {
10203
- const installDir = findAgentvRoot();
10204
- console.error(`Installing @mariozechner/pi-coding-agent into ${installDir}...`);
10205
- (0, import_node_child_process5.execSync)("bun add @mariozechner/pi-coding-agent", {
10206
- cwd: installDir,
10207
- stdio: "inherit"
10208
- });
10209
- [piCodingAgentModule, piAiModule] = await Promise.all([
10210
- import("@mariozechner/pi-coding-agent"),
10211
- import("@mariozechner/pi-ai")
10212
- ]);
10213
- } else {
10214
- throw new Error(
10215
- "pi-coding-agent SDK is not installed. Install it with:\n bun add @mariozechner/pi-coding-agent"
10216
- );
10481
+ return false;
10482
+ }
10483
+ }
10484
/**
 * Tries to load the pi SDK packages from the managed install directory.
 *
 * pi-coding-agent is looked up in the managed node_modules folder. pi-ai is
 * looked up there first and, failing that, nested inside pi-coding-agent's own
 * node_modules. On success both modules are stored in the module-level
 * `piCodingAgentModule` / `piAiModule` variables.
 *
 * @returns {Promise<boolean>} True when both modules were imported; false when
 *   an entry file is missing or either import fails.
 */
async function tryImportManagedSdkModules() {
  const join = (...segments) => import_node_path23.default.join(...segments);
  const scopeDir = join(findManagedSdkInstallRoot(), "node_modules", "@mariozechner");
  const agentPackageDir = join(scopeDir, "pi-coding-agent");
  const piCodingAgentEntry = findAccessiblePath([join(agentPackageDir, "dist", "index.js")]);
  const piAiEntry = findAccessiblePath([
    join(scopeDir, "pi-ai", "dist", "index.js"),
    join(agentPackageDir, "node_modules", "@mariozechner", "pi-ai", "dist", "index.js")
  ]);
  if (!(piCodingAgentEntry && piAiEntry)) {
    return false;
  }
  try {
    // Import by file URL; both imports must succeed before anything is stored.
    const [agentModule, aiModule] = await Promise.all(
      [piCodingAgentEntry, piAiEntry].map(
        (entry) => import((0, import_node_url3.pathToFileURL)(entry).href)
      )
    );
    piCodingAgentModule = agentModule;
    piAiModule = aiModule;
    return true;
  } catch {
    return false;
  }
}
10514
/**
 * Tries to load the pi SDK packages from npm's global node_modules directory.
 *
 * pi-coding-agent is looked up directly under the global root. pi-ai is looked
 * up there first and, failing that, nested inside pi-coding-agent's own
 * node_modules. On success both modules are stored in the module-level
 * `piCodingAgentModule` / `piAiModule` variables.
 *
 * @returns {Promise<boolean>} True when both modules were imported; false when
 *   the global root is unknown, an entry file is missing, or an import fails.
 */
async function tryImportGlobalSdkModules() {
  const globalNpmRoot = resolveGlobalNpmRoot();
  if (!globalNpmRoot) {
    return false;
  }
  const nestedPiAiEntry = import_node_path23.default.join(
    globalNpmRoot,
    "@mariozechner",
    "pi-coding-agent",
    "node_modules",
    "@mariozechner",
    "pi-ai",
    "dist",
    "index.js"
  );
  const piCodingAgentEntry = findAccessiblePath([
    buildGlobalModuleEntry("@mariozechner/pi-coding-agent", globalNpmRoot)
  ]);
  const piAiEntry = findAccessiblePath([
    buildGlobalModuleEntry("@mariozechner/pi-ai", globalNpmRoot),
    nestedPiAiEntry
  ]);
  if (!(piCodingAgentEntry && piAiEntry)) {
    return false;
  }
  try {
    // Import by file URL; both imports must succeed before anything is stored.
    const [agentModule, aiModule] = await Promise.all(
      [piCodingAgentEntry, piAiEntry].map(
        (entry) => import((0, import_node_url3.pathToFileURL)(entry).href)
      )
    );
    piCodingAgentModule = agentModule;
    piAiModule = aiModule;
    return true;
  } catch {
    return false;
  }
}
10544
/**
 * Installs @mariozechner/pi-coding-agent into the given directory with npm.
 *
 * The directory is created (including parents) if needed, a progress line is
 * written to stderr, and npm runs there with inherited stdio. Errors thrown by
 * the directory creation or by the npm command propagate to the caller.
 *
 * @param {string} installDir - Directory to install the package into.
 * @returns {void}
 */
function installSdkModules(installDir) {
  const installCommand = "npm install --no-save --no-package-lock @mariozechner/pi-coding-agent";
  const execOptions = {
    cwd: installDir,
    stdio: "inherit"
  };
  console.error(`Installing @mariozechner/pi-coding-agent into ${installDir} via npm...`);
  (0, import_node_fs10.mkdirSync)(installDir, { recursive: true });
  (0, import_node_child_process5.execSync)(installCommand, execOptions);
}
10552
+ async function doLoadSdkModules() {
10553
+ if (await tryImportLocalSdkModules() || await tryImportManagedSdkModules() || await tryImportGlobalSdkModules()) {
10554
+ return;
10555
+ }
10556
+ if (await promptInstall()) {
10557
+ const installDir = findManagedSdkInstallRoot();
10558
+ installSdkModules(installDir);
10559
+ if (await tryImportManagedSdkModules()) {
10560
+ return;
10217
10561
  }
10218
10562
  }
10563
+ throw new Error(
10564
+ "pi-coding-agent SDK is not installed. Install it with:\n npm install @mariozechner/pi-coding-agent"
10565
+ );
10219
10566
  }
10220
10567
  async function loadSdkModules() {
10221
10568
  if (!piCodingAgentModule || !piAiModule) {
@@ -10272,12 +10619,16 @@ var PiCodingAgentProvider = class {
10272
10619
  try {
10273
10620
  const cwd = this.resolveCwd(request.cwd);
10274
10621
  const rawProvider = this.config.subprovider ?? "google";
10275
- const hasBaseUrl = !!this.config.baseUrl;
10622
+ const normalizedBaseUrl = this.normalizeSdkBaseUrl(rawProvider, this.config.baseUrl);
10623
+ const hasBaseUrl = !!normalizedBaseUrl;
10276
10624
  const providerName = resolveSubprovider(rawProvider, hasBaseUrl);
10277
10625
  const modelId = this.config.model ?? "gemini-2.5-flash";
10278
10626
  this.setApiKeyEnv(rawProvider, hasBaseUrl);
10279
- this.setBaseUrlEnv(rawProvider, hasBaseUrl);
10627
+ this.setBaseUrlEnv(rawProvider, normalizedBaseUrl, hasBaseUrl);
10280
10628
  let model = sdk.getModel(providerName, modelId);
10629
+ if (model && normalizedBaseUrl) {
10630
+ model = { ...model, baseUrl: normalizedBaseUrl };
10631
+ }
10281
10632
  if (!model) {
10282
10633
  const envProvider = providerName.replace(/-responses$/, "");
10283
10634
  model = {
@@ -10285,7 +10636,7 @@ var PiCodingAgentProvider = class {
10285
10636
  name: modelId,
10286
10637
  api: providerName,
10287
10638
  provider: envProvider,
10288
- baseUrl: this.config.baseUrl ?? "",
10639
+ baseUrl: normalizedBaseUrl ?? "",
10289
10640
  reasoning: false,
10290
10641
  input: ["text"],
10291
10642
  cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
@@ -10452,19 +10803,27 @@ ${fileList}`;
10452
10803
  }
10453
10804
  }
10454
10805
  /** Maps config baseUrl to the provider-specific env var the SDK reads. */
10455
- setBaseUrlEnv(providerName, hasBaseUrl = false) {
10456
- if (!this.config.baseUrl) return;
10806
+ setBaseUrlEnv(providerName, baseUrl = this.config.baseUrl, hasBaseUrl = false) {
10807
+ const normalizedBaseUrl = this.normalizeSdkBaseUrl(providerName, baseUrl);
10808
+ if (!normalizedBaseUrl) return;
10457
10809
  const envKey = resolveEnvBaseUrlName(providerName, hasBaseUrl);
10458
10810
  if (envKey) {
10459
- process.env[envKey] = this.config.baseUrl;
10811
+ process.env[envKey] = normalizedBaseUrl;
10812
+ }
10813
+ }
10814
+ normalizeSdkBaseUrl(providerName, baseUrl) {
10815
+ if (!baseUrl) return void 0;
10816
+ if (providerName.toLowerCase() === "azure") {
10817
+ return normalizeAzureSdkBaseUrl(baseUrl);
10460
10818
  }
10819
+ return baseUrl;
10461
10820
  }
10462
10821
  resolveCwd(cwdOverride) {
10463
10822
  if (cwdOverride) {
10464
- return import_node_path22.default.resolve(cwdOverride);
10823
+ return import_node_path23.default.resolve(cwdOverride);
10465
10824
  }
10466
10825
  if (this.config.cwd) {
10467
- return import_node_path22.default.resolve(this.config.cwd);
10826
+ return import_node_path23.default.resolve(this.config.cwd);
10468
10827
  }
10469
10828
  return process.cwd();
10470
10829
  }
@@ -10483,9 +10842,9 @@ ${fileList}`;
10483
10842
  }
10484
10843
  resolveLogDirectory() {
10485
10844
  if (this.config.logDir) {
10486
- return import_node_path22.default.resolve(this.config.logDir);
10845
+ return import_node_path23.default.resolve(this.config.logDir);
10487
10846
  }
10488
- return import_node_path22.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
10847
+ return import_node_path23.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
10489
10848
  }
10490
10849
  async createStreamLogger(request) {
10491
10850
  const logDir = this.resolveLogDirectory();
@@ -10499,7 +10858,7 @@ ${fileList}`;
10499
10858
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
10500
10859
  return void 0;
10501
10860
  }
10502
- const filePath = import_node_path22.default.join(logDir, buildLogFilename6(request, this.targetName));
10861
+ const filePath = import_node_path23.default.join(logDir, buildLogFilename6(request, this.targetName));
10503
10862
  try {
10504
10863
  const logger = await PiStreamLogger2.create({
10505
10864
  filePath,
@@ -10714,19 +11073,17 @@ var ProviderRegistry = class {
10714
11073
 
10715
11074
  // src/evaluation/providers/targets.ts
10716
11075
  init_cjs_shims();
10717
- var import_node_path23 = __toESM(require("path"), 1);
11076
+ var import_node_path24 = __toESM(require("path"), 1);
10718
11077
  var import_zod3 = require("zod");
10719
11078
  var CliHealthcheckHttpInputSchema = import_zod3.z.object({
10720
11079
  url: import_zod3.z.string().min(1, "healthcheck URL is required"),
10721
- timeout_seconds: import_zod3.z.number().positive().optional(),
10722
- timeoutSeconds: import_zod3.z.number().positive().optional()
10723
- });
11080
+ timeout_seconds: import_zod3.z.number().positive().optional()
11081
+ }).passthrough();
10724
11082
  var CliHealthcheckCommandInputSchema = import_zod3.z.object({
10725
11083
  command: import_zod3.z.string().min(1, "healthcheck command is required"),
10726
11084
  cwd: import_zod3.z.string().optional(),
10727
- timeout_seconds: import_zod3.z.number().positive().optional(),
10728
- timeoutSeconds: import_zod3.z.number().positive().optional()
10729
- });
11085
+ timeout_seconds: import_zod3.z.number().positive().optional()
11086
+ }).passthrough();
10730
11087
  var CliHealthcheckInputSchema = import_zod3.z.union([
10731
11088
  CliHealthcheckHttpInputSchema,
10732
11089
  CliHealthcheckCommandInputSchema
@@ -10738,36 +11095,28 @@ var CliTargetInputSchema = import_zod3.z.object({
10738
11095
  command: import_zod3.z.string(),
10739
11096
  // Files format - optional
10740
11097
  files_format: import_zod3.z.string().optional(),
10741
- filesFormat: import_zod3.z.string().optional(),
10742
11098
  attachments_format: import_zod3.z.string().optional(),
10743
- attachmentsFormat: import_zod3.z.string().optional(),
10744
11099
  // Working directory - optional
10745
11100
  cwd: import_zod3.z.string().optional(),
10746
11101
  // Workspace template directory - optional (mutually exclusive with cwd)
10747
11102
  workspace_template: import_zod3.z.string().optional(),
10748
- workspaceTemplate: import_zod3.z.string().optional(),
10749
11103
  // Timeout in seconds - optional
10750
11104
  timeout_seconds: import_zod3.z.number().positive().optional(),
10751
- timeoutSeconds: import_zod3.z.number().positive().optional(),
10752
11105
  // Healthcheck configuration - optional
10753
11106
  healthcheck: CliHealthcheckInputSchema.optional(),
10754
11107
  // Verbose mode - optional
10755
11108
  verbose: import_zod3.z.boolean().optional(),
10756
11109
  cli_verbose: import_zod3.z.boolean().optional(),
10757
- cliVerbose: import_zod3.z.boolean().optional(),
10758
11110
  // Keep temp files - optional
10759
11111
  keep_temp_files: import_zod3.z.boolean().optional(),
10760
- keepTempFiles: import_zod3.z.boolean().optional(),
10761
11112
  keep_output_files: import_zod3.z.boolean().optional(),
10762
- keepOutputFiles: import_zod3.z.boolean().optional(),
10763
11113
  // Common target fields
10764
11114
  grader_target: import_zod3.z.string().optional(),
10765
11115
  judge_target: import_zod3.z.string().optional(),
10766
11116
  // backward compat
10767
11117
  workers: import_zod3.z.number().int().min(1).optional(),
10768
- provider_batching: import_zod3.z.boolean().optional(),
10769
- providerBatching: import_zod3.z.boolean().optional()
10770
- });
11118
+ provider_batching: import_zod3.z.boolean().optional()
11119
+ }).passthrough();
10771
11120
  var CliHealthcheckHttpSchema = import_zod3.z.object({
10772
11121
  url: import_zod3.z.string().min(1),
10773
11122
  timeoutMs: import_zod3.z.number().positive().optional()
@@ -10792,7 +11141,7 @@ var CliTargetConfigSchema = import_zod3.z.object({
10792
11141
  keepTempFiles: import_zod3.z.boolean().optional()
10793
11142
  }).strict();
10794
11143
  function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
10795
- const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
11144
+ const timeoutSeconds = input.timeout_seconds;
10796
11145
  const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
10797
11146
  if ("url" in input && input.url) {
10798
11147
  const url = resolveString(input.url, env, `${targetName} healthcheck URL`);
@@ -10811,11 +11160,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
10811
11160
  allowLiteral: true,
10812
11161
  optionalEnv: true
10813
11162
  });
10814
- if (cwd && evalFilePath && !import_node_path23.default.isAbsolute(cwd)) {
10815
- cwd = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), cwd);
11163
+ if (cwd && evalFilePath && !import_node_path24.default.isAbsolute(cwd)) {
11164
+ cwd = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), cwd);
10816
11165
  }
10817
11166
  if (!cwd && evalFilePath) {
10818
- cwd = import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath));
11167
+ cwd = import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath));
10819
11168
  }
10820
11169
  return {
10821
11170
  command,
@@ -10826,9 +11175,9 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
10826
11175
  function normalizeCliTargetInput(input, env, evalFilePath) {
10827
11176
  const targetName = input.name;
10828
11177
  const command = resolveString(input.command, env, `${targetName} CLI command`, true);
10829
- const filesFormatSource = input.files_format ?? input.filesFormat ?? input.attachments_format ?? input.attachmentsFormat;
11178
+ const filesFormatSource = input.files_format ?? input.attachments_format;
10830
11179
  const filesFormat = resolveOptionalLiteralString(filesFormatSource);
10831
- const workspaceTemplateSource = input.workspace_template ?? input.workspaceTemplate;
11180
+ const workspaceTemplateSource = input.workspace_template;
10832
11181
  let workspaceTemplate = resolveOptionalString(
10833
11182
  workspaceTemplateSource,
10834
11183
  env,
@@ -10838,15 +11187,15 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
10838
11187
  optionalEnv: true
10839
11188
  }
10840
11189
  );
10841
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
10842
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
11190
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
11191
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
10843
11192
  }
10844
11193
  let cwd = resolveOptionalString(input.cwd, env, `${targetName} working directory`, {
10845
11194
  allowLiteral: true,
10846
11195
  optionalEnv: true
10847
11196
  });
10848
- if (cwd && evalFilePath && !import_node_path23.default.isAbsolute(cwd)) {
10849
- cwd = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), cwd);
11197
+ if (cwd && evalFilePath && !import_node_path24.default.isAbsolute(cwd)) {
11198
+ cwd = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), cwd);
10850
11199
  }
10851
11200
  if (cwd && workspaceTemplate) {
10852
11201
  throw new Error(
@@ -10854,14 +11203,12 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
10854
11203
  );
10855
11204
  }
10856
11205
  if (!cwd && !workspaceTemplate && evalFilePath) {
10857
- cwd = import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath));
11206
+ cwd = import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath));
10858
11207
  }
10859
- const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
11208
+ const timeoutSeconds = input.timeout_seconds;
10860
11209
  const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
10861
- const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose ?? input.cliVerbose);
10862
- const keepTempFiles = resolveOptionalBoolean(
10863
- input.keep_temp_files ?? input.keepTempFiles ?? input.keep_output_files ?? input.keepOutputFiles
10864
- );
11210
+ const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose);
11211
+ const keepTempFiles = resolveOptionalBoolean(input.keep_temp_files ?? input.keep_output_files);
10865
11212
  const healthcheck = input.healthcheck ? normalizeCliHealthcheck(input.healthcheck, env, targetName, evalFilePath) : void 0;
10866
11213
  return {
10867
11214
  command,
@@ -10882,14 +11229,104 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
10882
11229
  "FILES",
10883
11230
  "OUTPUT_FILE"
10884
11231
  ]);
11232
/**
 * camelCase target keys that are no longer accepted, mapped to the key that
 * the resulting error message tells the user to write instead.
 *
 * The replacement is usually the snake_case spelling, but a few aliases point
 * at a differently named key (`resourceName` -> `endpoint`,
 * `deploymentName` -> `model`, `maxTokens` -> `max_output_tokens`).
 *
 * Iteration order matters: the first matching entry is the one surfaced by
 * `assertNoDeprecatedCamelCaseTargetFields`, so new entries are appended.
 */
var DEPRECATED_TARGET_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
  ["providerBatching", "provider_batching"],
  ["subagentModeAllowed", "subagent_mode_allowed"],
  ["fallbackTargets", "fallback_targets"],
  ["resourceName", "endpoint"],
  ["baseUrl", "base_url"],
  ["apiKey", "api_key"],
  ["deploymentName", "model"],
  ["thinkingBudget", "thinking_budget"],
  ["maxTokens", "max_output_tokens"],
  ["apiFormat", "api_format"],
  ["timeoutSeconds", "timeout_seconds"],
  ["logDir", "log_dir"],
  ["logDirectory", "log_directory"],
  ["logFormat", "log_format"],
  ["logOutputFormat", "log_output_format"],
  ["systemPrompt", "system_prompt"],
  ["maxTurns", "max_turns"],
  ["maxBudgetUsd", "max_budget_usd"],
  ["dryRun", "dry_run"],
  ["subagentRoot", "subagent_root"],
  ["filesFormat", "files_format"],
  ["attachmentsFormat", "attachments_format"],
  ["cliUrl", "cli_url"],
  ["cliPath", "cli_path"],
  ["githubToken", "github_token"],
  ["sessionDir", "session_dir"],
  ["sessionId", "session_id"],
  ["sessionStateDir", "session_state_dir"],
  ["maxRetries", "max_retries"],
  ["retryInitialDelayMs", "retry_initial_delay_ms"],
  ["retryMaxDelayMs", "retry_max_delay_ms"],
  ["retryBackoffFactor", "retry_backoff_factor"],
  ["retryStatusCodes", "retry_status_codes"],
  // The CLI and pi resolvers read only the snake_case spelling of these keys.
  // The target schemas are passthrough, so without an entry here the
  // camelCase spelling would be accepted and then silently ignored.
  ["cliVerbose", "cli_verbose"],
  ["keepTempFiles", "keep_temp_files"],
  ["keepOutputFiles", "keep_output_files"],
  ["piModel", "pi_model"],
  ["piTools", "pi_tools"],
  ["piThinking", "pi_thinking"]
]);

/**
 * camelCase keys that are no longer accepted inside a target's `healthcheck`
 * object, mapped to their snake_case replacement.
 */
var DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
  ["timeoutSeconds", "timeout_seconds"]
]);
11270
/**
 * Lists every deprecated camelCase key that `value` defines as an own property.
 *
 * @param {unknown} value - Candidate object; anything that is not a non-null,
 *   non-array object yields no warnings.
 * @param {string} location - Prefix for each warning's `location`; the
 *   offending key is appended after a dot.
 * @param {Iterable<[string, string]>} aliases - Pairs of
 *   `[camelCaseKey, replacementKey]`, checked in iteration order.
 * @returns {{location: string, message: string}[]} One warning per matching
 *   alias, in alias order; empty when nothing matches.
 */
function collectDeprecatedCamelCaseWarnings(value, location, aliases) {
  const isRecord = value !== null && typeof value === "object" && !Array.isArray(value);
  if (!isRecord) {
    return [];
  }
  // Own-property check only: inherited keys are not reported.
  const definesKey = (key) => Object.prototype.hasOwnProperty.call(value, key);
  return Array.from(aliases)
    .filter(([legacyKey]) => definesKey(legacyKey))
    .map(([legacyKey, replacementKey]) => ({
      location: `${location}.${legacyKey}`,
      message: `camelCase field '${legacyKey}' is no longer supported in targets.yaml. Use '${replacementKey}' instead.`
    }));
}
11285
/**
 * Rejects a raw target definition that still uses a removed camelCase key.
 *
 * @param {unknown} definition - Raw target definition, inspected before any
 *   schema parsing. Values that are not objects are left for the caller's
 *   schema validation to reject, so they produce a validation error rather
 *   than a bare TypeError from the property check.
 * @returns {void}
 * @throws {Error} When `workspaceTemplate` is present (target-level workspace
 *   templates were removed entirely), or when the first deprecated camelCase
 *   key is found. The message names the key and its replacement.
 */
function assertNoDeprecatedCamelCaseTargetFields(definition) {
  if (typeof definition !== "object" || definition === null) {
    return;
  }
  // workspaceTemplate has no snake_case replacement, so it gets its own message.
  if (Object.prototype.hasOwnProperty.call(definition, "workspaceTemplate")) {
    throw new Error(
      `${definition.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
    );
  }
  const [warning] = findDeprecatedCamelCaseTargetWarnings(
    definition,
    `target "${definition.name}"`
  );
  if (warning) {
    // The warning message already names the field and its replacement, so it
    // is reused as-is instead of being parsed apart and reassembled.
    throw new Error(`${warning.location}: ${warning.message}`);
  }
}
11306
/**
 * Gathers deprecated camelCase warnings for a target and for its nested
 * `healthcheck` value.
 *
 * @param {unknown} target - Raw target definition.
 * @param {string} location - Prefix used in each warning's `location`;
 *   healthcheck warnings use `<location>.healthcheck`.
 * @returns {{location: string, message: string}[]} Target-level warnings
 *   followed by healthcheck warnings.
 */
function findDeprecatedCamelCaseTargetWarnings(target, location) {
  const collected = collectDeprecatedCamelCaseWarnings(
    target,
    location,
    DEPRECATED_TARGET_CAMEL_CASE_FIELDS
  );
  const targetIsRecord = target !== null && typeof target === "object" && !Array.isArray(target);
  if (targetIsRecord) {
    // Only a real object can carry a nested healthcheck worth inspecting.
    const healthcheckWarnings = collectDeprecatedCamelCaseWarnings(
      target.healthcheck,
      `${location}.healthcheck`,
      DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS
    );
    for (const entry of healthcheckWarnings) {
      collected.push(entry);
    }
  }
  return collected;
}
10885
11325
  var COMMON_TARGET_SETTINGS = [
10886
11326
  "use_target",
10887
11327
  "provider_batching",
10888
- "providerBatching",
10889
11328
  "subagent_mode_allowed",
10890
- "subagentModeAllowed",
10891
- "fallback_targets",
10892
- "fallbackTargets"
11329
+ "fallback_targets"
10893
11330
  ];
10894
11331
  var USE_TARGET_ENV_PATTERN = /^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i;
10895
11332
  var BASE_TARGET_SCHEMA = import_zod3.z.object({
@@ -10901,43 +11338,40 @@ var BASE_TARGET_SCHEMA = import_zod3.z.object({
10901
11338
  // backward compat
10902
11339
  workers: import_zod3.z.number().int().min(1).optional(),
10903
11340
  workspace_template: import_zod3.z.string().optional(),
10904
- workspaceTemplate: import_zod3.z.string().optional(),
10905
11341
  subagent_mode_allowed: import_zod3.z.boolean().optional(),
10906
- fallback_targets: import_zod3.z.array(import_zod3.z.string().min(1)).optional(),
10907
- fallbackTargets: import_zod3.z.array(import_zod3.z.string().min(1)).optional()
11342
+ fallback_targets: import_zod3.z.array(import_zod3.z.string().min(1)).optional()
10908
11343
  }).passthrough();
10909
11344
  var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
11345
+ var DEFAULT_AZURE_RESPONSES_API_VERSION = "v1";
10910
11346
  var DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
10911
/**
 * Cleans up a user-supplied Azure API version string.
 *
 * Surrounding whitespace and a leading `api-version=` / `api_version=` /
 * `apiversion=` prefix (case-insensitive) are stripped. When nothing usable
 * remains, the default for the given API format is returned.
 *
 * @param {string | undefined} value - Raw version setting, if any.
 * @param {string | undefined} apiFormat - `"responses"` selects
 *   DEFAULT_AZURE_RESPONSES_API_VERSION as the fallback; any other value
 *   selects DEFAULT_AZURE_API_VERSION.
 * @returns {string} The cleaned version, or the format-specific default.
 */
function normalizeAzureApiVersion(value, apiFormat) {
  let fallbackVersion;
  if (apiFormat === "responses") {
    fallbackVersion = DEFAULT_AZURE_RESPONSES_API_VERSION;
  } else {
    fallbackVersion = DEFAULT_AZURE_API_VERSION;
  }
  if (!value) {
    return fallbackVersion;
  }
  // A whitespace-only value collapses to "" here and falls through to the default.
  const candidate = value.trim().replace(/^api[-_]?version\s*=\s*/i, "").trim();
  return candidate === "" ? fallbackVersion : candidate;
}
10922
11359
  function resolveRetryConfig(target) {
10923
- const maxRetries = resolveOptionalNumber(
10924
- target.max_retries ?? target.maxRetries,
10925
- `${target.name} max retries`
10926
- );
11360
+ const maxRetries = resolveOptionalNumber(target.max_retries, `${target.name} max retries`);
10927
11361
  const initialDelayMs = resolveOptionalNumber(
10928
- target.retry_initial_delay_ms ?? target.retryInitialDelayMs,
11362
+ target.retry_initial_delay_ms,
10929
11363
  `${target.name} retry initial delay`
10930
11364
  );
10931
11365
  const maxDelayMs = resolveOptionalNumber(
10932
- target.retry_max_delay_ms ?? target.retryMaxDelayMs,
11366
+ target.retry_max_delay_ms,
10933
11367
  `${target.name} retry max delay`
10934
11368
  );
10935
11369
  const backoffFactor = resolveOptionalNumber(
10936
- target.retry_backoff_factor ?? target.retryBackoffFactor,
11370
+ target.retry_backoff_factor,
10937
11371
  `${target.name} retry backoff factor`
10938
11372
  );
10939
11373
  const retryableStatusCodes = resolveOptionalNumberArray(
10940
- target.retry_status_codes ?? target.retryStatusCodes,
11374
+ target.retry_status_codes,
10941
11375
  `${target.name} retry status codes`
10942
11376
  );
10943
11377
  if (maxRetries === void 0 && initialDelayMs === void 0 && maxDelayMs === void 0 && backoffFactor === void 0 && retryableStatusCodes === void 0) {
@@ -10997,9 +11431,10 @@ function resolveDelegatedTargetDefinition(name, definitions, env = process.env)
10997
11431
  `Target "${name}" exceeded the maximum use_target resolution depth (10). Check for a delegation loop or overly deep alias chain.`
10998
11432
  );
10999
11433
  }
11000
- function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
11434
+ function resolveTargetDefinition(definition, env = process.env, evalFilePath, options) {
11435
+ assertNoDeprecatedCamelCaseTargetFields(definition);
11001
11436
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
11002
- if (parsed.workspace_template !== void 0 || parsed.workspaceTemplate !== void 0) {
11437
+ if (parsed.workspace_template !== void 0) {
11003
11438
  throw new Error(
11004
11439
  `${parsed.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
11005
11440
  );
@@ -11015,13 +11450,9 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
11015
11450
  `${parsed.name} provider`,
11016
11451
  true
11017
11452
  ).toLowerCase();
11018
- const providerBatching = resolveOptionalBoolean(
11019
- parsed.provider_batching ?? parsed.providerBatching
11020
- );
11021
- const subagentModeAllowed = resolveOptionalBoolean(
11022
- parsed.subagent_mode_allowed ?? parsed.subagentModeAllowed
11023
- );
11024
- const fallbackTargets = parsed.fallback_targets ?? parsed.fallbackTargets;
11453
+ const providerBatching = resolveOptionalBoolean(parsed.provider_batching);
11454
+ const subagentModeAllowed = resolveOptionalBoolean(parsed.subagent_mode_allowed);
11455
+ const fallbackTargets = parsed.fallback_targets;
11025
11456
  const base = {
11026
11457
  name: parsed.name,
11027
11458
  graderTarget: parsed.grader_target ?? parsed.judge_target,
@@ -11171,20 +11602,22 @@ function normalizeOpenAIBaseUrl(value) {
11171
11602
  return trimmed.endsWith("/v1") ? trimmed : `${trimmed}/v1`;
11172
11603
  }
11173
11604
  function resolveAzureConfig(target, env) {
11174
- const endpointSource = target.endpoint ?? target.resource ?? target.resourceName;
11175
- const apiKeySource = target.api_key ?? target.apiKey;
11176
- const deploymentSource = target.deployment ?? target.deploymentName ?? target.model;
11605
+ const endpointSource = target.endpoint ?? target.resource;
11606
+ const apiKeySource = target.api_key;
11607
+ const deploymentSource = target.deployment ?? target.model;
11177
11608
  const versionSource = target.version ?? target.api_version;
11178
11609
  const temperatureSource = target.temperature;
11179
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
11610
+ const maxTokensSource = target.max_output_tokens;
11180
11611
  const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
11181
11612
  const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
11182
11613
  const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
11614
+ const apiFormat = resolveApiFormat(target, env, target.name);
11183
11615
  const version = normalizeAzureApiVersion(
11184
11616
  resolveOptionalString(versionSource, env, `${target.name} api version`, {
11185
11617
  allowLiteral: true,
11186
11618
  optionalEnv: true
11187
- })
11619
+ }),
11620
+ apiFormat
11188
11621
  );
11189
11622
  const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
11190
11623
  const maxOutputTokens = resolveOptionalNumber(
@@ -11197,13 +11630,17 @@ function resolveAzureConfig(target, env) {
11197
11630
  deploymentName,
11198
11631
  apiKey,
11199
11632
  version,
11633
+ apiFormat,
11200
11634
  temperature,
11201
11635
  maxOutputTokens,
11202
11636
  retry
11203
11637
  };
11204
11638
  }
11205
- function resolveApiFormat(target, targetName) {
11206
- const raw = target.api_format ?? target.apiFormat;
11639
+ function resolveApiFormat(target, env, targetName) {
11640
+ const raw = resolveOptionalString(target.api_format, env, `${targetName} api format`, {
11641
+ allowLiteral: true,
11642
+ optionalEnv: true
11643
+ });
11207
11644
  if (raw === void 0) return void 0;
11208
11645
  if (raw === "chat" || raw === "responses") return raw;
11209
11646
  throw new Error(
@@ -11211,11 +11648,11 @@ function resolveApiFormat(target, targetName) {
11211
11648
  );
11212
11649
  }
11213
11650
  function resolveOpenAIConfig(target, env) {
11214
- const endpointSource = target.endpoint ?? target.base_url ?? target.baseUrl;
11215
- const apiKeySource = target.api_key ?? target.apiKey;
11651
+ const endpointSource = target.endpoint ?? target.base_url;
11652
+ const apiKeySource = target.api_key;
11216
11653
  const modelSource = target.model ?? target.deployment ?? target.variant;
11217
11654
  const temperatureSource = target.temperature;
11218
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
11655
+ const maxTokensSource = target.max_output_tokens;
11219
11656
  const baseURL = normalizeOpenAIBaseUrl(
11220
11657
  resolveOptionalString(endpointSource, env, `${target.name} endpoint`, {
11221
11658
  allowLiteral: true,
@@ -11229,17 +11666,17 @@ function resolveOpenAIConfig(target, env) {
11229
11666
  baseURL,
11230
11667
  apiKey,
11231
11668
  model,
11232
- apiFormat: resolveApiFormat(target, target.name),
11669
+ apiFormat: resolveApiFormat(target, env, target.name),
11233
11670
  temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
11234
11671
  maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
11235
11672
  retry
11236
11673
  };
11237
11674
  }
11238
11675
  function resolveOpenRouterConfig(target, env) {
11239
- const apiKeySource = target.api_key ?? target.apiKey;
11676
+ const apiKeySource = target.api_key;
11240
11677
  const modelSource = target.model ?? target.deployment ?? target.variant;
11241
11678
  const temperatureSource = target.temperature;
11242
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
11679
+ const maxTokensSource = target.max_output_tokens;
11243
11680
  const retry = resolveRetryConfig(target);
11244
11681
  return {
11245
11682
  apiKey: resolveString(apiKeySource, env, `${target.name} OpenRouter api key`),
@@ -11250,11 +11687,11 @@ function resolveOpenRouterConfig(target, env) {
11250
11687
  };
11251
11688
  }
11252
11689
  function resolveAnthropicConfig(target, env) {
11253
- const apiKeySource = target.api_key ?? target.apiKey;
11690
+ const apiKeySource = target.api_key;
11254
11691
  const modelSource = target.model ?? target.deployment ?? target.variant;
11255
11692
  const temperatureSource = target.temperature;
11256
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
11257
- const thinkingBudgetSource = target.thinking_budget ?? target.thinkingBudget;
11693
+ const maxTokensSource = target.max_output_tokens;
11694
+ const thinkingBudgetSource = target.thinking_budget;
11258
11695
  const apiKey = resolveString(apiKeySource, env, `${target.name} Anthropic api key`);
11259
11696
  const model = resolveString(modelSource, env, `${target.name} Anthropic model`);
11260
11697
  const retry = resolveRetryConfig(target);
@@ -11268,10 +11705,10 @@ function resolveAnthropicConfig(target, env) {
11268
11705
  };
11269
11706
  }
11270
11707
  function resolveGeminiConfig(target, env) {
11271
- const apiKeySource = target.api_key ?? target.apiKey;
11708
+ const apiKeySource = target.api_key;
11272
11709
  const modelSource = target.model ?? target.deployment ?? target.variant;
11273
11710
  const temperatureSource = target.temperature;
11274
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
11711
+ const maxTokensSource = target.max_output_tokens;
11275
11712
  const apiKey = resolveString(apiKeySource, env, `${target.name} Google API key`);
11276
11713
  const model = resolveOptionalString(modelSource, env, `${target.name} Gemini model`, {
11277
11714
  allowLiteral: true,
@@ -11291,11 +11728,11 @@ function resolveCodexConfig(target, env, evalFilePath) {
11291
11728
  const executableSource = target.executable ?? target.command ?? target.binary;
11292
11729
  const argsSource = target.args ?? target.arguments;
11293
11730
  const cwdSource = target.cwd;
11294
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11295
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11296
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11297
- const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
11298
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
11731
+ const workspaceTemplateSource = target.workspace_template;
11732
+ const timeoutSource = target.timeout_seconds;
11733
+ const logDirSource = target.log_dir ?? target.log_directory;
11734
+ const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CODEX_LOG_FORMAT;
11735
+ const systemPromptSource = target.system_prompt;
11299
11736
  const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, {
11300
11737
  allowLiteral: true,
11301
11738
  optionalEnv: true
@@ -11318,8 +11755,8 @@ function resolveCodexConfig(target, env, evalFilePath) {
11318
11755
  optionalEnv: true
11319
11756
  }
11320
11757
  );
11321
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11322
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
11758
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
11759
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11323
11760
  }
11324
11761
  if (cwd && workspaceTemplate) {
11325
11762
  throw new Error(
@@ -11359,16 +11796,16 @@ function normalizeCodexLogFormat(value) {
11359
11796
  throw new Error("codex log format must be 'summary' or 'json'");
11360
11797
  }
11361
11798
  function resolveCopilotSdkConfig(target, env, evalFilePath) {
11362
- const cliUrlSource = target.cli_url ?? target.cliUrl;
11363
- const cliPathSource = target.cli_path ?? target.cliPath;
11364
- const githubTokenSource = target.github_token ?? target.githubToken;
11799
+ const cliUrlSource = target.cli_url;
11800
+ const cliPathSource = target.cli_path;
11801
+ const githubTokenSource = target.github_token;
11365
11802
  const modelSource = target.model;
11366
11803
  const cwdSource = target.cwd;
11367
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11368
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11369
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11370
- const logFormatSource = target.log_format ?? target.logFormat;
11371
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
11804
+ const workspaceTemplateSource = target.workspace_template;
11805
+ const timeoutSource = target.timeout_seconds;
11806
+ const logDirSource = target.log_dir ?? target.log_directory;
11807
+ const logFormatSource = target.log_format;
11808
+ const systemPromptSource = target.system_prompt;
11372
11809
  const cliUrl = resolveOptionalString(cliUrlSource, env, `${target.name} copilot-sdk cli URL`, {
11373
11810
  allowLiteral: true,
11374
11811
  optionalEnv: true
@@ -11403,8 +11840,8 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
11403
11840
  optionalEnv: true
11404
11841
  }
11405
11842
  );
11406
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11407
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
11843
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
11844
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11408
11845
  }
11409
11846
  if (cwd && workspaceTemplate) {
11410
11847
  throw new Error(
@@ -11423,6 +11860,52 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
11423
11860
  );
11424
11861
  const logFormat = normalizeCopilotLogFormat(logFormatSource);
11425
11862
  const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
11863
+ const byok = target.byok;
11864
+ let byokType;
11865
+ let byokBaseUrl;
11866
+ let byokApiKey;
11867
+ let byokBearerToken;
11868
+ let byokApiVersion;
11869
+ let byokWireApi;
11870
+ if (byok && typeof byok === "object") {
11871
+ byokType = resolveOptionalString(byok.type, env, `${target.name} byok type`, {
11872
+ allowLiteral: true,
11873
+ optionalEnv: true
11874
+ });
11875
+ byokBaseUrl = resolveOptionalString(byok.base_url, env, `${target.name} byok base URL`, {
11876
+ allowLiteral: true,
11877
+ optionalEnv: true
11878
+ });
11879
+ byokApiKey = resolveOptionalString(byok.api_key, env, `${target.name} byok API key`, {
11880
+ allowLiteral: false,
11881
+ optionalEnv: true
11882
+ });
11883
+ byokBearerToken = resolveOptionalString(
11884
+ byok.bearer_token,
11885
+ env,
11886
+ `${target.name} byok bearer token`,
11887
+ {
11888
+ allowLiteral: false,
11889
+ optionalEnv: true
11890
+ }
11891
+ );
11892
+ byokApiVersion = resolveOptionalString(
11893
+ byok.api_version,
11894
+ env,
11895
+ `${target.name} byok API version`,
11896
+ {
11897
+ allowLiteral: true,
11898
+ optionalEnv: true
11899
+ }
11900
+ );
11901
+ byokWireApi = resolveOptionalString(byok.wire_api, env, `${target.name} byok wire API`, {
11902
+ allowLiteral: true,
11903
+ optionalEnv: true
11904
+ });
11905
+ if (!byokBaseUrl) {
11906
+ throw new Error(`${target.name}: 'byok.base_url' is required when 'byok' is specified`);
11907
+ }
11908
+ }
11426
11909
  return {
11427
11910
  cliUrl,
11428
11911
  cliPath,
@@ -11433,7 +11916,13 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
11433
11916
  timeoutMs,
11434
11917
  logDir,
11435
11918
  logFormat,
11436
- systemPrompt
11919
+ systemPrompt,
11920
+ byokType,
11921
+ byokBaseUrl,
11922
+ byokApiKey,
11923
+ byokBearerToken,
11924
+ byokApiVersion,
11925
+ byokWireApi
11437
11926
  };
11438
11927
  }
11439
11928
  function resolveCopilotCliConfig(target, env, evalFilePath) {
@@ -11441,11 +11930,11 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
11441
11930
  const modelSource = target.model;
11442
11931
  const argsSource = target.args ?? target.arguments;
11443
11932
  const cwdSource = target.cwd;
11444
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11445
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11446
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11447
- const logFormatSource = target.log_format ?? target.logFormat;
11448
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
11933
+ const workspaceTemplateSource = target.workspace_template;
11934
+ const timeoutSource = target.timeout_seconds;
11935
+ const logDirSource = target.log_dir ?? target.log_directory;
11936
+ const logFormatSource = target.log_format;
11937
+ const systemPromptSource = target.system_prompt;
11449
11938
  const executable = resolveOptionalString(executableSource, env, `${target.name} copilot-cli executable`, {
11450
11939
  allowLiteral: true,
11451
11940
  optionalEnv: true
@@ -11468,8 +11957,8 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
11468
11957
  optionalEnv: true
11469
11958
  }
11470
11959
  );
11471
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11472
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
11960
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
11961
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11473
11962
  }
11474
11963
  if (cwd && workspaceTemplate) {
11475
11964
  throw new Error(
@@ -11509,16 +11998,16 @@ function normalizeCopilotLogFormat(value) {
11509
11998
  }
11510
11999
  function resolvePiCodingAgentConfig(target, env, evalFilePath) {
11511
12000
  const subproviderSource = target.subprovider;
11512
- const modelSource = target.model ?? target.pi_model ?? target.piModel;
11513
- const apiKeySource = target.api_key ?? target.apiKey;
11514
- const toolsSource = target.tools ?? target.pi_tools ?? target.piTools;
11515
- const thinkingSource = target.thinking ?? target.pi_thinking ?? target.piThinking;
12001
+ const modelSource = target.model ?? target.pi_model;
12002
+ const apiKeySource = target.api_key;
12003
+ const toolsSource = target.tools ?? target.pi_tools;
12004
+ const thinkingSource = target.thinking ?? target.pi_thinking;
11516
12005
  const cwdSource = target.cwd;
11517
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11518
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11519
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11520
- const logFormatSource = target.log_format ?? target.logFormat;
11521
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
12006
+ const workspaceTemplateSource = target.workspace_template;
12007
+ const timeoutSource = target.timeout_seconds;
12008
+ const logDirSource = target.log_dir ?? target.log_directory;
12009
+ const logFormatSource = target.log_format;
12010
+ const systemPromptSource = target.system_prompt;
11522
12011
  const subprovider = resolveOptionalString(
11523
12012
  subproviderSource,
11524
12013
  env,
@@ -11536,7 +12025,7 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
11536
12025
  allowLiteral: false,
11537
12026
  optionalEnv: true
11538
12027
  });
11539
- const baseUrlSource = target.base_url ?? target.baseUrl ?? target.endpoint;
12028
+ const baseUrlSource = target.base_url ?? target.endpoint;
11540
12029
  const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} pi base url`, {
11541
12030
  allowLiteral: true,
11542
12031
  optionalEnv: true
@@ -11562,8 +12051,8 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
11562
12051
  optionalEnv: true
11563
12052
  }
11564
12053
  );
11565
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11566
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
12054
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
12055
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11567
12056
  }
11568
12057
  if (cwd && workspaceTemplate) {
11569
12058
  throw new Error(
@@ -11595,16 +12084,16 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
11595
12084
  function resolvePiCliConfig(target, env, evalFilePath) {
11596
12085
  const executableSource = target.executable ?? target.command ?? target.binary;
11597
12086
  const subproviderSource = target.subprovider;
11598
- const modelSource = target.model ?? target.pi_model ?? target.piModel;
11599
- const apiKeySource = target.api_key ?? target.apiKey;
11600
- const toolsSource = target.tools ?? target.pi_tools ?? target.piTools;
11601
- const thinkingSource = target.thinking ?? target.pi_thinking ?? target.piThinking;
12087
+ const modelSource = target.model ?? target.pi_model;
12088
+ const apiKeySource = target.api_key;
12089
+ const toolsSource = target.tools ?? target.pi_tools;
12090
+ const thinkingSource = target.thinking ?? target.pi_thinking;
11602
12091
  const cwdSource = target.cwd;
11603
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11604
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11605
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11606
- const logFormatSource = target.log_format ?? target.logFormat;
11607
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
12092
+ const workspaceTemplateSource = target.workspace_template;
12093
+ const timeoutSource = target.timeout_seconds;
12094
+ const logDirSource = target.log_dir ?? target.log_directory;
12095
+ const logFormatSource = target.log_format;
12096
+ const systemPromptSource = target.system_prompt;
11608
12097
  const executable = resolveOptionalString(executableSource, env, `${target.name} pi-cli executable`, {
11609
12098
  allowLiteral: true,
11610
12099
  optionalEnv: true
@@ -11623,7 +12112,7 @@ function resolvePiCliConfig(target, env, evalFilePath) {
11623
12112
  allowLiteral: false,
11624
12113
  optionalEnv: true
11625
12114
  });
11626
- const baseUrlSource = target.base_url ?? target.baseUrl ?? target.endpoint;
12115
+ const baseUrlSource = target.base_url ?? target.endpoint;
11627
12116
  const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} pi-cli base url`, {
11628
12117
  allowLiteral: true,
11629
12118
  optionalEnv: true
@@ -11648,8 +12137,8 @@ function resolvePiCliConfig(target, env, evalFilePath) {
11648
12137
  `${target.name} pi-cli workspace template`,
11649
12138
  { allowLiteral: true, optionalEnv: true }
11650
12139
  );
11651
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11652
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
12140
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
12141
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11653
12142
  }
11654
12143
  if (cwd && workspaceTemplate) {
11655
12144
  throw new Error(`${target.name}: 'cwd' and 'workspace_template' are mutually exclusive.`);
@@ -11681,11 +12170,11 @@ function resolvePiCliConfig(target, env, evalFilePath) {
11681
12170
  function resolveClaudeConfig(target, env, evalFilePath) {
11682
12171
  const modelSource = target.model;
11683
12172
  const cwdSource = target.cwd;
11684
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11685
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11686
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11687
- const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CLAUDE_LOG_FORMAT;
11688
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
12173
+ const workspaceTemplateSource = target.workspace_template;
12174
+ const timeoutSource = target.timeout_seconds;
12175
+ const logDirSource = target.log_dir ?? target.log_directory;
12176
+ const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CLAUDE_LOG_FORMAT;
12177
+ const systemPromptSource = target.system_prompt;
11689
12178
  const model = resolveOptionalString(modelSource, env, `${target.name} claude model`, {
11690
12179
  allowLiteral: true,
11691
12180
  optionalEnv: true
@@ -11703,8 +12192,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
11703
12192
  optionalEnv: true
11704
12193
  }
11705
12194
  );
11706
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11707
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
12195
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
12196
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11708
12197
  }
11709
12198
  if (cwd && workspaceTemplate) {
11710
12199
  throw new Error(
@@ -11718,8 +12207,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
11718
12207
  });
11719
12208
  const logFormat = normalizeClaudeLogFormat(logFormatSource);
11720
12209
  const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
11721
- const maxTurns = typeof target.max_turns === "number" ? target.max_turns : typeof target.maxTurns === "number" ? target.maxTurns : void 0;
11722
- const maxBudgetUsd = typeof target.max_budget_usd === "number" ? target.max_budget_usd : typeof target.maxBudgetUsd === "number" ? target.maxBudgetUsd : void 0;
12210
+ const maxTurns = typeof target.max_turns === "number" ? target.max_turns : void 0;
12211
+ const maxBudgetUsd = typeof target.max_budget_usd === "number" ? target.max_budget_usd : void 0;
11723
12212
  return {
11724
12213
  model,
11725
12214
  systemPrompt,
@@ -11750,9 +12239,7 @@ function resolveMockConfig(target) {
11750
12239
  return { response };
11751
12240
  }
11752
12241
  function resolveVSCodeConfig(target, env, insiders, evalFilePath) {
11753
- const workspaceTemplateEnvVar = resolveOptionalLiteralString(
11754
- target.workspace_template ?? target.workspaceTemplate
11755
- );
12242
+ const workspaceTemplateEnvVar = resolveOptionalLiteralString(target.workspace_template);
11756
12243
  let workspaceTemplate = workspaceTemplateEnvVar ? resolveOptionalString(
11757
12244
  workspaceTemplateEnvVar,
11758
12245
  env,
@@ -11762,14 +12249,14 @@ function resolveVSCodeConfig(target, env, insiders, evalFilePath) {
11762
12249
  optionalEnv: true
11763
12250
  }
11764
12251
  ) : void 0;
11765
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11766
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
12252
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
12253
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11767
12254
  }
11768
12255
  const executableSource = target.executable;
11769
12256
  const waitSource = target.wait;
11770
- const dryRunSource = target.dry_run ?? target.dryRun;
11771
- const subagentRootSource = target.subagent_root ?? target.subagentRoot;
11772
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
12257
+ const dryRunSource = target.dry_run;
12258
+ const subagentRootSource = target.subagent_root;
12259
+ const timeoutSource = target.timeout_seconds;
11773
12260
  const defaultCommand = insiders ? "code-insiders" : "code";
11774
12261
  const executable = resolveOptionalString(executableSource, env, `${target.name} vscode executable`, {
11775
12262
  allowLiteral: true,
@@ -11804,8 +12291,8 @@ function resolveCliConfig(target, env, evalFilePath) {
11804
12291
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
11805
12292
  if (!parseResult.success) {
11806
12293
  const firstError = parseResult.error.errors[0];
11807
- const path52 = firstError?.path.join(".") || "";
11808
- const prefix = path52 ? `${target.name} ${path52}: ` : `${target.name}: `;
12294
+ const path53 = firstError?.path.join(".") || "";
12295
+ const prefix = path53 ? `${target.name} ${path53}: ` : `${target.name}: `;
11809
12296
  throw new Error(`${prefix}${firstError?.message}`);
11810
12297
  }
11811
12298
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -11820,17 +12307,17 @@ function resolveCliConfig(target, env, evalFilePath) {
11820
12307
  }
11821
12308
  function resolveDiscoveredProviderConfig(target, providerKind, env, evalFilePath) {
11822
12309
  const command = target.command ? resolveString(target.command, env, `${target.name} command`, true) : `bun run .agentv/providers/${providerKind}.ts {PROMPT}`;
11823
- const timeoutSeconds = target.timeout_seconds ?? target.timeoutSeconds;
12310
+ const timeoutSeconds = target.timeout_seconds;
11824
12311
  const timeoutMs = resolveTimeoutMs(timeoutSeconds, `${target.name} timeout`);
11825
12312
  let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
11826
12313
  allowLiteral: true,
11827
12314
  optionalEnv: true
11828
12315
  });
11829
- if (cwd && evalFilePath && !import_node_path23.default.isAbsolute(cwd)) {
11830
- cwd = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), cwd);
12316
+ if (cwd && evalFilePath && !import_node_path24.default.isAbsolute(cwd)) {
12317
+ cwd = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), cwd);
11831
12318
  }
11832
12319
  if (!cwd && evalFilePath) {
11833
- cwd = import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath));
12320
+ cwd = import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath));
11834
12321
  }
11835
12322
  return {
11836
12323
  command,
@@ -11884,10 +12371,10 @@ function resolveDiscover(value, targetName) {
11884
12371
  throw new Error(`Target "${targetName}": discover must be "latest" (got "${String(value)}")`);
11885
12372
  }
11886
12373
  function resolveCopilotLogConfig(target, env) {
11887
- const sessionDirSource = target.session_dir ?? target.sessionDir;
11888
- const sessionIdSource = target.session_id ?? target.sessionId;
12374
+ const sessionDirSource = target.session_dir;
12375
+ const sessionIdSource = target.session_id;
11889
12376
  const discoverSource = target.discover;
11890
- const sessionStateDirSource = target.session_state_dir ?? target.sessionStateDir;
12377
+ const sessionStateDirSource = target.session_state_dir;
11891
12378
  const cwdSource = target.cwd;
11892
12379
  return {
11893
12380
  sessionDir: resolveOptionalString(
@@ -12068,7 +12555,7 @@ var import_node_path33 = __toESM(require("path"), 1);
12068
12555
  init_cjs_shims();
12069
12556
  var import_node_fs11 = require("fs");
12070
12557
  var import_promises20 = require("fs/promises");
12071
- var import_node_path24 = __toESM(require("path"), 1);
12558
+ var import_node_path25 = __toESM(require("path"), 1);
12072
12559
  async function pathExists(target) {
12073
12560
  try {
12074
12561
  await (0, import_promises20.access)(target, import_node_fs11.constants.F_OK);
@@ -12084,7 +12571,7 @@ async function readDirEntries(target) {
12084
12571
  const entries = await (0, import_promises20.readdir)(target, { withFileTypes: true });
12085
12572
  return entries.map((entry) => ({
12086
12573
  name: entry.name,
12087
- absolutePath: import_node_path24.default.join(target, entry.name),
12574
+ absolutePath: import_node_path25.default.join(target, entry.name),
12088
12575
  isDirectory: entry.isDirectory()
12089
12576
  }));
12090
12577
  }
@@ -12100,9 +12587,9 @@ async function removeIfExists(target) {
12100
12587
 
12101
12588
  // src/evaluation/providers/vscode/utils/path.ts
12102
12589
  init_cjs_shims();
12103
- var import_node_path25 = __toESM(require("path"), 1);
12590
+ var import_node_path26 = __toESM(require("path"), 1);
12104
12591
  function pathToFileUri2(filePath) {
12105
- const absolutePath = import_node_path25.default.isAbsolute(filePath) ? filePath : import_node_path25.default.resolve(filePath);
12592
+ const absolutePath = import_node_path26.default.isAbsolute(filePath) ? filePath : import_node_path26.default.resolve(filePath);
12106
12593
  const normalizedPath = absolutePath.replace(/\\/g, "/");
12107
12594
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
12108
12595
  return `file:///${normalizedPath}`;
@@ -12112,7 +12599,7 @@ function pathToFileUri2(filePath) {
12112
12599
 
12113
12600
  // src/evaluation/providers/vscode/dispatch/promptBuilder.ts
12114
12601
  init_cjs_shims();
12115
- var import_node_path26 = __toESM(require("path"), 1);
12602
+ var import_node_path27 = __toESM(require("path"), 1);
12116
12603
 
12117
12604
  // src/evaluation/providers/vscode/utils/template.ts
12118
12605
  init_cjs_shims();
@@ -12206,8 +12693,8 @@ function createBatchRequestPrompt(userQuery, responseFileTmp, responseFileFinal,
12206
12693
  });
12207
12694
  }
12208
12695
  function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateContent) {
12209
- const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${import_node_path26.default.basename(file)}`).join("\n");
12210
- const responseList = responseFiles.map((file) => `"${import_node_path26.default.basename(file)}"`).join(", ");
12696
+ const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${import_node_path27.default.basename(file)}`).join("\n");
12697
+ const responseList = responseFiles.map((file) => `"${import_node_path27.default.basename(file)}"`).join(", ");
12211
12698
  return renderTemplate2(templateContent, {
12212
12699
  requestFiles: requestLines,
12213
12700
  responseList
@@ -12217,7 +12704,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
12217
12704
  // src/evaluation/providers/vscode/dispatch/responseWaiter.ts
12218
12705
  init_cjs_shims();
12219
12706
  var import_promises21 = require("fs/promises");
12220
- var import_node_path27 = __toESM(require("path"), 1);
12707
+ var import_node_path28 = __toESM(require("path"), 1);
12221
12708
 
12222
12709
  // src/evaluation/providers/vscode/utils/time.ts
12223
12710
  init_cjs_shims();
@@ -12277,7 +12764,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
12277
12764
  }
12278
12765
  async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, silent = false, timeoutMs = DEFAULT_TIMEOUT_MS) {
12279
12766
  if (!silent) {
12280
- const fileList = responseFilesFinal.map((file) => import_node_path27.default.basename(file)).join(", ");
12767
+ const fileList = responseFilesFinal.map((file) => import_node_path28.default.basename(file)).join(", ");
12281
12768
  console.error(`waiting for ${responseFilesFinal.length} batch response(s): ${fileList}`);
12282
12769
  }
12283
12770
  const deadline = Date.now() + timeoutMs;
@@ -12286,7 +12773,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
12286
12773
  while (pending.size > 0) {
12287
12774
  if (Date.now() >= deadline) {
12288
12775
  if (!silent) {
12289
- const remaining = [...pending].map((f) => import_node_path27.default.basename(f)).join(", ");
12776
+ const remaining = [...pending].map((f) => import_node_path28.default.basename(f)).join(", ");
12290
12777
  console.error(
12291
12778
  `error: timed out after ${Math.round(timeoutMs / 1e3)}s waiting for batch responses. Still pending: ${remaining}`
12292
12779
  );
@@ -12344,37 +12831,6 @@ var import_node_util2 = require("util");
12344
12831
  // src/evaluation/providers/vscode/dispatch/constants.ts
12345
12832
  init_cjs_shims();
12346
12833
  var import_node_path29 = __toESM(require("path"), 1);
12347
-
12348
- // src/paths.ts
12349
- init_cjs_shims();
12350
- var import_node_os6 = __toESM(require("os"), 1);
12351
- var import_node_path28 = __toESM(require("path"), 1);
12352
- var logged = false;
12353
- function getAgentvHome() {
12354
- const envHome = process.env.AGENTV_HOME;
12355
- if (envHome && envHome !== "undefined") {
12356
- if (!logged) {
12357
- logged = true;
12358
- console.warn(`Using AGENTV_HOME: ${envHome}`);
12359
- }
12360
- return envHome;
12361
- }
12362
- return import_node_path28.default.join(import_node_os6.default.homedir(), ".agentv");
12363
- }
12364
- function getWorkspacesRoot() {
12365
- return import_node_path28.default.join(getAgentvHome(), "workspaces");
12366
- }
12367
- function getSubagentsRoot() {
12368
- return import_node_path28.default.join(getAgentvHome(), "subagents");
12369
- }
12370
- function getTraceStateRoot() {
12371
- return import_node_path28.default.join(getAgentvHome(), "trace-state");
12372
- }
12373
- function getWorkspacePoolRoot() {
12374
- return import_node_path28.default.join(getAgentvHome(), "workspace-pool");
12375
- }
12376
-
12377
- // src/evaluation/providers/vscode/dispatch/constants.ts
12378
12834
  var DEFAULT_LOCK_NAME = "subagent.lock";
12379
12835
  var DEFAULT_ALIVE_FILENAME = ".alive";
12380
12836
  function getDefaultSubagentRoot(vscodeCmd = "code") {
@@ -13527,6 +13983,15 @@ var AGENT_PROVIDER_KINDS = [
13527
13983
  "vscode",
13528
13984
  "vscode-insiders"
13529
13985
  ];
13986
+ var LLM_GRADER_CAPABLE_KINDS = [
13987
+ "openai",
13988
+ "openrouter",
13989
+ "azure",
13990
+ "anthropic",
13991
+ "gemini",
13992
+ "agentv",
13993
+ "mock"
13994
+ ];
13530
13995
  function extractLastAssistantContent(messages) {
13531
13996
  if (!messages || messages.length === 0) {
13532
13997
  return "";
@@ -13680,9 +14145,10 @@ init_cjs_shims();
13680
14145
 
13681
14146
  // src/evaluation/evaluators/scoring.ts
13682
14147
  init_cjs_shims();
13683
- var PASS_THRESHOLD = 0.8;
13684
- function scoreToVerdict(score) {
13685
- return score >= PASS_THRESHOLD ? "pass" : "fail";
14148
+ var DEFAULT_THRESHOLD = 0.8;
14149
+ var PASS_THRESHOLD = DEFAULT_THRESHOLD;
14150
+ function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
14151
+ return score >= threshold ? "pass" : "fail";
13686
14152
  }
13687
14153
  function clampScore(value) {
13688
14154
  if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -13873,13 +14339,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
13873
14339
  async function execShellWithStdin(command, stdinPayload, options = {}) {
13874
14340
  const { mkdir: mkdir17, readFile: readFile17, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
13875
14341
  const { tmpdir: tmpdir3 } = await import("os");
13876
- const path52 = await import("path");
14342
+ const path53 = await import("path");
13877
14343
  const { randomUUID: randomUUID10 } = await import("crypto");
13878
- const dir = path52.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
14344
+ const dir = path53.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
13879
14345
  await mkdir17(dir, { recursive: true });
13880
- const stdinPath = path52.join(dir, "stdin.txt");
13881
- const stdoutPath = path52.join(dir, "stdout.txt");
13882
- const stderrPath = path52.join(dir, "stderr.txt");
14346
+ const stdinPath = path53.join(dir, "stdin.txt");
14347
+ const stdoutPath = path53.join(dir, "stdout.txt");
14348
+ const stderrPath = path53.join(dir, "stderr.txt");
13883
14349
  await writeFile9(stdinPath, stdinPayload, "utf8");
13884
14350
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
13885
14351
  const { spawn: spawn5 } = await import("child_process");
@@ -15081,7 +15547,7 @@ ${outputSchema}`;
15081
15547
  parts.push("[[ ## scoring_criteria ## ]]");
15082
15548
  for (const rubric of rubrics) {
15083
15549
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
15084
- const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
15550
+ const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
15085
15551
  parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
15086
15552
  if (rubric.outcome) {
15087
15553
  parts.push(`Description: ${rubric.outcome}`);
@@ -15135,54 +15601,106 @@ ${outputSchema}`;
15135
15601
  async runWithRetry(options) {
15136
15602
  const { context: context2, graderProvider, systemPrompt, userPrompt, schema, images } = options;
15137
15603
  let lastError;
15604
+ let lastInvalidResponse;
15605
+ let shouldAttemptStructureFix = false;
15138
15606
  for (let attempt = 1; attempt <= 3; attempt++) {
15139
15607
  try {
15140
- const model = graderProvider.asLanguageModel?.();
15141
- if (model) {
15142
- const modelOptions = {
15143
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
15144
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
15145
- };
15146
- const hasImages = images && images.length > 0;
15147
- const result = hasImages ? await (0, import_ai2.generateText)({
15148
- model,
15149
- system: systemPrompt,
15150
- messages: [
15151
- {
15152
- role: "user",
15153
- content: [
15154
- { type: "text", text: userPrompt },
15155
- ...toAiSdkImageParts(images)
15156
- ]
15157
- }
15158
- ],
15159
- ...modelOptions
15160
- }) : await (0, import_ai2.generateText)({
15161
- model,
15162
- system: systemPrompt,
15163
- prompt: userPrompt,
15164
- ...modelOptions
15165
- });
15166
- const data2 = schema.parse(parseJsonFromText(result.text));
15167
- const rawUsage = result.usage;
15168
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
15169
- return { data: data2, tokenUsage };
15608
+ const result = await this.generateStructuredResponse({
15609
+ context: context2,
15610
+ graderProvider,
15611
+ systemPrompt,
15612
+ userPrompt,
15613
+ images
15614
+ });
15615
+ const canRepairResponse = result.text.trim().length > 0;
15616
+ lastInvalidResponse = canRepairResponse ? result : void 0;
15617
+ let data;
15618
+ try {
15619
+ data = schema.parse(parseJsonFromText(result.text));
15620
+ } catch (e) {
15621
+ lastError = e instanceof Error ? e : new Error(String(e));
15622
+ shouldAttemptStructureFix = canRepairResponse;
15623
+ continue;
15170
15624
  }
15171
- const response = await graderProvider.invoke({
15172
- question: userPrompt,
15625
+ return {
15626
+ data,
15627
+ providerResponse: result.providerResponse,
15628
+ tokenUsage: result.tokenUsage
15629
+ };
15630
+ } catch (e) {
15631
+ lastError = e instanceof Error ? e : new Error(String(e));
15632
+ }
15633
+ }
15634
+ if (shouldAttemptStructureFix && lastInvalidResponse) {
15635
+ try {
15636
+ const repaired = await this.generateStructuredResponse({
15637
+ context: context2,
15638
+ graderProvider,
15173
15639
  systemPrompt,
15174
- evalCaseId: context2.evalCase.id,
15175
- attempt: context2.attempt,
15176
- maxOutputTokens: this.maxOutputTokens,
15177
- temperature: this.temperature
15640
+ userPrompt: buildStructureRepairPrompt({
15641
+ validationError: lastError?.message ?? "Schema validation failed",
15642
+ invalidResponse: lastInvalidResponse.text
15643
+ })
15178
15644
  });
15179
- const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
15180
- return { data, providerResponse: response, tokenUsage: response.tokenUsage };
15645
+ const data = schema.parse(parseJsonFromText(repaired.text));
15646
+ return {
15647
+ data,
15648
+ providerResponse: repaired.providerResponse,
15649
+ tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage)
15650
+ };
15181
15651
  } catch (e) {
15182
15652
  lastError = e instanceof Error ? e : new Error(String(e));
15183
15653
  }
15184
15654
  }
15185
- throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
15655
+ throw new Error(
15656
+ `Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`
15657
+ );
15658
+ }
15659
+ async generateStructuredResponse(options) {
15660
+ const { context: context2, graderProvider, systemPrompt, userPrompt, images } = options;
15661
+ const model = graderProvider.asLanguageModel?.();
15662
+ if (model) {
15663
+ const modelOptions = {
15664
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
15665
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
15666
+ };
15667
+ const hasImages = images && images.length > 0;
15668
+ const result = hasImages ? await (0, import_ai2.generateText)({
15669
+ model,
15670
+ system: systemPrompt,
15671
+ messages: [
15672
+ {
15673
+ role: "user",
15674
+ content: [
15675
+ { type: "text", text: userPrompt },
15676
+ ...toAiSdkImageParts(images)
15677
+ ]
15678
+ }
15679
+ ],
15680
+ ...modelOptions
15681
+ }) : await (0, import_ai2.generateText)({
15682
+ model,
15683
+ system: systemPrompt,
15684
+ prompt: userPrompt,
15685
+ ...modelOptions
15686
+ });
15687
+ const rawUsage = result.usage;
15688
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
15689
+ return { text: result.text, tokenUsage };
15690
+ }
15691
+ const response = await graderProvider.invoke({
15692
+ question: userPrompt,
15693
+ systemPrompt,
15694
+ evalCaseId: context2.evalCase.id,
15695
+ attempt: context2.attempt,
15696
+ maxOutputTokens: this.maxOutputTokens,
15697
+ temperature: this.temperature
15698
+ });
15699
+ return {
15700
+ text: extractLastAssistantContent(response.output),
15701
+ providerResponse: response,
15702
+ tokenUsage: response.tokenUsage
15703
+ };
15186
15704
  }
15187
15705
  };
15188
15706
  function buildOutputSchema() {
@@ -15202,6 +15720,29 @@ function buildOutputSchema() {
15202
15720
  "}"
15203
15721
  ].join("\n");
15204
15722
  }
15723
+ function buildStructureRepairPrompt(options) {
15724
+ const { validationError, invalidResponse } = options;
15725
+ return [
15726
+ "The following evaluation response has useful grading content but invalid JSON structure.",
15727
+ "Repair it to satisfy the schema in the system prompt.",
15728
+ "Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object.",
15729
+ "",
15730
+ "Validation error:",
15731
+ validationError,
15732
+ "",
15733
+ "Invalid response:",
15734
+ invalidResponse
15735
+ ].join("\n");
15736
+ }
15737
+ function sumTokenUsage(first, second) {
15738
+ if (!first && !second) {
15739
+ return void 0;
15740
+ }
15741
+ return {
15742
+ input: (first?.input ?? 0) + (second?.input ?? 0),
15743
+ output: (first?.output ?? 0) + (second?.output ?? 0)
15744
+ };
15745
+ }
15205
15746
  function buildRubricOutputSchema() {
15206
15747
  return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
15207
15748
  You must return a valid JSON object matching this schema:
@@ -15301,19 +15842,21 @@ function calculateScoreRangeResult(result, rubrics) {
15301
15842
  rawScores[rubric.id] = rawScore;
15302
15843
  totalWeight += rubric.weight;
15303
15844
  weightedScoreSum += normalizedScore * rubric.weight;
15304
- let requiredMinScore;
15305
- if (rubric.required_min_score !== void 0) {
15306
- requiredMinScore = rubric.required_min_score;
15845
+ let minScoreThreshold;
15846
+ if (rubric.min_score !== void 0) {
15847
+ minScoreThreshold = rubric.min_score;
15848
+ } else if (rubric.required_min_score !== void 0) {
15849
+ minScoreThreshold = rubric.required_min_score / 10;
15307
15850
  } else if (rubric.required === true) {
15308
- requiredMinScore = 10;
15851
+ minScoreThreshold = 1;
15309
15852
  }
15310
15853
  const matchingRange = rubric.score_ranges?.find(
15311
15854
  (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
15312
15855
  );
15313
15856
  const rangeDescription = matchingRange?.outcome ?? "";
15314
15857
  const criterionLabel = rubric.outcome ?? rubric.id;
15315
- const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
15316
- if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
15858
+ const passed = !(minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) && rawScore >= 7;
15859
+ if (minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) {
15317
15860
  failedRequired = true;
15318
15861
  }
15319
15862
  assertions.push({
@@ -15390,11 +15933,11 @@ function createFilesystemTools(workspacePath) {
15390
15933
  execute: async (input) => {
15391
15934
  try {
15392
15935
  const resolved = resolveSandboxed(workspacePath, input.path);
15393
- const stat10 = await import_promises29.default.stat(resolved);
15394
- if (stat10.isDirectory()) {
15936
+ const stat11 = await import_promises29.default.stat(resolved);
15937
+ if (stat11.isDirectory()) {
15395
15938
  return { error: `'${input.path}' is a directory, not a file` };
15396
15939
  }
15397
- const buffer = Buffer.alloc(Math.min(stat10.size, MAX_FILE_SIZE));
15940
+ const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
15398
15941
  const fd = await import_promises29.default.open(resolved, "r");
15399
15942
  try {
15400
15943
  await fd.read(buffer, 0, buffer.length, 0);
@@ -15402,8 +15945,8 @@ function createFilesystemTools(workspacePath) {
15402
15945
  await fd.close();
15403
15946
  }
15404
15947
  const content = buffer.toString("utf-8");
15405
- const truncated = stat10.size > MAX_FILE_SIZE;
15406
- return { content, truncated, size: stat10.size };
15948
+ const truncated = stat11.size > MAX_FILE_SIZE;
15949
+ return { content, truncated, size: stat11.size };
15407
15950
  } catch (error) {
15408
15951
  return { error: error instanceof Error ? error.message : String(error) };
15409
15952
  }
@@ -15454,8 +15997,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
15454
15997
  const ext = import_node_path39.default.extname(entry.name).toLowerCase();
15455
15998
  if (BINARY_EXTENSIONS.has(ext)) continue;
15456
15999
  try {
15457
- const stat10 = await import_promises29.default.stat(fullPath);
15458
- if (stat10.size > MAX_FILE_SIZE) continue;
16000
+ const stat11 = await import_promises29.default.stat(fullPath);
16001
+ if (stat11.size > MAX_FILE_SIZE) continue;
15459
16002
  const content = await import_promises29.default.readFile(fullPath, "utf-8");
15460
16003
  const lines = content.split("\n");
15461
16004
  for (let i = 0; i < lines.length; i++) {
@@ -16099,115 +16642,115 @@ var FieldAccuracyEvaluator = class {
16099
16642
  * Evaluate a single field against the expected value.
16100
16643
  */
16101
16644
  evaluateField(fieldConfig, candidateData, expectedData) {
16102
- const { path: path52, match, required = true, weight = 1 } = fieldConfig;
16103
- const candidateValue = resolvePath(candidateData, path52);
16104
- const expectedValue = resolvePath(expectedData, path52);
16645
+ const { path: path53, match, required = true, weight = 1 } = fieldConfig;
16646
+ const candidateValue = resolvePath(candidateData, path53);
16647
+ const expectedValue = resolvePath(expectedData, path53);
16105
16648
  if (expectedValue === void 0) {
16106
16649
  return {
16107
- path: path52,
16650
+ path: path53,
16108
16651
  score: 1,
16109
16652
  // No expected value means no comparison needed
16110
16653
  weight,
16111
16654
  hit: true,
16112
- message: `${path52}: no expected value`
16655
+ message: `${path53}: no expected value`
16113
16656
  };
16114
16657
  }
16115
16658
  if (candidateValue === void 0) {
16116
16659
  if (required) {
16117
16660
  return {
16118
- path: path52,
16661
+ path: path53,
16119
16662
  score: 0,
16120
16663
  weight,
16121
16664
  hit: false,
16122
- message: `${path52} (required, missing)`
16665
+ message: `${path53} (required, missing)`
16123
16666
  };
16124
16667
  }
16125
16668
  return {
16126
- path: path52,
16669
+ path: path53,
16127
16670
  score: 1,
16128
16671
  // Don't penalize missing optional fields
16129
16672
  weight: 0,
16130
16673
  // Zero weight means it won't affect the score
16131
16674
  hit: true,
16132
- message: `${path52}: optional field missing`
16675
+ message: `${path53}: optional field missing`
16133
16676
  };
16134
16677
  }
16135
16678
  switch (match) {
16136
16679
  case "exact":
16137
- return this.compareExact(path52, candidateValue, expectedValue, weight);
16680
+ return this.compareExact(path53, candidateValue, expectedValue, weight);
16138
16681
  case "numeric_tolerance":
16139
16682
  return this.compareNumericTolerance(
16140
- path52,
16683
+ path53,
16141
16684
  candidateValue,
16142
16685
  expectedValue,
16143
16686
  fieldConfig,
16144
16687
  weight
16145
16688
  );
16146
16689
  case "date":
16147
- return this.compareDate(path52, candidateValue, expectedValue, fieldConfig, weight);
16690
+ return this.compareDate(path53, candidateValue, expectedValue, fieldConfig, weight);
16148
16691
  default:
16149
16692
  return {
16150
- path: path52,
16693
+ path: path53,
16151
16694
  score: 0,
16152
16695
  weight,
16153
16696
  hit: false,
16154
- message: `${path52}: unknown match type "${match}"`
16697
+ message: `${path53}: unknown match type "${match}"`
16155
16698
  };
16156
16699
  }
16157
16700
  }
16158
16701
  /**
16159
16702
  * Exact equality comparison.
16160
16703
  */
16161
- compareExact(path52, candidateValue, expectedValue, weight) {
16704
+ compareExact(path53, candidateValue, expectedValue, weight) {
16162
16705
  if (deepEqual(candidateValue, expectedValue)) {
16163
16706
  return {
16164
- path: path52,
16707
+ path: path53,
16165
16708
  score: 1,
16166
16709
  weight,
16167
16710
  hit: true,
16168
- message: path52
16711
+ message: path53
16169
16712
  };
16170
16713
  }
16171
16714
  if (typeof candidateValue !== typeof expectedValue) {
16172
16715
  return {
16173
- path: path52,
16716
+ path: path53,
16174
16717
  score: 0,
16175
16718
  weight,
16176
16719
  hit: false,
16177
- message: `${path52} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
16720
+ message: `${path53} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
16178
16721
  };
16179
16722
  }
16180
16723
  return {
16181
- path: path52,
16724
+ path: path53,
16182
16725
  score: 0,
16183
16726
  weight,
16184
16727
  hit: false,
16185
- message: `${path52} (value mismatch)`
16728
+ message: `${path53} (value mismatch)`
16186
16729
  };
16187
16730
  }
16188
16731
  /**
16189
16732
  * Numeric comparison with absolute or relative tolerance.
16190
16733
  */
16191
- compareNumericTolerance(path52, candidateValue, expectedValue, fieldConfig, weight) {
16734
+ compareNumericTolerance(path53, candidateValue, expectedValue, fieldConfig, weight) {
16192
16735
  const { tolerance = 0, relative = false } = fieldConfig;
16193
16736
  const candidateNum = toNumber(candidateValue);
16194
16737
  const expectedNum = toNumber(expectedValue);
16195
16738
  if (candidateNum === null || expectedNum === null) {
16196
16739
  return {
16197
- path: path52,
16740
+ path: path53,
16198
16741
  score: 0,
16199
16742
  weight,
16200
16743
  hit: false,
16201
- message: `${path52} (non-numeric value)`
16744
+ message: `${path53} (non-numeric value)`
16202
16745
  };
16203
16746
  }
16204
16747
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
16205
16748
  return {
16206
- path: path52,
16749
+ path: path53,
16207
16750
  score: 0,
16208
16751
  weight,
16209
16752
  hit: false,
16210
- message: `${path52} (invalid numeric value)`
16753
+ message: `${path53} (invalid numeric value)`
16211
16754
  };
16212
16755
  }
16213
16756
  const diff = Math.abs(candidateNum - expectedNum);
@@ -16220,61 +16763,61 @@ var FieldAccuracyEvaluator = class {
16220
16763
  }
16221
16764
  if (withinTolerance) {
16222
16765
  return {
16223
- path: path52,
16766
+ path: path53,
16224
16767
  score: 1,
16225
16768
  weight,
16226
16769
  hit: true,
16227
- message: `${path52} (within tolerance: diff=${diff.toFixed(2)})`
16770
+ message: `${path53} (within tolerance: diff=${diff.toFixed(2)})`
16228
16771
  };
16229
16772
  }
16230
16773
  return {
16231
- path: path52,
16774
+ path: path53,
16232
16775
  score: 0,
16233
16776
  weight,
16234
16777
  hit: false,
16235
- message: `${path52} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
16778
+ message: `${path53} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
16236
16779
  };
16237
16780
  }
16238
16781
  /**
16239
16782
  * Date comparison with format normalization.
16240
16783
  */
16241
- compareDate(path52, candidateValue, expectedValue, fieldConfig, weight) {
16784
+ compareDate(path53, candidateValue, expectedValue, fieldConfig, weight) {
16242
16785
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
16243
16786
  const candidateDate = parseDate(String(candidateValue), formats);
16244
16787
  const expectedDate = parseDate(String(expectedValue), formats);
16245
16788
  if (candidateDate === null) {
16246
16789
  return {
16247
- path: path52,
16790
+ path: path53,
16248
16791
  score: 0,
16249
16792
  weight,
16250
16793
  hit: false,
16251
- message: `${path52} (unparseable candidate date)`
16794
+ message: `${path53} (unparseable candidate date)`
16252
16795
  };
16253
16796
  }
16254
16797
  if (expectedDate === null) {
16255
16798
  return {
16256
- path: path52,
16799
+ path: path53,
16257
16800
  score: 0,
16258
16801
  weight,
16259
16802
  hit: false,
16260
- message: `${path52} (unparseable expected date)`
16803
+ message: `${path53} (unparseable expected date)`
16261
16804
  };
16262
16805
  }
16263
16806
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
16264
16807
  return {
16265
- path: path52,
16808
+ path: path53,
16266
16809
  score: 1,
16267
16810
  weight,
16268
16811
  hit: true,
16269
- message: path52
16812
+ message: path53
16270
16813
  };
16271
16814
  }
16272
16815
  return {
16273
- path: path52,
16816
+ path: path53,
16274
16817
  score: 0,
16275
16818
  weight,
16276
16819
  hit: false,
16277
- message: `${path52} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
16820
+ message: `${path53} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
16278
16821
  };
16279
16822
  }
16280
16823
  /**
@@ -16307,11 +16850,11 @@ var FieldAccuracyEvaluator = class {
16307
16850
  };
16308
16851
  }
16309
16852
  };
16310
- function resolvePath(obj, path52) {
16311
- if (!path52 || !obj) {
16853
+ function resolvePath(obj, path53) {
16854
+ if (!path53 || !obj) {
16312
16855
  return void 0;
16313
16856
  }
16314
- const parts = path52.split(/\.|\[|\]/).filter((p) => p.length > 0);
16857
+ const parts = path53.split(/\.|\[|\]/).filter((p) => p.length > 0);
16315
16858
  let current = obj;
16316
16859
  for (const part of parts) {
16317
16860
  if (current === null || current === void 0) {
@@ -16808,8 +17351,8 @@ var TokenUsageEvaluator = class {
16808
17351
 
16809
17352
  // src/evaluation/evaluators/tool-trajectory.ts
16810
17353
  init_cjs_shims();
16811
- function getNestedValue(obj, path52) {
16812
- const parts = path52.split(".");
17354
+ function getNestedValue(obj, path53) {
17355
+ const parts = path53.split(".");
16813
17356
  let current = obj;
16814
17357
  for (const part of parts) {
16815
17358
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -18602,7 +19145,7 @@ var WorkspacePoolManager = class {
18602
19145
  }
18603
19146
  /**
18604
19147
  * Reset an existing slot for reuse:
18605
- * 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
19148
+ * 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
18606
19149
  * 2. Re-copy template files (skip repo directories)
18607
19150
  */
18608
19151
  async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
@@ -18615,7 +19158,17 @@ var WorkspacePoolManager = class {
18615
19158
  continue;
18616
19159
  }
18617
19160
  const ref = repo.checkout?.ref ?? "HEAD";
18618
- await git(["reset", "--hard", ref], { cwd: repoDir });
19161
+ const resolve = repo.checkout?.resolve ?? "remote";
19162
+ if (resolve === "remote") {
19163
+ const fetchArgs = ["fetch", "origin", ref];
19164
+ if (repo.clone?.depth) {
19165
+ fetchArgs.splice(1, 0, "--depth", String(repo.clone.depth));
19166
+ }
19167
+ await git(fetchArgs, { cwd: repoDir });
19168
+ await git(["reset", "--hard", "FETCH_HEAD"], { cwd: repoDir });
19169
+ } else {
19170
+ await git(["reset", "--hard", ref], { cwd: repoDir });
19171
+ }
18619
19172
  const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
18620
19173
  await git(["clean", cleanFlag], { cwd: repoDir });
18621
19174
  }
@@ -18915,7 +19468,7 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
18915
19468
  }
18916
19469
 
18917
19470
  // src/evaluation/orchestrator.ts
18918
/**
 * Map a numeric score to a quality status.
 *
 * @param {number} score - Score to classify.
 * @param {number} [threshold=DEFAULT_THRESHOLD] - Minimum score (inclusive) that counts as "ok".
 * @returns {"ok"|"quality_failure"} "ok" when score >= threshold, otherwise "quality_failure".
 */
function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
  if (score >= threshold) {
    return "ok";
  }
  return "quality_failure";
}
18921
19474
  function buildSkippedEvaluatorError(scores) {
@@ -19007,7 +19560,7 @@ async function runEvaluation(options) {
19007
19560
  const filteredEvalCases = filterEvalCases(evalCases, filter);
19008
19561
  if (filteredEvalCases.length === 0) {
19009
19562
  if (filter) {
19010
- throw new Error(`No tests matched filter '${filter}' in ${evalFilePath}`);
19563
+ throw new Error(`No tests matched filter '${formatFilter(filter)}' in ${evalFilePath}`);
19011
19564
  }
19012
19565
  return [];
19013
19566
  }
@@ -19059,6 +19612,9 @@ async function runEvaluation(options) {
19059
19612
  const graderName = targetContext.graderTarget ?? targetContext.name;
19060
19613
  const resolvedGrader = resolveTargetByName(graderName);
19061
19614
  if (!resolvedGrader) {
19615
+ if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
19616
+ return void 0;
19617
+ }
19062
19618
  return getOrCreateProvider(targetContext);
19063
19619
  }
19064
19620
  return getOrCreateProvider(resolvedGrader);
@@ -19389,7 +19945,7 @@ async function runEvaluation(options) {
19389
19945
  const budgetResult = {
19390
19946
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
19391
19947
  testId: evalCase.id,
19392
- dataset: evalCase.dataset,
19948
+ suite: evalCase.suite,
19393
19949
  category: evalCase.category,
19394
19950
  score: 0,
19395
19951
  assertions: [],
@@ -19426,7 +19982,7 @@ async function runEvaluation(options) {
19426
19982
  const haltResult = {
19427
19983
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
19428
19984
  testId: evalCase.id,
19429
- dataset: evalCase.dataset,
19985
+ suite: evalCase.suite,
19430
19986
  category: evalCase.category,
19431
19987
  score: 0,
19432
19988
  assertions: [],
@@ -19738,7 +20294,7 @@ async function runBatchEvaluation(options) {
19738
20294
  targetResolver,
19739
20295
  availableTargets,
19740
20296
  verbose,
19741
- threshold: batchThreshold
20297
+ threshold: evalCase.threshold ?? batchThreshold
19742
20298
  });
19743
20299
  if (providerError) {
19744
20300
  result = {
@@ -20200,8 +20756,9 @@ async function runEvalCase(options) {
20200
20756
  fileChanges,
20201
20757
  workspacePath,
20202
20758
  verbose,
20203
- threshold: caseThreshold
20759
+ threshold: evalCase.threshold ?? caseThreshold
20204
20760
  });
20761
+ const effectiveThreshold = evalCase.threshold ?? caseThreshold;
20205
20762
  const totalDurationMs = Date.now() - caseStartMs;
20206
20763
  const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
20207
20764
  const evalRunTokenUsage = tokenUsage || graderTokens ? {
@@ -20215,7 +20772,7 @@ async function runEvalCase(options) {
20215
20772
  ...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
20216
20773
  };
20217
20774
  const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
20218
- const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, caseThreshold);
20775
+ const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, effectiveThreshold);
20219
20776
  const targetUsedField = targetUsed ? { targetUsed } : {};
20220
20777
  const finalResult = providerError ? {
20221
20778
  ...result,
@@ -20416,7 +20973,8 @@ async function evaluateCandidate(options) {
20416
20973
  targetResolver,
20417
20974
  availableTargets,
20418
20975
  fileChanges,
20419
- workspacePath
20976
+ workspacePath,
20977
+ threshold: evalThreshold
20420
20978
  });
20421
20979
  const completedAt = nowFn();
20422
20980
  let agentRequest;
@@ -20447,7 +21005,7 @@ async function evaluateCandidate(options) {
20447
21005
  return {
20448
21006
  timestamp: completedAt.toISOString(),
20449
21007
  testId: evalCase.id,
20450
- dataset: evalCase.dataset,
21008
+ suite: evalCase.suite,
20451
21009
  category: evalCase.category,
20452
21010
  conversationId: evalCase.conversation_id,
20453
21011
  score: score.score,
@@ -20490,7 +21048,8 @@ async function runEvaluatorsForCase(options) {
20490
21048
  targetResolver,
20491
21049
  availableTargets,
20492
21050
  fileChanges,
20493
- workspacePath
21051
+ workspacePath,
21052
+ threshold
20494
21053
  } = options;
20495
21054
  if (evalCase.assertions && evalCase.assertions.length > 0) {
20496
21055
  return runEvaluatorList({
@@ -20516,7 +21075,8 @@ async function runEvaluatorsForCase(options) {
20516
21075
  targetResolver,
20517
21076
  availableTargets,
20518
21077
  fileChanges,
20519
- workspacePath
21078
+ workspacePath,
21079
+ threshold
20520
21080
  });
20521
21081
  }
20522
21082
  const evaluatorKind = evalCase.evaluator ?? "llm-grader";
@@ -20618,7 +21178,8 @@ async function runEvaluatorList(options) {
20618
21178
  name: evaluatorConfig.name,
20619
21179
  type: evaluatorConfig.type,
20620
21180
  weight,
20621
- ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
21181
+ ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
21182
+ ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
20622
21183
  });
20623
21184
  scores.push({
20624
21185
  name: evaluatorConfig.name,
@@ -20653,7 +21214,8 @@ async function runEvaluatorList(options) {
20653
21214
  name: evaluatorConfig.name ?? "unknown",
20654
21215
  type: evaluatorConfig.type ?? "llm-grader",
20655
21216
  weight,
20656
- ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
21217
+ ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
21218
+ ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
20657
21219
  });
20658
21220
  scores.push({
20659
21221
  name: evaluatorConfig.name ?? "unknown",
@@ -20687,9 +21249,10 @@ async function runEvaluatorList(options) {
20687
21249
  }
20688
21250
  }
20689
21251
  }
21252
+ const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD;
20690
21253
  const hasRequiredFailure = scored.some((entry) => {
20691
21254
  if (!entry.required) return false;
20692
- const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
21255
+ const minScore = entry.min_score ?? (typeof entry.required === "number" ? entry.required : effectiveThreshold);
20693
21256
  return entry.score.score < minScore;
20694
21257
  });
20695
21258
  const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
@@ -20700,17 +21263,23 @@ async function runEvaluatorList(options) {
20700
21263
  const expectedAspectCount = assertions.length || 1;
20701
21264
  const score = {
20702
21265
  score: aggregateScore,
20703
- verdict: scoreToVerdict(aggregateScore),
21266
+ verdict: scoreToVerdict(aggregateScore, effectiveThreshold),
20704
21267
  assertions,
20705
21268
  expectedAspectCount
20706
21269
  };
20707
21270
  return { score, scores };
20708
21271
  }
21272
/**
 * Render a test filter for display.
 *
 * @param {string|string[]} filter - A single pattern or a list of patterns.
 * @returns {string} The pattern itself, or the patterns joined with ", ".
 */
function formatFilter(filter) {
  if (typeof filter === "string") {
    return filter;
  }
  return filter.join(", ");
}
21275
/**
 * Test an id against a filter using micromatch.
 *
 * @param {string} id - Identifier to test.
 * @param {string|string[]} filter - One pattern, or a list of which any may match.
 * @returns {boolean} True when the id matches the pattern (or at least one of them).
 */
function matchesFilter3(id, filter) {
  const matchesPattern = (pattern) => import_micromatch3.default.isMatch(id, pattern);
  if (typeof filter === "string") {
    return matchesPattern(filter);
  }
  return filter.some(matchesPattern);
}
20709
21278
  function filterEvalCases(evalCases, filter) {
20710
21279
  if (!filter) {
20711
21280
  return evalCases;
20712
21281
  }
20713
- return evalCases.filter((evalCase) => import_micromatch3.default.isMatch(evalCase.id, filter));
21282
+ return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter));
20714
21283
  }
20715
21284
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
20716
21285
  const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
@@ -20797,7 +21366,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
20797
21366
  return {
20798
21367
  timestamp: timestamp.toISOString(),
20799
21368
  testId: evalCase.id,
20800
- dataset: evalCase.dataset,
21369
+ suite: evalCase.suite,
20801
21370
  category: evalCase.category,
20802
21371
  conversationId: evalCase.conversation_id,
20803
21372
  score: 0,
@@ -21071,6 +21640,7 @@ async function evaluate(config) {
21071
21640
  verbose: config.verbose,
21072
21641
  maxConcurrency: config.workers ?? 3,
21073
21642
  filter: config.filter,
21643
+ threshold: config.threshold,
21074
21644
  evalCases,
21075
21645
  onResult: async (result) => {
21076
21646
  collectedResults.push(result);
@@ -21081,19 +21651,19 @@ async function evaluate(config) {
21081
21651
  const durationMs = Date.now() - startTime;
21082
21652
  return {
21083
21653
  results: allResults,
21084
- summary: computeSummary(allResults, durationMs)
21654
+ summary: computeSummary(allResults, durationMs, config.threshold)
21085
21655
  };
21086
21656
  }
21087
21657
  function mapAssertionType(type) {
21088
21658
  return type.replace(/_/g, "-");
21089
21659
  }
21090
- function computeSummary(results, durationMs) {
21660
+ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
21091
21661
  const total = results.length;
21092
21662
  let passed = 0;
21093
21663
  let scoreSum = 0;
21094
21664
  for (const r of results) {
21095
21665
  scoreSum += r.score;
21096
- if (r.score >= PASS_THRESHOLD) {
21666
+ if (r.score >= threshold) {
21097
21667
  passed++;
21098
21668
  }
21099
21669
  }
@@ -21207,7 +21777,7 @@ var CONFIG_FILE_NAMES = [
21207
21777
  ];
21208
21778
  async function loadTsConfig(projectRoot) {
21209
21779
  const { existsSync: existsSync7 } = await import("fs");
21210
- const { pathToFileURL } = await import("url");
21780
+ const { pathToFileURL: pathToFileURL2 } = await import("url");
21211
21781
  const { join: join2 } = await import("path");
21212
21782
  for (const fileName of CONFIG_FILE_NAMES) {
21213
21783
  const filePath = join2(projectRoot, fileName);
@@ -21215,7 +21785,7 @@ async function loadTsConfig(projectRoot) {
21215
21785
  continue;
21216
21786
  }
21217
21787
  try {
21218
- const fileUrl = pathToFileURL(filePath).href;
21788
+ const fileUrl = pathToFileURL2(filePath).href;
21219
21789
  const mod = await import(fileUrl);
21220
21790
  const config = mod.default ?? mod;
21221
21791
  return AgentVConfigSchema.parse(config);
@@ -21656,7 +22226,7 @@ var OtelTraceExporter = class {
21656
22226
  rootSpan.setAttribute("gen_ai.system", "agentv");
21657
22227
  rootSpan.setAttribute("agentv.test_id", result.testId);
21658
22228
  rootSpan.setAttribute("agentv.target", result.target);
21659
- if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
22229
+ if (result.suite) rootSpan.setAttribute("agentv.suite", result.suite);
21660
22230
  rootSpan.setAttribute("agentv.score", result.score);
21661
22231
  if (captureContent && result.output.length > 0) {
21662
22232
  const lastMsg = result.output[result.output.length - 1];
@@ -21865,7 +22435,7 @@ var OtelStreamingObserver = class {
21865
22435
  this.rootSpan.setAttribute("gen_ai.system", "agentv");
21866
22436
  this.rootSpan.setAttribute("agentv.test_id", testId);
21867
22437
  this.rootSpan.setAttribute("agentv.target", target);
21868
- if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
22438
+ if (evalSet) this.rootSpan.setAttribute("agentv.suite", evalSet);
21869
22439
  this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
21870
22440
  }
21871
22441
  /** Create and immediately export a tool span */
@@ -22221,12 +22791,244 @@ function extractToolResultContent(content) {
22221
22791
  return parts.length > 0 ? parts.join("") : void 0;
22222
22792
  }
22223
22793
 
22224
- // src/import/session-discovery.ts
22794
+ // src/import/codex-parser.ts
22795
+ init_cjs_shims();
22796
/**
 * Parse the JSONL text of a Codex rollout file into a transcript entry.
 *
 * Lines that are blank, are not valid JSON, do not decode to an object, or
 * have no `type` are skipped. Session metadata is taken from `session_meta`
 * and `turn_context` records; `response_item` records become messages. Each
 * tool call is emitted as its own assistant message and later receives its
 * `output` when the matching `*_output` record (same `call_id`) is seen.
 * Developer messages and reasoning items are ignored.
 *
 * @param {string} jsonl - Raw rollout contents, one JSON record per line.
 * @returns {{messages: object[], source: object, tokenUsage: undefined, durationMs: (number|undefined), costUsd: null}}
 *   Parsed transcript. `durationMs` is the span between the first and last
 *   record timestamps, or undefined when it cannot be computed.
 */
function parseCodexSession(jsonl) {
  const messages = [];
  let sessionId = "";
  let cwd;
  let model;
  let version;
  let startTimestamp;
  let endTimestamp;
  // call_id -> location of the tool call still waiting for its output record.
  const pendingCalls = /* @__PURE__ */ new Map();

  // Append an assistant message holding a single tool call and remember where
  // it lives so the matching output record can be attached later.
  const recordToolCall = (payload, rawInput) => {
    const callId = String(payload.call_id ?? "");
    let input = rawInput;
    if (typeof rawInput === "string") {
      try {
        input = JSON.parse(rawInput);
      } catch {
        // Not JSON (e.g. free-form text): keep the raw string.
        input = rawInput;
      }
    }
    const msgIdx = messages.length;
    messages.push({
      role: "assistant",
      toolCalls: [{ tool: String(payload.name ?? ""), input, id: callId }]
    });
    if (callId) {
      pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
    }
  };

  for (const line of jsonl.split("\n")) {
    if (line.trim().length === 0) continue;
    let entry;
    try {
      entry = JSON.parse(line);
    } catch {
      continue;
    }
    // A line such as `null` is valid JSON but not a record; reading `.type`
    // on it would throw, so reject anything that is not an object first.
    if (entry === null || typeof entry !== "object" || !entry.type) continue;
    if (entry.timestamp) {
      if (!startTimestamp) startTimestamp = entry.timestamp;
      endTimestamp = entry.timestamp;
    }
    const payload = entry.payload ?? {};
    switch (entry.type) {
      case "session_meta": {
        sessionId = String(payload.id ?? "");
        cwd = payload.cwd ? String(payload.cwd) : void 0;
        version = payload.cli_version ? String(payload.cli_version) : void 0;
        if (payload.model && !model) {
          model = String(payload.model);
        }
        break;
      }
      case "turn_context": {
        // First value wins: later turns do not overwrite model/cwd.
        if (payload.model && !model) {
          model = String(payload.model);
        }
        if (payload.cwd && !cwd) {
          cwd = String(payload.cwd);
        }
        break;
      }
      case "response_item": {
        const itemType = String(payload.type ?? "");
        const role = String(payload.role ?? "");
        switch (itemType) {
          case "message": {
            if (role === "developer") break;
            const content = extractResponseItemContent(payload.content);
            if (role === "user" && content) {
              messages.push({ role: "user", content });
            } else if (role === "assistant" && content) {
              messages.push({ role: "assistant", content });
            }
            break;
          }
          case "function_call": {
            recordToolCall(payload, payload.arguments);
            break;
          }
          case "custom_tool_call": {
            // NOTE(review): custom tool calls appear to carry their payload in
            // `input` rather than `arguments`; fall back to it when
            // `arguments` is absent. Confirm against real rollout samples.
            recordToolCall(payload, payload.arguments ?? payload.input);
            break;
          }
          case "function_call_output":
          case "custom_tool_call_output": {
            const callId = String(payload.call_id ?? "");
            const pending = pendingCalls.get(callId);
            if (pending) {
              // Replace the message with a copy instead of mutating in place.
              const existingMsg = messages[pending.msgIdx];
              const existingCalls = [...existingMsg.toolCalls ?? []];
              existingCalls[pending.toolIdx] = {
                ...existingCalls[pending.toolIdx],
                output: payload.output
              };
              messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls };
              pendingCalls.delete(callId);
            }
            break;
          }
          // Skip reasoning blocks (thinking tokens)
          case "reasoning":
            break;
        }
        break;
      }
    }
  }

  let durationMs;
  if (startTimestamp && endTimestamp) {
    const elapsed = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
    // Unparseable timestamps yield NaN; report "unknown" instead.
    if (Number.isFinite(elapsed)) {
      durationMs = elapsed;
    }
  }

  const source = {
    provider: "codex",
    sessionId,
    cwd,
    startedAt: startTimestamp,
    model,
    version
  };
  return {
    messages,
    source,
    // Codex rollout files don't include token counts (only rate limit info)
    tokenUsage: void 0,
    durationMs,
    costUsd: null
  };
}
22945
/**
 * Flatten the `content` of a response item into plain text.
 *
 * @param {unknown} content - A string, or an array of content blocks.
 * @returns {string|undefined} The string as-is; for an array, the `text` of
 *   every object block that has a string `text`, concatenated in order.
 *   Undefined when the input is neither, or no block contributes text.
 */
function extractResponseItemContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return undefined;
  }
  const texts = content
    .filter((block) => typeof block === "object" && block !== null && typeof block.text === "string")
    .map((block) => block.text);
  if (texts.length === 0) {
    return undefined;
  }
  return texts.join("");
}
22959
+
22960
+ // src/import/codex-session-discovery.ts
22225
22961
  init_cjs_shims();
22226
22962
  var import_promises36 = require("fs/promises");
22227
22963
  var import_node_os8 = require("os");
22228
22964
  var import_node_path53 = __toESM(require("path"), 1);
22229
- var DEFAULT_PROJECTS_DIR = () => import_node_path53.default.join((0, import_node_os8.homedir)(), ".claude", "projects");
22965
+ var DEFAULT_SESSIONS_DIR = () => import_node_path53.default.join((0, import_node_os8.homedir)(), ".codex", "sessions");
22966
/**
 * Find Codex rollout files under a sessions directory laid out as
 * `<sessionsDir>/<year>/<month>/<day>/rollout-*.jsonl`.
 *
 * @param {object} [opts]
 * @param {string} [opts.sessionsDir] - Root to scan; defaults to DEFAULT_SESSIONS_DIR().
 * @param {boolean} [opts.latest] - When truthy, return only the most recent session.
 * @param {number} [opts.limit] - Maximum number of sessions (default 10).
 * @param {string} [opts.date] - Only include day directories whose `year-month-day` equals this value.
 * @returns {Promise<Array<{sessionId: string, filePath: string, filename: string, updatedAt: Date}>>}
 *   Sessions sorted by modification time, newest first. Directories that
 *   cannot be read are skipped; an unreadable root yields an empty list.
 */
async function discoverCodexSessions(opts) {
  const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;

  // List a directory, signalling failure with null instead of throwing.
  const listEntries = async (dir) => {
    try {
      return await (0, import_promises36.readdir)(dir);
    } catch {
      return null;
    }
  };

  // Modification time of a file, or the epoch when it cannot be stat'ed.
  const modifiedAt = async (target) => {
    try {
      return (await (0, import_promises36.stat)(target)).mtime;
    } catch {
      return /* @__PURE__ */ new Date(0);
    }
  };

  const yearDirs = await listEntries(sessionsDir);
  if (yearDirs === null) {
    return [];
  }

  const found = [];
  for (const year of yearDirs) {
    const yearPath = import_node_path53.default.join(sessionsDir, year);
    const monthDirs = await listEntries(yearPath);
    if (monthDirs === null) continue;

    for (const month of monthDirs) {
      const monthPath = import_node_path53.default.join(yearPath, month);
      const dayDirs = await listEntries(monthPath);
      if (dayDirs === null) continue;

      for (const day of dayDirs) {
        if (opts?.date && `${year}-${month}-${day}` !== opts.date) continue;

        const dayPath = import_node_path53.default.join(monthPath, day);
        const files = await listEntries(dayPath);
        if (files === null) continue;

        for (const file of files) {
          const isRollout = file.startsWith("rollout-") && file.endsWith(".jsonl");
          if (!isRollout) continue;

          const filePath = import_node_path53.default.join(dayPath, file);
          const stem = file.slice(0, -".jsonl".length);
          // With six or more dash-separated segments the last five are taken
          // as the session id; otherwise the whole stem is used.
          const segments = stem.split("-");
          const sessionId = segments.length >= 6 ? segments.slice(-5).join("-") : stem;
          const updatedAt = await modifiedAt(filePath);
          found.push({ sessionId, filePath, filename: file, updatedAt });
        }
      }
    }
  }

  found.sort((left, right) => right.updatedAt.getTime() - left.updatedAt.getTime());
  return found.slice(0, limit);
}
23025
+
23026
+ // src/import/session-discovery.ts
23027
+ init_cjs_shims();
23028
+ var import_promises37 = require("fs/promises");
23029
+ var import_node_os9 = require("os");
23030
+ var import_node_path54 = __toESM(require("path"), 1);
23031
+ var DEFAULT_PROJECTS_DIR = () => import_node_path54.default.join((0, import_node_os9.homedir)(), ".claude", "projects");
22230
23032
  function encodeProjectPath(projectPath) {
22231
23033
  return projectPath.replace(/\//g, "-");
22232
23034
  }
@@ -22235,7 +23037,7 @@ async function discoverClaudeSessions(opts) {
22235
23037
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;
22236
23038
  let projectDirs;
22237
23039
  try {
22238
- projectDirs = await (0, import_promises36.readdir)(projectsDir);
23040
+ projectDirs = await (0, import_promises37.readdir)(projectsDir);
22239
23041
  } catch {
22240
23042
  return [];
22241
23043
  }
@@ -22245,10 +23047,10 @@ async function discoverClaudeSessions(opts) {
22245
23047
  }
22246
23048
  const sessions = [];
22247
23049
  for (const projectDir of projectDirs) {
22248
- const dirPath = import_node_path53.default.join(projectsDir, projectDir);
23050
+ const dirPath = import_node_path54.default.join(projectsDir, projectDir);
22249
23051
  let entries;
22250
23052
  try {
22251
- entries = await (0, import_promises36.readdir)(dirPath);
23053
+ entries = await (0, import_promises37.readdir)(dirPath);
22252
23054
  } catch {
22253
23055
  continue;
22254
23056
  }
@@ -22256,10 +23058,10 @@ async function discoverClaudeSessions(opts) {
22256
23058
  if (!entry.endsWith(".jsonl")) continue;
22257
23059
  const sessionId = entry.replace(/\.jsonl$/, "");
22258
23060
  if (opts?.sessionId && sessionId !== opts.sessionId) continue;
22259
- const filePath = import_node_path53.default.join(dirPath, entry);
23061
+ const filePath = import_node_path54.default.join(dirPath, entry);
22260
23062
  let updatedAt;
22261
23063
  try {
22262
- const fileStat = await (0, import_promises36.stat)(filePath);
23064
+ const fileStat = await (0, import_promises37.stat)(filePath);
22263
23065
  updatedAt = fileStat.mtime;
22264
23066
  } catch {
22265
23067
  updatedAt = /* @__PURE__ */ new Date(0);
@@ -22276,13 +23078,91 @@ async function discoverClaudeSessions(opts) {
22276
23078
  return sessions.slice(0, limit);
22277
23079
  }
22278
23080
 
23081
+ // src/import/transcript-provider.ts
23082
+ init_cjs_shims();
23083
+
22279
23084
  // src/import/types.ts
22280
23085
  init_cjs_shims();
22281
- var import_promises37 = require("fs/promises");
23086
+ var import_promises38 = require("fs/promises");
23087
/**
 * Convert an in-memory transcript entry (camelCase fields) into the
 * snake_case record shape returned for one transcript JSONL line.
 *
 * `input` is the content of the first message whose role is "user", but only
 * when that content is a string; otherwise it is the empty string.
 *
 * @param {object} entry - Entry with `messages`, `source`, and optional
 *   `tokenUsage`, `durationMs`, `costUsd`.
 * @returns {object} Record with `input`, `output`, `token_usage`,
 *   `duration_ms`, `cost_usd`, and `source`.
 */
function toTranscriptJsonLine(entry) {
  const { messages, source } = entry;
  const openingUserTurn = messages.find((message) => message.role === "user");

  let input = "";
  if (typeof openingUserTurn?.content === "string") {
    input = openingUserTurn.content;
  }

  // Token usage is copied field by field so only the three known counters
  // are carried over; it stays undefined when the entry has none.
  let tokenUsage;
  if (entry.tokenUsage) {
    const { input: inputTokens, output: outputTokens, cached } = entry.tokenUsage;
    tokenUsage = { input: inputTokens, output: outputTokens, cached };
  }

  return {
    input,
    output: messages,
    token_usage: tokenUsage,
    duration_ms: entry.durationMs,
    cost_usd: entry.costUsd,
    source: {
      provider: source.provider,
      session_id: source.sessionId,
      model: source.model,
      timestamp: source.startedAt,
      git_branch: source.gitBranch,
      // Fall back to the project path when no working directory was recorded.
      cwd: source.cwd ?? source.projectPath,
      version: source.version
    }
  };
}
23111
/**
 * Read a transcript JSONL file and parse every non-blank line as JSON.
 *
 * @param {string} filePath - Path of the JSONL file; it is read as UTF-8.
 * @returns {Promise<unknown[]>} Parsed values in file order. Lines that are
 *   empty or whitespace-only are skipped.
 * @throws {SyntaxError} When a line is not valid JSON. The message names the
 *   file and the 1-based line number; the original parse error is attached
 *   as `cause`.
 */
async function readTranscriptJsonl(filePath) {
  const text = await (0, import_promises38.readFile)(filePath, "utf8");
  const rawLines = text.split("\n");
  const records = [];
  for (let index = 0; index < rawLines.length; index++) {
    // Parse the trimmed line: trim() removes a leading UTF-8 byte-order mark
    // (which JSON.parse rejects) as well as the "\r" left by CRLF endings.
    const candidate = rawLines[index].trim();
    if (candidate.length === 0) continue;
    try {
      records.push(JSON.parse(candidate));
    } catch (error) {
      // Keep the SyntaxError type callers already see, but say where it failed.
      throw new SyntaxError(
        `Invalid JSON in transcript file ${filePath} at line ${index + 1}: ${error.message}`,
        { cause: error }
      );
    }
  }
  return records;
}
22282
23115
  async function readTranscriptFile(filePath) {
22283
- return (0, import_promises37.readFile)(filePath, "utf8");
23116
+ return (0, import_promises38.readFile)(filePath, "utf8");
22284
23117
  }
22285
23118
 
23119
+ // src/import/transcript-provider.ts
23120
/**
 * Provider that replays pre-recorded transcript lines instead of calling a
 * live target. Each `invoke` call returns the next line in order, so each
 * transcript line maps to one test case.
 */
var TranscriptProvider = class _TranscriptProvider {
  /** Provider identifier, `transcript:<targetName>`. */
  id;
  kind = "transcript";
  targetName;
  /** Parsed transcript records, consumed front to back. */
  lines;
  /** Index of the next record `invoke` will return. */
  cursor = 0;
  /**
   * @param {string} targetName - Name used for `targetName` and inside `id`.
   * @param {object[]} lines - Parsed transcript records (snake_case fields).
   */
  constructor(targetName, lines) {
    this.targetName = targetName;
    this.id = `transcript:${targetName}`;
    this.lines = lines;
  }
  /**
   * Create a TranscriptProvider from a JSONL file path.
   *
   * The target name is the first line's `source.provider`, or "transcript"
   * when that line has no `source` or no provider recorded.
   *
   * @param {string} filePath - Path of the transcript JSONL file.
   * @returns {Promise<_TranscriptProvider>} Provider loaded with every line.
   * @throws {Error} When the file contains no non-blank lines.
   */
  static async fromFile(filePath) {
    const lines = await readTranscriptJsonl(filePath);
    if (lines.length === 0) {
      throw new Error(`Transcript file is empty: ${filePath}`);
    }
    // Lines are not schema-validated when read, so `source` may be absent.
    const providerName = lines[0].source?.provider ?? "transcript";
    return new _TranscriptProvider(providerName, lines);
  }
  /** Number of transcript lines this provider holds. */
  get lineCount() {
    return this.lines.length;
  }
  /**
   * Return the next recorded response. The request itself is ignored.
   *
   * @param {unknown} _request - Unused; present to match the provider shape.
   * @returns {Promise<object>} `output`, `tokenUsage`, `durationMs`,
   *   `costUsd`, and `startTime` taken from the next transcript line.
   * @throws {Error} When every line has already been returned.
   */
  async invoke(_request) {
    if (this.cursor >= this.lines.length) {
      throw new Error(
        `Transcript exhausted: ${this.lines.length} line(s) available but ${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`
      );
    }
    const line = this.lines[this.cursor++];
    return {
      output: line.output,
      tokenUsage: line.token_usage ? {
        input: line.token_usage.input,
        output: line.token_usage.output,
        cached: line.token_usage.cached
      } : void 0,
      durationMs: line.duration_ms,
      // A null cost in the file is reported as undefined.
      costUsd: line.cost_usd ?? void 0,
      // Tolerate lines without a `source` object rather than throwing.
      startTime: line.source?.timestamp
    };
  }
};
23165
+
22286
23166
  // src/index.ts
22287
23167
  function createAgentKernel() {
22288
23168
  return { status: "stub" };
@@ -22297,6 +23177,7 @@ function createAgentKernel() {
22297
23177
  DEFAULT_EVALUATOR_TEMPLATE,
22298
23178
  DEFAULT_EVAL_PATTERNS,
22299
23179
  DEFAULT_EXPLORATION_TOOLS,
23180
+ DEFAULT_THRESHOLD,
22300
23181
  DeterministicAssertionEvaluator,
22301
23182
  EvaluatorRegistry,
22302
23183
  ExecutionMetricsEvaluator,
@@ -22318,6 +23199,7 @@ function createAgentKernel() {
22318
23199
  TemplateNotFoundError,
22319
23200
  TokenUsageEvaluator,
22320
23201
  ToolTrajectoryEvaluator,
23202
+ TranscriptProvider,
22321
23203
  WorkspaceCreationError,
22322
23204
  WorkspacePoolManager,
22323
23205
  addProject,
@@ -22354,6 +23236,7 @@ function createAgentKernel() {
22354
23236
  detectFormat,
22355
23237
  discoverAssertions,
22356
23238
  discoverClaudeSessions,
23239
+ discoverCodexSessions,
22357
23240
  discoverCopilotSessions,
22358
23241
  discoverGraders,
22359
23242
  discoverJudges,
@@ -22414,6 +23297,8 @@ function createAgentKernel() {
22414
23297
  normalizeLineEndings,
22415
23298
  parseAgentSkillsEvals,
22416
23299
  parseClaudeSession,
23300
+ parseCodexSession,
23301
+ parseCopilotEvents,
22417
23302
  parseJsonFromText,
22418
23303
  parseJsonSafe,
22419
23304
  readJsonFile,
@@ -22421,6 +23306,7 @@ function createAgentKernel() {
22421
23306
  readTestSuiteMetadata,
22422
23307
  readTextFile,
22423
23308
  readTranscriptFile,
23309
+ readTranscriptJsonl,
22424
23310
  removeProject,
22425
23311
  resolveAndCreateProvider,
22426
23312
  resolveDelegatedTargetDefinition,
@@ -22453,6 +23339,7 @@ function createAgentKernel() {
22453
23339
  substituteVariables,
22454
23340
  toCamelCaseDeep,
22455
23341
  toSnakeCaseDeep,
23342
+ toTranscriptJsonLine,
22456
23343
  tokensPerTool,
22457
23344
  touchProject,
22458
23345
  transpileEvalYaml,