@agentv/core 4.6.1 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -31,12 +31,9 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
31
31
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
32
32
 
33
33
  // ../../node_modules/.bun/tsup@8.3.5+19811ebab77a7b1c/node_modules/tsup/assets/cjs_shims.js
34
- var getImportMetaUrl, importMetaUrl;
35
34
  var init_cjs_shims = __esm({
36
35
  "../../node_modules/.bun/tsup@8.3.5+19811ebab77a7b1c/node_modules/tsup/assets/cjs_shims.js"() {
37
36
  "use strict";
38
- getImportMetaUrl = () => typeof document === "undefined" ? new URL(`file:${__filename}`).href : document.currentScript && document.currentScript.src || new URL("main.js", document.baseURI).href;
39
- importMetaUrl = /* @__PURE__ */ getImportMetaUrl();
40
37
  }
41
38
  });
42
39
 
@@ -1435,6 +1432,7 @@ __export(index_exports, {
1435
1432
  DEFAULT_EVALUATOR_TEMPLATE: () => DEFAULT_EVALUATOR_TEMPLATE,
1436
1433
  DEFAULT_EVAL_PATTERNS: () => DEFAULT_EVAL_PATTERNS,
1437
1434
  DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
1435
+ DEFAULT_THRESHOLD: () => DEFAULT_THRESHOLD,
1438
1436
  DeterministicAssertionEvaluator: () => DeterministicAssertionEvaluator,
1439
1437
  EvaluatorRegistry: () => EvaluatorRegistry,
1440
1438
  ExecutionMetricsEvaluator: () => ExecutionMetricsEvaluator,
@@ -1456,6 +1454,7 @@ __export(index_exports, {
1456
1454
  TemplateNotFoundError: () => TemplateNotFoundError,
1457
1455
  TokenUsageEvaluator: () => TokenUsageEvaluator,
1458
1456
  ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
1457
+ TranscriptProvider: () => TranscriptProvider,
1459
1458
  WorkspaceCreationError: () => WorkspaceCreationError,
1460
1459
  WorkspacePoolManager: () => WorkspacePoolManager,
1461
1460
  addProject: () => addProject,
@@ -1492,6 +1491,7 @@ __export(index_exports, {
1492
1491
  detectFormat: () => detectFormat,
1493
1492
  discoverAssertions: () => discoverAssertions,
1494
1493
  discoverClaudeSessions: () => discoverClaudeSessions,
1494
+ discoverCodexSessions: () => discoverCodexSessions,
1495
1495
  discoverCopilotSessions: () => discoverCopilotSessions,
1496
1496
  discoverGraders: () => discoverGraders,
1497
1497
  discoverJudges: () => discoverGraders,
@@ -1552,6 +1552,8 @@ __export(index_exports, {
1552
1552
  normalizeLineEndings: () => normalizeLineEndings,
1553
1553
  parseAgentSkillsEvals: () => parseAgentSkillsEvals,
1554
1554
  parseClaudeSession: () => parseClaudeSession,
1555
+ parseCodexSession: () => parseCodexSession,
1556
+ parseCopilotEvents: () => parseCopilotEvents,
1555
1557
  parseJsonFromText: () => parseJsonFromText,
1556
1558
  parseJsonSafe: () => parseJsonSafe,
1557
1559
  readJsonFile: () => readJsonFile,
@@ -1559,6 +1561,7 @@ __export(index_exports, {
1559
1561
  readTestSuiteMetadata: () => readTestSuiteMetadata,
1560
1562
  readTextFile: () => readTextFile,
1561
1563
  readTranscriptFile: () => readTranscriptFile,
1564
+ readTranscriptJsonl: () => readTranscriptJsonl,
1562
1565
  removeProject: () => removeProject,
1563
1566
  resolveAndCreateProvider: () => resolveAndCreateProvider,
1564
1567
  resolveDelegatedTargetDefinition: () => resolveDelegatedTargetDefinition,
@@ -1591,6 +1594,7 @@ __export(index_exports, {
1591
1594
  substituteVariables: () => substituteVariables,
1592
1595
  toCamelCaseDeep: () => toCamelCaseDeep,
1593
1596
  toSnakeCaseDeep: () => toSnakeCaseDeep,
1597
+ toTranscriptJsonLine: () => toTranscriptJsonLine,
1594
1598
  tokensPerTool: () => tokensPerTool,
1595
1599
  touchProject: () => touchProject,
1596
1600
  transpileEvalYaml: () => transpileEvalYaml,
@@ -2675,8 +2679,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2675
2679
  const negate = rawEvaluator.negate === true ? true : void 0;
2676
2680
  if (isCustomType) {
2677
2681
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
2678
- const required2 = parseRequired(rawEvaluator.required);
2679
- const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "negate"]);
2682
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
2683
+ rawEvaluator.required,
2684
+ rawEvaluator.min_score,
2685
+ name,
2686
+ evalId
2687
+ );
2688
+ const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "min_score", "negate"]);
2680
2689
  const config2 = {};
2681
2690
  for (const [key, value] of Object.entries(rawEvaluator)) {
2682
2691
  if (!knownProps2.has(key) && value !== void 0) {
@@ -2688,6 +2697,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2688
2697
  type: customTypeName,
2689
2698
  ...weight2 !== void 0 ? { weight: weight2 } : {},
2690
2699
  ...required2 !== void 0 ? { required: required2 } : {},
2700
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
2691
2701
  ...negate !== void 0 ? { negate } : {},
2692
2702
  ...Object.keys(config2).length > 0 ? { config: config2 } : {}
2693
2703
  });
@@ -2757,7 +2767,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2757
2767
  );
2758
2768
  }
2759
2769
  }
2760
- const required2 = parseRequired(rawEvaluator.required);
2770
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
2771
+ rawEvaluator.required,
2772
+ rawEvaluator.min_score,
2773
+ name,
2774
+ evalId
2775
+ );
2761
2776
  const knownProps2 = /* @__PURE__ */ new Set([
2762
2777
  "name",
2763
2778
  "type",
@@ -2783,6 +2798,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2783
2798
  resolvedCwd,
2784
2799
  ...weight2 !== void 0 ? { weight: weight2 } : {},
2785
2800
  ...required2 !== void 0 ? { required: required2 } : {},
2801
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
2786
2802
  ...negate !== void 0 ? { negate } : {},
2787
2803
  ...Object.keys(config2).length > 0 ? { config: config2 } : {},
2788
2804
  ...targetConfig !== void 0 ? { target: targetConfig } : {}
@@ -2911,7 +2927,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2911
2927
  };
2912
2928
  }
2913
2929
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
2914
- const required2 = parseRequired(rawEvaluator.required);
2930
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
2931
+ rawEvaluator.required,
2932
+ rawEvaluator.min_score,
2933
+ name,
2934
+ evalId
2935
+ );
2915
2936
  evaluators.push({
2916
2937
  name,
2917
2938
  type: "composite",
@@ -2919,6 +2940,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2919
2940
  aggregator,
2920
2941
  ...weight2 !== void 0 ? { weight: weight2 } : {},
2921
2942
  ...required2 !== void 0 ? { required: required2 } : {},
2943
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
2922
2944
  ...negate !== void 0 ? { negate } : {}
2923
2945
  });
2924
2946
  continue;
@@ -3029,7 +3051,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3029
3051
  continue;
3030
3052
  }
3031
3053
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3032
- const required2 = parseRequired(rawEvaluator.required);
3054
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3055
+ rawEvaluator.required,
3056
+ rawEvaluator.min_score,
3057
+ name,
3058
+ evalId
3059
+ );
3033
3060
  const config2 = {
3034
3061
  name,
3035
3062
  type: "tool-trajectory",
@@ -3038,6 +3065,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3038
3065
  ...expected ? { expected } : {},
3039
3066
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3040
3067
  ...required2 !== void 0 ? { required: required2 } : {},
3068
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3041
3069
  ...negate !== void 0 ? { negate } : {},
3042
3070
  ...argsMatch2 !== void 0 ? { argsMatch: argsMatch2 } : {}
3043
3071
  };
@@ -3100,7 +3128,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3100
3128
  const aggregation = asString(rawEvaluator.aggregation);
3101
3129
  const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
3102
3130
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3103
- const required2 = parseRequired(rawEvaluator.required);
3131
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3132
+ rawEvaluator.required,
3133
+ rawEvaluator.min_score,
3134
+ name,
3135
+ evalId
3136
+ );
3104
3137
  evaluators.push({
3105
3138
  name,
3106
3139
  type: "field-accuracy",
@@ -3108,6 +3141,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3108
3141
  ...validAggregation ? { aggregation: validAggregation } : {},
3109
3142
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3110
3143
  ...required2 !== void 0 ? { required: required2 } : {},
3144
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3111
3145
  ...negate !== void 0 ? { negate } : {}
3112
3146
  });
3113
3147
  continue;
@@ -3121,13 +3155,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3121
3155
  continue;
3122
3156
  }
3123
3157
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3124
- const required2 = parseRequired(rawEvaluator.required);
3158
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3159
+ rawEvaluator.required,
3160
+ rawEvaluator.min_score,
3161
+ name,
3162
+ evalId
3163
+ );
3125
3164
  evaluators.push({
3126
3165
  name,
3127
3166
  type: "latency",
3128
3167
  threshold,
3129
3168
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3130
3169
  ...required2 !== void 0 ? { required: required2 } : {},
3170
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3131
3171
  ...negate !== void 0 ? { negate } : {}
3132
3172
  });
3133
3173
  continue;
@@ -3141,13 +3181,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3141
3181
  continue;
3142
3182
  }
3143
3183
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3144
- const required2 = parseRequired(rawEvaluator.required);
3184
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3185
+ rawEvaluator.required,
3186
+ rawEvaluator.min_score,
3187
+ name,
3188
+ evalId
3189
+ );
3145
3190
  evaluators.push({
3146
3191
  name,
3147
3192
  type: "cost",
3148
3193
  budget,
3149
3194
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3150
3195
  ...required2 !== void 0 ? { required: required2 } : {},
3196
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3151
3197
  ...negate !== void 0 ? { negate } : {}
3152
3198
  });
3153
3199
  continue;
@@ -3179,13 +3225,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3179
3225
  continue;
3180
3226
  }
3181
3227
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3182
- const required2 = parseRequired(rawEvaluator.required);
3228
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3229
+ rawEvaluator.required,
3230
+ rawEvaluator.min_score,
3231
+ name,
3232
+ evalId
3233
+ );
3183
3234
  evaluators.push({
3184
3235
  name,
3185
3236
  type: "token-usage",
3186
3237
  ...validLimits,
3187
3238
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3188
3239
  ...required2 !== void 0 ? { required: required2 } : {},
3240
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3189
3241
  ...negate !== void 0 ? { negate } : {}
3190
3242
  });
3191
3243
  continue;
@@ -3231,13 +3283,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3231
3283
  continue;
3232
3284
  }
3233
3285
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3234
- const required2 = parseRequired(rawEvaluator.required);
3286
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3287
+ rawEvaluator.required,
3288
+ rawEvaluator.min_score,
3289
+ name,
3290
+ evalId
3291
+ );
3235
3292
  evaluators.push({
3236
3293
  name,
3237
3294
  type: "execution-metrics",
3238
3295
  ...validThresholds,
3239
3296
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3240
3297
  ...required2 !== void 0 ? { required: required2 } : {},
3298
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3241
3299
  ...negate !== void 0 ? { negate } : {}
3242
3300
  });
3243
3301
  continue;
@@ -3251,7 +3309,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3251
3309
  const rawShouldTrigger = rawEvaluator.should_trigger;
3252
3310
  const shouldTrigger = typeof rawShouldTrigger === "boolean" ? rawShouldTrigger : void 0;
3253
3311
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3254
- const required2 = parseRequired(rawEvaluator.required);
3312
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3313
+ rawEvaluator.required,
3314
+ rawEvaluator.min_score,
3315
+ name,
3316
+ evalId
3317
+ );
3255
3318
  evaluators.push({
3256
3319
  name,
3257
3320
  type: "skill-trigger",
@@ -3259,6 +3322,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3259
3322
  ...shouldTrigger !== void 0 ? { should_trigger: shouldTrigger } : {},
3260
3323
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3261
3324
  ...required2 !== void 0 ? { required: required2 } : {},
3325
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3262
3326
  ...negate !== void 0 ? { negate } : {}
3263
3327
  });
3264
3328
  continue;
@@ -3270,13 +3334,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3270
3334
  continue;
3271
3335
  }
3272
3336
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3273
- const required2 = parseRequired(rawEvaluator.required);
3337
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3338
+ rawEvaluator.required,
3339
+ rawEvaluator.min_score,
3340
+ name,
3341
+ evalId
3342
+ );
3274
3343
  evaluators.push({
3275
3344
  name,
3276
3345
  type: "contains",
3277
3346
  value,
3278
3347
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3279
3348
  ...required2 !== void 0 ? { required: required2 } : {},
3349
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3280
3350
  ...negate !== void 0 ? { negate } : {}
3281
3351
  });
3282
3352
  continue;
@@ -3290,13 +3360,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3290
3360
  continue;
3291
3361
  }
3292
3362
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3293
- const required2 = parseRequired(rawEvaluator.required);
3363
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3364
+ rawEvaluator.required,
3365
+ rawEvaluator.min_score,
3366
+ name,
3367
+ evalId
3368
+ );
3294
3369
  evaluators.push({
3295
3370
  name,
3296
3371
  type: typeValue,
3297
3372
  value,
3298
3373
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3299
3374
  ...required2 !== void 0 ? { required: required2 } : {},
3375
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3300
3376
  ...negate !== void 0 ? { negate } : {}
3301
3377
  });
3302
3378
  continue;
@@ -3308,13 +3384,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3308
3384
  continue;
3309
3385
  }
3310
3386
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3311
- const required2 = parseRequired(rawEvaluator.required);
3387
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3388
+ rawEvaluator.required,
3389
+ rawEvaluator.min_score,
3390
+ name,
3391
+ evalId
3392
+ );
3312
3393
  evaluators.push({
3313
3394
  name,
3314
3395
  type: "icontains",
3315
3396
  value,
3316
3397
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3317
3398
  ...required2 !== void 0 ? { required: required2 } : {},
3399
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3318
3400
  ...negate !== void 0 ? { negate } : {}
3319
3401
  });
3320
3402
  continue;
@@ -3328,13 +3410,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3328
3410
  continue;
3329
3411
  }
3330
3412
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3331
- const required2 = parseRequired(rawEvaluator.required);
3413
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3414
+ rawEvaluator.required,
3415
+ rawEvaluator.min_score,
3416
+ name,
3417
+ evalId
3418
+ );
3332
3419
  evaluators.push({
3333
3420
  name,
3334
3421
  type: typeValue,
3335
3422
  value,
3336
3423
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3337
3424
  ...required2 !== void 0 ? { required: required2 } : {},
3425
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3338
3426
  ...negate !== void 0 ? { negate } : {}
3339
3427
  });
3340
3428
  continue;
@@ -3346,13 +3434,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3346
3434
  continue;
3347
3435
  }
3348
3436
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3349
- const required2 = parseRequired(rawEvaluator.required);
3437
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3438
+ rawEvaluator.required,
3439
+ rawEvaluator.min_score,
3440
+ name,
3441
+ evalId
3442
+ );
3350
3443
  evaluators.push({
3351
3444
  name,
3352
3445
  type: typeValue,
3353
3446
  value,
3354
3447
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3355
3448
  ...required2 !== void 0 ? { required: required2 } : {},
3449
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3356
3450
  ...negate !== void 0 ? { negate } : {}
3357
3451
  });
3358
3452
  continue;
@@ -3365,7 +3459,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3365
3459
  }
3366
3460
  const flags = asString(rawEvaluator.flags);
3367
3461
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3368
- const required2 = parseRequired(rawEvaluator.required);
3462
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3463
+ rawEvaluator.required,
3464
+ rawEvaluator.min_score,
3465
+ name,
3466
+ evalId
3467
+ );
3369
3468
  evaluators.push({
3370
3469
  name,
3371
3470
  type: "regex",
@@ -3373,18 +3472,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3373
3472
  ...flags !== void 0 ? { flags } : {},
3374
3473
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3375
3474
  ...required2 !== void 0 ? { required: required2 } : {},
3475
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3376
3476
  ...negate !== void 0 ? { negate } : {}
3377
3477
  });
3378
3478
  continue;
3379
3479
  }
3380
3480
  if (typeValue === "is-json") {
3381
3481
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3382
- const required2 = parseRequired(rawEvaluator.required);
3482
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3483
+ rawEvaluator.required,
3484
+ rawEvaluator.min_score,
3485
+ name,
3486
+ evalId
3487
+ );
3383
3488
  evaluators.push({
3384
3489
  name,
3385
3490
  type: "is-json",
3386
3491
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3387
3492
  ...required2 !== void 0 ? { required: required2 } : {},
3493
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3388
3494
  ...negate !== void 0 ? { negate } : {}
3389
3495
  });
3390
3496
  continue;
@@ -3396,13 +3502,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3396
3502
  continue;
3397
3503
  }
3398
3504
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3399
- const required2 = parseRequired(rawEvaluator.required);
3505
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3506
+ rawEvaluator.required,
3507
+ rawEvaluator.min_score,
3508
+ name,
3509
+ evalId
3510
+ );
3400
3511
  evaluators.push({
3401
3512
  name,
3402
3513
  type: "equals",
3403
3514
  value,
3404
3515
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3405
3516
  ...required2 !== void 0 ? { required: required2 } : {},
3517
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3406
3518
  ...negate !== void 0 ? { negate } : {}
3407
3519
  });
3408
3520
  continue;
@@ -3438,7 +3550,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3438
3550
  continue;
3439
3551
  }
3440
3552
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3441
- const required2 = parseRequired(rawEvaluator.required);
3553
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3554
+ rawEvaluator.required,
3555
+ rawEvaluator.min_score,
3556
+ name,
3557
+ evalId
3558
+ );
3442
3559
  evaluators.push({
3443
3560
  name,
3444
3561
  type: "llm-grader",
@@ -3446,6 +3563,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3446
3563
  ...graderTargetName ? { target: graderTargetName } : {},
3447
3564
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3448
3565
  ...required2 !== void 0 ? { required: required2 } : {},
3566
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3449
3567
  ...negate !== void 0 ? { negate } : {}
3450
3568
  });
3451
3569
  continue;
@@ -3515,7 +3633,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3515
3633
  continue;
3516
3634
  }
3517
3635
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3518
- const required2 = parseRequired(rawEvaluator.required);
3636
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3637
+ rawEvaluator.required,
3638
+ rawEvaluator.min_score,
3639
+ name,
3640
+ evalId
3641
+ );
3519
3642
  evaluators.push({
3520
3643
  name,
3521
3644
  type: "llm-grader",
@@ -3523,12 +3646,18 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3523
3646
  ...graderTargetName ? { target: graderTargetName } : {},
3524
3647
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3525
3648
  ...required2 !== void 0 ? { required: required2 } : {},
3649
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3526
3650
  ...negate !== void 0 ? { negate } : {}
3527
3651
  });
3528
3652
  continue;
3529
3653
  }
3530
3654
  const weight = validateWeight(rawEvaluator.weight, name, evalId);
3531
- const required = parseRequired(rawEvaluator.required);
3655
+ const { required, min_score } = parseRequiredAndMinScore(
3656
+ rawEvaluator.required,
3657
+ rawEvaluator.min_score,
3658
+ name,
3659
+ evalId
3660
+ );
3532
3661
  const knownProps = /* @__PURE__ */ new Set([
3533
3662
  "name",
3534
3663
  "type",
@@ -3539,6 +3668,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3539
3668
  "weight",
3540
3669
  "config",
3541
3670
  "required",
3671
+ "min_score",
3542
3672
  "negate",
3543
3673
  "max_steps",
3544
3674
  "maxSteps",
@@ -3568,6 +3698,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3568
3698
  ...graderTargetName ? { target: graderTargetName } : {},
3569
3699
  ...weight !== void 0 ? { weight } : {},
3570
3700
  ...required !== void 0 ? { required } : {},
3701
+ ...min_score !== void 0 ? { min_score } : {},
3571
3702
  ...negate !== void 0 ? { negate } : {},
3572
3703
  ...finalConfig ? { config: finalConfig } : {},
3573
3704
  ...llmMaxSteps !== void 0 ? { max_steps: llmMaxSteps } : {},
@@ -3699,10 +3830,23 @@ ${detailBlock}${ANSI_RESET5}`);
3699
3830
  console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET5}`);
3700
3831
  }
3701
3832
  }
3702
- function parseRequired(value) {
3703
- if (value === true) return true;
3704
- if (typeof value === "number" && value > 0 && value <= 1) return value;
3705
- return void 0;
3833
+ function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
3834
+ const result = {};
3835
+ if (typeof rawMinScore === "number" && rawMinScore > 0 && rawMinScore <= 1) {
3836
+ result.min_score = rawMinScore;
3837
+ }
3838
+ if (rawRequired === true) {
3839
+ result.required = true;
3840
+ } else if (typeof rawRequired === "number" && rawRequired > 0 && rawRequired <= 1) {
3841
+ if (result.min_score === void 0) {
3842
+ result.min_score = rawRequired;
3843
+ }
3844
+ result.required = rawRequired;
3845
+ logWarning2(
3846
+ `Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
3847
+ );
3848
+ }
3849
+ return result;
3706
3850
  }
3707
3851
  function validateWeight(rawWeight, evaluatorName, evalId) {
3708
3852
  if (rawWeight === void 0) {
@@ -3745,16 +3889,30 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
3745
3889
  const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
3746
3890
  const expectedOutcome = asString(rawRubric.outcome) ?? "";
3747
3891
  const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
3892
+ let minScore;
3748
3893
  let requiredMinScore;
3749
3894
  let required;
3750
- if (typeof rawRubric.required_min_score === "number") {
3751
- const minScore = rawRubric.required_min_score;
3752
- if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
3895
+ if (typeof rawRubric.min_score === "number") {
3896
+ const ms = rawRubric.min_score;
3897
+ if (ms <= 0 || ms > 1) {
3753
3898
  throw new Error(
3754
- `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
3899
+ `Invalid min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be in (0, 1] (got ${ms})`
3755
3900
  );
3756
3901
  }
3757
- requiredMinScore = minScore;
3902
+ minScore = ms;
3903
+ requiredMinScore = Math.round(ms * 10);
3904
+ } else if (typeof rawRubric.required_min_score === "number") {
3905
+ const rms = rawRubric.required_min_score;
3906
+ if (!Number.isInteger(rms) || rms < 0 || rms > 10) {
3907
+ throw new Error(
3908
+ `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`
3909
+ );
3910
+ }
3911
+ requiredMinScore = rms;
3912
+ minScore = rms / 10;
3913
+ logWarning2(
3914
+ `Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. Use 'min_score: ${rms / 10}' (0-1 scale) instead.`
3915
+ );
3758
3916
  }
3759
3917
  if (typeof rawRubric.required === "boolean") {
3760
3918
  required = rawRubric.required;
@@ -3774,6 +3932,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
3774
3932
  weight,
3775
3933
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
3776
3934
  ...required !== void 0 ? { required } : {},
3935
+ ...minScore !== void 0 ? { min_score: minScore } : {},
3777
3936
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
3778
3937
  score_ranges: scoreRanges
3779
3938
  });
@@ -3790,6 +3949,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
3790
3949
  weight,
3791
3950
  // Default to required: true if not specified (backward compatibility)
3792
3951
  required: required ?? true,
3952
+ ...minScore !== void 0 ? { min_score: minScore } : {},
3793
3953
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
3794
3954
  });
3795
3955
  }
@@ -3918,12 +4078,22 @@ function parseInlineRubrics(rawRubrics) {
3918
4078
  id: asString(rubric.id) ?? `rubric-${index + 1}`,
3919
4079
  weight: typeof rubric.weight === "number" ? rubric.weight : 1
3920
4080
  };
4081
+ let inlineMinScore;
4082
+ let inlineRequiredMinScore;
4083
+ if (typeof rubric.min_score === "number") {
4084
+ inlineMinScore = rubric.min_score;
4085
+ inlineRequiredMinScore = Math.round(inlineMinScore * 10);
4086
+ } else if (typeof rubric.required_min_score === "number") {
4087
+ inlineRequiredMinScore = rubric.required_min_score;
4088
+ inlineMinScore = inlineRequiredMinScore / 10;
4089
+ }
3921
4090
  if (scoreRanges && scoreRanges.length > 0) {
3922
4091
  return {
3923
4092
  ...baseRubric,
3924
4093
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
3925
4094
  ...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
3926
- ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
4095
+ ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
4096
+ ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {},
3927
4097
  score_ranges: scoreRanges
3928
4098
  };
3929
4099
  }
@@ -3931,7 +4101,8 @@ function parseInlineRubrics(rawRubrics) {
3931
4101
  ...baseRubric,
3932
4102
  outcome: expectedOutcome,
3933
4103
  required: typeof rubric.required === "boolean" ? rubric.required : true,
3934
- ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
4104
+ ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
4105
+ ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {}
3935
4106
  };
3936
4107
  }).filter((r) => r.outcome && r.outcome.length > 0 || "score_ranges" in r && r.score_ranges);
3937
4108
  if (rubricItems.length === 0) {
@@ -4335,6 +4506,9 @@ function resolveExpectedMessages(raw) {
4335
4506
  var ANSI_YELLOW6 = "\x1B[33m";
4336
4507
  var ANSI_RED2 = "\x1B[31m";
4337
4508
  var ANSI_RESET7 = "\x1B[0m";
4509
+ function matchesFilter(id, filter) {
4510
+ return typeof filter === "string" ? import_micromatch.default.isMatch(id, filter) : filter.some((pattern) => import_micromatch.default.isMatch(id, pattern));
4511
+ }
4338
4512
  function detectFormat(filePath) {
4339
4513
  const ext = import_node_path7.default.extname(filePath).toLowerCase();
4340
4514
  if (ext === ".jsonl") return "jsonl";
@@ -4402,40 +4576,40 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4402
4576
  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
4403
4577
  const rawFile = await (0, import_promises7.readFile)(absoluteTestPath, "utf8");
4404
4578
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
4405
- const fallbackEvalSet = import_node_path7.default.basename(absoluteTestPath, ".jsonl") || "eval";
4406
- const evalSetName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackEvalSet;
4579
+ const fallbackSuiteName = import_node_path7.default.basename(absoluteTestPath, ".jsonl") || "eval";
4580
+ const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
4407
4581
  const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
4408
4582
  const globalExecution = sidecar.execution;
4409
4583
  if (verbose) {
4410
4584
  console.log(`
4411
- [JSONL Dataset: ${evalFilePath}]`);
4585
+ [JSONL Suite: ${evalFilePath}]`);
4412
4586
  console.log(` Cases: ${rawCases.length}`);
4413
- console.log(` Eval set: ${evalSetName}`);
4587
+ console.log(` Suite: ${suiteName}`);
4414
4588
  if (sidecar.description) {
4415
4589
  console.log(` Description: ${sidecar.description}`);
4416
4590
  }
4417
4591
  }
4418
4592
  const results = [];
4419
4593
  for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
4420
- const evalcase = rawCases[lineIndex];
4594
+ const testCaseConfig = rawCases[lineIndex];
4421
4595
  const lineNumber = lineIndex + 1;
4422
- const id = asString4(evalcase.id);
4423
- if (filterPattern && (!id || !import_micromatch.default.isMatch(id, filterPattern))) {
4596
+ const id = asString4(testCaseConfig.id);
4597
+ if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
4424
4598
  continue;
4425
4599
  }
4426
- const conversationId = asString4(evalcase.conversation_id);
4427
- let outcome = asString4(evalcase.criteria);
4428
- if (!outcome && evalcase.expected_outcome !== void 0) {
4429
- outcome = asString4(evalcase.expected_outcome);
4600
+ const conversationId = asString4(testCaseConfig.conversation_id);
4601
+ let outcome = asString4(testCaseConfig.criteria);
4602
+ if (!outcome && testCaseConfig.expected_outcome !== void 0) {
4603
+ outcome = asString4(testCaseConfig.expected_outcome);
4430
4604
  if (outcome) {
4431
4605
  logWarning4(
4432
- `Test '${asString4(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
4606
+ `Test '${asString4(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
4433
4607
  );
4434
4608
  }
4435
4609
  }
4436
- const rawInputMessages = resolveInputMessages(evalcase);
4437
- const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
4438
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
4610
+ const rawInputMessages = resolveInputMessages(testCaseConfig);
4611
+ const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
4612
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== void 0;
4439
4613
  if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
4440
4614
  logError2(
4441
4615
  `Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
@@ -4472,18 +4646,23 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4472
4646
  }
4473
4647
  }
4474
4648
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
4475
- const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
4649
+ const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
4476
4650
  const mergedExecution = caseExecution ?? globalExecution;
4477
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
4651
+ const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
4478
4652
  let evaluators;
4479
4653
  try {
4480
- evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
4654
+ evaluators = await parseEvaluators(
4655
+ testCaseConfig,
4656
+ mergedExecution,
4657
+ searchRoots,
4658
+ id ?? "unknown"
4659
+ );
4481
4660
  } catch (error) {
4482
4661
  const message = error instanceof Error ? error.message : String(error);
4483
4662
  logError2(`Skipping test '${id}' at line ${lineNumber}: ${message}`);
4484
4663
  continue;
4485
4664
  }
4486
- const inlineRubrics = evalcase.rubrics;
4665
+ const inlineRubrics = testCaseConfig.rubrics;
4487
4666
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
4488
4667
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
4489
4668
  if (rubricEvaluator) {
@@ -4494,7 +4673,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4494
4673
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
4495
4674
  const testCase = {
4496
4675
  id,
4497
- dataset: evalSetName,
4676
+ suite: suiteName,
4498
4677
  conversation_id: conversationId,
4499
4678
  question,
4500
4679
  input: inputMessages,
@@ -4502,7 +4681,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4502
4681
  reference_answer: referenceAnswer,
4503
4682
  file_paths: userFilePaths,
4504
4683
  criteria: outcome ?? "",
4505
- evaluator: evalCaseEvaluatorKind,
4684
+ evaluator: testCaseEvaluatorKind,
4506
4685
  assertions: evaluators
4507
4686
  };
4508
4687
  results.push(testCase);
@@ -4687,6 +4866,9 @@ function buildChatPromptFromSegments(options) {
4687
4866
  var ANSI_YELLOW7 = "\x1B[33m";
4688
4867
  var ANSI_RED3 = "\x1B[31m";
4689
4868
  var ANSI_RESET8 = "\x1B[0m";
4869
+ function matchesFilter2(id, filter) {
4870
+ return typeof filter === "string" ? import_micromatch2.default.isMatch(id, filter) : filter.some((pattern) => import_micromatch2.default.isMatch(id, pattern));
4871
+ }
4690
4872
  function resolveTests(suite) {
4691
4873
  if (suite.tests !== void 0) return suite.tests;
4692
4874
  if (suite.eval_cases !== void 0) {
@@ -4766,18 +4948,18 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4766
4948
  throw new Error(`Invalid test file format: ${evalFilePath}`);
4767
4949
  }
4768
4950
  const suite = interpolated;
4769
- const evalSetNameFromSuite = asString5(suite.name)?.trim();
4770
- const fallbackEvalSet = import_node_path8.default.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
4771
- const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
4772
- const rawTestcases = resolveTests(suite);
4951
+ const suiteNameFromFile = asString5(suite.name)?.trim();
4952
+ const fallbackSuiteName = import_node_path8.default.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
4953
+ const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
4954
+ const rawTestCases = resolveTests(suite);
4773
4955
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
4774
4956
  const evalFileDir = import_node_path8.default.dirname(absoluteTestPath);
4775
- let expandedTestcases;
4776
- if (typeof rawTestcases === "string") {
4777
- const externalPath = import_node_path8.default.resolve(evalFileDir, rawTestcases);
4778
- expandedTestcases = await loadCasesFromFile(externalPath);
4779
- } else if (Array.isArray(rawTestcases)) {
4780
- expandedTestcases = await expandFileReferences(rawTestcases, evalFileDir);
4957
+ let expandedTestCases;
4958
+ if (typeof rawTestCases === "string") {
4959
+ const externalPath = import_node_path8.default.resolve(evalFileDir, rawTestCases);
4960
+ expandedTestCases = await loadCasesFromFile(externalPath);
4961
+ } else if (Array.isArray(rawTestCases)) {
4962
+ expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
4781
4963
  } else {
4782
4964
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
4783
4965
  }
@@ -4792,32 +4974,33 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4792
4974
  }
4793
4975
  const globalExecution = suiteAssertions !== void 0 ? { ...rawGlobalExecution ?? {}, assertions: suiteAssertions } : rawGlobalExecution;
4794
4976
  const results = [];
4795
- for (const rawEvalcase of expandedTestcases) {
4796
- if (!isJsonObject(rawEvalcase)) {
4977
+ for (const rawTestCase of expandedTestCases) {
4978
+ if (!isJsonObject(rawTestCase)) {
4797
4979
  logWarning5("Skipping invalid test entry (expected object)");
4798
4980
  continue;
4799
4981
  }
4800
- const evalcase = rawEvalcase;
4801
- const id = asString5(evalcase.id);
4802
- if (filterPattern && (!id || !import_micromatch2.default.isMatch(id, filterPattern))) {
4982
+ const testCaseConfig = rawTestCase;
4983
+ const id = asString5(testCaseConfig.id);
4984
+ if (filterPattern && (!id || !matchesFilter2(id, filterPattern))) {
4803
4985
  continue;
4804
4986
  }
4805
- const conversationId = asString5(evalcase.conversation_id);
4806
- let outcome = asString5(evalcase.criteria);
4807
- if (!outcome && evalcase.expected_outcome !== void 0) {
4808
- outcome = asString5(evalcase.expected_outcome);
4987
+ const conversationId = asString5(testCaseConfig.conversation_id);
4988
+ let outcome = asString5(testCaseConfig.criteria);
4989
+ if (!outcome && testCaseConfig.expected_outcome !== void 0) {
4990
+ outcome = asString5(testCaseConfig.expected_outcome);
4809
4991
  if (outcome) {
4810
4992
  logWarning5(
4811
- `Test '${asString5(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
4993
+ `Test '${asString5(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
4812
4994
  );
4813
4995
  }
4814
4996
  }
4815
- const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
4997
+ const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
4816
4998
  const skipDefaults = caseExecution?.skip_defaults === true;
4999
+ const caseThreshold = typeof caseExecution?.threshold === "number" && caseExecution.threshold >= 0 && caseExecution.threshold <= 1 ? caseExecution.threshold : void 0;
4817
5000
  const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
4818
- const testInputMessages = resolveInputMessages(evalcase, effectiveSuiteInputFiles);
4819
- const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
4820
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assertions !== void 0 || evalcase.assert !== void 0;
5001
+ const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
5002
+ const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
5003
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
4821
5004
  if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
4822
5005
  logError3(
4823
5006
  `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
@@ -4864,16 +5047,21 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4864
5047
  }
4865
5048
  }
4866
5049
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
4867
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
5050
+ const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
4868
5051
  let evaluators;
4869
5052
  try {
4870
- evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
5053
+ evaluators = await parseEvaluators(
5054
+ testCaseConfig,
5055
+ globalExecution,
5056
+ searchRoots,
5057
+ id ?? "unknown"
5058
+ );
4871
5059
  } catch (error) {
4872
5060
  const message = error instanceof Error ? error.message : String(error);
4873
5061
  logError3(`Skipping test '${id}': ${message}`);
4874
5062
  continue;
4875
5063
  }
4876
- const inlineRubrics = evalcase.rubrics;
5064
+ const inlineRubrics = testCaseConfig.rubrics;
4877
5065
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
4878
5066
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
4879
5067
  if (rubricEvaluator) {
@@ -4882,13 +5070,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4882
5070
  }
4883
5071
  warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
4884
5072
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
4885
- const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
5073
+ const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir);
4886
5074
  const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
4887
- const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
4888
- const caseTargets = extractTargetsFromTestCase(evalcase);
5075
+ const metadata = isJsonObject(testCaseConfig.metadata) ? testCaseConfig.metadata : void 0;
5076
+ const caseTargets = extractTargetsFromTestCase(testCaseConfig);
4889
5077
  const testCase = {
4890
5078
  id,
4891
- dataset: evalSetName,
5079
+ suite: suiteName,
4892
5080
  category: options?.category,
4893
5081
  conversation_id: conversationId,
4894
5082
  question,
@@ -4897,11 +5085,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4897
5085
  reference_answer: referenceAnswer,
4898
5086
  file_paths: userFilePaths,
4899
5087
  criteria: outcome ?? "",
4900
- evaluator: evalCaseEvaluatorKind,
5088
+ evaluator: testCaseEvaluatorKind,
4901
5089
  assertions: evaluators,
4902
5090
  workspace: mergedWorkspace,
4903
5091
  metadata,
4904
- targets: caseTargets
5092
+ targets: caseTargets,
5093
+ ...caseThreshold !== void 0 ? { threshold: caseThreshold } : {}
4905
5094
  };
4906
5095
  results.push(testCase);
4907
5096
  }
@@ -5567,7 +5756,7 @@ var AzureProvider = class {
5567
5756
  };
5568
5757
  this.retryConfig = config.retry;
5569
5758
  const azure = (0, import_azure2.createAzure)(buildAzureOptions(config));
5570
- this.model = azure.chat(config.deploymentName);
5759
+ this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
5571
5760
  }
5572
5761
  id;
5573
5762
  kind = "azure";
@@ -5693,7 +5882,9 @@ function buildAzureOptions(config) {
5693
5882
  const options = {
5694
5883
  apiKey: config.apiKey,
5695
5884
  apiVersion: config.version,
5696
- useDeploymentBasedUrls: true
5885
+ // Chat completions still use deployment-scoped Azure URLs for compatibility
5886
+ // with existing deployments. Responses API should use the SDK's v1 path.
5887
+ useDeploymentBasedUrls: config.apiFormat !== "responses"
5697
5888
  };
5698
5889
  const baseURL = normalizeAzureBaseUrl(config.resourceName);
5699
5890
  if (baseURL) {
@@ -9322,6 +9513,22 @@ function extractAzureResourceName(baseUrl) {
9322
9513
  if (urlMatch) return urlMatch[1];
9323
9514
  return baseUrl;
9324
9515
  }
9516
+ function normalizeAzureSdkBaseUrl(baseUrl) {
9517
+ const trimmed = baseUrl.trim().replace(/\/+$/, "");
9518
+ if (!trimmed) {
9519
+ return trimmed;
9520
+ }
9521
+ if (!/^https?:\/\//i.test(trimmed)) {
9522
+ return `https://${trimmed}.openai.azure.com/openai/v1`;
9523
+ }
9524
+ if (/\/openai\/v1$/i.test(trimmed)) {
9525
+ return trimmed;
9526
+ }
9527
+ if (/\/openai$/i.test(trimmed)) {
9528
+ return `${trimmed}/v1`;
9529
+ }
9530
+ return `${trimmed}/openai/v1`;
9531
+ }
9325
9532
 
9326
9533
  // src/evaluation/providers/pi-utils.ts
9327
9534
  init_cjs_shims();
@@ -10156,9 +10363,40 @@ var import_node_child_process5 = require("child_process");
10156
10363
  var import_node_crypto8 = require("crypto");
10157
10364
  var import_node_fs10 = require("fs");
10158
10365
  var import_promises19 = require("fs/promises");
10159
- var import_node_path22 = __toESM(require("path"), 1);
10366
+ var import_node_path23 = __toESM(require("path"), 1);
10160
10367
  var import_node_readline = require("readline");
10161
10368
  var import_node_url3 = require("url");
10369
+
10370
+ // src/paths.ts
10371
+ init_cjs_shims();
10372
+ var import_node_os6 = __toESM(require("os"), 1);
10373
+ var import_node_path22 = __toESM(require("path"), 1);
10374
+ var logged = false;
10375
+ function getAgentvHome() {
10376
+ const envHome = process.env.AGENTV_HOME;
10377
+ if (envHome && envHome !== "undefined") {
10378
+ if (!logged) {
10379
+ logged = true;
10380
+ console.warn(`Using AGENTV_HOME: ${envHome}`);
10381
+ }
10382
+ return envHome;
10383
+ }
10384
+ return import_node_path22.default.join(import_node_os6.default.homedir(), ".agentv");
10385
+ }
10386
+ function getWorkspacesRoot() {
10387
+ return import_node_path22.default.join(getAgentvHome(), "workspaces");
10388
+ }
10389
+ function getSubagentsRoot() {
10390
+ return import_node_path22.default.join(getAgentvHome(), "subagents");
10391
+ }
10392
+ function getTraceStateRoot() {
10393
+ return import_node_path22.default.join(getAgentvHome(), "trace-state");
10394
+ }
10395
+ function getWorkspacePoolRoot() {
10396
+ return import_node_path22.default.join(getAgentvHome(), "workspace-pool");
10397
+ }
10398
+
10399
+ // src/evaluation/providers/pi-coding-agent.ts
10162
10400
  var piCodingAgentModule = null;
10163
10401
  var piAiModule = null;
10164
10402
  var loadingPromise = null;
@@ -10176,46 +10414,126 @@ async function promptInstall() {
10176
10414
  rl.close();
10177
10415
  }
10178
10416
  }
10179
- function findAgentvRoot() {
10180
- const thisFile = (0, import_node_url3.fileURLToPath)(importMetaUrl);
10181
- let dir = import_node_path22.default.dirname(thisFile);
10182
- for (let i = 0; i < 10; i++) {
10417
+ function findManagedSdkInstallRoot() {
10418
+ return import_node_path23.default.join(getAgentvHome(), "deps", "pi-sdk");
10419
+ }
10420
+ function resolveGlobalNpmRoot() {
10421
+ try {
10422
+ const root = (0, import_node_child_process5.execSync)("npm root -g", {
10423
+ encoding: "utf-8",
10424
+ stdio: ["ignore", "pipe", "ignore"]
10425
+ }).trim();
10426
+ return root.length > 0 ? root : void 0;
10427
+ } catch {
10428
+ return void 0;
10429
+ }
10430
+ }
10431
+ function buildGlobalModuleEntry(moduleName, globalNpmRoot) {
10432
+ return import_node_path23.default.join(globalNpmRoot, ...moduleName.split("/"), "dist", "index.js");
10433
+ }
10434
+ function findAccessiblePath(paths) {
10435
+ for (const candidate of paths) {
10183
10436
  try {
10184
- const pkg = import_node_path22.default.join(dir, "package.json");
10185
- (0, import_node_fs10.accessSync)(pkg);
10186
- return dir;
10437
+ (0, import_node_fs10.accessSync)(candidate);
10438
+ return candidate;
10187
10439
  } catch {
10188
- const parent = import_node_path22.default.dirname(dir);
10189
- if (parent === dir) break;
10190
- dir = parent;
10191
10440
  }
10192
10441
  }
10193
- return import_node_path22.default.dirname(thisFile);
10442
+ return void 0;
10194
10443
  }
10195
- async function doLoadSdkModules() {
10444
+ async function tryImportLocalSdkModules() {
10196
10445
  try {
10197
10446
  [piCodingAgentModule, piAiModule] = await Promise.all([
10198
10447
  import("@mariozechner/pi-coding-agent"),
10199
10448
  import("@mariozechner/pi-ai")
10200
10449
  ]);
10450
+ return true;
10201
10451
  } catch {
10202
- if (await promptInstall()) {
10203
- const installDir = findAgentvRoot();
10204
- console.error(`Installing @mariozechner/pi-coding-agent into ${installDir}...`);
10205
- (0, import_node_child_process5.execSync)("bun add @mariozechner/pi-coding-agent", {
10206
- cwd: installDir,
10207
- stdio: "inherit"
10208
- });
10209
- [piCodingAgentModule, piAiModule] = await Promise.all([
10210
- import("@mariozechner/pi-coding-agent"),
10211
- import("@mariozechner/pi-ai")
10212
- ]);
10213
- } else {
10214
- throw new Error(
10215
- "pi-coding-agent SDK is not installed. Install it with:\n bun add @mariozechner/pi-coding-agent"
10216
- );
10452
+ return false;
10453
+ }
10454
+ }
10455
+ async function tryImportManagedSdkModules() {
10456
+ const managedRoot = findManagedSdkInstallRoot();
10457
+ const piCodingAgentEntry = findAccessiblePath([
10458
+ import_node_path23.default.join(managedRoot, "node_modules", "@mariozechner", "pi-coding-agent", "dist", "index.js")
10459
+ ]);
10460
+ const piAiEntry = findAccessiblePath([
10461
+ import_node_path23.default.join(managedRoot, "node_modules", "@mariozechner", "pi-ai", "dist", "index.js"),
10462
+ import_node_path23.default.join(
10463
+ managedRoot,
10464
+ "node_modules",
10465
+ "@mariozechner",
10466
+ "pi-coding-agent",
10467
+ "node_modules",
10468
+ "@mariozechner",
10469
+ "pi-ai",
10470
+ "dist",
10471
+ "index.js"
10472
+ )
10473
+ ]);
10474
+ if (!piCodingAgentEntry || !piAiEntry) return false;
10475
+ try {
10476
+ [piCodingAgentModule, piAiModule] = await Promise.all([
10477
+ import((0, import_node_url3.pathToFileURL)(piCodingAgentEntry).href),
10478
+ import((0, import_node_url3.pathToFileURL)(piAiEntry).href)
10479
+ ]);
10480
+ return true;
10481
+ } catch {
10482
+ return false;
10483
+ }
10484
+ }
10485
+ async function tryImportGlobalSdkModules() {
10486
+ const globalNpmRoot = resolveGlobalNpmRoot();
10487
+ if (!globalNpmRoot) return false;
10488
+ const piCodingAgentEntry = findAccessiblePath([
10489
+ buildGlobalModuleEntry("@mariozechner/pi-coding-agent", globalNpmRoot)
10490
+ ]);
10491
+ const piAiEntry = findAccessiblePath([
10492
+ buildGlobalModuleEntry("@mariozechner/pi-ai", globalNpmRoot),
10493
+ import_node_path23.default.join(
10494
+ globalNpmRoot,
10495
+ "@mariozechner",
10496
+ "pi-coding-agent",
10497
+ "node_modules",
10498
+ "@mariozechner",
10499
+ "pi-ai",
10500
+ "dist",
10501
+ "index.js"
10502
+ )
10503
+ ]);
10504
+ if (!piCodingAgentEntry || !piAiEntry) return false;
10505
+ try {
10506
+ [piCodingAgentModule, piAiModule] = await Promise.all([
10507
+ import((0, import_node_url3.pathToFileURL)(piCodingAgentEntry).href),
10508
+ import((0, import_node_url3.pathToFileURL)(piAiEntry).href)
10509
+ ]);
10510
+ return true;
10511
+ } catch {
10512
+ return false;
10513
+ }
10514
+ }
10515
+ function installSdkModules(installDir) {
10516
+ console.error(`Installing @mariozechner/pi-coding-agent into ${installDir} via npm...`);
10517
+ (0, import_node_fs10.mkdirSync)(installDir, { recursive: true });
10518
+ (0, import_node_child_process5.execSync)("npm install --no-save --no-package-lock @mariozechner/pi-coding-agent", {
10519
+ cwd: installDir,
10520
+ stdio: "inherit"
10521
+ });
10522
+ }
10523
+ async function doLoadSdkModules() {
10524
+ if (await tryImportLocalSdkModules() || await tryImportManagedSdkModules() || await tryImportGlobalSdkModules()) {
10525
+ return;
10526
+ }
10527
+ if (await promptInstall()) {
10528
+ const installDir = findManagedSdkInstallRoot();
10529
+ installSdkModules(installDir);
10530
+ if (await tryImportManagedSdkModules()) {
10531
+ return;
10217
10532
  }
10218
10533
  }
10534
+ throw new Error(
10535
+ "pi-coding-agent SDK is not installed. Install it with:\n npm install @mariozechner/pi-coding-agent"
10536
+ );
10219
10537
  }
10220
10538
  async function loadSdkModules() {
10221
10539
  if (!piCodingAgentModule || !piAiModule) {
@@ -10272,12 +10590,16 @@ var PiCodingAgentProvider = class {
10272
10590
  try {
10273
10591
  const cwd = this.resolveCwd(request.cwd);
10274
10592
  const rawProvider = this.config.subprovider ?? "google";
10275
- const hasBaseUrl = !!this.config.baseUrl;
10593
+ const normalizedBaseUrl = this.normalizeSdkBaseUrl(rawProvider, this.config.baseUrl);
10594
+ const hasBaseUrl = !!normalizedBaseUrl;
10276
10595
  const providerName = resolveSubprovider(rawProvider, hasBaseUrl);
10277
10596
  const modelId = this.config.model ?? "gemini-2.5-flash";
10278
10597
  this.setApiKeyEnv(rawProvider, hasBaseUrl);
10279
- this.setBaseUrlEnv(rawProvider, hasBaseUrl);
10598
+ this.setBaseUrlEnv(rawProvider, normalizedBaseUrl, hasBaseUrl);
10280
10599
  let model = sdk.getModel(providerName, modelId);
10600
+ if (model && normalizedBaseUrl) {
10601
+ model = { ...model, baseUrl: normalizedBaseUrl };
10602
+ }
10281
10603
  if (!model) {
10282
10604
  const envProvider = providerName.replace(/-responses$/, "");
10283
10605
  model = {
@@ -10285,7 +10607,7 @@ var PiCodingAgentProvider = class {
10285
10607
  name: modelId,
10286
10608
  api: providerName,
10287
10609
  provider: envProvider,
10288
- baseUrl: this.config.baseUrl ?? "",
10610
+ baseUrl: normalizedBaseUrl ?? "",
10289
10611
  reasoning: false,
10290
10612
  input: ["text"],
10291
10613
  cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
@@ -10452,19 +10774,27 @@ ${fileList}`;
10452
10774
  }
10453
10775
  }
10454
10776
  /** Maps config baseUrl to the provider-specific env var the SDK reads. */
10455
- setBaseUrlEnv(providerName, hasBaseUrl = false) {
10456
- if (!this.config.baseUrl) return;
10777
+ setBaseUrlEnv(providerName, baseUrl = this.config.baseUrl, hasBaseUrl = false) {
10778
+ const normalizedBaseUrl = this.normalizeSdkBaseUrl(providerName, baseUrl);
10779
+ if (!normalizedBaseUrl) return;
10457
10780
  const envKey = resolveEnvBaseUrlName(providerName, hasBaseUrl);
10458
10781
  if (envKey) {
10459
- process.env[envKey] = this.config.baseUrl;
10782
+ process.env[envKey] = normalizedBaseUrl;
10460
10783
  }
10461
10784
  }
10785
+ normalizeSdkBaseUrl(providerName, baseUrl) {
10786
+ if (!baseUrl) return void 0;
10787
+ if (providerName.toLowerCase() === "azure") {
10788
+ return normalizeAzureSdkBaseUrl(baseUrl);
10789
+ }
10790
+ return baseUrl;
10791
+ }
10462
10792
  resolveCwd(cwdOverride) {
10463
10793
  if (cwdOverride) {
10464
- return import_node_path22.default.resolve(cwdOverride);
10794
+ return import_node_path23.default.resolve(cwdOverride);
10465
10795
  }
10466
10796
  if (this.config.cwd) {
10467
- return import_node_path22.default.resolve(this.config.cwd);
10797
+ return import_node_path23.default.resolve(this.config.cwd);
10468
10798
  }
10469
10799
  return process.cwd();
10470
10800
  }
@@ -10483,9 +10813,9 @@ ${fileList}`;
10483
10813
  }
10484
10814
  resolveLogDirectory() {
10485
10815
  if (this.config.logDir) {
10486
- return import_node_path22.default.resolve(this.config.logDir);
10816
+ return import_node_path23.default.resolve(this.config.logDir);
10487
10817
  }
10488
- return import_node_path22.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
10818
+ return import_node_path23.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
10489
10819
  }
10490
10820
  async createStreamLogger(request) {
10491
10821
  const logDir = this.resolveLogDirectory();
@@ -10499,7 +10829,7 @@ ${fileList}`;
10499
10829
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
10500
10830
  return void 0;
10501
10831
  }
10502
- const filePath = import_node_path22.default.join(logDir, buildLogFilename6(request, this.targetName));
10832
+ const filePath = import_node_path23.default.join(logDir, buildLogFilename6(request, this.targetName));
10503
10833
  try {
10504
10834
  const logger = await PiStreamLogger2.create({
10505
10835
  filePath,
@@ -10714,19 +11044,17 @@ var ProviderRegistry = class {
10714
11044
 
10715
11045
  // src/evaluation/providers/targets.ts
10716
11046
  init_cjs_shims();
10717
- var import_node_path23 = __toESM(require("path"), 1);
11047
+ var import_node_path24 = __toESM(require("path"), 1);
10718
11048
  var import_zod3 = require("zod");
10719
11049
  var CliHealthcheckHttpInputSchema = import_zod3.z.object({
10720
11050
  url: import_zod3.z.string().min(1, "healthcheck URL is required"),
10721
- timeout_seconds: import_zod3.z.number().positive().optional(),
10722
- timeoutSeconds: import_zod3.z.number().positive().optional()
10723
- });
11051
+ timeout_seconds: import_zod3.z.number().positive().optional()
11052
+ }).passthrough();
10724
11053
  var CliHealthcheckCommandInputSchema = import_zod3.z.object({
10725
11054
  command: import_zod3.z.string().min(1, "healthcheck command is required"),
10726
11055
  cwd: import_zod3.z.string().optional(),
10727
- timeout_seconds: import_zod3.z.number().positive().optional(),
10728
- timeoutSeconds: import_zod3.z.number().positive().optional()
10729
- });
11056
+ timeout_seconds: import_zod3.z.number().positive().optional()
11057
+ }).passthrough();
10730
11058
  var CliHealthcheckInputSchema = import_zod3.z.union([
10731
11059
  CliHealthcheckHttpInputSchema,
10732
11060
  CliHealthcheckCommandInputSchema
@@ -10738,36 +11066,28 @@ var CliTargetInputSchema = import_zod3.z.object({
10738
11066
  command: import_zod3.z.string(),
10739
11067
  // Files format - optional
10740
11068
  files_format: import_zod3.z.string().optional(),
10741
- filesFormat: import_zod3.z.string().optional(),
10742
11069
  attachments_format: import_zod3.z.string().optional(),
10743
- attachmentsFormat: import_zod3.z.string().optional(),
10744
11070
  // Working directory - optional
10745
11071
  cwd: import_zod3.z.string().optional(),
10746
11072
  // Workspace template directory - optional (mutually exclusive with cwd)
10747
11073
  workspace_template: import_zod3.z.string().optional(),
10748
- workspaceTemplate: import_zod3.z.string().optional(),
10749
11074
  // Timeout in seconds - optional
10750
11075
  timeout_seconds: import_zod3.z.number().positive().optional(),
10751
- timeoutSeconds: import_zod3.z.number().positive().optional(),
10752
11076
  // Healthcheck configuration - optional
10753
11077
  healthcheck: CliHealthcheckInputSchema.optional(),
10754
11078
  // Verbose mode - optional
10755
11079
  verbose: import_zod3.z.boolean().optional(),
10756
11080
  cli_verbose: import_zod3.z.boolean().optional(),
10757
- cliVerbose: import_zod3.z.boolean().optional(),
10758
11081
  // Keep temp files - optional
10759
11082
  keep_temp_files: import_zod3.z.boolean().optional(),
10760
- keepTempFiles: import_zod3.z.boolean().optional(),
10761
11083
  keep_output_files: import_zod3.z.boolean().optional(),
10762
- keepOutputFiles: import_zod3.z.boolean().optional(),
10763
11084
  // Common target fields
10764
11085
  grader_target: import_zod3.z.string().optional(),
10765
11086
  judge_target: import_zod3.z.string().optional(),
10766
11087
  // backward compat
10767
11088
  workers: import_zod3.z.number().int().min(1).optional(),
10768
- provider_batching: import_zod3.z.boolean().optional(),
10769
- providerBatching: import_zod3.z.boolean().optional()
10770
- });
11089
+ provider_batching: import_zod3.z.boolean().optional()
11090
+ }).passthrough();
10771
11091
  var CliHealthcheckHttpSchema = import_zod3.z.object({
10772
11092
  url: import_zod3.z.string().min(1),
10773
11093
  timeoutMs: import_zod3.z.number().positive().optional()
@@ -10792,7 +11112,7 @@ var CliTargetConfigSchema = import_zod3.z.object({
10792
11112
  keepTempFiles: import_zod3.z.boolean().optional()
10793
11113
  }).strict();
10794
11114
  function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
10795
- const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
11115
+ const timeoutSeconds = input.timeout_seconds;
10796
11116
  const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
10797
11117
  if ("url" in input && input.url) {
10798
11118
  const url = resolveString(input.url, env, `${targetName} healthcheck URL`);
@@ -10811,11 +11131,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
10811
11131
  allowLiteral: true,
10812
11132
  optionalEnv: true
10813
11133
  });
10814
- if (cwd && evalFilePath && !import_node_path23.default.isAbsolute(cwd)) {
10815
- cwd = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), cwd);
11134
+ if (cwd && evalFilePath && !import_node_path24.default.isAbsolute(cwd)) {
11135
+ cwd = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), cwd);
10816
11136
  }
10817
11137
  if (!cwd && evalFilePath) {
10818
- cwd = import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath));
11138
+ cwd = import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath));
10819
11139
  }
10820
11140
  return {
10821
11141
  command,
@@ -10826,9 +11146,9 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
10826
11146
  function normalizeCliTargetInput(input, env, evalFilePath) {
10827
11147
  const targetName = input.name;
10828
11148
  const command = resolveString(input.command, env, `${targetName} CLI command`, true);
10829
- const filesFormatSource = input.files_format ?? input.filesFormat ?? input.attachments_format ?? input.attachmentsFormat;
11149
+ const filesFormatSource = input.files_format ?? input.attachments_format;
10830
11150
  const filesFormat = resolveOptionalLiteralString(filesFormatSource);
10831
- const workspaceTemplateSource = input.workspace_template ?? input.workspaceTemplate;
11151
+ const workspaceTemplateSource = input.workspace_template;
10832
11152
  let workspaceTemplate = resolveOptionalString(
10833
11153
  workspaceTemplateSource,
10834
11154
  env,
@@ -10838,15 +11158,15 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
10838
11158
  optionalEnv: true
10839
11159
  }
10840
11160
  );
10841
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
10842
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
11161
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
11162
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
10843
11163
  }
10844
11164
  let cwd = resolveOptionalString(input.cwd, env, `${targetName} working directory`, {
10845
11165
  allowLiteral: true,
10846
11166
  optionalEnv: true
10847
11167
  });
10848
- if (cwd && evalFilePath && !import_node_path23.default.isAbsolute(cwd)) {
10849
- cwd = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), cwd);
11168
+ if (cwd && evalFilePath && !import_node_path24.default.isAbsolute(cwd)) {
11169
+ cwd = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), cwd);
10850
11170
  }
10851
11171
  if (cwd && workspaceTemplate) {
10852
11172
  throw new Error(
@@ -10854,14 +11174,12 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
10854
11174
  );
10855
11175
  }
10856
11176
  if (!cwd && !workspaceTemplate && evalFilePath) {
10857
- cwd = import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath));
11177
+ cwd = import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath));
10858
11178
  }
10859
- const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
11179
+ const timeoutSeconds = input.timeout_seconds;
10860
11180
  const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
10861
- const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose ?? input.cliVerbose);
10862
- const keepTempFiles = resolveOptionalBoolean(
10863
- input.keep_temp_files ?? input.keepTempFiles ?? input.keep_output_files ?? input.keepOutputFiles
10864
- );
11181
+ const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose);
11182
+ const keepTempFiles = resolveOptionalBoolean(input.keep_temp_files ?? input.keep_output_files);
10865
11183
  const healthcheck = input.healthcheck ? normalizeCliHealthcheck(input.healthcheck, env, targetName, evalFilePath) : void 0;
10866
11184
  return {
10867
11185
  command,
@@ -10882,14 +11200,104 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
10882
11200
  "FILES",
10883
11201
  "OUTPUT_FILE"
10884
11202
  ]);
11203
+ var DEPRECATED_TARGET_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
11204
+ ["providerBatching", "provider_batching"],
11205
+ ["subagentModeAllowed", "subagent_mode_allowed"],
11206
+ ["fallbackTargets", "fallback_targets"],
11207
+ ["resourceName", "endpoint"],
11208
+ ["baseUrl", "base_url"],
11209
+ ["apiKey", "api_key"],
11210
+ ["deploymentName", "model"],
11211
+ ["thinkingBudget", "thinking_budget"],
11212
+ ["maxTokens", "max_output_tokens"],
11213
+ ["apiFormat", "api_format"],
11214
+ ["timeoutSeconds", "timeout_seconds"],
11215
+ ["logDir", "log_dir"],
11216
+ ["logDirectory", "log_directory"],
11217
+ ["logFormat", "log_format"],
11218
+ ["logOutputFormat", "log_output_format"],
11219
+ ["systemPrompt", "system_prompt"],
11220
+ ["maxTurns", "max_turns"],
11221
+ ["maxBudgetUsd", "max_budget_usd"],
11222
+ ["dryRun", "dry_run"],
11223
+ ["subagentRoot", "subagent_root"],
11224
+ ["filesFormat", "files_format"],
11225
+ ["attachmentsFormat", "attachments_format"],
11226
+ ["cliUrl", "cli_url"],
11227
+ ["cliPath", "cli_path"],
11228
+ ["githubToken", "github_token"],
11229
+ ["sessionDir", "session_dir"],
11230
+ ["sessionId", "session_id"],
11231
+ ["sessionStateDir", "session_state_dir"],
11232
+ ["maxRetries", "max_retries"],
11233
+ ["retryInitialDelayMs", "retry_initial_delay_ms"],
11234
+ ["retryMaxDelayMs", "retry_max_delay_ms"],
11235
+ ["retryBackoffFactor", "retry_backoff_factor"],
11236
+ ["retryStatusCodes", "retry_status_codes"]
11237
+ ]);
11238
+ var DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
11239
+ ["timeoutSeconds", "timeout_seconds"]
11240
+ ]);
11241
+ function collectDeprecatedCamelCaseWarnings(value, location, aliases) {
11242
+ if (typeof value !== "object" || value === null || Array.isArray(value)) {
11243
+ return [];
11244
+ }
11245
+ const warnings = [];
11246
+ for (const [camelCaseField, snakeCaseField] of aliases) {
11247
+ if (Object.prototype.hasOwnProperty.call(value, camelCaseField)) {
11248
+ warnings.push({
11249
+ location: `${location}.${camelCaseField}`,
11250
+ message: `camelCase field '${camelCaseField}' is no longer supported in targets.yaml. Use '${snakeCaseField}' instead.`
11251
+ });
11252
+ }
11253
+ }
11254
+ return warnings;
11255
+ }
11256
+ function assertNoDeprecatedCamelCaseTargetFields(definition) {
11257
+ if (Object.prototype.hasOwnProperty.call(definition, "workspaceTemplate")) {
11258
+ throw new Error(
11259
+ `${definition.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
11260
+ );
11261
+ }
11262
+ const warning = findDeprecatedCamelCaseTargetWarnings(
11263
+ definition,
11264
+ `target "${definition.name}"`
11265
+ )[0];
11266
+ if (!warning) {
11267
+ return;
11268
+ }
11269
+ const fieldMatch = warning.message.match(/field '([^']+)'/);
11270
+ const replacementMatch = warning.message.match(/Use '([^']+)' instead/);
11271
+ const field = fieldMatch?.[1] ?? "unknown";
11272
+ const replacement = replacementMatch?.[1] ?? "snake_case";
11273
+ throw new Error(
11274
+ `${warning.location}: camelCase field '${field}' is no longer supported in targets.yaml. Use '${replacement}' instead.`
11275
+ );
11276
+ }
11277
+ function findDeprecatedCamelCaseTargetWarnings(target, location) {
11278
+ const warnings = collectDeprecatedCamelCaseWarnings(
11279
+ target,
11280
+ location,
11281
+ DEPRECATED_TARGET_CAMEL_CASE_FIELDS
11282
+ );
11283
+ if (typeof target !== "object" || target === null || Array.isArray(target)) {
11284
+ return warnings;
11285
+ }
11286
+ const healthcheck = target.healthcheck;
11287
+ warnings.push(
11288
+ ...collectDeprecatedCamelCaseWarnings(
11289
+ healthcheck,
11290
+ `${location}.healthcheck`,
11291
+ DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS
11292
+ )
11293
+ );
11294
+ return warnings;
11295
+ }
10885
11296
  var COMMON_TARGET_SETTINGS = [
10886
11297
  "use_target",
10887
11298
  "provider_batching",
10888
- "providerBatching",
10889
11299
  "subagent_mode_allowed",
10890
- "subagentModeAllowed",
10891
- "fallback_targets",
10892
- "fallbackTargets"
11300
+ "fallback_targets"
10893
11301
  ];
10894
11302
  var USE_TARGET_ENV_PATTERN = /^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i;
10895
11303
  var BASE_TARGET_SCHEMA = import_zod3.z.object({
@@ -10901,43 +11309,40 @@ var BASE_TARGET_SCHEMA = import_zod3.z.object({
10901
11309
  // backward compat
10902
11310
  workers: import_zod3.z.number().int().min(1).optional(),
10903
11311
  workspace_template: import_zod3.z.string().optional(),
10904
- workspaceTemplate: import_zod3.z.string().optional(),
10905
11312
  subagent_mode_allowed: import_zod3.z.boolean().optional(),
10906
- fallback_targets: import_zod3.z.array(import_zod3.z.string().min(1)).optional(),
10907
- fallbackTargets: import_zod3.z.array(import_zod3.z.string().min(1)).optional()
11313
+ fallback_targets: import_zod3.z.array(import_zod3.z.string().min(1)).optional()
10908
11314
  }).passthrough();
10909
11315
  var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
11316
+ var DEFAULT_AZURE_RESPONSES_API_VERSION = "v1";
10910
11317
  var DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
10911
- function normalizeAzureApiVersion(value) {
11318
+ function normalizeAzureApiVersion(value, apiFormat) {
11319
+ const defaultVersion = apiFormat === "responses" ? DEFAULT_AZURE_RESPONSES_API_VERSION : DEFAULT_AZURE_API_VERSION;
10912
11320
  if (!value) {
10913
- return DEFAULT_AZURE_API_VERSION;
11321
+ return defaultVersion;
10914
11322
  }
10915
11323
  const trimmed = value.trim();
10916
11324
  if (trimmed.length === 0) {
10917
- return DEFAULT_AZURE_API_VERSION;
11325
+ return defaultVersion;
10918
11326
  }
10919
11327
  const withoutPrefix = trimmed.replace(/^api[-_]?version\s*=\s*/i, "").trim();
10920
- return withoutPrefix.length > 0 ? withoutPrefix : DEFAULT_AZURE_API_VERSION;
11328
+ return withoutPrefix.length > 0 ? withoutPrefix : defaultVersion;
10921
11329
  }
10922
11330
  function resolveRetryConfig(target) {
10923
- const maxRetries = resolveOptionalNumber(
10924
- target.max_retries ?? target.maxRetries,
10925
- `${target.name} max retries`
10926
- );
11331
+ const maxRetries = resolveOptionalNumber(target.max_retries, `${target.name} max retries`);
10927
11332
  const initialDelayMs = resolveOptionalNumber(
10928
- target.retry_initial_delay_ms ?? target.retryInitialDelayMs,
11333
+ target.retry_initial_delay_ms,
10929
11334
  `${target.name} retry initial delay`
10930
11335
  );
10931
11336
  const maxDelayMs = resolveOptionalNumber(
10932
- target.retry_max_delay_ms ?? target.retryMaxDelayMs,
11337
+ target.retry_max_delay_ms,
10933
11338
  `${target.name} retry max delay`
10934
11339
  );
10935
11340
  const backoffFactor = resolveOptionalNumber(
10936
- target.retry_backoff_factor ?? target.retryBackoffFactor,
11341
+ target.retry_backoff_factor,
10937
11342
  `${target.name} retry backoff factor`
10938
11343
  );
10939
11344
  const retryableStatusCodes = resolveOptionalNumberArray(
10940
- target.retry_status_codes ?? target.retryStatusCodes,
11345
+ target.retry_status_codes,
10941
11346
  `${target.name} retry status codes`
10942
11347
  );
10943
11348
  if (maxRetries === void 0 && initialDelayMs === void 0 && maxDelayMs === void 0 && backoffFactor === void 0 && retryableStatusCodes === void 0) {
@@ -10997,9 +11402,10 @@ function resolveDelegatedTargetDefinition(name, definitions, env = process.env)
10997
11402
  `Target "${name}" exceeded the maximum use_target resolution depth (10). Check for a delegation loop or overly deep alias chain.`
10998
11403
  );
10999
11404
  }
11000
- function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
11405
+ function resolveTargetDefinition(definition, env = process.env, evalFilePath, options) {
11406
+ assertNoDeprecatedCamelCaseTargetFields(definition);
11001
11407
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
11002
- if (parsed.workspace_template !== void 0 || parsed.workspaceTemplate !== void 0) {
11408
+ if (parsed.workspace_template !== void 0) {
11003
11409
  throw new Error(
11004
11410
  `${parsed.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
11005
11411
  );
@@ -11015,13 +11421,9 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
11015
11421
  `${parsed.name} provider`,
11016
11422
  true
11017
11423
  ).toLowerCase();
11018
- const providerBatching = resolveOptionalBoolean(
11019
- parsed.provider_batching ?? parsed.providerBatching
11020
- );
11021
- const subagentModeAllowed = resolveOptionalBoolean(
11022
- parsed.subagent_mode_allowed ?? parsed.subagentModeAllowed
11023
- );
11024
- const fallbackTargets = parsed.fallback_targets ?? parsed.fallbackTargets;
11424
+ const providerBatching = resolveOptionalBoolean(parsed.provider_batching);
11425
+ const subagentModeAllowed = resolveOptionalBoolean(parsed.subagent_mode_allowed);
11426
+ const fallbackTargets = parsed.fallback_targets;
11025
11427
  const base = {
11026
11428
  name: parsed.name,
11027
11429
  graderTarget: parsed.grader_target ?? parsed.judge_target,
@@ -11171,20 +11573,22 @@ function normalizeOpenAIBaseUrl(value) {
11171
11573
  return trimmed.endsWith("/v1") ? trimmed : `${trimmed}/v1`;
11172
11574
  }
11173
11575
  function resolveAzureConfig(target, env) {
11174
- const endpointSource = target.endpoint ?? target.resource ?? target.resourceName;
11175
- const apiKeySource = target.api_key ?? target.apiKey;
11176
- const deploymentSource = target.deployment ?? target.deploymentName ?? target.model;
11576
+ const endpointSource = target.endpoint ?? target.resource;
11577
+ const apiKeySource = target.api_key;
11578
+ const deploymentSource = target.deployment ?? target.model;
11177
11579
  const versionSource = target.version ?? target.api_version;
11178
11580
  const temperatureSource = target.temperature;
11179
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
11581
+ const maxTokensSource = target.max_output_tokens;
11180
11582
  const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
11181
11583
  const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
11182
11584
  const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
11585
+ const apiFormat = resolveApiFormat(target, env, target.name);
11183
11586
  const version = normalizeAzureApiVersion(
11184
11587
  resolveOptionalString(versionSource, env, `${target.name} api version`, {
11185
11588
  allowLiteral: true,
11186
11589
  optionalEnv: true
11187
- })
11590
+ }),
11591
+ apiFormat
11188
11592
  );
11189
11593
  const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
11190
11594
  const maxOutputTokens = resolveOptionalNumber(
@@ -11197,13 +11601,17 @@ function resolveAzureConfig(target, env) {
11197
11601
  deploymentName,
11198
11602
  apiKey,
11199
11603
  version,
11604
+ apiFormat,
11200
11605
  temperature,
11201
11606
  maxOutputTokens,
11202
11607
  retry
11203
11608
  };
11204
11609
  }
11205
- function resolveApiFormat(target, targetName) {
11206
- const raw = target.api_format ?? target.apiFormat;
11610
+ function resolveApiFormat(target, env, targetName) {
11611
+ const raw = resolveOptionalString(target.api_format, env, `${targetName} api format`, {
11612
+ allowLiteral: true,
11613
+ optionalEnv: true
11614
+ });
11207
11615
  if (raw === void 0) return void 0;
11208
11616
  if (raw === "chat" || raw === "responses") return raw;
11209
11617
  throw new Error(
@@ -11211,11 +11619,11 @@ function resolveApiFormat(target, targetName) {
11211
11619
  );
11212
11620
  }
11213
11621
  function resolveOpenAIConfig(target, env) {
11214
- const endpointSource = target.endpoint ?? target.base_url ?? target.baseUrl;
11215
- const apiKeySource = target.api_key ?? target.apiKey;
11622
+ const endpointSource = target.endpoint ?? target.base_url;
11623
+ const apiKeySource = target.api_key;
11216
11624
  const modelSource = target.model ?? target.deployment ?? target.variant;
11217
11625
  const temperatureSource = target.temperature;
11218
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
11626
+ const maxTokensSource = target.max_output_tokens;
11219
11627
  const baseURL = normalizeOpenAIBaseUrl(
11220
11628
  resolveOptionalString(endpointSource, env, `${target.name} endpoint`, {
11221
11629
  allowLiteral: true,
@@ -11229,17 +11637,17 @@ function resolveOpenAIConfig(target, env) {
11229
11637
  baseURL,
11230
11638
  apiKey,
11231
11639
  model,
11232
- apiFormat: resolveApiFormat(target, target.name),
11640
+ apiFormat: resolveApiFormat(target, env, target.name),
11233
11641
  temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
11234
11642
  maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
11235
11643
  retry
11236
11644
  };
11237
11645
  }
11238
11646
  function resolveOpenRouterConfig(target, env) {
11239
- const apiKeySource = target.api_key ?? target.apiKey;
11647
+ const apiKeySource = target.api_key;
11240
11648
  const modelSource = target.model ?? target.deployment ?? target.variant;
11241
11649
  const temperatureSource = target.temperature;
11242
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
11650
+ const maxTokensSource = target.max_output_tokens;
11243
11651
  const retry = resolveRetryConfig(target);
11244
11652
  return {
11245
11653
  apiKey: resolveString(apiKeySource, env, `${target.name} OpenRouter api key`),
@@ -11250,11 +11658,11 @@ function resolveOpenRouterConfig(target, env) {
11250
11658
  };
11251
11659
  }
11252
11660
  function resolveAnthropicConfig(target, env) {
11253
- const apiKeySource = target.api_key ?? target.apiKey;
11661
+ const apiKeySource = target.api_key;
11254
11662
  const modelSource = target.model ?? target.deployment ?? target.variant;
11255
11663
  const temperatureSource = target.temperature;
11256
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
11257
- const thinkingBudgetSource = target.thinking_budget ?? target.thinkingBudget;
11664
+ const maxTokensSource = target.max_output_tokens;
11665
+ const thinkingBudgetSource = target.thinking_budget;
11258
11666
  const apiKey = resolveString(apiKeySource, env, `${target.name} Anthropic api key`);
11259
11667
  const model = resolveString(modelSource, env, `${target.name} Anthropic model`);
11260
11668
  const retry = resolveRetryConfig(target);
@@ -11268,10 +11676,10 @@ function resolveAnthropicConfig(target, env) {
11268
11676
  };
11269
11677
  }
11270
11678
  function resolveGeminiConfig(target, env) {
11271
- const apiKeySource = target.api_key ?? target.apiKey;
11679
+ const apiKeySource = target.api_key;
11272
11680
  const modelSource = target.model ?? target.deployment ?? target.variant;
11273
11681
  const temperatureSource = target.temperature;
11274
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
11682
+ const maxTokensSource = target.max_output_tokens;
11275
11683
  const apiKey = resolveString(apiKeySource, env, `${target.name} Google API key`);
11276
11684
  const model = resolveOptionalString(modelSource, env, `${target.name} Gemini model`, {
11277
11685
  allowLiteral: true,
@@ -11291,11 +11699,11 @@ function resolveCodexConfig(target, env, evalFilePath) {
11291
11699
  const executableSource = target.executable ?? target.command ?? target.binary;
11292
11700
  const argsSource = target.args ?? target.arguments;
11293
11701
  const cwdSource = target.cwd;
11294
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11295
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11296
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11297
- const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
11298
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
11702
+ const workspaceTemplateSource = target.workspace_template;
11703
+ const timeoutSource = target.timeout_seconds;
11704
+ const logDirSource = target.log_dir ?? target.log_directory;
11705
+ const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CODEX_LOG_FORMAT;
11706
+ const systemPromptSource = target.system_prompt;
11299
11707
  const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, {
11300
11708
  allowLiteral: true,
11301
11709
  optionalEnv: true
@@ -11318,8 +11726,8 @@ function resolveCodexConfig(target, env, evalFilePath) {
11318
11726
  optionalEnv: true
11319
11727
  }
11320
11728
  );
11321
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11322
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
11729
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
11730
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11323
11731
  }
11324
11732
  if (cwd && workspaceTemplate) {
11325
11733
  throw new Error(
@@ -11359,16 +11767,16 @@ function normalizeCodexLogFormat(value) {
11359
11767
  throw new Error("codex log format must be 'summary' or 'json'");
11360
11768
  }
11361
11769
  function resolveCopilotSdkConfig(target, env, evalFilePath) {
11362
- const cliUrlSource = target.cli_url ?? target.cliUrl;
11363
- const cliPathSource = target.cli_path ?? target.cliPath;
11364
- const githubTokenSource = target.github_token ?? target.githubToken;
11770
+ const cliUrlSource = target.cli_url;
11771
+ const cliPathSource = target.cli_path;
11772
+ const githubTokenSource = target.github_token;
11365
11773
  const modelSource = target.model;
11366
11774
  const cwdSource = target.cwd;
11367
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11368
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11369
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11370
- const logFormatSource = target.log_format ?? target.logFormat;
11371
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
11775
+ const workspaceTemplateSource = target.workspace_template;
11776
+ const timeoutSource = target.timeout_seconds;
11777
+ const logDirSource = target.log_dir ?? target.log_directory;
11778
+ const logFormatSource = target.log_format;
11779
+ const systemPromptSource = target.system_prompt;
11372
11780
  const cliUrl = resolveOptionalString(cliUrlSource, env, `${target.name} copilot-sdk cli URL`, {
11373
11781
  allowLiteral: true,
11374
11782
  optionalEnv: true
@@ -11403,8 +11811,8 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
11403
11811
  optionalEnv: true
11404
11812
  }
11405
11813
  );
11406
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11407
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
11814
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
11815
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11408
11816
  }
11409
11817
  if (cwd && workspaceTemplate) {
11410
11818
  throw new Error(
@@ -11441,11 +11849,11 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
11441
11849
  const modelSource = target.model;
11442
11850
  const argsSource = target.args ?? target.arguments;
11443
11851
  const cwdSource = target.cwd;
11444
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11445
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11446
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11447
- const logFormatSource = target.log_format ?? target.logFormat;
11448
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
11852
+ const workspaceTemplateSource = target.workspace_template;
11853
+ const timeoutSource = target.timeout_seconds;
11854
+ const logDirSource = target.log_dir ?? target.log_directory;
11855
+ const logFormatSource = target.log_format;
11856
+ const systemPromptSource = target.system_prompt;
11449
11857
  const executable = resolveOptionalString(executableSource, env, `${target.name} copilot-cli executable`, {
11450
11858
  allowLiteral: true,
11451
11859
  optionalEnv: true
@@ -11468,8 +11876,8 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
11468
11876
  optionalEnv: true
11469
11877
  }
11470
11878
  );
11471
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11472
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
11879
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
11880
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11473
11881
  }
11474
11882
  if (cwd && workspaceTemplate) {
11475
11883
  throw new Error(
@@ -11509,16 +11917,16 @@ function normalizeCopilotLogFormat(value) {
11509
11917
  }
11510
11918
  function resolvePiCodingAgentConfig(target, env, evalFilePath) {
11511
11919
  const subproviderSource = target.subprovider;
11512
- const modelSource = target.model ?? target.pi_model ?? target.piModel;
11513
- const apiKeySource = target.api_key ?? target.apiKey;
11514
- const toolsSource = target.tools ?? target.pi_tools ?? target.piTools;
11515
- const thinkingSource = target.thinking ?? target.pi_thinking ?? target.piThinking;
11920
+ const modelSource = target.model ?? target.pi_model;
11921
+ const apiKeySource = target.api_key;
11922
+ const toolsSource = target.tools ?? target.pi_tools;
11923
+ const thinkingSource = target.thinking ?? target.pi_thinking;
11516
11924
  const cwdSource = target.cwd;
11517
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11518
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11519
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11520
- const logFormatSource = target.log_format ?? target.logFormat;
11521
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
11925
+ const workspaceTemplateSource = target.workspace_template;
11926
+ const timeoutSource = target.timeout_seconds;
11927
+ const logDirSource = target.log_dir ?? target.log_directory;
11928
+ const logFormatSource = target.log_format;
11929
+ const systemPromptSource = target.system_prompt;
11522
11930
  const subprovider = resolveOptionalString(
11523
11931
  subproviderSource,
11524
11932
  env,
@@ -11536,7 +11944,7 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
11536
11944
  allowLiteral: false,
11537
11945
  optionalEnv: true
11538
11946
  });
11539
- const baseUrlSource = target.base_url ?? target.baseUrl ?? target.endpoint;
11947
+ const baseUrlSource = target.base_url ?? target.endpoint;
11540
11948
  const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} pi base url`, {
11541
11949
  allowLiteral: true,
11542
11950
  optionalEnv: true
@@ -11562,8 +11970,8 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
11562
11970
  optionalEnv: true
11563
11971
  }
11564
11972
  );
11565
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11566
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
11973
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
11974
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11567
11975
  }
11568
11976
  if (cwd && workspaceTemplate) {
11569
11977
  throw new Error(
@@ -11595,16 +12003,16 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
11595
12003
  function resolvePiCliConfig(target, env, evalFilePath) {
11596
12004
  const executableSource = target.executable ?? target.command ?? target.binary;
11597
12005
  const subproviderSource = target.subprovider;
11598
- const modelSource = target.model ?? target.pi_model ?? target.piModel;
11599
- const apiKeySource = target.api_key ?? target.apiKey;
11600
- const toolsSource = target.tools ?? target.pi_tools ?? target.piTools;
11601
- const thinkingSource = target.thinking ?? target.pi_thinking ?? target.piThinking;
12006
+ const modelSource = target.model ?? target.pi_model;
12007
+ const apiKeySource = target.api_key;
12008
+ const toolsSource = target.tools ?? target.pi_tools;
12009
+ const thinkingSource = target.thinking ?? target.pi_thinking;
11602
12010
  const cwdSource = target.cwd;
11603
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11604
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11605
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11606
- const logFormatSource = target.log_format ?? target.logFormat;
11607
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
12011
+ const workspaceTemplateSource = target.workspace_template;
12012
+ const timeoutSource = target.timeout_seconds;
12013
+ const logDirSource = target.log_dir ?? target.log_directory;
12014
+ const logFormatSource = target.log_format;
12015
+ const systemPromptSource = target.system_prompt;
11608
12016
  const executable = resolveOptionalString(executableSource, env, `${target.name} pi-cli executable`, {
11609
12017
  allowLiteral: true,
11610
12018
  optionalEnv: true
@@ -11623,7 +12031,7 @@ function resolvePiCliConfig(target, env, evalFilePath) {
11623
12031
  allowLiteral: false,
11624
12032
  optionalEnv: true
11625
12033
  });
11626
- const baseUrlSource = target.base_url ?? target.baseUrl ?? target.endpoint;
12034
+ const baseUrlSource = target.base_url ?? target.endpoint;
11627
12035
  const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} pi-cli base url`, {
11628
12036
  allowLiteral: true,
11629
12037
  optionalEnv: true
@@ -11648,8 +12056,8 @@ function resolvePiCliConfig(target, env, evalFilePath) {
11648
12056
  `${target.name} pi-cli workspace template`,
11649
12057
  { allowLiteral: true, optionalEnv: true }
11650
12058
  );
11651
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11652
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
12059
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
12060
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11653
12061
  }
11654
12062
  if (cwd && workspaceTemplate) {
11655
12063
  throw new Error(`${target.name}: 'cwd' and 'workspace_template' are mutually exclusive.`);
@@ -11681,11 +12089,11 @@ function resolvePiCliConfig(target, env, evalFilePath) {
11681
12089
  function resolveClaudeConfig(target, env, evalFilePath) {
11682
12090
  const modelSource = target.model;
11683
12091
  const cwdSource = target.cwd;
11684
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11685
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11686
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11687
- const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CLAUDE_LOG_FORMAT;
11688
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
12092
+ const workspaceTemplateSource = target.workspace_template;
12093
+ const timeoutSource = target.timeout_seconds;
12094
+ const logDirSource = target.log_dir ?? target.log_directory;
12095
+ const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CLAUDE_LOG_FORMAT;
12096
+ const systemPromptSource = target.system_prompt;
11689
12097
  const model = resolveOptionalString(modelSource, env, `${target.name} claude model`, {
11690
12098
  allowLiteral: true,
11691
12099
  optionalEnv: true
@@ -11703,8 +12111,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
11703
12111
  optionalEnv: true
11704
12112
  }
11705
12113
  );
11706
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11707
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
12114
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
12115
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11708
12116
  }
11709
12117
  if (cwd && workspaceTemplate) {
11710
12118
  throw new Error(
@@ -11718,8 +12126,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
11718
12126
  });
11719
12127
  const logFormat = normalizeClaudeLogFormat(logFormatSource);
11720
12128
  const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
11721
- const maxTurns = typeof target.max_turns === "number" ? target.max_turns : typeof target.maxTurns === "number" ? target.maxTurns : void 0;
11722
- const maxBudgetUsd = typeof target.max_budget_usd === "number" ? target.max_budget_usd : typeof target.maxBudgetUsd === "number" ? target.maxBudgetUsd : void 0;
12129
+ const maxTurns = typeof target.max_turns === "number" ? target.max_turns : void 0;
12130
+ const maxBudgetUsd = typeof target.max_budget_usd === "number" ? target.max_budget_usd : void 0;
11723
12131
  return {
11724
12132
  model,
11725
12133
  systemPrompt,
@@ -11750,9 +12158,7 @@ function resolveMockConfig(target) {
11750
12158
  return { response };
11751
12159
  }
11752
12160
  function resolveVSCodeConfig(target, env, insiders, evalFilePath) {
11753
- const workspaceTemplateEnvVar = resolveOptionalLiteralString(
11754
- target.workspace_template ?? target.workspaceTemplate
11755
- );
12161
+ const workspaceTemplateEnvVar = resolveOptionalLiteralString(target.workspace_template);
11756
12162
  let workspaceTemplate = workspaceTemplateEnvVar ? resolveOptionalString(
11757
12163
  workspaceTemplateEnvVar,
11758
12164
  env,
@@ -11762,14 +12168,14 @@ function resolveVSCodeConfig(target, env, insiders, evalFilePath) {
11762
12168
  optionalEnv: true
11763
12169
  }
11764
12170
  ) : void 0;
11765
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11766
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
12171
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
12172
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11767
12173
  }
11768
12174
  const executableSource = target.executable;
11769
12175
  const waitSource = target.wait;
11770
- const dryRunSource = target.dry_run ?? target.dryRun;
11771
- const subagentRootSource = target.subagent_root ?? target.subagentRoot;
11772
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
12176
+ const dryRunSource = target.dry_run;
12177
+ const subagentRootSource = target.subagent_root;
12178
+ const timeoutSource = target.timeout_seconds;
11773
12179
  const defaultCommand = insiders ? "code-insiders" : "code";
11774
12180
  const executable = resolveOptionalString(executableSource, env, `${target.name} vscode executable`, {
11775
12181
  allowLiteral: true,
@@ -11804,8 +12210,8 @@ function resolveCliConfig(target, env, evalFilePath) {
11804
12210
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
11805
12211
  if (!parseResult.success) {
11806
12212
  const firstError = parseResult.error.errors[0];
11807
- const path52 = firstError?.path.join(".") || "";
11808
- const prefix = path52 ? `${target.name} ${path52}: ` : `${target.name}: `;
12213
+ const path53 = firstError?.path.join(".") || "";
12214
+ const prefix = path53 ? `${target.name} ${path53}: ` : `${target.name}: `;
11809
12215
  throw new Error(`${prefix}${firstError?.message}`);
11810
12216
  }
11811
12217
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -11820,17 +12226,17 @@ function resolveCliConfig(target, env, evalFilePath) {
11820
12226
  }
11821
12227
  function resolveDiscoveredProviderConfig(target, providerKind, env, evalFilePath) {
11822
12228
  const command = target.command ? resolveString(target.command, env, `${target.name} command`, true) : `bun run .agentv/providers/${providerKind}.ts {PROMPT}`;
11823
- const timeoutSeconds = target.timeout_seconds ?? target.timeoutSeconds;
12229
+ const timeoutSeconds = target.timeout_seconds;
11824
12230
  const timeoutMs = resolveTimeoutMs(timeoutSeconds, `${target.name} timeout`);
11825
12231
  let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
11826
12232
  allowLiteral: true,
11827
12233
  optionalEnv: true
11828
12234
  });
11829
- if (cwd && evalFilePath && !import_node_path23.default.isAbsolute(cwd)) {
11830
- cwd = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), cwd);
12235
+ if (cwd && evalFilePath && !import_node_path24.default.isAbsolute(cwd)) {
12236
+ cwd = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), cwd);
11831
12237
  }
11832
12238
  if (!cwd && evalFilePath) {
11833
- cwd = import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath));
12239
+ cwd = import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath));
11834
12240
  }
11835
12241
  return {
11836
12242
  command,
@@ -11884,10 +12290,10 @@ function resolveDiscover(value, targetName) {
11884
12290
  throw new Error(`Target "${targetName}": discover must be "latest" (got "${String(value)}")`);
11885
12291
  }
11886
12292
  function resolveCopilotLogConfig(target, env) {
11887
- const sessionDirSource = target.session_dir ?? target.sessionDir;
11888
- const sessionIdSource = target.session_id ?? target.sessionId;
12293
+ const sessionDirSource = target.session_dir;
12294
+ const sessionIdSource = target.session_id;
11889
12295
  const discoverSource = target.discover;
11890
- const sessionStateDirSource = target.session_state_dir ?? target.sessionStateDir;
12296
+ const sessionStateDirSource = target.session_state_dir;
11891
12297
  const cwdSource = target.cwd;
11892
12298
  return {
11893
12299
  sessionDir: resolveOptionalString(
@@ -12068,7 +12474,7 @@ var import_node_path33 = __toESM(require("path"), 1);
12068
12474
  init_cjs_shims();
12069
12475
  var import_node_fs11 = require("fs");
12070
12476
  var import_promises20 = require("fs/promises");
12071
- var import_node_path24 = __toESM(require("path"), 1);
12477
+ var import_node_path25 = __toESM(require("path"), 1);
12072
12478
  async function pathExists(target) {
12073
12479
  try {
12074
12480
  await (0, import_promises20.access)(target, import_node_fs11.constants.F_OK);
@@ -12084,7 +12490,7 @@ async function readDirEntries(target) {
12084
12490
  const entries = await (0, import_promises20.readdir)(target, { withFileTypes: true });
12085
12491
  return entries.map((entry) => ({
12086
12492
  name: entry.name,
12087
- absolutePath: import_node_path24.default.join(target, entry.name),
12493
+ absolutePath: import_node_path25.default.join(target, entry.name),
12088
12494
  isDirectory: entry.isDirectory()
12089
12495
  }));
12090
12496
  }
@@ -12100,9 +12506,9 @@ async function removeIfExists(target) {
12100
12506
 
12101
12507
  // src/evaluation/providers/vscode/utils/path.ts
12102
12508
  init_cjs_shims();
12103
- var import_node_path25 = __toESM(require("path"), 1);
12509
+ var import_node_path26 = __toESM(require("path"), 1);
12104
12510
  function pathToFileUri2(filePath) {
12105
- const absolutePath = import_node_path25.default.isAbsolute(filePath) ? filePath : import_node_path25.default.resolve(filePath);
12511
+ const absolutePath = import_node_path26.default.isAbsolute(filePath) ? filePath : import_node_path26.default.resolve(filePath);
12106
12512
  const normalizedPath = absolutePath.replace(/\\/g, "/");
12107
12513
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
12108
12514
  return `file:///${normalizedPath}`;
@@ -12112,7 +12518,7 @@ function pathToFileUri2(filePath) {
12112
12518
 
12113
12519
  // src/evaluation/providers/vscode/dispatch/promptBuilder.ts
12114
12520
  init_cjs_shims();
12115
- var import_node_path26 = __toESM(require("path"), 1);
12521
+ var import_node_path27 = __toESM(require("path"), 1);
12116
12522
 
12117
12523
  // src/evaluation/providers/vscode/utils/template.ts
12118
12524
  init_cjs_shims();
@@ -12206,8 +12612,8 @@ function createBatchRequestPrompt(userQuery, responseFileTmp, responseFileFinal,
12206
12612
  });
12207
12613
  }
12208
12614
  function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateContent) {
12209
- const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${import_node_path26.default.basename(file)}`).join("\n");
12210
- const responseList = responseFiles.map((file) => `"${import_node_path26.default.basename(file)}"`).join(", ");
12615
+ const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${import_node_path27.default.basename(file)}`).join("\n");
12616
+ const responseList = responseFiles.map((file) => `"${import_node_path27.default.basename(file)}"`).join(", ");
12211
12617
  return renderTemplate2(templateContent, {
12212
12618
  requestFiles: requestLines,
12213
12619
  responseList
@@ -12217,7 +12623,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
12217
12623
  // src/evaluation/providers/vscode/dispatch/responseWaiter.ts
12218
12624
  init_cjs_shims();
12219
12625
  var import_promises21 = require("fs/promises");
12220
- var import_node_path27 = __toESM(require("path"), 1);
12626
+ var import_node_path28 = __toESM(require("path"), 1);
12221
12627
 
12222
12628
  // src/evaluation/providers/vscode/utils/time.ts
12223
12629
  init_cjs_shims();
@@ -12277,7 +12683,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
12277
12683
  }
12278
12684
  async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, silent = false, timeoutMs = DEFAULT_TIMEOUT_MS) {
12279
12685
  if (!silent) {
12280
- const fileList = responseFilesFinal.map((file) => import_node_path27.default.basename(file)).join(", ");
12686
+ const fileList = responseFilesFinal.map((file) => import_node_path28.default.basename(file)).join(", ");
12281
12687
  console.error(`waiting for ${responseFilesFinal.length} batch response(s): ${fileList}`);
12282
12688
  }
12283
12689
  const deadline = Date.now() + timeoutMs;
@@ -12286,7 +12692,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
12286
12692
  while (pending.size > 0) {
12287
12693
  if (Date.now() >= deadline) {
12288
12694
  if (!silent) {
12289
- const remaining = [...pending].map((f) => import_node_path27.default.basename(f)).join(", ");
12695
+ const remaining = [...pending].map((f) => import_node_path28.default.basename(f)).join(", ");
12290
12696
  console.error(
12291
12697
  `error: timed out after ${Math.round(timeoutMs / 1e3)}s waiting for batch responses. Still pending: ${remaining}`
12292
12698
  );
@@ -12344,37 +12750,6 @@ var import_node_util2 = require("util");
12344
12750
  // src/evaluation/providers/vscode/dispatch/constants.ts
12345
12751
  init_cjs_shims();
12346
12752
  var import_node_path29 = __toESM(require("path"), 1);
12347
-
12348
- // src/paths.ts
12349
- init_cjs_shims();
12350
- var import_node_os6 = __toESM(require("os"), 1);
12351
- var import_node_path28 = __toESM(require("path"), 1);
12352
- var logged = false;
12353
- function getAgentvHome() {
12354
- const envHome = process.env.AGENTV_HOME;
12355
- if (envHome && envHome !== "undefined") {
12356
- if (!logged) {
12357
- logged = true;
12358
- console.warn(`Using AGENTV_HOME: ${envHome}`);
12359
- }
12360
- return envHome;
12361
- }
12362
- return import_node_path28.default.join(import_node_os6.default.homedir(), ".agentv");
12363
- }
12364
- function getWorkspacesRoot() {
12365
- return import_node_path28.default.join(getAgentvHome(), "workspaces");
12366
- }
12367
- function getSubagentsRoot() {
12368
- return import_node_path28.default.join(getAgentvHome(), "subagents");
12369
- }
12370
- function getTraceStateRoot() {
12371
- return import_node_path28.default.join(getAgentvHome(), "trace-state");
12372
- }
12373
- function getWorkspacePoolRoot() {
12374
- return import_node_path28.default.join(getAgentvHome(), "workspace-pool");
12375
- }
12376
-
12377
- // src/evaluation/providers/vscode/dispatch/constants.ts
12378
12753
  var DEFAULT_LOCK_NAME = "subagent.lock";
12379
12754
  var DEFAULT_ALIVE_FILENAME = ".alive";
12380
12755
  function getDefaultSubagentRoot(vscodeCmd = "code") {
@@ -13527,6 +13902,15 @@ var AGENT_PROVIDER_KINDS = [
13527
13902
  "vscode",
13528
13903
  "vscode-insiders"
13529
13904
  ];
13905
+ var LLM_GRADER_CAPABLE_KINDS = [
13906
+ "openai",
13907
+ "openrouter",
13908
+ "azure",
13909
+ "anthropic",
13910
+ "gemini",
13911
+ "agentv",
13912
+ "mock"
13913
+ ];
13530
13914
  function extractLastAssistantContent(messages) {
13531
13915
  if (!messages || messages.length === 0) {
13532
13916
  return "";
@@ -13680,9 +14064,10 @@ init_cjs_shims();
13680
14064
 
13681
14065
  // src/evaluation/evaluators/scoring.ts
13682
14066
  init_cjs_shims();
13683
- var PASS_THRESHOLD = 0.8;
13684
- function scoreToVerdict(score) {
13685
- return score >= PASS_THRESHOLD ? "pass" : "fail";
14067
+ var DEFAULT_THRESHOLD = 0.8;
14068
+ var PASS_THRESHOLD = DEFAULT_THRESHOLD;
14069
+ function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
14070
+ return score >= threshold ? "pass" : "fail";
13686
14071
  }
13687
14072
  function clampScore(value) {
13688
14073
  if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -13873,13 +14258,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
13873
14258
  async function execShellWithStdin(command, stdinPayload, options = {}) {
13874
14259
  const { mkdir: mkdir17, readFile: readFile17, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
13875
14260
  const { tmpdir: tmpdir3 } = await import("os");
13876
- const path52 = await import("path");
14261
+ const path53 = await import("path");
13877
14262
  const { randomUUID: randomUUID10 } = await import("crypto");
13878
- const dir = path52.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
14263
+ const dir = path53.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
13879
14264
  await mkdir17(dir, { recursive: true });
13880
- const stdinPath = path52.join(dir, "stdin.txt");
13881
- const stdoutPath = path52.join(dir, "stdout.txt");
13882
- const stderrPath = path52.join(dir, "stderr.txt");
14265
+ const stdinPath = path53.join(dir, "stdin.txt");
14266
+ const stdoutPath = path53.join(dir, "stdout.txt");
14267
+ const stderrPath = path53.join(dir, "stderr.txt");
13883
14268
  await writeFile9(stdinPath, stdinPayload, "utf8");
13884
14269
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
13885
14270
  const { spawn: spawn5 } = await import("child_process");
@@ -15081,7 +15466,7 @@ ${outputSchema}`;
15081
15466
  parts.push("[[ ## scoring_criteria ## ]]");
15082
15467
  for (const rubric of rubrics) {
15083
15468
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
15084
- const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
15469
+ const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
15085
15470
  parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
15086
15471
  if (rubric.outcome) {
15087
15472
  parts.push(`Description: ${rubric.outcome}`);
@@ -15135,54 +15520,106 @@ ${outputSchema}`;
15135
15520
  async runWithRetry(options) {
15136
15521
  const { context: context2, graderProvider, systemPrompt, userPrompt, schema, images } = options;
15137
15522
  let lastError;
15523
+ let lastInvalidResponse;
15524
+ let shouldAttemptStructureFix = false;
15138
15525
  for (let attempt = 1; attempt <= 3; attempt++) {
15139
15526
  try {
15140
- const model = graderProvider.asLanguageModel?.();
15141
- if (model) {
15142
- const modelOptions = {
15143
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
15144
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
15145
- };
15146
- const hasImages = images && images.length > 0;
15147
- const result = hasImages ? await (0, import_ai2.generateText)({
15148
- model,
15149
- system: systemPrompt,
15150
- messages: [
15151
- {
15152
- role: "user",
15153
- content: [
15154
- { type: "text", text: userPrompt },
15155
- ...toAiSdkImageParts(images)
15156
- ]
15157
- }
15158
- ],
15159
- ...modelOptions
15160
- }) : await (0, import_ai2.generateText)({
15161
- model,
15162
- system: systemPrompt,
15163
- prompt: userPrompt,
15164
- ...modelOptions
15165
- });
15166
- const data2 = schema.parse(parseJsonFromText(result.text));
15167
- const rawUsage = result.usage;
15168
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
15169
- return { data: data2, tokenUsage };
15527
+ const result = await this.generateStructuredResponse({
15528
+ context: context2,
15529
+ graderProvider,
15530
+ systemPrompt,
15531
+ userPrompt,
15532
+ images
15533
+ });
15534
+ const canRepairResponse = result.text.trim().length > 0;
15535
+ lastInvalidResponse = canRepairResponse ? result : void 0;
15536
+ let data;
15537
+ try {
15538
+ data = schema.parse(parseJsonFromText(result.text));
15539
+ } catch (e) {
15540
+ lastError = e instanceof Error ? e : new Error(String(e));
15541
+ shouldAttemptStructureFix = canRepairResponse;
15542
+ continue;
15170
15543
  }
15171
- const response = await graderProvider.invoke({
15172
- question: userPrompt,
15544
+ return {
15545
+ data,
15546
+ providerResponse: result.providerResponse,
15547
+ tokenUsage: result.tokenUsage
15548
+ };
15549
+ } catch (e) {
15550
+ lastError = e instanceof Error ? e : new Error(String(e));
15551
+ }
15552
+ }
15553
+ if (shouldAttemptStructureFix && lastInvalidResponse) {
15554
+ try {
15555
+ const repaired = await this.generateStructuredResponse({
15556
+ context: context2,
15557
+ graderProvider,
15173
15558
  systemPrompt,
15174
- evalCaseId: context2.evalCase.id,
15175
- attempt: context2.attempt,
15176
- maxOutputTokens: this.maxOutputTokens,
15177
- temperature: this.temperature
15559
+ userPrompt: buildStructureRepairPrompt({
15560
+ validationError: lastError?.message ?? "Schema validation failed",
15561
+ invalidResponse: lastInvalidResponse.text
15562
+ })
15178
15563
  });
15179
- const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
15180
- return { data, providerResponse: response, tokenUsage: response.tokenUsage };
15564
+ const data = schema.parse(parseJsonFromText(repaired.text));
15565
+ return {
15566
+ data,
15567
+ providerResponse: repaired.providerResponse,
15568
+ tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage)
15569
+ };
15181
15570
  } catch (e) {
15182
15571
  lastError = e instanceof Error ? e : new Error(String(e));
15183
15572
  }
15184
15573
  }
15185
- throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
15574
+ throw new Error(
15575
+ `Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`
15576
+ );
15577
+ }
15578
+ async generateStructuredResponse(options) {
15579
+ const { context: context2, graderProvider, systemPrompt, userPrompt, images } = options;
15580
+ const model = graderProvider.asLanguageModel?.();
15581
+ if (model) {
15582
+ const modelOptions = {
15583
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
15584
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
15585
+ };
15586
+ const hasImages = images && images.length > 0;
15587
+ const result = hasImages ? await (0, import_ai2.generateText)({
15588
+ model,
15589
+ system: systemPrompt,
15590
+ messages: [
15591
+ {
15592
+ role: "user",
15593
+ content: [
15594
+ { type: "text", text: userPrompt },
15595
+ ...toAiSdkImageParts(images)
15596
+ ]
15597
+ }
15598
+ ],
15599
+ ...modelOptions
15600
+ }) : await (0, import_ai2.generateText)({
15601
+ model,
15602
+ system: systemPrompt,
15603
+ prompt: userPrompt,
15604
+ ...modelOptions
15605
+ });
15606
+ const rawUsage = result.usage;
15607
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
15608
+ return { text: result.text, tokenUsage };
15609
+ }
15610
+ const response = await graderProvider.invoke({
15611
+ question: userPrompt,
15612
+ systemPrompt,
15613
+ evalCaseId: context2.evalCase.id,
15614
+ attempt: context2.attempt,
15615
+ maxOutputTokens: this.maxOutputTokens,
15616
+ temperature: this.temperature
15617
+ });
15618
+ return {
15619
+ text: extractLastAssistantContent(response.output),
15620
+ providerResponse: response,
15621
+ tokenUsage: response.tokenUsage
15622
+ };
15186
15623
  }
15187
15624
  };
15188
15625
  function buildOutputSchema() {
@@ -15202,6 +15639,29 @@ function buildOutputSchema() {
15202
15639
  "}"
15203
15640
  ].join("\n");
15204
15641
  }
15642
+ function buildStructureRepairPrompt(options) {
15643
+ const { validationError, invalidResponse } = options;
15644
+ return [
15645
+ "The following evaluation response has useful grading content but invalid JSON structure.",
15646
+ "Repair it to satisfy the schema in the system prompt.",
15647
+ "Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object.",
15648
+ "",
15649
+ "Validation error:",
15650
+ validationError,
15651
+ "",
15652
+ "Invalid response:",
15653
+ invalidResponse
15654
+ ].join("\n");
15655
+ }
15656
+ function sumTokenUsage(first, second) {
15657
+ if (!first && !second) {
15658
+ return void 0;
15659
+ }
15660
+ return {
15661
+ input: (first?.input ?? 0) + (second?.input ?? 0),
15662
+ output: (first?.output ?? 0) + (second?.output ?? 0)
15663
+ };
15664
+ }
15205
15665
  function buildRubricOutputSchema() {
15206
15666
  return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
15207
15667
  You must return a valid JSON object matching this schema:
@@ -15301,19 +15761,21 @@ function calculateScoreRangeResult(result, rubrics) {
15301
15761
  rawScores[rubric.id] = rawScore;
15302
15762
  totalWeight += rubric.weight;
15303
15763
  weightedScoreSum += normalizedScore * rubric.weight;
15304
- let requiredMinScore;
15305
- if (rubric.required_min_score !== void 0) {
15306
- requiredMinScore = rubric.required_min_score;
15764
+ let minScoreThreshold;
15765
+ if (rubric.min_score !== void 0) {
15766
+ minScoreThreshold = rubric.min_score;
15767
+ } else if (rubric.required_min_score !== void 0) {
15768
+ minScoreThreshold = rubric.required_min_score / 10;
15307
15769
  } else if (rubric.required === true) {
15308
- requiredMinScore = 10;
15770
+ minScoreThreshold = 1;
15309
15771
  }
15310
15772
  const matchingRange = rubric.score_ranges?.find(
15311
15773
  (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
15312
15774
  );
15313
15775
  const rangeDescription = matchingRange?.outcome ?? "";
15314
15776
  const criterionLabel = rubric.outcome ?? rubric.id;
15315
- const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
15316
- if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
15777
+ const passed = !(minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) && rawScore >= 7;
15778
+ if (minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) {
15317
15779
  failedRequired = true;
15318
15780
  }
15319
15781
  assertions.push({
@@ -15390,11 +15852,11 @@ function createFilesystemTools(workspacePath) {
15390
15852
  execute: async (input) => {
15391
15853
  try {
15392
15854
  const resolved = resolveSandboxed(workspacePath, input.path);
15393
- const stat10 = await import_promises29.default.stat(resolved);
15394
- if (stat10.isDirectory()) {
15855
+ const stat11 = await import_promises29.default.stat(resolved);
15856
+ if (stat11.isDirectory()) {
15395
15857
  return { error: `'${input.path}' is a directory, not a file` };
15396
15858
  }
15397
- const buffer = Buffer.alloc(Math.min(stat10.size, MAX_FILE_SIZE));
15859
+ const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
15398
15860
  const fd = await import_promises29.default.open(resolved, "r");
15399
15861
  try {
15400
15862
  await fd.read(buffer, 0, buffer.length, 0);
@@ -15402,8 +15864,8 @@ function createFilesystemTools(workspacePath) {
15402
15864
  await fd.close();
15403
15865
  }
15404
15866
  const content = buffer.toString("utf-8");
15405
- const truncated = stat10.size > MAX_FILE_SIZE;
15406
- return { content, truncated, size: stat10.size };
15867
+ const truncated = stat11.size > MAX_FILE_SIZE;
15868
+ return { content, truncated, size: stat11.size };
15407
15869
  } catch (error) {
15408
15870
  return { error: error instanceof Error ? error.message : String(error) };
15409
15871
  }
@@ -15454,8 +15916,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
15454
15916
  const ext = import_node_path39.default.extname(entry.name).toLowerCase();
15455
15917
  if (BINARY_EXTENSIONS.has(ext)) continue;
15456
15918
  try {
15457
- const stat10 = await import_promises29.default.stat(fullPath);
15458
- if (stat10.size > MAX_FILE_SIZE) continue;
15919
+ const stat11 = await import_promises29.default.stat(fullPath);
15920
+ if (stat11.size > MAX_FILE_SIZE) continue;
15459
15921
  const content = await import_promises29.default.readFile(fullPath, "utf-8");
15460
15922
  const lines = content.split("\n");
15461
15923
  for (let i = 0; i < lines.length; i++) {
@@ -16099,115 +16561,115 @@ var FieldAccuracyEvaluator = class {
16099
16561
  * Evaluate a single field against the expected value.
16100
16562
  */
16101
16563
  evaluateField(fieldConfig, candidateData, expectedData) {
16102
- const { path: path52, match, required = true, weight = 1 } = fieldConfig;
16103
- const candidateValue = resolvePath(candidateData, path52);
16104
- const expectedValue = resolvePath(expectedData, path52);
16564
+ const { path: path53, match, required = true, weight = 1 } = fieldConfig;
16565
+ const candidateValue = resolvePath(candidateData, path53);
16566
+ const expectedValue = resolvePath(expectedData, path53);
16105
16567
  if (expectedValue === void 0) {
16106
16568
  return {
16107
- path: path52,
16569
+ path: path53,
16108
16570
  score: 1,
16109
16571
  // No expected value means no comparison needed
16110
16572
  weight,
16111
16573
  hit: true,
16112
- message: `${path52}: no expected value`
16574
+ message: `${path53}: no expected value`
16113
16575
  };
16114
16576
  }
16115
16577
  if (candidateValue === void 0) {
16116
16578
  if (required) {
16117
16579
  return {
16118
- path: path52,
16580
+ path: path53,
16119
16581
  score: 0,
16120
16582
  weight,
16121
16583
  hit: false,
16122
- message: `${path52} (required, missing)`
16584
+ message: `${path53} (required, missing)`
16123
16585
  };
16124
16586
  }
16125
16587
  return {
16126
- path: path52,
16588
+ path: path53,
16127
16589
  score: 1,
16128
16590
  // Don't penalize missing optional fields
16129
16591
  weight: 0,
16130
16592
  // Zero weight means it won't affect the score
16131
16593
  hit: true,
16132
- message: `${path52}: optional field missing`
16594
+ message: `${path53}: optional field missing`
16133
16595
  };
16134
16596
  }
16135
16597
  switch (match) {
16136
16598
  case "exact":
16137
- return this.compareExact(path52, candidateValue, expectedValue, weight);
16599
+ return this.compareExact(path53, candidateValue, expectedValue, weight);
16138
16600
  case "numeric_tolerance":
16139
16601
  return this.compareNumericTolerance(
16140
- path52,
16602
+ path53,
16141
16603
  candidateValue,
16142
16604
  expectedValue,
16143
16605
  fieldConfig,
16144
16606
  weight
16145
16607
  );
16146
16608
  case "date":
16147
- return this.compareDate(path52, candidateValue, expectedValue, fieldConfig, weight);
16609
+ return this.compareDate(path53, candidateValue, expectedValue, fieldConfig, weight);
16148
16610
  default:
16149
16611
  return {
16150
- path: path52,
16612
+ path: path53,
16151
16613
  score: 0,
16152
16614
  weight,
16153
16615
  hit: false,
16154
- message: `${path52}: unknown match type "${match}"`
16616
+ message: `${path53}: unknown match type "${match}"`
16155
16617
  };
16156
16618
  }
16157
16619
  }
16158
16620
  /**
16159
16621
  * Exact equality comparison.
16160
16622
  */
16161
- compareExact(path52, candidateValue, expectedValue, weight) {
16623
+ compareExact(path53, candidateValue, expectedValue, weight) {
16162
16624
  if (deepEqual(candidateValue, expectedValue)) {
16163
16625
  return {
16164
- path: path52,
16626
+ path: path53,
16165
16627
  score: 1,
16166
16628
  weight,
16167
16629
  hit: true,
16168
- message: path52
16630
+ message: path53
16169
16631
  };
16170
16632
  }
16171
16633
  if (typeof candidateValue !== typeof expectedValue) {
16172
16634
  return {
16173
- path: path52,
16635
+ path: path53,
16174
16636
  score: 0,
16175
16637
  weight,
16176
16638
  hit: false,
16177
- message: `${path52} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
16639
+ message: `${path53} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
16178
16640
  };
16179
16641
  }
16180
16642
  return {
16181
- path: path52,
16643
+ path: path53,
16182
16644
  score: 0,
16183
16645
  weight,
16184
16646
  hit: false,
16185
- message: `${path52} (value mismatch)`
16647
+ message: `${path53} (value mismatch)`
16186
16648
  };
16187
16649
  }
16188
16650
  /**
16189
16651
  * Numeric comparison with absolute or relative tolerance.
16190
16652
  */
16191
- compareNumericTolerance(path52, candidateValue, expectedValue, fieldConfig, weight) {
16653
+ compareNumericTolerance(path53, candidateValue, expectedValue, fieldConfig, weight) {
16192
16654
  const { tolerance = 0, relative = false } = fieldConfig;
16193
16655
  const candidateNum = toNumber(candidateValue);
16194
16656
  const expectedNum = toNumber(expectedValue);
16195
16657
  if (candidateNum === null || expectedNum === null) {
16196
16658
  return {
16197
- path: path52,
16659
+ path: path53,
16198
16660
  score: 0,
16199
16661
  weight,
16200
16662
  hit: false,
16201
- message: `${path52} (non-numeric value)`
16663
+ message: `${path53} (non-numeric value)`
16202
16664
  };
16203
16665
  }
16204
16666
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
16205
16667
  return {
16206
- path: path52,
16668
+ path: path53,
16207
16669
  score: 0,
16208
16670
  weight,
16209
16671
  hit: false,
16210
- message: `${path52} (invalid numeric value)`
16672
+ message: `${path53} (invalid numeric value)`
16211
16673
  };
16212
16674
  }
16213
16675
  const diff = Math.abs(candidateNum - expectedNum);
@@ -16220,61 +16682,61 @@ var FieldAccuracyEvaluator = class {
16220
16682
  }
16221
16683
  if (withinTolerance) {
16222
16684
  return {
16223
- path: path52,
16685
+ path: path53,
16224
16686
  score: 1,
16225
16687
  weight,
16226
16688
  hit: true,
16227
- message: `${path52} (within tolerance: diff=${diff.toFixed(2)})`
16689
+ message: `${path53} (within tolerance: diff=${diff.toFixed(2)})`
16228
16690
  };
16229
16691
  }
16230
16692
  return {
16231
- path: path52,
16693
+ path: path53,
16232
16694
  score: 0,
16233
16695
  weight,
16234
16696
  hit: false,
16235
- message: `${path52} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
16697
+ message: `${path53} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
16236
16698
  };
16237
16699
  }
16238
16700
  /**
16239
16701
  * Date comparison with format normalization.
16240
16702
  */
16241
- compareDate(path52, candidateValue, expectedValue, fieldConfig, weight) {
16703
+ compareDate(path53, candidateValue, expectedValue, fieldConfig, weight) {
16242
16704
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
16243
16705
  const candidateDate = parseDate(String(candidateValue), formats);
16244
16706
  const expectedDate = parseDate(String(expectedValue), formats);
16245
16707
  if (candidateDate === null) {
16246
16708
  return {
16247
- path: path52,
16709
+ path: path53,
16248
16710
  score: 0,
16249
16711
  weight,
16250
16712
  hit: false,
16251
- message: `${path52} (unparseable candidate date)`
16713
+ message: `${path53} (unparseable candidate date)`
16252
16714
  };
16253
16715
  }
16254
16716
  if (expectedDate === null) {
16255
16717
  return {
16256
- path: path52,
16718
+ path: path53,
16257
16719
  score: 0,
16258
16720
  weight,
16259
16721
  hit: false,
16260
- message: `${path52} (unparseable expected date)`
16722
+ message: `${path53} (unparseable expected date)`
16261
16723
  };
16262
16724
  }
16263
16725
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
16264
16726
  return {
16265
- path: path52,
16727
+ path: path53,
16266
16728
  score: 1,
16267
16729
  weight,
16268
16730
  hit: true,
16269
- message: path52
16731
+ message: path53
16270
16732
  };
16271
16733
  }
16272
16734
  return {
16273
- path: path52,
16735
+ path: path53,
16274
16736
  score: 0,
16275
16737
  weight,
16276
16738
  hit: false,
16277
- message: `${path52} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
16739
+ message: `${path53} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
16278
16740
  };
16279
16741
  }
16280
16742
  /**
@@ -16307,11 +16769,11 @@ var FieldAccuracyEvaluator = class {
16307
16769
  };
16308
16770
  }
16309
16771
  };
16310
- function resolvePath(obj, path52) {
16311
- if (!path52 || !obj) {
16772
+ function resolvePath(obj, path53) {
16773
+ if (!path53 || !obj) {
16312
16774
  return void 0;
16313
16775
  }
16314
- const parts = path52.split(/\.|\[|\]/).filter((p) => p.length > 0);
16776
+ const parts = path53.split(/\.|\[|\]/).filter((p) => p.length > 0);
16315
16777
  let current = obj;
16316
16778
  for (const part of parts) {
16317
16779
  if (current === null || current === void 0) {
@@ -16808,8 +17270,8 @@ var TokenUsageEvaluator = class {
16808
17270
 
16809
17271
  // src/evaluation/evaluators/tool-trajectory.ts
16810
17272
  init_cjs_shims();
16811
- function getNestedValue(obj, path52) {
16812
- const parts = path52.split(".");
17273
+ function getNestedValue(obj, path53) {
17274
+ const parts = path53.split(".");
16813
17275
  let current = obj;
16814
17276
  for (const part of parts) {
16815
17277
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -18602,7 +19064,7 @@ var WorkspacePoolManager = class {
18602
19064
  }
18603
19065
  /**
18604
19066
  * Reset an existing slot for reuse:
18605
- * 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
19067
+ * 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
18606
19068
  * 2. Re-copy template files (skip repo directories)
18607
19069
  */
18608
19070
  async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
@@ -18615,7 +19077,17 @@ var WorkspacePoolManager = class {
18615
19077
  continue;
18616
19078
  }
18617
19079
  const ref = repo.checkout?.ref ?? "HEAD";
18618
- await git(["reset", "--hard", ref], { cwd: repoDir });
19080
+ const resolve = repo.checkout?.resolve ?? "remote";
19081
+ if (resolve === "remote") {
19082
+ const fetchArgs = ["fetch", "origin", ref];
19083
+ if (repo.clone?.depth) {
19084
+ fetchArgs.splice(1, 0, "--depth", String(repo.clone.depth));
19085
+ }
19086
+ await git(fetchArgs, { cwd: repoDir });
19087
+ await git(["reset", "--hard", "FETCH_HEAD"], { cwd: repoDir });
19088
+ } else {
19089
+ await git(["reset", "--hard", ref], { cwd: repoDir });
19090
+ }
18619
19091
  const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
18620
19092
  await git(["clean", cleanFlag], { cwd: repoDir });
18621
19093
  }
@@ -18915,7 +19387,7 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
18915
19387
  }
18916
19388
 
18917
19389
  // src/evaluation/orchestrator.ts
18918
- function classifyQualityStatus(score, threshold = PASS_THRESHOLD) {
19390
+ function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
18919
19391
  return score >= threshold ? "ok" : "quality_failure";
18920
19392
  }
18921
19393
  function buildSkippedEvaluatorError(scores) {
@@ -19007,7 +19479,7 @@ async function runEvaluation(options) {
19007
19479
  const filteredEvalCases = filterEvalCases(evalCases, filter);
19008
19480
  if (filteredEvalCases.length === 0) {
19009
19481
  if (filter) {
19010
- throw new Error(`No tests matched filter '${filter}' in ${evalFilePath}`);
19482
+ throw new Error(`No tests matched filter '${formatFilter(filter)}' in ${evalFilePath}`);
19011
19483
  }
19012
19484
  return [];
19013
19485
  }
@@ -19059,6 +19531,9 @@ async function runEvaluation(options) {
19059
19531
  const graderName = targetContext.graderTarget ?? targetContext.name;
19060
19532
  const resolvedGrader = resolveTargetByName(graderName);
19061
19533
  if (!resolvedGrader) {
19534
+ if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
19535
+ return void 0;
19536
+ }
19062
19537
  return getOrCreateProvider(targetContext);
19063
19538
  }
19064
19539
  return getOrCreateProvider(resolvedGrader);
@@ -19389,7 +19864,7 @@ async function runEvaluation(options) {
19389
19864
  const budgetResult = {
19390
19865
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
19391
19866
  testId: evalCase.id,
19392
- dataset: evalCase.dataset,
19867
+ suite: evalCase.suite,
19393
19868
  category: evalCase.category,
19394
19869
  score: 0,
19395
19870
  assertions: [],
@@ -19426,7 +19901,7 @@ async function runEvaluation(options) {
19426
19901
  const haltResult = {
19427
19902
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
19428
19903
  testId: evalCase.id,
19429
- dataset: evalCase.dataset,
19904
+ suite: evalCase.suite,
19430
19905
  category: evalCase.category,
19431
19906
  score: 0,
19432
19907
  assertions: [],
@@ -19738,7 +20213,7 @@ async function runBatchEvaluation(options) {
19738
20213
  targetResolver,
19739
20214
  availableTargets,
19740
20215
  verbose,
19741
- threshold: batchThreshold
20216
+ threshold: evalCase.threshold ?? batchThreshold
19742
20217
  });
19743
20218
  if (providerError) {
19744
20219
  result = {
@@ -20200,8 +20675,9 @@ async function runEvalCase(options) {
20200
20675
  fileChanges,
20201
20676
  workspacePath,
20202
20677
  verbose,
20203
- threshold: caseThreshold
20678
+ threshold: evalCase.threshold ?? caseThreshold
20204
20679
  });
20680
+ const effectiveThreshold = evalCase.threshold ?? caseThreshold;
20205
20681
  const totalDurationMs = Date.now() - caseStartMs;
20206
20682
  const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
20207
20683
  const evalRunTokenUsage = tokenUsage || graderTokens ? {
@@ -20215,7 +20691,7 @@ async function runEvalCase(options) {
20215
20691
  ...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
20216
20692
  };
20217
20693
  const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
20218
- const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, caseThreshold);
20694
+ const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, effectiveThreshold);
20219
20695
  const targetUsedField = targetUsed ? { targetUsed } : {};
20220
20696
  const finalResult = providerError ? {
20221
20697
  ...result,
@@ -20416,7 +20892,8 @@ async function evaluateCandidate(options) {
20416
20892
  targetResolver,
20417
20893
  availableTargets,
20418
20894
  fileChanges,
20419
- workspacePath
20895
+ workspacePath,
20896
+ threshold: evalThreshold
20420
20897
  });
20421
20898
  const completedAt = nowFn();
20422
20899
  let agentRequest;
@@ -20447,7 +20924,7 @@ async function evaluateCandidate(options) {
20447
20924
  return {
20448
20925
  timestamp: completedAt.toISOString(),
20449
20926
  testId: evalCase.id,
20450
- dataset: evalCase.dataset,
20927
+ suite: evalCase.suite,
20451
20928
  category: evalCase.category,
20452
20929
  conversationId: evalCase.conversation_id,
20453
20930
  score: score.score,
@@ -20490,7 +20967,8 @@ async function runEvaluatorsForCase(options) {
20490
20967
  targetResolver,
20491
20968
  availableTargets,
20492
20969
  fileChanges,
20493
- workspacePath
20970
+ workspacePath,
20971
+ threshold
20494
20972
  } = options;
20495
20973
  if (evalCase.assertions && evalCase.assertions.length > 0) {
20496
20974
  return runEvaluatorList({
@@ -20516,7 +20994,8 @@ async function runEvaluatorsForCase(options) {
20516
20994
  targetResolver,
20517
20995
  availableTargets,
20518
20996
  fileChanges,
20519
- workspacePath
20997
+ workspacePath,
20998
+ threshold
20520
20999
  });
20521
21000
  }
20522
21001
  const evaluatorKind = evalCase.evaluator ?? "llm-grader";
@@ -20618,7 +21097,8 @@ async function runEvaluatorList(options) {
20618
21097
  name: evaluatorConfig.name,
20619
21098
  type: evaluatorConfig.type,
20620
21099
  weight,
20621
- ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
21100
+ ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
21101
+ ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
20622
21102
  });
20623
21103
  scores.push({
20624
21104
  name: evaluatorConfig.name,
@@ -20653,7 +21133,8 @@ async function runEvaluatorList(options) {
20653
21133
  name: evaluatorConfig.name ?? "unknown",
20654
21134
  type: evaluatorConfig.type ?? "llm-grader",
20655
21135
  weight,
20656
- ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
21136
+ ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
21137
+ ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
20657
21138
  });
20658
21139
  scores.push({
20659
21140
  name: evaluatorConfig.name ?? "unknown",
@@ -20687,9 +21168,10 @@ async function runEvaluatorList(options) {
20687
21168
  }
20688
21169
  }
20689
21170
  }
21171
+ const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD;
20690
21172
  const hasRequiredFailure = scored.some((entry) => {
20691
21173
  if (!entry.required) return false;
20692
- const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
21174
+ const minScore = entry.min_score ?? (typeof entry.required === "number" ? entry.required : effectiveThreshold);
20693
21175
  return entry.score.score < minScore;
20694
21176
  });
20695
21177
  const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
@@ -20700,17 +21182,23 @@ async function runEvaluatorList(options) {
20700
21182
  const expectedAspectCount = assertions.length || 1;
20701
21183
  const score = {
20702
21184
  score: aggregateScore,
20703
- verdict: scoreToVerdict(aggregateScore),
21185
+ verdict: scoreToVerdict(aggregateScore, effectiveThreshold),
20704
21186
  assertions,
20705
21187
  expectedAspectCount
20706
21188
  };
20707
21189
  return { score, scores };
20708
21190
  }
/**
 * Render a test filter for display in messages.
 *
 * @param {string | string[]} filter - A single pattern, or a list of patterns.
 * @returns {string} The pattern itself, or the list joined with ", ".
 */
function formatFilter(filter) {
  if (typeof filter === "string") {
    return filter;
  }
  return filter.join(", ");
}
/**
 * Test an id against a filter using micromatch's `isMatch`.
 *
 * @param {string} id - The identifier to test.
 * @param {string | string[]} filter - A single pattern, or a list of patterns.
 * @returns {boolean} For a string filter, the result of matching that pattern;
 *   for a list, true when at least one pattern matches.
 */
function matchesFilter3(id, filter) {
  if (typeof filter === "string") {
    return import_micromatch3.default.isMatch(id, filter);
  }
  return filter.some((pattern) => import_micromatch3.default.isMatch(id, pattern));
}
20709
21197
  function filterEvalCases(evalCases, filter) {
20710
21198
  if (!filter) {
20711
21199
  return evalCases;
20712
21200
  }
20713
- return evalCases.filter((evalCase) => import_micromatch3.default.isMatch(evalCase.id, filter));
21201
+ return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter));
20714
21202
  }
20715
21203
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
20716
21204
  const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
@@ -20797,7 +21285,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
20797
21285
  return {
20798
21286
  timestamp: timestamp.toISOString(),
20799
21287
  testId: evalCase.id,
20800
- dataset: evalCase.dataset,
21288
+ suite: evalCase.suite,
20801
21289
  category: evalCase.category,
20802
21290
  conversationId: evalCase.conversation_id,
20803
21291
  score: 0,
@@ -21071,6 +21559,7 @@ async function evaluate(config) {
21071
21559
  verbose: config.verbose,
21072
21560
  maxConcurrency: config.workers ?? 3,
21073
21561
  filter: config.filter,
21562
+ threshold: config.threshold,
21074
21563
  evalCases,
21075
21564
  onResult: async (result) => {
21076
21565
  collectedResults.push(result);
@@ -21081,19 +21570,19 @@ async function evaluate(config) {
21081
21570
  const durationMs = Date.now() - startTime;
21082
21571
  return {
21083
21572
  results: allResults,
21084
- summary: computeSummary(allResults, durationMs)
21573
+ summary: computeSummary(allResults, durationMs, config.threshold)
21085
21574
  };
21086
21575
  }
21087
21576
  function mapAssertionType(type) {
21088
21577
  return type.replace(/_/g, "-");
21089
21578
  }
21090
- function computeSummary(results, durationMs) {
21579
+ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
21091
21580
  const total = results.length;
21092
21581
  let passed = 0;
21093
21582
  let scoreSum = 0;
21094
21583
  for (const r of results) {
21095
21584
  scoreSum += r.score;
21096
- if (r.score >= PASS_THRESHOLD) {
21585
+ if (r.score >= threshold) {
21097
21586
  passed++;
21098
21587
  }
21099
21588
  }
@@ -21207,7 +21696,7 @@ var CONFIG_FILE_NAMES = [
21207
21696
  ];
21208
21697
  async function loadTsConfig(projectRoot) {
21209
21698
  const { existsSync: existsSync7 } = await import("fs");
21210
- const { pathToFileURL } = await import("url");
21699
+ const { pathToFileURL: pathToFileURL2 } = await import("url");
21211
21700
  const { join: join2 } = await import("path");
21212
21701
  for (const fileName of CONFIG_FILE_NAMES) {
21213
21702
  const filePath = join2(projectRoot, fileName);
@@ -21215,7 +21704,7 @@ async function loadTsConfig(projectRoot) {
21215
21704
  continue;
21216
21705
  }
21217
21706
  try {
21218
- const fileUrl = pathToFileURL(filePath).href;
21707
+ const fileUrl = pathToFileURL2(filePath).href;
21219
21708
  const mod = await import(fileUrl);
21220
21709
  const config = mod.default ?? mod;
21221
21710
  return AgentVConfigSchema.parse(config);
@@ -21656,7 +22145,7 @@ var OtelTraceExporter = class {
21656
22145
  rootSpan.setAttribute("gen_ai.system", "agentv");
21657
22146
  rootSpan.setAttribute("agentv.test_id", result.testId);
21658
22147
  rootSpan.setAttribute("agentv.target", result.target);
21659
- if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
22148
+ if (result.suite) rootSpan.setAttribute("agentv.suite", result.suite);
21660
22149
  rootSpan.setAttribute("agentv.score", result.score);
21661
22150
  if (captureContent && result.output.length > 0) {
21662
22151
  const lastMsg = result.output[result.output.length - 1];
@@ -21865,7 +22354,7 @@ var OtelStreamingObserver = class {
21865
22354
  this.rootSpan.setAttribute("gen_ai.system", "agentv");
21866
22355
  this.rootSpan.setAttribute("agentv.test_id", testId);
21867
22356
  this.rootSpan.setAttribute("agentv.target", target);
21868
- if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
22357
+ if (evalSet) this.rootSpan.setAttribute("agentv.suite", evalSet);
21869
22358
  this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
21870
22359
  }
21871
22360
  /** Create and immediately export a tool span */
@@ -22221,12 +22710,244 @@ function extractToolResultContent(content) {
22221
22710
  return parts.length > 0 ? parts.join("") : void 0;
22222
22711
  }
22223
22712
 
22224
- // src/import/session-discovery.ts
22713
+ // src/import/codex-parser.ts
22714
+ init_cjs_shims();
/**
 * Parse the JSONL text of a Codex session rollout into a transcript entry.
 *
 * Every non-empty line is parsed as JSON. Lines that are not valid JSON, that do
 * not parse to an object, or that have no `type` are skipped. Recognised entry
 * types:
 *   - `session_meta`:  supplies the session id, cwd, CLI version and (first seen) model.
 *   - `turn_context`:  supplies model / cwd only when they are not already known.
 *   - `response_item`: `message` items become user/assistant messages (developer
 *                      messages are dropped); `function_call` / `custom_tool_call`
 *                      items become assistant messages carrying one tool call;
 *                      `*_output` items are attached to the matching call by `call_id`;
 *                      `reasoning` and unknown item types are ignored.
 *
 * @param {string} jsonl - Full text of the rollout file, one JSON document per line.
 * @returns {{messages: object[], source: object, tokenUsage: undefined, durationMs: (number|undefined), costUsd: null}}
 *   `durationMs` is the span between the first and last entry timestamps, or
 *   `undefined` when no timestamps were seen or they cannot be parsed as dates.
 */
function parseCodexSession(jsonl) {
  const messages = [];
  // call_id -> position of the tool call still waiting for its output.
  const pendingCalls = new Map();
  let sessionId = "";
  let cwd;
  let model;
  let version;
  let startTimestamp;
  let endTimestamp;

  // Tool arguments may be a JSON-encoded string; fall back to the raw value
  // when they are not a string or are not valid JSON.
  const decodeToolInput = (raw) => {
    if (typeof raw !== "string") return raw;
    try {
      return JSON.parse(raw);
    } catch {
      return raw;
    }
  };

  for (const line of jsonl.split("\n")) {
    if (line.trim().length === 0) continue;

    let entry;
    try {
      entry = JSON.parse(line);
    } catch {
      continue;
    }
    // JSON.parse can legitimately return null or a primitive (e.g. a line that is
    // just `null`); reading `.type` off null would throw and abort the whole parse.
    if (entry === null || typeof entry !== "object" || !entry.type) continue;

    if (entry.timestamp) {
      if (!startTimestamp) startTimestamp = entry.timestamp;
      endTimestamp = entry.timestamp;
    }

    const payload = entry.payload ?? {};
    switch (entry.type) {
      case "session_meta": {
        sessionId = String(payload.id ?? "");
        cwd = payload.cwd ? String(payload.cwd) : void 0;
        version = payload.cli_version ? String(payload.cli_version) : void 0;
        if (payload.model && !model) {
          model = String(payload.model);
        }
        break;
      }
      case "turn_context": {
        if (payload.model && !model) {
          model = String(payload.model);
        }
        if (payload.cwd && !cwd) {
          cwd = String(payload.cwd);
        }
        break;
      }
      case "response_item": {
        const itemType = String(payload.type ?? "");
        const role = String(payload.role ?? "");
        switch (itemType) {
          case "message": {
            if (role === "developer") break;
            const content = extractResponseItemContent(payload.content);
            if (content && (role === "user" || role === "assistant")) {
              messages.push({ role, content });
            }
            break;
          }
          case "function_call":
          case "custom_tool_call": {
            const callId = String(payload.call_id ?? "");
            // NOTE(review): custom tool calls appear to carry their payload in
            // `input` rather than `arguments` — confirm against the Codex rollout
            // schema. `arguments` still takes precedence whenever it is present.
            const toolCall = {
              tool: String(payload.name ?? ""),
              input: decodeToolInput(payload.arguments ?? payload.input),
              id: callId
            };
            const msgIdx = messages.length;
            messages.push({ role: "assistant", toolCalls: [toolCall] });
            if (callId) {
              pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
            }
            break;
          }
          case "function_call_output":
          case "custom_tool_call_output": {
            const callId = String(payload.call_id ?? "");
            const pending = pendingCalls.get(callId);
            if (pending) {
              // Replace the message and its call list with copies rather than
              // mutating the objects that were pushed earlier.
              const existingMsg = messages[pending.msgIdx];
              const existingCalls = [...existingMsg.toolCalls ?? []];
              existingCalls[pending.toolIdx] = {
                ...existingCalls[pending.toolIdx],
                output: payload.output
              };
              messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls };
              pendingCalls.delete(callId);
            }
            break;
          }
          // Skip reasoning blocks (thinking tokens)
          case "reasoning":
            break;
        }
        break;
      }
    }
  }

  let durationMs;
  if (startTimestamp && endTimestamp) {
    const elapsed = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
    // Unparseable timestamps produce NaN; report "unknown" instead of NaN.
    durationMs = Number.isFinite(elapsed) ? elapsed : void 0;
  }

  const source = {
    provider: "codex",
    sessionId,
    cwd,
    startedAt: startTimestamp,
    model,
    version
  };
  return {
    messages,
    source,
    // Codex rollout files don't include token counts (only rate limit info)
    tokenUsage: void 0,
    durationMs,
    costUsd: null
  };
}
/**
 * Flatten the `content` of a response item into a single string.
 *
 * @param {unknown} content - A plain string, or an array of content blocks.
 * @returns {string | undefined} The string itself; for an array, the concatenation
 *   of every block's string `text` property; `undefined` when `content` is neither
 *   a string nor an array, or when no block carries a string `text`.
 */
function extractResponseItemContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  // Keep only object blocks that expose a string `text`; everything else is ignored.
  const texts = content
    .filter((block) => typeof block === "object" && block !== null && typeof block.text === "string")
    .map((block) => block.text);
  if (texts.length === 0) {
    return void 0;
  }
  return texts.join("");
}
22878
+
22879
+ // src/import/codex-session-discovery.ts
22225
22880
  init_cjs_shims();
22226
22881
  var import_promises36 = require("fs/promises");
22227
22882
  var import_node_os8 = require("os");
22228
22883
  var import_node_path53 = __toESM(require("path"), 1);
22229
- var DEFAULT_PROJECTS_DIR = () => import_node_path53.default.join((0, import_node_os8.homedir)(), ".claude", "projects");
22884
+ var DEFAULT_SESSIONS_DIR = () => import_node_path53.default.join((0, import_node_os8.homedir)(), ".codex", "sessions");
/**
 * Find Codex rollout files under a sessions directory laid out as
 * `<sessionsDir>/<year>/<month>/<day>/rollout-*.jsonl`.
 *
 * @param {object} [opts]
 * @param {string} [opts.sessionsDir] - Root to scan; defaults to `DEFAULT_SESSIONS_DIR()`.
 * @param {boolean} [opts.latest] - When truthy, return at most one session.
 * @param {number} [opts.limit] - Maximum number of sessions (default 10) when `latest` is not set.
 * @param {string} [opts.date] - Only scan the day directory whose `year-month-day` equals this string.
 * @returns {Promise<Array<{sessionId: string, filePath: string, filename: string, updatedAt: Date}>>}
 *   Sessions sorted by modification time, newest first, truncated to the limit.
 *   An unreadable root yields an empty list; unreadable sub-directories are skipped.
 */
async function discoverCodexSessions(opts) {
  const { readdir, stat } = import_promises36;
  const joinPath = (...segments) => import_node_path53.default.join(...segments);

  // Directory listing that reports failure as null instead of throwing.
  const listDir = async (dir) => {
    try {
      return await readdir(dir);
    } catch {
      return null;
    }
  };

  const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
  const limit = opts?.latest ? 1 : (opts?.limit ?? 10);

  const yearDirs = await listDir(sessionsDir);
  if (yearDirs === null) {
    return [];
  }

  const found = [];
  for (const year of yearDirs) {
    const yearPath = joinPath(sessionsDir, year);
    const monthDirs = await listDir(yearPath);
    if (monthDirs === null) continue;

    for (const month of monthDirs) {
      const monthPath = joinPath(yearPath, month);
      const dayDirs = await listDir(monthPath);
      if (dayDirs === null) continue;

      for (const day of dayDirs) {
        if (opts?.date && `${year}-${month}-${day}` !== opts.date) continue;

        const dayPath = joinPath(monthPath, day);
        const files = await listDir(dayPath);
        if (files === null) continue;

        for (const file of files) {
          const isRollout = file.startsWith("rollout-") && file.endsWith(".jsonl");
          if (!isRollout) continue;

          const filePath = joinPath(dayPath, file);
          const stem = file.replace(/\.jsonl$/, "");
          const segments = stem.split("-");
          // With six or more dash-separated segments, the last five form the id;
          // otherwise the whole stem is used.
          const sessionId = segments.length >= 6 ? segments.slice(-5).join("-") : stem;

          let updatedAt;
          try {
            updatedAt = (await stat(filePath)).mtime;
          } catch {
            // A file that cannot be stat'ed gets the epoch as its mtime.
            updatedAt = new Date(0);
          }
          found.push({ sessionId, filePath, filename: file, updatedAt });
        }
      }
    }
  }

  found.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
  return found.slice(0, limit);
}
22944
+
22945
+ // src/import/session-discovery.ts
22946
+ init_cjs_shims();
22947
+ var import_promises37 = require("fs/promises");
22948
+ var import_node_os9 = require("os");
22949
+ var import_node_path54 = __toESM(require("path"), 1);
22950
+ var DEFAULT_PROJECTS_DIR = () => import_node_path54.default.join((0, import_node_os9.homedir)(), ".claude", "projects");
22230
22951
  function encodeProjectPath(projectPath) {
22231
22952
  return projectPath.replace(/\//g, "-");
22232
22953
  }
@@ -22235,7 +22956,7 @@ async function discoverClaudeSessions(opts) {
22235
22956
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;
22236
22957
  let projectDirs;
22237
22958
  try {
22238
- projectDirs = await (0, import_promises36.readdir)(projectsDir);
22959
+ projectDirs = await (0, import_promises37.readdir)(projectsDir);
22239
22960
  } catch {
22240
22961
  return [];
22241
22962
  }
@@ -22245,10 +22966,10 @@ async function discoverClaudeSessions(opts) {
22245
22966
  }
22246
22967
  const sessions = [];
22247
22968
  for (const projectDir of projectDirs) {
22248
- const dirPath = import_node_path53.default.join(projectsDir, projectDir);
22969
+ const dirPath = import_node_path54.default.join(projectsDir, projectDir);
22249
22970
  let entries;
22250
22971
  try {
22251
- entries = await (0, import_promises36.readdir)(dirPath);
22972
+ entries = await (0, import_promises37.readdir)(dirPath);
22252
22973
  } catch {
22253
22974
  continue;
22254
22975
  }
@@ -22256,10 +22977,10 @@ async function discoverClaudeSessions(opts) {
22256
22977
  if (!entry.endsWith(".jsonl")) continue;
22257
22978
  const sessionId = entry.replace(/\.jsonl$/, "");
22258
22979
  if (opts?.sessionId && sessionId !== opts.sessionId) continue;
22259
- const filePath = import_node_path53.default.join(dirPath, entry);
22980
+ const filePath = import_node_path54.default.join(dirPath, entry);
22260
22981
  let updatedAt;
22261
22982
  try {
22262
- const fileStat = await (0, import_promises36.stat)(filePath);
22983
+ const fileStat = await (0, import_promises37.stat)(filePath);
22263
22984
  updatedAt = fileStat.mtime;
22264
22985
  } catch {
22265
22986
  updatedAt = /* @__PURE__ */ new Date(0);
@@ -22276,13 +22997,91 @@ async function discoverClaudeSessions(opts) {
22276
22997
  return sessions.slice(0, limit);
22277
22998
  }
22278
22999
 
23000
+ // src/import/transcript-provider.ts
23001
+ init_cjs_shims();
23002
+
22279
23003
  // src/import/types.ts
22280
23004
  init_cjs_shims();
22281
- var import_promises37 = require("fs/promises");
23005
+ var import_promises38 = require("fs/promises");
23006
/**
 * Maps a transcript entry (camelCase fields) onto a snake_case plain object.
 *
 * - `input` is the content of the first message whose role is "user", but only
 *   when that content is a string; otherwise it is the empty string.
 * - `output` is the entry's full message array (same reference, not a copy).
 * - `token_usage` is copied field-by-field when `entry.tokenUsage` is truthy,
 *   and is `undefined` otherwise.
 * - `source.cwd` falls back to `entry.source.projectPath` when `cwd` is nullish.
 *
 * @param {object} entry - Transcript entry with `messages`, `source`, and
 *   optional `tokenUsage`, `durationMs`, `costUsd`.
 * @returns {object} The snake_case record.
 */
function toTranscriptJsonLine(entry) {
  const { messages, tokenUsage, source } = entry;

  const openingUserTurn = messages.find((message) => message.role === "user");
  const openingContent = openingUserTurn?.content;

  let tokenUsageRecord;
  if (tokenUsage) {
    tokenUsageRecord = {
      input: tokenUsage.input,
      output: tokenUsage.output,
      cached: tokenUsage.cached
    };
  }

  const sourceRecord = {
    provider: source.provider,
    session_id: source.sessionId,
    model: source.model,
    timestamp: source.startedAt,
    git_branch: source.gitBranch,
    cwd: source.cwd ?? source.projectPath,
    version: source.version
  };

  return {
    input: typeof openingContent === "string" ? openingContent : "",
    output: messages,
    token_usage: tokenUsageRecord,
    duration_ms: entry.durationMs,
    cost_usd: entry.costUsd,
    source: sourceRecord
  };
}
23030
/**
 * Reads a JSONL file and parses every non-blank line as JSON.
 *
 * Blank / whitespace-only lines are skipped. A leading UTF-8 byte-order mark is
 * removed before parsing, because `readFile(..., "utf8")` keeps it in the
 * string and `JSON.parse` rejects it.
 *
 * @param {string} filePath - Path of the JSONL file to read.
 * @returns {Promise<object[]>} Parsed values, in file order.
 * @throws {SyntaxError} When a line is not valid JSON; the message names the
 *   file and the 1-based line number, and the original error is attached as
 *   `cause`.
 * @throws Whatever `readFile` rejects with (e.g. a missing file).
 */
async function readTranscriptJsonl(filePath) {
  const rawText = await (0, import_promises38.readFile)(filePath, "utf8");
  // Drop a leading BOM so the first record parses.
  const text = rawText.charCodeAt(0) === 0xfeff ? rawText.slice(1) : rawText;

  const records = [];
  const physicalLines = text.split("\n");
  for (let index = 0; index < physicalLines.length; index++) {
    const line = physicalLines[index];
    if (line.trim().length === 0) continue;
    try {
      records.push(JSON.parse(line));
    } catch (error) {
      // Keep the SyntaxError type callers may already catch, but say where it happened.
      const detail = error instanceof Error ? error.message : String(error);
      const wrapped = new SyntaxError(
        `Invalid JSON in transcript ${filePath} at line ${index + 1}: ${detail}`,
        { cause: error }
      );
      throw wrapped;
    }
  }
  return records;
}
22282
23034
/**
 * Reads a file and resolves with its full contents decoded as UTF-8.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<string>} The file contents.
 */
async function readTranscriptFile(filePath) {
  const contents = await (0, import_promises38.readFile)(filePath, "utf8");
  return contents;
}
22285
23037
 
23038
+ // src/import/transcript-provider.ts
23039
/**
 * Provider that serves pre-recorded transcript lines in order instead of
 * producing new output: each `invoke` call returns the next line and advances
 * an internal cursor.
 */
var TranscriptProvider = class _TranscriptProvider {
  id;
  kind = "transcript";
  targetName;
  lines;
  cursor = 0;
  /**
   * @param {string} targetName - Name used for `targetName` and to build `id`
   *   as `transcript:<targetName>`.
   * @param {object[]} lines - Parsed transcript records, consumed in order.
   */
  constructor(targetName, lines) {
    this.targetName = targetName;
    this.id = `transcript:${targetName}`;
    this.lines = lines;
  }
  /**
   * Create a TranscriptProvider from a JSONL file path.
   *
   * The provider name is taken from the first record's `source.provider`,
   * falling back to "transcript" when the provider (or the whole `source`
   * object) is absent.
   *
   * @param {string} filePath - Path of the transcript JSONL file.
   * @returns {Promise<_TranscriptProvider>} Provider loaded with the file's records.
   * @throws {Error} When the file contains no records.
   */
  static async fromFile(filePath) {
    const lines = await readTranscriptJsonl(filePath);
    if (lines.length === 0) {
      throw new Error(`Transcript file is empty: ${filePath}`);
    }
    // `source` is optional-chained so a record without it uses the fallback
    // name instead of failing with an opaque TypeError.
    const providerName = lines[0].source?.provider ?? "transcript";
    return new _TranscriptProvider(providerName, lines);
  }
  /** Number of transcript records held by this provider. */
  get lineCount() {
    return this.lines.length;
  }
  /**
   * Returns the next transcript record as a provider response and advances the
   * cursor. The request argument is ignored.
   *
   * @param {object} _request - Unused.
   * @returns {Promise<object>} Object with `output`, `tokenUsage`, `durationMs`,
   *   `costUsd` (nullish values become `undefined`) and `startTime`.
   * @throws {Error} When every record has already been consumed.
   */
  async invoke(_request) {
    if (this.cursor >= this.lines.length) {
      throw new Error(
        `Transcript exhausted: ${this.lines.length} line(s) available but ${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`
      );
    }
    const line = this.lines[this.cursor++];
    return {
      output: line.output,
      tokenUsage: line.token_usage ? {
        input: line.token_usage.input,
        output: line.token_usage.output,
        cached: line.token_usage.cached
      } : void 0,
      durationMs: line.duration_ms,
      costUsd: line.cost_usd ?? void 0,
      // A record without `source` yields an undefined start time rather than a crash.
      startTime: line.source?.timestamp
    };
  }
};
23084
+
22286
23085
  // src/index.ts
22287
23086
  function createAgentKernel() {
22288
23087
  return { status: "stub" };
@@ -22297,6 +23096,7 @@ function createAgentKernel() {
22297
23096
  DEFAULT_EVALUATOR_TEMPLATE,
22298
23097
  DEFAULT_EVAL_PATTERNS,
22299
23098
  DEFAULT_EXPLORATION_TOOLS,
23099
+ DEFAULT_THRESHOLD,
22300
23100
  DeterministicAssertionEvaluator,
22301
23101
  EvaluatorRegistry,
22302
23102
  ExecutionMetricsEvaluator,
@@ -22318,6 +23118,7 @@ function createAgentKernel() {
22318
23118
  TemplateNotFoundError,
22319
23119
  TokenUsageEvaluator,
22320
23120
  ToolTrajectoryEvaluator,
23121
+ TranscriptProvider,
22321
23122
  WorkspaceCreationError,
22322
23123
  WorkspacePoolManager,
22323
23124
  addProject,
@@ -22354,6 +23155,7 @@ function createAgentKernel() {
22354
23155
  detectFormat,
22355
23156
  discoverAssertions,
22356
23157
  discoverClaudeSessions,
23158
+ discoverCodexSessions,
22357
23159
  discoverCopilotSessions,
22358
23160
  discoverGraders,
22359
23161
  discoverJudges,
@@ -22414,6 +23216,8 @@ function createAgentKernel() {
22414
23216
  normalizeLineEndings,
22415
23217
  parseAgentSkillsEvals,
22416
23218
  parseClaudeSession,
23219
+ parseCodexSession,
23220
+ parseCopilotEvents,
22417
23221
  parseJsonFromText,
22418
23222
  parseJsonSafe,
22419
23223
  readJsonFile,
@@ -22421,6 +23225,7 @@ function createAgentKernel() {
22421
23225
  readTestSuiteMetadata,
22422
23226
  readTextFile,
22423
23227
  readTranscriptFile,
23228
+ readTranscriptJsonl,
22424
23229
  removeProject,
22425
23230
  resolveAndCreateProvider,
22426
23231
  resolveDelegatedTargetDefinition,
@@ -22453,6 +23258,7 @@ function createAgentKernel() {
22453
23258
  substituteVariables,
22454
23259
  toCamelCaseDeep,
22455
23260
  toSnakeCaseDeep,
23261
+ toTranscriptJsonLine,
22456
23262
  tokensPerTool,
22457
23263
  touchProject,
22458
23264
  transpileEvalYaml,