@agentv/core 4.6.0 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -31,12 +31,9 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
31
31
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
32
32
 
33
33
  // ../../node_modules/.bun/tsup@8.3.5+19811ebab77a7b1c/node_modules/tsup/assets/cjs_shims.js
34
- var getImportMetaUrl, importMetaUrl;
35
34
  var init_cjs_shims = __esm({
36
35
  "../../node_modules/.bun/tsup@8.3.5+19811ebab77a7b1c/node_modules/tsup/assets/cjs_shims.js"() {
37
36
  "use strict";
38
- getImportMetaUrl = () => typeof document === "undefined" ? new URL(`file:${__filename}`).href : document.currentScript && document.currentScript.src || new URL("main.js", document.baseURI).href;
39
- importMetaUrl = /* @__PURE__ */ getImportMetaUrl();
40
37
  }
41
38
  });
42
39
 
@@ -1435,6 +1432,7 @@ __export(index_exports, {
1435
1432
  DEFAULT_EVALUATOR_TEMPLATE: () => DEFAULT_EVALUATOR_TEMPLATE,
1436
1433
  DEFAULT_EVAL_PATTERNS: () => DEFAULT_EVAL_PATTERNS,
1437
1434
  DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
1435
+ DEFAULT_THRESHOLD: () => DEFAULT_THRESHOLD,
1438
1436
  DeterministicAssertionEvaluator: () => DeterministicAssertionEvaluator,
1439
1437
  EvaluatorRegistry: () => EvaluatorRegistry,
1440
1438
  ExecutionMetricsEvaluator: () => ExecutionMetricsEvaluator,
@@ -1456,6 +1454,7 @@ __export(index_exports, {
1456
1454
  TemplateNotFoundError: () => TemplateNotFoundError,
1457
1455
  TokenUsageEvaluator: () => TokenUsageEvaluator,
1458
1456
  ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
1457
+ TranscriptProvider: () => TranscriptProvider,
1459
1458
  WorkspaceCreationError: () => WorkspaceCreationError,
1460
1459
  WorkspacePoolManager: () => WorkspacePoolManager,
1461
1460
  addProject: () => addProject,
@@ -1492,6 +1491,7 @@ __export(index_exports, {
1492
1491
  detectFormat: () => detectFormat,
1493
1492
  discoverAssertions: () => discoverAssertions,
1494
1493
  discoverClaudeSessions: () => discoverClaudeSessions,
1494
+ discoverCodexSessions: () => discoverCodexSessions,
1495
1495
  discoverCopilotSessions: () => discoverCopilotSessions,
1496
1496
  discoverGraders: () => discoverGraders,
1497
1497
  discoverJudges: () => discoverGraders,
@@ -1552,6 +1552,8 @@ __export(index_exports, {
1552
1552
  normalizeLineEndings: () => normalizeLineEndings,
1553
1553
  parseAgentSkillsEvals: () => parseAgentSkillsEvals,
1554
1554
  parseClaudeSession: () => parseClaudeSession,
1555
+ parseCodexSession: () => parseCodexSession,
1556
+ parseCopilotEvents: () => parseCopilotEvents,
1555
1557
  parseJsonFromText: () => parseJsonFromText,
1556
1558
  parseJsonSafe: () => parseJsonSafe,
1557
1559
  readJsonFile: () => readJsonFile,
@@ -1559,8 +1561,10 @@ __export(index_exports, {
1559
1561
  readTestSuiteMetadata: () => readTestSuiteMetadata,
1560
1562
  readTextFile: () => readTextFile,
1561
1563
  readTranscriptFile: () => readTranscriptFile,
1564
+ readTranscriptJsonl: () => readTranscriptJsonl,
1562
1565
  removeProject: () => removeProject,
1563
1566
  resolveAndCreateProvider: () => resolveAndCreateProvider,
1567
+ resolveDelegatedTargetDefinition: () => resolveDelegatedTargetDefinition,
1564
1568
  resolveFileReference: () => resolveFileReference3,
1565
1569
  resolveTargetDefinition: () => resolveTargetDefinition,
1566
1570
  resolveWorkspaceTemplate: () => resolveWorkspaceTemplate,
@@ -1590,6 +1594,7 @@ __export(index_exports, {
1590
1594
  substituteVariables: () => substituteVariables,
1591
1595
  toCamelCaseDeep: () => toCamelCaseDeep,
1592
1596
  toSnakeCaseDeep: () => toSnakeCaseDeep,
1597
+ toTranscriptJsonLine: () => toTranscriptJsonLine,
1593
1598
  tokensPerTool: () => tokensPerTool,
1594
1599
  touchProject: () => touchProject,
1595
1600
  transpileEvalYaml: () => transpileEvalYaml,
@@ -2674,8 +2679,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2674
2679
  const negate = rawEvaluator.negate === true ? true : void 0;
2675
2680
  if (isCustomType) {
2676
2681
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
2677
- const required2 = parseRequired(rawEvaluator.required);
2678
- const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "negate"]);
2682
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
2683
+ rawEvaluator.required,
2684
+ rawEvaluator.min_score,
2685
+ name,
2686
+ evalId
2687
+ );
2688
+ const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "min_score", "negate"]);
2679
2689
  const config2 = {};
2680
2690
  for (const [key, value] of Object.entries(rawEvaluator)) {
2681
2691
  if (!knownProps2.has(key) && value !== void 0) {
@@ -2687,6 +2697,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2687
2697
  type: customTypeName,
2688
2698
  ...weight2 !== void 0 ? { weight: weight2 } : {},
2689
2699
  ...required2 !== void 0 ? { required: required2 } : {},
2700
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
2690
2701
  ...negate !== void 0 ? { negate } : {},
2691
2702
  ...Object.keys(config2).length > 0 ? { config: config2 } : {}
2692
2703
  });
@@ -2756,7 +2767,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2756
2767
  );
2757
2768
  }
2758
2769
  }
2759
- const required2 = parseRequired(rawEvaluator.required);
2770
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
2771
+ rawEvaluator.required,
2772
+ rawEvaluator.min_score,
2773
+ name,
2774
+ evalId
2775
+ );
2760
2776
  const knownProps2 = /* @__PURE__ */ new Set([
2761
2777
  "name",
2762
2778
  "type",
@@ -2782,6 +2798,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2782
2798
  resolvedCwd,
2783
2799
  ...weight2 !== void 0 ? { weight: weight2 } : {},
2784
2800
  ...required2 !== void 0 ? { required: required2 } : {},
2801
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
2785
2802
  ...negate !== void 0 ? { negate } : {},
2786
2803
  ...Object.keys(config2).length > 0 ? { config: config2 } : {},
2787
2804
  ...targetConfig !== void 0 ? { target: targetConfig } : {}
@@ -2910,7 +2927,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2910
2927
  };
2911
2928
  }
2912
2929
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
2913
- const required2 = parseRequired(rawEvaluator.required);
2930
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
2931
+ rawEvaluator.required,
2932
+ rawEvaluator.min_score,
2933
+ name,
2934
+ evalId
2935
+ );
2914
2936
  evaluators.push({
2915
2937
  name,
2916
2938
  type: "composite",
@@ -2918,6 +2940,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2918
2940
  aggregator,
2919
2941
  ...weight2 !== void 0 ? { weight: weight2 } : {},
2920
2942
  ...required2 !== void 0 ? { required: required2 } : {},
2943
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
2921
2944
  ...negate !== void 0 ? { negate } : {}
2922
2945
  });
2923
2946
  continue;
@@ -3028,7 +3051,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3028
3051
  continue;
3029
3052
  }
3030
3053
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3031
- const required2 = parseRequired(rawEvaluator.required);
3054
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3055
+ rawEvaluator.required,
3056
+ rawEvaluator.min_score,
3057
+ name,
3058
+ evalId
3059
+ );
3032
3060
  const config2 = {
3033
3061
  name,
3034
3062
  type: "tool-trajectory",
@@ -3037,6 +3065,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3037
3065
  ...expected ? { expected } : {},
3038
3066
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3039
3067
  ...required2 !== void 0 ? { required: required2 } : {},
3068
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3040
3069
  ...negate !== void 0 ? { negate } : {},
3041
3070
  ...argsMatch2 !== void 0 ? { argsMatch: argsMatch2 } : {}
3042
3071
  };
@@ -3099,7 +3128,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3099
3128
  const aggregation = asString(rawEvaluator.aggregation);
3100
3129
  const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
3101
3130
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3102
- const required2 = parseRequired(rawEvaluator.required);
3131
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3132
+ rawEvaluator.required,
3133
+ rawEvaluator.min_score,
3134
+ name,
3135
+ evalId
3136
+ );
3103
3137
  evaluators.push({
3104
3138
  name,
3105
3139
  type: "field-accuracy",
@@ -3107,6 +3141,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3107
3141
  ...validAggregation ? { aggregation: validAggregation } : {},
3108
3142
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3109
3143
  ...required2 !== void 0 ? { required: required2 } : {},
3144
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3110
3145
  ...negate !== void 0 ? { negate } : {}
3111
3146
  });
3112
3147
  continue;
@@ -3120,13 +3155,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3120
3155
  continue;
3121
3156
  }
3122
3157
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3123
- const required2 = parseRequired(rawEvaluator.required);
3158
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3159
+ rawEvaluator.required,
3160
+ rawEvaluator.min_score,
3161
+ name,
3162
+ evalId
3163
+ );
3124
3164
  evaluators.push({
3125
3165
  name,
3126
3166
  type: "latency",
3127
3167
  threshold,
3128
3168
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3129
3169
  ...required2 !== void 0 ? { required: required2 } : {},
3170
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3130
3171
  ...negate !== void 0 ? { negate } : {}
3131
3172
  });
3132
3173
  continue;
@@ -3140,13 +3181,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3140
3181
  continue;
3141
3182
  }
3142
3183
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3143
- const required2 = parseRequired(rawEvaluator.required);
3184
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3185
+ rawEvaluator.required,
3186
+ rawEvaluator.min_score,
3187
+ name,
3188
+ evalId
3189
+ );
3144
3190
  evaluators.push({
3145
3191
  name,
3146
3192
  type: "cost",
3147
3193
  budget,
3148
3194
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3149
3195
  ...required2 !== void 0 ? { required: required2 } : {},
3196
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3150
3197
  ...negate !== void 0 ? { negate } : {}
3151
3198
  });
3152
3199
  continue;
@@ -3178,13 +3225,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3178
3225
  continue;
3179
3226
  }
3180
3227
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3181
- const required2 = parseRequired(rawEvaluator.required);
3228
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3229
+ rawEvaluator.required,
3230
+ rawEvaluator.min_score,
3231
+ name,
3232
+ evalId
3233
+ );
3182
3234
  evaluators.push({
3183
3235
  name,
3184
3236
  type: "token-usage",
3185
3237
  ...validLimits,
3186
3238
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3187
3239
  ...required2 !== void 0 ? { required: required2 } : {},
3240
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3188
3241
  ...negate !== void 0 ? { negate } : {}
3189
3242
  });
3190
3243
  continue;
@@ -3230,13 +3283,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3230
3283
  continue;
3231
3284
  }
3232
3285
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3233
- const required2 = parseRequired(rawEvaluator.required);
3286
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3287
+ rawEvaluator.required,
3288
+ rawEvaluator.min_score,
3289
+ name,
3290
+ evalId
3291
+ );
3234
3292
  evaluators.push({
3235
3293
  name,
3236
3294
  type: "execution-metrics",
3237
3295
  ...validThresholds,
3238
3296
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3239
3297
  ...required2 !== void 0 ? { required: required2 } : {},
3298
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3240
3299
  ...negate !== void 0 ? { negate } : {}
3241
3300
  });
3242
3301
  continue;
@@ -3250,7 +3309,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3250
3309
  const rawShouldTrigger = rawEvaluator.should_trigger;
3251
3310
  const shouldTrigger = typeof rawShouldTrigger === "boolean" ? rawShouldTrigger : void 0;
3252
3311
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3253
- const required2 = parseRequired(rawEvaluator.required);
3312
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3313
+ rawEvaluator.required,
3314
+ rawEvaluator.min_score,
3315
+ name,
3316
+ evalId
3317
+ );
3254
3318
  evaluators.push({
3255
3319
  name,
3256
3320
  type: "skill-trigger",
@@ -3258,6 +3322,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3258
3322
  ...shouldTrigger !== void 0 ? { should_trigger: shouldTrigger } : {},
3259
3323
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3260
3324
  ...required2 !== void 0 ? { required: required2 } : {},
3325
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3261
3326
  ...negate !== void 0 ? { negate } : {}
3262
3327
  });
3263
3328
  continue;
@@ -3269,13 +3334,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3269
3334
  continue;
3270
3335
  }
3271
3336
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3272
- const required2 = parseRequired(rawEvaluator.required);
3337
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3338
+ rawEvaluator.required,
3339
+ rawEvaluator.min_score,
3340
+ name,
3341
+ evalId
3342
+ );
3273
3343
  evaluators.push({
3274
3344
  name,
3275
3345
  type: "contains",
3276
3346
  value,
3277
3347
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3278
3348
  ...required2 !== void 0 ? { required: required2 } : {},
3349
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3279
3350
  ...negate !== void 0 ? { negate } : {}
3280
3351
  });
3281
3352
  continue;
@@ -3289,13 +3360,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3289
3360
  continue;
3290
3361
  }
3291
3362
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3292
- const required2 = parseRequired(rawEvaluator.required);
3363
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3364
+ rawEvaluator.required,
3365
+ rawEvaluator.min_score,
3366
+ name,
3367
+ evalId
3368
+ );
3293
3369
  evaluators.push({
3294
3370
  name,
3295
3371
  type: typeValue,
3296
3372
  value,
3297
3373
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3298
3374
  ...required2 !== void 0 ? { required: required2 } : {},
3375
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3299
3376
  ...negate !== void 0 ? { negate } : {}
3300
3377
  });
3301
3378
  continue;
@@ -3307,13 +3384,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3307
3384
  continue;
3308
3385
  }
3309
3386
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3310
- const required2 = parseRequired(rawEvaluator.required);
3387
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3388
+ rawEvaluator.required,
3389
+ rawEvaluator.min_score,
3390
+ name,
3391
+ evalId
3392
+ );
3311
3393
  evaluators.push({
3312
3394
  name,
3313
3395
  type: "icontains",
3314
3396
  value,
3315
3397
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3316
3398
  ...required2 !== void 0 ? { required: required2 } : {},
3399
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3317
3400
  ...negate !== void 0 ? { negate } : {}
3318
3401
  });
3319
3402
  continue;
@@ -3327,13 +3410,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3327
3410
  continue;
3328
3411
  }
3329
3412
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3330
- const required2 = parseRequired(rawEvaluator.required);
3413
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3414
+ rawEvaluator.required,
3415
+ rawEvaluator.min_score,
3416
+ name,
3417
+ evalId
3418
+ );
3331
3419
  evaluators.push({
3332
3420
  name,
3333
3421
  type: typeValue,
3334
3422
  value,
3335
3423
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3336
3424
  ...required2 !== void 0 ? { required: required2 } : {},
3425
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3337
3426
  ...negate !== void 0 ? { negate } : {}
3338
3427
  });
3339
3428
  continue;
@@ -3345,13 +3434,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3345
3434
  continue;
3346
3435
  }
3347
3436
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3348
- const required2 = parseRequired(rawEvaluator.required);
3437
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3438
+ rawEvaluator.required,
3439
+ rawEvaluator.min_score,
3440
+ name,
3441
+ evalId
3442
+ );
3349
3443
  evaluators.push({
3350
3444
  name,
3351
3445
  type: typeValue,
3352
3446
  value,
3353
3447
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3354
3448
  ...required2 !== void 0 ? { required: required2 } : {},
3449
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3355
3450
  ...negate !== void 0 ? { negate } : {}
3356
3451
  });
3357
3452
  continue;
@@ -3364,7 +3459,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3364
3459
  }
3365
3460
  const flags = asString(rawEvaluator.flags);
3366
3461
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3367
- const required2 = parseRequired(rawEvaluator.required);
3462
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3463
+ rawEvaluator.required,
3464
+ rawEvaluator.min_score,
3465
+ name,
3466
+ evalId
3467
+ );
3368
3468
  evaluators.push({
3369
3469
  name,
3370
3470
  type: "regex",
@@ -3372,18 +3472,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3372
3472
  ...flags !== void 0 ? { flags } : {},
3373
3473
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3374
3474
  ...required2 !== void 0 ? { required: required2 } : {},
3475
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3375
3476
  ...negate !== void 0 ? { negate } : {}
3376
3477
  });
3377
3478
  continue;
3378
3479
  }
3379
3480
  if (typeValue === "is-json") {
3380
3481
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3381
- const required2 = parseRequired(rawEvaluator.required);
3482
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3483
+ rawEvaluator.required,
3484
+ rawEvaluator.min_score,
3485
+ name,
3486
+ evalId
3487
+ );
3382
3488
  evaluators.push({
3383
3489
  name,
3384
3490
  type: "is-json",
3385
3491
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3386
3492
  ...required2 !== void 0 ? { required: required2 } : {},
3493
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3387
3494
  ...negate !== void 0 ? { negate } : {}
3388
3495
  });
3389
3496
  continue;
@@ -3395,13 +3502,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3395
3502
  continue;
3396
3503
  }
3397
3504
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3398
- const required2 = parseRequired(rawEvaluator.required);
3505
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3506
+ rawEvaluator.required,
3507
+ rawEvaluator.min_score,
3508
+ name,
3509
+ evalId
3510
+ );
3399
3511
  evaluators.push({
3400
3512
  name,
3401
3513
  type: "equals",
3402
3514
  value,
3403
3515
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3404
3516
  ...required2 !== void 0 ? { required: required2 } : {},
3517
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3405
3518
  ...negate !== void 0 ? { negate } : {}
3406
3519
  });
3407
3520
  continue;
@@ -3437,7 +3550,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3437
3550
  continue;
3438
3551
  }
3439
3552
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3440
- const required2 = parseRequired(rawEvaluator.required);
3553
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3554
+ rawEvaluator.required,
3555
+ rawEvaluator.min_score,
3556
+ name,
3557
+ evalId
3558
+ );
3441
3559
  evaluators.push({
3442
3560
  name,
3443
3561
  type: "llm-grader",
@@ -3445,6 +3563,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3445
3563
  ...graderTargetName ? { target: graderTargetName } : {},
3446
3564
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3447
3565
  ...required2 !== void 0 ? { required: required2 } : {},
3566
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3448
3567
  ...negate !== void 0 ? { negate } : {}
3449
3568
  });
3450
3569
  continue;
@@ -3514,7 +3633,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3514
3633
  continue;
3515
3634
  }
3516
3635
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3517
- const required2 = parseRequired(rawEvaluator.required);
3636
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
3637
+ rawEvaluator.required,
3638
+ rawEvaluator.min_score,
3639
+ name,
3640
+ evalId
3641
+ );
3518
3642
  evaluators.push({
3519
3643
  name,
3520
3644
  type: "llm-grader",
@@ -3522,12 +3646,18 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3522
3646
  ...graderTargetName ? { target: graderTargetName } : {},
3523
3647
  ...weight2 !== void 0 ? { weight: weight2 } : {},
3524
3648
  ...required2 !== void 0 ? { required: required2 } : {},
3649
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
3525
3650
  ...negate !== void 0 ? { negate } : {}
3526
3651
  });
3527
3652
  continue;
3528
3653
  }
3529
3654
  const weight = validateWeight(rawEvaluator.weight, name, evalId);
3530
- const required = parseRequired(rawEvaluator.required);
3655
+ const { required, min_score } = parseRequiredAndMinScore(
3656
+ rawEvaluator.required,
3657
+ rawEvaluator.min_score,
3658
+ name,
3659
+ evalId
3660
+ );
3531
3661
  const knownProps = /* @__PURE__ */ new Set([
3532
3662
  "name",
3533
3663
  "type",
@@ -3538,6 +3668,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3538
3668
  "weight",
3539
3669
  "config",
3540
3670
  "required",
3671
+ "min_score",
3541
3672
  "negate",
3542
3673
  "max_steps",
3543
3674
  "maxSteps",
@@ -3567,6 +3698,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3567
3698
  ...graderTargetName ? { target: graderTargetName } : {},
3568
3699
  ...weight !== void 0 ? { weight } : {},
3569
3700
  ...required !== void 0 ? { required } : {},
3701
+ ...min_score !== void 0 ? { min_score } : {},
3570
3702
  ...negate !== void 0 ? { negate } : {},
3571
3703
  ...finalConfig ? { config: finalConfig } : {},
3572
3704
  ...llmMaxSteps !== void 0 ? { max_steps: llmMaxSteps } : {},
@@ -3698,10 +3830,23 @@ ${detailBlock}${ANSI_RESET5}`);
3698
3830
  console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET5}`);
3699
3831
  }
3700
3832
  }
3701
- function parseRequired(value) {
3702
- if (value === true) return true;
3703
- if (typeof value === "number" && value > 0 && value <= 1) return value;
3704
- return void 0;
3833
+ function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
3834
+ const result = {};
3835
+ if (typeof rawMinScore === "number" && rawMinScore > 0 && rawMinScore <= 1) {
3836
+ result.min_score = rawMinScore;
3837
+ }
3838
+ if (rawRequired === true) {
3839
+ result.required = true;
3840
+ } else if (typeof rawRequired === "number" && rawRequired > 0 && rawRequired <= 1) {
3841
+ if (result.min_score === void 0) {
3842
+ result.min_score = rawRequired;
3843
+ }
3844
+ result.required = rawRequired;
3845
+ logWarning2(
3846
+ `Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
3847
+ );
3848
+ }
3849
+ return result;
3705
3850
  }
3706
3851
  function validateWeight(rawWeight, evaluatorName, evalId) {
3707
3852
  if (rawWeight === void 0) {
@@ -3744,16 +3889,30 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
3744
3889
  const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
3745
3890
  const expectedOutcome = asString(rawRubric.outcome) ?? "";
3746
3891
  const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
3892
+ let minScore;
3747
3893
  let requiredMinScore;
3748
3894
  let required;
3749
- if (typeof rawRubric.required_min_score === "number") {
3750
- const minScore = rawRubric.required_min_score;
3751
- if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
3895
+ if (typeof rawRubric.min_score === "number") {
3896
+ const ms = rawRubric.min_score;
3897
+ if (ms <= 0 || ms > 1) {
3752
3898
  throw new Error(
3753
- `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
3899
+ `Invalid min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be in (0, 1] (got ${ms})`
3754
3900
  );
3755
3901
  }
3756
- requiredMinScore = minScore;
3902
+ minScore = ms;
3903
+ requiredMinScore = Math.round(ms * 10);
3904
+ } else if (typeof rawRubric.required_min_score === "number") {
3905
+ const rms = rawRubric.required_min_score;
3906
+ if (!Number.isInteger(rms) || rms < 0 || rms > 10) {
3907
+ throw new Error(
3908
+ `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`
3909
+ );
3910
+ }
3911
+ requiredMinScore = rms;
3912
+ minScore = rms / 10;
3913
+ logWarning2(
3914
+ `Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. Use 'min_score: ${rms / 10}' (0-1 scale) instead.`
3915
+ );
3757
3916
  }
3758
3917
  if (typeof rawRubric.required === "boolean") {
3759
3918
  required = rawRubric.required;
@@ -3773,6 +3932,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
3773
3932
  weight,
3774
3933
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
3775
3934
  ...required !== void 0 ? { required } : {},
3935
+ ...minScore !== void 0 ? { min_score: minScore } : {},
3776
3936
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
3777
3937
  score_ranges: scoreRanges
3778
3938
  });
@@ -3789,6 +3949,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
3789
3949
  weight,
3790
3950
  // Default to required: true if not specified (backward compatibility)
3791
3951
  required: required ?? true,
3952
+ ...minScore !== void 0 ? { min_score: minScore } : {},
3792
3953
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
3793
3954
  });
3794
3955
  }
@@ -3917,12 +4078,22 @@ function parseInlineRubrics(rawRubrics) {
3917
4078
  id: asString(rubric.id) ?? `rubric-${index + 1}`,
3918
4079
  weight: typeof rubric.weight === "number" ? rubric.weight : 1
3919
4080
  };
4081
+ let inlineMinScore;
4082
+ let inlineRequiredMinScore;
4083
+ if (typeof rubric.min_score === "number") {
4084
+ inlineMinScore = rubric.min_score;
4085
+ inlineRequiredMinScore = Math.round(inlineMinScore * 10);
4086
+ } else if (typeof rubric.required_min_score === "number") {
4087
+ inlineRequiredMinScore = rubric.required_min_score;
4088
+ inlineMinScore = inlineRequiredMinScore / 10;
4089
+ }
3920
4090
  if (scoreRanges && scoreRanges.length > 0) {
3921
4091
  return {
3922
4092
  ...baseRubric,
3923
4093
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
3924
4094
  ...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
3925
- ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
4095
+ ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
4096
+ ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {},
3926
4097
  score_ranges: scoreRanges
3927
4098
  };
3928
4099
  }
@@ -3930,7 +4101,8 @@ function parseInlineRubrics(rawRubrics) {
3930
4101
  ...baseRubric,
3931
4102
  outcome: expectedOutcome,
3932
4103
  required: typeof rubric.required === "boolean" ? rubric.required : true,
3933
- ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
4104
+ ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
4105
+ ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {}
3934
4106
  };
3935
4107
  }).filter((r) => r.outcome && r.outcome.length > 0 || "score_ranges" in r && r.score_ranges);
3936
4108
  if (rubricItems.length === 0) {
@@ -4334,6 +4506,9 @@ function resolveExpectedMessages(raw) {
4334
4506
  var ANSI_YELLOW6 = "\x1B[33m";
4335
4507
  var ANSI_RED2 = "\x1B[31m";
4336
4508
  var ANSI_RESET7 = "\x1B[0m";
4509
+ function matchesFilter(id, filter) {
4510
+ return typeof filter === "string" ? import_micromatch.default.isMatch(id, filter) : filter.some((pattern) => import_micromatch.default.isMatch(id, pattern));
4511
+ }
4337
4512
  function detectFormat(filePath) {
4338
4513
  const ext = import_node_path7.default.extname(filePath).toLowerCase();
4339
4514
  if (ext === ".jsonl") return "jsonl";
@@ -4401,40 +4576,40 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4401
4576
  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
4402
4577
  const rawFile = await (0, import_promises7.readFile)(absoluteTestPath, "utf8");
4403
4578
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
4404
- const fallbackEvalSet = import_node_path7.default.basename(absoluteTestPath, ".jsonl") || "eval";
4405
- const evalSetName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackEvalSet;
4579
+ const fallbackSuiteName = import_node_path7.default.basename(absoluteTestPath, ".jsonl") || "eval";
4580
+ const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
4406
4581
  const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
4407
4582
  const globalExecution = sidecar.execution;
4408
4583
  if (verbose) {
4409
4584
  console.log(`
4410
- [JSONL Dataset: ${evalFilePath}]`);
4585
+ [JSONL Suite: ${evalFilePath}]`);
4411
4586
  console.log(` Cases: ${rawCases.length}`);
4412
- console.log(` Eval set: ${evalSetName}`);
4587
+ console.log(` Suite: ${suiteName}`);
4413
4588
  if (sidecar.description) {
4414
4589
  console.log(` Description: ${sidecar.description}`);
4415
4590
  }
4416
4591
  }
4417
4592
  const results = [];
4418
4593
  for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
4419
- const evalcase = rawCases[lineIndex];
4594
+ const testCaseConfig = rawCases[lineIndex];
4420
4595
  const lineNumber = lineIndex + 1;
4421
- const id = asString4(evalcase.id);
4422
- if (filterPattern && (!id || !import_micromatch.default.isMatch(id, filterPattern))) {
4596
+ const id = asString4(testCaseConfig.id);
4597
+ if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
4423
4598
  continue;
4424
4599
  }
4425
- const conversationId = asString4(evalcase.conversation_id);
4426
- let outcome = asString4(evalcase.criteria);
4427
- if (!outcome && evalcase.expected_outcome !== void 0) {
4428
- outcome = asString4(evalcase.expected_outcome);
4600
+ const conversationId = asString4(testCaseConfig.conversation_id);
4601
+ let outcome = asString4(testCaseConfig.criteria);
4602
+ if (!outcome && testCaseConfig.expected_outcome !== void 0) {
4603
+ outcome = asString4(testCaseConfig.expected_outcome);
4429
4604
  if (outcome) {
4430
4605
  logWarning4(
4431
- `Test '${asString4(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
4606
+ `Test '${asString4(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
4432
4607
  );
4433
4608
  }
4434
4609
  }
4435
- const rawInputMessages = resolveInputMessages(evalcase);
4436
- const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
4437
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
4610
+ const rawInputMessages = resolveInputMessages(testCaseConfig);
4611
+ const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
4612
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== void 0;
4438
4613
  if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
4439
4614
  logError2(
4440
4615
  `Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
@@ -4471,18 +4646,23 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4471
4646
  }
4472
4647
  }
4473
4648
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
4474
- const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
4649
+ const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
4475
4650
  const mergedExecution = caseExecution ?? globalExecution;
4476
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
4651
+ const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
4477
4652
  let evaluators;
4478
4653
  try {
4479
- evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
4654
+ evaluators = await parseEvaluators(
4655
+ testCaseConfig,
4656
+ mergedExecution,
4657
+ searchRoots,
4658
+ id ?? "unknown"
4659
+ );
4480
4660
  } catch (error) {
4481
4661
  const message = error instanceof Error ? error.message : String(error);
4482
4662
  logError2(`Skipping test '${id}' at line ${lineNumber}: ${message}`);
4483
4663
  continue;
4484
4664
  }
4485
- const inlineRubrics = evalcase.rubrics;
4665
+ const inlineRubrics = testCaseConfig.rubrics;
4486
4666
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
4487
4667
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
4488
4668
  if (rubricEvaluator) {
@@ -4493,7 +4673,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4493
4673
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
4494
4674
  const testCase = {
4495
4675
  id,
4496
- dataset: evalSetName,
4676
+ suite: suiteName,
4497
4677
  conversation_id: conversationId,
4498
4678
  question,
4499
4679
  input: inputMessages,
@@ -4501,7 +4681,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4501
4681
  reference_answer: referenceAnswer,
4502
4682
  file_paths: userFilePaths,
4503
4683
  criteria: outcome ?? "",
4504
- evaluator: evalCaseEvaluatorKind,
4684
+ evaluator: testCaseEvaluatorKind,
4505
4685
  assertions: evaluators
4506
4686
  };
4507
4687
  results.push(testCase);
@@ -4686,6 +4866,9 @@ function buildChatPromptFromSegments(options) {
4686
4866
  var ANSI_YELLOW7 = "\x1B[33m";
4687
4867
  var ANSI_RED3 = "\x1B[31m";
4688
4868
  var ANSI_RESET8 = "\x1B[0m";
4869
/**
 * Reports whether a test id is selected by a filter.
 *
 * @param {string} id - Test id to check.
 * @param {string | string[]} filter - A single pattern, or a collection of
 *   patterns (anything that is not a string is queried through `.some`).
 * @returns {boolean} True when the id matches the pattern, or at least one
 *   pattern of the collection.
 */
function matchesFilter2(id, filter) {
  const isSelectedBy = (pattern) => import_micromatch2.default.isMatch(id, pattern);
  if (typeof filter === "string") {
    return isSelectedBy(filter);
  }
  return filter.some((pattern) => isSelectedBy(pattern));
}
4689
4872
  function resolveTests(suite) {
4690
4873
  if (suite.tests !== void 0) return suite.tests;
4691
4874
  if (suite.eval_cases !== void 0) {
@@ -4765,18 +4948,18 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4765
4948
  throw new Error(`Invalid test file format: ${evalFilePath}`);
4766
4949
  }
4767
4950
  const suite = interpolated;
4768
- const evalSetNameFromSuite = asString5(suite.name)?.trim();
4769
- const fallbackEvalSet = import_node_path8.default.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
4770
- const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
4771
- const rawTestcases = resolveTests(suite);
4951
+ const suiteNameFromFile = asString5(suite.name)?.trim();
4952
+ const fallbackSuiteName = import_node_path8.default.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
4953
+ const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
4954
+ const rawTestCases = resolveTests(suite);
4772
4955
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
4773
4956
  const evalFileDir = import_node_path8.default.dirname(absoluteTestPath);
4774
- let expandedTestcases;
4775
- if (typeof rawTestcases === "string") {
4776
- const externalPath = import_node_path8.default.resolve(evalFileDir, rawTestcases);
4777
- expandedTestcases = await loadCasesFromFile(externalPath);
4778
- } else if (Array.isArray(rawTestcases)) {
4779
- expandedTestcases = await expandFileReferences(rawTestcases, evalFileDir);
4957
+ let expandedTestCases;
4958
+ if (typeof rawTestCases === "string") {
4959
+ const externalPath = import_node_path8.default.resolve(evalFileDir, rawTestCases);
4960
+ expandedTestCases = await loadCasesFromFile(externalPath);
4961
+ } else if (Array.isArray(rawTestCases)) {
4962
+ expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
4780
4963
  } else {
4781
4964
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
4782
4965
  }
@@ -4791,32 +4974,33 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4791
4974
  }
4792
4975
  const globalExecution = suiteAssertions !== void 0 ? { ...rawGlobalExecution ?? {}, assertions: suiteAssertions } : rawGlobalExecution;
4793
4976
  const results = [];
4794
- for (const rawEvalcase of expandedTestcases) {
4795
- if (!isJsonObject(rawEvalcase)) {
4977
+ for (const rawTestCase of expandedTestCases) {
4978
+ if (!isJsonObject(rawTestCase)) {
4796
4979
  logWarning5("Skipping invalid test entry (expected object)");
4797
4980
  continue;
4798
4981
  }
4799
- const evalcase = rawEvalcase;
4800
- const id = asString5(evalcase.id);
4801
- if (filterPattern && (!id || !import_micromatch2.default.isMatch(id, filterPattern))) {
4982
+ const testCaseConfig = rawTestCase;
4983
+ const id = asString5(testCaseConfig.id);
4984
+ if (filterPattern && (!id || !matchesFilter2(id, filterPattern))) {
4802
4985
  continue;
4803
4986
  }
4804
- const conversationId = asString5(evalcase.conversation_id);
4805
- let outcome = asString5(evalcase.criteria);
4806
- if (!outcome && evalcase.expected_outcome !== void 0) {
4807
- outcome = asString5(evalcase.expected_outcome);
4987
+ const conversationId = asString5(testCaseConfig.conversation_id);
4988
+ let outcome = asString5(testCaseConfig.criteria);
4989
+ if (!outcome && testCaseConfig.expected_outcome !== void 0) {
4990
+ outcome = asString5(testCaseConfig.expected_outcome);
4808
4991
  if (outcome) {
4809
4992
  logWarning5(
4810
- `Test '${asString5(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
4993
+ `Test '${asString5(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
4811
4994
  );
4812
4995
  }
4813
4996
  }
4814
- const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
4997
+ const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
4815
4998
  const skipDefaults = caseExecution?.skip_defaults === true;
4999
+ const caseThreshold = typeof caseExecution?.threshold === "number" && caseExecution.threshold >= 0 && caseExecution.threshold <= 1 ? caseExecution.threshold : void 0;
4816
5000
  const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
4817
- const testInputMessages = resolveInputMessages(evalcase, effectiveSuiteInputFiles);
4818
- const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
4819
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assertions !== void 0 || evalcase.assert !== void 0;
5001
+ const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
5002
+ const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
5003
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
4820
5004
  if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
4821
5005
  logError3(
4822
5006
  `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
@@ -4863,16 +5047,21 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4863
5047
  }
4864
5048
  }
4865
5049
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
4866
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
5050
+ const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
4867
5051
  let evaluators;
4868
5052
  try {
4869
- evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
5053
+ evaluators = await parseEvaluators(
5054
+ testCaseConfig,
5055
+ globalExecution,
5056
+ searchRoots,
5057
+ id ?? "unknown"
5058
+ );
4870
5059
  } catch (error) {
4871
5060
  const message = error instanceof Error ? error.message : String(error);
4872
5061
  logError3(`Skipping test '${id}': ${message}`);
4873
5062
  continue;
4874
5063
  }
4875
- const inlineRubrics = evalcase.rubrics;
5064
+ const inlineRubrics = testCaseConfig.rubrics;
4876
5065
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
4877
5066
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
4878
5067
  if (rubricEvaluator) {
@@ -4881,13 +5070,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4881
5070
  }
4882
5071
  warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
4883
5072
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
4884
- const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
5073
+ const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir);
4885
5074
  const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
4886
- const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
4887
- const caseTargets = extractTargetsFromTestCase(evalcase);
5075
+ const metadata = isJsonObject(testCaseConfig.metadata) ? testCaseConfig.metadata : void 0;
5076
+ const caseTargets = extractTargetsFromTestCase(testCaseConfig);
4888
5077
  const testCase = {
4889
5078
  id,
4890
- dataset: evalSetName,
5079
+ suite: suiteName,
4891
5080
  category: options?.category,
4892
5081
  conversation_id: conversationId,
4893
5082
  question,
@@ -4896,11 +5085,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4896
5085
  reference_answer: referenceAnswer,
4897
5086
  file_paths: userFilePaths,
4898
5087
  criteria: outcome ?? "",
4899
- evaluator: evalCaseEvaluatorKind,
5088
+ evaluator: testCaseEvaluatorKind,
4900
5089
  assertions: evaluators,
4901
5090
  workspace: mergedWorkspace,
4902
5091
  metadata,
4903
- targets: caseTargets
5092
+ targets: caseTargets,
5093
+ ...caseThreshold !== void 0 ? { threshold: caseThreshold } : {}
4904
5094
  };
4905
5095
  results.push(testCase);
4906
5096
  }
@@ -5566,7 +5756,7 @@ var AzureProvider = class {
5566
5756
  };
5567
5757
  this.retryConfig = config.retry;
5568
5758
  const azure = (0, import_azure2.createAzure)(buildAzureOptions(config));
5569
- this.model = azure.chat(config.deploymentName);
5759
+ this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
5570
5760
  }
5571
5761
  id;
5572
5762
  kind = "azure";
@@ -5692,7 +5882,9 @@ function buildAzureOptions(config) {
5692
5882
  const options = {
5693
5883
  apiKey: config.apiKey,
5694
5884
  apiVersion: config.version,
5695
- useDeploymentBasedUrls: true
5885
+ // Chat completions still use deployment-scoped Azure URLs for compatibility
5886
+ // with existing deployments. Responses API should use the SDK's v1 path.
5887
+ useDeploymentBasedUrls: config.apiFormat !== "responses"
5696
5888
  };
5697
5889
  const baseURL = normalizeAzureBaseUrl(config.resourceName);
5698
5890
  if (baseURL) {
@@ -7169,15 +7361,16 @@ var CliProvider = class {
7169
7361
  outputFilePath
7170
7362
  );
7171
7363
  const renderedCommand = renderTemplate(this.config.command, templateValues);
7364
+ const effectiveCwd = requests[0]?.cwd ?? this.config.cwd;
7172
7365
  if (this.verbose) {
7173
7366
  console.log(
7174
- `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
7367
+ `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${effectiveCwd ?? ""} command=${renderedCommand}`
7175
7368
  );
7176
7369
  }
7177
7370
  try {
7178
7371
  const startTime = Date.now();
7179
7372
  const result = await this.runCommand(renderedCommand, {
7180
- cwd: this.config.cwd,
7373
+ cwd: effectiveCwd,
7181
7374
  env: process.env,
7182
7375
  timeoutMs: this.config.timeoutMs,
7183
7376
  signal: controller.signal
@@ -7210,7 +7403,7 @@ var CliProvider = class {
7210
7403
  command: renderedCommand,
7211
7404
  stderr: result.stderr,
7212
7405
  exitCode: result.exitCode ?? 0,
7213
- cwd: this.config.cwd,
7406
+ cwd: effectiveCwd,
7214
7407
  outputFile: outputFilePath
7215
7408
  }
7216
7409
  };
@@ -7228,7 +7421,7 @@ var CliProvider = class {
7228
7421
  command: renderedCommand,
7229
7422
  stderr: result.stderr,
7230
7423
  exitCode: result.exitCode ?? 0,
7231
- cwd: this.config.cwd,
7424
+ cwd: effectiveCwd,
7232
7425
  outputFile: outputFilePath,
7233
7426
  error: errorMessage
7234
7427
  }
@@ -7243,7 +7436,7 @@ var CliProvider = class {
7243
7436
  command: renderedCommand,
7244
7437
  stderr: result.stderr,
7245
7438
  exitCode: result.exitCode ?? 0,
7246
- cwd: this.config.cwd,
7439
+ cwd: effectiveCwd,
7247
7440
  outputFile: outputFilePath,
7248
7441
  recordId: evalCaseId
7249
7442
  }
@@ -9267,6 +9460,76 @@ function subscribeToPiLogEntries(listener) {
9267
9460
  };
9268
9461
  }
9269
9462
 
9463
+ // src/evaluation/providers/pi-provider-aliases.ts
9464
+ init_cjs_shims();
9465
// Sub-provider names that are rewritten when no custom base URL is in play.
var SUBPROVIDER_ALIASES = {
  azure: "azure-openai-responses"
};
// Sub-provider names that are rewritten when a custom base URL is supplied.
var SUBPROVIDER_ALIASES_WITH_BASE_URL = {
  // Azure v1 endpoints are OpenAI-compatible; use the standard client
  // to avoid AzureOpenAI adding api-version query params.
  azure: "openai-responses"
};
// Lower-cased provider name -> environment variable that carries its API key.
var ENV_KEY_MAP = {
  google: "GEMINI_API_KEY",
  gemini: "GEMINI_API_KEY",
  anthropic: "ANTHROPIC_API_KEY",
  openai: "OPENAI_API_KEY",
  groq: "GROQ_API_KEY",
  xai: "XAI_API_KEY",
  openrouter: "OPENROUTER_API_KEY",
  azure: "AZURE_OPENAI_API_KEY"
};
// Lower-cased provider name -> environment variable that carries its base URL.
var ENV_BASE_URL_MAP = {
  openai: "OPENAI_BASE_URL",
  azure: "AZURE_OPENAI_BASE_URL",
  openrouter: "OPENROUTER_BASE_URL"
};
/**
 * Reads `key` from `table` only when it is an own property.
 *
 * Provider names come from user configuration; a plain `table[key]` lookup
 * would resolve names such as "constructor" or "__proto__" through
 * Object.prototype and hand back a function/object instead of a string.
 *
 * @param {Record<string, string>} table - One of the lookup tables above.
 * @param {string} key - Lower-cased provider name.
 * @returns {string | undefined} The mapped value, or undefined when absent.
 */
function lookupPiAliasEntry(table, key) {
  return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : void 0;
}
/**
 * Maps a configured sub-provider name to the provider id to request.
 *
 * @param {string} name - Configured sub-provider name (matched case-insensitively).
 * @param {boolean} [hasBaseUrl=false] - Whether a custom base URL is configured;
 *   when true the base-URL alias table is consulted first.
 * @returns {string} The alias when one exists, otherwise `name` unchanged.
 */
function resolveSubprovider(name, hasBaseUrl = false) {
  const lower = name.toLowerCase();
  if (hasBaseUrl) {
    const alias = lookupPiAliasEntry(SUBPROVIDER_ALIASES_WITH_BASE_URL, lower);
    if (alias) return alias;
  }
  return lookupPiAliasEntry(SUBPROVIDER_ALIASES, lower) ?? name;
}
/**
 * Maps a configured sub-provider name to the value passed as `--provider`.
 *
 * @param {string} name - Configured sub-provider name (matched case-insensitively).
 * @returns {string} "azure-openai-responses" for "azure", otherwise `name` unchanged.
 */
function resolveCliProvider(name) {
  const lower = name.toLowerCase();
  if (lower === "azure") return "azure-openai-responses";
  return name;
}
/**
 * Resolves the environment variable name that should hold the API key.
 *
 * @param {string} provider - Provider name (matched case-insensitively).
 * @param {boolean} [hasBaseUrl=false] - Whether a custom base URL is configured;
 *   Azure then uses the OpenAI variable.
 * @returns {string | undefined} Variable name, or undefined for unknown providers.
 */
function resolveEnvKeyName(provider, hasBaseUrl = false) {
  const lower = provider.toLowerCase();
  if (hasBaseUrl && lower === "azure") return "OPENAI_API_KEY";
  return lookupPiAliasEntry(ENV_KEY_MAP, lower);
}
/**
 * Resolves the environment variable name that should hold the base URL.
 *
 * @param {string} provider - Provider name (matched case-insensitively).
 * @param {boolean} [hasBaseUrl=false] - Whether a custom base URL is configured;
 *   Azure then uses the OpenAI variable.
 * @returns {string | undefined} Variable name, or undefined for unknown providers.
 */
function resolveEnvBaseUrlName(provider, hasBaseUrl = false) {
  const lower = provider.toLowerCase();
  if (hasBaseUrl && lower === "azure") return "OPENAI_BASE_URL";
  return lookupPiAliasEntry(ENV_BASE_URL_MAP, lower);
}
9511
/**
 * Extracts the first host label from an Azure endpoint value.
 *
 * Accepts a full URL ("https://myres.openai.azure.com/..."), a scheme-less
 * host ("myres.openai.azure.com") or a bare name ("myres"). Surrounding
 * whitespace is ignored and the scheme is matched case-insensitively.
 *
 * @param {string} baseUrl - Configured base URL, host or resource name.
 * @returns {string} Everything up to the first "." or "/" after the optional
 *   scheme; the trimmed input when no such label can be found.
 */
function extractAzureResourceName(baseUrl) {
  const trimmed = baseUrl.trim();
  // Drop the scheme first so the label match below cannot capture "https:".
  const withoutScheme = trimmed.replace(/^https?:\/\//i, "");
  const labelMatch = withoutScheme.match(/^[^./]+/);
  return labelMatch ? labelMatch[0] : trimmed;
}
9516
/**
 * Normalizes an Azure endpoint value to a base URL ending in "/openai/v1".
 *
 * - Empty / whitespace-only input yields an empty string.
 * - A bare name without "." or "/" is expanded to
 *   "https://<name>.openai.azure.com/openai/v1".
 * - A scheme-less host or host/path gets "https://" prepended instead of being
 *   mistaken for a bare name (which would duplicate the Azure domain suffix).
 * - Trailing slashes are removed and "/openai" or "/openai/v1" is completed
 *   as needed.
 *
 * @param {string} baseUrl - Configured base URL, host or resource name.
 * @returns {string} The normalized base URL, or "" for blank input.
 */
function normalizeAzureSdkBaseUrl(baseUrl) {
  const trimmed = baseUrl.trim().replace(/\/+$/, "");
  if (!trimmed) {
    return trimmed;
  }
  let url = trimmed;
  if (!/^https?:\/\//i.test(url)) {
    if (!/[./]/.test(url)) {
      // No dot and no slash: this is a resource name, not a host.
      return `https://${url}.openai.azure.com/openai/v1`;
    }
    url = `https://${url}`;
  }
  if (/\/openai\/v1$/i.test(url)) {
    return url;
  }
  if (/\/openai$/i.test(url)) {
    return `${url}/v1`;
  }
  return `${url}/openai/v1`;
}
9532
+
9270
9533
  // src/evaluation/providers/pi-utils.ts
9271
9534
  init_cjs_shims();
9272
9535
  function extractPiTextContent(content) {
@@ -9426,12 +9689,12 @@ var PiCliProvider = class {
9426
9689
  buildPiArgs(prompt, inputFiles) {
9427
9690
  const args = [];
9428
9691
  if (this.config.subprovider) {
9429
- args.push("--provider", this.config.subprovider);
9692
+ args.push("--provider", resolveCliProvider(this.config.subprovider));
9430
9693
  }
9431
9694
  if (this.config.model) {
9432
9695
  args.push("--model", this.config.model);
9433
9696
  }
9434
- if (this.config.apiKey) {
9697
+ if (this.config.apiKey && this.config.subprovider?.toLowerCase() !== "azure") {
9435
9698
  args.push("--api-key", this.config.apiKey);
9436
9699
  }
9437
9700
  args.push("--mode", "json");
@@ -9483,35 +9746,35 @@ ${prompt}` : prompt;
9483
9746
  }
9484
9747
  buildEnv() {
9485
9748
  const env = { ...process.env };
9486
- if (this.config.apiKey) {
9487
- const provider = this.config.subprovider?.toLowerCase() ?? "google";
9488
- const ENV_KEY_MAP = {
9489
- google: "GEMINI_API_KEY",
9490
- gemini: "GEMINI_API_KEY",
9491
- anthropic: "ANTHROPIC_API_KEY",
9492
- openai: "OPENAI_API_KEY",
9493
- groq: "GROQ_API_KEY",
9494
- xai: "XAI_API_KEY",
9495
- openrouter: "OPENROUTER_API_KEY"
9496
- };
9497
- const envKey = ENV_KEY_MAP[provider];
9498
- if (envKey) {
9499
- env[envKey] = this.config.apiKey;
9749
+ const provider = this.config.subprovider?.toLowerCase() ?? "google";
9750
+ if (provider === "azure") {
9751
+ if (this.config.apiKey) {
9752
+ env.AZURE_OPENAI_API_KEY = this.config.apiKey;
9753
+ }
9754
+ if (this.config.baseUrl) {
9755
+ env.AZURE_OPENAI_RESOURCE_NAME = extractAzureResourceName(this.config.baseUrl);
9756
+ }
9757
+ } else {
9758
+ if (this.config.apiKey) {
9759
+ const envKey = resolveEnvKeyName(provider);
9760
+ if (envKey) {
9761
+ env[envKey] = this.config.apiKey;
9762
+ }
9500
9763
  }
9501
9764
  }
9502
9765
  if (this.config.subprovider) {
9503
- const provider = this.config.subprovider.toLowerCase();
9766
+ const resolvedProvider = resolveCliProvider(this.config.subprovider);
9504
9767
  const PROVIDER_OWN_PREFIXES = {
9505
9768
  openrouter: ["OPENROUTER_"],
9506
9769
  anthropic: ["ANTHROPIC_"],
9507
9770
  openai: ["OPENAI_"],
9508
- azure: ["AZURE_OPENAI_"],
9771
+ "azure-openai-responses": ["AZURE_OPENAI_"],
9509
9772
  google: ["GEMINI_", "GOOGLE_GENERATIVE_AI_"],
9510
9773
  gemini: ["GEMINI_", "GOOGLE_GENERATIVE_AI_"],
9511
9774
  groq: ["GROQ_"],
9512
9775
  xai: ["XAI_"]
9513
9776
  };
9514
- const ownPrefixes = PROVIDER_OWN_PREFIXES[provider] ?? [];
9777
+ const ownPrefixes = PROVIDER_OWN_PREFIXES[resolvedProvider] ?? [];
9515
9778
  const allOtherPrefixes = Object.entries(PROVIDER_OWN_PREFIXES).filter(([key]) => key !== provider).flatMap(([, prefixes]) => prefixes);
9516
9779
  for (const key of Object.keys(env)) {
9517
9780
  if (allOtherPrefixes.some((prefix) => key.startsWith(prefix)) && !ownPrefixes.some((prefix) => key.startsWith(prefix))) {
@@ -9802,6 +10065,24 @@ function extractMessages(events) {
9802
10065
  }
9803
10066
  }
9804
10067
  }
10068
+ if (messages) {
10069
+ for (let i = messages.length - 1; i >= 0; i--) {
10070
+ if (messages[i].role === "assistant" && !messages[i].content) {
10071
+ for (let j = events.length - 1; j >= 0; j--) {
10072
+ const evt = events[j];
10073
+ if (!evt || evt.type !== "message_end") continue;
10074
+ const msg = evt.message;
10075
+ if (msg?.role !== "assistant") continue;
10076
+ const text = extractPiTextContent(msg.content);
10077
+ if (text) {
10078
+ messages[i] = { ...messages[i], content: text };
10079
+ break;
10080
+ }
10081
+ }
10082
+ break;
10083
+ }
10084
+ }
10085
+ }
9805
10086
  const eventToolCalls = extractToolCallsFromEvents(events);
9806
10087
  if (eventToolCalls.length > 0) {
9807
10088
  injectEventToolCalls(messages, eventToolCalls);
@@ -9986,17 +10267,43 @@ function formatTimeoutSuffix3(timeoutMs) {
9986
10267
  if (!timeoutMs || timeoutMs <= 0) return "";
9987
10268
  return ` after ${Math.ceil(timeoutMs / 1e3)}s`;
9988
10269
  }
10270
/**
 * On Windows, tries to replace a `.cmd` launcher with a direct `node <script>`
 * invocation so the process can be spawned without a shell.
 *
 * The launcher is located with `where`, its text is scanned for a
 * `"%_prog%" "<path>.js"` line (presumably the shim format npm generates;
 * verify against the installed launcher), `%dp0%` is expanded to the
 * launcher's directory, and the script must be accessible on disk.
 * Every failure falls back to the executable exactly as given.
 *
 * @param {string} executable - Command name or path from the provider config.
 * @returns {[string, string[]]} Tuple of the program to spawn and the
 *   arguments to prepend. `[executable, []]` on non-Windows platforms, for
 *   `.js` / `.exe` inputs, and whenever resolution fails.
 */
function resolveWindowsCmd(executable) {
  if (process.platform !== "win32") return [executable, []];
  const lower = executable.toLowerCase();
  if (lower.endsWith(".js") || lower.endsWith(".exe")) return [executable, []];
  let fullPath;
  try {
    // execFileSync hands the name to `where` as a single argv entry, so shell
    // metacharacters in a configured executable are never interpreted.
    // stderr is discarded: a miss is an expected outcome handled below.
    const whereOutput = (0, import_node_child_process4.execFileSync)("where", [executable], {
      encoding: "utf-8",
      stdio: ["ignore", "pipe", "ignore"]
    });
    fullPath = whereOutput.trim().split(/\r?\n/)[0].trim();
  } catch {
    return [executable, []];
  }
  if (!fullPath) return [executable, []];
  // Windows paths are case-insensitive; `where` may report "tool.CMD".
  const cmdPath = fullPath.toLowerCase().endsWith(".cmd") ? fullPath : `${fullPath}.cmd`;
  try {
    const content = (0, import_node_fs9.readFileSync)(cmdPath, "utf-8");
    const match = content.match(/"?%_prog%"?\s+"([^"]+\.js)"/);
    if (match) {
      const dp0 = import_node_path21.default.dirname(import_node_path21.default.resolve(cmdPath));
      const scriptPath = match[1].replace(/%dp0%[/\\]?/gi, `${dp0}${import_node_path21.default.sep}`);
      try {
        (0, import_node_fs9.accessSync)(scriptPath);
        return ["node", [scriptPath]];
      } catch {
        // Script named by the launcher is not accessible: use the fallback.
      }
    }
  } catch {
    // Launcher could not be read: use the fallback.
  }
  return [executable, []];
}
9989
10297
  async function defaultPiRunner(options) {
9990
10298
  return await new Promise((resolve, reject) => {
9991
10299
  const parts = options.executable.split(/\s+/);
9992
- const executable = parts[0];
9993
- const executableArgs = parts.slice(1);
10300
+ const [resolvedExe, prefixArgs] = resolveWindowsCmd(parts[0]);
10301
+ const executableArgs = [...prefixArgs, ...parts.slice(1)];
9994
10302
  const allArgs = [...executableArgs, ...options.args];
9995
- const child = (0, import_node_child_process4.spawn)(executable, allArgs, {
10303
+ const child = (0, import_node_child_process4.spawn)(resolvedExe, allArgs, {
9996
10304
  cwd: options.cwd,
9997
10305
  env: options.env,
9998
- stdio: ["pipe", "pipe", "pipe"],
9999
- shell: false
10306
+ stdio: ["pipe", "pipe", "pipe"]
10000
10307
  });
10001
10308
  let stdout = "";
10002
10309
  let stderr = "";
@@ -10056,9 +10363,40 @@ var import_node_child_process5 = require("child_process");
10056
10363
  var import_node_crypto8 = require("crypto");
10057
10364
  var import_node_fs10 = require("fs");
10058
10365
  var import_promises19 = require("fs/promises");
10059
- var import_node_path22 = __toESM(require("path"), 1);
10366
+ var import_node_path23 = __toESM(require("path"), 1);
10060
10367
  var import_node_readline = require("readline");
10061
10368
  var import_node_url3 = require("url");
10369
+
10370
+ // src/paths.ts
10371
+ init_cjs_shims();
10372
+ var import_node_os6 = __toESM(require("os"), 1);
10373
+ var import_node_path22 = __toESM(require("path"), 1);
10374
// Set once the AGENTV_HOME override has been announced, so the notice is
// printed at most one time per process.
var logged = false;
/**
 * Returns the AgentV home directory.
 *
 * The AGENTV_HOME environment variable wins when it is non-empty and not the
 * literal string "undefined"; the first such use is announced through
 * console.warn. Otherwise the result is ".agentv" under the OS home directory.
 *
 * @returns {string} Path of the AgentV home directory.
 */
function getAgentvHome() {
  const override = process.env.AGENTV_HOME;
  const overrideIsUsable = Boolean(override) && override !== "undefined";
  if (!overrideIsUsable) {
    return import_node_path22.default.join(import_node_os6.default.homedir(), ".agentv");
  }
  if (!logged) {
    logged = true;
    console.warn(`Using AGENTV_HOME: ${override}`);
  }
  return override;
}
10386
/**
 * @returns {string} The "workspaces" directory under the AgentV home.
 */
function getWorkspacesRoot() {
  const home = getAgentvHome();
  return import_node_path22.default.join(home, "workspaces");
}
/**
 * @returns {string} The "subagents" directory under the AgentV home.
 */
function getSubagentsRoot() {
  const home = getAgentvHome();
  return import_node_path22.default.join(home, "subagents");
}
/**
 * @returns {string} The "trace-state" directory under the AgentV home.
 */
function getTraceStateRoot() {
  const home = getAgentvHome();
  return import_node_path22.default.join(home, "trace-state");
}
/**
 * @returns {string} The "workspace-pool" directory under the AgentV home.
 */
function getWorkspacePoolRoot() {
  const home = getAgentvHome();
  return import_node_path22.default.join(home, "workspace-pool");
}
10398
+
10399
+ // src/evaluation/providers/pi-coding-agent.ts
10062
10400
  var piCodingAgentModule = null;
10063
10401
  var piAiModule = null;
10064
10402
  var loadingPromise = null;
@@ -10076,46 +10414,126 @@ async function promptInstall() {
10076
10414
  rl.close();
10077
10415
  }
10078
10416
  }
10079
- function findAgentvRoot() {
10080
- const thisFile = (0, import_node_url3.fileURLToPath)(importMetaUrl);
10081
- let dir = import_node_path22.default.dirname(thisFile);
10082
- for (let i = 0; i < 10; i++) {
10417
/**
 * Returns the directory used for the AgentV-managed copy of the pi SDK.
 *
 * @returns {string} "<agentv home>/deps/pi-sdk".
 */
function findManagedSdkInstallRoot() {
  const agentvHome = getAgentvHome();
  return import_node_path23.default.join(agentvHome, "deps", "pi-sdk");
}
10420
/**
 * Asks npm for its global module directory by running `npm root -g`.
 *
 * @returns {string | undefined} The trimmed command output, or undefined when
 *   the command fails or prints nothing.
 */
function resolveGlobalNpmRoot() {
  let output;
  try {
    // stdin and stderr are ignored; only stdout is captured.
    output = (0, import_node_child_process5.execSync)("npm root -g", {
      encoding: "utf-8",
      stdio: ["ignore", "pipe", "ignore"]
    }).trim();
  } catch {
    return void 0;
  }
  if (output.length > 0) {
    return output;
  }
  return void 0;
}
10431
/**
 * Builds the expected entry-file path of a globally installed module.
 *
 * @param {string} moduleName - Package name; "/" separates scope and name.
 * @param {string} globalNpmRoot - npm's global module directory.
 * @returns {string} "<globalNpmRoot>/<module segments>/dist/index.js".
 */
function buildGlobalModuleEntry(moduleName, globalNpmRoot) {
  const moduleSegments = moduleName.split("/");
  return import_node_path23.default.join(globalNpmRoot, ...moduleSegments, "dist", "index.js");
}
10434
+ function findAccessiblePath(paths) {
10435
+ for (const candidate of paths) {
10083
10436
  try {
10084
- const pkg = import_node_path22.default.join(dir, "package.json");
10085
- (0, import_node_fs10.accessSync)(pkg);
10086
- return dir;
10437
+ (0, import_node_fs10.accessSync)(candidate);
10438
+ return candidate;
10087
10439
  } catch {
10088
- const parent = import_node_path22.default.dirname(dir);
10089
- if (parent === dir) break;
10090
- dir = parent;
10091
10440
  }
10092
10441
  }
10093
- return import_node_path22.default.dirname(thisFile);
10442
+ return void 0;
10094
10443
  }
10095
- async function doLoadSdkModules() {
10444
+ async function tryImportLocalSdkModules() {
10096
10445
  try {
10097
10446
  [piCodingAgentModule, piAiModule] = await Promise.all([
10098
10447
  import("@mariozechner/pi-coding-agent"),
10099
10448
  import("@mariozechner/pi-ai")
10100
10449
  ]);
10450
+ return true;
10101
10451
  } catch {
10102
- if (await promptInstall()) {
10103
- const installDir = findAgentvRoot();
10104
- console.error(`Installing @mariozechner/pi-coding-agent into ${installDir}...`);
10105
- (0, import_node_child_process5.execSync)("bun add @mariozechner/pi-coding-agent", {
10106
- cwd: installDir,
10107
- stdio: "inherit"
10108
- });
10109
- [piCodingAgentModule, piAiModule] = await Promise.all([
10110
- import("@mariozechner/pi-coding-agent"),
10111
- import("@mariozechner/pi-ai")
10112
- ]);
10113
- } else {
10114
- throw new Error(
10115
- "pi-coding-agent SDK is not installed. Install it with:\n bun add @mariozechner/pi-coding-agent"
10116
- );
10452
+ return false;
10453
+ }
10454
+ }
10455
/**
 * Tries to load the pi SDK from the AgentV-managed install directory.
 *
 * pi-ai is looked up both next to pi-coding-agent and nested inside
 * pi-coding-agent's own node_modules. On success the module-level
 * `piCodingAgentModule` / `piAiModule` bindings are populated.
 *
 * @returns {Promise<boolean>} True when both modules were imported; false when
 *   an entry file is missing or an import fails.
 */
async function tryImportManagedSdkModules() {
  const join = (...segments) => import_node_path23.default.join(...segments);
  const managedRoot = findManagedSdkInstallRoot();
  const agentEntry = findAccessiblePath([
    join(managedRoot, "node_modules", "@mariozechner", "pi-coding-agent", "dist", "index.js")
  ]);
  const aiEntry = findAccessiblePath([
    join(managedRoot, "node_modules", "@mariozechner", "pi-ai", "dist", "index.js"),
    join(
      managedRoot,
      "node_modules",
      "@mariozechner",
      "pi-coding-agent",
      "node_modules",
      "@mariozechner",
      "pi-ai",
      "dist",
      "index.js"
    )
  ]);
  if (!(agentEntry && aiEntry)) {
    return false;
  }
  const toFileHref = (file) => (0, import_node_url3.pathToFileURL)(file).href;
  try {
    const loaded = await Promise.all([import(toFileHref(agentEntry)), import(toFileHref(aiEntry))]);
    [piCodingAgentModule, piAiModule] = loaded;
    return true;
  } catch {
    return false;
  }
}
10485
/**
 * Tries to load the pi SDK from npm's global module directory.
 *
 * pi-ai is looked up both as a global package of its own and nested inside
 * the global pi-coding-agent's node_modules. On success the module-level
 * `piCodingAgentModule` / `piAiModule` bindings are populated.
 *
 * @returns {Promise<boolean>} True when both modules were imported; false when
 *   the global root is unknown, an entry file is missing, or an import fails.
 */
async function tryImportGlobalSdkModules() {
  const globalNpmRoot = resolveGlobalNpmRoot();
  if (!globalNpmRoot) {
    return false;
  }
  const agentEntry = findAccessiblePath([
    buildGlobalModuleEntry("@mariozechner/pi-coding-agent", globalNpmRoot)
  ]);
  const nestedAiEntry = import_node_path23.default.join(
    globalNpmRoot,
    "@mariozechner",
    "pi-coding-agent",
    "node_modules",
    "@mariozechner",
    "pi-ai",
    "dist",
    "index.js"
  );
  const aiEntry = findAccessiblePath([
    buildGlobalModuleEntry("@mariozechner/pi-ai", globalNpmRoot),
    nestedAiEntry
  ]);
  if (!(agentEntry && aiEntry)) {
    return false;
  }
  const toFileHref = (file) => (0, import_node_url3.pathToFileURL)(file).href;
  try {
    const loaded = await Promise.all([import(toFileHref(agentEntry)), import(toFileHref(aiEntry))]);
    [piCodingAgentModule, piAiModule] = loaded;
    return true;
  } catch {
    return false;
  }
}
10515
/**
 * Installs @mariozechner/pi-coding-agent into `installDir` using npm.
 *
 * The directory is created when missing and seeded with a minimal
 * package.json. Without a manifest (or node_modules) in the directory, npm
 * walks up the tree and may treat an ancestor as the install root, in which
 * case nothing lands under `installDir/node_modules`.
 *
 * @param {string} installDir - Directory that should receive node_modules.
 * @returns {void}
 * @throws {Error} When the directory cannot be prepared or npm exits non-zero.
 */
function installSdkModules(installDir) {
  console.error(`Installing @mariozechner/pi-coding-agent into ${installDir} via npm...`);
  (0, import_node_fs10.mkdirSync)(installDir, { recursive: true });
  const manifestPath = import_node_path23.default.join(installDir, "package.json");
  if (!(0, import_node_fs10.existsSync)(manifestPath)) {
    // Pin npm's project root to installDir; an existing manifest is left alone.
    const manifest = { name: "agentv-pi-sdk", private: true };
    (0, import_node_fs10.writeFileSync)(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`);
  }
  (0, import_node_child_process5.execSync)("npm install --no-save --no-package-lock @mariozechner/pi-coding-agent", {
    cwd: installDir,
    stdio: "inherit"
  });
}
10523
+ async function doLoadSdkModules() {
10524
+ if (await tryImportLocalSdkModules() || await tryImportManagedSdkModules() || await tryImportGlobalSdkModules()) {
10525
+ return;
10526
+ }
10527
+ if (await promptInstall()) {
10528
+ const installDir = findManagedSdkInstallRoot();
10529
+ installSdkModules(installDir);
10530
+ if (await tryImportManagedSdkModules()) {
10531
+ return;
10117
10532
  }
10118
10533
  }
10534
+ throw new Error(
10535
+ "pi-coding-agent SDK is not installed. Install it with:\n npm install @mariozechner/pi-coding-agent"
10536
+ );
10119
10537
  }
10120
10538
  async function loadSdkModules() {
10121
10539
  if (!piCodingAgentModule || !piAiModule) {
@@ -10143,7 +10561,9 @@ async function loadSdkModules() {
10143
10561
  codingTools: piSdk.codingTools,
10144
10562
  toolMap,
10145
10563
  SessionManager: piSdk.SessionManager,
10146
- getModel: piAi.getModel
10564
+ getModel: piAi.getModel,
10565
+ // biome-ignore lint/suspicious/noExplicitAny: registerBuiltInApiProviders exists at runtime but not in type defs
10566
+ registerBuiltInApiProviders: piAi.registerBuiltInApiProviders
10147
10567
  };
10148
10568
  }
10149
10569
  var PiCodingAgentProvider = class {
@@ -10165,17 +10585,35 @@ var PiCodingAgentProvider = class {
10165
10585
  const startTime = (/* @__PURE__ */ new Date()).toISOString();
10166
10586
  const startMs = Date.now();
10167
10587
  const sdk = await loadSdkModules();
10588
+ sdk.registerBuiltInApiProviders();
10168
10589
  const logger = await this.createStreamLogger(request).catch(() => void 0);
10169
10590
  try {
10170
10591
  const cwd = this.resolveCwd(request.cwd);
10171
- const providerName = this.config.subprovider ?? "google";
10592
+ const rawProvider = this.config.subprovider ?? "google";
10593
+ const normalizedBaseUrl = this.normalizeSdkBaseUrl(rawProvider, this.config.baseUrl);
10594
+ const hasBaseUrl = !!normalizedBaseUrl;
10595
+ const providerName = resolveSubprovider(rawProvider, hasBaseUrl);
10172
10596
  const modelId = this.config.model ?? "gemini-2.5-flash";
10173
- this.setApiKeyEnv(providerName);
10174
- const model = sdk.getModel(providerName, modelId);
10597
+ this.setApiKeyEnv(rawProvider, hasBaseUrl);
10598
+ this.setBaseUrlEnv(rawProvider, normalizedBaseUrl, hasBaseUrl);
10599
+ let model = sdk.getModel(providerName, modelId);
10600
+ if (model && normalizedBaseUrl) {
10601
+ model = { ...model, baseUrl: normalizedBaseUrl };
10602
+ }
10175
10603
  if (!model) {
10176
- throw new Error(
10177
- `pi-coding-agent: getModel('${providerName}', '${modelId}') returned undefined. The model '${modelId}' is not registered for provider '${providerName}' in pi-ai. Check that subprovider and model are correct in your target config.`
10178
- );
10604
+ const envProvider = providerName.replace(/-responses$/, "");
10605
+ model = {
10606
+ id: modelId,
10607
+ name: modelId,
10608
+ api: providerName,
10609
+ provider: envProvider,
10610
+ baseUrl: normalizedBaseUrl ?? "",
10611
+ reasoning: false,
10612
+ input: ["text"],
10613
+ cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
10614
+ contextWindow: 128e3,
10615
+ maxTokens: 16384
10616
+ };
10179
10617
  }
10180
10618
  const tools = this.resolveTools(sdk);
10181
10619
  const { session } = await sdk.createAgentSession({
@@ -10328,28 +10766,35 @@ ${fileList}`;
10328
10766
  }
10329
10767
  }
10330
10768
  /** Maps config apiKey to the provider-specific env var the SDK reads. */
10331
- setApiKeyEnv(providerName) {
10769
+ setApiKeyEnv(providerName, hasBaseUrl = false) {
10332
10770
  if (!this.config.apiKey) return;
10333
- const ENV_KEY_MAP = {
10334
- google: "GEMINI_API_KEY",
10335
- gemini: "GEMINI_API_KEY",
10336
- anthropic: "ANTHROPIC_API_KEY",
10337
- openai: "OPENAI_API_KEY",
10338
- groq: "GROQ_API_KEY",
10339
- xai: "XAI_API_KEY",
10340
- openrouter: "OPENROUTER_API_KEY"
10341
- };
10342
- const envKey = ENV_KEY_MAP[providerName.toLowerCase()];
10771
+ const envKey = resolveEnvKeyName(providerName, hasBaseUrl);
10343
10772
  if (envKey) {
10344
10773
  process.env[envKey] = this.config.apiKey;
10345
10774
  }
10346
10775
  }
10776
+ /** Maps config baseUrl to the provider-specific env var the SDK reads. */
10777
+ setBaseUrlEnv(providerName, baseUrl = this.config.baseUrl, hasBaseUrl = false) {
10778
+ const normalizedBaseUrl = this.normalizeSdkBaseUrl(providerName, baseUrl);
10779
+ if (!normalizedBaseUrl) return;
10780
+ const envKey = resolveEnvBaseUrlName(providerName, hasBaseUrl);
10781
+ if (envKey) {
10782
+ process.env[envKey] = normalizedBaseUrl;
10783
+ }
10784
+ }
10785
+ normalizeSdkBaseUrl(providerName, baseUrl) {
10786
+ if (!baseUrl) return void 0;
10787
+ if (providerName.toLowerCase() === "azure") {
10788
+ return normalizeAzureSdkBaseUrl(baseUrl);
10789
+ }
10790
+ return baseUrl;
10791
+ }
10347
10792
  resolveCwd(cwdOverride) {
10348
10793
  if (cwdOverride) {
10349
- return import_node_path22.default.resolve(cwdOverride);
10794
+ return import_node_path23.default.resolve(cwdOverride);
10350
10795
  }
10351
10796
  if (this.config.cwd) {
10352
- return import_node_path22.default.resolve(this.config.cwd);
10797
+ return import_node_path23.default.resolve(this.config.cwd);
10353
10798
  }
10354
10799
  return process.cwd();
10355
10800
  }
@@ -10368,9 +10813,9 @@ ${fileList}`;
10368
10813
  }
10369
10814
  resolveLogDirectory() {
10370
10815
  if (this.config.logDir) {
10371
- return import_node_path22.default.resolve(this.config.logDir);
10816
+ return import_node_path23.default.resolve(this.config.logDir);
10372
10817
  }
10373
- return import_node_path22.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
10818
+ return import_node_path23.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
10374
10819
  }
10375
10820
  async createStreamLogger(request) {
10376
10821
  const logDir = this.resolveLogDirectory();
@@ -10384,7 +10829,7 @@ ${fileList}`;
10384
10829
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
10385
10830
  return void 0;
10386
10831
  }
10387
- const filePath = import_node_path22.default.join(logDir, buildLogFilename6(request, this.targetName));
10832
+ const filePath = import_node_path23.default.join(logDir, buildLogFilename6(request, this.targetName));
10388
10833
  try {
10389
10834
  const logger = await PiStreamLogger2.create({
10390
10835
  filePath,
@@ -10599,19 +11044,17 @@ var ProviderRegistry = class {
10599
11044
 
10600
11045
  // src/evaluation/providers/targets.ts
10601
11046
  init_cjs_shims();
10602
- var import_node_path23 = __toESM(require("path"), 1);
11047
+ var import_node_path24 = __toESM(require("path"), 1);
10603
11048
  var import_zod3 = require("zod");
10604
11049
  var CliHealthcheckHttpInputSchema = import_zod3.z.object({
10605
11050
  url: import_zod3.z.string().min(1, "healthcheck URL is required"),
10606
- timeout_seconds: import_zod3.z.number().positive().optional(),
10607
- timeoutSeconds: import_zod3.z.number().positive().optional()
10608
- });
11051
+ timeout_seconds: import_zod3.z.number().positive().optional()
11052
+ }).passthrough();
10609
11053
  var CliHealthcheckCommandInputSchema = import_zod3.z.object({
10610
11054
  command: import_zod3.z.string().min(1, "healthcheck command is required"),
10611
11055
  cwd: import_zod3.z.string().optional(),
10612
- timeout_seconds: import_zod3.z.number().positive().optional(),
10613
- timeoutSeconds: import_zod3.z.number().positive().optional()
10614
- });
11056
+ timeout_seconds: import_zod3.z.number().positive().optional()
11057
+ }).passthrough();
10615
11058
  var CliHealthcheckInputSchema = import_zod3.z.union([
10616
11059
  CliHealthcheckHttpInputSchema,
10617
11060
  CliHealthcheckCommandInputSchema
@@ -10623,36 +11066,28 @@ var CliTargetInputSchema = import_zod3.z.object({
10623
11066
  command: import_zod3.z.string(),
10624
11067
  // Files format - optional
10625
11068
  files_format: import_zod3.z.string().optional(),
10626
- filesFormat: import_zod3.z.string().optional(),
10627
11069
  attachments_format: import_zod3.z.string().optional(),
10628
- attachmentsFormat: import_zod3.z.string().optional(),
10629
11070
  // Working directory - optional
10630
11071
  cwd: import_zod3.z.string().optional(),
10631
11072
  // Workspace template directory - optional (mutually exclusive with cwd)
10632
11073
  workspace_template: import_zod3.z.string().optional(),
10633
- workspaceTemplate: import_zod3.z.string().optional(),
10634
11074
  // Timeout in seconds - optional
10635
11075
  timeout_seconds: import_zod3.z.number().positive().optional(),
10636
- timeoutSeconds: import_zod3.z.number().positive().optional(),
10637
11076
  // Healthcheck configuration - optional
10638
11077
  healthcheck: CliHealthcheckInputSchema.optional(),
10639
11078
  // Verbose mode - optional
10640
11079
  verbose: import_zod3.z.boolean().optional(),
10641
11080
  cli_verbose: import_zod3.z.boolean().optional(),
10642
- cliVerbose: import_zod3.z.boolean().optional(),
10643
11081
  // Keep temp files - optional
10644
11082
  keep_temp_files: import_zod3.z.boolean().optional(),
10645
- keepTempFiles: import_zod3.z.boolean().optional(),
10646
11083
  keep_output_files: import_zod3.z.boolean().optional(),
10647
- keepOutputFiles: import_zod3.z.boolean().optional(),
10648
11084
  // Common target fields
10649
11085
  grader_target: import_zod3.z.string().optional(),
10650
11086
  judge_target: import_zod3.z.string().optional(),
10651
11087
  // backward compat
10652
11088
  workers: import_zod3.z.number().int().min(1).optional(),
10653
- provider_batching: import_zod3.z.boolean().optional(),
10654
- providerBatching: import_zod3.z.boolean().optional()
10655
- });
11089
+ provider_batching: import_zod3.z.boolean().optional()
11090
+ }).passthrough();
10656
11091
  var CliHealthcheckHttpSchema = import_zod3.z.object({
10657
11092
  url: import_zod3.z.string().min(1),
10658
11093
  timeoutMs: import_zod3.z.number().positive().optional()
@@ -10677,7 +11112,7 @@ var CliTargetConfigSchema = import_zod3.z.object({
10677
11112
  keepTempFiles: import_zod3.z.boolean().optional()
10678
11113
  }).strict();
10679
11114
  function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
10680
- const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
11115
+ const timeoutSeconds = input.timeout_seconds;
10681
11116
  const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
10682
11117
  if ("url" in input && input.url) {
10683
11118
  const url = resolveString(input.url, env, `${targetName} healthcheck URL`);
@@ -10696,11 +11131,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
10696
11131
  allowLiteral: true,
10697
11132
  optionalEnv: true
10698
11133
  });
10699
- if (cwd && evalFilePath && !import_node_path23.default.isAbsolute(cwd)) {
10700
- cwd = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), cwd);
11134
+ if (cwd && evalFilePath && !import_node_path24.default.isAbsolute(cwd)) {
11135
+ cwd = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), cwd);
10701
11136
  }
10702
11137
  if (!cwd && evalFilePath) {
10703
- cwd = import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath));
11138
+ cwd = import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath));
10704
11139
  }
10705
11140
  return {
10706
11141
  command,
@@ -10711,9 +11146,9 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
10711
11146
  function normalizeCliTargetInput(input, env, evalFilePath) {
10712
11147
  const targetName = input.name;
10713
11148
  const command = resolveString(input.command, env, `${targetName} CLI command`, true);
10714
- const filesFormatSource = input.files_format ?? input.filesFormat ?? input.attachments_format ?? input.attachmentsFormat;
11149
+ const filesFormatSource = input.files_format ?? input.attachments_format;
10715
11150
  const filesFormat = resolveOptionalLiteralString(filesFormatSource);
10716
- const workspaceTemplateSource = input.workspace_template ?? input.workspaceTemplate;
11151
+ const workspaceTemplateSource = input.workspace_template;
10717
11152
  let workspaceTemplate = resolveOptionalString(
10718
11153
  workspaceTemplateSource,
10719
11154
  env,
@@ -10723,15 +11158,15 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
10723
11158
  optionalEnv: true
10724
11159
  }
10725
11160
  );
10726
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
10727
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
11161
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
11162
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
10728
11163
  }
10729
11164
  let cwd = resolveOptionalString(input.cwd, env, `${targetName} working directory`, {
10730
11165
  allowLiteral: true,
10731
11166
  optionalEnv: true
10732
11167
  });
10733
- if (cwd && evalFilePath && !import_node_path23.default.isAbsolute(cwd)) {
10734
- cwd = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), cwd);
11168
+ if (cwd && evalFilePath && !import_node_path24.default.isAbsolute(cwd)) {
11169
+ cwd = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), cwd);
10735
11170
  }
10736
11171
  if (cwd && workspaceTemplate) {
10737
11172
  throw new Error(
@@ -10739,14 +11174,12 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
10739
11174
  );
10740
11175
  }
10741
11176
  if (!cwd && !workspaceTemplate && evalFilePath) {
10742
- cwd = import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath));
11177
+ cwd = import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath));
10743
11178
  }
10744
- const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
11179
+ const timeoutSeconds = input.timeout_seconds;
10745
11180
  const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
10746
- const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose ?? input.cliVerbose);
10747
- const keepTempFiles = resolveOptionalBoolean(
10748
- input.keep_temp_files ?? input.keepTempFiles ?? input.keep_output_files ?? input.keepOutputFiles
10749
- );
11181
+ const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose);
11182
+ const keepTempFiles = resolveOptionalBoolean(input.keep_temp_files ?? input.keep_output_files);
10750
11183
  const healthcheck = input.healthcheck ? normalizeCliHealthcheck(input.healthcheck, env, targetName, evalFilePath) : void 0;
10751
11184
  return {
10752
11185
  command,
@@ -10767,15 +11200,106 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
10767
11200
  "FILES",
10768
11201
  "OUTPUT_FILE"
10769
11202
  ]);
11203
+ var DEPRECATED_TARGET_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
11204
+ ["providerBatching", "provider_batching"],
11205
+ ["subagentModeAllowed", "subagent_mode_allowed"],
11206
+ ["fallbackTargets", "fallback_targets"],
11207
+ ["resourceName", "endpoint"],
11208
+ ["baseUrl", "base_url"],
11209
+ ["apiKey", "api_key"],
11210
+ ["deploymentName", "model"],
11211
+ ["thinkingBudget", "thinking_budget"],
11212
+ ["maxTokens", "max_output_tokens"],
11213
+ ["apiFormat", "api_format"],
11214
+ ["timeoutSeconds", "timeout_seconds"],
11215
+ ["logDir", "log_dir"],
11216
+ ["logDirectory", "log_directory"],
11217
+ ["logFormat", "log_format"],
11218
+ ["logOutputFormat", "log_output_format"],
11219
+ ["systemPrompt", "system_prompt"],
11220
+ ["maxTurns", "max_turns"],
11221
+ ["maxBudgetUsd", "max_budget_usd"],
11222
+ ["dryRun", "dry_run"],
11223
+ ["subagentRoot", "subagent_root"],
11224
+ ["filesFormat", "files_format"],
11225
+ ["attachmentsFormat", "attachments_format"],
11226
+ ["cliUrl", "cli_url"],
11227
+ ["cliPath", "cli_path"],
11228
+ ["githubToken", "github_token"],
11229
+ ["sessionDir", "session_dir"],
11230
+ ["sessionId", "session_id"],
11231
+ ["sessionStateDir", "session_state_dir"],
11232
+ ["maxRetries", "max_retries"],
11233
+ ["retryInitialDelayMs", "retry_initial_delay_ms"],
11234
+ ["retryMaxDelayMs", "retry_max_delay_ms"],
11235
+ ["retryBackoffFactor", "retry_backoff_factor"],
11236
+ ["retryStatusCodes", "retry_status_codes"]
11237
+ ]);
11238
+ var DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
11239
+ ["timeoutSeconds", "timeout_seconds"]
11240
+ ]);
11241
/**
 * Lists the removed camelCase keys that are present as own properties of `value`.
 *
 * @param {unknown} value - Candidate object; anything that is not a non-null,
 *   non-array object produces no warnings.
 * @param {string} location - Prefix for each warning's `location` (the key is appended after a dot).
 * @param {Iterable<[string, string]>} aliases - Pairs of [camelCase key, snake_case replacement].
 * @returns {{ location: string, message: string }[]} One warning per matching key, in `aliases` order.
 */
function collectDeprecatedCamelCaseWarnings(value, location, aliases) {
  const isKeyedObject = typeof value === "object" && value !== null && !Array.isArray(value);
  if (!isKeyedObject) {
    return [];
  }
  const hasOwnKey = (key) => Object.prototype.hasOwnProperty.call(value, key);
  return [...aliases].filter(([camelCaseField]) => hasOwnKey(camelCaseField)).map(([camelCaseField, snakeCaseField]) => ({
    location: `${location}.${camelCaseField}`,
    message: `camelCase field '${camelCaseField}' is no longer supported in targets.yaml. Use '${snakeCaseField}' instead.`
  }));
}
11256
/**
 * Rejects target definitions that still use removed camelCase field names.
 *
 * A `null`/`undefined` definition is ignored here instead of crashing inside
 * `hasOwnProperty.call`, so that later validation (for example the schema
 * parse in `resolveTargetDefinition`) can report the malformed definition.
 *
 * @param {object} definition - Raw target definition as read from targets.yaml.
 * @throws {Error} When `workspaceTemplate` is present, or for the first
 *   deprecated camelCase field reported by `findDeprecatedCamelCaseTargetWarnings`.
 */
function assertNoDeprecatedCamelCaseTargetFields(definition) {
  if (definition == null) {
    return;
  }
  // workspaceTemplate gets its own message: the snake_case form was removed too.
  if (Object.prototype.hasOwnProperty.call(definition, "workspaceTemplate")) {
    throw new Error(
      `${definition.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
    );
  }
  const [firstWarning] = findDeprecatedCamelCaseTargetWarnings(
    definition,
    `target "${definition.name}"`
  );
  if (!firstWarning) {
    return;
  }
  // The warning message already names the field and its replacement; reuse it
  // verbatim rather than parsing it back apart with regular expressions.
  throw new Error(`${firstWarning.location}: ${firstWarning.message}`);
}
11277
/**
 * Collects deprecated camelCase warnings for a target and its `healthcheck` entry.
 *
 * @param {unknown} target - Raw target definition.
 * @param {string} location - Location prefix used in the warnings.
 * @returns {{ location: string, message: string }[]} Target-level warnings first,
 *   followed by those found under `<location>.healthcheck`.
 */
function findDeprecatedCamelCaseTargetWarnings(target, location) {
  const targetWarnings = collectDeprecatedCamelCaseWarnings(
    target,
    location,
    DEPRECATED_TARGET_CAMEL_CASE_FIELDS
  );
  const isKeyedObject = typeof target === "object" && target !== null && !Array.isArray(target);
  if (!isKeyedObject) {
    return targetWarnings;
  }
  const healthcheckWarnings = collectDeprecatedCamelCaseWarnings(
    target.healthcheck,
    `${location}.healthcheck`,
    DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS
  );
  return targetWarnings.concat(healthcheckWarnings);
}
10770
11296
  var COMMON_TARGET_SETTINGS = [
10771
11297
  "use_target",
10772
11298
  "provider_batching",
10773
- "providerBatching",
10774
11299
  "subagent_mode_allowed",
10775
- "subagentModeAllowed",
10776
- "fallback_targets",
10777
- "fallbackTargets"
11300
+ "fallback_targets"
10778
11301
  ];
11302
+ var USE_TARGET_ENV_PATTERN = /^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i;
10779
11303
  var BASE_TARGET_SCHEMA = import_zod3.z.object({
10780
11304
  name: import_zod3.z.string().min(1, "target name is required"),
10781
11305
  provider: import_zod3.z.string().optional(),
@@ -10785,43 +11309,40 @@ var BASE_TARGET_SCHEMA = import_zod3.z.object({
10785
11309
  // backward compat
10786
11310
  workers: import_zod3.z.number().int().min(1).optional(),
10787
11311
  workspace_template: import_zod3.z.string().optional(),
10788
- workspaceTemplate: import_zod3.z.string().optional(),
10789
11312
  subagent_mode_allowed: import_zod3.z.boolean().optional(),
10790
- fallback_targets: import_zod3.z.array(import_zod3.z.string().min(1)).optional(),
10791
- fallbackTargets: import_zod3.z.array(import_zod3.z.string().min(1)).optional()
11313
+ fallback_targets: import_zod3.z.array(import_zod3.z.string().min(1)).optional()
10792
11314
  }).passthrough();
10793
11315
  var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
11316
+ var DEFAULT_AZURE_RESPONSES_API_VERSION = "v1";
10794
11317
  var DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
10795
/**
 * Cleans up a user-supplied Azure API version.
 *
 * Surrounding whitespace and a leading `api-version=` / `api_version=` /
 * `apiversion=` prefix (case-insensitive) are removed. When nothing usable
 * remains, the default for the API format is returned.
 *
 * @param {string | undefined} value - Raw version string, if configured.
 * @param {string | undefined} apiFormat - "responses" selects the Responses API default.
 * @returns {string} The cleaned version or the applicable default.
 */
function normalizeAzureApiVersion(value, apiFormat) {
  const fallbackVersion = apiFormat === "responses" ? DEFAULT_AZURE_RESPONSES_API_VERSION : DEFAULT_AZURE_API_VERSION;
  if (!value) {
    return fallbackVersion;
  }
  const candidate = value.trim().replace(/^api[-_]?version\s*=\s*/i, "").trim();
  return candidate.length > 0 ? candidate : fallbackVersion;
}
10806
11330
  function resolveRetryConfig(target) {
10807
- const maxRetries = resolveOptionalNumber(
10808
- target.max_retries ?? target.maxRetries,
10809
- `${target.name} max retries`
10810
- );
11331
+ const maxRetries = resolveOptionalNumber(target.max_retries, `${target.name} max retries`);
10811
11332
  const initialDelayMs = resolveOptionalNumber(
10812
- target.retry_initial_delay_ms ?? target.retryInitialDelayMs,
11333
+ target.retry_initial_delay_ms,
10813
11334
  `${target.name} retry initial delay`
10814
11335
  );
10815
11336
  const maxDelayMs = resolveOptionalNumber(
10816
- target.retry_max_delay_ms ?? target.retryMaxDelayMs,
11337
+ target.retry_max_delay_ms,
10817
11338
  `${target.name} retry max delay`
10818
11339
  );
10819
11340
  const backoffFactor = resolveOptionalNumber(
10820
- target.retry_backoff_factor ?? target.retryBackoffFactor,
11341
+ target.retry_backoff_factor,
10821
11342
  `${target.name} retry backoff factor`
10822
11343
  );
10823
11344
  const retryableStatusCodes = resolveOptionalNumberArray(
10824
- target.retry_status_codes ?? target.retryStatusCodes,
11345
+ target.retry_status_codes,
10825
11346
  `${target.name} retry status codes`
10826
11347
  );
10827
11348
  if (maxRetries === void 0 && initialDelayMs === void 0 && maxDelayMs === void 0 && backoffFactor === void 0 && retryableStatusCodes === void 0) {
@@ -10835,9 +11356,56 @@ function resolveRetryConfig(target) {
10835
11356
  retryableStatusCodes
10836
11357
  };
10837
11358
  }
10838
- function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
11359
/**
 * Follows `use_target` links from a named target until a definition without
 * a `use_target` string is reached.
 *
 * A `use_target` value matching `USE_TARGET_ENV_PATTERN` is read from `env`
 * (trimmed); any other value is used as the target name directly.
 *
 * @param {string} name - Name of the target to start from.
 * @param {Map<string, object>} definitions - Target definitions; looked up via `.get(name)`.
 * @param {Record<string, string | undefined>} [env=process.env] - Environment used for `${{ VAR }}` lookups.
 * @returns {object | undefined} The final definition, or `undefined` when `name` is unknown.
 * @throws {Error} On an unset environment variable, an unknown delegate, a
 *   circular chain, or when the chain is still delegating after 10 links.
 */
function resolveDelegatedTargetDefinition(name, definitions, env = process.env) {
  const startingPoint = definitions.get(name);
  if (!startingPoint) {
    return void 0;
  }
  const chainNames = [startingPoint.name];
  let current = startingPoint;
  let hop = 0;
  while (hop < 10) {
    const pointer = typeof current.use_target === "string" ? current.use_target.trim() : void 0;
    if (!pointer) {
      return current;
    }
    const envVarName = pointer.match(USE_TARGET_ENV_PATTERN)?.[1];
    const nextName = envVarName ? env[envVarName]?.trim() ?? "" : pointer;
    if (nextName.length === 0) {
      // `pointer` is non-empty at this point, so an empty name can only come
      // from the environment lookup; the second message is kept as a safeguard.
      throw new Error(
        envVarName ? `Target "${current.name}" uses use_target: \${{ ${envVarName} }}, but ${envVarName} is not set. Set ${envVarName} to the name of a concrete target (for example, "azure") before running the eval.` : `Target "${current.name}" has an empty use_target value. Point it at a concrete target name before running the eval.`
      );
    }
    const nextDefinition = definitions.get(nextName);
    if (!nextDefinition) {
      throw new Error(
        envVarName ? `Target "${current.name}" uses use_target: \${{ ${envVarName} }}, which resolved to "${nextName}", but no target named "${nextName}" exists.` : `Target "${current.name}" uses use_target: "${nextName}", but no target named "${nextName}" exists.`
      );
    }
    if (chainNames.includes(nextDefinition.name)) {
      const chain = [...chainNames, nextDefinition.name].join(" -> ");
      throw new Error(`Circular use_target reference detected: ${chain}`);
    }
    chainNames.push(nextDefinition.name);
    current = nextDefinition;
    hop += 1;
  }
  throw new Error(
    `Target "${name}" exceeded the maximum use_target resolution depth (10). Check for a delegation loop or overly deep alias chain.`
  );
}
11405
+ function resolveTargetDefinition(definition, env = process.env, evalFilePath, options) {
11406
+ assertNoDeprecatedCamelCaseTargetFields(definition);
10839
11407
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
10840
- if (parsed.workspace_template !== void 0 || parsed.workspaceTemplate !== void 0) {
11408
+ if (parsed.workspace_template !== void 0) {
10841
11409
  throw new Error(
10842
11410
  `${parsed.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
10843
11411
  );
@@ -10853,13 +11421,9 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
10853
11421
  `${parsed.name} provider`,
10854
11422
  true
10855
11423
  ).toLowerCase();
10856
- const providerBatching = resolveOptionalBoolean(
10857
- parsed.provider_batching ?? parsed.providerBatching
10858
- );
10859
- const subagentModeAllowed = resolveOptionalBoolean(
10860
- parsed.subagent_mode_allowed ?? parsed.subagentModeAllowed
10861
- );
10862
- const fallbackTargets = parsed.fallback_targets ?? parsed.fallbackTargets;
11424
+ const providerBatching = resolveOptionalBoolean(parsed.provider_batching);
11425
+ const subagentModeAllowed = resolveOptionalBoolean(parsed.subagent_mode_allowed);
11426
+ const fallbackTargets = parsed.fallback_targets;
10863
11427
  const base = {
10864
11428
  name: parsed.name,
10865
11429
  graderTarget: parsed.grader_target ?? parsed.judge_target,
@@ -11009,20 +11573,22 @@ function normalizeOpenAIBaseUrl(value) {
11009
11573
  return trimmed.endsWith("/v1") ? trimmed : `${trimmed}/v1`;
11010
11574
  }
11011
11575
  function resolveAzureConfig(target, env) {
11012
- const endpointSource = target.endpoint ?? target.resource ?? target.resourceName;
11013
- const apiKeySource = target.api_key ?? target.apiKey;
11014
- const deploymentSource = target.deployment ?? target.deploymentName ?? target.model;
11576
+ const endpointSource = target.endpoint ?? target.resource;
11577
+ const apiKeySource = target.api_key;
11578
+ const deploymentSource = target.deployment ?? target.model;
11015
11579
  const versionSource = target.version ?? target.api_version;
11016
11580
  const temperatureSource = target.temperature;
11017
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
11581
+ const maxTokensSource = target.max_output_tokens;
11018
11582
  const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
11019
11583
  const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
11020
11584
  const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
11585
+ const apiFormat = resolveApiFormat(target, env, target.name);
11021
11586
  const version = normalizeAzureApiVersion(
11022
11587
  resolveOptionalString(versionSource, env, `${target.name} api version`, {
11023
11588
  allowLiteral: true,
11024
11589
  optionalEnv: true
11025
- })
11590
+ }),
11591
+ apiFormat
11026
11592
  );
11027
11593
  const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
11028
11594
  const maxOutputTokens = resolveOptionalNumber(
@@ -11035,13 +11601,17 @@ function resolveAzureConfig(target, env) {
11035
11601
  deploymentName,
11036
11602
  apiKey,
11037
11603
  version,
11604
+ apiFormat,
11038
11605
  temperature,
11039
11606
  maxOutputTokens,
11040
11607
  retry
11041
11608
  };
11042
11609
  }
11043
- function resolveApiFormat(target, targetName) {
11044
- const raw = target.api_format ?? target.apiFormat;
11610
+ function resolveApiFormat(target, env, targetName) {
11611
+ const raw = resolveOptionalString(target.api_format, env, `${targetName} api format`, {
11612
+ allowLiteral: true,
11613
+ optionalEnv: true
11614
+ });
11045
11615
  if (raw === void 0) return void 0;
11046
11616
  if (raw === "chat" || raw === "responses") return raw;
11047
11617
  throw new Error(
@@ -11049,11 +11619,11 @@ function resolveApiFormat(target, targetName) {
11049
11619
  );
11050
11620
  }
11051
11621
  function resolveOpenAIConfig(target, env) {
11052
- const endpointSource = target.endpoint ?? target.base_url ?? target.baseUrl;
11053
- const apiKeySource = target.api_key ?? target.apiKey;
11622
+ const endpointSource = target.endpoint ?? target.base_url;
11623
+ const apiKeySource = target.api_key;
11054
11624
  const modelSource = target.model ?? target.deployment ?? target.variant;
11055
11625
  const temperatureSource = target.temperature;
11056
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
11626
+ const maxTokensSource = target.max_output_tokens;
11057
11627
  const baseURL = normalizeOpenAIBaseUrl(
11058
11628
  resolveOptionalString(endpointSource, env, `${target.name} endpoint`, {
11059
11629
  allowLiteral: true,
@@ -11067,17 +11637,17 @@ function resolveOpenAIConfig(target, env) {
11067
11637
  baseURL,
11068
11638
  apiKey,
11069
11639
  model,
11070
- apiFormat: resolveApiFormat(target, target.name),
11640
+ apiFormat: resolveApiFormat(target, env, target.name),
11071
11641
  temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
11072
11642
  maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
11073
11643
  retry
11074
11644
  };
11075
11645
  }
11076
11646
  function resolveOpenRouterConfig(target, env) {
11077
- const apiKeySource = target.api_key ?? target.apiKey;
11647
+ const apiKeySource = target.api_key;
11078
11648
  const modelSource = target.model ?? target.deployment ?? target.variant;
11079
11649
  const temperatureSource = target.temperature;
11080
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
11650
+ const maxTokensSource = target.max_output_tokens;
11081
11651
  const retry = resolveRetryConfig(target);
11082
11652
  return {
11083
11653
  apiKey: resolveString(apiKeySource, env, `${target.name} OpenRouter api key`),
@@ -11088,11 +11658,11 @@ function resolveOpenRouterConfig(target, env) {
11088
11658
  };
11089
11659
  }
11090
11660
  function resolveAnthropicConfig(target, env) {
11091
- const apiKeySource = target.api_key ?? target.apiKey;
11661
+ const apiKeySource = target.api_key;
11092
11662
  const modelSource = target.model ?? target.deployment ?? target.variant;
11093
11663
  const temperatureSource = target.temperature;
11094
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
11095
- const thinkingBudgetSource = target.thinking_budget ?? target.thinkingBudget;
11664
+ const maxTokensSource = target.max_output_tokens;
11665
+ const thinkingBudgetSource = target.thinking_budget;
11096
11666
  const apiKey = resolveString(apiKeySource, env, `${target.name} Anthropic api key`);
11097
11667
  const model = resolveString(modelSource, env, `${target.name} Anthropic model`);
11098
11668
  const retry = resolveRetryConfig(target);
@@ -11106,10 +11676,10 @@ function resolveAnthropicConfig(target, env) {
11106
11676
  };
11107
11677
  }
11108
11678
  function resolveGeminiConfig(target, env) {
11109
- const apiKeySource = target.api_key ?? target.apiKey;
11679
+ const apiKeySource = target.api_key;
11110
11680
  const modelSource = target.model ?? target.deployment ?? target.variant;
11111
11681
  const temperatureSource = target.temperature;
11112
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
11682
+ const maxTokensSource = target.max_output_tokens;
11113
11683
  const apiKey = resolveString(apiKeySource, env, `${target.name} Google API key`);
11114
11684
  const model = resolveOptionalString(modelSource, env, `${target.name} Gemini model`, {
11115
11685
  allowLiteral: true,
@@ -11129,11 +11699,11 @@ function resolveCodexConfig(target, env, evalFilePath) {
11129
11699
  const executableSource = target.executable ?? target.command ?? target.binary;
11130
11700
  const argsSource = target.args ?? target.arguments;
11131
11701
  const cwdSource = target.cwd;
11132
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11133
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11134
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11135
- const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
11136
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
11702
+ const workspaceTemplateSource = target.workspace_template;
11703
+ const timeoutSource = target.timeout_seconds;
11704
+ const logDirSource = target.log_dir ?? target.log_directory;
11705
+ const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CODEX_LOG_FORMAT;
11706
+ const systemPromptSource = target.system_prompt;
11137
11707
  const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, {
11138
11708
  allowLiteral: true,
11139
11709
  optionalEnv: true
@@ -11156,8 +11726,8 @@ function resolveCodexConfig(target, env, evalFilePath) {
11156
11726
  optionalEnv: true
11157
11727
  }
11158
11728
  );
11159
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11160
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
11729
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
11730
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11161
11731
  }
11162
11732
  if (cwd && workspaceTemplate) {
11163
11733
  throw new Error(
@@ -11197,16 +11767,16 @@ function normalizeCodexLogFormat(value) {
11197
11767
  throw new Error("codex log format must be 'summary' or 'json'");
11198
11768
  }
11199
11769
  function resolveCopilotSdkConfig(target, env, evalFilePath) {
11200
- const cliUrlSource = target.cli_url ?? target.cliUrl;
11201
- const cliPathSource = target.cli_path ?? target.cliPath;
11202
- const githubTokenSource = target.github_token ?? target.githubToken;
11770
+ const cliUrlSource = target.cli_url;
11771
+ const cliPathSource = target.cli_path;
11772
+ const githubTokenSource = target.github_token;
11203
11773
  const modelSource = target.model;
11204
11774
  const cwdSource = target.cwd;
11205
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11206
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11207
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11208
- const logFormatSource = target.log_format ?? target.logFormat;
11209
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
11775
+ const workspaceTemplateSource = target.workspace_template;
11776
+ const timeoutSource = target.timeout_seconds;
11777
+ const logDirSource = target.log_dir ?? target.log_directory;
11778
+ const logFormatSource = target.log_format;
11779
+ const systemPromptSource = target.system_prompt;
11210
11780
  const cliUrl = resolveOptionalString(cliUrlSource, env, `${target.name} copilot-sdk cli URL`, {
11211
11781
  allowLiteral: true,
11212
11782
  optionalEnv: true
@@ -11241,8 +11811,8 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
11241
11811
  optionalEnv: true
11242
11812
  }
11243
11813
  );
11244
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11245
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
11814
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
11815
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11246
11816
  }
11247
11817
  if (cwd && workspaceTemplate) {
11248
11818
  throw new Error(
@@ -11279,11 +11849,11 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
11279
11849
  const modelSource = target.model;
11280
11850
  const argsSource = target.args ?? target.arguments;
11281
11851
  const cwdSource = target.cwd;
11282
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11283
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11284
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11285
- const logFormatSource = target.log_format ?? target.logFormat;
11286
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
11852
+ const workspaceTemplateSource = target.workspace_template;
11853
+ const timeoutSource = target.timeout_seconds;
11854
+ const logDirSource = target.log_dir ?? target.log_directory;
11855
+ const logFormatSource = target.log_format;
11856
+ const systemPromptSource = target.system_prompt;
11287
11857
  const executable = resolveOptionalString(executableSource, env, `${target.name} copilot-cli executable`, {
11288
11858
  allowLiteral: true,
11289
11859
  optionalEnv: true
@@ -11306,8 +11876,8 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
11306
11876
  optionalEnv: true
11307
11877
  }
11308
11878
  );
11309
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11310
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
11879
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
11880
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11311
11881
  }
11312
11882
  if (cwd && workspaceTemplate) {
11313
11883
  throw new Error(
@@ -11347,16 +11917,16 @@ function normalizeCopilotLogFormat(value) {
11347
11917
  }
11348
11918
  function resolvePiCodingAgentConfig(target, env, evalFilePath) {
11349
11919
  const subproviderSource = target.subprovider;
11350
- const modelSource = target.model ?? target.pi_model ?? target.piModel;
11351
- const apiKeySource = target.api_key ?? target.apiKey;
11352
- const toolsSource = target.tools ?? target.pi_tools ?? target.piTools;
11353
- const thinkingSource = target.thinking ?? target.pi_thinking ?? target.piThinking;
11920
+ const modelSource = target.model ?? target.pi_model;
11921
+ const apiKeySource = target.api_key;
11922
+ const toolsSource = target.tools ?? target.pi_tools;
11923
+ const thinkingSource = target.thinking ?? target.pi_thinking;
11354
11924
  const cwdSource = target.cwd;
11355
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11356
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11357
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11358
- const logFormatSource = target.log_format ?? target.logFormat;
11359
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
11925
+ const workspaceTemplateSource = target.workspace_template;
11926
+ const timeoutSource = target.timeout_seconds;
11927
+ const logDirSource = target.log_dir ?? target.log_directory;
11928
+ const logFormatSource = target.log_format;
11929
+ const systemPromptSource = target.system_prompt;
11360
11930
  const subprovider = resolveOptionalString(
11361
11931
  subproviderSource,
11362
11932
  env,
@@ -11374,6 +11944,11 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
11374
11944
  allowLiteral: false,
11375
11945
  optionalEnv: true
11376
11946
  });
11947
+ const baseUrlSource = target.base_url ?? target.endpoint;
11948
+ const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} pi base url`, {
11949
+ allowLiteral: true,
11950
+ optionalEnv: true
11951
+ });
11377
11952
  const tools = resolveOptionalString(toolsSource, env, `${target.name} pi tools`, {
11378
11953
  allowLiteral: true,
11379
11954
  optionalEnv: true
@@ -11395,8 +11970,8 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
11395
11970
  optionalEnv: true
11396
11971
  }
11397
11972
  );
11398
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11399
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
11973
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
11974
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11400
11975
  }
11401
11976
  if (cwd && workspaceTemplate) {
11402
11977
  throw new Error(
@@ -11414,6 +11989,7 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
11414
11989
  subprovider,
11415
11990
  model,
11416
11991
  apiKey,
11992
+ baseUrl,
11417
11993
  tools,
11418
11994
  thinking,
11419
11995
  cwd,
@@ -11427,16 +12003,16 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
11427
12003
  function resolvePiCliConfig(target, env, evalFilePath) {
11428
12004
  const executableSource = target.executable ?? target.command ?? target.binary;
11429
12005
  const subproviderSource = target.subprovider;
11430
- const modelSource = target.model ?? target.pi_model ?? target.piModel;
11431
- const apiKeySource = target.api_key ?? target.apiKey;
11432
- const toolsSource = target.tools ?? target.pi_tools ?? target.piTools;
11433
- const thinkingSource = target.thinking ?? target.pi_thinking ?? target.piThinking;
12006
+ const modelSource = target.model ?? target.pi_model;
12007
+ const apiKeySource = target.api_key;
12008
+ const toolsSource = target.tools ?? target.pi_tools;
12009
+ const thinkingSource = target.thinking ?? target.pi_thinking;
11434
12010
  const cwdSource = target.cwd;
11435
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11436
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11437
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11438
- const logFormatSource = target.log_format ?? target.logFormat;
11439
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
12011
+ const workspaceTemplateSource = target.workspace_template;
12012
+ const timeoutSource = target.timeout_seconds;
12013
+ const logDirSource = target.log_dir ?? target.log_directory;
12014
+ const logFormatSource = target.log_format;
12015
+ const systemPromptSource = target.system_prompt;
11440
12016
  const executable = resolveOptionalString(executableSource, env, `${target.name} pi-cli executable`, {
11441
12017
  allowLiteral: true,
11442
12018
  optionalEnv: true
@@ -11455,6 +12031,11 @@ function resolvePiCliConfig(target, env, evalFilePath) {
11455
12031
  allowLiteral: false,
11456
12032
  optionalEnv: true
11457
12033
  });
12034
+ const baseUrlSource = target.base_url ?? target.endpoint;
12035
+ const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} pi-cli base url`, {
12036
+ allowLiteral: true,
12037
+ optionalEnv: true
12038
+ });
11458
12039
  const tools = resolveOptionalString(toolsSource, env, `${target.name} pi-cli tools`, {
11459
12040
  allowLiteral: true,
11460
12041
  optionalEnv: true
@@ -11475,8 +12056,8 @@ function resolvePiCliConfig(target, env, evalFilePath) {
11475
12056
  `${target.name} pi-cli workspace template`,
11476
12057
  { allowLiteral: true, optionalEnv: true }
11477
12058
  );
11478
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11479
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
12059
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
12060
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11480
12061
  }
11481
12062
  if (cwd && workspaceTemplate) {
11482
12063
  throw new Error(`${target.name}: 'cwd' and 'workspace_template' are mutually exclusive.`);
@@ -11493,6 +12074,7 @@ function resolvePiCliConfig(target, env, evalFilePath) {
11493
12074
  subprovider,
11494
12075
  model,
11495
12076
  apiKey,
12077
+ baseUrl,
11496
12078
  tools,
11497
12079
  thinking,
11498
12080
  args,
@@ -11507,11 +12089,11 @@ function resolvePiCliConfig(target, env, evalFilePath) {
11507
12089
  function resolveClaudeConfig(target, env, evalFilePath) {
11508
12090
  const modelSource = target.model;
11509
12091
  const cwdSource = target.cwd;
11510
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
11511
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
11512
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
11513
- const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CLAUDE_LOG_FORMAT;
11514
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
12092
+ const workspaceTemplateSource = target.workspace_template;
12093
+ const timeoutSource = target.timeout_seconds;
12094
+ const logDirSource = target.log_dir ?? target.log_directory;
12095
+ const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CLAUDE_LOG_FORMAT;
12096
+ const systemPromptSource = target.system_prompt;
11515
12097
  const model = resolveOptionalString(modelSource, env, `${target.name} claude model`, {
11516
12098
  allowLiteral: true,
11517
12099
  optionalEnv: true
@@ -11529,8 +12111,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
11529
12111
  optionalEnv: true
11530
12112
  }
11531
12113
  );
11532
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11533
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
12114
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
12115
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11534
12116
  }
11535
12117
  if (cwd && workspaceTemplate) {
11536
12118
  throw new Error(
@@ -11544,8 +12126,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
11544
12126
  });
11545
12127
  const logFormat = normalizeClaudeLogFormat(logFormatSource);
11546
12128
  const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
11547
- const maxTurns = typeof target.max_turns === "number" ? target.max_turns : typeof target.maxTurns === "number" ? target.maxTurns : void 0;
11548
- const maxBudgetUsd = typeof target.max_budget_usd === "number" ? target.max_budget_usd : typeof target.maxBudgetUsd === "number" ? target.maxBudgetUsd : void 0;
12129
+ const maxTurns = typeof target.max_turns === "number" ? target.max_turns : void 0;
12130
+ const maxBudgetUsd = typeof target.max_budget_usd === "number" ? target.max_budget_usd : void 0;
11549
12131
  return {
11550
12132
  model,
11551
12133
  systemPrompt,
@@ -11576,9 +12158,7 @@ function resolveMockConfig(target) {
11576
12158
  return { response };
11577
12159
  }
11578
12160
  function resolveVSCodeConfig(target, env, insiders, evalFilePath) {
11579
- const workspaceTemplateEnvVar = resolveOptionalLiteralString(
11580
- target.workspace_template ?? target.workspaceTemplate
11581
- );
12161
+ const workspaceTemplateEnvVar = resolveOptionalLiteralString(target.workspace_template);
11582
12162
  let workspaceTemplate = workspaceTemplateEnvVar ? resolveOptionalString(
11583
12163
  workspaceTemplateEnvVar,
11584
12164
  env,
@@ -11588,14 +12168,14 @@ function resolveVSCodeConfig(target, env, insiders, evalFilePath) {
11588
12168
  optionalEnv: true
11589
12169
  }
11590
12170
  ) : void 0;
11591
- if (workspaceTemplate && evalFilePath && !import_node_path23.default.isAbsolute(workspaceTemplate)) {
11592
- workspaceTemplate = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), workspaceTemplate);
12171
+ if (workspaceTemplate && evalFilePath && !import_node_path24.default.isAbsolute(workspaceTemplate)) {
12172
+ workspaceTemplate = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), workspaceTemplate);
11593
12173
  }
11594
12174
  const executableSource = target.executable;
11595
12175
  const waitSource = target.wait;
11596
- const dryRunSource = target.dry_run ?? target.dryRun;
11597
- const subagentRootSource = target.subagent_root ?? target.subagentRoot;
11598
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
12176
+ const dryRunSource = target.dry_run;
12177
+ const subagentRootSource = target.subagent_root;
12178
+ const timeoutSource = target.timeout_seconds;
11599
12179
  const defaultCommand = insiders ? "code-insiders" : "code";
11600
12180
  const executable = resolveOptionalString(executableSource, env, `${target.name} vscode executable`, {
11601
12181
  allowLiteral: true,
@@ -11630,8 +12210,8 @@ function resolveCliConfig(target, env, evalFilePath) {
11630
12210
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
11631
12211
  if (!parseResult.success) {
11632
12212
  const firstError = parseResult.error.errors[0];
11633
- const path52 = firstError?.path.join(".") || "";
11634
- const prefix = path52 ? `${target.name} ${path52}: ` : `${target.name}: `;
12213
+ const path53 = firstError?.path.join(".") || "";
12214
+ const prefix = path53 ? `${target.name} ${path53}: ` : `${target.name}: `;
11635
12215
  throw new Error(`${prefix}${firstError?.message}`);
11636
12216
  }
11637
12217
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -11646,17 +12226,17 @@ function resolveCliConfig(target, env, evalFilePath) {
11646
12226
  }
11647
12227
  function resolveDiscoveredProviderConfig(target, providerKind, env, evalFilePath) {
11648
12228
  const command = target.command ? resolveString(target.command, env, `${target.name} command`, true) : `bun run .agentv/providers/${providerKind}.ts {PROMPT}`;
11649
- const timeoutSeconds = target.timeout_seconds ?? target.timeoutSeconds;
12229
+ const timeoutSeconds = target.timeout_seconds;
11650
12230
  const timeoutMs = resolveTimeoutMs(timeoutSeconds, `${target.name} timeout`);
11651
12231
  let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
11652
12232
  allowLiteral: true,
11653
12233
  optionalEnv: true
11654
12234
  });
11655
- if (cwd && evalFilePath && !import_node_path23.default.isAbsolute(cwd)) {
11656
- cwd = import_node_path23.default.resolve(import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath)), cwd);
12235
+ if (cwd && evalFilePath && !import_node_path24.default.isAbsolute(cwd)) {
12236
+ cwd = import_node_path24.default.resolve(import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath)), cwd);
11657
12237
  }
11658
12238
  if (!cwd && evalFilePath) {
11659
- cwd = import_node_path23.default.dirname(import_node_path23.default.resolve(evalFilePath));
12239
+ cwd = import_node_path24.default.dirname(import_node_path24.default.resolve(evalFilePath));
11660
12240
  }
11661
12241
  return {
11662
12242
  command,
@@ -11710,10 +12290,10 @@ function resolveDiscover(value, targetName) {
11710
12290
  throw new Error(`Target "${targetName}": discover must be "latest" (got "${String(value)}")`);
11711
12291
  }
11712
12292
  function resolveCopilotLogConfig(target, env) {
11713
- const sessionDirSource = target.session_dir ?? target.sessionDir;
11714
- const sessionIdSource = target.session_id ?? target.sessionId;
12293
+ const sessionDirSource = target.session_dir;
12294
+ const sessionIdSource = target.session_id;
11715
12295
  const discoverSource = target.discover;
11716
- const sessionStateDirSource = target.session_state_dir ?? target.sessionStateDir;
12296
+ const sessionStateDirSource = target.session_state_dir;
11717
12297
  const cwdSource = target.cwd;
11718
12298
  return {
11719
12299
  sessionDir: resolveOptionalString(
@@ -11894,7 +12474,7 @@ var import_node_path33 = __toESM(require("path"), 1);
11894
12474
  init_cjs_shims();
11895
12475
  var import_node_fs11 = require("fs");
11896
12476
  var import_promises20 = require("fs/promises");
11897
- var import_node_path24 = __toESM(require("path"), 1);
12477
+ var import_node_path25 = __toESM(require("path"), 1);
11898
12478
  async function pathExists(target) {
11899
12479
  try {
11900
12480
  await (0, import_promises20.access)(target, import_node_fs11.constants.F_OK);
@@ -11910,7 +12490,7 @@ async function readDirEntries(target) {
11910
12490
  const entries = await (0, import_promises20.readdir)(target, { withFileTypes: true });
11911
12491
  return entries.map((entry) => ({
11912
12492
  name: entry.name,
11913
- absolutePath: import_node_path24.default.join(target, entry.name),
12493
+ absolutePath: import_node_path25.default.join(target, entry.name),
11914
12494
  isDirectory: entry.isDirectory()
11915
12495
  }));
11916
12496
  }
@@ -11926,9 +12506,9 @@ async function removeIfExists(target) {
11926
12506
 
11927
12507
  // src/evaluation/providers/vscode/utils/path.ts
11928
12508
  init_cjs_shims();
11929
- var import_node_path25 = __toESM(require("path"), 1);
12509
+ var import_node_path26 = __toESM(require("path"), 1);
11930
12510
  function pathToFileUri2(filePath) {
11931
- const absolutePath = import_node_path25.default.isAbsolute(filePath) ? filePath : import_node_path25.default.resolve(filePath);
12511
+ const absolutePath = import_node_path26.default.isAbsolute(filePath) ? filePath : import_node_path26.default.resolve(filePath);
11932
12512
  const normalizedPath = absolutePath.replace(/\\/g, "/");
11933
12513
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
11934
12514
  return `file:///${normalizedPath}`;
@@ -11938,7 +12518,7 @@ function pathToFileUri2(filePath) {
11938
12518
 
11939
12519
  // src/evaluation/providers/vscode/dispatch/promptBuilder.ts
11940
12520
  init_cjs_shims();
11941
- var import_node_path26 = __toESM(require("path"), 1);
12521
+ var import_node_path27 = __toESM(require("path"), 1);
11942
12522
 
11943
12523
  // src/evaluation/providers/vscode/utils/template.ts
11944
12524
  init_cjs_shims();
@@ -12032,8 +12612,8 @@ function createBatchRequestPrompt(userQuery, responseFileTmp, responseFileFinal,
12032
12612
  });
12033
12613
  }
12034
12614
  function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateContent) {
12035
- const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${import_node_path26.default.basename(file)}`).join("\n");
12036
- const responseList = responseFiles.map((file) => `"${import_node_path26.default.basename(file)}"`).join(", ");
12615
+ const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${import_node_path27.default.basename(file)}`).join("\n");
12616
+ const responseList = responseFiles.map((file) => `"${import_node_path27.default.basename(file)}"`).join(", ");
12037
12617
  return renderTemplate2(templateContent, {
12038
12618
  requestFiles: requestLines,
12039
12619
  responseList
@@ -12043,7 +12623,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
12043
12623
  // src/evaluation/providers/vscode/dispatch/responseWaiter.ts
12044
12624
  init_cjs_shims();
12045
12625
  var import_promises21 = require("fs/promises");
12046
- var import_node_path27 = __toESM(require("path"), 1);
12626
+ var import_node_path28 = __toESM(require("path"), 1);
12047
12627
 
12048
12628
  // src/evaluation/providers/vscode/utils/time.ts
12049
12629
  init_cjs_shims();
@@ -12103,7 +12683,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
12103
12683
  }
12104
12684
  async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, silent = false, timeoutMs = DEFAULT_TIMEOUT_MS) {
12105
12685
  if (!silent) {
12106
- const fileList = responseFilesFinal.map((file) => import_node_path27.default.basename(file)).join(", ");
12686
+ const fileList = responseFilesFinal.map((file) => import_node_path28.default.basename(file)).join(", ");
12107
12687
  console.error(`waiting for ${responseFilesFinal.length} batch response(s): ${fileList}`);
12108
12688
  }
12109
12689
  const deadline = Date.now() + timeoutMs;
@@ -12112,7 +12692,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
12112
12692
  while (pending.size > 0) {
12113
12693
  if (Date.now() >= deadline) {
12114
12694
  if (!silent) {
12115
- const remaining = [...pending].map((f) => import_node_path27.default.basename(f)).join(", ");
12695
+ const remaining = [...pending].map((f) => import_node_path28.default.basename(f)).join(", ");
12116
12696
  console.error(
12117
12697
  `error: timed out after ${Math.round(timeoutMs / 1e3)}s waiting for batch responses. Still pending: ${remaining}`
12118
12698
  );
@@ -12170,37 +12750,6 @@ var import_node_util2 = require("util");
12170
12750
  // src/evaluation/providers/vscode/dispatch/constants.ts
12171
12751
  init_cjs_shims();
12172
12752
  var import_node_path29 = __toESM(require("path"), 1);
12173
-
12174
- // src/paths.ts
12175
- init_cjs_shims();
12176
- var import_node_os6 = __toESM(require("os"), 1);
12177
- var import_node_path28 = __toESM(require("path"), 1);
12178
- var logged = false;
12179
- function getAgentvHome() {
12180
- const envHome = process.env.AGENTV_HOME;
12181
- if (envHome && envHome !== "undefined") {
12182
- if (!logged) {
12183
- logged = true;
12184
- console.warn(`Using AGENTV_HOME: ${envHome}`);
12185
- }
12186
- return envHome;
12187
- }
12188
- return import_node_path28.default.join(import_node_os6.default.homedir(), ".agentv");
12189
- }
12190
- function getWorkspacesRoot() {
12191
- return import_node_path28.default.join(getAgentvHome(), "workspaces");
12192
- }
12193
- function getSubagentsRoot() {
12194
- return import_node_path28.default.join(getAgentvHome(), "subagents");
12195
- }
12196
- function getTraceStateRoot() {
12197
- return import_node_path28.default.join(getAgentvHome(), "trace-state");
12198
- }
12199
- function getWorkspacePoolRoot() {
12200
- return import_node_path28.default.join(getAgentvHome(), "workspace-pool");
12201
- }
12202
-
12203
- // src/evaluation/providers/vscode/dispatch/constants.ts
12204
12753
  var DEFAULT_LOCK_NAME = "subagent.lock";
12205
12754
  var DEFAULT_ALIVE_FILENAME = ".alive";
12206
12755
  function getDefaultSubagentRoot(vscodeCmd = "code") {
@@ -13353,6 +13902,15 @@ var AGENT_PROVIDER_KINDS = [
13353
13902
  "vscode",
13354
13903
  "vscode-insiders"
13355
13904
  ];
13905
+ var LLM_GRADER_CAPABLE_KINDS = [
13906
+ "openai",
13907
+ "openrouter",
13908
+ "azure",
13909
+ "anthropic",
13910
+ "gemini",
13911
+ "agentv",
13912
+ "mock"
13913
+ ];
13356
13914
  function extractLastAssistantContent(messages) {
13357
13915
  if (!messages || messages.length === 0) {
13358
13916
  return "";
@@ -13506,9 +14064,10 @@ init_cjs_shims();
13506
14064
 
13507
14065
  // src/evaluation/evaluators/scoring.ts
13508
14066
  init_cjs_shims();
13509
- var PASS_THRESHOLD = 0.8;
13510
- function scoreToVerdict(score) {
13511
- return score >= PASS_THRESHOLD ? "pass" : "fail";
14067
+ var DEFAULT_THRESHOLD = 0.8;
14068
+ var PASS_THRESHOLD = DEFAULT_THRESHOLD;
14069
+ function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
14070
+ return score >= threshold ? "pass" : "fail";
13512
14071
  }
13513
14072
  function clampScore(value) {
13514
14073
  if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -13699,13 +14258,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
13699
14258
  async function execShellWithStdin(command, stdinPayload, options = {}) {
13700
14259
  const { mkdir: mkdir17, readFile: readFile17, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
13701
14260
  const { tmpdir: tmpdir3 } = await import("os");
13702
- const path52 = await import("path");
14261
+ const path53 = await import("path");
13703
14262
  const { randomUUID: randomUUID10 } = await import("crypto");
13704
- const dir = path52.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
14263
+ const dir = path53.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
13705
14264
  await mkdir17(dir, { recursive: true });
13706
- const stdinPath = path52.join(dir, "stdin.txt");
13707
- const stdoutPath = path52.join(dir, "stdout.txt");
13708
- const stderrPath = path52.join(dir, "stderr.txt");
14265
+ const stdinPath = path53.join(dir, "stdin.txt");
14266
+ const stdoutPath = path53.join(dir, "stdout.txt");
14267
+ const stderrPath = path53.join(dir, "stderr.txt");
13709
14268
  await writeFile9(stdinPath, stdinPayload, "utf8");
13710
14269
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
13711
14270
  const { spawn: spawn5 } = await import("child_process");
@@ -14907,7 +15466,7 @@ ${outputSchema}`;
14907
15466
  parts.push("[[ ## scoring_criteria ## ]]");
14908
15467
  for (const rubric of rubrics) {
14909
15468
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
14910
- const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
15469
+ const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
14911
15470
  parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
14912
15471
  if (rubric.outcome) {
14913
15472
  parts.push(`Description: ${rubric.outcome}`);
@@ -14961,54 +15520,106 @@ ${outputSchema}`;
14961
15520
  async runWithRetry(options) {
14962
15521
  const { context: context2, graderProvider, systemPrompt, userPrompt, schema, images } = options;
14963
15522
  let lastError;
15523
+ let lastInvalidResponse;
15524
+ let shouldAttemptStructureFix = false;
14964
15525
  for (let attempt = 1; attempt <= 3; attempt++) {
14965
15526
  try {
14966
- const model = graderProvider.asLanguageModel?.();
14967
- if (model) {
14968
- const modelOptions = {
14969
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
14970
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
14971
- };
14972
- const hasImages = images && images.length > 0;
14973
- const result = hasImages ? await (0, import_ai2.generateText)({
14974
- model,
14975
- system: systemPrompt,
14976
- messages: [
14977
- {
14978
- role: "user",
14979
- content: [
14980
- { type: "text", text: userPrompt },
14981
- ...toAiSdkImageParts(images)
14982
- ]
14983
- }
14984
- ],
14985
- ...modelOptions
14986
- }) : await (0, import_ai2.generateText)({
14987
- model,
14988
- system: systemPrompt,
14989
- prompt: userPrompt,
14990
- ...modelOptions
14991
- });
14992
- const data2 = schema.parse(parseJsonFromText(result.text));
14993
- const rawUsage = result.usage;
14994
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
14995
- return { data: data2, tokenUsage };
15527
+ const result = await this.generateStructuredResponse({
15528
+ context: context2,
15529
+ graderProvider,
15530
+ systemPrompt,
15531
+ userPrompt,
15532
+ images
15533
+ });
15534
+ const canRepairResponse = result.text.trim().length > 0;
15535
+ lastInvalidResponse = canRepairResponse ? result : void 0;
15536
+ let data;
15537
+ try {
15538
+ data = schema.parse(parseJsonFromText(result.text));
15539
+ } catch (e) {
15540
+ lastError = e instanceof Error ? e : new Error(String(e));
15541
+ shouldAttemptStructureFix = canRepairResponse;
15542
+ continue;
14996
15543
  }
14997
- const response = await graderProvider.invoke({
14998
- question: userPrompt,
15544
+ return {
15545
+ data,
15546
+ providerResponse: result.providerResponse,
15547
+ tokenUsage: result.tokenUsage
15548
+ };
15549
+ } catch (e) {
15550
+ lastError = e instanceof Error ? e : new Error(String(e));
15551
+ }
15552
+ }
15553
+ if (shouldAttemptStructureFix && lastInvalidResponse) {
15554
+ try {
15555
+ const repaired = await this.generateStructuredResponse({
15556
+ context: context2,
15557
+ graderProvider,
14999
15558
  systemPrompt,
15000
- evalCaseId: context2.evalCase.id,
15001
- attempt: context2.attempt,
15002
- maxOutputTokens: this.maxOutputTokens,
15003
- temperature: this.temperature
15559
+ userPrompt: buildStructureRepairPrompt({
15560
+ validationError: lastError?.message ?? "Schema validation failed",
15561
+ invalidResponse: lastInvalidResponse.text
15562
+ })
15004
15563
  });
15005
- const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
15006
- return { data, providerResponse: response, tokenUsage: response.tokenUsage };
15564
+ const data = schema.parse(parseJsonFromText(repaired.text));
15565
+ return {
15566
+ data,
15567
+ providerResponse: repaired.providerResponse,
15568
+ tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage)
15569
+ };
15007
15570
  } catch (e) {
15008
15571
  lastError = e instanceof Error ? e : new Error(String(e));
15009
15572
  }
15010
15573
  }
15011
- throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
15574
+ throw new Error(
15575
+ `Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`
15576
+ );
15577
+ }
15578
+ async generateStructuredResponse(options) {
15579
+ const { context: context2, graderProvider, systemPrompt, userPrompt, images } = options;
15580
+ const model = graderProvider.asLanguageModel?.();
15581
+ if (model) {
15582
+ const modelOptions = {
15583
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
15584
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
15585
+ };
15586
+ const hasImages = images && images.length > 0;
15587
+ const result = hasImages ? await (0, import_ai2.generateText)({
15588
+ model,
15589
+ system: systemPrompt,
15590
+ messages: [
15591
+ {
15592
+ role: "user",
15593
+ content: [
15594
+ { type: "text", text: userPrompt },
15595
+ ...toAiSdkImageParts(images)
15596
+ ]
15597
+ }
15598
+ ],
15599
+ ...modelOptions
15600
+ }) : await (0, import_ai2.generateText)({
15601
+ model,
15602
+ system: systemPrompt,
15603
+ prompt: userPrompt,
15604
+ ...modelOptions
15605
+ });
15606
+ const rawUsage = result.usage;
15607
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
15608
+ return { text: result.text, tokenUsage };
15609
+ }
15610
+ const response = await graderProvider.invoke({
15611
+ question: userPrompt,
15612
+ systemPrompt,
15613
+ evalCaseId: context2.evalCase.id,
15614
+ attempt: context2.attempt,
15615
+ maxOutputTokens: this.maxOutputTokens,
15616
+ temperature: this.temperature
15617
+ });
15618
+ return {
15619
+ text: extractLastAssistantContent(response.output),
15620
+ providerResponse: response,
15621
+ tokenUsage: response.tokenUsage
15622
+ };
15012
15623
  }
15013
15624
  };
15014
15625
  function buildOutputSchema() {
@@ -15028,6 +15639,29 @@ function buildOutputSchema() {
15028
15639
  "}"
15029
15640
  ].join("\n");
15030
15641
  }
15642
+ function buildStructureRepairPrompt(options) {
15643
+ const { validationError, invalidResponse } = options;
15644
+ return [
15645
+ "The following evaluation response has useful grading content but invalid JSON structure.",
15646
+ "Repair it to satisfy the schema in the system prompt.",
15647
+ "Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object.",
15648
+ "",
15649
+ "Validation error:",
15650
+ validationError,
15651
+ "",
15652
+ "Invalid response:",
15653
+ invalidResponse
15654
+ ].join("\n");
15655
+ }
15656
+ function sumTokenUsage(first, second) {
15657
+ if (!first && !second) {
15658
+ return void 0;
15659
+ }
15660
+ return {
15661
+ input: (first?.input ?? 0) + (second?.input ?? 0),
15662
+ output: (first?.output ?? 0) + (second?.output ?? 0)
15663
+ };
15664
+ }
15031
15665
  function buildRubricOutputSchema() {
15032
15666
  return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
15033
15667
  You must return a valid JSON object matching this schema:
@@ -15127,19 +15761,21 @@ function calculateScoreRangeResult(result, rubrics) {
15127
15761
  rawScores[rubric.id] = rawScore;
15128
15762
  totalWeight += rubric.weight;
15129
15763
  weightedScoreSum += normalizedScore * rubric.weight;
15130
- let requiredMinScore;
15131
- if (rubric.required_min_score !== void 0) {
15132
- requiredMinScore = rubric.required_min_score;
15764
+ let minScoreThreshold;
15765
+ if (rubric.min_score !== void 0) {
15766
+ minScoreThreshold = rubric.min_score;
15767
+ } else if (rubric.required_min_score !== void 0) {
15768
+ minScoreThreshold = rubric.required_min_score / 10;
15133
15769
  } else if (rubric.required === true) {
15134
- requiredMinScore = 10;
15770
+ minScoreThreshold = 1;
15135
15771
  }
15136
15772
  const matchingRange = rubric.score_ranges?.find(
15137
15773
  (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
15138
15774
  );
15139
15775
  const rangeDescription = matchingRange?.outcome ?? "";
15140
15776
  const criterionLabel = rubric.outcome ?? rubric.id;
15141
- const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
15142
- if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
15777
+ const passed = !(minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) && rawScore >= 7;
15778
+ if (minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) {
15143
15779
  failedRequired = true;
15144
15780
  }
15145
15781
  assertions.push({
@@ -15216,11 +15852,11 @@ function createFilesystemTools(workspacePath) {
15216
15852
  execute: async (input) => {
15217
15853
  try {
15218
15854
  const resolved = resolveSandboxed(workspacePath, input.path);
15219
- const stat10 = await import_promises29.default.stat(resolved);
15220
- if (stat10.isDirectory()) {
15855
+ const stat11 = await import_promises29.default.stat(resolved);
15856
+ if (stat11.isDirectory()) {
15221
15857
  return { error: `'${input.path}' is a directory, not a file` };
15222
15858
  }
15223
- const buffer = Buffer.alloc(Math.min(stat10.size, MAX_FILE_SIZE));
15859
+ const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
15224
15860
  const fd = await import_promises29.default.open(resolved, "r");
15225
15861
  try {
15226
15862
  await fd.read(buffer, 0, buffer.length, 0);
@@ -15228,8 +15864,8 @@ function createFilesystemTools(workspacePath) {
15228
15864
  await fd.close();
15229
15865
  }
15230
15866
  const content = buffer.toString("utf-8");
15231
- const truncated = stat10.size > MAX_FILE_SIZE;
15232
- return { content, truncated, size: stat10.size };
15867
+ const truncated = stat11.size > MAX_FILE_SIZE;
15868
+ return { content, truncated, size: stat11.size };
15233
15869
  } catch (error) {
15234
15870
  return { error: error instanceof Error ? error.message : String(error) };
15235
15871
  }
@@ -15280,8 +15916,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
15280
15916
  const ext = import_node_path39.default.extname(entry.name).toLowerCase();
15281
15917
  if (BINARY_EXTENSIONS.has(ext)) continue;
15282
15918
  try {
15283
- const stat10 = await import_promises29.default.stat(fullPath);
15284
- if (stat10.size > MAX_FILE_SIZE) continue;
15919
+ const stat11 = await import_promises29.default.stat(fullPath);
15920
+ if (stat11.size > MAX_FILE_SIZE) continue;
15285
15921
  const content = await import_promises29.default.readFile(fullPath, "utf-8");
15286
15922
  const lines = content.split("\n");
15287
15923
  for (let i = 0; i < lines.length; i++) {
@@ -15925,115 +16561,115 @@ var FieldAccuracyEvaluator = class {
15925
16561
  * Evaluate a single field against the expected value.
15926
16562
  */
15927
16563
  evaluateField(fieldConfig, candidateData, expectedData) {
15928
- const { path: path52, match, required = true, weight = 1 } = fieldConfig;
15929
- const candidateValue = resolvePath(candidateData, path52);
15930
- const expectedValue = resolvePath(expectedData, path52);
16564
+ const { path: path53, match, required = true, weight = 1 } = fieldConfig;
16565
+ const candidateValue = resolvePath(candidateData, path53);
16566
+ const expectedValue = resolvePath(expectedData, path53);
15931
16567
  if (expectedValue === void 0) {
15932
16568
  return {
15933
- path: path52,
16569
+ path: path53,
15934
16570
  score: 1,
15935
16571
  // No expected value means no comparison needed
15936
16572
  weight,
15937
16573
  hit: true,
15938
- message: `${path52}: no expected value`
16574
+ message: `${path53}: no expected value`
15939
16575
  };
15940
16576
  }
15941
16577
  if (candidateValue === void 0) {
15942
16578
  if (required) {
15943
16579
  return {
15944
- path: path52,
16580
+ path: path53,
15945
16581
  score: 0,
15946
16582
  weight,
15947
16583
  hit: false,
15948
- message: `${path52} (required, missing)`
16584
+ message: `${path53} (required, missing)`
15949
16585
  };
15950
16586
  }
15951
16587
  return {
15952
- path: path52,
16588
+ path: path53,
15953
16589
  score: 1,
15954
16590
  // Don't penalize missing optional fields
15955
16591
  weight: 0,
15956
16592
  // Zero weight means it won't affect the score
15957
16593
  hit: true,
15958
- message: `${path52}: optional field missing`
16594
+ message: `${path53}: optional field missing`
15959
16595
  };
15960
16596
  }
15961
16597
  switch (match) {
15962
16598
  case "exact":
15963
- return this.compareExact(path52, candidateValue, expectedValue, weight);
16599
+ return this.compareExact(path53, candidateValue, expectedValue, weight);
15964
16600
  case "numeric_tolerance":
15965
16601
  return this.compareNumericTolerance(
15966
- path52,
16602
+ path53,
15967
16603
  candidateValue,
15968
16604
  expectedValue,
15969
16605
  fieldConfig,
15970
16606
  weight
15971
16607
  );
15972
16608
  case "date":
15973
- return this.compareDate(path52, candidateValue, expectedValue, fieldConfig, weight);
16609
+ return this.compareDate(path53, candidateValue, expectedValue, fieldConfig, weight);
15974
16610
  default:
15975
16611
  return {
15976
- path: path52,
16612
+ path: path53,
15977
16613
  score: 0,
15978
16614
  weight,
15979
16615
  hit: false,
15980
- message: `${path52}: unknown match type "${match}"`
16616
+ message: `${path53}: unknown match type "${match}"`
15981
16617
  };
15982
16618
  }
15983
16619
  }
15984
16620
  /**
15985
16621
  * Exact equality comparison.
15986
16622
  */
15987
- compareExact(path52, candidateValue, expectedValue, weight) {
16623
+ compareExact(path53, candidateValue, expectedValue, weight) {
15988
16624
  if (deepEqual(candidateValue, expectedValue)) {
15989
16625
  return {
15990
- path: path52,
16626
+ path: path53,
15991
16627
  score: 1,
15992
16628
  weight,
15993
16629
  hit: true,
15994
- message: path52
16630
+ message: path53
15995
16631
  };
15996
16632
  }
15997
16633
  if (typeof candidateValue !== typeof expectedValue) {
15998
16634
  return {
15999
- path: path52,
16635
+ path: path53,
16000
16636
  score: 0,
16001
16637
  weight,
16002
16638
  hit: false,
16003
- message: `${path52} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
16639
+ message: `${path53} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
16004
16640
  };
16005
16641
  }
16006
16642
  return {
16007
- path: path52,
16643
+ path: path53,
16008
16644
  score: 0,
16009
16645
  weight,
16010
16646
  hit: false,
16011
- message: `${path52} (value mismatch)`
16647
+ message: `${path53} (value mismatch)`
16012
16648
  };
16013
16649
  }
16014
16650
  /**
16015
16651
  * Numeric comparison with absolute or relative tolerance.
16016
16652
  */
16017
- compareNumericTolerance(path52, candidateValue, expectedValue, fieldConfig, weight) {
16653
+ compareNumericTolerance(path53, candidateValue, expectedValue, fieldConfig, weight) {
16018
16654
  const { tolerance = 0, relative = false } = fieldConfig;
16019
16655
  const candidateNum = toNumber(candidateValue);
16020
16656
  const expectedNum = toNumber(expectedValue);
16021
16657
  if (candidateNum === null || expectedNum === null) {
16022
16658
  return {
16023
- path: path52,
16659
+ path: path53,
16024
16660
  score: 0,
16025
16661
  weight,
16026
16662
  hit: false,
16027
- message: `${path52} (non-numeric value)`
16663
+ message: `${path53} (non-numeric value)`
16028
16664
  };
16029
16665
  }
16030
16666
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
16031
16667
  return {
16032
- path: path52,
16668
+ path: path53,
16033
16669
  score: 0,
16034
16670
  weight,
16035
16671
  hit: false,
16036
- message: `${path52} (invalid numeric value)`
16672
+ message: `${path53} (invalid numeric value)`
16037
16673
  };
16038
16674
  }
16039
16675
  const diff = Math.abs(candidateNum - expectedNum);
@@ -16046,61 +16682,61 @@ var FieldAccuracyEvaluator = class {
16046
16682
  }
16047
16683
  if (withinTolerance) {
16048
16684
  return {
16049
- path: path52,
16685
+ path: path53,
16050
16686
  score: 1,
16051
16687
  weight,
16052
16688
  hit: true,
16053
- message: `${path52} (within tolerance: diff=${diff.toFixed(2)})`
16689
+ message: `${path53} (within tolerance: diff=${diff.toFixed(2)})`
16054
16690
  };
16055
16691
  }
16056
16692
  return {
16057
- path: path52,
16693
+ path: path53,
16058
16694
  score: 0,
16059
16695
  weight,
16060
16696
  hit: false,
16061
- message: `${path52} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
16697
+ message: `${path53} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
16062
16698
  };
16063
16699
  }
16064
16700
  /**
16065
16701
  * Date comparison with format normalization.
16066
16702
  */
16067
- compareDate(path52, candidateValue, expectedValue, fieldConfig, weight) {
16703
+ compareDate(path53, candidateValue, expectedValue, fieldConfig, weight) {
16068
16704
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
16069
16705
  const candidateDate = parseDate(String(candidateValue), formats);
16070
16706
  const expectedDate = parseDate(String(expectedValue), formats);
16071
16707
  if (candidateDate === null) {
16072
16708
  return {
16073
- path: path52,
16709
+ path: path53,
16074
16710
  score: 0,
16075
16711
  weight,
16076
16712
  hit: false,
16077
- message: `${path52} (unparseable candidate date)`
16713
+ message: `${path53} (unparseable candidate date)`
16078
16714
  };
16079
16715
  }
16080
16716
  if (expectedDate === null) {
16081
16717
  return {
16082
- path: path52,
16718
+ path: path53,
16083
16719
  score: 0,
16084
16720
  weight,
16085
16721
  hit: false,
16086
- message: `${path52} (unparseable expected date)`
16722
+ message: `${path53} (unparseable expected date)`
16087
16723
  };
16088
16724
  }
16089
16725
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
16090
16726
  return {
16091
- path: path52,
16727
+ path: path53,
16092
16728
  score: 1,
16093
16729
  weight,
16094
16730
  hit: true,
16095
- message: path52
16731
+ message: path53
16096
16732
  };
16097
16733
  }
16098
16734
  return {
16099
- path: path52,
16735
+ path: path53,
16100
16736
  score: 0,
16101
16737
  weight,
16102
16738
  hit: false,
16103
- message: `${path52} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
16739
+ message: `${path53} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
16104
16740
  };
16105
16741
  }
16106
16742
  /**
@@ -16133,11 +16769,11 @@ var FieldAccuracyEvaluator = class {
16133
16769
  };
16134
16770
  }
16135
16771
  };
16136
- function resolvePath(obj, path52) {
16137
- if (!path52 || !obj) {
16772
+ function resolvePath(obj, path53) {
16773
+ if (!path53 || !obj) {
16138
16774
  return void 0;
16139
16775
  }
16140
- const parts = path52.split(/\.|\[|\]/).filter((p) => p.length > 0);
16776
+ const parts = path53.split(/\.|\[|\]/).filter((p) => p.length > 0);
16141
16777
  let current = obj;
16142
16778
  for (const part of parts) {
16143
16779
  if (current === null || current === void 0) {
@@ -16634,8 +17270,8 @@ var TokenUsageEvaluator = class {
16634
17270
 
16635
17271
  // src/evaluation/evaluators/tool-trajectory.ts
16636
17272
  init_cjs_shims();
16637
- function getNestedValue(obj, path52) {
16638
- const parts = path52.split(".");
17273
+ function getNestedValue(obj, path53) {
17274
+ const parts = path53.split(".");
16639
17275
  let current = obj;
16640
17276
  for (const part of parts) {
16641
17277
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -18428,7 +19064,7 @@ var WorkspacePoolManager = class {
18428
19064
  }
18429
19065
  /**
18430
19066
  * Reset an existing slot for reuse:
18431
- * 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
19067
+ * 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
18432
19068
  * 2. Re-copy template files (skip repo directories)
18433
19069
  */
18434
19070
  async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
@@ -18441,7 +19077,17 @@ var WorkspacePoolManager = class {
18441
19077
  continue;
18442
19078
  }
18443
19079
  const ref = repo.checkout?.ref ?? "HEAD";
18444
- await git(["reset", "--hard", ref], { cwd: repoDir });
19080
+ const resolve = repo.checkout?.resolve ?? "remote";
19081
+ if (resolve === "remote") {
19082
+ const fetchArgs = ["fetch", "origin", ref];
19083
+ if (repo.clone?.depth) {
19084
+ fetchArgs.splice(1, 0, "--depth", String(repo.clone.depth));
19085
+ }
19086
+ await git(fetchArgs, { cwd: repoDir });
19087
+ await git(["reset", "--hard", "FETCH_HEAD"], { cwd: repoDir });
19088
+ } else {
19089
+ await git(["reset", "--hard", ref], { cwd: repoDir });
19090
+ }
18445
19091
  const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
18446
19092
  await git(["clean", cleanFlag], { cwd: repoDir });
18447
19093
  }
@@ -18741,7 +19387,7 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
18741
19387
  }
18742
19388
 
18743
19389
  // src/evaluation/orchestrator.ts
18744
- function classifyQualityStatus(score, threshold = PASS_THRESHOLD) {
19390
+ function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
18745
19391
  return score >= threshold ? "ok" : "quality_failure";
18746
19392
  }
18747
19393
  function buildSkippedEvaluatorError(scores) {
@@ -18833,7 +19479,7 @@ async function runEvaluation(options) {
18833
19479
  const filteredEvalCases = filterEvalCases(evalCases, filter);
18834
19480
  if (filteredEvalCases.length === 0) {
18835
19481
  if (filter) {
18836
- throw new Error(`No tests matched filter '${filter}' in ${evalFilePath}`);
19482
+ throw new Error(`No tests matched filter '${formatFilter(filter)}' in ${evalFilePath}`);
18837
19483
  }
18838
19484
  return [];
18839
19485
  }
@@ -18859,20 +19505,10 @@ async function runEvaluation(options) {
18859
19505
  if (resolvedTargetsByName.has(name)) {
18860
19506
  return resolvedTargetsByName.get(name);
18861
19507
  }
18862
- let definition = targetDefinitions.get(name);
19508
+ const definition = resolveDelegatedTargetDefinition(name, targetDefinitions, envLookup);
18863
19509
  if (!definition) {
18864
19510
  return void 0;
18865
19511
  }
18866
- for (let depth = 0; depth < 5; depth++) {
18867
- const useTarget = definition.use_target;
18868
- if (typeof useTarget !== "string" || useTarget.trim().length === 0) break;
18869
- const envMatch = useTarget.trim().match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
18870
- const resolvedName = envMatch ? envLookup[envMatch[1]] ?? "" : useTarget.trim();
18871
- if (resolvedName.length === 0) break;
18872
- const next = targetDefinitions.get(resolvedName);
18873
- if (!next) break;
18874
- definition = next;
18875
- }
18876
19512
  const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
18877
19513
  resolvedTargetsByName.set(name, resolved);
18878
19514
  return resolved;
@@ -18895,6 +19531,9 @@ async function runEvaluation(options) {
18895
19531
  const graderName = targetContext.graderTarget ?? targetContext.name;
18896
19532
  const resolvedGrader = resolveTargetByName(graderName);
18897
19533
  if (!resolvedGrader) {
19534
+ if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
19535
+ return void 0;
19536
+ }
18898
19537
  return getOrCreateProvider(targetContext);
18899
19538
  }
18900
19539
  return getOrCreateProvider(resolvedGrader);
@@ -19225,7 +19864,7 @@ async function runEvaluation(options) {
19225
19864
  const budgetResult = {
19226
19865
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
19227
19866
  testId: evalCase.id,
19228
- dataset: evalCase.dataset,
19867
+ suite: evalCase.suite,
19229
19868
  category: evalCase.category,
19230
19869
  score: 0,
19231
19870
  assertions: [],
@@ -19262,7 +19901,7 @@ async function runEvaluation(options) {
19262
19901
  const haltResult = {
19263
19902
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
19264
19903
  testId: evalCase.id,
19265
- dataset: evalCase.dataset,
19904
+ suite: evalCase.suite,
19266
19905
  category: evalCase.category,
19267
19906
  score: 0,
19268
19907
  assertions: [],
@@ -19574,7 +20213,7 @@ async function runBatchEvaluation(options) {
19574
20213
  targetResolver,
19575
20214
  availableTargets,
19576
20215
  verbose,
19577
- threshold: batchThreshold
20216
+ threshold: evalCase.threshold ?? batchThreshold
19578
20217
  });
19579
20218
  if (providerError) {
19580
20219
  result = {
@@ -20036,8 +20675,9 @@ async function runEvalCase(options) {
20036
20675
  fileChanges,
20037
20676
  workspacePath,
20038
20677
  verbose,
20039
- threshold: caseThreshold
20678
+ threshold: evalCase.threshold ?? caseThreshold
20040
20679
  });
20680
+ const effectiveThreshold = evalCase.threshold ?? caseThreshold;
20041
20681
  const totalDurationMs = Date.now() - caseStartMs;
20042
20682
  const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
20043
20683
  const evalRunTokenUsage = tokenUsage || graderTokens ? {
@@ -20051,7 +20691,7 @@ async function runEvalCase(options) {
20051
20691
  ...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
20052
20692
  };
20053
20693
  const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
20054
- const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, caseThreshold);
20694
+ const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, effectiveThreshold);
20055
20695
  const targetUsedField = targetUsed ? { targetUsed } : {};
20056
20696
  const finalResult = providerError ? {
20057
20697
  ...result,
@@ -20252,7 +20892,8 @@ async function evaluateCandidate(options) {
20252
20892
  targetResolver,
20253
20893
  availableTargets,
20254
20894
  fileChanges,
20255
- workspacePath
20895
+ workspacePath,
20896
+ threshold: evalThreshold
20256
20897
  });
20257
20898
  const completedAt = nowFn();
20258
20899
  let agentRequest;
@@ -20283,7 +20924,7 @@ async function evaluateCandidate(options) {
20283
20924
  return {
20284
20925
  timestamp: completedAt.toISOString(),
20285
20926
  testId: evalCase.id,
20286
- dataset: evalCase.dataset,
20927
+ suite: evalCase.suite,
20287
20928
  category: evalCase.category,
20288
20929
  conversationId: evalCase.conversation_id,
20289
20930
  score: score.score,
@@ -20326,7 +20967,8 @@ async function runEvaluatorsForCase(options) {
20326
20967
  targetResolver,
20327
20968
  availableTargets,
20328
20969
  fileChanges,
20329
- workspacePath
20970
+ workspacePath,
20971
+ threshold
20330
20972
  } = options;
20331
20973
  if (evalCase.assertions && evalCase.assertions.length > 0) {
20332
20974
  return runEvaluatorList({
@@ -20352,7 +20994,8 @@ async function runEvaluatorsForCase(options) {
20352
20994
  targetResolver,
20353
20995
  availableTargets,
20354
20996
  fileChanges,
20355
- workspacePath
20997
+ workspacePath,
20998
+ threshold
20356
20999
  });
20357
21000
  }
20358
21001
  const evaluatorKind = evalCase.evaluator ?? "llm-grader";
@@ -20454,7 +21097,8 @@ async function runEvaluatorList(options) {
20454
21097
  name: evaluatorConfig.name,
20455
21098
  type: evaluatorConfig.type,
20456
21099
  weight,
20457
- ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
21100
+ ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
21101
+ ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
20458
21102
  });
20459
21103
  scores.push({
20460
21104
  name: evaluatorConfig.name,
@@ -20489,7 +21133,8 @@ async function runEvaluatorList(options) {
20489
21133
  name: evaluatorConfig.name ?? "unknown",
20490
21134
  type: evaluatorConfig.type ?? "llm-grader",
20491
21135
  weight,
20492
- ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
21136
+ ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
21137
+ ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
20493
21138
  });
20494
21139
  scores.push({
20495
21140
  name: evaluatorConfig.name ?? "unknown",
@@ -20523,9 +21168,10 @@ async function runEvaluatorList(options) {
20523
21168
  }
20524
21169
  }
20525
21170
  }
21171
+ const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD;
20526
21172
  const hasRequiredFailure = scored.some((entry) => {
20527
21173
  if (!entry.required) return false;
20528
- const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
21174
+ const minScore = entry.min_score ?? (typeof entry.required === "number" ? entry.required : effectiveThreshold);
20529
21175
  return entry.score.score < minScore;
20530
21176
  });
20531
21177
  const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
@@ -20536,17 +21182,23 @@ async function runEvaluatorList(options) {
20536
21182
  const expectedAspectCount = assertions.length || 1;
20537
21183
  const score = {
20538
21184
  score: aggregateScore,
20539
- verdict: scoreToVerdict(aggregateScore),
21185
+ verdict: scoreToVerdict(aggregateScore, effectiveThreshold),
20540
21186
  assertions,
20541
21187
  expectedAspectCount
20542
21188
  };
20543
21189
  return { score, scores };
20544
21190
  }
21191
+ function formatFilter(filter) {
21192
+ return typeof filter === "string" ? filter : filter.join(", ");
21193
+ }
21194
+ function matchesFilter3(id, filter) {
21195
+ return typeof filter === "string" ? import_micromatch3.default.isMatch(id, filter) : filter.some((pattern) => import_micromatch3.default.isMatch(id, pattern));
21196
+ }
20545
21197
  function filterEvalCases(evalCases, filter) {
20546
21198
  if (!filter) {
20547
21199
  return evalCases;
20548
21200
  }
20549
- return evalCases.filter((evalCase) => import_micromatch3.default.isMatch(evalCase.id, filter));
21201
+ return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter));
20550
21202
  }
20551
21203
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
20552
21204
  const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
@@ -20633,7 +21285,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
20633
21285
  return {
20634
21286
  timestamp: timestamp.toISOString(),
20635
21287
  testId: evalCase.id,
20636
- dataset: evalCase.dataset,
21288
+ suite: evalCase.suite,
20637
21289
  category: evalCase.category,
20638
21290
  conversationId: evalCase.conversation_id,
20639
21291
  score: 0,
@@ -20907,6 +21559,7 @@ async function evaluate(config) {
20907
21559
  verbose: config.verbose,
20908
21560
  maxConcurrency: config.workers ?? 3,
20909
21561
  filter: config.filter,
21562
+ threshold: config.threshold,
20910
21563
  evalCases,
20911
21564
  onResult: async (result) => {
20912
21565
  collectedResults.push(result);
@@ -20917,19 +21570,19 @@ async function evaluate(config) {
20917
21570
  const durationMs = Date.now() - startTime;
20918
21571
  return {
20919
21572
  results: allResults,
20920
- summary: computeSummary(allResults, durationMs)
21573
+ summary: computeSummary(allResults, durationMs, config.threshold)
20921
21574
  };
20922
21575
  }
20923
21576
  function mapAssertionType(type) {
20924
21577
  return type.replace(/_/g, "-");
20925
21578
  }
20926
- function computeSummary(results, durationMs) {
21579
+ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
20927
21580
  const total = results.length;
20928
21581
  let passed = 0;
20929
21582
  let scoreSum = 0;
20930
21583
  for (const r of results) {
20931
21584
  scoreSum += r.score;
20932
- if (r.score >= PASS_THRESHOLD) {
21585
+ if (r.score >= threshold) {
20933
21586
  passed++;
20934
21587
  }
20935
21588
  }
@@ -20960,7 +21613,7 @@ async function discoverDefaultTarget(repoRoot) {
20960
21613
  return null;
20961
21614
  }
20962
21615
  async function loadEnvHierarchy(repoRoot, startPath) {
20963
- const { readFileSync: readFileSync3 } = await import("fs");
21616
+ const { readFileSync: readFileSync4 } = await import("fs");
20964
21617
  const chain = buildDirectoryChain2(startPath, repoRoot);
20965
21618
  const envFiles = [];
20966
21619
  for (const dir of chain) {
@@ -20969,7 +21622,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
20969
21622
  }
20970
21623
  for (let i = 0; i < envFiles.length; i++) {
20971
21624
  try {
20972
- const content = readFileSync3(envFiles[i], "utf8");
21625
+ const content = readFileSync4(envFiles[i], "utf8");
20973
21626
  for (const line of content.split("\n")) {
20974
21627
  const trimmed = line.trim();
20975
21628
  if (!trimmed || trimmed.startsWith("#")) continue;
@@ -21043,7 +21696,7 @@ var CONFIG_FILE_NAMES = [
21043
21696
  ];
21044
21697
  async function loadTsConfig(projectRoot) {
21045
21698
  const { existsSync: existsSync7 } = await import("fs");
21046
- const { pathToFileURL } = await import("url");
21699
+ const { pathToFileURL: pathToFileURL2 } = await import("url");
21047
21700
  const { join: join2 } = await import("path");
21048
21701
  for (const fileName of CONFIG_FILE_NAMES) {
21049
21702
  const filePath = join2(projectRoot, fileName);
@@ -21051,7 +21704,7 @@ async function loadTsConfig(projectRoot) {
21051
21704
  continue;
21052
21705
  }
21053
21706
  try {
21054
- const fileUrl = pathToFileURL(filePath).href;
21707
+ const fileUrl = pathToFileURL2(filePath).href;
21055
21708
  const mod = await import(fileUrl);
21056
21709
  const config = mod.default ?? mod;
21057
21710
  return AgentVConfigSchema.parse(config);
@@ -21492,7 +22145,7 @@ var OtelTraceExporter = class {
21492
22145
  rootSpan.setAttribute("gen_ai.system", "agentv");
21493
22146
  rootSpan.setAttribute("agentv.test_id", result.testId);
21494
22147
  rootSpan.setAttribute("agentv.target", result.target);
21495
- if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
22148
+ if (result.suite) rootSpan.setAttribute("agentv.suite", result.suite);
21496
22149
  rootSpan.setAttribute("agentv.score", result.score);
21497
22150
  if (captureContent && result.output.length > 0) {
21498
22151
  const lastMsg = result.output[result.output.length - 1];
@@ -21701,7 +22354,7 @@ var OtelStreamingObserver = class {
21701
22354
  this.rootSpan.setAttribute("gen_ai.system", "agentv");
21702
22355
  this.rootSpan.setAttribute("agentv.test_id", testId);
21703
22356
  this.rootSpan.setAttribute("agentv.target", target);
21704
- if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
22357
+ if (evalSet) this.rootSpan.setAttribute("agentv.suite", evalSet);
21705
22358
  this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
21706
22359
  }
21707
22360
  /** Create and immediately export a tool span */
@@ -22057,12 +22710,244 @@ function extractToolResultContent(content) {
22057
22710
  return parts.length > 0 ? parts.join("") : void 0;
22058
22711
  }
22059
22712
 
22060
- // src/import/session-discovery.ts
22713
+ // src/import/codex-parser.ts
22714
+ init_cjs_shims();
22715
/**
 * Parse the JSONL text of a Codex rollout file into a normalized transcript.
 *
 * Each non-blank line is parsed as JSON. Lines that are not valid JSON, are
 * not objects (including a literal `null`), or have no `type` are skipped.
 *
 * Handled entry types:
 * - `session_meta`: session id, cwd, CLI version and (if present) model.
 * - `turn_context`: fills in model / cwd when not already known.
 * - `response_item`: user/assistant messages, tool calls and tool outputs.
 *   `developer` messages and `reasoning` items are ignored.
 *
 * @param {string} jsonl Raw contents of the rollout file.
 * @returns {{messages: object[], source: object, tokenUsage: undefined, durationMs: (number|undefined), costUsd: null}}
 *   `durationMs` is the span between the first and last entry timestamps, or
 *   `undefined` when there are no timestamps or they cannot be parsed as dates.
 */
function parseCodexSession(jsonl) {
  const messages = [];
  let sessionId = "";
  let cwd;
  let model;
  let version;
  let startTimestamp;
  let endTimestamp;
  // call_id -> index in `messages` of the assistant message still awaiting its tool output.
  const pendingCalls = /* @__PURE__ */ new Map();

  // Tool input arrives either as a JSON-encoded string or as an already-decoded value.
  // Strings that are not valid JSON are kept verbatim.
  const decodeToolInput = (raw) => {
    if (typeof raw !== "string") return raw;
    try {
      return JSON.parse(raw);
    } catch {
      return raw;
    }
  };

  // Shared by `function_call` and `custom_tool_call`: one assistant message per call.
  const recordToolCall = (payload) => {
    const callId = String(payload.call_id ?? "");
    // NOTE(review): Codex custom tool calls appear to carry their body in `input`
    // rather than `arguments` — confirm against the rollout format. `arguments`
    // still wins whenever it is present.
    const rawInput = payload.arguments !== void 0 ? payload.arguments : payload.input;
    const toolCall = {
      tool: String(payload.name ?? ""),
      input: decodeToolInput(rawInput),
      id: callId
    };
    if (callId) {
      pendingCalls.set(callId, messages.length);
    }
    messages.push({ role: "assistant", toolCalls: [toolCall] });
  };

  // Attach an output to the call recorded earlier under the same call_id (if any).
  const attachToolOutput = (payload) => {
    const callId = String(payload.call_id ?? "");
    const msgIdx = pendingCalls.get(callId);
    if (msgIdx === void 0) return;
    const existingMsg = messages[msgIdx];
    const [call] = existingMsg.toolCalls ?? [];
    messages[msgIdx] = {
      ...existingMsg,
      toolCalls: [{ ...call, output: payload.output }]
    };
    pendingCalls.delete(callId);
  };

  for (const line of jsonl.split("\n")) {
    if (line.trim().length === 0) continue;
    let entry;
    try {
      entry = JSON.parse(line);
    } catch {
      continue;
    }
    // `JSON.parse("null")` succeeds, so guard before touching properties.
    if (entry === null || typeof entry !== "object" || !entry.type) continue;
    if (entry.timestamp) {
      if (!startTimestamp) startTimestamp = entry.timestamp;
      endTimestamp = entry.timestamp;
    }
    const payload = entry.payload ?? {};
    switch (entry.type) {
      case "session_meta": {
        sessionId = String(payload.id ?? "");
        cwd = payload.cwd ? String(payload.cwd) : void 0;
        version = payload.cli_version ? String(payload.cli_version) : void 0;
        if (payload.model && !model) {
          model = String(payload.model);
        }
        break;
      }
      case "turn_context": {
        if (payload.model && !model) {
          model = String(payload.model);
        }
        if (payload.cwd && !cwd) {
          cwd = String(payload.cwd);
        }
        break;
      }
      case "response_item": {
        const itemType = String(payload.type ?? "");
        const role = String(payload.role ?? "");
        switch (itemType) {
          case "message": {
            if (role === "developer") break;
            const content = extractResponseItemContent(payload.content);
            if (content && (role === "user" || role === "assistant")) {
              messages.push({ role, content });
            }
            break;
          }
          case "function_call":
          case "custom_tool_call":
            recordToolCall(payload);
            break;
          case "function_call_output":
          case "custom_tool_call_output":
            attachToolOutput(payload);
            break;
          // Skip reasoning blocks (thinking tokens)
          case "reasoning":
            break;
        }
        break;
      }
    }
  }

  let durationMs;
  if (startTimestamp && endTimestamp) {
    const elapsed = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
    // Unparsable timestamps produce NaN; report "unknown" instead of NaN.
    if (Number.isFinite(elapsed)) {
      durationMs = elapsed;
    }
  }
  const source = {
    provider: "codex",
    sessionId,
    cwd,
    startedAt: startTimestamp,
    model,
    version
  };
  return {
    messages,
    source,
    // Codex rollout files don't include token counts (only rate limit info)
    tokenUsage: void 0,
    durationMs,
    costUsd: null
  };
}

/**
 * Flatten the `content` of a response item into plain text.
 *
 * @param {unknown} content A string, or an array of blocks.
 * @returns {string|undefined} The string itself; or the concatenation of every
 *   block's string `text` property; or `undefined` when `content` is neither a
 *   string nor an array, or no block has a string `text`.
 */
function extractResponseItemContent(content) {
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return void 0;
  const parts = [];
  for (const block of content) {
    if (typeof block === "object" && block !== null && typeof block.text === "string") {
      parts.push(block.text);
    }
  }
  return parts.length > 0 ? parts.join("") : void 0;
}
22878
+
22879
+ // src/import/codex-session-discovery.ts
22061
22880
  init_cjs_shims();
22062
22881
  var import_promises36 = require("fs/promises");
22063
22882
  var import_node_os8 = require("os");
22064
22883
  var import_node_path53 = __toESM(require("path"), 1);
22065
- var DEFAULT_PROJECTS_DIR = () => import_node_path53.default.join((0, import_node_os8.homedir)(), ".claude", "projects");
22884
// Default location scanned for Codex sessions: <home>/.codex/sessions
var DEFAULT_SESSIONS_DIR = () => import_node_path53.default.join((0, import_node_os8.homedir)(), ".codex", "sessions");

/**
 * Find Codex rollout files under a sessions directory laid out as
 * `<sessionsDir>/<year>/<month>/<day>/rollout-*.jsonl`.
 *
 * Directories that cannot be listed are skipped; if the sessions directory
 * itself cannot be listed the result is an empty array.
 *
 * @param {object} [opts]
 * @param {string} [opts.sessionsDir] Directory to scan (defaults to DEFAULT_SESSIONS_DIR()).
 * @param {string} [opts.date] Keep only day directories whose `year-month-day` path equals this string.
 * @param {boolean} [opts.latest] When truthy, return at most one session.
 * @param {number} [opts.limit] Maximum number of sessions when `latest` is not set (default 10).
 * @returns {Promise<Array<{sessionId: string, filePath: string, filename: string, updatedAt: Date}>>}
 *   Sessions ordered by file modification time, most recent first.
 */
async function discoverCodexSessions(opts) {
  const root = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
  const maxResults = opts?.latest ? 1 : opts?.limit ?? 10;
  const joinPath = (...segments) => import_node_path53.default.join(...segments);

  // List a directory, or yield null when it cannot be read.
  const listOrNull = async (dir) => {
    try {
      return await (0, import_promises36.readdir)(dir);
    } catch {
      return null;
    }
  };

  // Modification time of a file; the epoch when it cannot be stat'ed.
  const modifiedAt = async (target) => {
    try {
      return (await (0, import_promises36.stat)(target)).mtime;
    } catch {
      return /* @__PURE__ */ new Date(0);
    }
  };

  const years = await listOrNull(root);
  if (years === null) return [];

  const found = [];
  for (const year of years) {
    const yearDir = joinPath(root, year);
    const months = await listOrNull(yearDir);
    if (months === null) continue;
    for (const month of months) {
      const monthDir = joinPath(yearDir, month);
      const days = await listOrNull(monthDir);
      if (days === null) continue;
      for (const day of days) {
        if (opts?.date && `${year}-${month}-${day}` !== opts.date) continue;
        const dayDir = joinPath(monthDir, day);
        const names = await listOrNull(dayDir);
        if (names === null) continue;
        for (const filename of names) {
          const isRollout = filename.startsWith("rollout-") && filename.endsWith(".jsonl");
          if (!isRollout) continue;
          const filePath = joinPath(dayDir, filename);
          const stem = filename.slice(0, -".jsonl".length);
          const pieces = stem.split("-");
          // With six or more dash-separated pieces, the last five form the session id.
          const sessionId = pieces.length >= 6 ? pieces.slice(-5).join("-") : stem;
          const updatedAt = await modifiedAt(filePath);
          found.push({ sessionId, filePath, filename, updatedAt });
        }
      }
    }
  }
  found.sort((left, right) => right.updatedAt.getTime() - left.updatedAt.getTime());
  return found.slice(0, maxResults);
}
22944
+
22945
+ // src/import/session-discovery.ts
22946
+ init_cjs_shims();
22947
+ var import_promises37 = require("fs/promises");
22948
+ var import_node_os9 = require("os");
22949
+ var import_node_path54 = __toESM(require("path"), 1);
22950
+ var DEFAULT_PROJECTS_DIR = () => import_node_path54.default.join((0, import_node_os9.homedir)(), ".claude", "projects");
22066
22951
  function encodeProjectPath(projectPath) {
22067
22952
  return projectPath.replace(/\//g, "-");
22068
22953
  }
@@ -22071,7 +22956,7 @@ async function discoverClaudeSessions(opts) {
22071
22956
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;
22072
22957
  let projectDirs;
22073
22958
  try {
22074
- projectDirs = await (0, import_promises36.readdir)(projectsDir);
22959
+ projectDirs = await (0, import_promises37.readdir)(projectsDir);
22075
22960
  } catch {
22076
22961
  return [];
22077
22962
  }
@@ -22081,10 +22966,10 @@ async function discoverClaudeSessions(opts) {
22081
22966
  }
22082
22967
  const sessions = [];
22083
22968
  for (const projectDir of projectDirs) {
22084
- const dirPath = import_node_path53.default.join(projectsDir, projectDir);
22969
+ const dirPath = import_node_path54.default.join(projectsDir, projectDir);
22085
22970
  let entries;
22086
22971
  try {
22087
- entries = await (0, import_promises36.readdir)(dirPath);
22972
+ entries = await (0, import_promises37.readdir)(dirPath);
22088
22973
  } catch {
22089
22974
  continue;
22090
22975
  }
@@ -22092,10 +22977,10 @@ async function discoverClaudeSessions(opts) {
22092
22977
  if (!entry.endsWith(".jsonl")) continue;
22093
22978
  const sessionId = entry.replace(/\.jsonl$/, "");
22094
22979
  if (opts?.sessionId && sessionId !== opts.sessionId) continue;
22095
- const filePath = import_node_path53.default.join(dirPath, entry);
22980
+ const filePath = import_node_path54.default.join(dirPath, entry);
22096
22981
  let updatedAt;
22097
22982
  try {
22098
- const fileStat = await (0, import_promises36.stat)(filePath);
22983
+ const fileStat = await (0, import_promises37.stat)(filePath);
22099
22984
  updatedAt = fileStat.mtime;
22100
22985
  } catch {
22101
22986
  updatedAt = /* @__PURE__ */ new Date(0);
@@ -22112,13 +22997,91 @@ async function discoverClaudeSessions(opts) {
22112
22997
  return sessions.slice(0, limit);
22113
22998
  }
22114
22999
 
23000
+ // src/import/transcript-provider.ts
23001
+ init_cjs_shims();
23002
+
22115
23003
  // src/import/types.ts
22116
23004
  init_cjs_shims();
22117
- var import_promises37 = require("fs/promises");
23005
+ var import_promises38 = require("fs/promises");
23006
/**
 * Convert a parsed transcript entry (camelCase) into the snake_case record
 * shape used for transcript JSONL lines.
 *
 * `input` is the content of the first message whose role is "user" when that
 * content is a string, otherwise the empty string. `output` is the full
 * message list. `source.cwd` falls back to `source.projectPath`.
 *
 * @param {object} entry Transcript entry with `messages`, `source` and optional
 *   `tokenUsage`, `durationMs`, `costUsd`.
 * @returns {object} The JSONL record.
 */
function toTranscriptJsonLine(entry) {
  const { messages, tokenUsage, source } = entry;

  const firstUserContent = messages.find((message) => message.role === "user")?.content;
  const input = typeof firstUserContent === "string" ? firstUserContent : "";

  let tokenUsageRecord = void 0;
  if (tokenUsage) {
    const { input: inputTokens, output: outputTokens, cached } = tokenUsage;
    tokenUsageRecord = { input: inputTokens, output: outputTokens, cached };
  }

  const sourceRecord = {
    provider: source.provider,
    session_id: source.sessionId,
    model: source.model,
    timestamp: source.startedAt,
    git_branch: source.gitBranch,
    cwd: source.cwd ?? source.projectPath,
    version: source.version
  };

  return {
    input,
    output: messages,
    token_usage: tokenUsageRecord,
    duration_ms: entry.durationMs,
    cost_usd: entry.costUsd,
    source: sourceRecord
  };
}
23030
/**
 * Read a transcript JSONL file and parse every non-blank line as JSON.
 *
 * Lines are trimmed before parsing, so a trailing `\r` (CRLF files) or a
 * leading byte-order mark does not break parsing.
 *
 * @param {string} filePath Path of the JSONL file.
 * @returns {Promise<unknown[]>} One parsed value per non-blank line, in file order.
 * @throws {SyntaxError} When a line is not valid JSON. The message names the
 *   file and the 1-based line number; the original parse error is in `cause`.
 */
async function readTranscriptJsonl(filePath) {
  const text = await (0, import_promises38.readFile)(filePath, "utf8");
  const records = [];
  const rawLines = text.split("\n");
  for (let index = 0; index < rawLines.length; index++) {
    const trimmed = rawLines[index].trim();
    if (trimmed.length === 0) continue;
    try {
      records.push(JSON.parse(trimmed));
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      // Keep the SyntaxError type callers may already catch, but say where it happened.
      throw new SyntaxError(
        `Invalid JSON on line ${index + 1} of ${filePath}: ${reason}`,
        { cause: err }
      );
    }
  }
  return records;
}
22118
23034
  async function readTranscriptFile(filePath) {
22119
- return (0, import_promises37.readFile)(filePath, "utf8");
23035
+ return (0, import_promises38.readFile)(filePath, "utf8");
22120
23036
  }
22121
23037
 
23038
+ // src/import/transcript-provider.ts
23039
/**
 * Provider that replays pre-recorded transcript lines instead of calling a
 * live target. Each call to `invoke` consumes the next line, in order; the
 * request argument is not inspected.
 */
var TranscriptProvider = class _TranscriptProvider {
  id;
  kind = "transcript";
  targetName;
  lines;
  // Index of the next line that `invoke` will return.
  cursor = 0;
  /**
   * @param {string} targetName Name used for `targetName` and for the `transcript:<name>` id.
   * @param {object[]} lines Parsed transcript JSONL records.
   */
  constructor(targetName, lines) {
    this.targetName = targetName;
    this.id = `transcript:${targetName}`;
    this.lines = lines;
  }
  /**
   * Create a TranscriptProvider from a JSONL file path.
   *
   * The target name is taken from the first line's `source.provider`, falling
   * back to "transcript" when the line has no source or no provider.
   *
   * @throws {Error} When the file contains no transcript lines.
   */
  static async fromFile(filePath) {
    const lines = await readTranscriptJsonl(filePath);
    if (lines.length === 0) {
      throw new Error(`Transcript file is empty: ${filePath}`);
    }
    // `source` is optional on hand-written transcripts; don't crash on its absence.
    const providerName = lines[0]?.source?.provider ?? "transcript";
    return new _TranscriptProvider(providerName, lines);
  }
  /** Number of transcript lines held by this provider. */
  get lineCount() {
    return this.lines.length;
  }
  /**
   * Return the next transcript line as a provider response.
   *
   * @param {unknown} _request Ignored.
   * @returns {Promise<{output: unknown, tokenUsage: (object|undefined), durationMs: unknown, costUsd: unknown, startTime: unknown}>}
   * @throws {Error} When every line has already been consumed.
   */
  async invoke(_request) {
    if (this.cursor >= this.lines.length) {
      throw new Error(
        `Transcript exhausted: ${this.lines.length} line(s) available but ${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`
      );
    }
    const line = this.lines[this.cursor++];
    return {
      output: line.output,
      tokenUsage: line.token_usage ? {
        input: line.token_usage.input,
        output: line.token_usage.output,
        cached: line.token_usage.cached
      } : void 0,
      durationMs: line.duration_ms,
      // A recorded `null` cost is reported as "unknown" (undefined).
      costUsd: line.cost_usd ?? void 0,
      startTime: line.source?.timestamp
    };
  }
};
23084
+
22122
23085
  // src/index.ts
22123
23086
  function createAgentKernel() {
22124
23087
  return { status: "stub" };
@@ -22133,6 +23096,7 @@ function createAgentKernel() {
22133
23096
  DEFAULT_EVALUATOR_TEMPLATE,
22134
23097
  DEFAULT_EVAL_PATTERNS,
22135
23098
  DEFAULT_EXPLORATION_TOOLS,
23099
+ DEFAULT_THRESHOLD,
22136
23100
  DeterministicAssertionEvaluator,
22137
23101
  EvaluatorRegistry,
22138
23102
  ExecutionMetricsEvaluator,
@@ -22154,6 +23118,7 @@ function createAgentKernel() {
22154
23118
  TemplateNotFoundError,
22155
23119
  TokenUsageEvaluator,
22156
23120
  ToolTrajectoryEvaluator,
23121
+ TranscriptProvider,
22157
23122
  WorkspaceCreationError,
22158
23123
  WorkspacePoolManager,
22159
23124
  addProject,
@@ -22190,6 +23155,7 @@ function createAgentKernel() {
22190
23155
  detectFormat,
22191
23156
  discoverAssertions,
22192
23157
  discoverClaudeSessions,
23158
+ discoverCodexSessions,
22193
23159
  discoverCopilotSessions,
22194
23160
  discoverGraders,
22195
23161
  discoverJudges,
@@ -22250,6 +23216,8 @@ function createAgentKernel() {
22250
23216
  normalizeLineEndings,
22251
23217
  parseAgentSkillsEvals,
22252
23218
  parseClaudeSession,
23219
+ parseCodexSession,
23220
+ parseCopilotEvents,
22253
23221
  parseJsonFromText,
22254
23222
  parseJsonSafe,
22255
23223
  readJsonFile,
@@ -22257,8 +23225,10 @@ function createAgentKernel() {
22257
23225
  readTestSuiteMetadata,
22258
23226
  readTextFile,
22259
23227
  readTranscriptFile,
23228
+ readTranscriptJsonl,
22260
23229
  removeProject,
22261
23230
  resolveAndCreateProvider,
23231
+ resolveDelegatedTargetDefinition,
22262
23232
  resolveFileReference,
22263
23233
  resolveTargetDefinition,
22264
23234
  resolveWorkspaceTemplate,
@@ -22288,6 +23258,7 @@ function createAgentKernel() {
22288
23258
  substituteVariables,
22289
23259
  toCamelCaseDeep,
22290
23260
  toSnakeCaseDeep,
23261
+ toTranscriptJsonLine,
22291
23262
  tokensPerTool,
22292
23263
  touchProject,
22293
23264
  transpileEvalYaml,