@agentv/core 4.19.0 → 4.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1017,6 +1017,7 @@ var TEMPLATE_VARIABLES = {
1017
1017
  INPUT: "input",
1018
1018
  OUTPUT: "output",
1019
1019
  FILE_CHANGES: "file_changes",
1020
+ TOOL_CALLS: "tool_calls",
1020
1021
  /** @deprecated Use INPUT instead — resolves to the same text value. */
1021
1022
  INPUT_TEXT: "input_text",
1022
1023
  /** @deprecated Use OUTPUT instead — resolves to the same text value. */
@@ -1195,6 +1196,7 @@ var LlmGrader = class {
1195
1196
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
1196
1197
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
1197
1198
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
1199
+ [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
1198
1200
  // Deprecated aliases — same values as the primary variables above
1199
1201
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
1200
1202
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
@@ -1209,6 +1211,12 @@ var LlmGrader = class {
1209
1211
 
1210
1212
  [[ ## file_changes ## ]]
1211
1213
  ${context.fileChanges}`;
1214
+ }
1215
+ if (context.toolCalls && !context.graderTemplateOverride && !this.graderTemplate) {
1216
+ userPrompt += `
1217
+
1218
+ [[ ## tool_calls ## ]]
1219
+ ${context.toolCalls}`;
1212
1220
  }
1213
1221
  const graderRawRequest = {
1214
1222
  userPrompt,
@@ -1530,6 +1538,7 @@ ${context.fileChanges}`;
1530
1538
  [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
1531
1539
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
1532
1540
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
1541
+ [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
1533
1542
  // Deprecated aliases
1534
1543
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
1535
1544
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
@@ -1558,6 +1567,9 @@ ${context.fileChanges}`;
1558
1567
  if (context.fileChanges) {
1559
1568
  parts.push("[[ ## file_changes ## ]]", context.fileChanges, "");
1560
1569
  }
1570
+ if (context.toolCalls) {
1571
+ parts.push("[[ ## tool_calls ## ]]", context.toolCalls, "");
1572
+ }
1561
1573
  if (rubrics && rubrics.length > 0) {
1562
1574
  parts.push("[[ ## rubrics ## ]]");
1563
1575
  for (const rubric of rubrics) {
@@ -1591,6 +1603,7 @@ ${context.fileChanges}`;
1591
1603
  [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
1592
1604
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
1593
1605
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
1606
+ [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
1594
1607
  // Deprecated aliases
1595
1608
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
1596
1609
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
@@ -1620,6 +1633,9 @@ ${outputSchema}`;
1620
1633
  if (context.fileChanges) {
1621
1634
  parts.push("[[ ## file_changes ## ]]", context.fileChanges, "");
1622
1635
  }
1636
+ if (context.toolCalls) {
1637
+ parts.push("[[ ## tool_calls ## ]]", context.toolCalls, "");
1638
+ }
1623
1639
  if (rubrics && rubrics.length > 0) {
1624
1640
  parts.push("[[ ## rubrics ## ]]");
1625
1641
  for (const rubric of rubrics) {
@@ -1712,6 +1728,9 @@ ${outputSchema}`;
1712
1728
  if (context.fileChanges) {
1713
1729
  parts.push("[[ ## file_changes ## ]]", context.fileChanges, "");
1714
1730
  }
1731
+ if (context.toolCalls) {
1732
+ parts.push("[[ ## tool_calls ## ]]", context.toolCalls, "");
1733
+ }
1715
1734
  parts.push("[[ ## scoring_criteria ## ]]");
1716
1735
  for (const rubric of rubrics) {
1717
1736
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
@@ -1754,6 +1773,9 @@ ${outputSchema}`;
1754
1773
  if (context.fileChanges) {
1755
1774
  parts.push("[[ ## file_changes ## ]]", context.fileChanges, "");
1756
1775
  }
1776
+ if (context.toolCalls) {
1777
+ parts.push("[[ ## tool_calls ## ]]", context.toolCalls, "");
1778
+ }
1757
1779
  parts.push("[[ ## rubrics ## ]]");
1758
1780
  for (const rubric of rubrics) {
1759
1781
  const requiredLabel = rubric.required ? " (REQUIRED)" : "";
@@ -3265,6 +3287,54 @@ var LatencyGrader = class {
3265
3287
  }
3266
3288
  };
3267
3289
 
3290
+ // src/evaluation/graders/format-tool-calls.ts
3291
/**
 * Per-tool list of input property names to try, in order, when summarising a
 * tool call. Tools that are not listed here are summarised by the generic
 * fallback in extractKeyDetail.
 */
var KEY_INPUT_FIELDS = /* @__PURE__ */ new Map(
  Object.entries({
    Skill: ["skill"],
    Read: ["file_path"],
    Write: ["file_path"],
    Edit: ["file_path"],
    Bash: ["command"],
    Grep: ["pattern"],
    Glob: ["pattern"]
  })
);

/** Longest string value the generic fallback will accept as a call summary. */
var MAX_FALLBACK_LENGTH = 120;
3301
/**
 * Summarise every tool call found in `output` as a bullet list, one call per
 * line, in the order the calls appear.
 *
 * Each line is `- <tool>: <detail>` when extractKeyDetail yields a detail for
 * the call's input, otherwise just `- <tool>`. A call without a `tool` name is
 * reported as "unknown".
 *
 * @param {Iterable<object>|null|undefined} output - Messages to scan; each may
 *   carry a `toolCalls` collection of `{ tool, input }` entries.
 * @returns {string} The newline-joined list, or "" when there are no calls.
 */
function formatToolCalls(output) {
  if (!output) return "";
  const lines = [];
  for (const message of output) {
    // Skip holes (null/undefined messages) instead of throwing on them.
    const calls = message?.toolCalls;
    if (!calls) continue;
    for (const call of calls) {
      // A missing call entry carries no information; ignore it rather than
      // aborting the whole summary.
      if (!call) continue;
      const toolName = call.tool ?? "unknown";
      const detail = extractKeyDetail(toolName, call.input);
      lines.push(detail ? `- ${toolName}: ${detail}` : `- ${toolName}`);
    }
  }
  // Joining an empty array already yields "", so no special case is needed.
  return lines.join("\n");
}
3314
/**
 * Pick a single short string that best describes a tool call's input.
 *
 * The fields registered for `toolName` in KEY_INPUT_FIELDS are tried first;
 * the first one holding a non-empty string wins and is truncated. Failing
 * that, the first non-empty string value of the input that is no longer than
 * MAX_FALLBACK_LENGTH is used.
 *
 * @param {string} toolName - Tool name used to look up preferred fields.
 * @param {unknown} input - The tool call's input; non-objects yield "".
 * @returns {string} The chosen detail, or "" when nothing suitable exists.
 */
function extractKeyDetail(toolName, input) {
  if (typeof input !== "object" || input === null) return "";

  const isNonEmptyString = (candidate) => typeof candidate === "string" && candidate.length > 0;

  // Preferred: a field known to be meaningful for this particular tool.
  const preferred = (KEY_INPUT_FIELDS.get(toolName) ?? [])
    .map((field) => input[field])
    .find(isNonEmptyString);
  if (preferred !== undefined) {
    return truncate(preferred);
  }

  // Fallback: any short string value, in property order.
  const fallback = Object.values(input).find(
    (candidate) => isNonEmptyString(candidate) && candidate.length <= MAX_FALLBACK_LENGTH
  );
  return fallback === undefined ? "" : truncate(fallback);
}
3333
/**
 * Shorten `value` to at most `maxLen` UTF-16 code units, appending an
 * ellipsis (U+2026) when anything was cut off.
 *
 * The cut never lands between the two halves of a surrogate pair: if it
 * would, the dangling high surrogate is dropped as well so the result stays
 * well-formed text.
 *
 * @param {string} value - Text to shorten.
 * @param {number} [maxLen=120] - Maximum number of code units to keep.
 * @returns {string} `value` unchanged when it fits, otherwise the shortened
 *   text followed by "\u2026".
 */
function truncate(value, maxLen = 120) {
  if (value.length <= maxLen) return value;
  let end = maxLen;
  const lastKept = value.charCodeAt(end - 1);
  const firstDropped = value.charCodeAt(end);
  const splitsPair =
    lastKept >= 0xd800 && lastKept <= 0xdbff && firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
  if (splitsPair) {
    // Keeping only the high surrogate would emit an unpaired code unit.
    end -= 1;
  }
  return `${value.slice(0, end)}\u2026`;
}
3337
+
3268
3338
  // src/evaluation/graders/skill-trigger.ts
3269
3339
  var SkillTriggerGrader = class {
3270
3340
  kind = "skill-trigger";
@@ -3341,19 +3411,27 @@ function assembleLlmGraderPrompt(input) {
3341
3411
  promptInputs,
3342
3412
  evaluatorConfig,
3343
3413
  fileChanges,
3414
+ toolCalls,
3344
3415
  graderTemplateOverride
3345
3416
  } = input;
3346
3417
  const rubrics = evaluatorConfig?.rubrics;
3347
3418
  if (rubrics && rubrics.length > 0) {
3348
3419
  const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
3349
3420
  if (hasScoreRanges) {
3350
- return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges);
3421
+ return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
3351
3422
  }
3352
- return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges);
3423
+ return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
3353
3424
  }
3354
- return assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride);
3425
+ return assembleFreeform(
3426
+ evalCase,
3427
+ candidate,
3428
+ promptInputs,
3429
+ fileChanges,
3430
+ toolCalls,
3431
+ graderTemplateOverride
3432
+ );
3355
3433
  }
3356
- function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride) {
3434
+ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, toolCalls, graderTemplateOverride) {
3357
3435
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
3358
3436
  const variables = {
3359
3437
  [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
@@ -3361,6 +3439,7 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, grader
3361
3439
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? "").trim(),
3362
3440
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
3363
3441
  [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
3442
+ [TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? "",
3364
3443
  // Deprecated aliases
3365
3444
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
3366
3445
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
@@ -3374,6 +3453,12 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, grader
3374
3453
 
3375
3454
  [[ ## file_changes ## ]]
3376
3455
  ${fileChanges}`;
3456
+ }
3457
+ if (toolCalls && !graderTemplateOverride) {
3458
+ userPrompt += `
3459
+
3460
+ [[ ## tool_calls ## ]]
3461
+ ${toolCalls}`;
3377
3462
  }
3378
3463
  return {
3379
3464
  systemPrompt,
@@ -3382,7 +3467,7 @@ ${fileChanges}`;
3382
3467
  mode: "freeform"
3383
3468
  };
3384
3469
  }
3385
- function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges) {
3470
+ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls) {
3386
3471
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
3387
3472
  const parts = [
3388
3473
  "You are an expert grader. Evaluate the candidate answer against each rubric item below.",
@@ -3401,6 +3486,9 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
3401
3486
  if (fileChanges) {
3402
3487
  parts.push("[[ ## file_changes ## ]]", fileChanges, "");
3403
3488
  }
3489
+ if (toolCalls) {
3490
+ parts.push("[[ ## tool_calls ## ]]", toolCalls, "");
3491
+ }
3404
3492
  parts.push("[[ ## rubrics ## ]]");
3405
3493
  for (const rubric of rubrics) {
3406
3494
  const requiredLabel = rubric.required ? " (REQUIRED)" : "";
@@ -3417,7 +3505,7 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
3417
3505
  mode: "checklist"
3418
3506
  };
3419
3507
  }
3420
- function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges) {
3508
+ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls) {
3421
3509
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
3422
3510
  const parts = [
3423
3511
  "You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.",
@@ -3437,6 +3525,9 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
3437
3525
  if (fileChanges) {
3438
3526
  parts.push("[[ ## file_changes ## ]]", fileChanges, "");
3439
3527
  }
3528
+ if (toolCalls) {
3529
+ parts.push("[[ ## tool_calls ## ]]", toolCalls, "");
3530
+ }
3440
3531
  parts.push("[[ ## scoring_criteria ## ]]");
3441
3532
  for (const rubric of rubrics) {
3442
3533
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
@@ -15999,7 +16090,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
15999
16090
  return { tests: await loadTestsFromAgentSkills(evalFilePath) };
16000
16091
  }
16001
16092
  if (format === "typescript") {
16002
- const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-XFQ6S4DT.js");
16093
+ const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-32COE32J.js");
16003
16094
  return loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
16004
16095
  }
16005
16096
  const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
@@ -16034,7 +16125,7 @@ async function loadTests(evalFilePath, repoRoot, options) {
16034
16125
  return loadTestsFromAgentSkills(evalFilePath);
16035
16126
  }
16036
16127
  if (format === "typescript") {
16037
- const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-XFQ6S4DT.js");
16128
+ const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-32COE32J.js");
16038
16129
  const suite = await loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
16039
16130
  return suite.tests;
16040
16131
  }
@@ -16639,6 +16730,7 @@ async function runEvaluation(options) {
16639
16730
  trials,
16640
16731
  streamCallbacks,
16641
16732
  budgetUsd,
16733
+ runBudgetTracker,
16642
16734
  failOnError,
16643
16735
  poolWorkspaces,
16644
16736
  poolMaxSlots: configPoolMaxSlots,
@@ -16973,8 +17065,14 @@ async function runEvaluation(options) {
16973
17065
  }
16974
17066
  }
16975
17067
  return { ok: allPassed, depResults };
17068
+ }, extractEvaluationCostUsd2 = function(result) {
17069
+ if (result.trials && result.trials.length > 0) {
17070
+ const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
17071
+ return trialCostSum > 0 ? trialCostSum : void 0;
17072
+ }
17073
+ return result.costUsd;
16976
17074
  };
16977
- var toDependencyResult = toDependencyResult2, checkDependencies = checkDependencies2;
17075
+ var toDependencyResult = toDependencyResult2, checkDependencies = checkDependencies2, extractEvaluationCostUsd = extractEvaluationCostUsd2;
16978
17076
  if (suiteWorkspaceFile && sharedWorkspacePath) {
16979
17077
  const copiedWorkspaceFile = path44.join(sharedWorkspacePath, path44.basename(suiteWorkspaceFile));
16980
17078
  try {
@@ -17167,6 +17265,42 @@ async function runEvaluation(options) {
17167
17265
  async function dispatchTest(evalCase, depResults) {
17168
17266
  const workerId = nextWorkerId++;
17169
17267
  workerIdByEvalId.set(evalCase.id, workerId);
17268
+ if (runBudgetTracker?.isExceeded()) {
17269
+ const budgetResult = {
17270
+ timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
17271
+ testId: evalCase.id,
17272
+ suite: evalCase.suite,
17273
+ category: evalCase.category,
17274
+ score: 0,
17275
+ assertions: [],
17276
+ output: [],
17277
+ target: target.name,
17278
+ error: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`,
17279
+ budgetExceeded: true,
17280
+ executionStatus: "execution_error",
17281
+ failureStage: "setup",
17282
+ failureReasonCode: "budget_exceeded",
17283
+ executionError: {
17284
+ message: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`,
17285
+ stage: "setup"
17286
+ }
17287
+ };
17288
+ if (onProgress) {
17289
+ await onProgress({
17290
+ workerId,
17291
+ testId: evalCase.id,
17292
+ status: "failed",
17293
+ completedAt: Date.now(),
17294
+ error: budgetResult.error,
17295
+ score: budgetResult.score,
17296
+ executionStatus: budgetResult.executionStatus
17297
+ });
17298
+ }
17299
+ if (onResult) {
17300
+ await onResult(budgetResult);
17301
+ }
17302
+ return budgetResult;
17303
+ }
17170
17304
  if (budgetUsd !== void 0 && budgetExhausted) {
17171
17305
  const budgetResult = {
17172
17306
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
@@ -17280,22 +17414,17 @@ async function runEvaluation(options) {
17280
17414
  ...depResults && Object.keys(depResults).length > 0 ? { dependencyResults: depResults } : {}
17281
17415
  };
17282
17416
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
17283
- if (budgetUsd !== void 0) {
17284
- let caseCost;
17285
- if (result.trials && result.trials.length > 0) {
17286
- const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
17287
- if (trialCostSum > 0) {
17288
- caseCost = trialCostSum;
17289
- }
17290
- } else {
17291
- caseCost = result.costUsd;
17292
- }
17293
- if (caseCost !== void 0) {
17417
+ const caseCost = extractEvaluationCostUsd2(result);
17418
+ if (caseCost !== void 0) {
17419
+ if (budgetUsd !== void 0) {
17294
17420
  cumulativeBudgetCost += caseCost;
17295
17421
  if (cumulativeBudgetCost >= budgetUsd) {
17296
17422
  budgetExhausted = true;
17297
17423
  }
17298
17424
  }
17425
+ if (runBudgetTracker) {
17426
+ runBudgetTracker.add(caseCost);
17427
+ }
17299
17428
  }
17300
17429
  if (failOnError === true && result.executionStatus === "execution_error") {
17301
17430
  failOnErrorTriggered = true;
@@ -18103,6 +18232,7 @@ async function runEvalCase(options) {
18103
18232
  fileChanges = fileChanges ? `${fileChanges}
18104
18233
  ${providerFileChanges}` : providerFileChanges;
18105
18234
  }
18235
+ const toolCalls = formatToolCalls(output);
18106
18236
  const providerError = extractProviderError(providerResponse);
18107
18237
  const targetAfterEachHook = options.targetHooks?.after_each;
18108
18238
  if (workspacePath && hasHookCommand(targetAfterEachHook)) {
@@ -18186,6 +18316,7 @@ ${providerFileChanges}` : providerFileChanges;
18186
18316
  targetResolver,
18187
18317
  availableTargets,
18188
18318
  fileChanges,
18319
+ toolCalls,
18189
18320
  workspacePath,
18190
18321
  dockerConfig: evalCase.workspace?.docker,
18191
18322
  verbose,
@@ -18383,6 +18514,7 @@ async function evaluateCandidate(options) {
18383
18514
  targetResolver,
18384
18515
  availableTargets,
18385
18516
  fileChanges,
18517
+ toolCalls,
18386
18518
  workspacePath,
18387
18519
  dockerConfig,
18388
18520
  threshold: evalThreshold,
@@ -18411,6 +18543,7 @@ async function evaluateCandidate(options) {
18411
18543
  targetResolver,
18412
18544
  availableTargets,
18413
18545
  fileChanges,
18546
+ toolCalls,
18414
18547
  workspacePath,
18415
18548
  dockerConfig,
18416
18549
  threshold: evalThreshold,
@@ -18488,6 +18621,7 @@ async function runEvaluatorsForCase(options) {
18488
18621
  targetResolver,
18489
18622
  availableTargets,
18490
18623
  fileChanges,
18624
+ toolCalls,
18491
18625
  workspacePath,
18492
18626
  dockerConfig,
18493
18627
  threshold,
@@ -18517,6 +18651,7 @@ async function runEvaluatorsForCase(options) {
18517
18651
  targetResolver,
18518
18652
  availableTargets,
18519
18653
  fileChanges,
18654
+ toolCalls,
18520
18655
  workspacePath,
18521
18656
  dockerConfig,
18522
18657
  threshold,
@@ -18548,6 +18683,7 @@ async function runEvaluatorsForCase(options) {
18548
18683
  targetResolver,
18549
18684
  availableTargets,
18550
18685
  fileChanges,
18686
+ toolCalls,
18551
18687
  workspacePath,
18552
18688
  dockerConfig,
18553
18689
  dependencyResults,
@@ -18589,6 +18725,7 @@ async function runEvaluatorList(options) {
18589
18725
  targetResolver,
18590
18726
  availableTargets,
18591
18727
  fileChanges,
18728
+ toolCalls,
18592
18729
  workspacePath,
18593
18730
  dockerConfig,
18594
18731
  dependencyResults
@@ -18614,6 +18751,7 @@ async function runEvaluatorList(options) {
18614
18751
  targetResolver,
18615
18752
  availableTargets,
18616
18753
  fileChanges,
18754
+ toolCalls,
18617
18755
  workspacePath,
18618
18756
  dockerConfig,
18619
18757
  dependencyResults
@@ -19669,6 +19807,7 @@ export {
19669
19807
  ExecutionMetricsGrader,
19670
19808
  FieldAccuracyGrader,
19671
19809
  LatencyGrader,
19810
+ formatToolCalls,
19672
19811
  SkillTriggerGrader,
19673
19812
  assembleLlmGraderPrompt,
19674
19813
  TokenUsageGrader,
@@ -19737,4 +19876,4 @@ export {
19737
19876
  loadTestById,
19738
19877
  loadEvalCaseById
19739
19878
  };
19740
- //# sourceMappingURL=chunk-QXX3IBYV.js.map
19879
+ //# sourceMappingURL=chunk-ELF6SQAK.js.map