@agentv/core 4.19.0-next.1 → 4.20.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-QXX3IBYV.js → chunk-ELF6SQAK.js} +160 -21
- package/dist/chunk-ELF6SQAK.js.map +1 -0
- package/dist/index.cjs +193 -18
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +53 -1
- package/dist/index.d.ts +53 -1
- package/dist/index.js +28 -1
- package/dist/index.js.map +1 -1
- package/dist/{ts-eval-loader-XFQ6S4DT.js → ts-eval-loader-32COE32J.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-QXX3IBYV.js.map +0 -1
- package/dist/{ts-eval-loader-XFQ6S4DT.js.map → ts-eval-loader-32COE32J.js.map} +0 -0
|
@@ -1017,6 +1017,7 @@ var TEMPLATE_VARIABLES = {
|
|
|
1017
1017
|
INPUT: "input",
|
|
1018
1018
|
OUTPUT: "output",
|
|
1019
1019
|
FILE_CHANGES: "file_changes",
|
|
1020
|
+
TOOL_CALLS: "tool_calls",
|
|
1020
1021
|
/** @deprecated Use INPUT instead — resolves to the same text value. */
|
|
1021
1022
|
INPUT_TEXT: "input_text",
|
|
1022
1023
|
/** @deprecated Use OUTPUT instead — resolves to the same text value. */
|
|
@@ -1195,6 +1196,7 @@ var LlmGrader = class {
|
|
|
1195
1196
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
1196
1197
|
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
1197
1198
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
1199
|
+
[TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
|
|
1198
1200
|
// Deprecated aliases — same values as the primary variables above
|
|
1199
1201
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
1200
1202
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
@@ -1209,6 +1211,12 @@ var LlmGrader = class {
|
|
|
1209
1211
|
|
|
1210
1212
|
[[ ## file_changes ## ]]
|
|
1211
1213
|
${context.fileChanges}`;
|
|
1214
|
+
}
|
|
1215
|
+
if (context.toolCalls && !context.graderTemplateOverride && !this.graderTemplate) {
|
|
1216
|
+
userPrompt += `
|
|
1217
|
+
|
|
1218
|
+
[[ ## tool_calls ## ]]
|
|
1219
|
+
${context.toolCalls}`;
|
|
1212
1220
|
}
|
|
1213
1221
|
const graderRawRequest = {
|
|
1214
1222
|
userPrompt,
|
|
@@ -1530,6 +1538,7 @@ ${context.fileChanges}`;
|
|
|
1530
1538
|
[TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
|
|
1531
1539
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
1532
1540
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
1541
|
+
[TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
|
|
1533
1542
|
// Deprecated aliases
|
|
1534
1543
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
1535
1544
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
@@ -1558,6 +1567,9 @@ ${context.fileChanges}`;
|
|
|
1558
1567
|
if (context.fileChanges) {
|
|
1559
1568
|
parts.push("[[ ## file_changes ## ]]", context.fileChanges, "");
|
|
1560
1569
|
}
|
|
1570
|
+
if (context.toolCalls) {
|
|
1571
|
+
parts.push("[[ ## tool_calls ## ]]", context.toolCalls, "");
|
|
1572
|
+
}
|
|
1561
1573
|
if (rubrics && rubrics.length > 0) {
|
|
1562
1574
|
parts.push("[[ ## rubrics ## ]]");
|
|
1563
1575
|
for (const rubric of rubrics) {
|
|
@@ -1591,6 +1603,7 @@ ${context.fileChanges}`;
|
|
|
1591
1603
|
[TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
|
|
1592
1604
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
1593
1605
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
1606
|
+
[TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
|
|
1594
1607
|
// Deprecated aliases
|
|
1595
1608
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
1596
1609
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
@@ -1620,6 +1633,9 @@ ${outputSchema}`;
|
|
|
1620
1633
|
if (context.fileChanges) {
|
|
1621
1634
|
parts.push("[[ ## file_changes ## ]]", context.fileChanges, "");
|
|
1622
1635
|
}
|
|
1636
|
+
if (context.toolCalls) {
|
|
1637
|
+
parts.push("[[ ## tool_calls ## ]]", context.toolCalls, "");
|
|
1638
|
+
}
|
|
1623
1639
|
if (rubrics && rubrics.length > 0) {
|
|
1624
1640
|
parts.push("[[ ## rubrics ## ]]");
|
|
1625
1641
|
for (const rubric of rubrics) {
|
|
@@ -1712,6 +1728,9 @@ ${outputSchema}`;
|
|
|
1712
1728
|
if (context.fileChanges) {
|
|
1713
1729
|
parts.push("[[ ## file_changes ## ]]", context.fileChanges, "");
|
|
1714
1730
|
}
|
|
1731
|
+
if (context.toolCalls) {
|
|
1732
|
+
parts.push("[[ ## tool_calls ## ]]", context.toolCalls, "");
|
|
1733
|
+
}
|
|
1715
1734
|
parts.push("[[ ## scoring_criteria ## ]]");
|
|
1716
1735
|
for (const rubric of rubrics) {
|
|
1717
1736
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
@@ -1754,6 +1773,9 @@ ${outputSchema}`;
|
|
|
1754
1773
|
if (context.fileChanges) {
|
|
1755
1774
|
parts.push("[[ ## file_changes ## ]]", context.fileChanges, "");
|
|
1756
1775
|
}
|
|
1776
|
+
if (context.toolCalls) {
|
|
1777
|
+
parts.push("[[ ## tool_calls ## ]]", context.toolCalls, "");
|
|
1778
|
+
}
|
|
1757
1779
|
parts.push("[[ ## rubrics ## ]]");
|
|
1758
1780
|
for (const rubric of rubrics) {
|
|
1759
1781
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
@@ -3265,6 +3287,54 @@ var LatencyGrader = class {
|
|
|
3265
3287
|
}
|
|
3266
3288
|
};
|
|
3267
3289
|
|
|
3290
|
+
// src/evaluation/graders/format-tool-calls.ts
|
|
3291
|
+
/**
 * For specific tool names, the input field(s) whose value is used as the
 * one-line detail for a call. Fields are tried in the listed order and the
 * first non-empty string wins.
 */
var KEY_INPUT_FIELDS = /* @__PURE__ */ new Map([
  ["Skill", ["skill"]],
  ["Read", ["file_path"]],
  ["Write", ["file_path"]],
  ["Edit", ["file_path"]],
  ["Bash", ["command"]],
  ["Grep", ["pattern"]],
  ["Glob", ["pattern"]]
]);

/**
 * Longest string value (in UTF-16 code units) that the fallback scan in
 * `extractKeyDetail` will accept as a detail. Longer values are skipped
 * entirely rather than truncated.
 */
var MAX_FALLBACK_LENGTH = 120;

/**
 * Renders the tool calls found in a list of messages as a bullet list, one
 * call per entry, in encounter order.
 *
 * @param {Iterable<{toolCalls?: Iterable<{tool?: string, input?: unknown}>}> | null | undefined} output
 *   Messages to scan. Messages without `toolCalls` are skipped.
 * @returns {string} Entries of the form `- <tool>: <detail>` (or `- <tool>`
 *   when no detail could be extracted) joined with "\n"; the empty string
 *   when `output` is falsy or contains no tool calls.
 */
function formatToolCalls(output) {
  if (!output) return "";
  const lines = [];
  for (const message of output) {
    if (!message.toolCalls) continue;
    for (const call of message.toolCalls) {
      // A call without a tool name is still listed, labelled "unknown".
      const toolName = call.tool ?? "unknown";
      const detail = extractKeyDetail(toolName, call.input);
      lines.push(detail ? `- ${toolName}: ${detail}` : `- ${toolName}`);
    }
  }
  // Joining an empty array already yields "", so no special case is needed.
  return lines.join("\n");
}

/**
 * Picks a short string that summarizes a tool call's input.
 *
 * Lookup order:
 *   1. The fields registered for `toolName` in KEY_INPUT_FIELDS; the first
 *      non-empty string value is returned, truncated if it is too long.
 *   2. Otherwise, the first non-empty string value of `input` that is at
 *      most MAX_FALLBACK_LENGTH long, returned unchanged.
 *
 * @param {string} toolName Name of the tool that was called.
 * @param {unknown} input The call's input; anything that is not a non-null
 *   object yields no detail.
 * @returns {string} The detail text, or "" when nothing suitable is found.
 */
function extractKeyDetail(toolName, input) {
  if (!input || typeof input !== "object") return "";
  const record = input;
  const knownFields = KEY_INPUT_FIELDS.get(toolName);
  if (knownFields) {
    for (const field of knownFields) {
      const value = record[field];
      if (typeof value === "string" && value.length > 0) {
        return truncate(value);
      }
    }
  }
  for (const value of Object.values(record)) {
    if (typeof value === "string" && value.length > 0 && value.length <= MAX_FALLBACK_LENGTH) {
      // Already within the limit, so it is returned as-is.
      return value;
    }
  }
  return "";
}

/**
 * Shortens `value` to at most `maxLen` UTF-16 code units and appends an
 * ellipsis (U+2026) when anything was cut off.
 *
 * The cut never lands inside a surrogate pair: if the last retained code unit
 * is a high surrogate, it is dropped as well so the result does not end in
 * half of a character.
 *
 * @param {string} value Text to shorten.
 * @param {number} [maxLen=120] Maximum number of code units to keep.
 * @returns {string} `value` unchanged when it fits, otherwise the shortened
 *   text followed by "\u2026".
 */
function truncate(value, maxLen = 120) {
  if (value.length <= maxLen) return value;
  let end = maxLen;
  // charCodeAt returns NaN for an out-of-range index (e.g. maxLen === 0),
  // which fails both comparisons and leaves `end` untouched.
  const lastKept = value.charCodeAt(end - 1);
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
    end -= 1;
  }
  return `${value.slice(0, end)}\u2026`;
}
|
|
3337
|
+
|
|
3268
3338
|
// src/evaluation/graders/skill-trigger.ts
|
|
3269
3339
|
var SkillTriggerGrader = class {
|
|
3270
3340
|
kind = "skill-trigger";
|
|
@@ -3341,19 +3411,27 @@ function assembleLlmGraderPrompt(input) {
|
|
|
3341
3411
|
promptInputs,
|
|
3342
3412
|
evaluatorConfig,
|
|
3343
3413
|
fileChanges,
|
|
3414
|
+
toolCalls,
|
|
3344
3415
|
graderTemplateOverride
|
|
3345
3416
|
} = input;
|
|
3346
3417
|
const rubrics = evaluatorConfig?.rubrics;
|
|
3347
3418
|
if (rubrics && rubrics.length > 0) {
|
|
3348
3419
|
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
3349
3420
|
if (hasScoreRanges) {
|
|
3350
|
-
return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges);
|
|
3421
|
+
return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
|
|
3351
3422
|
}
|
|
3352
|
-
return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges);
|
|
3423
|
+
return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
|
|
3353
3424
|
}
|
|
3354
|
-
return assembleFreeform(
|
|
3425
|
+
return assembleFreeform(
|
|
3426
|
+
evalCase,
|
|
3427
|
+
candidate,
|
|
3428
|
+
promptInputs,
|
|
3429
|
+
fileChanges,
|
|
3430
|
+
toolCalls,
|
|
3431
|
+
graderTemplateOverride
|
|
3432
|
+
);
|
|
3355
3433
|
}
|
|
3356
|
-
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride) {
|
|
3434
|
+
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, toolCalls, graderTemplateOverride) {
|
|
3357
3435
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
3358
3436
|
const variables = {
|
|
3359
3437
|
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
@@ -3361,6 +3439,7 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, grader
|
|
|
3361
3439
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? "").trim(),
|
|
3362
3440
|
[TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
|
|
3363
3441
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
|
|
3442
|
+
[TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? "",
|
|
3364
3443
|
// Deprecated aliases
|
|
3365
3444
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
3366
3445
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
|
|
@@ -3374,6 +3453,12 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, grader
|
|
|
3374
3453
|
|
|
3375
3454
|
[[ ## file_changes ## ]]
|
|
3376
3455
|
${fileChanges}`;
|
|
3456
|
+
}
|
|
3457
|
+
if (toolCalls && !graderTemplateOverride) {
|
|
3458
|
+
userPrompt += `
|
|
3459
|
+
|
|
3460
|
+
[[ ## tool_calls ## ]]
|
|
3461
|
+
${toolCalls}`;
|
|
3377
3462
|
}
|
|
3378
3463
|
return {
|
|
3379
3464
|
systemPrompt,
|
|
@@ -3382,7 +3467,7 @@ ${fileChanges}`;
|
|
|
3382
3467
|
mode: "freeform"
|
|
3383
3468
|
};
|
|
3384
3469
|
}
|
|
3385
|
-
function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges) {
|
|
3470
|
+
function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls) {
|
|
3386
3471
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
3387
3472
|
const parts = [
|
|
3388
3473
|
"You are an expert grader. Evaluate the candidate answer against each rubric item below.",
|
|
@@ -3401,6 +3486,9 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
|
|
|
3401
3486
|
if (fileChanges) {
|
|
3402
3487
|
parts.push("[[ ## file_changes ## ]]", fileChanges, "");
|
|
3403
3488
|
}
|
|
3489
|
+
if (toolCalls) {
|
|
3490
|
+
parts.push("[[ ## tool_calls ## ]]", toolCalls, "");
|
|
3491
|
+
}
|
|
3404
3492
|
parts.push("[[ ## rubrics ## ]]");
|
|
3405
3493
|
for (const rubric of rubrics) {
|
|
3406
3494
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
@@ -3417,7 +3505,7 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
|
|
|
3417
3505
|
mode: "checklist"
|
|
3418
3506
|
};
|
|
3419
3507
|
}
|
|
3420
|
-
function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges) {
|
|
3508
|
+
function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls) {
|
|
3421
3509
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
3422
3510
|
const parts = [
|
|
3423
3511
|
"You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.",
|
|
@@ -3437,6 +3525,9 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
3437
3525
|
if (fileChanges) {
|
|
3438
3526
|
parts.push("[[ ## file_changes ## ]]", fileChanges, "");
|
|
3439
3527
|
}
|
|
3528
|
+
if (toolCalls) {
|
|
3529
|
+
parts.push("[[ ## tool_calls ## ]]", toolCalls, "");
|
|
3530
|
+
}
|
|
3440
3531
|
parts.push("[[ ## scoring_criteria ## ]]");
|
|
3441
3532
|
for (const rubric of rubrics) {
|
|
3442
3533
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
@@ -15999,7 +16090,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
15999
16090
|
return { tests: await loadTestsFromAgentSkills(evalFilePath) };
|
|
16000
16091
|
}
|
|
16001
16092
|
if (format === "typescript") {
|
|
16002
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-XFQ6S4DT.js");
|
|
16093
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-32COE32J.js");
|
|
16003
16094
|
return loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
16004
16095
|
}
|
|
16005
16096
|
const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
|
|
@@ -16034,7 +16125,7 @@ async function loadTests(evalFilePath, repoRoot, options) {
|
|
|
16034
16125
|
return loadTestsFromAgentSkills(evalFilePath);
|
|
16035
16126
|
}
|
|
16036
16127
|
if (format === "typescript") {
|
|
16037
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-XFQ6S4DT.js");
|
|
16128
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-32COE32J.js");
|
|
16038
16129
|
const suite = await loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
16039
16130
|
return suite.tests;
|
|
16040
16131
|
}
|
|
@@ -16639,6 +16730,7 @@ async function runEvaluation(options) {
|
|
|
16639
16730
|
trials,
|
|
16640
16731
|
streamCallbacks,
|
|
16641
16732
|
budgetUsd,
|
|
16733
|
+
runBudgetTracker,
|
|
16642
16734
|
failOnError,
|
|
16643
16735
|
poolWorkspaces,
|
|
16644
16736
|
poolMaxSlots: configPoolMaxSlots,
|
|
@@ -16973,8 +17065,14 @@ async function runEvaluation(options) {
|
|
|
16973
17065
|
}
|
|
16974
17066
|
}
|
|
16975
17067
|
return { ok: allPassed, depResults };
|
|
17068
|
+
}, extractEvaluationCostUsd2 = function(result) {
|
|
17069
|
+
if (result.trials && result.trials.length > 0) {
|
|
17070
|
+
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
|
|
17071
|
+
return trialCostSum > 0 ? trialCostSum : void 0;
|
|
17072
|
+
}
|
|
17073
|
+
return result.costUsd;
|
|
16976
17074
|
};
|
|
16977
|
-
var toDependencyResult = toDependencyResult2, checkDependencies = checkDependencies2;
|
|
17075
|
+
var toDependencyResult = toDependencyResult2, checkDependencies = checkDependencies2, extractEvaluationCostUsd = extractEvaluationCostUsd2;
|
|
16978
17076
|
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
16979
17077
|
const copiedWorkspaceFile = path44.join(sharedWorkspacePath, path44.basename(suiteWorkspaceFile));
|
|
16980
17078
|
try {
|
|
@@ -17167,6 +17265,42 @@ async function runEvaluation(options) {
|
|
|
17167
17265
|
async function dispatchTest(evalCase, depResults) {
|
|
17168
17266
|
const workerId = nextWorkerId++;
|
|
17169
17267
|
workerIdByEvalId.set(evalCase.id, workerId);
|
|
17268
|
+
if (runBudgetTracker?.isExceeded()) {
|
|
17269
|
+
const budgetResult = {
|
|
17270
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
17271
|
+
testId: evalCase.id,
|
|
17272
|
+
suite: evalCase.suite,
|
|
17273
|
+
category: evalCase.category,
|
|
17274
|
+
score: 0,
|
|
17275
|
+
assertions: [],
|
|
17276
|
+
output: [],
|
|
17277
|
+
target: target.name,
|
|
17278
|
+
error: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`,
|
|
17279
|
+
budgetExceeded: true,
|
|
17280
|
+
executionStatus: "execution_error",
|
|
17281
|
+
failureStage: "setup",
|
|
17282
|
+
failureReasonCode: "budget_exceeded",
|
|
17283
|
+
executionError: {
|
|
17284
|
+
message: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`,
|
|
17285
|
+
stage: "setup"
|
|
17286
|
+
}
|
|
17287
|
+
};
|
|
17288
|
+
if (onProgress) {
|
|
17289
|
+
await onProgress({
|
|
17290
|
+
workerId,
|
|
17291
|
+
testId: evalCase.id,
|
|
17292
|
+
status: "failed",
|
|
17293
|
+
completedAt: Date.now(),
|
|
17294
|
+
error: budgetResult.error,
|
|
17295
|
+
score: budgetResult.score,
|
|
17296
|
+
executionStatus: budgetResult.executionStatus
|
|
17297
|
+
});
|
|
17298
|
+
}
|
|
17299
|
+
if (onResult) {
|
|
17300
|
+
await onResult(budgetResult);
|
|
17301
|
+
}
|
|
17302
|
+
return budgetResult;
|
|
17303
|
+
}
|
|
17170
17304
|
if (budgetUsd !== void 0 && budgetExhausted) {
|
|
17171
17305
|
const budgetResult = {
|
|
17172
17306
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
@@ -17280,22 +17414,17 @@ async function runEvaluation(options) {
|
|
|
17280
17414
|
...depResults && Object.keys(depResults).length > 0 ? { dependencyResults: depResults } : {}
|
|
17281
17415
|
};
|
|
17282
17416
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
17283
|
-
|
|
17284
|
-
|
|
17285
|
-
if (
|
|
17286
|
-
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
|
|
17287
|
-
if (trialCostSum > 0) {
|
|
17288
|
-
caseCost = trialCostSum;
|
|
17289
|
-
}
|
|
17290
|
-
} else {
|
|
17291
|
-
caseCost = result.costUsd;
|
|
17292
|
-
}
|
|
17293
|
-
if (caseCost !== void 0) {
|
|
17417
|
+
const caseCost = extractEvaluationCostUsd2(result);
|
|
17418
|
+
if (caseCost !== void 0) {
|
|
17419
|
+
if (budgetUsd !== void 0) {
|
|
17294
17420
|
cumulativeBudgetCost += caseCost;
|
|
17295
17421
|
if (cumulativeBudgetCost >= budgetUsd) {
|
|
17296
17422
|
budgetExhausted = true;
|
|
17297
17423
|
}
|
|
17298
17424
|
}
|
|
17425
|
+
if (runBudgetTracker) {
|
|
17426
|
+
runBudgetTracker.add(caseCost);
|
|
17427
|
+
}
|
|
17299
17428
|
}
|
|
17300
17429
|
if (failOnError === true && result.executionStatus === "execution_error") {
|
|
17301
17430
|
failOnErrorTriggered = true;
|
|
@@ -18103,6 +18232,7 @@ async function runEvalCase(options) {
|
|
|
18103
18232
|
fileChanges = fileChanges ? `${fileChanges}
|
|
18104
18233
|
${providerFileChanges}` : providerFileChanges;
|
|
18105
18234
|
}
|
|
18235
|
+
const toolCalls = formatToolCalls(output);
|
|
18106
18236
|
const providerError = extractProviderError(providerResponse);
|
|
18107
18237
|
const targetAfterEachHook = options.targetHooks?.after_each;
|
|
18108
18238
|
if (workspacePath && hasHookCommand(targetAfterEachHook)) {
|
|
@@ -18186,6 +18316,7 @@ ${providerFileChanges}` : providerFileChanges;
|
|
|
18186
18316
|
targetResolver,
|
|
18187
18317
|
availableTargets,
|
|
18188
18318
|
fileChanges,
|
|
18319
|
+
toolCalls,
|
|
18189
18320
|
workspacePath,
|
|
18190
18321
|
dockerConfig: evalCase.workspace?.docker,
|
|
18191
18322
|
verbose,
|
|
@@ -18383,6 +18514,7 @@ async function evaluateCandidate(options) {
|
|
|
18383
18514
|
targetResolver,
|
|
18384
18515
|
availableTargets,
|
|
18385
18516
|
fileChanges,
|
|
18517
|
+
toolCalls,
|
|
18386
18518
|
workspacePath,
|
|
18387
18519
|
dockerConfig,
|
|
18388
18520
|
threshold: evalThreshold,
|
|
@@ -18411,6 +18543,7 @@ async function evaluateCandidate(options) {
|
|
|
18411
18543
|
targetResolver,
|
|
18412
18544
|
availableTargets,
|
|
18413
18545
|
fileChanges,
|
|
18546
|
+
toolCalls,
|
|
18414
18547
|
workspacePath,
|
|
18415
18548
|
dockerConfig,
|
|
18416
18549
|
threshold: evalThreshold,
|
|
@@ -18488,6 +18621,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
18488
18621
|
targetResolver,
|
|
18489
18622
|
availableTargets,
|
|
18490
18623
|
fileChanges,
|
|
18624
|
+
toolCalls,
|
|
18491
18625
|
workspacePath,
|
|
18492
18626
|
dockerConfig,
|
|
18493
18627
|
threshold,
|
|
@@ -18517,6 +18651,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
18517
18651
|
targetResolver,
|
|
18518
18652
|
availableTargets,
|
|
18519
18653
|
fileChanges,
|
|
18654
|
+
toolCalls,
|
|
18520
18655
|
workspacePath,
|
|
18521
18656
|
dockerConfig,
|
|
18522
18657
|
threshold,
|
|
@@ -18548,6 +18683,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
18548
18683
|
targetResolver,
|
|
18549
18684
|
availableTargets,
|
|
18550
18685
|
fileChanges,
|
|
18686
|
+
toolCalls,
|
|
18551
18687
|
workspacePath,
|
|
18552
18688
|
dockerConfig,
|
|
18553
18689
|
dependencyResults,
|
|
@@ -18589,6 +18725,7 @@ async function runEvaluatorList(options) {
|
|
|
18589
18725
|
targetResolver,
|
|
18590
18726
|
availableTargets,
|
|
18591
18727
|
fileChanges,
|
|
18728
|
+
toolCalls,
|
|
18592
18729
|
workspacePath,
|
|
18593
18730
|
dockerConfig,
|
|
18594
18731
|
dependencyResults
|
|
@@ -18614,6 +18751,7 @@ async function runEvaluatorList(options) {
|
|
|
18614
18751
|
targetResolver,
|
|
18615
18752
|
availableTargets,
|
|
18616
18753
|
fileChanges,
|
|
18754
|
+
toolCalls,
|
|
18617
18755
|
workspacePath,
|
|
18618
18756
|
dockerConfig,
|
|
18619
18757
|
dependencyResults
|
|
@@ -19669,6 +19807,7 @@ export {
|
|
|
19669
19807
|
ExecutionMetricsGrader,
|
|
19670
19808
|
FieldAccuracyGrader,
|
|
19671
19809
|
LatencyGrader,
|
|
19810
|
+
formatToolCalls,
|
|
19672
19811
|
SkillTriggerGrader,
|
|
19673
19812
|
assembleLlmGraderPrompt,
|
|
19674
19813
|
TokenUsageGrader,
|
|
@@ -19737,4 +19876,4 @@ export {
|
|
|
19737
19876
|
loadTestById,
|
|
19738
19877
|
loadEvalCaseById
|
|
19739
19878
|
};
|
|
19740
|
-
//# sourceMappingURL=chunk-QXX3IBYV.js.map
|
|
19879
|
+
//# sourceMappingURL=chunk-ELF6SQAK.js.map
|