@agentv/core 4.19.0 → 4.20.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-QXX3IBYV.js → chunk-ELF6SQAK.js} +160 -21
- package/dist/chunk-ELF6SQAK.js.map +1 -0
- package/dist/index.cjs +193 -18
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +53 -1
- package/dist/index.d.ts +53 -1
- package/dist/index.js +28 -1
- package/dist/index.js.map +1 -1
- package/dist/{ts-eval-loader-XFQ6S4DT.js → ts-eval-loader-32COE32J.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-QXX3IBYV.js.map +0 -1
- package/dist/{ts-eval-loader-XFQ6S4DT.js.map → ts-eval-loader-32COE32J.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -1448,6 +1448,7 @@ var init_template_variables = __esm({
|
|
|
1448
1448
|
INPUT: "input",
|
|
1449
1449
|
OUTPUT: "output",
|
|
1450
1450
|
FILE_CHANGES: "file_changes",
|
|
1451
|
+
TOOL_CALLS: "tool_calls",
|
|
1451
1452
|
/** @deprecated Use INPUT instead — resolves to the same text value. */
|
|
1452
1453
|
INPUT_TEXT: "input_text",
|
|
1453
1454
|
/** @deprecated Use OUTPUT instead — resolves to the same text value. */
|
|
@@ -5832,6 +5833,7 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
5832
5833
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
5833
5834
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
5834
5835
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
|
|
5836
|
+
[TEMPLATE_VARIABLES.TOOL_CALLS]: context2.toolCalls ?? "",
|
|
5835
5837
|
// Deprecated aliases — same values as the primary variables above
|
|
5836
5838
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
5837
5839
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
@@ -5846,6 +5848,12 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
5846
5848
|
|
|
5847
5849
|
[[ ## file_changes ## ]]
|
|
5848
5850
|
${context2.fileChanges}`;
|
|
5851
|
+
}
|
|
5852
|
+
if (context2.toolCalls && !context2.graderTemplateOverride && !this.graderTemplate) {
|
|
5853
|
+
userPrompt += `
|
|
5854
|
+
|
|
5855
|
+
[[ ## tool_calls ## ]]
|
|
5856
|
+
${context2.toolCalls}`;
|
|
5849
5857
|
}
|
|
5850
5858
|
const graderRawRequest = {
|
|
5851
5859
|
userPrompt,
|
|
@@ -6167,6 +6175,7 @@ ${context2.fileChanges}`;
|
|
|
6167
6175
|
[TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
|
|
6168
6176
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
6169
6177
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
|
|
6178
|
+
[TEMPLATE_VARIABLES.TOOL_CALLS]: context2.toolCalls ?? "",
|
|
6170
6179
|
// Deprecated aliases
|
|
6171
6180
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
6172
6181
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
@@ -6195,6 +6204,9 @@ ${context2.fileChanges}`;
|
|
|
6195
6204
|
if (context2.fileChanges) {
|
|
6196
6205
|
parts.push("[[ ## file_changes ## ]]", context2.fileChanges, "");
|
|
6197
6206
|
}
|
|
6207
|
+
if (context2.toolCalls) {
|
|
6208
|
+
parts.push("[[ ## tool_calls ## ]]", context2.toolCalls, "");
|
|
6209
|
+
}
|
|
6198
6210
|
if (rubrics && rubrics.length > 0) {
|
|
6199
6211
|
parts.push("[[ ## rubrics ## ]]");
|
|
6200
6212
|
for (const rubric of rubrics) {
|
|
@@ -6228,6 +6240,7 @@ ${context2.fileChanges}`;
|
|
|
6228
6240
|
[TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
|
|
6229
6241
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
6230
6242
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
|
|
6243
|
+
[TEMPLATE_VARIABLES.TOOL_CALLS]: context2.toolCalls ?? "",
|
|
6231
6244
|
// Deprecated aliases
|
|
6232
6245
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
6233
6246
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
@@ -6257,6 +6270,9 @@ ${outputSchema}`;
|
|
|
6257
6270
|
if (context2.fileChanges) {
|
|
6258
6271
|
parts.push("[[ ## file_changes ## ]]", context2.fileChanges, "");
|
|
6259
6272
|
}
|
|
6273
|
+
if (context2.toolCalls) {
|
|
6274
|
+
parts.push("[[ ## tool_calls ## ]]", context2.toolCalls, "");
|
|
6275
|
+
}
|
|
6260
6276
|
if (rubrics && rubrics.length > 0) {
|
|
6261
6277
|
parts.push("[[ ## rubrics ## ]]");
|
|
6262
6278
|
for (const rubric of rubrics) {
|
|
@@ -6349,6 +6365,9 @@ ${outputSchema}`;
|
|
|
6349
6365
|
if (context2.fileChanges) {
|
|
6350
6366
|
parts.push("[[ ## file_changes ## ]]", context2.fileChanges, "");
|
|
6351
6367
|
}
|
|
6368
|
+
if (context2.toolCalls) {
|
|
6369
|
+
parts.push("[[ ## tool_calls ## ]]", context2.toolCalls, "");
|
|
6370
|
+
}
|
|
6352
6371
|
parts.push("[[ ## scoring_criteria ## ]]");
|
|
6353
6372
|
for (const rubric of rubrics) {
|
|
6354
6373
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
@@ -6391,6 +6410,9 @@ ${outputSchema}`;
|
|
|
6391
6410
|
if (context2.fileChanges) {
|
|
6392
6411
|
parts.push("[[ ## file_changes ## ]]", context2.fileChanges, "");
|
|
6393
6412
|
}
|
|
6413
|
+
if (context2.toolCalls) {
|
|
6414
|
+
parts.push("[[ ## tool_calls ## ]]", context2.toolCalls, "");
|
|
6415
|
+
}
|
|
6394
6416
|
parts.push("[[ ## rubrics ## ]]");
|
|
6395
6417
|
for (const rubric of rubrics) {
|
|
6396
6418
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
@@ -7518,6 +7540,61 @@ var init_latency = __esm({
|
|
|
7518
7540
|
}
|
|
7519
7541
|
});
|
|
7520
7542
|
|
|
7543
|
+
// src/evaluation/graders/format-tool-calls.ts
|
|
7544
|
+
function formatToolCalls(output) {
|
|
7545
|
+
if (!output) return "";
|
|
7546
|
+
const lines = [];
|
|
7547
|
+
for (const message of output) {
|
|
7548
|
+
if (!message.toolCalls) continue;
|
|
7549
|
+
for (const call of message.toolCalls) {
|
|
7550
|
+
const toolName = call.tool ?? "unknown";
|
|
7551
|
+
const detail = extractKeyDetail(toolName, call.input);
|
|
7552
|
+
lines.push(detail ? `- ${toolName}: ${detail}` : `- ${toolName}`);
|
|
7553
|
+
}
|
|
7554
|
+
}
|
|
7555
|
+
return lines.length > 0 ? lines.join("\n") : "";
|
|
7556
|
+
}
|
|
7557
|
+
function extractKeyDetail(toolName, input) {
|
|
7558
|
+
if (!input || typeof input !== "object") return "";
|
|
7559
|
+
const record = input;
|
|
7560
|
+
const knownFields = KEY_INPUT_FIELDS.get(toolName);
|
|
7561
|
+
if (knownFields) {
|
|
7562
|
+
for (const field of knownFields) {
|
|
7563
|
+
const value = record[field];
|
|
7564
|
+
if (typeof value === "string" && value.length > 0) {
|
|
7565
|
+
return truncate(value);
|
|
7566
|
+
}
|
|
7567
|
+
}
|
|
7568
|
+
}
|
|
7569
|
+
for (const value of Object.values(record)) {
|
|
7570
|
+
if (typeof value === "string" && value.length > 0 && value.length <= MAX_FALLBACK_LENGTH) {
|
|
7571
|
+
return truncate(value);
|
|
7572
|
+
}
|
|
7573
|
+
}
|
|
7574
|
+
return "";
|
|
7575
|
+
}
|
|
7576
|
+
function truncate(value, maxLen = 120) {
|
|
7577
|
+
if (value.length <= maxLen) return value;
|
|
7578
|
+
return `${value.slice(0, maxLen)}\u2026`;
|
|
7579
|
+
}
|
|
7580
|
+
var KEY_INPUT_FIELDS, MAX_FALLBACK_LENGTH;
|
|
7581
|
+
var init_format_tool_calls = __esm({
|
|
7582
|
+
"src/evaluation/graders/format-tool-calls.ts"() {
|
|
7583
|
+
"use strict";
|
|
7584
|
+
init_cjs_shims();
|
|
7585
|
+
KEY_INPUT_FIELDS = /* @__PURE__ */ new Map([
|
|
7586
|
+
["Skill", ["skill"]],
|
|
7587
|
+
["Read", ["file_path"]],
|
|
7588
|
+
["Write", ["file_path"]],
|
|
7589
|
+
["Edit", ["file_path"]],
|
|
7590
|
+
["Bash", ["command"]],
|
|
7591
|
+
["Grep", ["pattern"]],
|
|
7592
|
+
["Glob", ["pattern"]]
|
|
7593
|
+
]);
|
|
7594
|
+
MAX_FALLBACK_LENGTH = 120;
|
|
7595
|
+
}
|
|
7596
|
+
});
|
|
7597
|
+
|
|
7521
7598
|
// src/evaluation/graders/skill-trigger.ts
|
|
7522
7599
|
var SkillTriggerGrader;
|
|
7523
7600
|
var init_skill_trigger = __esm({
|
|
@@ -7601,19 +7678,27 @@ function assembleLlmGraderPrompt(input) {
|
|
|
7601
7678
|
promptInputs,
|
|
7602
7679
|
evaluatorConfig,
|
|
7603
7680
|
fileChanges,
|
|
7681
|
+
toolCalls,
|
|
7604
7682
|
graderTemplateOverride
|
|
7605
7683
|
} = input;
|
|
7606
7684
|
const rubrics = evaluatorConfig?.rubrics;
|
|
7607
7685
|
if (rubrics && rubrics.length > 0) {
|
|
7608
7686
|
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
7609
7687
|
if (hasScoreRanges) {
|
|
7610
|
-
return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges);
|
|
7688
|
+
return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
|
|
7611
7689
|
}
|
|
7612
|
-
return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges);
|
|
7690
|
+
return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
|
|
7613
7691
|
}
|
|
7614
|
-
return assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride);
|
|
7692
|
+
return assembleFreeform(
|
|
7693
|
+
evalCase,
|
|
7694
|
+
candidate,
|
|
7695
|
+
promptInputs,
|
|
7696
|
+
fileChanges,
|
|
7697
|
+
toolCalls,
|
|
7698
|
+
graderTemplateOverride
|
|
7699
|
+
);
|
|
7615
7700
|
}
|
|
7616
|
-
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride) {
|
|
7701
|
+
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, toolCalls, graderTemplateOverride) {
|
|
7617
7702
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
7618
7703
|
const variables = {
|
|
7619
7704
|
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
@@ -7621,6 +7706,7 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, grader
|
|
|
7621
7706
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? "").trim(),
|
|
7622
7707
|
[TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
|
|
7623
7708
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
|
|
7709
|
+
[TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? "",
|
|
7624
7710
|
// Deprecated aliases
|
|
7625
7711
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
7626
7712
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
|
|
@@ -7634,6 +7720,12 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, grader
|
|
|
7634
7720
|
|
|
7635
7721
|
[[ ## file_changes ## ]]
|
|
7636
7722
|
${fileChanges}`;
|
|
7723
|
+
}
|
|
7724
|
+
if (toolCalls && !graderTemplateOverride) {
|
|
7725
|
+
userPrompt += `
|
|
7726
|
+
|
|
7727
|
+
[[ ## tool_calls ## ]]
|
|
7728
|
+
${toolCalls}`;
|
|
7637
7729
|
}
|
|
7638
7730
|
return {
|
|
7639
7731
|
systemPrompt,
|
|
@@ -7642,7 +7734,7 @@ ${fileChanges}`;
|
|
|
7642
7734
|
mode: "freeform"
|
|
7643
7735
|
};
|
|
7644
7736
|
}
|
|
7645
|
-
function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges) {
|
|
7737
|
+
function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls) {
|
|
7646
7738
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
7647
7739
|
const parts = [
|
|
7648
7740
|
"You are an expert grader. Evaluate the candidate answer against each rubric item below.",
|
|
@@ -7661,6 +7753,9 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
|
|
|
7661
7753
|
if (fileChanges) {
|
|
7662
7754
|
parts.push("[[ ## file_changes ## ]]", fileChanges, "");
|
|
7663
7755
|
}
|
|
7756
|
+
if (toolCalls) {
|
|
7757
|
+
parts.push("[[ ## tool_calls ## ]]", toolCalls, "");
|
|
7758
|
+
}
|
|
7664
7759
|
parts.push("[[ ## rubrics ## ]]");
|
|
7665
7760
|
for (const rubric of rubrics) {
|
|
7666
7761
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
@@ -7677,7 +7772,7 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
|
|
|
7677
7772
|
mode: "checklist"
|
|
7678
7773
|
};
|
|
7679
7774
|
}
|
|
7680
|
-
function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges) {
|
|
7775
|
+
function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls) {
|
|
7681
7776
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
7682
7777
|
const parts = [
|
|
7683
7778
|
"You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.",
|
|
@@ -7697,6 +7792,9 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
7697
7792
|
if (fileChanges) {
|
|
7698
7793
|
parts.push("[[ ## file_changes ## ]]", fileChanges, "");
|
|
7699
7794
|
}
|
|
7795
|
+
if (toolCalls) {
|
|
7796
|
+
parts.push("[[ ## tool_calls ## ]]", toolCalls, "");
|
|
7797
|
+
}
|
|
7700
7798
|
parts.push("[[ ## scoring_criteria ## ]]");
|
|
7701
7799
|
for (const rubric of rubrics) {
|
|
7702
7800
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
@@ -8470,6 +8568,7 @@ var init_graders = __esm({
|
|
|
8470
8568
|
init_field_accuracy();
|
|
8471
8569
|
init_latency();
|
|
8472
8570
|
init_llm_grader();
|
|
8571
|
+
init_format_tool_calls();
|
|
8473
8572
|
init_skill_trigger();
|
|
8474
8573
|
init_llm_grader_prompt();
|
|
8475
8574
|
init_token_usage();
|
|
@@ -19200,6 +19299,7 @@ async function runEvaluation(options) {
|
|
|
19200
19299
|
trials,
|
|
19201
19300
|
streamCallbacks,
|
|
19202
19301
|
budgetUsd,
|
|
19302
|
+
runBudgetTracker,
|
|
19203
19303
|
failOnError,
|
|
19204
19304
|
poolWorkspaces,
|
|
19205
19305
|
poolMaxSlots: configPoolMaxSlots,
|
|
@@ -19534,8 +19634,14 @@ async function runEvaluation(options) {
|
|
|
19534
19634
|
}
|
|
19535
19635
|
}
|
|
19536
19636
|
return { ok: allPassed, depResults };
|
|
19637
|
+
}, extractEvaluationCostUsd2 = function(result) {
|
|
19638
|
+
if (result.trials && result.trials.length > 0) {
|
|
19639
|
+
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
|
|
19640
|
+
return trialCostSum > 0 ? trialCostSum : void 0;
|
|
19641
|
+
}
|
|
19642
|
+
return result.costUsd;
|
|
19537
19643
|
};
|
|
19538
|
-
var toDependencyResult = toDependencyResult2, checkDependencies = checkDependencies2;
|
|
19644
|
+
var toDependencyResult = toDependencyResult2, checkDependencies = checkDependencies2, extractEvaluationCostUsd = extractEvaluationCostUsd2;
|
|
19539
19645
|
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
19540
19646
|
const copiedWorkspaceFile = import_node_path47.default.join(sharedWorkspacePath, import_node_path47.default.basename(suiteWorkspaceFile));
|
|
19541
19647
|
try {
|
|
@@ -19728,6 +19834,42 @@ async function runEvaluation(options) {
|
|
|
19728
19834
|
async function dispatchTest(evalCase, depResults) {
|
|
19729
19835
|
const workerId = nextWorkerId++;
|
|
19730
19836
|
workerIdByEvalId.set(evalCase.id, workerId);
|
|
19837
|
+
if (runBudgetTracker?.isExceeded()) {
|
|
19838
|
+
const budgetResult = {
|
|
19839
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
19840
|
+
testId: evalCase.id,
|
|
19841
|
+
suite: evalCase.suite,
|
|
19842
|
+
category: evalCase.category,
|
|
19843
|
+
score: 0,
|
|
19844
|
+
assertions: [],
|
|
19845
|
+
output: [],
|
|
19846
|
+
target: target.name,
|
|
19847
|
+
error: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`,
|
|
19848
|
+
budgetExceeded: true,
|
|
19849
|
+
executionStatus: "execution_error",
|
|
19850
|
+
failureStage: "setup",
|
|
19851
|
+
failureReasonCode: "budget_exceeded",
|
|
19852
|
+
executionError: {
|
|
19853
|
+
message: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`,
|
|
19854
|
+
stage: "setup"
|
|
19855
|
+
}
|
|
19856
|
+
};
|
|
19857
|
+
if (onProgress) {
|
|
19858
|
+
await onProgress({
|
|
19859
|
+
workerId,
|
|
19860
|
+
testId: evalCase.id,
|
|
19861
|
+
status: "failed",
|
|
19862
|
+
completedAt: Date.now(),
|
|
19863
|
+
error: budgetResult.error,
|
|
19864
|
+
score: budgetResult.score,
|
|
19865
|
+
executionStatus: budgetResult.executionStatus
|
|
19866
|
+
});
|
|
19867
|
+
}
|
|
19868
|
+
if (onResult) {
|
|
19869
|
+
await onResult(budgetResult);
|
|
19870
|
+
}
|
|
19871
|
+
return budgetResult;
|
|
19872
|
+
}
|
|
19731
19873
|
if (budgetUsd !== void 0 && budgetExhausted) {
|
|
19732
19874
|
const budgetResult = {
|
|
19733
19875
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
@@ -19841,22 +19983,17 @@ async function runEvaluation(options) {
|
|
|
19841
19983
|
...depResults && Object.keys(depResults).length > 0 ? { dependencyResults: depResults } : {}
|
|
19842
19984
|
};
|
|
19843
19985
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
19844
|
-
|
|
19845
|
-
|
|
19846
|
-
if (result.trials && result.trials.length > 0) {
|
|
19847
|
-
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
|
|
19848
|
-
if (trialCostSum > 0) {
|
|
19849
|
-
caseCost = trialCostSum;
|
|
19850
|
-
}
|
|
19851
|
-
} else {
|
|
19852
|
-
caseCost = result.costUsd;
|
|
19853
|
-
}
|
|
19854
|
-
if (caseCost !== void 0) {
|
|
19986
|
+
const caseCost = extractEvaluationCostUsd2(result);
|
|
19987
|
+
if (caseCost !== void 0) {
|
|
19988
|
+
if (budgetUsd !== void 0) {
|
|
19855
19989
|
cumulativeBudgetCost += caseCost;
|
|
19856
19990
|
if (cumulativeBudgetCost >= budgetUsd) {
|
|
19857
19991
|
budgetExhausted = true;
|
|
19858
19992
|
}
|
|
19859
19993
|
}
|
|
19994
|
+
if (runBudgetTracker) {
|
|
19995
|
+
runBudgetTracker.add(caseCost);
|
|
19996
|
+
}
|
|
19860
19997
|
}
|
|
19861
19998
|
if (failOnError === true && result.executionStatus === "execution_error") {
|
|
19862
19999
|
failOnErrorTriggered = true;
|
|
@@ -20664,6 +20801,7 @@ async function runEvalCase(options) {
|
|
|
20664
20801
|
fileChanges = fileChanges ? `${fileChanges}
|
|
20665
20802
|
${providerFileChanges}` : providerFileChanges;
|
|
20666
20803
|
}
|
|
20804
|
+
const toolCalls = formatToolCalls(output);
|
|
20667
20805
|
const providerError = extractProviderError(providerResponse);
|
|
20668
20806
|
const targetAfterEachHook = options.targetHooks?.after_each;
|
|
20669
20807
|
if (workspacePath && hasHookCommand(targetAfterEachHook)) {
|
|
@@ -20747,6 +20885,7 @@ ${providerFileChanges}` : providerFileChanges;
|
|
|
20747
20885
|
targetResolver,
|
|
20748
20886
|
availableTargets,
|
|
20749
20887
|
fileChanges,
|
|
20888
|
+
toolCalls,
|
|
20750
20889
|
workspacePath,
|
|
20751
20890
|
dockerConfig: evalCase.workspace?.docker,
|
|
20752
20891
|
verbose,
|
|
@@ -20944,6 +21083,7 @@ async function evaluateCandidate(options) {
|
|
|
20944
21083
|
targetResolver,
|
|
20945
21084
|
availableTargets,
|
|
20946
21085
|
fileChanges,
|
|
21086
|
+
toolCalls,
|
|
20947
21087
|
workspacePath,
|
|
20948
21088
|
dockerConfig,
|
|
20949
21089
|
threshold: evalThreshold,
|
|
@@ -20972,6 +21112,7 @@ async function evaluateCandidate(options) {
|
|
|
20972
21112
|
targetResolver,
|
|
20973
21113
|
availableTargets,
|
|
20974
21114
|
fileChanges,
|
|
21115
|
+
toolCalls,
|
|
20975
21116
|
workspacePath,
|
|
20976
21117
|
dockerConfig,
|
|
20977
21118
|
threshold: evalThreshold,
|
|
@@ -21049,6 +21190,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
21049
21190
|
targetResolver,
|
|
21050
21191
|
availableTargets,
|
|
21051
21192
|
fileChanges,
|
|
21193
|
+
toolCalls,
|
|
21052
21194
|
workspacePath,
|
|
21053
21195
|
dockerConfig,
|
|
21054
21196
|
threshold,
|
|
@@ -21078,6 +21220,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
21078
21220
|
targetResolver,
|
|
21079
21221
|
availableTargets,
|
|
21080
21222
|
fileChanges,
|
|
21223
|
+
toolCalls,
|
|
21081
21224
|
workspacePath,
|
|
21082
21225
|
dockerConfig,
|
|
21083
21226
|
threshold,
|
|
@@ -21109,6 +21252,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
21109
21252
|
targetResolver,
|
|
21110
21253
|
availableTargets,
|
|
21111
21254
|
fileChanges,
|
|
21255
|
+
toolCalls,
|
|
21112
21256
|
workspacePath,
|
|
21113
21257
|
dockerConfig,
|
|
21114
21258
|
dependencyResults,
|
|
@@ -21150,6 +21294,7 @@ async function runEvaluatorList(options) {
|
|
|
21150
21294
|
targetResolver,
|
|
21151
21295
|
availableTargets,
|
|
21152
21296
|
fileChanges,
|
|
21297
|
+
toolCalls,
|
|
21153
21298
|
workspacePath,
|
|
21154
21299
|
dockerConfig,
|
|
21155
21300
|
dependencyResults
|
|
@@ -21175,6 +21320,7 @@ async function runEvaluatorList(options) {
|
|
|
21175
21320
|
targetResolver,
|
|
21176
21321
|
availableTargets,
|
|
21177
21322
|
fileChanges,
|
|
21323
|
+
toolCalls,
|
|
21178
21324
|
workspacePath,
|
|
21179
21325
|
dockerConfig,
|
|
21180
21326
|
dependencyResults
|
|
@@ -24109,6 +24255,7 @@ __export(index_exports, {
|
|
|
24109
24255
|
ProviderRegistry: () => ProviderRegistry,
|
|
24110
24256
|
RepoManager: () => RepoManager,
|
|
24111
24257
|
ResponseCache: () => ResponseCache,
|
|
24258
|
+
RunBudgetTracker: () => RunBudgetTracker,
|
|
24112
24259
|
SkillTriggerGrader: () => SkillTriggerGrader,
|
|
24113
24260
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
24114
24261
|
TemplateNotDirectoryError: () => TemplateNotDirectoryError,
|
|
@@ -24180,6 +24327,7 @@ __export(index_exports, {
|
|
|
24180
24327
|
extractWorkersFromSuite: () => extractWorkersFromSuite,
|
|
24181
24328
|
fileExists: () => fileExists2,
|
|
24182
24329
|
findGitRoot: () => findGitRoot,
|
|
24330
|
+
formatToolCalls: () => formatToolCalls,
|
|
24183
24331
|
freeformEvaluationSchema: () => freeformEvaluationSchema,
|
|
24184
24332
|
generateRubrics: () => generateRubrics,
|
|
24185
24333
|
getAgentvConfigDir: () => getAgentvConfigDir,
|
|
@@ -25857,6 +26005,31 @@ init_assertion_discovery();
|
|
|
25857
26005
|
init_assertions();
|
|
25858
26006
|
init_grader_discovery();
|
|
25859
26007
|
|
|
26008
|
+
// src/evaluation/run-budget-tracker.ts
|
|
26009
|
+
init_cjs_shims();
|
|
26010
|
+
var RunBudgetTracker = class {
|
|
26011
|
+
constructor(capUsd) {
|
|
26012
|
+
this.capUsd = capUsd;
|
|
26013
|
+
}
|
|
26014
|
+
cumulative = 0;
|
|
26015
|
+
/** Accumulate cost from a completed test or file. */
|
|
26016
|
+
add(costUsd) {
|
|
26017
|
+
this.cumulative += costUsd;
|
|
26018
|
+
}
|
|
26019
|
+
/** True when cumulative cost meets or exceeds the cap. */
|
|
26020
|
+
isExceeded() {
|
|
26021
|
+
return this.cumulative >= this.capUsd;
|
|
26022
|
+
}
|
|
26023
|
+
/** Current accumulated cost. */
|
|
26024
|
+
get currentCostUsd() {
|
|
26025
|
+
return this.cumulative;
|
|
26026
|
+
}
|
|
26027
|
+
/** The configured cap. */
|
|
26028
|
+
get budgetCapUsd() {
|
|
26029
|
+
return this.capUsd;
|
|
26030
|
+
}
|
|
26031
|
+
};
|
|
26032
|
+
|
|
25860
26033
|
// src/import/index.ts
|
|
25861
26034
|
init_cjs_shims();
|
|
25862
26035
|
|
|
@@ -26525,6 +26698,7 @@ function createAgentKernel() {
|
|
|
26525
26698
|
ProviderRegistry,
|
|
26526
26699
|
RepoManager,
|
|
26527
26700
|
ResponseCache,
|
|
26701
|
+
RunBudgetTracker,
|
|
26528
26702
|
SkillTriggerGrader,
|
|
26529
26703
|
TEST_MESSAGE_ROLES,
|
|
26530
26704
|
TemplateNotDirectoryError,
|
|
@@ -26596,6 +26770,7 @@ function createAgentKernel() {
|
|
|
26596
26770
|
extractWorkersFromSuite,
|
|
26597
26771
|
fileExists,
|
|
26598
26772
|
findGitRoot,
|
|
26773
|
+
formatToolCalls,
|
|
26599
26774
|
freeformEvaluationSchema,
|
|
26600
26775
|
generateRubrics,
|
|
26601
26776
|
getAgentvConfigDir,
|