agentv 4.19.0 → 4.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{artifact-writer-YATMDPWI.js → artifact-writer-RFXWXUOV.js} +4 -4
- package/dist/{chunk-R2QDYORI.js → chunk-36HXBYUY.js} +25 -2
- package/dist/chunk-36HXBYUY.js.map +1 -0
- package/dist/{chunk-62M5MR5K.js → chunk-KJZ7PZCE.js} +19 -6
- package/dist/chunk-KJZ7PZCE.js.map +1 -0
- package/dist/{chunk-PTYQS37Y.js → chunk-LP4Y5D2Z.js} +161 -24
- package/dist/chunk-LP4Y5D2Z.js.map +1 -0
- package/dist/{chunk-IWI4AJRS.js → chunk-PHGEGHKR.js} +55 -10
- package/dist/chunk-PHGEGHKR.js.map +1 -0
- package/dist/{chunk-NL6P5MUH.js → chunk-ZNS74WKH.js} +3 -3
- package/dist/cli.js +5 -5
- package/dist/{dist-RTIUSC6L.js → dist-GURCO6IS.js} +7 -3
- package/dist/index.js +5 -5
- package/dist/{interactive-7AZMOH2V.js → interactive-GLRASSKM.js} +5 -5
- package/dist/{ts-eval-loader-XFQ6S4DT-S7P2UUBX.js → ts-eval-loader-32COE32J-TCT4RIRT.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-62M5MR5K.js.map +0 -1
- package/dist/chunk-IWI4AJRS.js.map +0 -1
- package/dist/chunk-PTYQS37Y.js.map +0 -1
- package/dist/chunk-R2QDYORI.js.map +0 -1
- /package/dist/{artifact-writer-YATMDPWI.js.map → artifact-writer-RFXWXUOV.js.map} +0 -0
- /package/dist/{chunk-NL6P5MUH.js.map → chunk-ZNS74WKH.js.map} +0 -0
- /package/dist/{dist-RTIUSC6L.js.map → dist-GURCO6IS.js.map} +0 -0
- /package/dist/{interactive-7AZMOH2V.js.map → interactive-GLRASSKM.js.map} +0 -0
- /package/dist/{ts-eval-loader-XFQ6S4DT-S7P2UUBX.js.map → ts-eval-loader-32COE32J-TCT4RIRT.js.map} +0 -0
|
@@ -8225,7 +8225,7 @@ var _a20;
|
|
|
8225
8225
|
_a20 = symbol20;
|
|
8226
8226
|
var defaultDownload2 = createDownload();
|
|
8227
8227
|
|
|
8228
|
-
// ../../packages/core/dist/chunk-
|
|
8228
|
+
// ../../packages/core/dist/chunk-ELF6SQAK.js
|
|
8229
8229
|
import path46 from "node:path";
|
|
8230
8230
|
import { pathToFileURL as pathToFileURL2 } from "node:url";
|
|
8231
8231
|
import { existsSync as existsSync6 } from "node:fs";
|
|
@@ -12915,7 +12915,7 @@ var openrouter = createOpenRouter({
|
|
|
12915
12915
|
// strict for OpenRouter API
|
|
12916
12916
|
});
|
|
12917
12917
|
|
|
12918
|
-
// ../../packages/core/dist/chunk-
|
|
12918
|
+
// ../../packages/core/dist/chunk-ELF6SQAK.js
|
|
12919
12919
|
import { spawn } from "node:child_process";
|
|
12920
12920
|
import { randomUUID } from "node:crypto";
|
|
12921
12921
|
import { createWriteStream } from "node:fs";
|
|
@@ -14419,7 +14419,7 @@ var RequestError = class _RequestError extends Error {
|
|
|
14419
14419
|
}
|
|
14420
14420
|
};
|
|
14421
14421
|
|
|
14422
|
-
// ../../packages/core/dist/chunk-
|
|
14422
|
+
// ../../packages/core/dist/chunk-ELF6SQAK.js
|
|
14423
14423
|
import { exec as execCallback } from "node:child_process";
|
|
14424
14424
|
import { readdirSync, statSync } from "node:fs";
|
|
14425
14425
|
import { readFile as readFile22, readdir, stat } from "node:fs/promises";
|
|
@@ -15461,6 +15461,7 @@ var TEMPLATE_VARIABLES = {
|
|
|
15461
15461
|
INPUT: "input",
|
|
15462
15462
|
OUTPUT: "output",
|
|
15463
15463
|
FILE_CHANGES: "file_changes",
|
|
15464
|
+
TOOL_CALLS: "tool_calls",
|
|
15464
15465
|
/** @deprecated Use INPUT instead — resolves to the same text value. */
|
|
15465
15466
|
INPUT_TEXT: "input_text",
|
|
15466
15467
|
/** @deprecated Use OUTPUT instead — resolves to the same text value. */
|
|
@@ -15637,6 +15638,7 @@ var LlmGrader = class {
|
|
|
15637
15638
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
15638
15639
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
15639
15640
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
|
|
15641
|
+
[TEMPLATE_VARIABLES.TOOL_CALLS]: context2.toolCalls ?? "",
|
|
15640
15642
|
// Deprecated aliases — same values as the primary variables above
|
|
15641
15643
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
15642
15644
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
@@ -15651,6 +15653,12 @@ var LlmGrader = class {
|
|
|
15651
15653
|
|
|
15652
15654
|
[[ ## file_changes ## ]]
|
|
15653
15655
|
${context2.fileChanges}`;
|
|
15656
|
+
}
|
|
15657
|
+
if (context2.toolCalls && !context2.graderTemplateOverride && !this.graderTemplate) {
|
|
15658
|
+
userPrompt += `
|
|
15659
|
+
|
|
15660
|
+
[[ ## tool_calls ## ]]
|
|
15661
|
+
${context2.toolCalls}`;
|
|
15654
15662
|
}
|
|
15655
15663
|
const graderRawRequest = {
|
|
15656
15664
|
userPrompt,
|
|
@@ -15972,6 +15980,7 @@ ${context2.fileChanges}`;
|
|
|
15972
15980
|
[TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
|
|
15973
15981
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
15974
15982
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
|
|
15983
|
+
[TEMPLATE_VARIABLES.TOOL_CALLS]: context2.toolCalls ?? "",
|
|
15975
15984
|
// Deprecated aliases
|
|
15976
15985
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
15977
15986
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
@@ -16000,6 +16009,9 @@ ${context2.fileChanges}`;
|
|
|
16000
16009
|
if (context2.fileChanges) {
|
|
16001
16010
|
parts.push("[[ ## file_changes ## ]]", context2.fileChanges, "");
|
|
16002
16011
|
}
|
|
16012
|
+
if (context2.toolCalls) {
|
|
16013
|
+
parts.push("[[ ## tool_calls ## ]]", context2.toolCalls, "");
|
|
16014
|
+
}
|
|
16003
16015
|
if (rubrics && rubrics.length > 0) {
|
|
16004
16016
|
parts.push("[[ ## rubrics ## ]]");
|
|
16005
16017
|
for (const rubric of rubrics) {
|
|
@@ -16033,6 +16045,7 @@ ${context2.fileChanges}`;
|
|
|
16033
16045
|
[TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
|
|
16034
16046
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
16035
16047
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
|
|
16048
|
+
[TEMPLATE_VARIABLES.TOOL_CALLS]: context2.toolCalls ?? "",
|
|
16036
16049
|
// Deprecated aliases
|
|
16037
16050
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
16038
16051
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
@@ -16062,6 +16075,9 @@ ${outputSchema2}`;
|
|
|
16062
16075
|
if (context2.fileChanges) {
|
|
16063
16076
|
parts.push("[[ ## file_changes ## ]]", context2.fileChanges, "");
|
|
16064
16077
|
}
|
|
16078
|
+
if (context2.toolCalls) {
|
|
16079
|
+
parts.push("[[ ## tool_calls ## ]]", context2.toolCalls, "");
|
|
16080
|
+
}
|
|
16065
16081
|
if (rubrics && rubrics.length > 0) {
|
|
16066
16082
|
parts.push("[[ ## rubrics ## ]]");
|
|
16067
16083
|
for (const rubric of rubrics) {
|
|
@@ -16154,6 +16170,9 @@ ${outputSchema2}`;
|
|
|
16154
16170
|
if (context2.fileChanges) {
|
|
16155
16171
|
parts.push("[[ ## file_changes ## ]]", context2.fileChanges, "");
|
|
16156
16172
|
}
|
|
16173
|
+
if (context2.toolCalls) {
|
|
16174
|
+
parts.push("[[ ## tool_calls ## ]]", context2.toolCalls, "");
|
|
16175
|
+
}
|
|
16157
16176
|
parts.push("[[ ## scoring_criteria ## ]]");
|
|
16158
16177
|
for (const rubric of rubrics) {
|
|
16159
16178
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
@@ -16196,6 +16215,9 @@ ${outputSchema2}`;
|
|
|
16196
16215
|
if (context2.fileChanges) {
|
|
16197
16216
|
parts.push("[[ ## file_changes ## ]]", context2.fileChanges, "");
|
|
16198
16217
|
}
|
|
16218
|
+
if (context2.toolCalls) {
|
|
16219
|
+
parts.push("[[ ## tool_calls ## ]]", context2.toolCalls, "");
|
|
16220
|
+
}
|
|
16199
16221
|
parts.push("[[ ## rubrics ## ]]");
|
|
16200
16222
|
for (const rubric of rubrics) {
|
|
16201
16223
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
@@ -17694,6 +17716,52 @@ var LatencyGrader = class {
|
|
|
17694
17716
|
};
|
|
17695
17717
|
}
|
|
17696
17718
|
};
|
|
17719
|
+
var KEY_INPUT_FIELDS = /* @__PURE__ */ new Map([
|
|
17720
|
+
["Skill", ["skill"]],
|
|
17721
|
+
["Read", ["file_path"]],
|
|
17722
|
+
["Write", ["file_path"]],
|
|
17723
|
+
["Edit", ["file_path"]],
|
|
17724
|
+
["Bash", ["command"]],
|
|
17725
|
+
["Grep", ["pattern"]],
|
|
17726
|
+
["Glob", ["pattern"]]
|
|
17727
|
+
]);
|
|
17728
|
+
var MAX_FALLBACK_LENGTH = 120;
|
|
17729
|
+
function formatToolCalls(output) {
|
|
17730
|
+
if (!output) return "";
|
|
17731
|
+
const lines = [];
|
|
17732
|
+
for (const message of output) {
|
|
17733
|
+
if (!message.toolCalls) continue;
|
|
17734
|
+
for (const call of message.toolCalls) {
|
|
17735
|
+
const toolName = call.tool ?? "unknown";
|
|
17736
|
+
const detail = extractKeyDetail(toolName, call.input);
|
|
17737
|
+
lines.push(detail ? `- ${toolName}: ${detail}` : `- ${toolName}`);
|
|
17738
|
+
}
|
|
17739
|
+
}
|
|
17740
|
+
return lines.length > 0 ? lines.join("\n") : "";
|
|
17741
|
+
}
|
|
17742
|
+
function extractKeyDetail(toolName, input) {
|
|
17743
|
+
if (!input || typeof input !== "object") return "";
|
|
17744
|
+
const record = input;
|
|
17745
|
+
const knownFields = KEY_INPUT_FIELDS.get(toolName);
|
|
17746
|
+
if (knownFields) {
|
|
17747
|
+
for (const field of knownFields) {
|
|
17748
|
+
const value = record[field];
|
|
17749
|
+
if (typeof value === "string" && value.length > 0) {
|
|
17750
|
+
return truncate(value);
|
|
17751
|
+
}
|
|
17752
|
+
}
|
|
17753
|
+
}
|
|
17754
|
+
for (const value of Object.values(record)) {
|
|
17755
|
+
if (typeof value === "string" && value.length > 0 && value.length <= MAX_FALLBACK_LENGTH) {
|
|
17756
|
+
return truncate(value);
|
|
17757
|
+
}
|
|
17758
|
+
}
|
|
17759
|
+
return "";
|
|
17760
|
+
}
|
|
17761
|
+
function truncate(value, maxLen = 120) {
|
|
17762
|
+
if (value.length <= maxLen) return value;
|
|
17763
|
+
return `${value.slice(0, maxLen)}\u2026`;
|
|
17764
|
+
}
|
|
17697
17765
|
var SkillTriggerGrader = class {
|
|
17698
17766
|
kind = "skill-trigger";
|
|
17699
17767
|
config;
|
|
@@ -17767,19 +17835,27 @@ function assembleLlmGraderPrompt(input) {
|
|
|
17767
17835
|
promptInputs,
|
|
17768
17836
|
evaluatorConfig,
|
|
17769
17837
|
fileChanges,
|
|
17838
|
+
toolCalls,
|
|
17770
17839
|
graderTemplateOverride
|
|
17771
17840
|
} = input;
|
|
17772
17841
|
const rubrics = evaluatorConfig?.rubrics;
|
|
17773
17842
|
if (rubrics && rubrics.length > 0) {
|
|
17774
17843
|
const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
|
|
17775
17844
|
if (hasScoreRanges) {
|
|
17776
|
-
return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges);
|
|
17845
|
+
return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
|
|
17777
17846
|
}
|
|
17778
|
-
return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges);
|
|
17847
|
+
return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
|
|
17779
17848
|
}
|
|
17780
|
-
return assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride);
|
|
17849
|
+
return assembleFreeform(
|
|
17850
|
+
evalCase,
|
|
17851
|
+
candidate,
|
|
17852
|
+
promptInputs,
|
|
17853
|
+
fileChanges,
|
|
17854
|
+
toolCalls,
|
|
17855
|
+
graderTemplateOverride
|
|
17856
|
+
);
|
|
17781
17857
|
}
|
|
17782
|
-
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride) {
|
|
17858
|
+
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, toolCalls, graderTemplateOverride) {
|
|
17783
17859
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
17784
17860
|
const variables = {
|
|
17785
17861
|
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
@@ -17787,6 +17863,7 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, grader
|
|
|
17787
17863
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? "").trim(),
|
|
17788
17864
|
[TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
|
|
17789
17865
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
|
|
17866
|
+
[TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? "",
|
|
17790
17867
|
// Deprecated aliases
|
|
17791
17868
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
17792
17869
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
|
|
@@ -17800,6 +17877,12 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, grader
|
|
|
17800
17877
|
|
|
17801
17878
|
[[ ## file_changes ## ]]
|
|
17802
17879
|
${fileChanges}`;
|
|
17880
|
+
}
|
|
17881
|
+
if (toolCalls && !graderTemplateOverride) {
|
|
17882
|
+
userPrompt += `
|
|
17883
|
+
|
|
17884
|
+
[[ ## tool_calls ## ]]
|
|
17885
|
+
${toolCalls}`;
|
|
17803
17886
|
}
|
|
17804
17887
|
return {
|
|
17805
17888
|
systemPrompt,
|
|
@@ -17808,7 +17891,7 @@ ${fileChanges}`;
|
|
|
17808
17891
|
mode: "freeform"
|
|
17809
17892
|
};
|
|
17810
17893
|
}
|
|
17811
|
-
function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges) {
|
|
17894
|
+
function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls) {
|
|
17812
17895
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
17813
17896
|
const parts = [
|
|
17814
17897
|
"You are an expert grader. Evaluate the candidate answer against each rubric item below.",
|
|
@@ -17827,6 +17910,9 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
|
|
|
17827
17910
|
if (fileChanges) {
|
|
17828
17911
|
parts.push("[[ ## file_changes ## ]]", fileChanges, "");
|
|
17829
17912
|
}
|
|
17913
|
+
if (toolCalls) {
|
|
17914
|
+
parts.push("[[ ## tool_calls ## ]]", toolCalls, "");
|
|
17915
|
+
}
|
|
17830
17916
|
parts.push("[[ ## rubrics ## ]]");
|
|
17831
17917
|
for (const rubric of rubrics) {
|
|
17832
17918
|
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
@@ -17843,7 +17929,7 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
|
|
|
17843
17929
|
mode: "checklist"
|
|
17844
17930
|
};
|
|
17845
17931
|
}
|
|
17846
|
-
function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges) {
|
|
17932
|
+
function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls) {
|
|
17847
17933
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
17848
17934
|
const parts = [
|
|
17849
17935
|
"You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.",
|
|
@@ -17863,6 +17949,9 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
17863
17949
|
if (fileChanges) {
|
|
17864
17950
|
parts.push("[[ ## file_changes ## ]]", fileChanges, "");
|
|
17865
17951
|
}
|
|
17952
|
+
if (toolCalls) {
|
|
17953
|
+
parts.push("[[ ## tool_calls ## ]]", toolCalls, "");
|
|
17954
|
+
}
|
|
17866
17955
|
parts.push("[[ ## scoring_criteria ## ]]");
|
|
17867
17956
|
for (const rubric of rubrics) {
|
|
17868
17957
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
@@ -30105,7 +30194,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
30105
30194
|
return { tests: await loadTestsFromAgentSkills(evalFilePath) };
|
|
30106
30195
|
}
|
|
30107
30196
|
if (format === "typescript") {
|
|
30108
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-XFQ6S4DT-S7P2UUBX.js");
|
|
30197
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-32COE32J-TCT4RIRT.js");
|
|
30109
30198
|
return loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
30110
30199
|
}
|
|
30111
30200
|
const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
|
|
@@ -30140,7 +30229,7 @@ async function loadTests(evalFilePath, repoRoot, options) {
|
|
|
30140
30229
|
return loadTestsFromAgentSkills(evalFilePath);
|
|
30141
30230
|
}
|
|
30142
30231
|
if (format === "typescript") {
|
|
30143
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-XFQ6S4DT-S7P2UUBX.js");
|
|
30232
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-32COE32J-TCT4RIRT.js");
|
|
30144
30233
|
const suite = await loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
30145
30234
|
return suite.tests;
|
|
30146
30235
|
}
|
|
@@ -30743,6 +30832,7 @@ async function runEvaluation(options) {
|
|
|
30743
30832
|
trials,
|
|
30744
30833
|
streamCallbacks,
|
|
30745
30834
|
budgetUsd,
|
|
30835
|
+
runBudgetTracker,
|
|
30746
30836
|
failOnError,
|
|
30747
30837
|
poolWorkspaces,
|
|
30748
30838
|
poolMaxSlots: configPoolMaxSlots,
|
|
@@ -31077,8 +31167,14 @@ async function runEvaluation(options) {
|
|
|
31077
31167
|
}
|
|
31078
31168
|
}
|
|
31079
31169
|
return { ok: allPassed, depResults };
|
|
31170
|
+
}, extractEvaluationCostUsd2 = function(result) {
|
|
31171
|
+
if (result.trials && result.trials.length > 0) {
|
|
31172
|
+
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
|
|
31173
|
+
return trialCostSum > 0 ? trialCostSum : void 0;
|
|
31174
|
+
}
|
|
31175
|
+
return result.costUsd;
|
|
31080
31176
|
};
|
|
31081
|
-
var toDependencyResult = toDependencyResult2, checkDependencies = checkDependencies2;
|
|
31177
|
+
var toDependencyResult = toDependencyResult2, checkDependencies = checkDependencies2, extractEvaluationCostUsd = extractEvaluationCostUsd2;
|
|
31082
31178
|
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
31083
31179
|
const copiedWorkspaceFile = path44.join(sharedWorkspacePath, path44.basename(suiteWorkspaceFile));
|
|
31084
31180
|
try {
|
|
@@ -31271,6 +31367,42 @@ async function runEvaluation(options) {
|
|
|
31271
31367
|
async function dispatchTest(evalCase, depResults) {
|
|
31272
31368
|
const workerId = nextWorkerId++;
|
|
31273
31369
|
workerIdByEvalId.set(evalCase.id, workerId);
|
|
31370
|
+
if (runBudgetTracker?.isExceeded()) {
|
|
31371
|
+
const budgetResult = {
|
|
31372
|
+
timestamp: (now2 ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
31373
|
+
testId: evalCase.id,
|
|
31374
|
+
suite: evalCase.suite,
|
|
31375
|
+
category: evalCase.category,
|
|
31376
|
+
score: 0,
|
|
31377
|
+
assertions: [],
|
|
31378
|
+
output: [],
|
|
31379
|
+
target: target.name,
|
|
31380
|
+
error: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`,
|
|
31381
|
+
budgetExceeded: true,
|
|
31382
|
+
executionStatus: "execution_error",
|
|
31383
|
+
failureStage: "setup",
|
|
31384
|
+
failureReasonCode: "budget_exceeded",
|
|
31385
|
+
executionError: {
|
|
31386
|
+
message: `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`,
|
|
31387
|
+
stage: "setup"
|
|
31388
|
+
}
|
|
31389
|
+
};
|
|
31390
|
+
if (onProgress) {
|
|
31391
|
+
await onProgress({
|
|
31392
|
+
workerId,
|
|
31393
|
+
testId: evalCase.id,
|
|
31394
|
+
status: "failed",
|
|
31395
|
+
completedAt: Date.now(),
|
|
31396
|
+
error: budgetResult.error,
|
|
31397
|
+
score: budgetResult.score,
|
|
31398
|
+
executionStatus: budgetResult.executionStatus
|
|
31399
|
+
});
|
|
31400
|
+
}
|
|
31401
|
+
if (onResult) {
|
|
31402
|
+
await onResult(budgetResult);
|
|
31403
|
+
}
|
|
31404
|
+
return budgetResult;
|
|
31405
|
+
}
|
|
31274
31406
|
if (budgetUsd !== void 0 && budgetExhausted) {
|
|
31275
31407
|
const budgetResult = {
|
|
31276
31408
|
timestamp: (now2 ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
@@ -31384,22 +31516,17 @@ async function runEvaluation(options) {
|
|
|
31384
31516
|
...depResults && Object.keys(depResults).length > 0 ? { dependencyResults: depResults } : {}
|
|
31385
31517
|
};
|
|
31386
31518
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
31387
|
-
|
|
31388
|
-
|
|
31389
|
-
if (result.trials && result.trials.length > 0) {
|
|
31390
|
-
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
|
|
31391
|
-
if (trialCostSum > 0) {
|
|
31392
|
-
caseCost = trialCostSum;
|
|
31393
|
-
}
|
|
31394
|
-
} else {
|
|
31395
|
-
caseCost = result.costUsd;
|
|
31396
|
-
}
|
|
31397
|
-
if (caseCost !== void 0) {
|
|
31519
|
+
const caseCost = extractEvaluationCostUsd2(result);
|
|
31520
|
+
if (caseCost !== void 0) {
|
|
31521
|
+
if (budgetUsd !== void 0) {
|
|
31398
31522
|
cumulativeBudgetCost += caseCost;
|
|
31399
31523
|
if (cumulativeBudgetCost >= budgetUsd) {
|
|
31400
31524
|
budgetExhausted = true;
|
|
31401
31525
|
}
|
|
31402
31526
|
}
|
|
31527
|
+
if (runBudgetTracker) {
|
|
31528
|
+
runBudgetTracker.add(caseCost);
|
|
31529
|
+
}
|
|
31403
31530
|
}
|
|
31404
31531
|
if (failOnError === true && result.executionStatus === "execution_error") {
|
|
31405
31532
|
failOnErrorTriggered = true;
|
|
@@ -32207,6 +32334,7 @@ async function runEvalCase(options) {
|
|
|
32207
32334
|
fileChanges = fileChanges ? `${fileChanges}
|
|
32208
32335
|
${providerFileChanges}` : providerFileChanges;
|
|
32209
32336
|
}
|
|
32337
|
+
const toolCalls = formatToolCalls(output);
|
|
32210
32338
|
const providerError = extractProviderError(providerResponse);
|
|
32211
32339
|
const targetAfterEachHook = options.targetHooks?.after_each;
|
|
32212
32340
|
if (workspacePath && hasHookCommand(targetAfterEachHook)) {
|
|
@@ -32290,6 +32418,7 @@ ${providerFileChanges}` : providerFileChanges;
|
|
|
32290
32418
|
targetResolver,
|
|
32291
32419
|
availableTargets,
|
|
32292
32420
|
fileChanges,
|
|
32421
|
+
toolCalls,
|
|
32293
32422
|
workspacePath,
|
|
32294
32423
|
dockerConfig: evalCase.workspace?.docker,
|
|
32295
32424
|
verbose,
|
|
@@ -32487,6 +32616,7 @@ async function evaluateCandidate(options) {
|
|
|
32487
32616
|
targetResolver,
|
|
32488
32617
|
availableTargets,
|
|
32489
32618
|
fileChanges,
|
|
32619
|
+
toolCalls,
|
|
32490
32620
|
workspacePath,
|
|
32491
32621
|
dockerConfig,
|
|
32492
32622
|
threshold: evalThreshold,
|
|
@@ -32515,6 +32645,7 @@ async function evaluateCandidate(options) {
|
|
|
32515
32645
|
targetResolver,
|
|
32516
32646
|
availableTargets,
|
|
32517
32647
|
fileChanges,
|
|
32648
|
+
toolCalls,
|
|
32518
32649
|
workspacePath,
|
|
32519
32650
|
dockerConfig,
|
|
32520
32651
|
threshold: evalThreshold,
|
|
@@ -32592,6 +32723,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
32592
32723
|
targetResolver,
|
|
32593
32724
|
availableTargets,
|
|
32594
32725
|
fileChanges,
|
|
32726
|
+
toolCalls,
|
|
32595
32727
|
workspacePath,
|
|
32596
32728
|
dockerConfig,
|
|
32597
32729
|
threshold,
|
|
@@ -32621,6 +32753,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
32621
32753
|
targetResolver,
|
|
32622
32754
|
availableTargets,
|
|
32623
32755
|
fileChanges,
|
|
32756
|
+
toolCalls,
|
|
32624
32757
|
workspacePath,
|
|
32625
32758
|
dockerConfig,
|
|
32626
32759
|
threshold,
|
|
@@ -32652,6 +32785,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
32652
32785
|
targetResolver,
|
|
32653
32786
|
availableTargets,
|
|
32654
32787
|
fileChanges,
|
|
32788
|
+
toolCalls,
|
|
32655
32789
|
workspacePath,
|
|
32656
32790
|
dockerConfig,
|
|
32657
32791
|
dependencyResults,
|
|
@@ -32693,6 +32827,7 @@ async function runEvaluatorList(options) {
|
|
|
32693
32827
|
targetResolver,
|
|
32694
32828
|
availableTargets,
|
|
32695
32829
|
fileChanges,
|
|
32830
|
+
toolCalls,
|
|
32696
32831
|
workspacePath,
|
|
32697
32832
|
dockerConfig,
|
|
32698
32833
|
dependencyResults
|
|
@@ -32718,6 +32853,7 @@ async function runEvaluatorList(options) {
|
|
|
32718
32853
|
targetResolver,
|
|
32719
32854
|
availableTargets,
|
|
32720
32855
|
fileChanges,
|
|
32856
|
+
toolCalls,
|
|
32721
32857
|
workspacePath,
|
|
32722
32858
|
dockerConfig,
|
|
32723
32859
|
dependencyResults
|
|
@@ -33778,6 +33914,7 @@ export {
|
|
|
33778
33914
|
ExecutionMetricsGrader,
|
|
33779
33915
|
FieldAccuracyGrader,
|
|
33780
33916
|
LatencyGrader,
|
|
33917
|
+
formatToolCalls,
|
|
33781
33918
|
SkillTriggerGrader,
|
|
33782
33919
|
assembleLlmGraderPrompt,
|
|
33783
33920
|
TokenUsageGrader,
|
|
@@ -33864,4 +34001,4 @@ export {
|
|
|
33864
34001
|
loadTsEvalFile,
|
|
33865
34002
|
loadTsEvalSuite
|
|
33866
34003
|
};
|
|
33867
|
-
//# sourceMappingURL=chunk-PTYQS37Y.js.map
|
|
34004
|
+
//# sourceMappingURL=chunk-LP4Y5D2Z.js.map
|