agentv 3.5.0 → 3.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -3
- package/dist/{chunk-RLL4QGNL.js → chunk-7YS6YNJZ.js} +5 -5
- package/dist/chunk-7YS6YNJZ.js.map +1 -0
- package/dist/{chunk-5GG6DDP5.js → chunk-TR6H437M.js} +14 -16
- package/dist/chunk-TR6H437M.js.map +1 -0
- package/dist/{chunk-D6G4N2H2.js → chunk-XGG64VIY.js} +80 -69
- package/dist/chunk-XGG64VIY.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-MZFXE6B5.js → dist-VP6AXX6B.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-J7SUWZH2.js → interactive-F6XECJ33.js} +3 -3
- package/dist/templates/.agentv/.env.example +9 -11
- package/dist/templates/.agentv/config.yaml +0 -5
- package/dist/templates/.agentv/targets.yaml +16 -0
- package/package.json +1 -1
- package/dist/chunk-5GG6DDP5.js.map +0 -1
- package/dist/chunk-D6G4N2H2.js.map +0 -1
- package/dist/chunk-RLL4QGNL.js.map +0 -1
- /package/dist/{dist-MZFXE6B5.js.map → dist-VP6AXX6B.js.map} +0 -0
- /package/dist/{interactive-J7SUWZH2.js.map → interactive-F6XECJ33.js.map} +0 -0
|
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
|
|
|
301
301
|
}
|
|
302
302
|
});
|
|
303
303
|
|
|
304
|
-
// ../../packages/core/dist/chunk-
|
|
304
|
+
// ../../packages/core/dist/chunk-2IZOTQ25.js
|
|
305
305
|
import { constants } from "node:fs";
|
|
306
306
|
import { access, readFile } from "node:fs/promises";
|
|
307
307
|
import path from "node:path";
|
|
@@ -419,7 +419,7 @@ __export(external_exports2, {
|
|
|
419
419
|
void: () => voidType
|
|
420
420
|
});
|
|
421
421
|
|
|
422
|
-
// ../../packages/core/dist/chunk-
|
|
422
|
+
// ../../packages/core/dist/chunk-2IZOTQ25.js
|
|
423
423
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
424
424
|
var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
|
|
425
425
|
var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
|
|
@@ -14036,12 +14036,10 @@ function computeTraceSummary(messages) {
|
|
|
14036
14036
|
}
|
|
14037
14037
|
}
|
|
14038
14038
|
}
|
|
14039
|
-
const toolNames = Object.keys(toolCallCounts).sort();
|
|
14040
14039
|
return {
|
|
14041
14040
|
trace: {
|
|
14042
14041
|
eventCount: totalToolCalls,
|
|
14043
|
-
|
|
14044
|
-
toolCallsByName: toolCallCounts,
|
|
14042
|
+
toolCalls: toolCallCounts,
|
|
14045
14043
|
errorCount: 0,
|
|
14046
14044
|
llmCallCount,
|
|
14047
14045
|
...hasAnyDuration ? { toolDurations } : {}
|
|
@@ -14065,7 +14063,7 @@ var DEFAULT_EXPLORATION_TOOLS = [
|
|
|
14065
14063
|
function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
|
|
14066
14064
|
if (summary.eventCount === 0) return void 0;
|
|
14067
14065
|
const explorationCalls = explorationTools.reduce(
|
|
14068
|
-
(sum, tool2) => sum + (summary.toolCallsByName[tool2] ?? 0),
|
|
14066
|
+
(sum, tool2) => sum + (summary.toolCalls[tool2] ?? 0),
|
|
14069
14067
|
0
|
|
14070
14068
|
);
|
|
14071
14069
|
return explorationCalls / summary.eventCount;
|
|
@@ -14655,14 +14653,8 @@ function logWarning(message) {
|
|
|
14655
14653
|
console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET3}`);
|
|
14656
14654
|
}
|
|
14657
14655
|
var TEMPLATE_VARIABLES = {
|
|
14658
|
-
/** @deprecated Use OUTPUT_TEXT instead */
|
|
14659
|
-
ANSWER: "answer",
|
|
14660
14656
|
EXPECTED_OUTPUT: "expected_output",
|
|
14661
|
-
/** @deprecated Use INPUT_TEXT instead */
|
|
14662
|
-
QUESTION: "question",
|
|
14663
14657
|
CRITERIA: "criteria",
|
|
14664
|
-
/** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
|
|
14665
|
-
REFERENCE_ANSWER: "reference_answer",
|
|
14666
14658
|
INPUT: "input",
|
|
14667
14659
|
OUTPUT: "output",
|
|
14668
14660
|
FILE_CHANGES: "file_changes",
|
|
@@ -14672,9 +14664,8 @@ var TEMPLATE_VARIABLES = {
|
|
|
14672
14664
|
};
|
|
14673
14665
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
14674
14666
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
14675
|
-
TEMPLATE_VARIABLES.
|
|
14676
|
-
TEMPLATE_VARIABLES.EXPECTED_OUTPUT
|
|
14677
|
-
TEMPLATE_VARIABLES.OUTPUT_TEXT
|
|
14667
|
+
TEMPLATE_VARIABLES.OUTPUT_TEXT,
|
|
14668
|
+
TEMPLATE_VARIABLES.EXPECTED_OUTPUT
|
|
14678
14669
|
]);
|
|
14679
14670
|
var ANSI_YELLOW3 = "\x1B[33m";
|
|
14680
14671
|
var ANSI_RESET4 = "\x1B[0m";
|
|
@@ -14695,13 +14686,13 @@ function validateTemplateVariables(content, source) {
|
|
|
14695
14686
|
}
|
|
14696
14687
|
match = variablePattern.exec(content);
|
|
14697
14688
|
}
|
|
14698
|
-
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.
|
|
14689
|
+
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
|
|
14699
14690
|
const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
|
|
14700
14691
|
const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
|
|
14701
14692
|
if (!hasRequiredFields) {
|
|
14702
14693
|
throw new Error(
|
|
14703
14694
|
`Missing required fields. Must include at least one of:
|
|
14704
|
-
- {{ ${TEMPLATE_VARIABLES.
|
|
14695
|
+
- {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
|
|
14705
14696
|
- {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
|
|
14706
14697
|
);
|
|
14707
14698
|
}
|
|
@@ -17843,6 +17834,8 @@ async function invokeModel(options) {
|
|
|
17843
17834
|
const { model, request, defaults, retryConfig, providerOptions } = options;
|
|
17844
17835
|
const chatPrompt = buildChatPrompt(request);
|
|
17845
17836
|
const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
|
|
17837
|
+
const startTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
17838
|
+
const startMs = Date.now();
|
|
17846
17839
|
const result = await withRetry(
|
|
17847
17840
|
() => generateText({
|
|
17848
17841
|
model,
|
|
@@ -17856,9 +17849,11 @@ async function invokeModel(options) {
|
|
|
17856
17849
|
retryConfig,
|
|
17857
17850
|
request.signal
|
|
17858
17851
|
);
|
|
17859
|
-
|
|
17852
|
+
const endTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
17853
|
+
const durationMs = Date.now() - startMs;
|
|
17854
|
+
return mapResponse(result, { durationMs, startTime, endTime });
|
|
17860
17855
|
}
|
|
17861
|
-
function mapResponse(result) {
|
|
17856
|
+
function mapResponse(result, timing) {
|
|
17862
17857
|
const content = result.text ?? "";
|
|
17863
17858
|
const rawUsage = result.totalUsage ?? result.usage;
|
|
17864
17859
|
const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
|
|
@@ -17873,7 +17868,10 @@ function mapResponse(result) {
|
|
|
17873
17868
|
raw: result,
|
|
17874
17869
|
usage: toJsonObject(rawUsage),
|
|
17875
17870
|
output: [{ role: "assistant", content }],
|
|
17876
|
-
tokenUsage
|
|
17871
|
+
tokenUsage,
|
|
17872
|
+
durationMs: timing?.durationMs,
|
|
17873
|
+
startTime: timing?.startTime,
|
|
17874
|
+
endTime: timing?.endTime
|
|
17877
17875
|
};
|
|
17878
17876
|
}
|
|
17879
17877
|
function toJsonObject(value) {
|
|
@@ -18731,10 +18729,12 @@ var ClaudeSdkProvider = class {
|
|
|
18731
18729
|
if (usage) {
|
|
18732
18730
|
const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
|
|
18733
18731
|
const outputTokens = usage.output_tokens ?? 0;
|
|
18732
|
+
const reasoningTokens = usage.reasoning_tokens ?? void 0;
|
|
18734
18733
|
tokenUsage = {
|
|
18735
18734
|
input: inputTokens,
|
|
18736
18735
|
output: outputTokens,
|
|
18737
|
-
cached: usage.cache_read_input_tokens ?? void 0
|
|
18736
|
+
cached: usage.cache_read_input_tokens ?? void 0,
|
|
18737
|
+
reasoning: reasoningTokens
|
|
18738
18738
|
};
|
|
18739
18739
|
request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
|
|
18740
18740
|
}
|
|
@@ -19730,7 +19730,8 @@ ${basePrompt}` : basePrompt;
|
|
|
19730
19730
|
onUsage({
|
|
19731
19731
|
input: usage.input_tokens ?? 0,
|
|
19732
19732
|
output: usage.output_tokens ?? 0,
|
|
19733
|
-
cached: usage.cached_input_tokens ?? void 0
|
|
19733
|
+
cached: usage.cached_input_tokens ?? void 0,
|
|
19734
|
+
reasoning: usage.reasoning_tokens ?? void 0
|
|
19734
19735
|
});
|
|
19735
19736
|
}
|
|
19736
19737
|
}
|
|
@@ -21698,10 +21699,12 @@ function extractTokenUsage(events) {
|
|
|
21698
21699
|
output: output ?? 0
|
|
21699
21700
|
};
|
|
21700
21701
|
const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
|
|
21701
|
-
|
|
21702
|
-
|
|
21703
|
-
|
|
21704
|
-
|
|
21702
|
+
const reasoning = toFiniteNumber(u.reasoning_tokens ?? u.reasoningTokens ?? u.reasoning);
|
|
21703
|
+
return {
|
|
21704
|
+
...result,
|
|
21705
|
+
...cached !== void 0 ? { cached } : {},
|
|
21706
|
+
...reasoning !== void 0 ? { reasoning } : {}
|
|
21707
|
+
};
|
|
21705
21708
|
}
|
|
21706
21709
|
}
|
|
21707
21710
|
const messages = record.messages;
|
|
@@ -23927,11 +23930,9 @@ var CodeEvaluator = class {
|
|
|
23927
23930
|
}
|
|
23928
23931
|
}
|
|
23929
23932
|
const payload = {
|
|
23930
|
-
question: context2.evalCase.question,
|
|
23931
23933
|
criteria: context2.evalCase.criteria,
|
|
23932
23934
|
expectedOutput: context2.evalCase.expected_output,
|
|
23933
|
-
|
|
23934
|
-
answer: context2.candidate,
|
|
23935
|
+
outputText: context2.candidate,
|
|
23935
23936
|
output: outputForPayload,
|
|
23936
23937
|
outputPath,
|
|
23937
23938
|
guidelineFiles: context2.evalCase.guideline_paths,
|
|
@@ -23948,9 +23949,7 @@ var CodeEvaluator = class {
|
|
|
23948
23949
|
fileChanges: context2.fileChanges ?? null,
|
|
23949
23950
|
workspacePath: context2.workspacePath ?? null,
|
|
23950
23951
|
config: this.config ?? null,
|
|
23951
|
-
// Text convenience accessors (new names, always strings)
|
|
23952
23952
|
inputText: context2.evalCase.question,
|
|
23953
|
-
outputText: context2.candidate,
|
|
23954
23953
|
expectedOutputText: context2.evalCase.reference_answer ?? ""
|
|
23955
23954
|
};
|
|
23956
23955
|
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
@@ -24109,13 +24108,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
24109
24108
|
{{${TEMPLATE_VARIABLES.CRITERIA}}}
|
|
24110
24109
|
|
|
24111
24110
|
[[ ## question ## ]]
|
|
24112
|
-
{{${TEMPLATE_VARIABLES.
|
|
24111
|
+
{{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
|
|
24113
24112
|
|
|
24114
24113
|
[[ ## reference_answer ## ]]
|
|
24115
|
-
{{${TEMPLATE_VARIABLES.
|
|
24114
|
+
{{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
|
|
24116
24115
|
|
|
24117
24116
|
[[ ## answer ## ]]
|
|
24118
|
-
{{${TEMPLATE_VARIABLES.
|
|
24117
|
+
{{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
|
|
24119
24118
|
var freeformEvaluationSchema = external_exports2.object({
|
|
24120
24119
|
score: external_exports2.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
24121
24120
|
assertions: external_exports2.array(
|
|
@@ -24193,12 +24192,8 @@ var LlmGraderEvaluator = class {
|
|
|
24193
24192
|
2
|
|
24194
24193
|
),
|
|
24195
24194
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
|
|
24196
|
-
[TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
|
|
24197
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
24198
24195
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
24199
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
24200
24196
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
|
|
24201
|
-
// Text convenience accessors (new names, always strings)
|
|
24202
24197
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
24203
24198
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
24204
24199
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
@@ -24503,10 +24498,10 @@ ${context2.fileChanges}`;
|
|
|
24503
24498
|
buildAgentUserPrompt(context2) {
|
|
24504
24499
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
24505
24500
|
const variables = {
|
|
24506
|
-
[TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
|
|
24507
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
24508
24501
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
24509
|
-
[TEMPLATE_VARIABLES.
|
|
24502
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
24503
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
24504
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
24510
24505
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
|
|
24511
24506
|
};
|
|
24512
24507
|
if (this.evaluatorTemplate) {
|
|
@@ -24559,10 +24554,10 @@ ${context2.fileChanges}`;
|
|
|
24559
24554
|
const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
|
|
24560
24555
|
if (this.evaluatorTemplate) {
|
|
24561
24556
|
const variables = {
|
|
24562
|
-
[TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
|
|
24563
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
24564
24557
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
24565
|
-
[TEMPLATE_VARIABLES.
|
|
24558
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
24559
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
24560
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
24566
24561
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
|
|
24567
24562
|
};
|
|
24568
24563
|
const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
|
|
@@ -25978,7 +25973,10 @@ var COPILOT_MATCHER = {
|
|
|
25978
25973
|
skillTools: ["Skill", "skill"],
|
|
25979
25974
|
skillInputField: "skill",
|
|
25980
25975
|
readTools: ["Read File", "readFile", "Read", "readTextFile"],
|
|
25981
|
-
readInputField: "file_path"
|
|
25976
|
+
readInputField: "file_path",
|
|
25977
|
+
skillToolPrefixes: ["Using skill: "],
|
|
25978
|
+
readToolPrefixes: ["Viewing "],
|
|
25979
|
+
readInputFields: ["file_path", "path"]
|
|
25982
25980
|
};
|
|
25983
25981
|
var PROVIDER_TOOL_SEMANTICS = {
|
|
25984
25982
|
claude: CLAUDE_MATCHER,
|
|
@@ -26020,12 +26018,22 @@ var SkillTriggerEvaluator = class {
|
|
|
26020
26018
|
triggered = true;
|
|
26021
26019
|
evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
|
|
26022
26020
|
}
|
|
26021
|
+
} else if (matcher.skillToolPrefixes?.some(
|
|
26022
|
+
(prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
|
|
26023
|
+
)) {
|
|
26024
|
+
triggered = true;
|
|
26025
|
+
evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
|
|
26023
26026
|
} else if (matcher.readTools.includes(firstTool.tool)) {
|
|
26024
|
-
const filePath =
|
|
26027
|
+
const filePath = this.readPathFromInput(input, matcher);
|
|
26025
26028
|
if (filePath.includes(skillName)) {
|
|
26026
26029
|
triggered = true;
|
|
26027
26030
|
evidence = `Read tool loaded skill file: ${filePath}`;
|
|
26028
26031
|
}
|
|
26032
|
+
} else if (matcher.readToolPrefixes?.some(
|
|
26033
|
+
(prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
|
|
26034
|
+
)) {
|
|
26035
|
+
triggered = true;
|
|
26036
|
+
evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
|
|
26029
26037
|
}
|
|
26030
26038
|
}
|
|
26031
26039
|
const pass = triggered === shouldTrigger;
|
|
@@ -26054,6 +26062,16 @@ var SkillTriggerEvaluator = class {
|
|
|
26054
26062
|
expectedAspectCount: 1
|
|
26055
26063
|
};
|
|
26056
26064
|
}
|
|
26065
|
+
readPathFromInput(input, matcher) {
|
|
26066
|
+
const fields = matcher.readInputFields ?? [matcher.readInputField];
|
|
26067
|
+
for (const field of fields) {
|
|
26068
|
+
const value = input[field];
|
|
26069
|
+
if (value !== void 0 && value !== null) {
|
|
26070
|
+
return String(value);
|
|
26071
|
+
}
|
|
26072
|
+
}
|
|
26073
|
+
return "";
|
|
26074
|
+
}
|
|
26057
26075
|
};
|
|
26058
26076
|
function assembleLlmGraderPrompt(input) {
|
|
26059
26077
|
const {
|
|
@@ -26086,12 +26104,8 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
|
|
|
26086
26104
|
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
|
|
26087
26105
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
|
|
26088
26106
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
|
|
26089
|
-
[TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
|
|
26090
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
|
|
26091
26107
|
[TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
|
|
26092
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
26093
26108
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
|
|
26094
|
-
// Text convenience accessors (new names, always strings)
|
|
26095
26109
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
26096
26110
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
|
|
26097
26111
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
|
|
@@ -26419,11 +26433,9 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26419
26433
|
for (const call of toolCalls) {
|
|
26420
26434
|
toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
|
|
26421
26435
|
}
|
|
26422
|
-
const toolNames = Object.keys(toolCallsByName).sort();
|
|
26423
26436
|
return {
|
|
26424
26437
|
eventCount: toolCalls.length,
|
|
26425
|
-
|
|
26426
|
-
toolCallsByName,
|
|
26438
|
+
toolCalls: toolCallsByName,
|
|
26427
26439
|
errorCount: 0
|
|
26428
26440
|
};
|
|
26429
26441
|
}
|
|
@@ -26441,7 +26453,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26441
26453
|
const assertions = [];
|
|
26442
26454
|
for (const toolName of toolNames) {
|
|
26443
26455
|
const required = minimums[toolName];
|
|
26444
|
-
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
26456
|
+
const actual = summary.toolCalls[toolName] ?? 0;
|
|
26445
26457
|
if (actual >= required) {
|
|
26446
26458
|
assertions.push({
|
|
26447
26459
|
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
@@ -27125,11 +27137,9 @@ async function resolveCustomPrompt(promptConfig, context2, timeoutMs) {
|
|
|
27125
27137
|
}
|
|
27126
27138
|
async function executePromptTemplate(script, context2, config, timeoutMs) {
|
|
27127
27139
|
const payload = {
|
|
27128
|
-
question: context2.evalCase.question,
|
|
27129
27140
|
criteria: context2.evalCase.criteria,
|
|
27130
27141
|
expectedOutput: context2.evalCase.expected_output,
|
|
27131
|
-
|
|
27132
|
-
answer: context2.candidate,
|
|
27142
|
+
outputText: context2.candidate,
|
|
27133
27143
|
output: context2.output ?? null,
|
|
27134
27144
|
guidelineFiles: context2.evalCase.guideline_paths,
|
|
27135
27145
|
inputFiles: context2.evalCase.file_paths.filter(
|
|
@@ -27140,9 +27150,7 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
|
|
|
27140
27150
|
fileChanges: context2.fileChanges ?? null,
|
|
27141
27151
|
workspacePath: context2.workspacePath ?? null,
|
|
27142
27152
|
config: config ?? context2.config ?? null,
|
|
27143
|
-
// Text convenience accessors (new names, always strings)
|
|
27144
27153
|
inputText: context2.evalCase.question,
|
|
27145
|
-
outputText: context2.candidate,
|
|
27146
27154
|
expectedOutputText: context2.evalCase.reference_answer ?? ""
|
|
27147
27155
|
};
|
|
27148
27156
|
const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
@@ -28762,7 +28770,7 @@ async function runEvaluation(options) {
|
|
|
28762
28770
|
dataset: evalCase.dataset,
|
|
28763
28771
|
score: 0,
|
|
28764
28772
|
assertions: [],
|
|
28765
|
-
|
|
28773
|
+
outputText: "",
|
|
28766
28774
|
target: target.name,
|
|
28767
28775
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
28768
28776
|
budgetExceeded: true,
|
|
@@ -28798,7 +28806,7 @@ async function runEvaluation(options) {
|
|
|
28798
28806
|
dataset: evalCase.dataset,
|
|
28799
28807
|
score: 0,
|
|
28800
28808
|
assertions: [],
|
|
28801
|
-
|
|
28809
|
+
outputText: "",
|
|
28802
28810
|
target: target.name,
|
|
28803
28811
|
error: errorMsg,
|
|
28804
28812
|
executionStatus: "execution_error",
|
|
@@ -29063,7 +29071,7 @@ async function runBatchEvaluation(options) {
|
|
|
29063
29071
|
const providerResponse = batchResponse[i];
|
|
29064
29072
|
const output = providerResponse.output;
|
|
29065
29073
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
29066
|
-
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0,
|
|
29074
|
+
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolCalls: {}, errorCount: 0 } } : void 0;
|
|
29067
29075
|
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
29068
29076
|
tokenUsage: providerResponse.tokenUsage,
|
|
29069
29077
|
costUsd: providerResponse.costUsd,
|
|
@@ -29460,7 +29468,7 @@ async function runEvalCase(options) {
|
|
|
29460
29468
|
}
|
|
29461
29469
|
const output = providerResponse.output;
|
|
29462
29470
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
29463
|
-
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0,
|
|
29471
|
+
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolCalls: {}, errorCount: 0 } } : void 0;
|
|
29464
29472
|
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
29465
29473
|
tokenUsage: providerResponse.tokenUsage,
|
|
29466
29474
|
costUsd: providerResponse.costUsd,
|
|
@@ -29765,7 +29773,7 @@ async function evaluateCandidate(options) {
|
|
|
29765
29773
|
conversationId: evalCase.conversation_id,
|
|
29766
29774
|
score: score.score,
|
|
29767
29775
|
assertions: score.assertions,
|
|
29768
|
-
|
|
29776
|
+
outputText: candidate,
|
|
29769
29777
|
target: target.name,
|
|
29770
29778
|
tokenUsage,
|
|
29771
29779
|
costUsd,
|
|
@@ -30121,7 +30129,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
30121
30129
|
conversationId: evalCase.conversation_id,
|
|
30122
30130
|
score: 0,
|
|
30123
30131
|
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
30124
|
-
|
|
30132
|
+
outputText: `Error occurred: ${message}`,
|
|
30125
30133
|
target: targetName,
|
|
30126
30134
|
requests,
|
|
30127
30135
|
input,
|
|
@@ -30638,7 +30646,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
30638
30646
|
return false;
|
|
30639
30647
|
}
|
|
30640
30648
|
var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
|
|
30641
|
-
"
|
|
30649
|
+
"outputText",
|
|
30642
30650
|
"requests",
|
|
30643
30651
|
"trace",
|
|
30644
30652
|
"workspacePath",
|
|
@@ -30810,14 +30818,17 @@ var OtelTraceExporter = class {
|
|
|
30810
30818
|
rootSpan.setAttribute("agentv.target", result.target);
|
|
30811
30819
|
if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
|
|
30812
30820
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
30813
|
-
if (captureContent) rootSpan.setAttribute("agentv.
|
|
30821
|
+
if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
|
|
30814
30822
|
if (result.durationMs != null)
|
|
30815
30823
|
rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
|
|
30816
30824
|
if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
|
|
30817
30825
|
if (result.trace) {
|
|
30818
30826
|
const t = result.trace;
|
|
30819
30827
|
rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
|
|
30820
|
-
rootSpan.setAttribute(
|
|
30828
|
+
rootSpan.setAttribute(
|
|
30829
|
+
"agentv.trace.tool_names",
|
|
30830
|
+
Object.keys(t.toolCalls).sort().join(",")
|
|
30831
|
+
);
|
|
30821
30832
|
if (t.llmCallCount != null)
|
|
30822
30833
|
rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
|
|
30823
30834
|
}
|
|
@@ -31237,4 +31248,4 @@ export {
|
|
|
31237
31248
|
OtelStreamingObserver,
|
|
31238
31249
|
createAgentKernel
|
|
31239
31250
|
};
|
|
31240
|
-
//# sourceMappingURL=chunk-D6G4N2H2.js.map
|
|
31251
|
+
//# sourceMappingURL=chunk-XGG64VIY.js.map
|