@agentv/core 3.5.0 → 3.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-EFR4JHPL.js → chunk-2IZOTQ25.js} +1 -1
- package/dist/chunk-2IZOTQ25.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +77 -66
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +3 -4
- package/dist/index.d.ts +3 -4
- package/dist/index.js +78 -67
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-EFR4JHPL.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1786,12 +1786,10 @@ function computeTraceSummary(messages) {
|
|
|
1786
1786
|
}
|
|
1787
1787
|
}
|
|
1788
1788
|
}
|
|
1789
|
-
const toolNames = Object.keys(toolCallCounts).sort();
|
|
1790
1789
|
return {
|
|
1791
1790
|
trace: {
|
|
1792
1791
|
eventCount: totalToolCalls,
|
|
1793
|
-
|
|
1794
|
-
toolCallsByName: toolCallCounts,
|
|
1792
|
+
toolCalls: toolCallCounts,
|
|
1795
1793
|
errorCount: 0,
|
|
1796
1794
|
llmCallCount,
|
|
1797
1795
|
...hasAnyDuration ? { toolDurations } : {}
|
|
@@ -1815,7 +1813,7 @@ var DEFAULT_EXPLORATION_TOOLS = [
|
|
|
1815
1813
|
function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
|
|
1816
1814
|
if (summary.eventCount === 0) return void 0;
|
|
1817
1815
|
const explorationCalls = explorationTools.reduce(
|
|
1818
|
-
(sum, tool2) => sum + (summary.toolCallsByName[tool2] ?? 0),
|
|
1816
|
+
(sum, tool2) => sum + (summary.toolCalls[tool2] ?? 0),
|
|
1819
1817
|
0
|
|
1820
1818
|
);
|
|
1821
1819
|
return explorationCalls / summary.eventCount;
|
|
@@ -2445,14 +2443,8 @@ var import_promises5 = require("fs/promises");
|
|
|
2445
2443
|
|
|
2446
2444
|
// src/evaluation/template-variables.ts
|
|
2447
2445
|
var TEMPLATE_VARIABLES = {
|
|
2448
|
-
/** @deprecated Use OUTPUT_TEXT instead */
|
|
2449
|
-
ANSWER: "answer",
|
|
2450
2446
|
EXPECTED_OUTPUT: "expected_output",
|
|
2451
|
-
/** @deprecated Use INPUT_TEXT instead */
|
|
2452
|
-
QUESTION: "question",
|
|
2453
2447
|
CRITERIA: "criteria",
|
|
2454
|
-
/** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
|
|
2455
|
-
REFERENCE_ANSWER: "reference_answer",
|
|
2456
2448
|
INPUT: "input",
|
|
2457
2449
|
OUTPUT: "output",
|
|
2458
2450
|
FILE_CHANGES: "file_changes",
|
|
@@ -2462,9 +2454,8 @@ var TEMPLATE_VARIABLES = {
|
|
|
2462
2454
|
};
|
|
2463
2455
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
2464
2456
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
2465
|
-
TEMPLATE_VARIABLES.
|
|
2466
|
-
TEMPLATE_VARIABLES.EXPECTED_OUTPUT
|
|
2467
|
-
TEMPLATE_VARIABLES.OUTPUT_TEXT
|
|
2457
|
+
TEMPLATE_VARIABLES.OUTPUT_TEXT,
|
|
2458
|
+
TEMPLATE_VARIABLES.EXPECTED_OUTPUT
|
|
2468
2459
|
]);
|
|
2469
2460
|
|
|
2470
2461
|
// src/evaluation/validation/prompt-validator.ts
|
|
@@ -2487,13 +2478,13 @@ function validateTemplateVariables(content, source) {
|
|
|
2487
2478
|
}
|
|
2488
2479
|
match = variablePattern.exec(content);
|
|
2489
2480
|
}
|
|
2490
|
-
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.
|
|
2481
|
+
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
|
|
2491
2482
|
const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
|
|
2492
2483
|
const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
|
|
2493
2484
|
if (!hasRequiredFields) {
|
|
2494
2485
|
throw new Error(
|
|
2495
2486
|
`Missing required fields. Must include at least one of:
|
|
2496
|
-
- {{ ${TEMPLATE_VARIABLES.
|
|
2487
|
+
- {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
|
|
2497
2488
|
- {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
|
|
2498
2489
|
);
|
|
2499
2490
|
}
|
|
@@ -5795,6 +5786,8 @@ async function invokeModel(options) {
|
|
|
5795
5786
|
const { model, request, defaults, retryConfig, providerOptions } = options;
|
|
5796
5787
|
const chatPrompt = buildChatPrompt(request);
|
|
5797
5788
|
const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
|
|
5789
|
+
const startTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
5790
|
+
const startMs = Date.now();
|
|
5798
5791
|
const result = await withRetry(
|
|
5799
5792
|
() => (0, import_ai.generateText)({
|
|
5800
5793
|
model,
|
|
@@ -5808,9 +5801,11 @@ async function invokeModel(options) {
|
|
|
5808
5801
|
retryConfig,
|
|
5809
5802
|
request.signal
|
|
5810
5803
|
);
|
|
5811
|
-
|
|
5804
|
+
const endTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
5805
|
+
const durationMs = Date.now() - startMs;
|
|
5806
|
+
return mapResponse(result, { durationMs, startTime, endTime });
|
|
5812
5807
|
}
|
|
5813
|
-
function mapResponse(result) {
|
|
5808
|
+
function mapResponse(result, timing) {
|
|
5814
5809
|
const content = result.text ?? "";
|
|
5815
5810
|
const rawUsage = result.totalUsage ?? result.usage;
|
|
5816
5811
|
const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
|
|
@@ -5825,7 +5820,10 @@ function mapResponse(result) {
|
|
|
5825
5820
|
raw: result,
|
|
5826
5821
|
usage: toJsonObject(rawUsage),
|
|
5827
5822
|
output: [{ role: "assistant", content }],
|
|
5828
|
-
tokenUsage
|
|
5823
|
+
tokenUsage,
|
|
5824
|
+
durationMs: timing?.durationMs,
|
|
5825
|
+
startTime: timing?.startTime,
|
|
5826
|
+
endTime: timing?.endTime
|
|
5829
5827
|
};
|
|
5830
5828
|
}
|
|
5831
5829
|
function toJsonObject(value) {
|
|
@@ -6703,10 +6701,12 @@ var ClaudeSdkProvider = class {
|
|
|
6703
6701
|
if (usage) {
|
|
6704
6702
|
const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
|
|
6705
6703
|
const outputTokens = usage.output_tokens ?? 0;
|
|
6704
|
+
const reasoningTokens = usage.reasoning_tokens ?? void 0;
|
|
6706
6705
|
tokenUsage = {
|
|
6707
6706
|
input: inputTokens,
|
|
6708
6707
|
output: outputTokens,
|
|
6709
|
-
cached: usage.cache_read_input_tokens ?? void 0
|
|
6708
|
+
cached: usage.cache_read_input_tokens ?? void 0,
|
|
6709
|
+
reasoning: reasoningTokens
|
|
6710
6710
|
};
|
|
6711
6711
|
request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
|
|
6712
6712
|
}
|
|
@@ -7720,7 +7720,8 @@ ${basePrompt}` : basePrompt;
|
|
|
7720
7720
|
onUsage({
|
|
7721
7721
|
input: usage.input_tokens ?? 0,
|
|
7722
7722
|
output: usage.output_tokens ?? 0,
|
|
7723
|
-
cached: usage.cached_input_tokens ?? void 0
|
|
7723
|
+
cached: usage.cached_input_tokens ?? void 0,
|
|
7724
|
+
reasoning: usage.reasoning_tokens ?? void 0
|
|
7724
7725
|
});
|
|
7725
7726
|
}
|
|
7726
7727
|
}
|
|
@@ -9735,10 +9736,12 @@ function extractTokenUsage(events) {
|
|
|
9735
9736
|
output: output ?? 0
|
|
9736
9737
|
};
|
|
9737
9738
|
const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
|
|
9738
|
-
|
|
9739
|
-
|
|
9740
|
-
|
|
9741
|
-
|
|
9739
|
+
const reasoning = toFiniteNumber(u.reasoning_tokens ?? u.reasoningTokens ?? u.reasoning);
|
|
9740
|
+
return {
|
|
9741
|
+
...result,
|
|
9742
|
+
...cached !== void 0 ? { cached } : {},
|
|
9743
|
+
...reasoning !== void 0 ? { reasoning } : {}
|
|
9744
|
+
};
|
|
9742
9745
|
}
|
|
9743
9746
|
}
|
|
9744
9747
|
const messages = record.messages;
|
|
@@ -13265,11 +13268,9 @@ var CodeEvaluator = class {
|
|
|
13265
13268
|
}
|
|
13266
13269
|
}
|
|
13267
13270
|
const payload = {
|
|
13268
|
-
question: context2.evalCase.question,
|
|
13269
13271
|
criteria: context2.evalCase.criteria,
|
|
13270
13272
|
expectedOutput: context2.evalCase.expected_output,
|
|
13271
|
-
|
|
13272
|
-
answer: context2.candidate,
|
|
13273
|
+
outputText: context2.candidate,
|
|
13273
13274
|
output: outputForPayload,
|
|
13274
13275
|
outputPath,
|
|
13275
13276
|
guidelineFiles: context2.evalCase.guideline_paths,
|
|
@@ -13286,9 +13287,7 @@ var CodeEvaluator = class {
|
|
|
13286
13287
|
fileChanges: context2.fileChanges ?? null,
|
|
13287
13288
|
workspacePath: context2.workspacePath ?? null,
|
|
13288
13289
|
config: this.config ?? null,
|
|
13289
|
-
// Text convenience accessors (new names, always strings)
|
|
13290
13290
|
inputText: context2.evalCase.question,
|
|
13291
|
-
outputText: context2.candidate,
|
|
13292
13291
|
expectedOutputText: context2.evalCase.reference_answer ?? ""
|
|
13293
13292
|
};
|
|
13294
13293
|
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
@@ -13488,13 +13487,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
13488
13487
|
{{${TEMPLATE_VARIABLES.CRITERIA}}}
|
|
13489
13488
|
|
|
13490
13489
|
[[ ## question ## ]]
|
|
13491
|
-
{{${TEMPLATE_VARIABLES.
|
|
13490
|
+
{{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
|
|
13492
13491
|
|
|
13493
13492
|
[[ ## reference_answer ## ]]
|
|
13494
|
-
{{${TEMPLATE_VARIABLES.
|
|
13493
|
+
{{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
|
|
13495
13494
|
|
|
13496
13495
|
[[ ## answer ## ]]
|
|
13497
|
-
{{${TEMPLATE_VARIABLES.
|
|
13496
|
+
{{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
|
|
13498
13497
|
var freeformEvaluationSchema = import_zod4.z.object({
|
|
13499
13498
|
score: import_zod4.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
13500
13499
|
assertions: import_zod4.z.array(
|
|
@@ -13572,12 +13571,8 @@ var LlmGraderEvaluator = class {
|
|
|
13572
13571
|
2
|
|
13573
13572
|
),
|
|
13574
13573
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
|
|
13575
|
-
[TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
|
|
13576
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
13577
13574
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
13578
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
13579
13575
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
|
|
13580
|
-
// Text convenience accessors (new names, always strings)
|
|
13581
13576
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
13582
13577
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
13583
13578
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
@@ -13882,10 +13877,10 @@ ${context2.fileChanges}`;
|
|
|
13882
13877
|
buildAgentUserPrompt(context2) {
|
|
13883
13878
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
13884
13879
|
const variables = {
|
|
13885
|
-
[TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
|
|
13886
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
13887
13880
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
13888
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
13881
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
13882
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
13883
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
13889
13884
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
|
|
13890
13885
|
};
|
|
13891
13886
|
if (this.evaluatorTemplate) {
|
|
@@ -13938,10 +13933,10 @@ ${context2.fileChanges}`;
|
|
|
13938
13933
|
const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
|
|
13939
13934
|
if (this.evaluatorTemplate) {
|
|
13940
13935
|
const variables = {
|
|
13941
|
-
[TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
|
|
13942
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
13943
13936
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
13944
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
13937
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
13938
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
13939
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
13945
13940
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
|
|
13946
13941
|
};
|
|
13947
13942
|
const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
|
|
@@ -15369,7 +15364,10 @@ var COPILOT_MATCHER = {
|
|
|
15369
15364
|
skillTools: ["Skill", "skill"],
|
|
15370
15365
|
skillInputField: "skill",
|
|
15371
15366
|
readTools: ["Read File", "readFile", "Read", "readTextFile"],
|
|
15372
|
-
readInputField: "file_path"
|
|
15367
|
+
readInputField: "file_path",
|
|
15368
|
+
skillToolPrefixes: ["Using skill: "],
|
|
15369
|
+
readToolPrefixes: ["Viewing "],
|
|
15370
|
+
readInputFields: ["file_path", "path"]
|
|
15373
15371
|
};
|
|
15374
15372
|
var PROVIDER_TOOL_SEMANTICS = {
|
|
15375
15373
|
claude: CLAUDE_MATCHER,
|
|
@@ -15411,12 +15409,22 @@ var SkillTriggerEvaluator = class {
|
|
|
15411
15409
|
triggered = true;
|
|
15412
15410
|
evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
|
|
15413
15411
|
}
|
|
15412
|
+
} else if (matcher.skillToolPrefixes?.some(
|
|
15413
|
+
(prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
|
|
15414
|
+
)) {
|
|
15415
|
+
triggered = true;
|
|
15416
|
+
evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
|
|
15414
15417
|
} else if (matcher.readTools.includes(firstTool.tool)) {
|
|
15415
|
-
const filePath =
|
|
15418
|
+
const filePath = this.readPathFromInput(input, matcher);
|
|
15416
15419
|
if (filePath.includes(skillName)) {
|
|
15417
15420
|
triggered = true;
|
|
15418
15421
|
evidence = `Read tool loaded skill file: ${filePath}`;
|
|
15419
15422
|
}
|
|
15423
|
+
} else if (matcher.readToolPrefixes?.some(
|
|
15424
|
+
(prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
|
|
15425
|
+
)) {
|
|
15426
|
+
triggered = true;
|
|
15427
|
+
evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
|
|
15420
15428
|
}
|
|
15421
15429
|
}
|
|
15422
15430
|
const pass = triggered === shouldTrigger;
|
|
@@ -15445,6 +15453,16 @@ var SkillTriggerEvaluator = class {
|
|
|
15445
15453
|
expectedAspectCount: 1
|
|
15446
15454
|
};
|
|
15447
15455
|
}
|
|
15456
|
+
readPathFromInput(input, matcher) {
|
|
15457
|
+
const fields = matcher.readInputFields ?? [matcher.readInputField];
|
|
15458
|
+
for (const field of fields) {
|
|
15459
|
+
const value = input[field];
|
|
15460
|
+
if (value !== void 0 && value !== null) {
|
|
15461
|
+
return String(value);
|
|
15462
|
+
}
|
|
15463
|
+
}
|
|
15464
|
+
return "";
|
|
15465
|
+
}
|
|
15448
15466
|
};
|
|
15449
15467
|
|
|
15450
15468
|
// src/evaluation/evaluators/llm-grader-prompt.ts
|
|
@@ -15479,12 +15497,8 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
|
|
|
15479
15497
|
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
|
|
15480
15498
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
|
|
15481
15499
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
|
|
15482
|
-
[TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
|
|
15483
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
|
|
15484
15500
|
[TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
|
|
15485
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
15486
15501
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
|
|
15487
|
-
// Text convenience accessors (new names, always strings)
|
|
15488
15502
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
15489
15503
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
|
|
15490
15504
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
|
|
@@ -15816,11 +15830,9 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15816
15830
|
for (const call of toolCalls) {
|
|
15817
15831
|
toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
|
|
15818
15832
|
}
|
|
15819
|
-
const toolNames = Object.keys(toolCallsByName).sort();
|
|
15820
15833
|
return {
|
|
15821
15834
|
eventCount: toolCalls.length,
|
|
15822
|
-
|
|
15823
|
-
toolCallsByName,
|
|
15835
|
+
toolCalls: toolCallsByName,
|
|
15824
15836
|
errorCount: 0
|
|
15825
15837
|
};
|
|
15826
15838
|
}
|
|
@@ -15838,7 +15850,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15838
15850
|
const assertions = [];
|
|
15839
15851
|
for (const toolName of toolNames) {
|
|
15840
15852
|
const required = minimums[toolName];
|
|
15841
|
-
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
15853
|
+
const actual = summary.toolCalls[toolName] ?? 0;
|
|
15842
15854
|
if (actual >= required) {
|
|
15843
15855
|
assertions.push({
|
|
15844
15856
|
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
@@ -16541,11 +16553,9 @@ async function resolveCustomPrompt(promptConfig, context2, timeoutMs) {
|
|
|
16541
16553
|
}
|
|
16542
16554
|
async function executePromptTemplate(script, context2, config, timeoutMs) {
|
|
16543
16555
|
const payload = {
|
|
16544
|
-
question: context2.evalCase.question,
|
|
16545
16556
|
criteria: context2.evalCase.criteria,
|
|
16546
16557
|
expectedOutput: context2.evalCase.expected_output,
|
|
16547
|
-
|
|
16548
|
-
answer: context2.candidate,
|
|
16558
|
+
outputText: context2.candidate,
|
|
16549
16559
|
output: context2.output ?? null,
|
|
16550
16560
|
guidelineFiles: context2.evalCase.guideline_paths,
|
|
16551
16561
|
inputFiles: context2.evalCase.file_paths.filter(
|
|
@@ -16556,9 +16566,7 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
|
|
|
16556
16566
|
fileChanges: context2.fileChanges ?? null,
|
|
16557
16567
|
workspacePath: context2.workspacePath ?? null,
|
|
16558
16568
|
config: config ?? context2.config ?? null,
|
|
16559
|
-
// Text convenience accessors (new names, always strings)
|
|
16560
16569
|
inputText: context2.evalCase.question,
|
|
16561
|
-
outputText: context2.candidate,
|
|
16562
16570
|
expectedOutputText: context2.evalCase.reference_answer ?? ""
|
|
16563
16571
|
};
|
|
16564
16572
|
const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
@@ -18222,7 +18230,7 @@ async function runEvaluation(options) {
|
|
|
18222
18230
|
dataset: evalCase.dataset,
|
|
18223
18231
|
score: 0,
|
|
18224
18232
|
assertions: [],
|
|
18225
|
-
|
|
18233
|
+
outputText: "",
|
|
18226
18234
|
target: target.name,
|
|
18227
18235
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
18228
18236
|
budgetExceeded: true,
|
|
@@ -18258,7 +18266,7 @@ async function runEvaluation(options) {
|
|
|
18258
18266
|
dataset: evalCase.dataset,
|
|
18259
18267
|
score: 0,
|
|
18260
18268
|
assertions: [],
|
|
18261
|
-
|
|
18269
|
+
outputText: "",
|
|
18262
18270
|
target: target.name,
|
|
18263
18271
|
error: errorMsg,
|
|
18264
18272
|
executionStatus: "execution_error",
|
|
@@ -18523,7 +18531,7 @@ async function runBatchEvaluation(options) {
|
|
|
18523
18531
|
const providerResponse = batchResponse[i];
|
|
18524
18532
|
const output = providerResponse.output;
|
|
18525
18533
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
18526
|
-
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0,
|
|
18534
|
+
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolCalls: {}, errorCount: 0 } } : void 0;
|
|
18527
18535
|
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
18528
18536
|
tokenUsage: providerResponse.tokenUsage,
|
|
18529
18537
|
costUsd: providerResponse.costUsd,
|
|
@@ -18920,7 +18928,7 @@ async function runEvalCase(options) {
|
|
|
18920
18928
|
}
|
|
18921
18929
|
const output = providerResponse.output;
|
|
18922
18930
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
18923
|
-
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0,
|
|
18931
|
+
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolCalls: {}, errorCount: 0 } } : void 0;
|
|
18924
18932
|
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
18925
18933
|
tokenUsage: providerResponse.tokenUsage,
|
|
18926
18934
|
costUsd: providerResponse.costUsd,
|
|
@@ -19225,7 +19233,7 @@ async function evaluateCandidate(options) {
|
|
|
19225
19233
|
conversationId: evalCase.conversation_id,
|
|
19226
19234
|
score: score.score,
|
|
19227
19235
|
assertions: score.assertions,
|
|
19228
|
-
|
|
19236
|
+
outputText: candidate,
|
|
19229
19237
|
target: target.name,
|
|
19230
19238
|
tokenUsage,
|
|
19231
19239
|
costUsd,
|
|
@@ -19581,7 +19589,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
19581
19589
|
conversationId: evalCase.conversation_id,
|
|
19582
19590
|
score: 0,
|
|
19583
19591
|
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
19584
|
-
|
|
19592
|
+
outputText: `Error occurred: ${message}`,
|
|
19585
19593
|
target: targetName,
|
|
19586
19594
|
requests,
|
|
19587
19595
|
input,
|
|
@@ -20119,7 +20127,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
20119
20127
|
|
|
20120
20128
|
// src/evaluation/baseline.ts
|
|
20121
20129
|
var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
|
|
20122
|
-
"
|
|
20130
|
+
"outputText",
|
|
20123
20131
|
"requests",
|
|
20124
20132
|
"trace",
|
|
20125
20133
|
"workspacePath",
|
|
@@ -20293,14 +20301,17 @@ var OtelTraceExporter = class {
|
|
|
20293
20301
|
rootSpan.setAttribute("agentv.target", result.target);
|
|
20294
20302
|
if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
|
|
20295
20303
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
20296
|
-
if (captureContent) rootSpan.setAttribute("agentv.
|
|
20304
|
+
if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
|
|
20297
20305
|
if (result.durationMs != null)
|
|
20298
20306
|
rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
|
|
20299
20307
|
if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
|
|
20300
20308
|
if (result.trace) {
|
|
20301
20309
|
const t = result.trace;
|
|
20302
20310
|
rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
|
|
20303
|
-
rootSpan.setAttribute(
|
|
20311
|
+
rootSpan.setAttribute(
|
|
20312
|
+
"agentv.trace.tool_names",
|
|
20313
|
+
Object.keys(t.toolCalls).sort().join(",")
|
|
20314
|
+
);
|
|
20304
20315
|
if (t.llmCallCount != null)
|
|
20305
20316
|
rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
|
|
20306
20317
|
}
|