@agentv/core 3.5.0 → 3.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-EFR4JHPL.js → chunk-2IZOTQ25.js} +1 -1
- package/dist/chunk-2IZOTQ25.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +77 -66
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +3 -4
- package/dist/index.d.ts +3 -4
- package/dist/index.js +78 -67
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-EFR4JHPL.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -251,10 +251,8 @@ interface TokenUsage {
|
|
|
251
251
|
interface TraceSummary {
|
|
252
252
|
/** Total number of events in trace */
|
|
253
253
|
readonly eventCount: number;
|
|
254
|
-
/** Unique tool names, sorted alphabetically */
|
|
255
|
-
readonly toolNames: readonly string[];
|
|
256
254
|
/** Map of tool name to call count */
|
|
257
|
-
readonly toolCallsByName: Readonly<Record<string, number>>;
|
|
255
|
+
readonly toolCalls: Readonly<Record<string, number>>;
|
|
258
256
|
/** Number of error events */
|
|
259
257
|
readonly errorCount: number;
|
|
260
258
|
/** Per-tool duration arrays in milliseconds (optional) */
|
|
@@ -1134,7 +1132,7 @@ interface EvaluationResult {
|
|
|
1134
1132
|
readonly conversationId?: string;
|
|
1135
1133
|
readonly score: number;
|
|
1136
1134
|
readonly assertions: readonly AssertionEntry[];
|
|
1137
|
-
readonly
|
|
1135
|
+
readonly outputText: string;
|
|
1138
1136
|
readonly target: string;
|
|
1139
1137
|
/** Token usage metrics from provider (optional) */
|
|
1140
1138
|
readonly tokenUsage?: TokenUsage;
|
|
@@ -2412,6 +2410,7 @@ declare class SkillTriggerEvaluator implements Evaluator {
|
|
|
2412
2410
|
constructor(config: SkillTriggerEvaluatorConfig);
|
|
2413
2411
|
private resolveMatcher;
|
|
2414
2412
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2413
|
+
private readPathFromInput;
|
|
2415
2414
|
}
|
|
2416
2415
|
|
|
2417
2416
|
interface LlmGraderPromptAssembly {
|
package/dist/index.d.ts
CHANGED
|
@@ -251,10 +251,8 @@ interface TokenUsage {
|
|
|
251
251
|
interface TraceSummary {
|
|
252
252
|
/** Total number of events in trace */
|
|
253
253
|
readonly eventCount: number;
|
|
254
|
-
/** Unique tool names, sorted alphabetically */
|
|
255
|
-
readonly toolNames: readonly string[];
|
|
256
254
|
/** Map of tool name to call count */
|
|
257
|
-
readonly toolCallsByName: Readonly<Record<string, number>>;
|
|
255
|
+
readonly toolCalls: Readonly<Record<string, number>>;
|
|
258
256
|
/** Number of error events */
|
|
259
257
|
readonly errorCount: number;
|
|
260
258
|
/** Per-tool duration arrays in milliseconds (optional) */
|
|
@@ -1134,7 +1132,7 @@ interface EvaluationResult {
|
|
|
1134
1132
|
readonly conversationId?: string;
|
|
1135
1133
|
readonly score: number;
|
|
1136
1134
|
readonly assertions: readonly AssertionEntry[];
|
|
1137
|
-
readonly
|
|
1135
|
+
readonly outputText: string;
|
|
1138
1136
|
readonly target: string;
|
|
1139
1137
|
/** Token usage metrics from provider (optional) */
|
|
1140
1138
|
readonly tokenUsage?: TokenUsage;
|
|
@@ -2412,6 +2410,7 @@ declare class SkillTriggerEvaluator implements Evaluator {
|
|
|
2412
2410
|
constructor(config: SkillTriggerEvaluatorConfig);
|
|
2413
2411
|
private resolveMatcher;
|
|
2414
2412
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2413
|
+
private readPathFromInput;
|
|
2415
2414
|
}
|
|
2416
2415
|
|
|
2417
2416
|
interface LlmGraderPromptAssembly {
|
package/dist/index.js
CHANGED
|
@@ -16,7 +16,7 @@ import {
|
|
|
16
16
|
readTextFile,
|
|
17
17
|
resolveFileReference,
|
|
18
18
|
resolveTargetDefinition
|
|
19
|
-
} from "./chunk-EFR4JHPL.js";
|
|
19
|
+
} from "./chunk-2IZOTQ25.js";
|
|
20
20
|
import {
|
|
21
21
|
AgentvProvider
|
|
22
22
|
} from "./chunk-W5YDZWT4.js";
|
|
@@ -83,12 +83,10 @@ function computeTraceSummary(messages) {
|
|
|
83
83
|
}
|
|
84
84
|
}
|
|
85
85
|
}
|
|
86
|
-
const toolNames = Object.keys(toolCallCounts).sort();
|
|
87
86
|
return {
|
|
88
87
|
trace: {
|
|
89
88
|
eventCount: totalToolCalls,
|
|
90
|
-
|
|
91
|
-
toolCallsByName: toolCallCounts,
|
|
89
|
+
toolCalls: toolCallCounts,
|
|
92
90
|
errorCount: 0,
|
|
93
91
|
llmCallCount,
|
|
94
92
|
...hasAnyDuration ? { toolDurations } : {}
|
|
@@ -112,7 +110,7 @@ var DEFAULT_EXPLORATION_TOOLS = [
|
|
|
112
110
|
function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
|
|
113
111
|
if (summary.eventCount === 0) return void 0;
|
|
114
112
|
const explorationCalls = explorationTools.reduce(
|
|
115
|
-
(sum, tool2) => sum + (summary.toolCallsByName[tool2] ?? 0),
|
|
113
|
+
(sum, tool2) => sum + (summary.toolCalls[tool2] ?? 0),
|
|
116
114
|
0
|
|
117
115
|
);
|
|
118
116
|
return explorationCalls / summary.eventCount;
|
|
@@ -742,14 +740,8 @@ import { readFile as readFile4 } from "node:fs/promises";
|
|
|
742
740
|
|
|
743
741
|
// src/evaluation/template-variables.ts
|
|
744
742
|
var TEMPLATE_VARIABLES = {
|
|
745
|
-
/** @deprecated Use OUTPUT_TEXT instead */
|
|
746
|
-
ANSWER: "answer",
|
|
747
743
|
EXPECTED_OUTPUT: "expected_output",
|
|
748
|
-
/** @deprecated Use INPUT_TEXT instead */
|
|
749
|
-
QUESTION: "question",
|
|
750
744
|
CRITERIA: "criteria",
|
|
751
|
-
/** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
|
|
752
|
-
REFERENCE_ANSWER: "reference_answer",
|
|
753
745
|
INPUT: "input",
|
|
754
746
|
OUTPUT: "output",
|
|
755
747
|
FILE_CHANGES: "file_changes",
|
|
@@ -759,9 +751,8 @@ var TEMPLATE_VARIABLES = {
|
|
|
759
751
|
};
|
|
760
752
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
761
753
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
762
|
-
TEMPLATE_VARIABLES.
|
|
763
|
-
TEMPLATE_VARIABLES.EXPECTED_OUTPUT
|
|
764
|
-
TEMPLATE_VARIABLES.OUTPUT_TEXT
|
|
754
|
+
TEMPLATE_VARIABLES.OUTPUT_TEXT,
|
|
755
|
+
TEMPLATE_VARIABLES.EXPECTED_OUTPUT
|
|
765
756
|
]);
|
|
766
757
|
|
|
767
758
|
// src/evaluation/validation/prompt-validator.ts
|
|
@@ -784,13 +775,13 @@ function validateTemplateVariables(content, source) {
|
|
|
784
775
|
}
|
|
785
776
|
match = variablePattern.exec(content);
|
|
786
777
|
}
|
|
787
|
-
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.
|
|
778
|
+
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
|
|
788
779
|
const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
|
|
789
780
|
const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
|
|
790
781
|
if (!hasRequiredFields) {
|
|
791
782
|
throw new Error(
|
|
792
783
|
`Missing required fields. Must include at least one of:
|
|
793
|
-
- {{ ${TEMPLATE_VARIABLES.
|
|
784
|
+
- {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
|
|
794
785
|
- {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
|
|
795
786
|
);
|
|
796
787
|
}
|
|
@@ -3974,6 +3965,8 @@ async function invokeModel(options) {
|
|
|
3974
3965
|
const { model, request, defaults, retryConfig, providerOptions } = options;
|
|
3975
3966
|
const chatPrompt = buildChatPrompt(request);
|
|
3976
3967
|
const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
|
|
3968
|
+
const startTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
3969
|
+
const startMs = Date.now();
|
|
3977
3970
|
const result = await withRetry(
|
|
3978
3971
|
() => generateText({
|
|
3979
3972
|
model,
|
|
@@ -3987,9 +3980,11 @@ async function invokeModel(options) {
|
|
|
3987
3980
|
retryConfig,
|
|
3988
3981
|
request.signal
|
|
3989
3982
|
);
|
|
3990
|
-
|
|
3983
|
+
const endTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
3984
|
+
const durationMs = Date.now() - startMs;
|
|
3985
|
+
return mapResponse(result, { durationMs, startTime, endTime });
|
|
3991
3986
|
}
|
|
3992
|
-
function mapResponse(result) {
|
|
3987
|
+
function mapResponse(result, timing) {
|
|
3993
3988
|
const content = result.text ?? "";
|
|
3994
3989
|
const rawUsage = result.totalUsage ?? result.usage;
|
|
3995
3990
|
const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
|
|
@@ -4004,7 +3999,10 @@ function mapResponse(result) {
|
|
|
4004
3999
|
raw: result,
|
|
4005
4000
|
usage: toJsonObject(rawUsage),
|
|
4006
4001
|
output: [{ role: "assistant", content }],
|
|
4007
|
-
tokenUsage
|
|
4002
|
+
tokenUsage,
|
|
4003
|
+
durationMs: timing?.durationMs,
|
|
4004
|
+
startTime: timing?.startTime,
|
|
4005
|
+
endTime: timing?.endTime
|
|
4008
4006
|
};
|
|
4009
4007
|
}
|
|
4010
4008
|
function toJsonObject(value) {
|
|
@@ -4882,10 +4880,12 @@ var ClaudeSdkProvider = class {
|
|
|
4882
4880
|
if (usage) {
|
|
4883
4881
|
const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
|
|
4884
4882
|
const outputTokens = usage.output_tokens ?? 0;
|
|
4883
|
+
const reasoningTokens = usage.reasoning_tokens ?? void 0;
|
|
4885
4884
|
tokenUsage = {
|
|
4886
4885
|
input: inputTokens,
|
|
4887
4886
|
output: outputTokens,
|
|
4888
|
-
cached: usage.cache_read_input_tokens ?? void 0
|
|
4887
|
+
cached: usage.cache_read_input_tokens ?? void 0,
|
|
4888
|
+
reasoning: reasoningTokens
|
|
4889
4889
|
};
|
|
4890
4890
|
request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
|
|
4891
4891
|
}
|
|
@@ -5899,7 +5899,8 @@ ${basePrompt}` : basePrompt;
|
|
|
5899
5899
|
onUsage({
|
|
5900
5900
|
input: usage.input_tokens ?? 0,
|
|
5901
5901
|
output: usage.output_tokens ?? 0,
|
|
5902
|
-
cached: usage.cached_input_tokens ?? void 0
|
|
5902
|
+
cached: usage.cached_input_tokens ?? void 0,
|
|
5903
|
+
reasoning: usage.reasoning_tokens ?? void 0
|
|
5903
5904
|
});
|
|
5904
5905
|
}
|
|
5905
5906
|
}
|
|
@@ -7913,10 +7914,12 @@ function extractTokenUsage(events) {
|
|
|
7913
7914
|
output: output ?? 0
|
|
7914
7915
|
};
|
|
7915
7916
|
const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
|
|
7916
|
-
|
|
7917
|
-
|
|
7918
|
-
|
|
7919
|
-
|
|
7917
|
+
const reasoning = toFiniteNumber(u.reasoning_tokens ?? u.reasoningTokens ?? u.reasoning);
|
|
7918
|
+
return {
|
|
7919
|
+
...result,
|
|
7920
|
+
...cached !== void 0 ? { cached } : {},
|
|
7921
|
+
...reasoning !== void 0 ? { reasoning } : {}
|
|
7922
|
+
};
|
|
7920
7923
|
}
|
|
7921
7924
|
}
|
|
7922
7925
|
const messages = record.messages;
|
|
@@ -10245,11 +10248,9 @@ var CodeEvaluator = class {
|
|
|
10245
10248
|
}
|
|
10246
10249
|
}
|
|
10247
10250
|
const payload = {
|
|
10248
|
-
question: context.evalCase.question,
|
|
10249
10251
|
criteria: context.evalCase.criteria,
|
|
10250
10252
|
expectedOutput: context.evalCase.expected_output,
|
|
10251
|
-
|
|
10252
|
-
answer: context.candidate,
|
|
10253
|
+
outputText: context.candidate,
|
|
10253
10254
|
output: outputForPayload,
|
|
10254
10255
|
outputPath,
|
|
10255
10256
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
@@ -10266,9 +10267,7 @@ var CodeEvaluator = class {
|
|
|
10266
10267
|
fileChanges: context.fileChanges ?? null,
|
|
10267
10268
|
workspacePath: context.workspacePath ?? null,
|
|
10268
10269
|
config: this.config ?? null,
|
|
10269
|
-
// Text convenience accessors (new names, always strings)
|
|
10270
10270
|
inputText: context.evalCase.question,
|
|
10271
|
-
outputText: context.candidate,
|
|
10272
10271
|
expectedOutputText: context.evalCase.reference_answer ?? ""
|
|
10273
10272
|
};
|
|
10274
10273
|
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
@@ -10436,13 +10435,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
10436
10435
|
{{${TEMPLATE_VARIABLES.CRITERIA}}}
|
|
10437
10436
|
|
|
10438
10437
|
[[ ## question ## ]]
|
|
10439
|
-
{{${TEMPLATE_VARIABLES.
|
|
10438
|
+
{{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
|
|
10440
10439
|
|
|
10441
10440
|
[[ ## reference_answer ## ]]
|
|
10442
|
-
{{${TEMPLATE_VARIABLES.
|
|
10441
|
+
{{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
|
|
10443
10442
|
|
|
10444
10443
|
[[ ## answer ## ]]
|
|
10445
|
-
{{${TEMPLATE_VARIABLES.
|
|
10444
|
+
{{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
|
|
10446
10445
|
var freeformEvaluationSchema = z3.object({
|
|
10447
10446
|
score: z3.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
10448
10447
|
assertions: z3.array(
|
|
@@ -10520,12 +10519,8 @@ var LlmGraderEvaluator = class {
|
|
|
10520
10519
|
2
|
|
10521
10520
|
),
|
|
10522
10521
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context.output ?? [], null, 2),
|
|
10523
|
-
[TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
|
|
10524
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
10525
10522
|
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
10526
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
10527
10523
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
10528
|
-
// Text convenience accessors (new names, always strings)
|
|
10529
10524
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
10530
10525
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
10531
10526
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
@@ -10830,10 +10825,10 @@ ${context.fileChanges}`;
|
|
|
10830
10825
|
buildAgentUserPrompt(context) {
|
|
10831
10826
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
10832
10827
|
const variables = {
|
|
10833
|
-
[TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
|
|
10834
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
10835
10828
|
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
10836
|
-
[TEMPLATE_VARIABLES.
|
|
10829
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
10830
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
10831
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
10837
10832
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? ""
|
|
10838
10833
|
};
|
|
10839
10834
|
if (this.evaluatorTemplate) {
|
|
@@ -10886,10 +10881,10 @@ ${context.fileChanges}`;
|
|
|
10886
10881
|
const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
|
|
10887
10882
|
if (this.evaluatorTemplate) {
|
|
10888
10883
|
const variables = {
|
|
10889
|
-
[TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
|
|
10890
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
10891
10884
|
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
10892
|
-
[TEMPLATE_VARIABLES.
|
|
10885
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
10886
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
10887
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
10893
10888
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? ""
|
|
10894
10889
|
};
|
|
10895
10890
|
const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
|
|
@@ -12317,7 +12312,10 @@ var COPILOT_MATCHER = {
|
|
|
12317
12312
|
skillTools: ["Skill", "skill"],
|
|
12318
12313
|
skillInputField: "skill",
|
|
12319
12314
|
readTools: ["Read File", "readFile", "Read", "readTextFile"],
|
|
12320
|
-
readInputField: "file_path"
|
|
12315
|
+
readInputField: "file_path",
|
|
12316
|
+
skillToolPrefixes: ["Using skill: "],
|
|
12317
|
+
readToolPrefixes: ["Viewing "],
|
|
12318
|
+
readInputFields: ["file_path", "path"]
|
|
12321
12319
|
};
|
|
12322
12320
|
var PROVIDER_TOOL_SEMANTICS = {
|
|
12323
12321
|
claude: CLAUDE_MATCHER,
|
|
@@ -12359,12 +12357,22 @@ var SkillTriggerEvaluator = class {
|
|
|
12359
12357
|
triggered = true;
|
|
12360
12358
|
evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
|
|
12361
12359
|
}
|
|
12360
|
+
} else if (matcher.skillToolPrefixes?.some(
|
|
12361
|
+
(prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
|
|
12362
|
+
)) {
|
|
12363
|
+
triggered = true;
|
|
12364
|
+
evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
|
|
12362
12365
|
} else if (matcher.readTools.includes(firstTool.tool)) {
|
|
12363
|
-
const filePath =
|
|
12366
|
+
const filePath = this.readPathFromInput(input, matcher);
|
|
12364
12367
|
if (filePath.includes(skillName)) {
|
|
12365
12368
|
triggered = true;
|
|
12366
12369
|
evidence = `Read tool loaded skill file: ${filePath}`;
|
|
12367
12370
|
}
|
|
12371
|
+
} else if (matcher.readToolPrefixes?.some(
|
|
12372
|
+
(prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
|
|
12373
|
+
)) {
|
|
12374
|
+
triggered = true;
|
|
12375
|
+
evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
|
|
12368
12376
|
}
|
|
12369
12377
|
}
|
|
12370
12378
|
const pass = triggered === shouldTrigger;
|
|
@@ -12393,6 +12401,16 @@ var SkillTriggerEvaluator = class {
|
|
|
12393
12401
|
expectedAspectCount: 1
|
|
12394
12402
|
};
|
|
12395
12403
|
}
|
|
12404
|
+
readPathFromInput(input, matcher) {
|
|
12405
|
+
const fields = matcher.readInputFields ?? [matcher.readInputField];
|
|
12406
|
+
for (const field of fields) {
|
|
12407
|
+
const value = input[field];
|
|
12408
|
+
if (value !== void 0 && value !== null) {
|
|
12409
|
+
return String(value);
|
|
12410
|
+
}
|
|
12411
|
+
}
|
|
12412
|
+
return "";
|
|
12413
|
+
}
|
|
12396
12414
|
};
|
|
12397
12415
|
|
|
12398
12416
|
// src/evaluation/evaluators/llm-grader-prompt.ts
|
|
@@ -12427,12 +12445,8 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
|
|
|
12427
12445
|
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
|
|
12428
12446
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
|
|
12429
12447
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
|
|
12430
|
-
[TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
|
|
12431
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
|
|
12432
12448
|
[TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
|
|
12433
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
12434
12449
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
|
|
12435
|
-
// Text convenience accessors (new names, always strings)
|
|
12436
12450
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
12437
12451
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
|
|
12438
12452
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
|
|
@@ -12764,11 +12778,9 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12764
12778
|
for (const call of toolCalls) {
|
|
12765
12779
|
toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
|
|
12766
12780
|
}
|
|
12767
|
-
const toolNames = Object.keys(toolCallsByName).sort();
|
|
12768
12781
|
return {
|
|
12769
12782
|
eventCount: toolCalls.length,
|
|
12770
|
-
|
|
12771
|
-
toolCallsByName,
|
|
12783
|
+
toolCalls: toolCallsByName,
|
|
12772
12784
|
errorCount: 0
|
|
12773
12785
|
};
|
|
12774
12786
|
}
|
|
@@ -12786,7 +12798,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12786
12798
|
const assertions = [];
|
|
12787
12799
|
for (const toolName of toolNames) {
|
|
12788
12800
|
const required = minimums[toolName];
|
|
12789
|
-
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
12801
|
+
const actual = summary.toolCalls[toolName] ?? 0;
|
|
12790
12802
|
if (actual >= required) {
|
|
12791
12803
|
assertions.push({
|
|
12792
12804
|
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
@@ -13489,11 +13501,9 @@ async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
|
|
|
13489
13501
|
}
|
|
13490
13502
|
async function executePromptTemplate(script, context, config, timeoutMs) {
|
|
13491
13503
|
const payload = {
|
|
13492
|
-
question: context.evalCase.question,
|
|
13493
13504
|
criteria: context.evalCase.criteria,
|
|
13494
13505
|
expectedOutput: context.evalCase.expected_output,
|
|
13495
|
-
|
|
13496
|
-
answer: context.candidate,
|
|
13506
|
+
outputText: context.candidate,
|
|
13497
13507
|
output: context.output ?? null,
|
|
13498
13508
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
13499
13509
|
inputFiles: context.evalCase.file_paths.filter(
|
|
@@ -13504,9 +13514,7 @@ async function executePromptTemplate(script, context, config, timeoutMs) {
|
|
|
13504
13514
|
fileChanges: context.fileChanges ?? null,
|
|
13505
13515
|
workspacePath: context.workspacePath ?? null,
|
|
13506
13516
|
config: config ?? context.config ?? null,
|
|
13507
|
-
// Text convenience accessors (new names, always strings)
|
|
13508
13517
|
inputText: context.evalCase.question,
|
|
13509
|
-
outputText: context.candidate,
|
|
13510
13518
|
expectedOutputText: context.evalCase.reference_answer ?? ""
|
|
13511
13519
|
};
|
|
13512
13520
|
const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
@@ -15170,7 +15178,7 @@ async function runEvaluation(options) {
|
|
|
15170
15178
|
dataset: evalCase.dataset,
|
|
15171
15179
|
score: 0,
|
|
15172
15180
|
assertions: [],
|
|
15173
|
-
|
|
15181
|
+
outputText: "",
|
|
15174
15182
|
target: target.name,
|
|
15175
15183
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
15176
15184
|
budgetExceeded: true,
|
|
@@ -15206,7 +15214,7 @@ async function runEvaluation(options) {
|
|
|
15206
15214
|
dataset: evalCase.dataset,
|
|
15207
15215
|
score: 0,
|
|
15208
15216
|
assertions: [],
|
|
15209
|
-
|
|
15217
|
+
outputText: "",
|
|
15210
15218
|
target: target.name,
|
|
15211
15219
|
error: errorMsg,
|
|
15212
15220
|
executionStatus: "execution_error",
|
|
@@ -15471,7 +15479,7 @@ async function runBatchEvaluation(options) {
|
|
|
15471
15479
|
const providerResponse = batchResponse[i];
|
|
15472
15480
|
const output = providerResponse.output;
|
|
15473
15481
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
15474
|
-
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0,
|
|
15482
|
+
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolCalls: {}, errorCount: 0 } } : void 0;
|
|
15475
15483
|
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
15476
15484
|
tokenUsage: providerResponse.tokenUsage,
|
|
15477
15485
|
costUsd: providerResponse.costUsd,
|
|
@@ -15868,7 +15876,7 @@ async function runEvalCase(options) {
|
|
|
15868
15876
|
}
|
|
15869
15877
|
const output = providerResponse.output;
|
|
15870
15878
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
15871
|
-
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0,
|
|
15879
|
+
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolCalls: {}, errorCount: 0 } } : void 0;
|
|
15872
15880
|
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
15873
15881
|
tokenUsage: providerResponse.tokenUsage,
|
|
15874
15882
|
costUsd: providerResponse.costUsd,
|
|
@@ -16173,7 +16181,7 @@ async function evaluateCandidate(options) {
|
|
|
16173
16181
|
conversationId: evalCase.conversation_id,
|
|
16174
16182
|
score: score.score,
|
|
16175
16183
|
assertions: score.assertions,
|
|
16176
|
-
|
|
16184
|
+
outputText: candidate,
|
|
16177
16185
|
target: target.name,
|
|
16178
16186
|
tokenUsage,
|
|
16179
16187
|
costUsd,
|
|
@@ -16529,7 +16537,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
16529
16537
|
conversationId: evalCase.conversation_id,
|
|
16530
16538
|
score: 0,
|
|
16531
16539
|
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
16532
|
-
|
|
16540
|
+
outputText: `Error occurred: ${message}`,
|
|
16533
16541
|
target: targetName,
|
|
16534
16542
|
requests,
|
|
16535
16543
|
input,
|
|
@@ -17067,7 +17075,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
17067
17075
|
|
|
17068
17076
|
// src/evaluation/baseline.ts
|
|
17069
17077
|
var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
|
|
17070
|
-
"
|
|
17078
|
+
"outputText",
|
|
17071
17079
|
"requests",
|
|
17072
17080
|
"trace",
|
|
17073
17081
|
"workspacePath",
|
|
@@ -17241,14 +17249,17 @@ var OtelTraceExporter = class {
|
|
|
17241
17249
|
rootSpan.setAttribute("agentv.target", result.target);
|
|
17242
17250
|
if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
|
|
17243
17251
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
17244
|
-
if (captureContent) rootSpan.setAttribute("agentv.
|
|
17252
|
+
if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
|
|
17245
17253
|
if (result.durationMs != null)
|
|
17246
17254
|
rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
|
|
17247
17255
|
if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
|
|
17248
17256
|
if (result.trace) {
|
|
17249
17257
|
const t = result.trace;
|
|
17250
17258
|
rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
|
|
17251
|
-
rootSpan.setAttribute(
|
|
17259
|
+
rootSpan.setAttribute(
|
|
17260
|
+
"agentv.trace.tool_names",
|
|
17261
|
+
Object.keys(t.toolCalls).sort().join(",")
|
|
17262
|
+
);
|
|
17252
17263
|
if (t.llmCallCount != null)
|
|
17253
17264
|
rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
|
|
17254
17265
|
}
|