@agentv/core 3.5.0 → 3.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-EFR4JHPL.js → chunk-2IZOTQ25.js} +1 -1
- package/dist/chunk-2IZOTQ25.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +67 -55
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +68 -56
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-EFR4JHPL.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1134,7 +1134,7 @@ interface EvaluationResult {
|
|
|
1134
1134
|
readonly conversationId?: string;
|
|
1135
1135
|
readonly score: number;
|
|
1136
1136
|
readonly assertions: readonly AssertionEntry[];
|
|
1137
|
-
readonly
|
|
1137
|
+
readonly outputText: string;
|
|
1138
1138
|
readonly target: string;
|
|
1139
1139
|
/** Token usage metrics from provider (optional) */
|
|
1140
1140
|
readonly tokenUsage?: TokenUsage;
|
|
@@ -2412,6 +2412,7 @@ declare class SkillTriggerEvaluator implements Evaluator {
|
|
|
2412
2412
|
constructor(config: SkillTriggerEvaluatorConfig);
|
|
2413
2413
|
private resolveMatcher;
|
|
2414
2414
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2415
|
+
private readPathFromInput;
|
|
2415
2416
|
}
|
|
2416
2417
|
|
|
2417
2418
|
interface LlmGraderPromptAssembly {
|
package/dist/index.d.ts
CHANGED
|
@@ -1134,7 +1134,7 @@ interface EvaluationResult {
|
|
|
1134
1134
|
readonly conversationId?: string;
|
|
1135
1135
|
readonly score: number;
|
|
1136
1136
|
readonly assertions: readonly AssertionEntry[];
|
|
1137
|
-
readonly
|
|
1137
|
+
readonly outputText: string;
|
|
1138
1138
|
readonly target: string;
|
|
1139
1139
|
/** Token usage metrics from provider (optional) */
|
|
1140
1140
|
readonly tokenUsage?: TokenUsage;
|
|
@@ -2412,6 +2412,7 @@ declare class SkillTriggerEvaluator implements Evaluator {
|
|
|
2412
2412
|
constructor(config: SkillTriggerEvaluatorConfig);
|
|
2413
2413
|
private resolveMatcher;
|
|
2414
2414
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2415
|
+
private readPathFromInput;
|
|
2415
2416
|
}
|
|
2416
2417
|
|
|
2417
2418
|
interface LlmGraderPromptAssembly {
|
package/dist/index.js
CHANGED
|
@@ -16,7 +16,7 @@ import {
|
|
|
16
16
|
readTextFile,
|
|
17
17
|
resolveFileReference,
|
|
18
18
|
resolveTargetDefinition
|
|
19
|
-
} from "./chunk-
|
|
19
|
+
} from "./chunk-2IZOTQ25.js";
|
|
20
20
|
import {
|
|
21
21
|
AgentvProvider
|
|
22
22
|
} from "./chunk-W5YDZWT4.js";
|
|
@@ -742,14 +742,8 @@ import { readFile as readFile4 } from "node:fs/promises";
|
|
|
742
742
|
|
|
743
743
|
// src/evaluation/template-variables.ts
|
|
744
744
|
var TEMPLATE_VARIABLES = {
|
|
745
|
-
/** @deprecated Use OUTPUT_TEXT instead */
|
|
746
|
-
ANSWER: "answer",
|
|
747
745
|
EXPECTED_OUTPUT: "expected_output",
|
|
748
|
-
/** @deprecated Use INPUT_TEXT instead */
|
|
749
|
-
QUESTION: "question",
|
|
750
746
|
CRITERIA: "criteria",
|
|
751
|
-
/** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
|
|
752
|
-
REFERENCE_ANSWER: "reference_answer",
|
|
753
747
|
INPUT: "input",
|
|
754
748
|
OUTPUT: "output",
|
|
755
749
|
FILE_CHANGES: "file_changes",
|
|
@@ -759,9 +753,8 @@ var TEMPLATE_VARIABLES = {
|
|
|
759
753
|
};
|
|
760
754
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
761
755
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
762
|
-
TEMPLATE_VARIABLES.
|
|
763
|
-
TEMPLATE_VARIABLES.EXPECTED_OUTPUT
|
|
764
|
-
TEMPLATE_VARIABLES.OUTPUT_TEXT
|
|
756
|
+
TEMPLATE_VARIABLES.OUTPUT_TEXT,
|
|
757
|
+
TEMPLATE_VARIABLES.EXPECTED_OUTPUT
|
|
765
758
|
]);
|
|
766
759
|
|
|
767
760
|
// src/evaluation/validation/prompt-validator.ts
|
|
@@ -784,13 +777,13 @@ function validateTemplateVariables(content, source) {
|
|
|
784
777
|
}
|
|
785
778
|
match = variablePattern.exec(content);
|
|
786
779
|
}
|
|
787
|
-
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.
|
|
780
|
+
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
|
|
788
781
|
const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
|
|
789
782
|
const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
|
|
790
783
|
if (!hasRequiredFields) {
|
|
791
784
|
throw new Error(
|
|
792
785
|
`Missing required fields. Must include at least one of:
|
|
793
|
-
- {{ ${TEMPLATE_VARIABLES.
|
|
786
|
+
- {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
|
|
794
787
|
- {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
|
|
795
788
|
);
|
|
796
789
|
}
|
|
@@ -3974,6 +3967,8 @@ async function invokeModel(options) {
|
|
|
3974
3967
|
const { model, request, defaults, retryConfig, providerOptions } = options;
|
|
3975
3968
|
const chatPrompt = buildChatPrompt(request);
|
|
3976
3969
|
const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
|
|
3970
|
+
const startTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
3971
|
+
const startMs = Date.now();
|
|
3977
3972
|
const result = await withRetry(
|
|
3978
3973
|
() => generateText({
|
|
3979
3974
|
model,
|
|
@@ -3987,9 +3982,11 @@ async function invokeModel(options) {
|
|
|
3987
3982
|
retryConfig,
|
|
3988
3983
|
request.signal
|
|
3989
3984
|
);
|
|
3990
|
-
|
|
3985
|
+
const endTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
3986
|
+
const durationMs = Date.now() - startMs;
|
|
3987
|
+
return mapResponse(result, { durationMs, startTime, endTime });
|
|
3991
3988
|
}
|
|
3992
|
-
function mapResponse(result) {
|
|
3989
|
+
function mapResponse(result, timing) {
|
|
3993
3990
|
const content = result.text ?? "";
|
|
3994
3991
|
const rawUsage = result.totalUsage ?? result.usage;
|
|
3995
3992
|
const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
|
|
@@ -4004,7 +4001,10 @@ function mapResponse(result) {
|
|
|
4004
4001
|
raw: result,
|
|
4005
4002
|
usage: toJsonObject(rawUsage),
|
|
4006
4003
|
output: [{ role: "assistant", content }],
|
|
4007
|
-
tokenUsage
|
|
4004
|
+
tokenUsage,
|
|
4005
|
+
durationMs: timing?.durationMs,
|
|
4006
|
+
startTime: timing?.startTime,
|
|
4007
|
+
endTime: timing?.endTime
|
|
4008
4008
|
};
|
|
4009
4009
|
}
|
|
4010
4010
|
function toJsonObject(value) {
|
|
@@ -4882,10 +4882,12 @@ var ClaudeSdkProvider = class {
|
|
|
4882
4882
|
if (usage) {
|
|
4883
4883
|
const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
|
|
4884
4884
|
const outputTokens = usage.output_tokens ?? 0;
|
|
4885
|
+
const reasoningTokens = usage.reasoning_tokens ?? void 0;
|
|
4885
4886
|
tokenUsage = {
|
|
4886
4887
|
input: inputTokens,
|
|
4887
4888
|
output: outputTokens,
|
|
4888
|
-
cached: usage.cache_read_input_tokens ?? void 0
|
|
4889
|
+
cached: usage.cache_read_input_tokens ?? void 0,
|
|
4890
|
+
reasoning: reasoningTokens
|
|
4889
4891
|
};
|
|
4890
4892
|
request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
|
|
4891
4893
|
}
|
|
@@ -5899,7 +5901,8 @@ ${basePrompt}` : basePrompt;
|
|
|
5899
5901
|
onUsage({
|
|
5900
5902
|
input: usage.input_tokens ?? 0,
|
|
5901
5903
|
output: usage.output_tokens ?? 0,
|
|
5902
|
-
cached: usage.cached_input_tokens ?? void 0
|
|
5904
|
+
cached: usage.cached_input_tokens ?? void 0,
|
|
5905
|
+
reasoning: usage.reasoning_tokens ?? void 0
|
|
5903
5906
|
});
|
|
5904
5907
|
}
|
|
5905
5908
|
}
|
|
@@ -7913,10 +7916,12 @@ function extractTokenUsage(events) {
|
|
|
7913
7916
|
output: output ?? 0
|
|
7914
7917
|
};
|
|
7915
7918
|
const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
|
|
7916
|
-
|
|
7917
|
-
|
|
7918
|
-
|
|
7919
|
-
|
|
7919
|
+
const reasoning = toFiniteNumber(u.reasoning_tokens ?? u.reasoningTokens ?? u.reasoning);
|
|
7920
|
+
return {
|
|
7921
|
+
...result,
|
|
7922
|
+
...cached !== void 0 ? { cached } : {},
|
|
7923
|
+
...reasoning !== void 0 ? { reasoning } : {}
|
|
7924
|
+
};
|
|
7920
7925
|
}
|
|
7921
7926
|
}
|
|
7922
7927
|
const messages = record.messages;
|
|
@@ -10245,11 +10250,9 @@ var CodeEvaluator = class {
|
|
|
10245
10250
|
}
|
|
10246
10251
|
}
|
|
10247
10252
|
const payload = {
|
|
10248
|
-
question: context.evalCase.question,
|
|
10249
10253
|
criteria: context.evalCase.criteria,
|
|
10250
10254
|
expectedOutput: context.evalCase.expected_output,
|
|
10251
|
-
|
|
10252
|
-
answer: context.candidate,
|
|
10255
|
+
outputText: context.candidate,
|
|
10253
10256
|
output: outputForPayload,
|
|
10254
10257
|
outputPath,
|
|
10255
10258
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
@@ -10266,9 +10269,7 @@ var CodeEvaluator = class {
|
|
|
10266
10269
|
fileChanges: context.fileChanges ?? null,
|
|
10267
10270
|
workspacePath: context.workspacePath ?? null,
|
|
10268
10271
|
config: this.config ?? null,
|
|
10269
|
-
// Text convenience accessors (new names, always strings)
|
|
10270
10272
|
inputText: context.evalCase.question,
|
|
10271
|
-
outputText: context.candidate,
|
|
10272
10273
|
expectedOutputText: context.evalCase.reference_answer ?? ""
|
|
10273
10274
|
};
|
|
10274
10275
|
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
@@ -10436,13 +10437,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
10436
10437
|
{{${TEMPLATE_VARIABLES.CRITERIA}}}
|
|
10437
10438
|
|
|
10438
10439
|
[[ ## question ## ]]
|
|
10439
|
-
{{${TEMPLATE_VARIABLES.
|
|
10440
|
+
{{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
|
|
10440
10441
|
|
|
10441
10442
|
[[ ## reference_answer ## ]]
|
|
10442
|
-
{{${TEMPLATE_VARIABLES.
|
|
10443
|
+
{{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
|
|
10443
10444
|
|
|
10444
10445
|
[[ ## answer ## ]]
|
|
10445
|
-
{{${TEMPLATE_VARIABLES.
|
|
10446
|
+
{{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
|
|
10446
10447
|
var freeformEvaluationSchema = z3.object({
|
|
10447
10448
|
score: z3.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
10448
10449
|
assertions: z3.array(
|
|
@@ -10520,12 +10521,8 @@ var LlmGraderEvaluator = class {
|
|
|
10520
10521
|
2
|
|
10521
10522
|
),
|
|
10522
10523
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context.output ?? [], null, 2),
|
|
10523
|
-
[TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
|
|
10524
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
10525
10524
|
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
10526
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
10527
10525
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
10528
|
-
// Text convenience accessors (new names, always strings)
|
|
10529
10526
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
10530
10527
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
10531
10528
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
@@ -10830,10 +10827,10 @@ ${context.fileChanges}`;
|
|
|
10830
10827
|
buildAgentUserPrompt(context) {
|
|
10831
10828
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
10832
10829
|
const variables = {
|
|
10833
|
-
[TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
|
|
10834
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
10835
10830
|
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
10836
|
-
[TEMPLATE_VARIABLES.
|
|
10831
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
10832
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
10833
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
10837
10834
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? ""
|
|
10838
10835
|
};
|
|
10839
10836
|
if (this.evaluatorTemplate) {
|
|
@@ -10886,10 +10883,10 @@ ${context.fileChanges}`;
|
|
|
10886
10883
|
const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
|
|
10887
10884
|
if (this.evaluatorTemplate) {
|
|
10888
10885
|
const variables = {
|
|
10889
|
-
[TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
|
|
10890
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
10891
10886
|
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
10892
|
-
[TEMPLATE_VARIABLES.
|
|
10887
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
10888
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
10889
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
10893
10890
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? ""
|
|
10894
10891
|
};
|
|
10895
10892
|
const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
|
|
@@ -12317,7 +12314,10 @@ var COPILOT_MATCHER = {
|
|
|
12317
12314
|
skillTools: ["Skill", "skill"],
|
|
12318
12315
|
skillInputField: "skill",
|
|
12319
12316
|
readTools: ["Read File", "readFile", "Read", "readTextFile"],
|
|
12320
|
-
readInputField: "file_path"
|
|
12317
|
+
readInputField: "file_path",
|
|
12318
|
+
skillToolPrefixes: ["Using skill: "],
|
|
12319
|
+
readToolPrefixes: ["Viewing "],
|
|
12320
|
+
readInputFields: ["file_path", "path"]
|
|
12321
12321
|
};
|
|
12322
12322
|
var PROVIDER_TOOL_SEMANTICS = {
|
|
12323
12323
|
claude: CLAUDE_MATCHER,
|
|
@@ -12359,12 +12359,22 @@ var SkillTriggerEvaluator = class {
|
|
|
12359
12359
|
triggered = true;
|
|
12360
12360
|
evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
|
|
12361
12361
|
}
|
|
12362
|
+
} else if (matcher.skillToolPrefixes?.some(
|
|
12363
|
+
(prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
|
|
12364
|
+
)) {
|
|
12365
|
+
triggered = true;
|
|
12366
|
+
evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
|
|
12362
12367
|
} else if (matcher.readTools.includes(firstTool.tool)) {
|
|
12363
|
-
const filePath =
|
|
12368
|
+
const filePath = this.readPathFromInput(input, matcher);
|
|
12364
12369
|
if (filePath.includes(skillName)) {
|
|
12365
12370
|
triggered = true;
|
|
12366
12371
|
evidence = `Read tool loaded skill file: ${filePath}`;
|
|
12367
12372
|
}
|
|
12373
|
+
} else if (matcher.readToolPrefixes?.some(
|
|
12374
|
+
(prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
|
|
12375
|
+
)) {
|
|
12376
|
+
triggered = true;
|
|
12377
|
+
evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
|
|
12368
12378
|
}
|
|
12369
12379
|
}
|
|
12370
12380
|
const pass = triggered === shouldTrigger;
|
|
@@ -12393,6 +12403,16 @@ var SkillTriggerEvaluator = class {
|
|
|
12393
12403
|
expectedAspectCount: 1
|
|
12394
12404
|
};
|
|
12395
12405
|
}
|
|
12406
|
+
readPathFromInput(input, matcher) {
|
|
12407
|
+
const fields = matcher.readInputFields ?? [matcher.readInputField];
|
|
12408
|
+
for (const field of fields) {
|
|
12409
|
+
const value = input[field];
|
|
12410
|
+
if (value !== void 0 && value !== null) {
|
|
12411
|
+
return String(value);
|
|
12412
|
+
}
|
|
12413
|
+
}
|
|
12414
|
+
return "";
|
|
12415
|
+
}
|
|
12396
12416
|
};
|
|
12397
12417
|
|
|
12398
12418
|
// src/evaluation/evaluators/llm-grader-prompt.ts
|
|
@@ -12427,12 +12447,8 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
|
|
|
12427
12447
|
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
|
|
12428
12448
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
|
|
12429
12449
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
|
|
12430
|
-
[TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
|
|
12431
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
|
|
12432
12450
|
[TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
|
|
12433
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
12434
12451
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
|
|
12435
|
-
// Text convenience accessors (new names, always strings)
|
|
12436
12452
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
12437
12453
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
|
|
12438
12454
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
|
|
@@ -13489,11 +13505,9 @@ async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
|
|
|
13489
13505
|
}
|
|
13490
13506
|
async function executePromptTemplate(script, context, config, timeoutMs) {
|
|
13491
13507
|
const payload = {
|
|
13492
|
-
question: context.evalCase.question,
|
|
13493
13508
|
criteria: context.evalCase.criteria,
|
|
13494
13509
|
expectedOutput: context.evalCase.expected_output,
|
|
13495
|
-
|
|
13496
|
-
answer: context.candidate,
|
|
13510
|
+
outputText: context.candidate,
|
|
13497
13511
|
output: context.output ?? null,
|
|
13498
13512
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
13499
13513
|
inputFiles: context.evalCase.file_paths.filter(
|
|
@@ -13504,9 +13518,7 @@ async function executePromptTemplate(script, context, config, timeoutMs) {
|
|
|
13504
13518
|
fileChanges: context.fileChanges ?? null,
|
|
13505
13519
|
workspacePath: context.workspacePath ?? null,
|
|
13506
13520
|
config: config ?? context.config ?? null,
|
|
13507
|
-
// Text convenience accessors (new names, always strings)
|
|
13508
13521
|
inputText: context.evalCase.question,
|
|
13509
|
-
outputText: context.candidate,
|
|
13510
13522
|
expectedOutputText: context.evalCase.reference_answer ?? ""
|
|
13511
13523
|
};
|
|
13512
13524
|
const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
@@ -15170,7 +15182,7 @@ async function runEvaluation(options) {
|
|
|
15170
15182
|
dataset: evalCase.dataset,
|
|
15171
15183
|
score: 0,
|
|
15172
15184
|
assertions: [],
|
|
15173
|
-
|
|
15185
|
+
outputText: "",
|
|
15174
15186
|
target: target.name,
|
|
15175
15187
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
15176
15188
|
budgetExceeded: true,
|
|
@@ -15206,7 +15218,7 @@ async function runEvaluation(options) {
|
|
|
15206
15218
|
dataset: evalCase.dataset,
|
|
15207
15219
|
score: 0,
|
|
15208
15220
|
assertions: [],
|
|
15209
|
-
|
|
15221
|
+
outputText: "",
|
|
15210
15222
|
target: target.name,
|
|
15211
15223
|
error: errorMsg,
|
|
15212
15224
|
executionStatus: "execution_error",
|
|
@@ -16173,7 +16185,7 @@ async function evaluateCandidate(options) {
|
|
|
16173
16185
|
conversationId: evalCase.conversation_id,
|
|
16174
16186
|
score: score.score,
|
|
16175
16187
|
assertions: score.assertions,
|
|
16176
|
-
|
|
16188
|
+
outputText: candidate,
|
|
16177
16189
|
target: target.name,
|
|
16178
16190
|
tokenUsage,
|
|
16179
16191
|
costUsd,
|
|
@@ -16529,7 +16541,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
16529
16541
|
conversationId: evalCase.conversation_id,
|
|
16530
16542
|
score: 0,
|
|
16531
16543
|
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
16532
|
-
|
|
16544
|
+
outputText: `Error occurred: ${message}`,
|
|
16533
16545
|
target: targetName,
|
|
16534
16546
|
requests,
|
|
16535
16547
|
input,
|
|
@@ -17067,7 +17079,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
17067
17079
|
|
|
17068
17080
|
// src/evaluation/baseline.ts
|
|
17069
17081
|
var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
|
|
17070
|
-
"
|
|
17082
|
+
"outputText",
|
|
17071
17083
|
"requests",
|
|
17072
17084
|
"trace",
|
|
17073
17085
|
"workspacePath",
|
|
@@ -17241,7 +17253,7 @@ var OtelTraceExporter = class {
|
|
|
17241
17253
|
rootSpan.setAttribute("agentv.target", result.target);
|
|
17242
17254
|
if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
|
|
17243
17255
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
17244
|
-
if (captureContent) rootSpan.setAttribute("agentv.
|
|
17256
|
+
if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
|
|
17245
17257
|
if (result.durationMs != null)
|
|
17246
17258
|
rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
|
|
17247
17259
|
if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
|