@agentv/core 3.5.0 → 3.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ import {
6
6
  findGitRoot,
7
7
  isEvaluatorKind,
8
8
  resolveFileReference
9
- } from "../../chunk-EFR4JHPL.js";
9
+ } from "../../chunk-2IZOTQ25.js";
10
10
 
11
11
  // src/evaluation/validation/file-type.ts
12
12
  import { readFile } from "node:fs/promises";
package/dist/index.cjs CHANGED
@@ -2445,14 +2445,8 @@ var import_promises5 = require("fs/promises");
2445
2445
 
2446
2446
  // src/evaluation/template-variables.ts
2447
2447
  var TEMPLATE_VARIABLES = {
2448
- /** @deprecated Use OUTPUT_TEXT instead */
2449
- ANSWER: "answer",
2450
2448
  EXPECTED_OUTPUT: "expected_output",
2451
- /** @deprecated Use INPUT_TEXT instead */
2452
- QUESTION: "question",
2453
2449
  CRITERIA: "criteria",
2454
- /** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
2455
- REFERENCE_ANSWER: "reference_answer",
2456
2450
  INPUT: "input",
2457
2451
  OUTPUT: "output",
2458
2452
  FILE_CHANGES: "file_changes",
@@ -2462,9 +2456,8 @@ var TEMPLATE_VARIABLES = {
2462
2456
  };
2463
2457
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
2464
2458
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
2465
- TEMPLATE_VARIABLES.ANSWER,
2466
- TEMPLATE_VARIABLES.EXPECTED_OUTPUT,
2467
- TEMPLATE_VARIABLES.OUTPUT_TEXT
2459
+ TEMPLATE_VARIABLES.OUTPUT_TEXT,
2460
+ TEMPLATE_VARIABLES.EXPECTED_OUTPUT
2468
2461
  ]);
2469
2462
 
2470
2463
  // src/evaluation/validation/prompt-validator.ts
@@ -2487,13 +2480,13 @@ function validateTemplateVariables(content, source) {
2487
2480
  }
2488
2481
  match = variablePattern.exec(content);
2489
2482
  }
2490
- const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.ANSWER) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
2483
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
2491
2484
  const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
2492
2485
  const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
2493
2486
  if (!hasRequiredFields) {
2494
2487
  throw new Error(
2495
2488
  `Missing required fields. Must include at least one of:
2496
- - {{ ${TEMPLATE_VARIABLES.ANSWER} }} or {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
2489
+ - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
2497
2490
  - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
2498
2491
  );
2499
2492
  }
@@ -5795,6 +5788,8 @@ async function invokeModel(options) {
5795
5788
  const { model, request, defaults, retryConfig, providerOptions } = options;
5796
5789
  const chatPrompt = buildChatPrompt(request);
5797
5790
  const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
5791
+ const startTime = (/* @__PURE__ */ new Date()).toISOString();
5792
+ const startMs = Date.now();
5798
5793
  const result = await withRetry(
5799
5794
  () => (0, import_ai.generateText)({
5800
5795
  model,
@@ -5808,9 +5803,11 @@ async function invokeModel(options) {
5808
5803
  retryConfig,
5809
5804
  request.signal
5810
5805
  );
5811
- return mapResponse(result);
5806
+ const endTime = (/* @__PURE__ */ new Date()).toISOString();
5807
+ const durationMs = Date.now() - startMs;
5808
+ return mapResponse(result, { durationMs, startTime, endTime });
5812
5809
  }
5813
- function mapResponse(result) {
5810
+ function mapResponse(result, timing) {
5814
5811
  const content = result.text ?? "";
5815
5812
  const rawUsage = result.totalUsage ?? result.usage;
5816
5813
  const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
@@ -5825,7 +5822,10 @@ function mapResponse(result) {
5825
5822
  raw: result,
5826
5823
  usage: toJsonObject(rawUsage),
5827
5824
  output: [{ role: "assistant", content }],
5828
- tokenUsage
5825
+ tokenUsage,
5826
+ durationMs: timing?.durationMs,
5827
+ startTime: timing?.startTime,
5828
+ endTime: timing?.endTime
5829
5829
  };
5830
5830
  }
5831
5831
  function toJsonObject(value) {
@@ -6703,10 +6703,12 @@ var ClaudeSdkProvider = class {
6703
6703
  if (usage) {
6704
6704
  const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
6705
6705
  const outputTokens = usage.output_tokens ?? 0;
6706
+ const reasoningTokens = usage.reasoning_tokens ?? void 0;
6706
6707
  tokenUsage = {
6707
6708
  input: inputTokens,
6708
6709
  output: outputTokens,
6709
- cached: usage.cache_read_input_tokens ?? void 0
6710
+ cached: usage.cache_read_input_tokens ?? void 0,
6711
+ reasoning: reasoningTokens
6710
6712
  };
6711
6713
  request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
6712
6714
  }
@@ -7720,7 +7722,8 @@ ${basePrompt}` : basePrompt;
7720
7722
  onUsage({
7721
7723
  input: usage.input_tokens ?? 0,
7722
7724
  output: usage.output_tokens ?? 0,
7723
- cached: usage.cached_input_tokens ?? void 0
7725
+ cached: usage.cached_input_tokens ?? void 0,
7726
+ reasoning: usage.reasoning_tokens ?? void 0
7724
7727
  });
7725
7728
  }
7726
7729
  }
@@ -9735,10 +9738,12 @@ function extractTokenUsage(events) {
9735
9738
  output: output ?? 0
9736
9739
  };
9737
9740
  const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
9738
- if (cached !== void 0) {
9739
- return { ...result, cached };
9740
- }
9741
- return result;
9741
+ const reasoning = toFiniteNumber(u.reasoning_tokens ?? u.reasoningTokens ?? u.reasoning);
9742
+ return {
9743
+ ...result,
9744
+ ...cached !== void 0 ? { cached } : {},
9745
+ ...reasoning !== void 0 ? { reasoning } : {}
9746
+ };
9742
9747
  }
9743
9748
  }
9744
9749
  const messages = record.messages;
@@ -13265,11 +13270,9 @@ var CodeEvaluator = class {
13265
13270
  }
13266
13271
  }
13267
13272
  const payload = {
13268
- question: context2.evalCase.question,
13269
13273
  criteria: context2.evalCase.criteria,
13270
13274
  expectedOutput: context2.evalCase.expected_output,
13271
- referenceAnswer: context2.evalCase.reference_answer,
13272
- answer: context2.candidate,
13275
+ outputText: context2.candidate,
13273
13276
  output: outputForPayload,
13274
13277
  outputPath,
13275
13278
  guidelineFiles: context2.evalCase.guideline_paths,
@@ -13286,9 +13289,7 @@ var CodeEvaluator = class {
13286
13289
  fileChanges: context2.fileChanges ?? null,
13287
13290
  workspacePath: context2.workspacePath ?? null,
13288
13291
  config: this.config ?? null,
13289
- // Text convenience accessors (new names, always strings)
13290
13292
  inputText: context2.evalCase.question,
13291
- outputText: context2.candidate,
13292
13293
  expectedOutputText: context2.evalCase.reference_answer ?? ""
13293
13294
  };
13294
13295
  const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -13488,13 +13489,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
13488
13489
  {{${TEMPLATE_VARIABLES.CRITERIA}}}
13489
13490
 
13490
13491
  [[ ## question ## ]]
13491
- {{${TEMPLATE_VARIABLES.QUESTION}}}
13492
+ {{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
13492
13493
 
13493
13494
  [[ ## reference_answer ## ]]
13494
- {{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
13495
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
13495
13496
 
13496
13497
  [[ ## answer ## ]]
13497
- {{${TEMPLATE_VARIABLES.ANSWER}}}`;
13498
+ {{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
13498
13499
  var freeformEvaluationSchema = import_zod4.z.object({
13499
13500
  score: import_zod4.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
13500
13501
  assertions: import_zod4.z.array(
@@ -13572,12 +13573,8 @@ var LlmGraderEvaluator = class {
13572
13573
  2
13573
13574
  ),
13574
13575
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
13575
- [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
13576
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
13577
13576
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
13578
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
13579
13577
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
13580
- // Text convenience accessors (new names, always strings)
13581
13578
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
13582
13579
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
13583
13580
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
@@ -13882,10 +13879,10 @@ ${context2.fileChanges}`;
13882
13879
  buildAgentUserPrompt(context2) {
13883
13880
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
13884
13881
  const variables = {
13885
- [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
13886
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
13887
13882
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
13888
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
13883
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
13884
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
13885
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
13889
13886
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
13890
13887
  };
13891
13888
  if (this.evaluatorTemplate) {
@@ -13938,10 +13935,10 @@ ${context2.fileChanges}`;
13938
13935
  const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
13939
13936
  if (this.evaluatorTemplate) {
13940
13937
  const variables = {
13941
- [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
13942
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
13943
13938
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
13944
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
13939
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
13940
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
13941
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
13945
13942
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
13946
13943
  };
13947
13944
  const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
@@ -15369,7 +15366,10 @@ var COPILOT_MATCHER = {
15369
15366
  skillTools: ["Skill", "skill"],
15370
15367
  skillInputField: "skill",
15371
15368
  readTools: ["Read File", "readFile", "Read", "readTextFile"],
15372
- readInputField: "file_path"
15369
+ readInputField: "file_path",
15370
+ skillToolPrefixes: ["Using skill: "],
15371
+ readToolPrefixes: ["Viewing "],
15372
+ readInputFields: ["file_path", "path"]
15373
15373
  };
15374
15374
  var PROVIDER_TOOL_SEMANTICS = {
15375
15375
  claude: CLAUDE_MATCHER,
@@ -15411,12 +15411,22 @@ var SkillTriggerEvaluator = class {
15411
15411
  triggered = true;
15412
15412
  evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
15413
15413
  }
15414
+ } else if (matcher.skillToolPrefixes?.some(
15415
+ (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
15416
+ )) {
15417
+ triggered = true;
15418
+ evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
15414
15419
  } else if (matcher.readTools.includes(firstTool.tool)) {
15415
- const filePath = String(input[matcher.readInputField] ?? "");
15420
+ const filePath = this.readPathFromInput(input, matcher);
15416
15421
  if (filePath.includes(skillName)) {
15417
15422
  triggered = true;
15418
15423
  evidence = `Read tool loaded skill file: ${filePath}`;
15419
15424
  }
15425
+ } else if (matcher.readToolPrefixes?.some(
15426
+ (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
15427
+ )) {
15428
+ triggered = true;
15429
+ evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
15420
15430
  }
15421
15431
  }
15422
15432
  const pass = triggered === shouldTrigger;
@@ -15445,6 +15455,16 @@ var SkillTriggerEvaluator = class {
15445
15455
  expectedAspectCount: 1
15446
15456
  };
15447
15457
  }
15458
+ readPathFromInput(input, matcher) {
15459
+ const fields = matcher.readInputFields ?? [matcher.readInputField];
15460
+ for (const field of fields) {
15461
+ const value = input[field];
15462
+ if (value !== void 0 && value !== null) {
15463
+ return String(value);
15464
+ }
15465
+ }
15466
+ return "";
15467
+ }
15448
15468
  };
15449
15469
 
15450
15470
  // src/evaluation/evaluators/llm-grader-prompt.ts
@@ -15479,12 +15499,8 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
15479
15499
  [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
15480
15500
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
15481
15501
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
15482
- [TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
15483
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
15484
15502
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
15485
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
15486
15503
  [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
15487
- // Text convenience accessors (new names, always strings)
15488
15504
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
15489
15505
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
15490
15506
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
@@ -16541,11 +16557,9 @@ async function resolveCustomPrompt(promptConfig, context2, timeoutMs) {
16541
16557
  }
16542
16558
  async function executePromptTemplate(script, context2, config, timeoutMs) {
16543
16559
  const payload = {
16544
- question: context2.evalCase.question,
16545
16560
  criteria: context2.evalCase.criteria,
16546
16561
  expectedOutput: context2.evalCase.expected_output,
16547
- referenceAnswer: context2.evalCase.reference_answer,
16548
- answer: context2.candidate,
16562
+ outputText: context2.candidate,
16549
16563
  output: context2.output ?? null,
16550
16564
  guidelineFiles: context2.evalCase.guideline_paths,
16551
16565
  inputFiles: context2.evalCase.file_paths.filter(
@@ -16556,9 +16570,7 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
16556
16570
  fileChanges: context2.fileChanges ?? null,
16557
16571
  workspacePath: context2.workspacePath ?? null,
16558
16572
  config: config ?? context2.config ?? null,
16559
- // Text convenience accessors (new names, always strings)
16560
16573
  inputText: context2.evalCase.question,
16561
- outputText: context2.candidate,
16562
16574
  expectedOutputText: context2.evalCase.reference_answer ?? ""
16563
16575
  };
16564
16576
  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -18222,7 +18234,7 @@ async function runEvaluation(options) {
18222
18234
  dataset: evalCase.dataset,
18223
18235
  score: 0,
18224
18236
  assertions: [],
18225
- answer: "",
18237
+ outputText: "",
18226
18238
  target: target.name,
18227
18239
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
18228
18240
  budgetExceeded: true,
@@ -18258,7 +18270,7 @@ async function runEvaluation(options) {
18258
18270
  dataset: evalCase.dataset,
18259
18271
  score: 0,
18260
18272
  assertions: [],
18261
- answer: "",
18273
+ outputText: "",
18262
18274
  target: target.name,
18263
18275
  error: errorMsg,
18264
18276
  executionStatus: "execution_error",
@@ -19225,7 +19237,7 @@ async function evaluateCandidate(options) {
19225
19237
  conversationId: evalCase.conversation_id,
19226
19238
  score: score.score,
19227
19239
  assertions: score.assertions,
19228
- answer: candidate,
19240
+ outputText: candidate,
19229
19241
  target: target.name,
19230
19242
  tokenUsage,
19231
19243
  costUsd,
@@ -19581,7 +19593,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
19581
19593
  conversationId: evalCase.conversation_id,
19582
19594
  score: 0,
19583
19595
  assertions: [{ text: `Error: ${message}`, passed: false }],
19584
- answer: `Error occurred: ${message}`,
19596
+ outputText: `Error occurred: ${message}`,
19585
19597
  target: targetName,
19586
19598
  requests,
19587
19599
  input,
@@ -20119,7 +20131,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
20119
20131
 
20120
20132
  // src/evaluation/baseline.ts
20121
20133
  var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
20122
- "answer",
20134
+ "outputText",
20123
20135
  "requests",
20124
20136
  "trace",
20125
20137
  "workspacePath",
@@ -20293,7 +20305,7 @@ var OtelTraceExporter = class {
20293
20305
  rootSpan.setAttribute("agentv.target", result.target);
20294
20306
  if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
20295
20307
  rootSpan.setAttribute("agentv.score", result.score);
20296
- if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
20308
+ if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
20297
20309
  if (result.durationMs != null)
20298
20310
  rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
20299
20311
  if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);