@agentv/core 3.5.0 → 3.6.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1134,7 +1134,7 @@ interface EvaluationResult {
1134
1134
  readonly conversationId?: string;
1135
1135
  readonly score: number;
1136
1136
  readonly assertions: readonly AssertionEntry[];
1137
- readonly answer: string;
1137
+ readonly outputText: string;
1138
1138
  readonly target: string;
1139
1139
  /** Token usage metrics from provider (optional) */
1140
1140
  readonly tokenUsage?: TokenUsage;
@@ -2412,6 +2412,7 @@ declare class SkillTriggerEvaluator implements Evaluator {
2412
2412
  constructor(config: SkillTriggerEvaluatorConfig);
2413
2413
  private resolveMatcher;
2414
2414
  evaluate(context: EvaluationContext): EvaluationScore;
2415
+ private readPathFromInput;
2415
2416
  }
2416
2417
 
2417
2418
  interface LlmGraderPromptAssembly {
package/dist/index.d.ts CHANGED
@@ -1134,7 +1134,7 @@ interface EvaluationResult {
1134
1134
  readonly conversationId?: string;
1135
1135
  readonly score: number;
1136
1136
  readonly assertions: readonly AssertionEntry[];
1137
- readonly answer: string;
1137
+ readonly outputText: string;
1138
1138
  readonly target: string;
1139
1139
  /** Token usage metrics from provider (optional) */
1140
1140
  readonly tokenUsage?: TokenUsage;
@@ -2412,6 +2412,7 @@ declare class SkillTriggerEvaluator implements Evaluator {
2412
2412
  constructor(config: SkillTriggerEvaluatorConfig);
2413
2413
  private resolveMatcher;
2414
2414
  evaluate(context: EvaluationContext): EvaluationScore;
2415
+ private readPathFromInput;
2415
2416
  }
2416
2417
 
2417
2418
  interface LlmGraderPromptAssembly {
package/dist/index.js CHANGED
@@ -16,7 +16,7 @@ import {
16
16
  readTextFile,
17
17
  resolveFileReference,
18
18
  resolveTargetDefinition
19
- } from "./chunk-EFR4JHPL.js";
19
+ } from "./chunk-2IZOTQ25.js";
20
20
  import {
21
21
  AgentvProvider
22
22
  } from "./chunk-W5YDZWT4.js";
@@ -742,14 +742,8 @@ import { readFile as readFile4 } from "node:fs/promises";
742
742
 
743
743
  // src/evaluation/template-variables.ts
744
744
  var TEMPLATE_VARIABLES = {
745
- /** @deprecated Use OUTPUT_TEXT instead */
746
- ANSWER: "answer",
747
745
  EXPECTED_OUTPUT: "expected_output",
748
- /** @deprecated Use INPUT_TEXT instead */
749
- QUESTION: "question",
750
746
  CRITERIA: "criteria",
751
- /** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
752
- REFERENCE_ANSWER: "reference_answer",
753
747
  INPUT: "input",
754
748
  OUTPUT: "output",
755
749
  FILE_CHANGES: "file_changes",
@@ -759,9 +753,8 @@ var TEMPLATE_VARIABLES = {
759
753
  };
760
754
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
761
755
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
762
- TEMPLATE_VARIABLES.ANSWER,
763
- TEMPLATE_VARIABLES.EXPECTED_OUTPUT,
764
- TEMPLATE_VARIABLES.OUTPUT_TEXT
756
+ TEMPLATE_VARIABLES.OUTPUT_TEXT,
757
+ TEMPLATE_VARIABLES.EXPECTED_OUTPUT
765
758
  ]);
766
759
 
767
760
  // src/evaluation/validation/prompt-validator.ts
@@ -784,13 +777,13 @@ function validateTemplateVariables(content, source) {
784
777
  }
785
778
  match = variablePattern.exec(content);
786
779
  }
787
- const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.ANSWER) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
780
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
788
781
  const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
789
782
  const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
790
783
  if (!hasRequiredFields) {
791
784
  throw new Error(
792
785
  `Missing required fields. Must include at least one of:
793
- - {{ ${TEMPLATE_VARIABLES.ANSWER} }} or {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
786
+ - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
794
787
  - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
795
788
  );
796
789
  }
@@ -3974,6 +3967,8 @@ async function invokeModel(options) {
3974
3967
  const { model, request, defaults, retryConfig, providerOptions } = options;
3975
3968
  const chatPrompt = buildChatPrompt(request);
3976
3969
  const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
3970
+ const startTime = (/* @__PURE__ */ new Date()).toISOString();
3971
+ const startMs = Date.now();
3977
3972
  const result = await withRetry(
3978
3973
  () => generateText({
3979
3974
  model,
@@ -3987,9 +3982,11 @@ async function invokeModel(options) {
3987
3982
  retryConfig,
3988
3983
  request.signal
3989
3984
  );
3990
- return mapResponse(result);
3985
+ const endTime = (/* @__PURE__ */ new Date()).toISOString();
3986
+ const durationMs = Date.now() - startMs;
3987
+ return mapResponse(result, { durationMs, startTime, endTime });
3991
3988
  }
3992
- function mapResponse(result) {
3989
+ function mapResponse(result, timing) {
3993
3990
  const content = result.text ?? "";
3994
3991
  const rawUsage = result.totalUsage ?? result.usage;
3995
3992
  const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
@@ -4004,7 +4001,10 @@ function mapResponse(result) {
4004
4001
  raw: result,
4005
4002
  usage: toJsonObject(rawUsage),
4006
4003
  output: [{ role: "assistant", content }],
4007
- tokenUsage
4004
+ tokenUsage,
4005
+ durationMs: timing?.durationMs,
4006
+ startTime: timing?.startTime,
4007
+ endTime: timing?.endTime
4008
4008
  };
4009
4009
  }
4010
4010
  function toJsonObject(value) {
@@ -4882,10 +4882,12 @@ var ClaudeSdkProvider = class {
4882
4882
  if (usage) {
4883
4883
  const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
4884
4884
  const outputTokens = usage.output_tokens ?? 0;
4885
+ const reasoningTokens = usage.reasoning_tokens ?? void 0;
4885
4886
  tokenUsage = {
4886
4887
  input: inputTokens,
4887
4888
  output: outputTokens,
4888
- cached: usage.cache_read_input_tokens ?? void 0
4889
+ cached: usage.cache_read_input_tokens ?? void 0,
4890
+ reasoning: reasoningTokens
4889
4891
  };
4890
4892
  request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
4891
4893
  }
@@ -5899,7 +5901,8 @@ ${basePrompt}` : basePrompt;
5899
5901
  onUsage({
5900
5902
  input: usage.input_tokens ?? 0,
5901
5903
  output: usage.output_tokens ?? 0,
5902
- cached: usage.cached_input_tokens ?? void 0
5904
+ cached: usage.cached_input_tokens ?? void 0,
5905
+ reasoning: usage.reasoning_tokens ?? void 0
5903
5906
  });
5904
5907
  }
5905
5908
  }
@@ -7913,10 +7916,12 @@ function extractTokenUsage(events) {
7913
7916
  output: output ?? 0
7914
7917
  };
7915
7918
  const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
7916
- if (cached !== void 0) {
7917
- return { ...result, cached };
7918
- }
7919
- return result;
7919
+ const reasoning = toFiniteNumber(u.reasoning_tokens ?? u.reasoningTokens ?? u.reasoning);
7920
+ return {
7921
+ ...result,
7922
+ ...cached !== void 0 ? { cached } : {},
7923
+ ...reasoning !== void 0 ? { reasoning } : {}
7924
+ };
7920
7925
  }
7921
7926
  }
7922
7927
  const messages = record.messages;
@@ -10245,11 +10250,9 @@ var CodeEvaluator = class {
10245
10250
  }
10246
10251
  }
10247
10252
  const payload = {
10248
- question: context.evalCase.question,
10249
10253
  criteria: context.evalCase.criteria,
10250
10254
  expectedOutput: context.evalCase.expected_output,
10251
- referenceAnswer: context.evalCase.reference_answer,
10252
- answer: context.candidate,
10255
+ outputText: context.candidate,
10253
10256
  output: outputForPayload,
10254
10257
  outputPath,
10255
10258
  guidelineFiles: context.evalCase.guideline_paths,
@@ -10266,9 +10269,7 @@ var CodeEvaluator = class {
10266
10269
  fileChanges: context.fileChanges ?? null,
10267
10270
  workspacePath: context.workspacePath ?? null,
10268
10271
  config: this.config ?? null,
10269
- // Text convenience accessors (new names, always strings)
10270
10272
  inputText: context.evalCase.question,
10271
- outputText: context.candidate,
10272
10273
  expectedOutputText: context.evalCase.reference_answer ?? ""
10273
10274
  };
10274
10275
  const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -10436,13 +10437,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
10436
10437
  {{${TEMPLATE_VARIABLES.CRITERIA}}}
10437
10438
 
10438
10439
  [[ ## question ## ]]
10439
- {{${TEMPLATE_VARIABLES.QUESTION}}}
10440
+ {{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
10440
10441
 
10441
10442
  [[ ## reference_answer ## ]]
10442
- {{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
10443
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
10443
10444
 
10444
10445
  [[ ## answer ## ]]
10445
- {{${TEMPLATE_VARIABLES.ANSWER}}}`;
10446
+ {{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
10446
10447
  var freeformEvaluationSchema = z3.object({
10447
10448
  score: z3.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
10448
10449
  assertions: z3.array(
@@ -10520,12 +10521,8 @@ var LlmGraderEvaluator = class {
10520
10521
  2
10521
10522
  ),
10522
10523
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context.output ?? [], null, 2),
10523
- [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
10524
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
10525
10524
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
10526
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
10527
10525
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
10528
- // Text convenience accessors (new names, always strings)
10529
10526
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
10530
10527
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
10531
10528
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
@@ -10830,10 +10827,10 @@ ${context.fileChanges}`;
10830
10827
  buildAgentUserPrompt(context) {
10831
10828
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
10832
10829
  const variables = {
10833
- [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
10834
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
10835
10830
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
10836
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
10831
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
10832
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
10833
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim(),
10837
10834
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? ""
10838
10835
  };
10839
10836
  if (this.evaluatorTemplate) {
@@ -10886,10 +10883,10 @@ ${context.fileChanges}`;
10886
10883
  const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10887
10884
  if (this.evaluatorTemplate) {
10888
10885
  const variables = {
10889
- [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
10890
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
10891
10886
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
10892
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
10887
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
10888
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
10889
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim(),
10893
10890
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? ""
10894
10891
  };
10895
10892
  const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
@@ -12317,7 +12314,10 @@ var COPILOT_MATCHER = {
12317
12314
  skillTools: ["Skill", "skill"],
12318
12315
  skillInputField: "skill",
12319
12316
  readTools: ["Read File", "readFile", "Read", "readTextFile"],
12320
- readInputField: "file_path"
12317
+ readInputField: "file_path",
12318
+ skillToolPrefixes: ["Using skill: "],
12319
+ readToolPrefixes: ["Viewing "],
12320
+ readInputFields: ["file_path", "path"]
12321
12321
  };
12322
12322
  var PROVIDER_TOOL_SEMANTICS = {
12323
12323
  claude: CLAUDE_MATCHER,
@@ -12359,12 +12359,22 @@ var SkillTriggerEvaluator = class {
12359
12359
  triggered = true;
12360
12360
  evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
12361
12361
  }
12362
+ } else if (matcher.skillToolPrefixes?.some(
12363
+ (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
12364
+ )) {
12365
+ triggered = true;
12366
+ evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
12362
12367
  } else if (matcher.readTools.includes(firstTool.tool)) {
12363
- const filePath = String(input[matcher.readInputField] ?? "");
12368
+ const filePath = this.readPathFromInput(input, matcher);
12364
12369
  if (filePath.includes(skillName)) {
12365
12370
  triggered = true;
12366
12371
  evidence = `Read tool loaded skill file: ${filePath}`;
12367
12372
  }
12373
+ } else if (matcher.readToolPrefixes?.some(
12374
+ (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
12375
+ )) {
12376
+ triggered = true;
12377
+ evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
12368
12378
  }
12369
12379
  }
12370
12380
  const pass = triggered === shouldTrigger;
@@ -12393,6 +12403,16 @@ var SkillTriggerEvaluator = class {
12393
12403
  expectedAspectCount: 1
12394
12404
  };
12395
12405
  }
12406
+ readPathFromInput(input, matcher) {
12407
+ const fields = matcher.readInputFields ?? [matcher.readInputField];
12408
+ for (const field of fields) {
12409
+ const value = input[field];
12410
+ if (value !== void 0 && value !== null) {
12411
+ return String(value);
12412
+ }
12413
+ }
12414
+ return "";
12415
+ }
12396
12416
  };
12397
12417
 
12398
12418
  // src/evaluation/evaluators/llm-grader-prompt.ts
@@ -12427,12 +12447,8 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
12427
12447
  [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
12428
12448
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
12429
12449
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
12430
- [TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
12431
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
12432
12450
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
12433
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
12434
12451
  [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
12435
- // Text convenience accessors (new names, always strings)
12436
12452
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
12437
12453
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
12438
12454
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
@@ -13489,11 +13505,9 @@ async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
13489
13505
  }
13490
13506
  async function executePromptTemplate(script, context, config, timeoutMs) {
13491
13507
  const payload = {
13492
- question: context.evalCase.question,
13493
13508
  criteria: context.evalCase.criteria,
13494
13509
  expectedOutput: context.evalCase.expected_output,
13495
- referenceAnswer: context.evalCase.reference_answer,
13496
- answer: context.candidate,
13510
+ outputText: context.candidate,
13497
13511
  output: context.output ?? null,
13498
13512
  guidelineFiles: context.evalCase.guideline_paths,
13499
13513
  inputFiles: context.evalCase.file_paths.filter(
@@ -13504,9 +13518,7 @@ async function executePromptTemplate(script, context, config, timeoutMs) {
13504
13518
  fileChanges: context.fileChanges ?? null,
13505
13519
  workspacePath: context.workspacePath ?? null,
13506
13520
  config: config ?? context.config ?? null,
13507
- // Text convenience accessors (new names, always strings)
13508
13521
  inputText: context.evalCase.question,
13509
- outputText: context.candidate,
13510
13522
  expectedOutputText: context.evalCase.reference_answer ?? ""
13511
13523
  };
13512
13524
  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -15170,7 +15182,7 @@ async function runEvaluation(options) {
15170
15182
  dataset: evalCase.dataset,
15171
15183
  score: 0,
15172
15184
  assertions: [],
15173
- answer: "",
15185
+ outputText: "",
15174
15186
  target: target.name,
15175
15187
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
15176
15188
  budgetExceeded: true,
@@ -15206,7 +15218,7 @@ async function runEvaluation(options) {
15206
15218
  dataset: evalCase.dataset,
15207
15219
  score: 0,
15208
15220
  assertions: [],
15209
- answer: "",
15221
+ outputText: "",
15210
15222
  target: target.name,
15211
15223
  error: errorMsg,
15212
15224
  executionStatus: "execution_error",
@@ -16173,7 +16185,7 @@ async function evaluateCandidate(options) {
16173
16185
  conversationId: evalCase.conversation_id,
16174
16186
  score: score.score,
16175
16187
  assertions: score.assertions,
16176
- answer: candidate,
16188
+ outputText: candidate,
16177
16189
  target: target.name,
16178
16190
  tokenUsage,
16179
16191
  costUsd,
@@ -16529,7 +16541,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
16529
16541
  conversationId: evalCase.conversation_id,
16530
16542
  score: 0,
16531
16543
  assertions: [{ text: `Error: ${message}`, passed: false }],
16532
- answer: `Error occurred: ${message}`,
16544
+ outputText: `Error occurred: ${message}`,
16533
16545
  target: targetName,
16534
16546
  requests,
16535
16547
  input,
@@ -17067,7 +17079,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
17067
17079
 
17068
17080
  // src/evaluation/baseline.ts
17069
17081
  var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
17070
- "answer",
17082
+ "outputText",
17071
17083
  "requests",
17072
17084
  "trace",
17073
17085
  "workspacePath",
@@ -17241,7 +17253,7 @@ var OtelTraceExporter = class {
17241
17253
  rootSpan.setAttribute("agentv.target", result.target);
17242
17254
  if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
17243
17255
  rootSpan.setAttribute("agentv.score", result.score);
17244
- if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
17256
+ if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
17245
17257
  if (result.durationMs != null)
17246
17258
  rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
17247
17259
  if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);