agentv 3.5.0 → 3.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
301
301
  }
302
302
  });
303
303
 
304
- // ../../packages/core/dist/chunk-EFR4JHPL.js
304
+ // ../../packages/core/dist/chunk-2IZOTQ25.js
305
305
  import { constants } from "node:fs";
306
306
  import { access, readFile } from "node:fs/promises";
307
307
  import path from "node:path";
@@ -419,7 +419,7 @@ __export(external_exports2, {
419
419
  void: () => voidType
420
420
  });
421
421
 
422
- // ../../packages/core/dist/chunk-EFR4JHPL.js
422
+ // ../../packages/core/dist/chunk-2IZOTQ25.js
423
423
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
424
424
  var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
425
425
  var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
@@ -14655,14 +14655,8 @@ function logWarning(message) {
14655
14655
  console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET3}`);
14656
14656
  }
14657
14657
  var TEMPLATE_VARIABLES = {
14658
- /** @deprecated Use OUTPUT_TEXT instead */
14659
- ANSWER: "answer",
14660
14658
  EXPECTED_OUTPUT: "expected_output",
14661
- /** @deprecated Use INPUT_TEXT instead */
14662
- QUESTION: "question",
14663
14659
  CRITERIA: "criteria",
14664
- /** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
14665
- REFERENCE_ANSWER: "reference_answer",
14666
14660
  INPUT: "input",
14667
14661
  OUTPUT: "output",
14668
14662
  FILE_CHANGES: "file_changes",
@@ -14672,9 +14666,8 @@ var TEMPLATE_VARIABLES = {
14672
14666
  };
14673
14667
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
14674
14668
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
14675
- TEMPLATE_VARIABLES.ANSWER,
14676
- TEMPLATE_VARIABLES.EXPECTED_OUTPUT,
14677
- TEMPLATE_VARIABLES.OUTPUT_TEXT
14669
+ TEMPLATE_VARIABLES.OUTPUT_TEXT,
14670
+ TEMPLATE_VARIABLES.EXPECTED_OUTPUT
14678
14671
  ]);
14679
14672
  var ANSI_YELLOW3 = "\x1B[33m";
14680
14673
  var ANSI_RESET4 = "\x1B[0m";
@@ -14695,13 +14688,13 @@ function validateTemplateVariables(content, source) {
14695
14688
  }
14696
14689
  match = variablePattern.exec(content);
14697
14690
  }
14698
- const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.ANSWER) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
14691
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
14699
14692
  const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
14700
14693
  const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
14701
14694
  if (!hasRequiredFields) {
14702
14695
  throw new Error(
14703
14696
  `Missing required fields. Must include at least one of:
14704
- - {{ ${TEMPLATE_VARIABLES.ANSWER} }} or {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
14697
+ - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
14705
14698
  - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
14706
14699
  );
14707
14700
  }
@@ -17843,6 +17836,8 @@ async function invokeModel(options) {
17843
17836
  const { model, request, defaults, retryConfig, providerOptions } = options;
17844
17837
  const chatPrompt = buildChatPrompt(request);
17845
17838
  const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
17839
+ const startTime = (/* @__PURE__ */ new Date()).toISOString();
17840
+ const startMs = Date.now();
17846
17841
  const result = await withRetry(
17847
17842
  () => generateText({
17848
17843
  model,
@@ -17856,9 +17851,11 @@ async function invokeModel(options) {
17856
17851
  retryConfig,
17857
17852
  request.signal
17858
17853
  );
17859
- return mapResponse(result);
17854
+ const endTime = (/* @__PURE__ */ new Date()).toISOString();
17855
+ const durationMs = Date.now() - startMs;
17856
+ return mapResponse(result, { durationMs, startTime, endTime });
17860
17857
  }
17861
- function mapResponse(result) {
17858
+ function mapResponse(result, timing) {
17862
17859
  const content = result.text ?? "";
17863
17860
  const rawUsage = result.totalUsage ?? result.usage;
17864
17861
  const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
@@ -17873,7 +17870,10 @@ function mapResponse(result) {
17873
17870
  raw: result,
17874
17871
  usage: toJsonObject(rawUsage),
17875
17872
  output: [{ role: "assistant", content }],
17876
- tokenUsage
17873
+ tokenUsage,
17874
+ durationMs: timing?.durationMs,
17875
+ startTime: timing?.startTime,
17876
+ endTime: timing?.endTime
17877
17877
  };
17878
17878
  }
17879
17879
  function toJsonObject(value) {
@@ -18731,10 +18731,12 @@ var ClaudeSdkProvider = class {
18731
18731
  if (usage) {
18732
18732
  const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
18733
18733
  const outputTokens = usage.output_tokens ?? 0;
18734
+ const reasoningTokens = usage.reasoning_tokens ?? void 0;
18734
18735
  tokenUsage = {
18735
18736
  input: inputTokens,
18736
18737
  output: outputTokens,
18737
- cached: usage.cache_read_input_tokens ?? void 0
18738
+ cached: usage.cache_read_input_tokens ?? void 0,
18739
+ reasoning: reasoningTokens
18738
18740
  };
18739
18741
  request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
18740
18742
  }
@@ -19730,7 +19732,8 @@ ${basePrompt}` : basePrompt;
19730
19732
  onUsage({
19731
19733
  input: usage.input_tokens ?? 0,
19732
19734
  output: usage.output_tokens ?? 0,
19733
- cached: usage.cached_input_tokens ?? void 0
19735
+ cached: usage.cached_input_tokens ?? void 0,
19736
+ reasoning: usage.reasoning_tokens ?? void 0
19734
19737
  });
19735
19738
  }
19736
19739
  }
@@ -21698,10 +21701,12 @@ function extractTokenUsage(events) {
21698
21701
  output: output ?? 0
21699
21702
  };
21700
21703
  const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
21701
- if (cached !== void 0) {
21702
- return { ...result, cached };
21703
- }
21704
- return result;
21704
+ const reasoning = toFiniteNumber(u.reasoning_tokens ?? u.reasoningTokens ?? u.reasoning);
21705
+ return {
21706
+ ...result,
21707
+ ...cached !== void 0 ? { cached } : {},
21708
+ ...reasoning !== void 0 ? { reasoning } : {}
21709
+ };
21705
21710
  }
21706
21711
  }
21707
21712
  const messages = record.messages;
@@ -23927,11 +23932,9 @@ var CodeEvaluator = class {
23927
23932
  }
23928
23933
  }
23929
23934
  const payload = {
23930
- question: context2.evalCase.question,
23931
23935
  criteria: context2.evalCase.criteria,
23932
23936
  expectedOutput: context2.evalCase.expected_output,
23933
- referenceAnswer: context2.evalCase.reference_answer,
23934
- answer: context2.candidate,
23937
+ outputText: context2.candidate,
23935
23938
  output: outputForPayload,
23936
23939
  outputPath,
23937
23940
  guidelineFiles: context2.evalCase.guideline_paths,
@@ -23948,9 +23951,7 @@ var CodeEvaluator = class {
23948
23951
  fileChanges: context2.fileChanges ?? null,
23949
23952
  workspacePath: context2.workspacePath ?? null,
23950
23953
  config: this.config ?? null,
23951
- // Text convenience accessors (new names, always strings)
23952
23954
  inputText: context2.evalCase.question,
23953
- outputText: context2.candidate,
23954
23955
  expectedOutputText: context2.evalCase.reference_answer ?? ""
23955
23956
  };
23956
23957
  const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -24109,13 +24110,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
24109
24110
  {{${TEMPLATE_VARIABLES.CRITERIA}}}
24110
24111
 
24111
24112
  [[ ## question ## ]]
24112
- {{${TEMPLATE_VARIABLES.QUESTION}}}
24113
+ {{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
24113
24114
 
24114
24115
  [[ ## reference_answer ## ]]
24115
- {{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
24116
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
24116
24117
 
24117
24118
  [[ ## answer ## ]]
24118
- {{${TEMPLATE_VARIABLES.ANSWER}}}`;
24119
+ {{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
24119
24120
  var freeformEvaluationSchema = external_exports2.object({
24120
24121
  score: external_exports2.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
24121
24122
  assertions: external_exports2.array(
@@ -24193,12 +24194,8 @@ var LlmGraderEvaluator = class {
24193
24194
  2
24194
24195
  ),
24195
24196
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
24196
- [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
24197
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
24198
24197
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
24199
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
24200
24198
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
24201
- // Text convenience accessors (new names, always strings)
24202
24199
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
24203
24200
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
24204
24201
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
@@ -24503,10 +24500,10 @@ ${context2.fileChanges}`;
24503
24500
  buildAgentUserPrompt(context2) {
24504
24501
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
24505
24502
  const variables = {
24506
- [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
24507
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
24508
24503
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
24509
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
24504
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
24505
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
24506
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
24510
24507
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
24511
24508
  };
24512
24509
  if (this.evaluatorTemplate) {
@@ -24559,10 +24556,10 @@ ${context2.fileChanges}`;
24559
24556
  const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
24560
24557
  if (this.evaluatorTemplate) {
24561
24558
  const variables = {
24562
- [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
24563
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
24564
24559
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
24565
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
24560
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
24561
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
24562
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
24566
24563
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
24567
24564
  };
24568
24565
  const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
@@ -25978,7 +25975,10 @@ var COPILOT_MATCHER = {
25978
25975
  skillTools: ["Skill", "skill"],
25979
25976
  skillInputField: "skill",
25980
25977
  readTools: ["Read File", "readFile", "Read", "readTextFile"],
25981
- readInputField: "file_path"
25978
+ readInputField: "file_path",
25979
+ skillToolPrefixes: ["Using skill: "],
25980
+ readToolPrefixes: ["Viewing "],
25981
+ readInputFields: ["file_path", "path"]
25982
25982
  };
25983
25983
  var PROVIDER_TOOL_SEMANTICS = {
25984
25984
  claude: CLAUDE_MATCHER,
@@ -26020,12 +26020,22 @@ var SkillTriggerEvaluator = class {
26020
26020
  triggered = true;
26021
26021
  evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
26022
26022
  }
26023
+ } else if (matcher.skillToolPrefixes?.some(
26024
+ (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
26025
+ )) {
26026
+ triggered = true;
26027
+ evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
26023
26028
  } else if (matcher.readTools.includes(firstTool.tool)) {
26024
- const filePath = String(input[matcher.readInputField] ?? "");
26029
+ const filePath = this.readPathFromInput(input, matcher);
26025
26030
  if (filePath.includes(skillName)) {
26026
26031
  triggered = true;
26027
26032
  evidence = `Read tool loaded skill file: ${filePath}`;
26028
26033
  }
26034
+ } else if (matcher.readToolPrefixes?.some(
26035
+ (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
26036
+ )) {
26037
+ triggered = true;
26038
+ evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
26029
26039
  }
26030
26040
  }
26031
26041
  const pass = triggered === shouldTrigger;
@@ -26054,6 +26064,16 @@ var SkillTriggerEvaluator = class {
26054
26064
  expectedAspectCount: 1
26055
26065
  };
26056
26066
  }
26067
+ readPathFromInput(input, matcher) {
26068
+ const fields = matcher.readInputFields ?? [matcher.readInputField];
26069
+ for (const field of fields) {
26070
+ const value = input[field];
26071
+ if (value !== void 0 && value !== null) {
26072
+ return String(value);
26073
+ }
26074
+ }
26075
+ return "";
26076
+ }
26057
26077
  };
26058
26078
  function assembleLlmGraderPrompt(input) {
26059
26079
  const {
@@ -26086,12 +26106,8 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
26086
26106
  [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
26087
26107
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
26088
26108
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
26089
- [TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
26090
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
26091
26109
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
26092
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
26093
26110
  [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
26094
- // Text convenience accessors (new names, always strings)
26095
26111
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
26096
26112
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
26097
26113
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
@@ -27125,11 +27141,9 @@ async function resolveCustomPrompt(promptConfig, context2, timeoutMs) {
27125
27141
  }
27126
27142
  async function executePromptTemplate(script, context2, config, timeoutMs) {
27127
27143
  const payload = {
27128
- question: context2.evalCase.question,
27129
27144
  criteria: context2.evalCase.criteria,
27130
27145
  expectedOutput: context2.evalCase.expected_output,
27131
- referenceAnswer: context2.evalCase.reference_answer,
27132
- answer: context2.candidate,
27146
+ outputText: context2.candidate,
27133
27147
  output: context2.output ?? null,
27134
27148
  guidelineFiles: context2.evalCase.guideline_paths,
27135
27149
  inputFiles: context2.evalCase.file_paths.filter(
@@ -27140,9 +27154,7 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
27140
27154
  fileChanges: context2.fileChanges ?? null,
27141
27155
  workspacePath: context2.workspacePath ?? null,
27142
27156
  config: config ?? context2.config ?? null,
27143
- // Text convenience accessors (new names, always strings)
27144
27157
  inputText: context2.evalCase.question,
27145
- outputText: context2.candidate,
27146
27158
  expectedOutputText: context2.evalCase.reference_answer ?? ""
27147
27159
  };
27148
27160
  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -28762,7 +28774,7 @@ async function runEvaluation(options) {
28762
28774
  dataset: evalCase.dataset,
28763
28775
  score: 0,
28764
28776
  assertions: [],
28765
- answer: "",
28777
+ outputText: "",
28766
28778
  target: target.name,
28767
28779
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
28768
28780
  budgetExceeded: true,
@@ -28798,7 +28810,7 @@ async function runEvaluation(options) {
28798
28810
  dataset: evalCase.dataset,
28799
28811
  score: 0,
28800
28812
  assertions: [],
28801
- answer: "",
28813
+ outputText: "",
28802
28814
  target: target.name,
28803
28815
  error: errorMsg,
28804
28816
  executionStatus: "execution_error",
@@ -29765,7 +29777,7 @@ async function evaluateCandidate(options) {
29765
29777
  conversationId: evalCase.conversation_id,
29766
29778
  score: score.score,
29767
29779
  assertions: score.assertions,
29768
- answer: candidate,
29780
+ outputText: candidate,
29769
29781
  target: target.name,
29770
29782
  tokenUsage,
29771
29783
  costUsd,
@@ -30121,7 +30133,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
30121
30133
  conversationId: evalCase.conversation_id,
30122
30134
  score: 0,
30123
30135
  assertions: [{ text: `Error: ${message}`, passed: false }],
30124
- answer: `Error occurred: ${message}`,
30136
+ outputText: `Error occurred: ${message}`,
30125
30137
  target: targetName,
30126
30138
  requests,
30127
30139
  input,
@@ -30638,7 +30650,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
30638
30650
  return false;
30639
30651
  }
30640
30652
  var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
30641
- "answer",
30653
+ "outputText",
30642
30654
  "requests",
30643
30655
  "trace",
30644
30656
  "workspacePath",
@@ -30810,7 +30822,7 @@ var OtelTraceExporter = class {
30810
30822
  rootSpan.setAttribute("agentv.target", result.target);
30811
30823
  if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
30812
30824
  rootSpan.setAttribute("agentv.score", result.score);
30813
- if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
30825
+ if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
30814
30826
  if (result.durationMs != null)
30815
30827
  rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
30816
30828
  if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
@@ -31237,4 +31249,4 @@ export {
31237
31249
  OtelStreamingObserver,
31238
31250
  createAgentKernel
31239
31251
  };
31240
- //# sourceMappingURL=chunk-D6G4N2H2.js.map
31252
+ //# sourceMappingURL=chunk-K4RXLQWV.js.map