agentv 3.2.4 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
301
301
  }
302
302
  });
303
303
 
304
- // ../../packages/core/dist/chunk-5SQK5FXC.js
304
+ // ../../packages/core/dist/chunk-C4MKEQR5.js
305
305
  import { constants } from "node:fs";
306
306
  import { access, readFile } from "node:fs/promises";
307
307
  import path from "node:path";
@@ -419,7 +419,7 @@ __export(external_exports2, {
419
419
  void: () => voidType
420
420
  });
421
421
 
422
- // ../../packages/core/dist/chunk-5SQK5FXC.js
422
+ // ../../packages/core/dist/chunk-C4MKEQR5.js
423
423
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
424
424
  var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
425
425
  var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
@@ -14657,19 +14657,26 @@ function logWarning(message) {
14657
14657
  console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET3}`);
14658
14658
  }
14659
14659
  // Names of the placeholders an evaluator template may reference; the
  // validation error message in this file renders them as `{{ name }}`.
  // ANSWER, QUESTION and REFERENCE_ANSWER are marked deprecated in favour of
  // OUTPUT_TEXT, INPUT_TEXT and EXPECTED_OUTPUT_TEXT, but remain defined here.
  var TEMPLATE_VARIABLES = {
14660
+ /** @deprecated Use OUTPUT_TEXT instead */
14660
14661
  ANSWER: "answer",
14661
14662
  EXPECTED_OUTPUT: "expected_output",
14663
+ /** @deprecated Use INPUT_TEXT instead */
14662
14664
  QUESTION: "question",
14663
14665
  CRITERIA: "criteria",
14666
+ /** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
14664
14667
  REFERENCE_ANSWER: "reference_answer",
14665
14668
  INPUT: "input",
14666
14669
  OUTPUT: "output",
14667
- FILE_CHANGES: "file_changes"
14670
+ FILE_CHANGES: "file_changes",
14671
+ INPUT_TEXT: "input_text",
14672
+ OUTPUT_TEXT: "output_text",
14673
+ EXPECTED_OUTPUT_TEXT: "expected_output_text"
14668
14674
  };
14669
14675
  // Every placeholder name defined above, collected into a Set.
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
14670
14676
  // Placeholder names that count as "required" fields: the candidate output
  // (old and new name) and the expected output.
  // NOTE(review): presumably consulted by template validation - the visible
  // validator checks TEMPLATE_VARIABLES members directly; confirm where this
  // Set is read.
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
14671
14677
  TEMPLATE_VARIABLES.ANSWER,
14672
- TEMPLATE_VARIABLES.EXPECTED_OUTPUT
14678
+ TEMPLATE_VARIABLES.EXPECTED_OUTPUT,
14679
+ TEMPLATE_VARIABLES.OUTPUT_TEXT
14673
14680
  ]);
14674
14681
  var ANSI_YELLOW3 = "\x1B[33m";
14675
14682
  var ANSI_RESET4 = "\x1B[0m";
@@ -14690,13 +14697,13 @@ function validateTemplateVariables(content, source) {
14690
14697
  }
14691
14698
  match = variablePattern.exec(content);
14692
14699
  }
14693
- const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.ANSWER);
14700
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.ANSWER) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
14694
14701
  const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
14695
14702
  const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
14696
14703
  if (!hasRequiredFields) {
14697
14704
  throw new Error(
14698
14705
  `Missing required fields. Must include at least one of:
14699
- - {{ ${TEMPLATE_VARIABLES.ANSWER} }}
14706
+ - {{ ${TEMPLATE_VARIABLES.ANSWER} }} or {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
14700
14707
  - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
14701
14708
  );
14702
14709
  }
@@ -17856,7 +17863,14 @@ async function invokeModel(options) {
17856
17863
  function mapResponse(result) {
17857
17864
  const content = result.text ?? "";
17858
17865
  const rawUsage = result.totalUsage ?? result.usage;
17859
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
17866
+ const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
17867
+ const cached = rawUsage?.inputTokenDetails?.cacheReadTokens ?? void 0;
17868
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? {
17869
+ input: rawUsage.inputTokens,
17870
+ output: rawUsage.outputTokens,
17871
+ ...reasoning != null ? { reasoning } : {},
17872
+ ...cached != null ? { cached } : {}
17873
+ } : void 0;
17860
17874
  return {
17861
17875
  raw: result,
17862
17876
  usage: toJsonObject(rawUsage),
@@ -18191,10 +18205,12 @@ var ClaudeCliProvider = class {
18191
18205
  if (usage) {
18192
18206
  const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
18193
18207
  const outputTokens = usage.output_tokens ?? 0;
18208
+ const reasoningTokens = usage.reasoning_tokens ?? void 0;
18194
18209
  tokenUsage = {
18195
18210
  input: inputTokens,
18196
18211
  output: outputTokens,
18197
- cached: usage.cache_read_input_tokens ?? void 0
18212
+ cached: usage.cache_read_input_tokens ?? void 0,
18213
+ reasoning: reasoningTokens
18198
18214
  };
18199
18215
  request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
18200
18216
  }
@@ -23859,7 +23875,11 @@ var CodeEvaluator = class {
23859
23875
  endTime: context2.endTime ?? null,
23860
23876
  fileChanges: context2.fileChanges ?? null,
23861
23877
  workspacePath: context2.workspacePath ?? null,
23862
- config: this.config ?? null
23878
+ config: this.config ?? null,
23879
+ // Text convenience accessors (new names, always strings)
23880
+ inputText: context2.evalCase.question,
23881
+ outputText: context2.candidate,
23882
+ expectedOutputText: context2.evalCase.reference_answer ?? ""
23863
23883
  };
23864
23884
  const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
23865
23885
  let proxyEnv;
@@ -24101,7 +24121,11 @@ var LlmGraderEvaluator = class {
24101
24121
  [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
24102
24122
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
24103
24123
  [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
24104
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
24124
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
24125
+ // Text convenience accessors (new names, always strings)
24126
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
24127
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
24128
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
24105
24129
  };
24106
24130
  const systemPrompt = buildOutputSchema();
24107
24131
  const evaluatorTemplate = context2.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
@@ -25941,28 +25965,60 @@ var LatencyEvaluator = class {
25941
25965
  };
25942
25966
  }
25943
25967
  };
25968
// Tool-name matcher consumed by SkillTriggerEvaluator:
//   skillTools      - tool names treated as a skill invocation
//   skillInputField - key of the tool input that holds the skill name
//   readTools       - tool names treated as a file read
//   readInputField  - key of the tool input that holds the file path
+ var CLAUDE_MATCHER = {
25969
+ skillTools: ["Skill"],
25970
+ skillInputField: "skill",
25971
+ readTools: ["Read"],
25972
+ readInputField: "file_path"
25973
+ };
25974
// Same shape as CLAUDE_MATCHER, but accepts several spellings of the skill
// and read tool names.
+ var COPILOT_MATCHER = {
25975
+ skillTools: ["Skill", "skill"],
25976
+ skillInputField: "skill",
25977
+ readTools: ["Read File", "readFile", "Read", "readTextFile"],
25978
+ readInputField: "file_path"
25979
+ };
25980
// Maps a provider kind string to the matcher used for its tool calls.
// Kinds not listed here fall back to CLAUDE_MATCHER in resolveMatcher.
+ var PROVIDER_TOOL_SEMANTICS = {
25981
+ claude: CLAUDE_MATCHER,
25982
+ "claude-cli": CLAUDE_MATCHER,
25983
+ "claude-sdk": CLAUDE_MATCHER,
25984
+ "pi-coding-agent": CLAUDE_MATCHER,
25985
+ "pi-agent-sdk": CLAUDE_MATCHER,
25986
+ "copilot-cli": COPILOT_MATCHER,
25987
+ "copilot-sdk": COPILOT_MATCHER,
25988
+ vscode: COPILOT_MATCHER,
25989
+ "vscode-insiders": COPILOT_MATCHER
25990
+ };
25944
25991
  var SkillTriggerEvaluator = class {
25945
25992
  kind = "skill-trigger";
25946
25993
  config;
25947
25994
  constructor(config) {
25948
25995
  this.config = config;
25949
25996
  }
25997
+ resolveMatcher(providerKind) {
25998
+ if (providerKind) {
25999
+ const match = PROVIDER_TOOL_SEMANTICS[providerKind];
26000
+ if (match) return match;
26001
+ }
26002
+ return CLAUDE_MATCHER;
26003
+ }
25950
26004
  evaluate(context2) {
25951
26005
  const skillName = this.config.skill;
25952
26006
  const shouldTrigger = this.config.should_trigger !== false;
26007
+ const providerKind = context2.provider?.kind;
26008
+ const matcher = this.resolveMatcher(providerKind);
25953
26009
  const firstTool = (context2.output ?? []).flatMap((msg) => msg.toolCalls ?? [])[0];
25954
26010
  let triggered = false;
25955
26011
  let evidence = "";
25956
26012
  if (firstTool) {
25957
26013
  const input = firstTool.input ?? {};
25958
- if (firstTool.tool === "Skill") {
25959
- const skillArg = String(input.skill ?? "");
26014
+ if (matcher.skillTools.includes(firstTool.tool)) {
26015
+ const skillArg = String(input[matcher.skillInputField] ?? "");
25960
26016
  if (skillArg.includes(skillName)) {
25961
26017
  triggered = true;
25962
- evidence = `Skill tool invoked with skill="${skillArg}"`;
26018
+ evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
25963
26019
  }
25964
- } else if (firstTool.tool === "Read") {
25965
- const filePath = String(input.file_path ?? "");
26020
+ } else if (matcher.readTools.includes(firstTool.tool)) {
26021
+ const filePath = String(input[matcher.readInputField] ?? "");
25966
26022
  if (filePath.includes(skillName)) {
25967
26023
  triggered = true;
25968
26024
  evidence = `Read tool loaded skill file: ${filePath}`;
@@ -25987,7 +26043,7 @@ var SkillTriggerEvaluator = class {
25987
26043
  verdict: "fail",
25988
26044
  hits: [],
25989
26045
  misses: [
25990
- shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not Skill/Read for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`
26046
+ shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`
25991
26047
  ],
25992
26048
  expectedAspectCount: 1,
25993
26049
  reasoning: shouldTrigger ? `Skill "${skillName}" was not triggered` : "False trigger: skill fired when it should not have"
@@ -26029,7 +26085,11 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
26029
26085
  [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
26030
26086
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
26031
26087
  [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
26032
- [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? ""
26088
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
26089
+ // Text convenience accessors (new names, always strings)
26090
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
26091
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
26092
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
26033
26093
  };
26034
26094
  const systemPrompt = buildOutputSchema();
26035
26095
  const template = evaluatorTemplateOverride ?? DEFAULT_EVALUATOR_TEMPLATE;
@@ -27020,7 +27080,11 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
27020
27080
  trace: context2.trace ?? null,
27021
27081
  fileChanges: context2.fileChanges ?? null,
27022
27082
  workspacePath: context2.workspacePath ?? null,
27023
- config: config ?? context2.config ?? null
27083
+ config: config ?? context2.config ?? null,
27084
+ // Text convenience accessors (new names, always strings)
27085
+ inputText: context2.evalCase.question,
27086
+ outputText: context2.candidate,
27087
+ expectedOutputText: context2.evalCase.reference_answer ?? ""
27024
27088
  };
27025
27089
  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
27026
27090
  const scriptPath = script[script.length - 1];
@@ -29279,6 +29343,7 @@ async function runEvalCase(options) {
29279
29343
  } catch {
29280
29344
  }
29281
29345
  }
29346
+ const caseStartMs = Date.now();
29282
29347
  const attemptBudget = (maxRetries ?? 0) + 1;
29283
29348
  let attempt = 0;
29284
29349
  let providerResponse = cachedResponse;
@@ -29427,9 +29492,22 @@ async function runEvalCase(options) {
29427
29492
  fileChanges,
29428
29493
  workspacePath
29429
29494
  });
29495
+ const totalDurationMs = Date.now() - caseStartMs;
29496
+ const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
29497
+ const evalRunTokenUsage = tokenUsage || graderTokens ? {
29498
+ input: (tokenUsage?.input ?? 0) + (graderTokens?.input ?? 0),
29499
+ output: (tokenUsage?.output ?? 0) + (graderTokens?.output ?? 0),
29500
+ ...tokenUsage?.reasoning != null || graderTokens?.reasoning != null ? { reasoning: (tokenUsage?.reasoning ?? 0) + (graderTokens?.reasoning ?? 0) } : {},
29501
+ ...tokenUsage?.cached != null || graderTokens?.cached != null ? { cached: (tokenUsage?.cached ?? 0) + (graderTokens?.cached ?? 0) } : {}
29502
+ } : void 0;
29503
+ const evalRun = {
29504
+ durationMs: totalDurationMs,
29505
+ ...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
29506
+ };
29430
29507
  const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
29431
29508
  const finalResult = providerError ? {
29432
29509
  ...result,
29510
+ evalRun,
29433
29511
  error: providerError,
29434
29512
  executionStatus,
29435
29513
  failureStage: "agent",
@@ -29438,7 +29516,7 @@ async function runEvalCase(options) {
29438
29516
  beforeAllOutput,
29439
29517
  beforeEachOutput,
29440
29518
  afterEachOutput
29441
- } : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
29519
+ } : { ...result, evalRun, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
29442
29520
  const isFailure = !!finalResult.error || finalResult.score < 0.5;
29443
29521
  if (workspacePath && !isSharedWorkspace) {
29444
29522
  if (forceCleanup) {
@@ -29458,6 +29536,7 @@ async function runEvalCase(options) {
29458
29536
  }
29459
29537
  return finalResult;
29460
29538
  } catch (error) {
29539
+ const evalRun = { durationMs: Date.now() - caseStartMs };
29461
29540
  const errorResult = buildErrorResult(
29462
29541
  evalCase,
29463
29542
  target.name,
@@ -29473,10 +29552,10 @@ async function runEvalCase(options) {
29473
29552
  await cleanupWorkspace(workspacePath).catch(() => {
29474
29553
  });
29475
29554
  } else {
29476
- return { ...errorResult, workspacePath, beforeEachOutput, afterEachOutput };
29555
+ return { ...errorResult, evalRun, workspacePath, beforeEachOutput, afterEachOutput };
29477
29556
  }
29478
29557
  }
29479
- return { ...errorResult, beforeEachOutput, afterEachOutput };
29558
+ return { ...errorResult, evalRun, beforeEachOutput, afterEachOutput };
29480
29559
  }
29481
29560
  }
29482
29561
  async function runEvalCaseWithTrials(options, trialsConfig) {
@@ -30051,6 +30130,44 @@ function buildResultInput(promptInputs) {
30051
30130
  }
30052
30131
  return promptInputs.question;
30053
30132
  }
30133
/**
 * Sums the token usage reported by evaluator scores, descending into any
 * nested `scores` arrays.
 *
 * @param {Array<object> | undefined | null} scores - Score entries; each may
 *   carry a `tokenUsage` object ({ input, output, reasoning?, cached? }) and a
 *   nested `scores` list.
 * @returns {{ input: number, output: number, reasoning?: number, cached?: number } | undefined}
 *   The combined usage, or `undefined` when no entry reports `tokenUsage`.
 *   `reasoning` / `cached` are present only if at least one entry reported them.
 */
function aggregateEvaluatorTokenUsage(scores) {
  if (!scores || scores.length === 0) return void 0;

  // A missing or non-numeric counter contributes 0, so a partially filled
  // usage object cannot turn the whole total into NaN.
  const toCount = (value) => (Number.isFinite(value) ? value : 0);

  let hasAny = false;
  let hasReasoning = false;
  let hasCached = false;
  let input = 0;
  let output = 0;
  let reasoning = 0;
  let cached = 0;

  const visit = (items) => {
    for (const item of items) {
      // Tolerate holes / nullish entries in a score list.
      if (item == null) continue;

      const usage = item.tokenUsage;
      if (usage) {
        hasAny = true;
        input += toCount(usage.input);
        output += toCount(usage.output);
        if (usage.reasoning != null) {
          hasReasoning = true;
          reasoning += toCount(usage.reasoning);
        }
        if (usage.cached != null) {
          hasCached = true;
          cached += toCount(usage.cached);
        }
      }

      if (item.scores) {
        visit(item.scores);
      }
    }
  };

  visit(scores);
  if (!hasAny) return void 0;

  return {
    input,
    output,
    ...(hasReasoning ? { reasoning } : {}),
    ...(hasCached ? { cached } : {})
  };
}
30054
30171
  function isTimeoutLike(error) {
30055
30172
  if (!error) {
30056
30173
  return false;
@@ -31086,4 +31203,4 @@ export {
31086
31203
  OtelStreamingObserver,
31087
31204
  createAgentKernel
31088
31205
  };
31089
- //# sourceMappingURL=chunk-VBGYESW7.js.map
31206
+ //# sourceMappingURL=chunk-5M3K2DMV.js.map