agentv 0.26.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -164,7 +164,7 @@ import { access as access6, mkdir as mkdir7 } from "node:fs/promises";
164
164
  import path19 from "node:path";
165
165
  import { pathToFileURL } from "node:url";
166
166
 
167
- // ../../packages/core/dist/chunk-NDEN3H2B.js
167
+ // ../../packages/core/dist/chunk-V3JCB3HI.js
168
168
  import { constants } from "node:fs";
169
169
  import { access, readFile } from "node:fs/promises";
170
170
  import path from "node:path";
@@ -4211,7 +4211,7 @@ var coerce = {
4211
4211
  };
4212
4212
  var NEVER = INVALID;
4213
4213
 
4214
- // ../../packages/core/dist/chunk-NDEN3H2B.js
4214
+ // ../../packages/core/dist/chunk-V3JCB3HI.js
4215
4215
  async function fileExists(filePath) {
4216
4216
  try {
4217
4217
  await access(filePath, constants.F_OK);
@@ -34567,18 +34567,23 @@ function isTestMessage(value) {
34567
34567
  if (typeof candidate.content === "string") {
34568
34568
  return true;
34569
34569
  }
34570
- if (!Array.isArray(candidate.content)) {
34571
- return false;
34570
+ if (Array.isArray(candidate.content) && candidate.content.every(isJsonObject)) {
34571
+ return true;
34572
34572
  }
34573
- return candidate.content.every(isJsonObject);
34573
+ if (Array.isArray(candidate.tool_calls) && candidate.tool_calls.length > 0) {
34574
+ return true;
34575
+ }
34576
+ if (isJsonObject(candidate.content)) {
34577
+ return true;
34578
+ }
34579
+ return false;
34574
34580
  }
34575
34581
  var EVALUATOR_KIND_VALUES = [
34576
34582
  "code_judge",
34577
34583
  "llm_judge",
34578
34584
  "rubric",
34579
34585
  "composite",
34580
- "tool_trajectory",
34581
- "expected_messages"
34586
+ "tool_trajectory"
34582
34587
  ];
34583
34588
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
34584
34589
  function isEvaluatorKind(value) {
@@ -35058,15 +35063,6 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
35058
35063
  });
35059
35064
  continue;
35060
35065
  }
35061
- if (typeValue === "expected_messages") {
35062
- const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
35063
- evaluators.push({
35064
- name: name16,
35065
- type: "expected_messages",
35066
- ...weight2 !== void 0 ? { weight: weight2 } : {}
35067
- });
35068
- continue;
35069
- }
35070
35066
  if (typeValue === "tool_trajectory") {
35071
35067
  const mode = asString2(rawEvaluator.mode);
35072
35068
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
@@ -35317,63 +35313,6 @@ async function processMessages(options) {
35317
35313
  }
35318
35314
  return segments;
35319
35315
  }
35320
- async function resolveAssistantContent(content, searchRoots, verbose) {
35321
- if (typeof content === "string") {
35322
- return content;
35323
- }
35324
- if (!content) {
35325
- return "";
35326
- }
35327
- const parts = [];
35328
- for (const entry of content) {
35329
- if (typeof entry === "string") {
35330
- parts.push({ content: entry, isFile: false });
35331
- continue;
35332
- }
35333
- if (!isJsonObject(entry)) {
35334
- continue;
35335
- }
35336
- const segmentType = asString3(entry.type);
35337
- if (segmentType === "file") {
35338
- const rawValue = asString3(entry.value);
35339
- if (!rawValue) {
35340
- continue;
35341
- }
35342
- const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
35343
- rawValue,
35344
- searchRoots
35345
- );
35346
- if (!resolvedPath) {
35347
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
35348
- logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
35349
- continue;
35350
- }
35351
- try {
35352
- const fileContent = (await readFile32(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
35353
- parts.push({ content: fileContent, isFile: true, displayPath });
35354
- if (verbose) {
35355
- console.log(` [Expected Assistant File] Found: ${displayPath}`);
35356
- console.log(` Resolved to: ${resolvedPath}`);
35357
- }
35358
- } catch (error40) {
35359
- logWarning3(`Could not read file ${resolvedPath}: ${error40.message}`);
35360
- }
35361
- continue;
35362
- }
35363
- const textValue = asString3(entry.text);
35364
- if (typeof textValue === "string") {
35365
- parts.push({ content: textValue, isFile: false });
35366
- continue;
35367
- }
35368
- const valueValue = asString3(entry.value);
35369
- if (typeof valueValue === "string") {
35370
- parts.push({ content: valueValue, isFile: false });
35371
- continue;
35372
- }
35373
- parts.push({ content: JSON.stringify(entry), isFile: false });
35374
- }
35375
- return formatFileContents(parts);
35376
- }
35377
35316
  function asString3(value) {
35378
35317
  return typeof value === "string" ? value : void 0;
35379
35318
  }
@@ -35406,14 +35345,15 @@ ${detailBlock}${ANSI_RESET4}`);
35406
35345
  }
35407
35346
  }
35408
35347
  async function processExpectedMessages(options) {
35409
- const { messages, searchRoots, repoRootPath, verbose } = options;
35348
+ const { messages, searchRoots, verbose } = options;
35410
35349
  const segments = [];
35411
35350
  for (const message of messages) {
35351
+ const extendedMessage = message;
35412
35352
  const segment = {
35413
35353
  role: message.role
35414
35354
  };
35415
- if (message.role === "assistant" && message.tool_calls !== void 0) {
35416
- segment.tool_calls = message.tool_calls;
35355
+ if (extendedMessage.name) {
35356
+ segment.name = extendedMessage.name;
35417
35357
  }
35418
35358
  const content = message.content;
35419
35359
  if (typeof content === "string") {
@@ -35461,6 +35401,13 @@ async function processExpectedMessages(options) {
35461
35401
  processedContent.push(cloneJsonObject(rawSegment));
35462
35402
  }
35463
35403
  segment.content = processedContent;
35404
+ } else if (isJsonObject(content)) {
35405
+ segment.content = cloneJsonObject(content);
35406
+ }
35407
+ if (extendedMessage.tool_calls && Array.isArray(extendedMessage.tool_calls)) {
35408
+ segment.tool_calls = extendedMessage.tool_calls.map(
35409
+ (tc) => isJsonObject(tc) ? cloneJsonObject(tc) : tc
35410
+ );
35464
35411
  }
35465
35412
  segments.push(segment);
35466
35413
  }
@@ -35749,9 +35696,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
35749
35696
  logError(`No valid expected message found for eval case: ${id}`);
35750
35697
  continue;
35751
35698
  }
35752
- if (expectedMessages.length > 1) {
35753
- logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
35754
- }
35755
35699
  const guidelinePaths = [];
35756
35700
  const inputTextParts = [];
35757
35701
  const inputSegments = await processMessages({
@@ -35771,8 +35715,19 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
35771
35715
  verbose
35772
35716
  }) : [];
35773
35717
  const codeSnippets = extractCodeBlocks(inputSegments);
35774
- const expectedContent = expectedMessages[0]?.content;
35775
- const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
35718
+ let referenceAnswer = "";
35719
+ if (outputSegments.length > 1) {
35720
+ referenceAnswer = JSON.stringify(outputSegments, null, 2);
35721
+ } else if (outputSegments.length === 1) {
35722
+ const singleMessage = outputSegments[0];
35723
+ if (typeof singleMessage.content === "string") {
35724
+ referenceAnswer = singleMessage.content;
35725
+ } else if (singleMessage.content) {
35726
+ referenceAnswer = JSON.stringify(singleMessage, null, 2);
35727
+ } else if (singleMessage.tool_calls) {
35728
+ referenceAnswer = JSON.stringify(singleMessage, null, 2);
35729
+ }
35730
+ }
35776
35731
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
35777
35732
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
35778
35733
  let evaluators;
@@ -35827,7 +35782,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
35827
35782
  question,
35828
35783
  input_messages: inputMessages,
35829
35784
  input_segments: inputSegments,
35830
- expected_segments: outputSegments,
35785
+ expected_messages: outputSegments,
35831
35786
  reference_answer: referenceAnswer,
35832
35787
  guideline_paths: guidelinePaths.map((guidelinePath) => path62.resolve(guidelinePath)),
35833
35788
  guideline_patterns: guidelinePatterns,
@@ -37669,7 +37624,7 @@ function createProvider(target) {
37669
37624
  }
37670
37625
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
37671
37626
 
37672
- Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
37627
+ Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
37673
37628
 
37674
37629
  Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
37675
37630
 
@@ -37727,7 +37682,7 @@ var LlmJudgeEvaluator = class {
37727
37682
  const variables = {
37728
37683
  [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
37729
37684
  [TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
37730
- context.evalCase.expected_segments,
37685
+ context.evalCase.expected_messages,
37731
37686
  null,
37732
37687
  2
37733
37688
  ),
@@ -37946,7 +37901,9 @@ var CodeEvaluator = class {
37946
37901
  input_files: context.evalCase.file_paths.filter(
37947
37902
  (path132) => !context.evalCase.guideline_paths.includes(path132)
37948
37903
  ),
37949
- input_messages: context.evalCase.input_messages
37904
+ input_messages: context.evalCase.input_messages,
37905
+ candidate_trace_file: context.candidateTraceRef ?? null,
37906
+ candidate_trace_summary: context.candidateTraceSummary ?? null
37950
37907
  },
37951
37908
  null,
37952
37909
  2
@@ -38212,105 +38169,6 @@ var ToolTrajectoryEvaluator = class {
38212
38169
  };
38213
38170
  }
38214
38171
  };
38215
- var ExpectedMessagesEvaluator = class {
38216
- kind = "expected_messages";
38217
- evaluate(context) {
38218
- const { candidateTrace, evalCase } = context;
38219
- const expectedSegments = evalCase.expected_segments;
38220
- const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
38221
- if (expectedToolCalls.length === 0) {
38222
- return {
38223
- score: 1,
38224
- verdict: "pass",
38225
- hits: ["No tool_calls specified in expected_messages"],
38226
- misses: [],
38227
- expectedAspectCount: 1
38228
- };
38229
- }
38230
- if (!candidateTrace || candidateTrace.length === 0) {
38231
- return {
38232
- score: 0,
38233
- verdict: "fail",
38234
- hits: [],
38235
- misses: ["No trace available to validate tool_calls"],
38236
- expectedAspectCount: expectedToolCalls.length
38237
- };
38238
- }
38239
- const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
38240
- return this.validateToolCalls(expectedToolCalls, actualToolCalls);
38241
- }
38242
- extractExpectedToolCalls(segments) {
38243
- if (!segments) {
38244
- return [];
38245
- }
38246
- const toolCalls = [];
38247
- for (const segment of segments) {
38248
- const role = segment.role;
38249
- const segmentToolCalls = segment.tool_calls;
38250
- if (role === "assistant" && Array.isArray(segmentToolCalls)) {
38251
- for (const tc of segmentToolCalls) {
38252
- if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
38253
- const toolCall = tc;
38254
- toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
38255
- }
38256
- }
38257
- }
38258
- }
38259
- return toolCalls;
38260
- }
38261
- validateToolCalls(expected, actual) {
38262
- const hits = [];
38263
- const misses = [];
38264
- for (let i = 0; i < expected.length; i++) {
38265
- const expectedCall = expected[i];
38266
- const actualCall = actual[i];
38267
- if (!actualCall) {
38268
- misses.push(
38269
- `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
38270
- );
38271
- continue;
38272
- }
38273
- if (actualCall.name !== expectedCall.tool) {
38274
- misses.push(
38275
- `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
38276
- );
38277
- continue;
38278
- }
38279
- if (expectedCall.input !== void 0) {
38280
- if (!this.deepEquals(expectedCall.input, actualCall.input)) {
38281
- misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
38282
- continue;
38283
- }
38284
- }
38285
- hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
38286
- }
38287
- const totalChecks = expected.length || 1;
38288
- const score = hits.length / totalChecks;
38289
- return {
38290
- score,
38291
- verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
38292
- hits,
38293
- misses,
38294
- expectedAspectCount: totalChecks
38295
- };
38296
- }
38297
- deepEquals(a, b) {
38298
- if (a === b) return true;
38299
- if (typeof a !== typeof b) return false;
38300
- if (typeof a !== "object" || a === null || b === null) return false;
38301
- if (Array.isArray(a) && Array.isArray(b)) {
38302
- if (a.length !== b.length) return false;
38303
- return a.every((val, i) => this.deepEquals(val, b[i]));
38304
- }
38305
- if (Array.isArray(a) || Array.isArray(b)) return false;
38306
- const aObj = a;
38307
- const bObj = b;
38308
- const aKeys = Object.keys(aObj);
38309
- const bKeys = Object.keys(bObj);
38310
- if (aKeys.length !== bKeys.length) return false;
38311
- return aKeys.every((key2) => this.deepEquals(aObj[key2], bObj[key2]));
38312
- }
38313
- };
38314
38172
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
38315
38173
  {{EVALUATOR_RESULTS_JSON}}
38316
38174
 
@@ -39061,6 +38919,7 @@ async function runEvalCase(options) {
39061
38919
  judgeProvider,
39062
38920
  agentTimeoutMs,
39063
38921
  candidateTrace,
38922
+ candidateTraceRef: providerResponse.traceRef,
39064
38923
  candidateTraceSummary
39065
38924
  });
39066
38925
  } catch (error40) {
@@ -39080,6 +38939,7 @@ async function evaluateCandidate(options) {
39080
38939
  judgeProvider,
39081
38940
  agentTimeoutMs,
39082
38941
  candidateTrace,
38942
+ candidateTraceRef,
39083
38943
  candidateTraceSummary
39084
38944
  } = options;
39085
38945
  const gradeTimestamp = nowFn();
@@ -39095,6 +38955,7 @@ async function evaluateCandidate(options) {
39095
38955
  judgeProvider,
39096
38956
  agentTimeoutMs,
39097
38957
  candidateTrace,
38958
+ candidateTraceRef,
39098
38959
  candidateTraceSummary
39099
38960
  });
39100
38961
  const completedAt = nowFn();
@@ -39149,6 +39010,7 @@ async function runEvaluatorsForCase(options) {
39149
39010
  judgeProvider,
39150
39011
  agentTimeoutMs,
39151
39012
  candidateTrace,
39013
+ candidateTraceRef,
39152
39014
  candidateTraceSummary
39153
39015
  } = options;
39154
39016
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
@@ -39165,6 +39027,7 @@ async function runEvaluatorsForCase(options) {
39165
39027
  judgeProvider,
39166
39028
  agentTimeoutMs,
39167
39029
  candidateTrace,
39030
+ candidateTraceRef,
39168
39031
  candidateTraceSummary
39169
39032
  });
39170
39033
  }
@@ -39183,6 +39046,7 @@ async function runEvaluatorsForCase(options) {
39183
39046
  now,
39184
39047
  judgeProvider,
39185
39048
  candidateTrace,
39049
+ candidateTraceRef,
39186
39050
  candidateTraceSummary
39187
39051
  });
39188
39052
  return { score };
@@ -39201,6 +39065,7 @@ async function runEvaluatorList(options) {
39201
39065
  judgeProvider,
39202
39066
  agentTimeoutMs,
39203
39067
  candidateTrace,
39068
+ candidateTraceRef,
39204
39069
  candidateTraceSummary
39205
39070
  } = options;
39206
39071
  const scored = [];
@@ -39247,7 +39112,9 @@ async function runEvaluatorList(options) {
39247
39112
  provider,
39248
39113
  attempt,
39249
39114
  promptInputs,
39250
- now
39115
+ now,
39116
+ candidateTraceRef,
39117
+ candidateTraceSummary
39251
39118
  });
39252
39119
  const weight = evaluator.weight ?? 1;
39253
39120
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -39285,8 +39152,6 @@ async function runEvaluatorList(options) {
39285
39152
  return new ToolTrajectoryEvaluator({
39286
39153
  config: memberConfig
39287
39154
  });
39288
- case "expected_messages":
39289
- return new ExpectedMessagesEvaluator();
39290
39155
  default: {
39291
39156
  const unknownConfig = memberConfig;
39292
39157
  throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -39336,32 +39201,7 @@ async function runEvaluatorList(options) {
39336
39201
  promptInputs,
39337
39202
  now,
39338
39203
  candidateTrace,
39339
- candidateTraceSummary
39340
- });
39341
- const weight = evaluator.weight ?? 1;
39342
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
39343
- evaluatorResults.push({
39344
- name: evaluator.name,
39345
- type: evaluator.type,
39346
- score: score2.score,
39347
- weight,
39348
- verdict: score2.verdict,
39349
- hits: score2.hits,
39350
- misses: score2.misses,
39351
- reasoning: score2.reasoning
39352
- });
39353
- }
39354
- if (evaluator.type === "expected_messages") {
39355
- const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
39356
- const score2 = expectedMessagesEvaluator.evaluate({
39357
- evalCase,
39358
- candidate,
39359
- target,
39360
- provider,
39361
- attempt,
39362
- promptInputs,
39363
- now,
39364
- candidateTrace,
39204
+ candidateTraceRef,
39365
39205
  candidateTraceSummary
39366
39206
  });
39367
39207
  const weight = evaluator.weight ?? 1;
@@ -40649,26 +40489,6 @@ function validateMessages(messages, location, filePath, errors) {
40649
40489
  message: `Invalid role '${role}'. Must be one of: ${validRoles.join(", ")}`
40650
40490
  });
40651
40491
  }
40652
- const toolCalls = message.tool_calls;
40653
- if (toolCalls !== void 0) {
40654
- if (role !== "assistant") {
40655
- errors.push({
40656
- severity: "error",
40657
- filePath,
40658
- location: `${msgLocation}.tool_calls`,
40659
- message: "tool_calls can only be specified on assistant messages"
40660
- });
40661
- } else if (!Array.isArray(toolCalls)) {
40662
- errors.push({
40663
- severity: "error",
40664
- filePath,
40665
- location: `${msgLocation}.tool_calls`,
40666
- message: "tool_calls must be an array"
40667
- });
40668
- } else {
40669
- validateToolCalls(toolCalls, `${msgLocation}.tool_calls`, filePath, errors);
40670
- }
40671
- }
40672
40492
  const content = message.content;
40673
40493
  if (typeof content === "string") {
40674
40494
  validateContentForRoleMarkers(content, `${msgLocation}.content`, filePath, errors);
@@ -40733,30 +40553,6 @@ function validateContentForRoleMarkers(content, location, filePath, errors) {
40733
40553
  }
40734
40554
  }
40735
40555
  }
40736
- function validateToolCalls(toolCalls, location, filePath, errors) {
40737
- for (let i = 0; i < toolCalls.length; i++) {
40738
- const toolCall = toolCalls[i];
40739
- const callLocation = `${location}[${i}]`;
40740
- if (!isObject2(toolCall)) {
40741
- errors.push({
40742
- severity: "error",
40743
- filePath,
40744
- location: callLocation,
40745
- message: "Tool call must be an object"
40746
- });
40747
- continue;
40748
- }
40749
- const tool2 = toolCall.tool;
40750
- if (typeof tool2 !== "string" || tool2.trim().length === 0) {
40751
- errors.push({
40752
- severity: "error",
40753
- filePath,
40754
- location: `${callLocation}.tool`,
40755
- message: "Missing or invalid 'tool' field (must be a non-empty string)"
40756
- });
40757
- }
40758
- }
40759
- }
40760
40556
  function isObject22(value) {
40761
40557
  return typeof value === "object" && value !== null && !Array.isArray(value);
40762
40558
  }
@@ -42708,4 +42504,4 @@ export {
42708
42504
  app,
42709
42505
  runCli
42710
42506
  };
42711
- //# sourceMappingURL=chunk-6ZM7WVSC.js.map
42507
+ //# sourceMappingURL=chunk-IVIT4U6S.js.map