agentv 3.9.0 → 3.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
301
301
  }
302
302
  });
303
303
 
304
- // ../../packages/core/dist/chunk-PC5TLJF6.js
304
+ // ../../packages/core/dist/chunk-K7JCJIXA.js
305
305
  import { constants } from "node:fs";
306
306
  import { access, readFile } from "node:fs/promises";
307
307
  import path from "node:path";
@@ -419,7 +419,7 @@ __export(external_exports2, {
419
419
  void: () => voidType
420
420
  });
421
421
 
422
- // ../../packages/core/dist/chunk-PC5TLJF6.js
422
+ // ../../packages/core/dist/chunk-K7JCJIXA.js
423
423
  import { readFile as readFile2 } from "node:fs/promises";
424
424
  import path3 from "node:path";
425
425
  import fg from "fast-glob";
@@ -14472,6 +14472,17 @@ function extractTargetsFromSuite(suite) {
14472
14472
  }
14473
14473
  return void 0;
14474
14474
  }
14475
+ function extractWorkersFromSuite(suite) {
14476
+ const execution = suite.execution;
14477
+ if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
14478
+ return void 0;
14479
+ }
14480
+ const workers = execution.workers;
14481
+ if (typeof workers === "number" && Number.isInteger(workers) && workers >= 1 && workers <= 50) {
14482
+ return workers;
14483
+ }
14484
+ return void 0;
14485
+ }
14475
14486
  function extractTargetsFromTestCase(testCase) {
14476
14487
  const execution = testCase.execution;
14477
14488
  if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
@@ -16735,6 +16746,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
16735
16746
  tests,
16736
16747
  trials: extractTrialsConfig(parsed),
16737
16748
  targets: extractTargetsFromSuite(parsed),
16749
+ workers: extractWorkersFromSuite(parsed),
16738
16750
  cacheConfig: extractCacheConfig(parsed),
16739
16751
  totalBudgetUsd: extractTotalBudgetUsd(parsed),
16740
16752
  ...metadata !== void 0 && { metadata },
@@ -23966,8 +23978,7 @@ ${context2.fileChanges}`;
23966
23978
  }
23967
23979
  const evaluatorRawRequest = {
23968
23980
  userPrompt,
23969
- systemPrompt,
23970
- target: graderProvider.targetName
23981
+ systemPrompt
23971
23982
  };
23972
23983
  try {
23973
23984
  const { data, tokenUsage } = await this.runWithRetry({
@@ -23985,6 +23996,7 @@ ${context2.fileChanges}`;
23985
23996
  assertions,
23986
23997
  expectedAspectCount: Math.max(assertions.length, 1),
23987
23998
  evaluatorRawRequest,
23999
+ graderTarget: graderProvider.targetName,
23988
24000
  tokenUsage
23989
24001
  };
23990
24002
  } catch (e) {
@@ -23996,7 +24008,8 @@ ${context2.fileChanges}`;
23996
24008
  verdict: "skip",
23997
24009
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
23998
24010
  expectedAspectCount: 1,
23999
- evaluatorRawRequest
24011
+ evaluatorRawRequest,
24012
+ graderTarget: graderProvider.targetName
24000
24013
  };
24001
24014
  }
24002
24015
  }
@@ -24014,8 +24027,7 @@ ${context2.fileChanges}`;
24014
24027
  const systemPrompt = buildRubricOutputSchema();
24015
24028
  const evaluatorRawRequest = {
24016
24029
  userPrompt: prompt,
24017
- systemPrompt,
24018
- target: graderProvider.targetName
24030
+ systemPrompt
24019
24031
  };
24020
24032
  try {
24021
24033
  const { data, tokenUsage } = await this.runWithRetry({
@@ -24032,6 +24044,7 @@ ${context2.fileChanges}`;
24032
24044
  assertions,
24033
24045
  expectedAspectCount: rubrics.length,
24034
24046
  evaluatorRawRequest,
24047
+ graderTarget: graderProvider.targetName,
24035
24048
  tokenUsage
24036
24049
  };
24037
24050
  } catch (e) {
@@ -24043,7 +24056,8 @@ ${context2.fileChanges}`;
24043
24056
  verdict: "skip",
24044
24057
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
24045
24058
  expectedAspectCount: rubrics.length,
24046
- evaluatorRawRequest
24059
+ evaluatorRawRequest,
24060
+ graderTarget: graderProvider.targetName
24047
24061
  };
24048
24062
  }
24049
24063
  }
@@ -24056,8 +24070,7 @@ ${context2.fileChanges}`;
24056
24070
  const systemPrompt = buildScoreRangeOutputSchema();
24057
24071
  const evaluatorRawRequest = {
24058
24072
  userPrompt: prompt,
24059
- systemPrompt,
24060
- target: graderProvider.targetName
24073
+ systemPrompt
24061
24074
  };
24062
24075
  try {
24063
24076
  const { data, tokenUsage } = await this.runWithRetry({
@@ -24074,6 +24087,7 @@ ${context2.fileChanges}`;
24074
24087
  assertions,
24075
24088
  expectedAspectCount: rubrics.length,
24076
24089
  evaluatorRawRequest,
24090
+ graderTarget: graderProvider.targetName,
24077
24091
  details,
24078
24092
  tokenUsage
24079
24093
  };
@@ -24086,7 +24100,8 @@ ${context2.fileChanges}`;
24086
24100
  verdict: "skip",
24087
24101
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
24088
24102
  expectedAspectCount: rubrics.length,
24089
- evaluatorRawRequest
24103
+ evaluatorRawRequest,
24104
+ graderTarget: graderProvider.targetName
24090
24105
  };
24091
24106
  }
24092
24107
  }
@@ -24118,7 +24133,6 @@ ${context2.fileChanges}`;
24118
24133
  mode: "built-in",
24119
24134
  systemPrompt,
24120
24135
  userPrompt,
24121
- target: graderProvider.targetName,
24122
24136
  maxSteps: this.maxSteps
24123
24137
  };
24124
24138
  try {
@@ -24136,7 +24150,13 @@ ${context2.fileChanges}`;
24136
24150
  steps: steps.length,
24137
24151
  tool_calls: toolCallCount
24138
24152
  };
24139
- return this.parseAgentResult(text2, rubrics, evaluatorRawRequest, details);
24153
+ return this.parseAgentResult(
24154
+ text2,
24155
+ rubrics,
24156
+ evaluatorRawRequest,
24157
+ details,
24158
+ graderProvider.targetName
24159
+ );
24140
24160
  } catch (error) {
24141
24161
  const message = error instanceof Error ? error.message : String(error);
24142
24162
  return {
@@ -24145,6 +24165,7 @@ ${context2.fileChanges}`;
24145
24165
  assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
24146
24166
  expectedAspectCount: 1,
24147
24167
  evaluatorRawRequest,
24168
+ graderTarget: graderProvider.targetName,
24148
24169
  details: { mode: "built-in", error: message }
24149
24170
  };
24150
24171
  }
@@ -24197,6 +24218,7 @@ ${context2.fileChanges}`;
24197
24218
  ],
24198
24219
  expectedAspectCount: 1,
24199
24220
  evaluatorRawRequest,
24221
+ graderTarget: provider.targetName,
24200
24222
  details: { mode: modeLabel, grader_target: provider.targetName }
24201
24223
  };
24202
24224
  }
@@ -24206,7 +24228,13 @@ ${context2.fileChanges}`;
24206
24228
  mode: modeLabel,
24207
24229
  grader_target: provider.targetName
24208
24230
  };
24209
- return this.parseAgentResult(assistantContent, rubrics, evaluatorRawRequest, details);
24231
+ return this.parseAgentResult(
24232
+ assistantContent,
24233
+ rubrics,
24234
+ evaluatorRawRequest,
24235
+ details,
24236
+ provider.targetName
24237
+ );
24210
24238
  } catch (error) {
24211
24239
  const message = error instanceof Error ? error.message : String(error);
24212
24240
  return {
@@ -24217,6 +24245,7 @@ ${context2.fileChanges}`;
24217
24245
  ],
24218
24246
  expectedAspectCount: 1,
24219
24247
  evaluatorRawRequest,
24248
+ graderTarget: provider.targetName,
24220
24249
  details: {
24221
24250
  mode: modeLabel,
24222
24251
  grader_target: provider.targetName,
@@ -24361,7 +24390,7 @@ ${outputSchema2}`;
24361
24390
  * Parse the agent's response text into an EvaluationScore.
24362
24391
  * Supports both freeform and rubric modes.
24363
24392
  */
24364
- parseAgentResult(text2, rubrics, evaluatorRawRequest, details) {
24393
+ parseAgentResult(text2, rubrics, evaluatorRawRequest, details, graderTarget) {
24365
24394
  try {
24366
24395
  const parsed = parseJsonFromText(text2);
24367
24396
  if (rubrics && rubrics.length > 0) {
@@ -24373,6 +24402,7 @@ ${outputSchema2}`;
24373
24402
  assertions: assertions2,
24374
24403
  expectedAspectCount: rubrics.length,
24375
24404
  evaluatorRawRequest,
24405
+ graderTarget,
24376
24406
  details
24377
24407
  };
24378
24408
  }
@@ -24385,6 +24415,7 @@ ${outputSchema2}`;
24385
24415
  assertions,
24386
24416
  expectedAspectCount: Math.max(assertions.length, 1),
24387
24417
  evaluatorRawRequest,
24418
+ graderTarget,
24388
24419
  details
24389
24420
  };
24390
24421
  } catch {
@@ -24399,6 +24430,7 @@ ${outputSchema2}`;
24399
24430
  ],
24400
24431
  expectedAspectCount: 1,
24401
24432
  evaluatorRawRequest,
24433
+ graderTarget,
24402
24434
  details
24403
24435
  };
24404
24436
  }
@@ -28331,14 +28363,22 @@ async function runEvaluation(options) {
28331
28363
  const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
28332
28364
  const resolvedRetainOnSuccess = retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
28333
28365
  const resolvedRetainOnFailure = retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
28334
- const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
28335
- const workers = hasSharedWorkspace && !usePool ? 1 : requestedWorkers;
28366
+ const workers = options.maxConcurrency ?? target.workers ?? 1;
28336
28367
  setupLog(
28337
- `sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} requestedWorkers=${requestedWorkers} effectiveWorkers=${workers}`
28368
+ `sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} workers=${workers}`
28338
28369
  );
28339
- if (hasSharedWorkspace && !usePool && requestedWorkers > 1) {
28370
+ if (hasSharedWorkspace && !usePool && workers > 1) {
28340
28371
  console.warn(
28341
- `Warning: Shared workspace requires sequential execution. Overriding workers from ${requestedWorkers} to 1.`
28372
+ [
28373
+ `Warning: This eval uses a shared workspace with ${workers} workers.`,
28374
+ "If the agent under test makes file edits, concurrent runs may corrupt each other.",
28375
+ "To limit concurrency, add this to your eval YAML:",
28376
+ "",
28377
+ " execution:",
28378
+ " workers: 1",
28379
+ "",
28380
+ "Or pass --workers 1 on the command line."
28381
+ ].join("\n")
28342
28382
  );
28343
28383
  }
28344
28384
  const limit = pLimit(workers);
@@ -28621,7 +28661,8 @@ async function runEvaluation(options) {
28621
28661
  streamCallbacks,
28622
28662
  typeRegistry,
28623
28663
  repoManager,
28624
- evalDir
28664
+ evalDir,
28665
+ verbose
28625
28666
  };
28626
28667
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
28627
28668
  if (totalBudgetUsd !== void 0) {
@@ -28701,7 +28742,8 @@ async function runEvaluation(options) {
28701
28742
  promptInputs,
28702
28743
  primaryProvider,
28703
28744
  "agent",
28704
- "provider_error"
28745
+ "provider_error",
28746
+ verbose
28705
28747
  );
28706
28748
  results.push(errorResult);
28707
28749
  if (onResult) {
@@ -28774,6 +28816,7 @@ async function runBatchEvaluation(options) {
28774
28816
  nowFn,
28775
28817
  onProgress,
28776
28818
  onResult,
28819
+ verbose,
28777
28820
  resolveGraderProvider,
28778
28821
  agentTimeoutMs,
28779
28822
  targetResolver,
@@ -28861,7 +28904,8 @@ async function runBatchEvaluation(options) {
28861
28904
  startTime,
28862
28905
  endTime,
28863
28906
  targetResolver,
28864
- availableTargets
28907
+ availableTargets,
28908
+ verbose
28865
28909
  });
28866
28910
  if (providerError) {
28867
28911
  result = {
@@ -28882,7 +28926,8 @@ async function runBatchEvaluation(options) {
28882
28926
  promptInputs,
28883
28927
  provider,
28884
28928
  "evaluator",
28885
- "evaluator_error"
28929
+ "evaluator_error",
28930
+ verbose
28886
28931
  );
28887
28932
  results.push(errorResult);
28888
28933
  if (onResult) {
@@ -28945,7 +28990,8 @@ async function runEvalCase(options) {
28945
28990
  suiteWorkspaceFile,
28946
28991
  typeRegistry: providedTypeRegistry,
28947
28992
  repoManager,
28948
- evalDir
28993
+ evalDir,
28994
+ verbose
28949
28995
  } = options;
28950
28996
  const setupDebug = process.env.AGENTV_SETUP_DEBUG === "1";
28951
28997
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -28982,7 +29028,8 @@ async function runEvalCase(options) {
28982
29028
  promptInputs,
28983
29029
  provider,
28984
29030
  "setup",
28985
- "template_error"
29031
+ "template_error",
29032
+ verbose
28986
29033
  );
28987
29034
  }
28988
29035
  if (caseWorkspaceFile && workspacePath) {
@@ -29011,7 +29058,8 @@ async function runEvalCase(options) {
29011
29058
  promptInputs,
29012
29059
  provider,
29013
29060
  "repo_setup",
29014
- "local_path_not_found"
29061
+ "local_path_not_found",
29062
+ verbose
29015
29063
  );
29016
29064
  }
29017
29065
  }
@@ -29037,7 +29085,8 @@ async function runEvalCase(options) {
29037
29085
  promptInputs,
29038
29086
  provider,
29039
29087
  "repo_setup",
29040
- "clone_error"
29088
+ "clone_error",
29089
+ verbose
29041
29090
  );
29042
29091
  }
29043
29092
  }
@@ -29063,7 +29112,8 @@ async function runEvalCase(options) {
29063
29112
  promptInputs,
29064
29113
  provider,
29065
29114
  "setup",
29066
- "file_copy_error"
29115
+ "file_copy_error",
29116
+ verbose
29067
29117
  );
29068
29118
  }
29069
29119
  }
@@ -29108,7 +29158,8 @@ async function runEvalCase(options) {
29108
29158
  promptInputs,
29109
29159
  provider,
29110
29160
  "setup",
29111
- "script_error"
29161
+ "script_error",
29162
+ verbose
29112
29163
  );
29113
29164
  }
29114
29165
  }
@@ -29139,7 +29190,8 @@ async function runEvalCase(options) {
29139
29190
  promptInputs,
29140
29191
  provider,
29141
29192
  "setup",
29142
- "script_error"
29193
+ "script_error",
29194
+ verbose
29143
29195
  );
29144
29196
  }
29145
29197
  }
@@ -29183,7 +29235,8 @@ async function runEvalCase(options) {
29183
29235
  promptInputs,
29184
29236
  provider,
29185
29237
  "agent",
29186
- "provider_error"
29238
+ "provider_error",
29239
+ verbose
29187
29240
  );
29188
29241
  if (workspacePath) {
29189
29242
  if (forceCleanup) {
@@ -29204,7 +29257,8 @@ async function runEvalCase(options) {
29204
29257
  promptInputs,
29205
29258
  provider,
29206
29259
  "agent",
29207
- "provider_error"
29260
+ "provider_error",
29261
+ verbose
29208
29262
  );
29209
29263
  if (workspacePath) {
29210
29264
  if (forceCleanup) {
@@ -29299,7 +29353,8 @@ async function runEvalCase(options) {
29299
29353
  targetResolver,
29300
29354
  availableTargets,
29301
29355
  fileChanges,
29302
- workspacePath
29356
+ workspacePath,
29357
+ verbose
29303
29358
  });
29304
29359
  const totalDurationMs = Date.now() - caseStartMs;
29305
29360
  const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
@@ -29354,7 +29409,8 @@ async function runEvalCase(options) {
29354
29409
  promptInputs,
29355
29410
  provider,
29356
29411
  "evaluator",
29357
- "evaluator_error"
29412
+ "evaluator_error",
29413
+ verbose
29358
29414
  );
29359
29415
  if (workspacePath && !isSharedWorkspace) {
29360
29416
  if (forceCleanup || (retainOnFailure ?? "keep") === "cleanup") {
@@ -29496,7 +29552,7 @@ async function evaluateCandidate(options) {
29496
29552
  let lmRequest;
29497
29553
  if (isAgentProvider(provider)) {
29498
29554
  agentRequest = {
29499
- question: promptInputs.question
29555
+ ...options.verbose ? { input: promptInputs.question } : {}
29500
29556
  };
29501
29557
  } else {
29502
29558
  if (promptInputs.chatPrompt) {
@@ -29510,8 +29566,9 @@ async function evaluateCandidate(options) {
29510
29566
  }
29511
29567
  }
29512
29568
  const evaluatorRequest = scores ? void 0 : score.evaluatorRawRequest;
29513
- const requests = agentRequest || lmRequest || evaluatorRequest ? {
29514
- ...agentRequest ? { agent: agentRequest } : {},
29569
+ const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
29570
+ const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
29571
+ ...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
29515
29572
  ...lmRequest ? { lm: lmRequest } : {},
29516
29573
  ...evaluatorRequest ? { evaluator: evaluatorRequest } : {}
29517
29574
  } : void 0;
@@ -29531,9 +29588,9 @@ async function evaluateCandidate(options) {
29531
29588
  endTime,
29532
29589
  requests,
29533
29590
  input,
29591
+ output: output ?? [{ role: "assistant", content: candidate }],
29534
29592
  scores,
29535
29593
  trace: trace2,
29536
- output: output ?? [{ role: "assistant", content: candidate }],
29537
29594
  fileChanges,
29538
29595
  executionStatus: classifyQualityStatus(score.score)
29539
29596
  };
@@ -29699,6 +29756,7 @@ async function runEvaluatorList(options) {
29699
29756
  verdict: score2.verdict,
29700
29757
  assertions: score2.assertions,
29701
29758
  input: score2.evaluatorRawRequest,
29759
+ target: score2.graderTarget,
29702
29760
  details: score2.details,
29703
29761
  scores: mapChildResults(score2.scores),
29704
29762
  tokenUsage: score2.tokenUsage,
@@ -29838,13 +29896,13 @@ async function invokeProvider(provider, options) {
29838
29896
  }
29839
29897
  }
29840
29898
  }
29841
- function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
29899
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode, verbose) {
29842
29900
  const message = error instanceof Error ? error.message : String(error);
29843
29901
  let agentRequest;
29844
29902
  let lmRequest;
29845
29903
  if (isAgentProvider(provider)) {
29846
29904
  agentRequest = {
29847
- question: promptInputs.question,
29905
+ ...verbose ? { input: promptInputs.question } : {},
29848
29906
  error: message
29849
29907
  };
29850
29908
  } else {
@@ -29872,10 +29930,10 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
29872
29930
  conversationId: evalCase.conversation_id,
29873
29931
  score: 0,
29874
29932
  assertions: [{ text: `Error: ${message}`, passed: false }],
29875
- output: [{ role: "assistant", content: `Error occurred: ${message}` }],
29876
29933
  target: targetName,
29877
29934
  requests,
29878
29935
  input,
29936
+ output: [{ role: "assistant", content: `Error occurred: ${message}` }],
29879
29937
  error: message,
29880
29938
  executionStatus: "execution_error",
29881
29939
  failureStage,
@@ -30878,6 +30936,7 @@ export {
30878
30936
  loadConfig,
30879
30937
  extractTargetFromSuite,
30880
30938
  extractTargetsFromSuite,
30939
+ extractWorkersFromSuite,
30881
30940
  extractTargetsFromTestCase,
30882
30941
  extractTrialsConfig,
30883
30942
  extractCacheConfig,
@@ -30992,4 +31051,4 @@ export {
30992
31051
  OtelStreamingObserver,
30993
31052
  createAgentKernel
30994
31053
  };
30995
- //# sourceMappingURL=chunk-TXDPYXHY.js.map
31054
+ //# sourceMappingURL=chunk-OIVGGWJ3.js.map