@agentv/core 3.9.0 → 3.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -19,7 +19,7 @@ import {
19
19
  readTextFile,
20
20
  resolveFileReference,
21
21
  resolveTargetDefinition
22
- } from "./chunk-PC5TLJF6.js";
22
+ } from "./chunk-K7JCJIXA.js";
23
23
  import {
24
24
  AgentvProvider
25
25
  } from "./chunk-W5YDZWT4.js";
@@ -420,6 +420,17 @@ function extractTargetsFromSuite(suite) {
420
420
  }
421
421
  return void 0;
422
422
  }
423
/**
 * Read the suite-level worker count from a parsed eval suite.
 *
 * Looks at `suite.execution.workers` and returns it only when it is an
 * integer between 1 and 50 inclusive. Anything else yields `undefined`:
 * a null/undefined suite, a missing or non-object `execution` (arrays
 * included), or a `workers` value that is not an in-range integer.
 *
 * @param {object | null | undefined} suite - Parsed eval suite.
 * @returns {number | undefined} The validated worker count, or `undefined`.
 */
function extractWorkersFromSuite(suite) {
  const MIN_WORKERS = 1;
  const MAX_WORKERS = 50;
  // Optional chaining so a null/undefined suite returns undefined like every
  // other malformed input, instead of throwing a TypeError.
  const execution = suite?.execution;
  if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
    return void 0;
  }
  const { workers } = execution;
  // Number.isInteger does not coerce: it is false for non-numbers, NaN,
  // +/-Infinity and fractional values, so no separate typeof check is needed.
  if (Number.isInteger(workers) && workers >= MIN_WORKERS && workers <= MAX_WORKERS) {
    return workers;
  }
  return void 0;
}
423
434
  function extractTargetsFromTestCase(testCase) {
424
435
  const execution = testCase.execution;
425
436
  if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
@@ -2720,6 +2731,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
2720
2731
  tests,
2721
2732
  trials: extractTrialsConfig(parsed),
2722
2733
  targets: extractTargetsFromSuite(parsed),
2734
+ workers: extractWorkersFromSuite(parsed),
2723
2735
  cacheConfig: extractCacheConfig(parsed),
2724
2736
  totalBudgetUsd: extractTotalBudgetUsd(parsed),
2725
2737
  ...metadata !== void 0 && { metadata },
@@ -10160,8 +10172,7 @@ ${context.fileChanges}`;
10160
10172
  }
10161
10173
  const evaluatorRawRequest = {
10162
10174
  userPrompt,
10163
- systemPrompt,
10164
- target: graderProvider.targetName
10175
+ systemPrompt
10165
10176
  };
10166
10177
  try {
10167
10178
  const { data, tokenUsage } = await this.runWithRetry({
@@ -10179,6 +10190,7 @@ ${context.fileChanges}`;
10179
10190
  assertions,
10180
10191
  expectedAspectCount: Math.max(assertions.length, 1),
10181
10192
  evaluatorRawRequest,
10193
+ graderTarget: graderProvider.targetName,
10182
10194
  tokenUsage
10183
10195
  };
10184
10196
  } catch (e) {
@@ -10190,7 +10202,8 @@ ${context.fileChanges}`;
10190
10202
  verdict: "skip",
10191
10203
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
10192
10204
  expectedAspectCount: 1,
10193
- evaluatorRawRequest
10205
+ evaluatorRawRequest,
10206
+ graderTarget: graderProvider.targetName
10194
10207
  };
10195
10208
  }
10196
10209
  }
@@ -10208,8 +10221,7 @@ ${context.fileChanges}`;
10208
10221
  const systemPrompt = buildRubricOutputSchema();
10209
10222
  const evaluatorRawRequest = {
10210
10223
  userPrompt: prompt,
10211
- systemPrompt,
10212
- target: graderProvider.targetName
10224
+ systemPrompt
10213
10225
  };
10214
10226
  try {
10215
10227
  const { data, tokenUsage } = await this.runWithRetry({
@@ -10226,6 +10238,7 @@ ${context.fileChanges}`;
10226
10238
  assertions,
10227
10239
  expectedAspectCount: rubrics.length,
10228
10240
  evaluatorRawRequest,
10241
+ graderTarget: graderProvider.targetName,
10229
10242
  tokenUsage
10230
10243
  };
10231
10244
  } catch (e) {
@@ -10237,7 +10250,8 @@ ${context.fileChanges}`;
10237
10250
  verdict: "skip",
10238
10251
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
10239
10252
  expectedAspectCount: rubrics.length,
10240
- evaluatorRawRequest
10253
+ evaluatorRawRequest,
10254
+ graderTarget: graderProvider.targetName
10241
10255
  };
10242
10256
  }
10243
10257
  }
@@ -10250,8 +10264,7 @@ ${context.fileChanges}`;
10250
10264
  const systemPrompt = buildScoreRangeOutputSchema();
10251
10265
  const evaluatorRawRequest = {
10252
10266
  userPrompt: prompt,
10253
- systemPrompt,
10254
- target: graderProvider.targetName
10267
+ systemPrompt
10255
10268
  };
10256
10269
  try {
10257
10270
  const { data, tokenUsage } = await this.runWithRetry({
@@ -10268,6 +10281,7 @@ ${context.fileChanges}`;
10268
10281
  assertions,
10269
10282
  expectedAspectCount: rubrics.length,
10270
10283
  evaluatorRawRequest,
10284
+ graderTarget: graderProvider.targetName,
10271
10285
  details,
10272
10286
  tokenUsage
10273
10287
  };
@@ -10280,7 +10294,8 @@ ${context.fileChanges}`;
10280
10294
  verdict: "skip",
10281
10295
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
10282
10296
  expectedAspectCount: rubrics.length,
10283
- evaluatorRawRequest
10297
+ evaluatorRawRequest,
10298
+ graderTarget: graderProvider.targetName
10284
10299
  };
10285
10300
  }
10286
10301
  }
@@ -10312,7 +10327,6 @@ ${context.fileChanges}`;
10312
10327
  mode: "built-in",
10313
10328
  systemPrompt,
10314
10329
  userPrompt,
10315
- target: graderProvider.targetName,
10316
10330
  maxSteps: this.maxSteps
10317
10331
  };
10318
10332
  try {
@@ -10330,7 +10344,13 @@ ${context.fileChanges}`;
10330
10344
  steps: steps.length,
10331
10345
  tool_calls: toolCallCount
10332
10346
  };
10333
- return this.parseAgentResult(text, rubrics, evaluatorRawRequest, details);
10347
+ return this.parseAgentResult(
10348
+ text,
10349
+ rubrics,
10350
+ evaluatorRawRequest,
10351
+ details,
10352
+ graderProvider.targetName
10353
+ );
10334
10354
  } catch (error) {
10335
10355
  const message = error instanceof Error ? error.message : String(error);
10336
10356
  return {
@@ -10339,6 +10359,7 @@ ${context.fileChanges}`;
10339
10359
  assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
10340
10360
  expectedAspectCount: 1,
10341
10361
  evaluatorRawRequest,
10362
+ graderTarget: graderProvider.targetName,
10342
10363
  details: { mode: "built-in", error: message }
10343
10364
  };
10344
10365
  }
@@ -10391,6 +10412,7 @@ ${context.fileChanges}`;
10391
10412
  ],
10392
10413
  expectedAspectCount: 1,
10393
10414
  evaluatorRawRequest,
10415
+ graderTarget: provider.targetName,
10394
10416
  details: { mode: modeLabel, grader_target: provider.targetName }
10395
10417
  };
10396
10418
  }
@@ -10400,7 +10422,13 @@ ${context.fileChanges}`;
10400
10422
  mode: modeLabel,
10401
10423
  grader_target: provider.targetName
10402
10424
  };
10403
- return this.parseAgentResult(assistantContent, rubrics, evaluatorRawRequest, details);
10425
+ return this.parseAgentResult(
10426
+ assistantContent,
10427
+ rubrics,
10428
+ evaluatorRawRequest,
10429
+ details,
10430
+ provider.targetName
10431
+ );
10404
10432
  } catch (error) {
10405
10433
  const message = error instanceof Error ? error.message : String(error);
10406
10434
  return {
@@ -10411,6 +10439,7 @@ ${context.fileChanges}`;
10411
10439
  ],
10412
10440
  expectedAspectCount: 1,
10413
10441
  evaluatorRawRequest,
10442
+ graderTarget: provider.targetName,
10414
10443
  details: {
10415
10444
  mode: modeLabel,
10416
10445
  grader_target: provider.targetName,
@@ -10555,7 +10584,7 @@ ${outputSchema}`;
10555
10584
  * Parse the agent's response text into an EvaluationScore.
10556
10585
  * Supports both freeform and rubric modes.
10557
10586
  */
10558
- parseAgentResult(text, rubrics, evaluatorRawRequest, details) {
10587
+ parseAgentResult(text, rubrics, evaluatorRawRequest, details, graderTarget) {
10559
10588
  try {
10560
10589
  const parsed = parseJsonFromText(text);
10561
10590
  if (rubrics && rubrics.length > 0) {
@@ -10567,6 +10596,7 @@ ${outputSchema}`;
10567
10596
  assertions: assertions2,
10568
10597
  expectedAspectCount: rubrics.length,
10569
10598
  evaluatorRawRequest,
10599
+ graderTarget,
10570
10600
  details
10571
10601
  };
10572
10602
  }
@@ -10579,6 +10609,7 @@ ${outputSchema}`;
10579
10609
  assertions,
10580
10610
  expectedAspectCount: Math.max(assertions.length, 1),
10581
10611
  evaluatorRawRequest,
10612
+ graderTarget,
10582
10613
  details
10583
10614
  };
10584
10615
  } catch {
@@ -10593,6 +10624,7 @@ ${outputSchema}`;
10593
10624
  ],
10594
10625
  expectedAspectCount: 1,
10595
10626
  evaluatorRawRequest,
10627
+ graderTarget,
10596
10628
  details
10597
10629
  };
10598
10630
  }
@@ -14606,14 +14638,22 @@ async function runEvaluation(options) {
14606
14638
  const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
14607
14639
  const resolvedRetainOnSuccess = retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
14608
14640
  const resolvedRetainOnFailure = retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
14609
- const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
14610
- const workers = hasSharedWorkspace && !usePool ? 1 : requestedWorkers;
14641
+ const workers = options.maxConcurrency ?? target.workers ?? 1;
14611
14642
  setupLog(
14612
- `sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} requestedWorkers=${requestedWorkers} effectiveWorkers=${workers}`
14643
+ `sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} workers=${workers}`
14613
14644
  );
14614
- if (hasSharedWorkspace && !usePool && requestedWorkers > 1) {
14645
+ if (hasSharedWorkspace && !usePool && workers > 1) {
14615
14646
  console.warn(
14616
- `Warning: Shared workspace requires sequential execution. Overriding workers from ${requestedWorkers} to 1.`
14647
+ [
14648
+ `Warning: This eval uses a shared workspace with ${workers} workers.`,
14649
+ "If the agent under test makes file edits, concurrent runs may corrupt each other.",
14650
+ "To limit concurrency, add this to your eval YAML:",
14651
+ "",
14652
+ " execution:",
14653
+ " workers: 1",
14654
+ "",
14655
+ "Or pass --workers 1 on the command line."
14656
+ ].join("\n")
14617
14657
  );
14618
14658
  }
14619
14659
  const limit = pLimit(workers);
@@ -14896,7 +14936,8 @@ async function runEvaluation(options) {
14896
14936
  streamCallbacks,
14897
14937
  typeRegistry,
14898
14938
  repoManager,
14899
- evalDir
14939
+ evalDir,
14940
+ verbose
14900
14941
  };
14901
14942
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
14902
14943
  if (totalBudgetUsd !== void 0) {
@@ -14976,7 +15017,8 @@ async function runEvaluation(options) {
14976
15017
  promptInputs,
14977
15018
  primaryProvider,
14978
15019
  "agent",
14979
- "provider_error"
15020
+ "provider_error",
15021
+ verbose
14980
15022
  );
14981
15023
  results.push(errorResult);
14982
15024
  if (onResult) {
@@ -15049,6 +15091,7 @@ async function runBatchEvaluation(options) {
15049
15091
  nowFn,
15050
15092
  onProgress,
15051
15093
  onResult,
15094
+ verbose,
15052
15095
  resolveGraderProvider,
15053
15096
  agentTimeoutMs,
15054
15097
  targetResolver,
@@ -15136,7 +15179,8 @@ async function runBatchEvaluation(options) {
15136
15179
  startTime,
15137
15180
  endTime,
15138
15181
  targetResolver,
15139
- availableTargets
15182
+ availableTargets,
15183
+ verbose
15140
15184
  });
15141
15185
  if (providerError) {
15142
15186
  result = {
@@ -15157,7 +15201,8 @@ async function runBatchEvaluation(options) {
15157
15201
  promptInputs,
15158
15202
  provider,
15159
15203
  "evaluator",
15160
- "evaluator_error"
15204
+ "evaluator_error",
15205
+ verbose
15161
15206
  );
15162
15207
  results.push(errorResult);
15163
15208
  if (onResult) {
@@ -15220,7 +15265,8 @@ async function runEvalCase(options) {
15220
15265
  suiteWorkspaceFile,
15221
15266
  typeRegistry: providedTypeRegistry,
15222
15267
  repoManager,
15223
- evalDir
15268
+ evalDir,
15269
+ verbose
15224
15270
  } = options;
15225
15271
  const setupDebug = process.env.AGENTV_SETUP_DEBUG === "1";
15226
15272
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -15257,7 +15303,8 @@ async function runEvalCase(options) {
15257
15303
  promptInputs,
15258
15304
  provider,
15259
15305
  "setup",
15260
- "template_error"
15306
+ "template_error",
15307
+ verbose
15261
15308
  );
15262
15309
  }
15263
15310
  if (caseWorkspaceFile && workspacePath) {
@@ -15286,7 +15333,8 @@ async function runEvalCase(options) {
15286
15333
  promptInputs,
15287
15334
  provider,
15288
15335
  "repo_setup",
15289
- "local_path_not_found"
15336
+ "local_path_not_found",
15337
+ verbose
15290
15338
  );
15291
15339
  }
15292
15340
  }
@@ -15312,7 +15360,8 @@ async function runEvalCase(options) {
15312
15360
  promptInputs,
15313
15361
  provider,
15314
15362
  "repo_setup",
15315
- "clone_error"
15363
+ "clone_error",
15364
+ verbose
15316
15365
  );
15317
15366
  }
15318
15367
  }
@@ -15338,7 +15387,8 @@ async function runEvalCase(options) {
15338
15387
  promptInputs,
15339
15388
  provider,
15340
15389
  "setup",
15341
- "file_copy_error"
15390
+ "file_copy_error",
15391
+ verbose
15342
15392
  );
15343
15393
  }
15344
15394
  }
@@ -15383,7 +15433,8 @@ async function runEvalCase(options) {
15383
15433
  promptInputs,
15384
15434
  provider,
15385
15435
  "setup",
15386
- "script_error"
15436
+ "script_error",
15437
+ verbose
15387
15438
  );
15388
15439
  }
15389
15440
  }
@@ -15414,7 +15465,8 @@ async function runEvalCase(options) {
15414
15465
  promptInputs,
15415
15466
  provider,
15416
15467
  "setup",
15417
- "script_error"
15468
+ "script_error",
15469
+ verbose
15418
15470
  );
15419
15471
  }
15420
15472
  }
@@ -15458,7 +15510,8 @@ async function runEvalCase(options) {
15458
15510
  promptInputs,
15459
15511
  provider,
15460
15512
  "agent",
15461
- "provider_error"
15513
+ "provider_error",
15514
+ verbose
15462
15515
  );
15463
15516
  if (workspacePath) {
15464
15517
  if (forceCleanup) {
@@ -15479,7 +15532,8 @@ async function runEvalCase(options) {
15479
15532
  promptInputs,
15480
15533
  provider,
15481
15534
  "agent",
15482
- "provider_error"
15535
+ "provider_error",
15536
+ verbose
15483
15537
  );
15484
15538
  if (workspacePath) {
15485
15539
  if (forceCleanup) {
@@ -15574,7 +15628,8 @@ async function runEvalCase(options) {
15574
15628
  targetResolver,
15575
15629
  availableTargets,
15576
15630
  fileChanges,
15577
- workspacePath
15631
+ workspacePath,
15632
+ verbose
15578
15633
  });
15579
15634
  const totalDurationMs = Date.now() - caseStartMs;
15580
15635
  const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
@@ -15629,7 +15684,8 @@ async function runEvalCase(options) {
15629
15684
  promptInputs,
15630
15685
  provider,
15631
15686
  "evaluator",
15632
- "evaluator_error"
15687
+ "evaluator_error",
15688
+ verbose
15633
15689
  );
15634
15690
  if (workspacePath && !isSharedWorkspace) {
15635
15691
  if (forceCleanup || (retainOnFailure ?? "keep") === "cleanup") {
@@ -15771,7 +15827,7 @@ async function evaluateCandidate(options) {
15771
15827
  let lmRequest;
15772
15828
  if (isAgentProvider(provider)) {
15773
15829
  agentRequest = {
15774
- question: promptInputs.question
15830
+ ...options.verbose ? { input: promptInputs.question } : {}
15775
15831
  };
15776
15832
  } else {
15777
15833
  if (promptInputs.chatPrompt) {
@@ -15785,8 +15841,9 @@ async function evaluateCandidate(options) {
15785
15841
  }
15786
15842
  }
15787
15843
  const evaluatorRequest = scores ? void 0 : score.evaluatorRawRequest;
15788
- const requests = agentRequest || lmRequest || evaluatorRequest ? {
15789
- ...agentRequest ? { agent: agentRequest } : {},
15844
+ const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
15845
+ const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
15846
+ ...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
15790
15847
  ...lmRequest ? { lm: lmRequest } : {},
15791
15848
  ...evaluatorRequest ? { evaluator: evaluatorRequest } : {}
15792
15849
  } : void 0;
@@ -15806,9 +15863,9 @@ async function evaluateCandidate(options) {
15806
15863
  endTime,
15807
15864
  requests,
15808
15865
  input,
15866
+ output: output ?? [{ role: "assistant", content: candidate }],
15809
15867
  scores,
15810
15868
  trace,
15811
- output: output ?? [{ role: "assistant", content: candidate }],
15812
15869
  fileChanges,
15813
15870
  executionStatus: classifyQualityStatus(score.score)
15814
15871
  };
@@ -15974,6 +16031,7 @@ async function runEvaluatorList(options) {
15974
16031
  verdict: score2.verdict,
15975
16032
  assertions: score2.assertions,
15976
16033
  input: score2.evaluatorRawRequest,
16034
+ target: score2.graderTarget,
15977
16035
  details: score2.details,
15978
16036
  scores: mapChildResults(score2.scores),
15979
16037
  tokenUsage: score2.tokenUsage,
@@ -16113,13 +16171,13 @@ async function invokeProvider(provider, options) {
16113
16171
  }
16114
16172
  }
16115
16173
  }
16116
- function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
16174
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode, verbose) {
16117
16175
  const message = error instanceof Error ? error.message : String(error);
16118
16176
  let agentRequest;
16119
16177
  let lmRequest;
16120
16178
  if (isAgentProvider(provider)) {
16121
16179
  agentRequest = {
16122
- question: promptInputs.question,
16180
+ ...verbose ? { input: promptInputs.question } : {},
16123
16181
  error: message
16124
16182
  };
16125
16183
  } else {
@@ -16147,10 +16205,10 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
16147
16205
  conversationId: evalCase.conversation_id,
16148
16206
  score: 0,
16149
16207
  assertions: [{ text: `Error: ${message}`, passed: false }],
16150
- output: [{ role: "assistant", content: `Error occurred: ${message}` }],
16151
16208
  target: targetName,
16152
16209
  requests,
16153
16210
  input,
16211
+ output: [{ role: "assistant", content: `Error occurred: ${message}` }],
16154
16212
  error: message,
16155
16213
  executionStatus: "execution_error",
16156
16214
  failureStage,
@@ -17219,6 +17277,7 @@ export {
17219
17277
  extractTargetsFromSuite,
17220
17278
  extractTargetsFromTestCase,
17221
17279
  extractTrialsConfig,
17280
+ extractWorkersFromSuite,
17222
17281
  fileExists,
17223
17282
  findGitRoot,
17224
17283
  freeformEvaluationSchema,