@agentv/core 3.9.0 → 3.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ import {
8
8
  isEvaluatorKind,
9
9
  loadCasesFromFile,
10
10
  resolveFileReference
11
- } from "../../chunk-PC5TLJF6.js";
11
+ } from "../../chunk-K7JCJIXA.js";
12
12
 
13
13
  // src/evaluation/validation/file-type.ts
14
14
  import { readFile } from "node:fs/promises";
package/dist/index.cjs CHANGED
@@ -1575,6 +1575,7 @@ __export(index_exports, {
1575
1575
  extractTargetsFromSuite: () => extractTargetsFromSuite,
1576
1576
  extractTargetsFromTestCase: () => extractTargetsFromTestCase,
1577
1577
  extractTrialsConfig: () => extractTrialsConfig,
1578
+ extractWorkersFromSuite: () => extractWorkersFromSuite,
1578
1579
  fileExists: () => fileExists2,
1579
1580
  findGitRoot: () => findGitRoot,
1580
1581
  freeformEvaluationSchema: () => freeformEvaluationSchema,
@@ -2256,6 +2257,17 @@ function extractTargetsFromSuite(suite) {
2256
2257
  }
2257
2258
  return void 0;
2258
2259
  }
2260
+ function extractWorkersFromSuite(suite) {
2261
+ const execution = suite.execution;
2262
+ if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
2263
+ return void 0;
2264
+ }
2265
+ const workers = execution.workers;
2266
+ if (typeof workers === "number" && Number.isInteger(workers) && workers >= 1 && workers <= 50) {
2267
+ return workers;
2268
+ }
2269
+ return void 0;
2270
+ }
2259
2271
  function extractTargetsFromTestCase(testCase) {
2260
2272
  const execution = testCase.execution;
2261
2273
  if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
@@ -4556,6 +4568,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
4556
4568
  tests,
4557
4569
  trials: extractTrialsConfig(parsed),
4558
4570
  targets: extractTargetsFromSuite(parsed),
4571
+ workers: extractWorkersFromSuite(parsed),
4559
4572
  cacheConfig: extractCacheConfig(parsed),
4560
4573
  totalBudgetUsd: extractTotalBudgetUsd(parsed),
4561
4574
  ...metadata !== void 0 && { metadata },
@@ -13344,8 +13357,7 @@ ${context2.fileChanges}`;
13344
13357
  }
13345
13358
  const evaluatorRawRequest = {
13346
13359
  userPrompt,
13347
- systemPrompt,
13348
- target: graderProvider.targetName
13360
+ systemPrompt
13349
13361
  };
13350
13362
  try {
13351
13363
  const { data, tokenUsage } = await this.runWithRetry({
@@ -13363,6 +13375,7 @@ ${context2.fileChanges}`;
13363
13375
  assertions,
13364
13376
  expectedAspectCount: Math.max(assertions.length, 1),
13365
13377
  evaluatorRawRequest,
13378
+ graderTarget: graderProvider.targetName,
13366
13379
  tokenUsage
13367
13380
  };
13368
13381
  } catch (e) {
@@ -13374,7 +13387,8 @@ ${context2.fileChanges}`;
13374
13387
  verdict: "skip",
13375
13388
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
13376
13389
  expectedAspectCount: 1,
13377
- evaluatorRawRequest
13390
+ evaluatorRawRequest,
13391
+ graderTarget: graderProvider.targetName
13378
13392
  };
13379
13393
  }
13380
13394
  }
@@ -13392,8 +13406,7 @@ ${context2.fileChanges}`;
13392
13406
  const systemPrompt = buildRubricOutputSchema();
13393
13407
  const evaluatorRawRequest = {
13394
13408
  userPrompt: prompt,
13395
- systemPrompt,
13396
- target: graderProvider.targetName
13409
+ systemPrompt
13397
13410
  };
13398
13411
  try {
13399
13412
  const { data, tokenUsage } = await this.runWithRetry({
@@ -13410,6 +13423,7 @@ ${context2.fileChanges}`;
13410
13423
  assertions,
13411
13424
  expectedAspectCount: rubrics.length,
13412
13425
  evaluatorRawRequest,
13426
+ graderTarget: graderProvider.targetName,
13413
13427
  tokenUsage
13414
13428
  };
13415
13429
  } catch (e) {
@@ -13421,7 +13435,8 @@ ${context2.fileChanges}`;
13421
13435
  verdict: "skip",
13422
13436
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
13423
13437
  expectedAspectCount: rubrics.length,
13424
- evaluatorRawRequest
13438
+ evaluatorRawRequest,
13439
+ graderTarget: graderProvider.targetName
13425
13440
  };
13426
13441
  }
13427
13442
  }
@@ -13434,8 +13449,7 @@ ${context2.fileChanges}`;
13434
13449
  const systemPrompt = buildScoreRangeOutputSchema();
13435
13450
  const evaluatorRawRequest = {
13436
13451
  userPrompt: prompt,
13437
- systemPrompt,
13438
- target: graderProvider.targetName
13452
+ systemPrompt
13439
13453
  };
13440
13454
  try {
13441
13455
  const { data, tokenUsage } = await this.runWithRetry({
@@ -13452,6 +13466,7 @@ ${context2.fileChanges}`;
13452
13466
  assertions,
13453
13467
  expectedAspectCount: rubrics.length,
13454
13468
  evaluatorRawRequest,
13469
+ graderTarget: graderProvider.targetName,
13455
13470
  details,
13456
13471
  tokenUsage
13457
13472
  };
@@ -13464,7 +13479,8 @@ ${context2.fileChanges}`;
13464
13479
  verdict: "skip",
13465
13480
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
13466
13481
  expectedAspectCount: rubrics.length,
13467
- evaluatorRawRequest
13482
+ evaluatorRawRequest,
13483
+ graderTarget: graderProvider.targetName
13468
13484
  };
13469
13485
  }
13470
13486
  }
@@ -13496,7 +13512,6 @@ ${context2.fileChanges}`;
13496
13512
  mode: "built-in",
13497
13513
  systemPrompt,
13498
13514
  userPrompt,
13499
- target: graderProvider.targetName,
13500
13515
  maxSteps: this.maxSteps
13501
13516
  };
13502
13517
  try {
@@ -13514,7 +13529,13 @@ ${context2.fileChanges}`;
13514
13529
  steps: steps.length,
13515
13530
  tool_calls: toolCallCount
13516
13531
  };
13517
- return this.parseAgentResult(text, rubrics, evaluatorRawRequest, details);
13532
+ return this.parseAgentResult(
13533
+ text,
13534
+ rubrics,
13535
+ evaluatorRawRequest,
13536
+ details,
13537
+ graderProvider.targetName
13538
+ );
13518
13539
  } catch (error) {
13519
13540
  const message = error instanceof Error ? error.message : String(error);
13520
13541
  return {
@@ -13523,6 +13544,7 @@ ${context2.fileChanges}`;
13523
13544
  assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
13524
13545
  expectedAspectCount: 1,
13525
13546
  evaluatorRawRequest,
13547
+ graderTarget: graderProvider.targetName,
13526
13548
  details: { mode: "built-in", error: message }
13527
13549
  };
13528
13550
  }
@@ -13575,6 +13597,7 @@ ${context2.fileChanges}`;
13575
13597
  ],
13576
13598
  expectedAspectCount: 1,
13577
13599
  evaluatorRawRequest,
13600
+ graderTarget: provider.targetName,
13578
13601
  details: { mode: modeLabel, grader_target: provider.targetName }
13579
13602
  };
13580
13603
  }
@@ -13584,7 +13607,13 @@ ${context2.fileChanges}`;
13584
13607
  mode: modeLabel,
13585
13608
  grader_target: provider.targetName
13586
13609
  };
13587
- return this.parseAgentResult(assistantContent, rubrics, evaluatorRawRequest, details);
13610
+ return this.parseAgentResult(
13611
+ assistantContent,
13612
+ rubrics,
13613
+ evaluatorRawRequest,
13614
+ details,
13615
+ provider.targetName
13616
+ );
13588
13617
  } catch (error) {
13589
13618
  const message = error instanceof Error ? error.message : String(error);
13590
13619
  return {
@@ -13595,6 +13624,7 @@ ${context2.fileChanges}`;
13595
13624
  ],
13596
13625
  expectedAspectCount: 1,
13597
13626
  evaluatorRawRequest,
13627
+ graderTarget: provider.targetName,
13598
13628
  details: {
13599
13629
  mode: modeLabel,
13600
13630
  grader_target: provider.targetName,
@@ -13739,7 +13769,7 @@ ${outputSchema}`;
13739
13769
  * Parse the agent's response text into an EvaluationScore.
13740
13770
  * Supports both freeform and rubric modes.
13741
13771
  */
13742
- parseAgentResult(text, rubrics, evaluatorRawRequest, details) {
13772
+ parseAgentResult(text, rubrics, evaluatorRawRequest, details, graderTarget) {
13743
13773
  try {
13744
13774
  const parsed = parseJsonFromText(text);
13745
13775
  if (rubrics && rubrics.length > 0) {
@@ -13751,6 +13781,7 @@ ${outputSchema}`;
13751
13781
  assertions: assertions2,
13752
13782
  expectedAspectCount: rubrics.length,
13753
13783
  evaluatorRawRequest,
13784
+ graderTarget,
13754
13785
  details
13755
13786
  };
13756
13787
  }
@@ -13763,6 +13794,7 @@ ${outputSchema}`;
13763
13794
  assertions,
13764
13795
  expectedAspectCount: Math.max(assertions.length, 1),
13765
13796
  evaluatorRawRequest,
13797
+ graderTarget,
13766
13798
  details
13767
13799
  };
13768
13800
  } catch {
@@ -13777,6 +13809,7 @@ ${outputSchema}`;
13777
13809
  ],
13778
13810
  expectedAspectCount: 1,
13779
13811
  evaluatorRawRequest,
13812
+ graderTarget,
13780
13813
  details
13781
13814
  };
13782
13815
  }
@@ -17790,14 +17823,22 @@ async function runEvaluation(options) {
17790
17823
  const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
17791
17824
  const resolvedRetainOnSuccess = retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
17792
17825
  const resolvedRetainOnFailure = retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
17793
- const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
17794
- const workers = hasSharedWorkspace && !usePool ? 1 : requestedWorkers;
17826
+ const workers = options.maxConcurrency ?? target.workers ?? 1;
17795
17827
  setupLog(
17796
- `sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} requestedWorkers=${requestedWorkers} effectiveWorkers=${workers}`
17828
+ `sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} workers=${workers}`
17797
17829
  );
17798
- if (hasSharedWorkspace && !usePool && requestedWorkers > 1) {
17830
+ if (hasSharedWorkspace && !usePool && workers > 1) {
17799
17831
  console.warn(
17800
- `Warning: Shared workspace requires sequential execution. Overriding workers from ${requestedWorkers} to 1.`
17832
+ [
17833
+ `Warning: This eval uses a shared workspace with ${workers} workers.`,
17834
+ "If the agent under test makes file edits, concurrent runs may corrupt each other.",
17835
+ "To limit concurrency, add this to your eval YAML:",
17836
+ "",
17837
+ " execution:",
17838
+ " workers: 1",
17839
+ "",
17840
+ "Or pass --workers 1 on the command line."
17841
+ ].join("\n")
17801
17842
  );
17802
17843
  }
17803
17844
  const limit = pLimit(workers);
@@ -18080,7 +18121,8 @@ async function runEvaluation(options) {
18080
18121
  streamCallbacks,
18081
18122
  typeRegistry,
18082
18123
  repoManager,
18083
- evalDir
18124
+ evalDir,
18125
+ verbose
18084
18126
  };
18085
18127
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
18086
18128
  if (totalBudgetUsd !== void 0) {
@@ -18160,7 +18202,8 @@ async function runEvaluation(options) {
18160
18202
  promptInputs,
18161
18203
  primaryProvider,
18162
18204
  "agent",
18163
- "provider_error"
18205
+ "provider_error",
18206
+ verbose
18164
18207
  );
18165
18208
  results.push(errorResult);
18166
18209
  if (onResult) {
@@ -18233,6 +18276,7 @@ async function runBatchEvaluation(options) {
18233
18276
  nowFn,
18234
18277
  onProgress,
18235
18278
  onResult,
18279
+ verbose,
18236
18280
  resolveGraderProvider,
18237
18281
  agentTimeoutMs,
18238
18282
  targetResolver,
@@ -18320,7 +18364,8 @@ async function runBatchEvaluation(options) {
18320
18364
  startTime,
18321
18365
  endTime,
18322
18366
  targetResolver,
18323
- availableTargets
18367
+ availableTargets,
18368
+ verbose
18324
18369
  });
18325
18370
  if (providerError) {
18326
18371
  result = {
@@ -18341,7 +18386,8 @@ async function runBatchEvaluation(options) {
18341
18386
  promptInputs,
18342
18387
  provider,
18343
18388
  "evaluator",
18344
- "evaluator_error"
18389
+ "evaluator_error",
18390
+ verbose
18345
18391
  );
18346
18392
  results.push(errorResult);
18347
18393
  if (onResult) {
@@ -18404,7 +18450,8 @@ async function runEvalCase(options) {
18404
18450
  suiteWorkspaceFile,
18405
18451
  typeRegistry: providedTypeRegistry,
18406
18452
  repoManager,
18407
- evalDir
18453
+ evalDir,
18454
+ verbose
18408
18455
  } = options;
18409
18456
  const setupDebug = process.env.AGENTV_SETUP_DEBUG === "1";
18410
18457
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -18441,7 +18488,8 @@ async function runEvalCase(options) {
18441
18488
  promptInputs,
18442
18489
  provider,
18443
18490
  "setup",
18444
- "template_error"
18491
+ "template_error",
18492
+ verbose
18445
18493
  );
18446
18494
  }
18447
18495
  if (caseWorkspaceFile && workspacePath) {
@@ -18470,7 +18518,8 @@ async function runEvalCase(options) {
18470
18518
  promptInputs,
18471
18519
  provider,
18472
18520
  "repo_setup",
18473
- "local_path_not_found"
18521
+ "local_path_not_found",
18522
+ verbose
18474
18523
  );
18475
18524
  }
18476
18525
  }
@@ -18496,7 +18545,8 @@ async function runEvalCase(options) {
18496
18545
  promptInputs,
18497
18546
  provider,
18498
18547
  "repo_setup",
18499
- "clone_error"
18548
+ "clone_error",
18549
+ verbose
18500
18550
  );
18501
18551
  }
18502
18552
  }
@@ -18522,7 +18572,8 @@ async function runEvalCase(options) {
18522
18572
  promptInputs,
18523
18573
  provider,
18524
18574
  "setup",
18525
- "file_copy_error"
18575
+ "file_copy_error",
18576
+ verbose
18526
18577
  );
18527
18578
  }
18528
18579
  }
@@ -18567,7 +18618,8 @@ async function runEvalCase(options) {
18567
18618
  promptInputs,
18568
18619
  provider,
18569
18620
  "setup",
18570
- "script_error"
18621
+ "script_error",
18622
+ verbose
18571
18623
  );
18572
18624
  }
18573
18625
  }
@@ -18598,7 +18650,8 @@ async function runEvalCase(options) {
18598
18650
  promptInputs,
18599
18651
  provider,
18600
18652
  "setup",
18601
- "script_error"
18653
+ "script_error",
18654
+ verbose
18602
18655
  );
18603
18656
  }
18604
18657
  }
@@ -18642,7 +18695,8 @@ async function runEvalCase(options) {
18642
18695
  promptInputs,
18643
18696
  provider,
18644
18697
  "agent",
18645
- "provider_error"
18698
+ "provider_error",
18699
+ verbose
18646
18700
  );
18647
18701
  if (workspacePath) {
18648
18702
  if (forceCleanup) {
@@ -18663,7 +18717,8 @@ async function runEvalCase(options) {
18663
18717
  promptInputs,
18664
18718
  provider,
18665
18719
  "agent",
18666
- "provider_error"
18720
+ "provider_error",
18721
+ verbose
18667
18722
  );
18668
18723
  if (workspacePath) {
18669
18724
  if (forceCleanup) {
@@ -18758,7 +18813,8 @@ async function runEvalCase(options) {
18758
18813
  targetResolver,
18759
18814
  availableTargets,
18760
18815
  fileChanges,
18761
- workspacePath
18816
+ workspacePath,
18817
+ verbose
18762
18818
  });
18763
18819
  const totalDurationMs = Date.now() - caseStartMs;
18764
18820
  const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
@@ -18813,7 +18869,8 @@ async function runEvalCase(options) {
18813
18869
  promptInputs,
18814
18870
  provider,
18815
18871
  "evaluator",
18816
- "evaluator_error"
18872
+ "evaluator_error",
18873
+ verbose
18817
18874
  );
18818
18875
  if (workspacePath && !isSharedWorkspace) {
18819
18876
  if (forceCleanup || (retainOnFailure ?? "keep") === "cleanup") {
@@ -18955,7 +19012,7 @@ async function evaluateCandidate(options) {
18955
19012
  let lmRequest;
18956
19013
  if (isAgentProvider(provider)) {
18957
19014
  agentRequest = {
18958
- question: promptInputs.question
19015
+ ...options.verbose ? { input: promptInputs.question } : {}
18959
19016
  };
18960
19017
  } else {
18961
19018
  if (promptInputs.chatPrompt) {
@@ -18969,8 +19026,9 @@ async function evaluateCandidate(options) {
18969
19026
  }
18970
19027
  }
18971
19028
  const evaluatorRequest = scores ? void 0 : score.evaluatorRawRequest;
18972
- const requests = agentRequest || lmRequest || evaluatorRequest ? {
18973
- ...agentRequest ? { agent: agentRequest } : {},
19029
+ const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
19030
+ const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
19031
+ ...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
18974
19032
  ...lmRequest ? { lm: lmRequest } : {},
18975
19033
  ...evaluatorRequest ? { evaluator: evaluatorRequest } : {}
18976
19034
  } : void 0;
@@ -18990,9 +19048,9 @@ async function evaluateCandidate(options) {
18990
19048
  endTime,
18991
19049
  requests,
18992
19050
  input,
19051
+ output: output ?? [{ role: "assistant", content: candidate }],
18993
19052
  scores,
18994
19053
  trace: trace2,
18995
- output: output ?? [{ role: "assistant", content: candidate }],
18996
19054
  fileChanges,
18997
19055
  executionStatus: classifyQualityStatus(score.score)
18998
19056
  };
@@ -19158,6 +19216,7 @@ async function runEvaluatorList(options) {
19158
19216
  verdict: score2.verdict,
19159
19217
  assertions: score2.assertions,
19160
19218
  input: score2.evaluatorRawRequest,
19219
+ target: score2.graderTarget,
19161
19220
  details: score2.details,
19162
19221
  scores: mapChildResults(score2.scores),
19163
19222
  tokenUsage: score2.tokenUsage,
@@ -19297,13 +19356,13 @@ async function invokeProvider(provider, options) {
19297
19356
  }
19298
19357
  }
19299
19358
  }
19300
- function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
19359
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode, verbose) {
19301
19360
  const message = error instanceof Error ? error.message : String(error);
19302
19361
  let agentRequest;
19303
19362
  let lmRequest;
19304
19363
  if (isAgentProvider(provider)) {
19305
19364
  agentRequest = {
19306
- question: promptInputs.question,
19365
+ ...verbose ? { input: promptInputs.question } : {},
19307
19366
  error: message
19308
19367
  };
19309
19368
  } else {
@@ -19331,10 +19390,10 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
19331
19390
  conversationId: evalCase.conversation_id,
19332
19391
  score: 0,
19333
19392
  assertions: [{ text: `Error: ${message}`, passed: false }],
19334
- output: [{ role: "assistant", content: `Error occurred: ${message}` }],
19335
19393
  target: targetName,
19336
19394
  requests,
19337
19395
  input,
19396
+ output: [{ role: "assistant", content: `Error occurred: ${message}` }],
19338
19397
  error: message,
19339
19398
  executionStatus: "execution_error",
19340
19399
  failureStage,
@@ -20408,6 +20467,7 @@ function createAgentKernel() {
20408
20467
  extractTargetsFromSuite,
20409
20468
  extractTargetsFromTestCase,
20410
20469
  extractTrialsConfig,
20470
+ extractWorkersFromSuite,
20411
20471
  fileExists,
20412
20472
  findGitRoot,
20413
20473
  freeformEvaluationSchema,