@agentv/core 3.9.1 → 3.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ import {
8
8
  isEvaluatorKind,
9
9
  loadCasesFromFile,
10
10
  resolveFileReference
11
- } from "../../chunk-PC5TLJF6.js";
11
+ } from "../../chunk-K7JCJIXA.js";
12
12
 
13
13
  // src/evaluation/validation/file-type.ts
14
14
  import { readFile } from "node:fs/promises";
package/dist/index.cjs CHANGED
@@ -13357,8 +13357,7 @@ ${context2.fileChanges}`;
13357
13357
  }
13358
13358
  const evaluatorRawRequest = {
13359
13359
  userPrompt,
13360
- systemPrompt,
13361
- target: graderProvider.targetName
13360
+ systemPrompt
13362
13361
  };
13363
13362
  try {
13364
13363
  const { data, tokenUsage } = await this.runWithRetry({
@@ -13376,6 +13375,7 @@ ${context2.fileChanges}`;
13376
13375
  assertions,
13377
13376
  expectedAspectCount: Math.max(assertions.length, 1),
13378
13377
  evaluatorRawRequest,
13378
+ graderTarget: graderProvider.targetName,
13379
13379
  tokenUsage
13380
13380
  };
13381
13381
  } catch (e) {
@@ -13387,7 +13387,8 @@ ${context2.fileChanges}`;
13387
13387
  verdict: "skip",
13388
13388
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
13389
13389
  expectedAspectCount: 1,
13390
- evaluatorRawRequest
13390
+ evaluatorRawRequest,
13391
+ graderTarget: graderProvider.targetName
13391
13392
  };
13392
13393
  }
13393
13394
  }
@@ -13405,8 +13406,7 @@ ${context2.fileChanges}`;
13405
13406
  const systemPrompt = buildRubricOutputSchema();
13406
13407
  const evaluatorRawRequest = {
13407
13408
  userPrompt: prompt,
13408
- systemPrompt,
13409
- target: graderProvider.targetName
13409
+ systemPrompt
13410
13410
  };
13411
13411
  try {
13412
13412
  const { data, tokenUsage } = await this.runWithRetry({
@@ -13423,6 +13423,7 @@ ${context2.fileChanges}`;
13423
13423
  assertions,
13424
13424
  expectedAspectCount: rubrics.length,
13425
13425
  evaluatorRawRequest,
13426
+ graderTarget: graderProvider.targetName,
13426
13427
  tokenUsage
13427
13428
  };
13428
13429
  } catch (e) {
@@ -13434,7 +13435,8 @@ ${context2.fileChanges}`;
13434
13435
  verdict: "skip",
13435
13436
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
13436
13437
  expectedAspectCount: rubrics.length,
13437
- evaluatorRawRequest
13438
+ evaluatorRawRequest,
13439
+ graderTarget: graderProvider.targetName
13438
13440
  };
13439
13441
  }
13440
13442
  }
@@ -13447,8 +13449,7 @@ ${context2.fileChanges}`;
13447
13449
  const systemPrompt = buildScoreRangeOutputSchema();
13448
13450
  const evaluatorRawRequest = {
13449
13451
  userPrompt: prompt,
13450
- systemPrompt,
13451
- target: graderProvider.targetName
13452
+ systemPrompt
13452
13453
  };
13453
13454
  try {
13454
13455
  const { data, tokenUsage } = await this.runWithRetry({
@@ -13465,6 +13466,7 @@ ${context2.fileChanges}`;
13465
13466
  assertions,
13466
13467
  expectedAspectCount: rubrics.length,
13467
13468
  evaluatorRawRequest,
13469
+ graderTarget: graderProvider.targetName,
13468
13470
  details,
13469
13471
  tokenUsage
13470
13472
  };
@@ -13477,7 +13479,8 @@ ${context2.fileChanges}`;
13477
13479
  verdict: "skip",
13478
13480
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
13479
13481
  expectedAspectCount: rubrics.length,
13480
- evaluatorRawRequest
13482
+ evaluatorRawRequest,
13483
+ graderTarget: graderProvider.targetName
13481
13484
  };
13482
13485
  }
13483
13486
  }
@@ -13509,7 +13512,6 @@ ${context2.fileChanges}`;
13509
13512
  mode: "built-in",
13510
13513
  systemPrompt,
13511
13514
  userPrompt,
13512
- target: graderProvider.targetName,
13513
13515
  maxSteps: this.maxSteps
13514
13516
  };
13515
13517
  try {
@@ -13527,7 +13529,13 @@ ${context2.fileChanges}`;
13527
13529
  steps: steps.length,
13528
13530
  tool_calls: toolCallCount
13529
13531
  };
13530
- return this.parseAgentResult(text, rubrics, evaluatorRawRequest, details);
13532
+ return this.parseAgentResult(
13533
+ text,
13534
+ rubrics,
13535
+ evaluatorRawRequest,
13536
+ details,
13537
+ graderProvider.targetName
13538
+ );
13531
13539
  } catch (error) {
13532
13540
  const message = error instanceof Error ? error.message : String(error);
13533
13541
  return {
@@ -13536,6 +13544,7 @@ ${context2.fileChanges}`;
13536
13544
  assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
13537
13545
  expectedAspectCount: 1,
13538
13546
  evaluatorRawRequest,
13547
+ graderTarget: graderProvider.targetName,
13539
13548
  details: { mode: "built-in", error: message }
13540
13549
  };
13541
13550
  }
@@ -13588,6 +13597,7 @@ ${context2.fileChanges}`;
13588
13597
  ],
13589
13598
  expectedAspectCount: 1,
13590
13599
  evaluatorRawRequest,
13600
+ graderTarget: provider.targetName,
13591
13601
  details: { mode: modeLabel, grader_target: provider.targetName }
13592
13602
  };
13593
13603
  }
@@ -13597,7 +13607,13 @@ ${context2.fileChanges}`;
13597
13607
  mode: modeLabel,
13598
13608
  grader_target: provider.targetName
13599
13609
  };
13600
- return this.parseAgentResult(assistantContent, rubrics, evaluatorRawRequest, details);
13610
+ return this.parseAgentResult(
13611
+ assistantContent,
13612
+ rubrics,
13613
+ evaluatorRawRequest,
13614
+ details,
13615
+ provider.targetName
13616
+ );
13601
13617
  } catch (error) {
13602
13618
  const message = error instanceof Error ? error.message : String(error);
13603
13619
  return {
@@ -13608,6 +13624,7 @@ ${context2.fileChanges}`;
13608
13624
  ],
13609
13625
  expectedAspectCount: 1,
13610
13626
  evaluatorRawRequest,
13627
+ graderTarget: provider.targetName,
13611
13628
  details: {
13612
13629
  mode: modeLabel,
13613
13630
  grader_target: provider.targetName,
@@ -13752,7 +13769,7 @@ ${outputSchema}`;
13752
13769
  * Parse the agent's response text into an EvaluationScore.
13753
13770
  * Supports both freeform and rubric modes.
13754
13771
  */
13755
- parseAgentResult(text, rubrics, evaluatorRawRequest, details) {
13772
+ parseAgentResult(text, rubrics, evaluatorRawRequest, details, graderTarget) {
13756
13773
  try {
13757
13774
  const parsed = parseJsonFromText(text);
13758
13775
  if (rubrics && rubrics.length > 0) {
@@ -13764,6 +13781,7 @@ ${outputSchema}`;
13764
13781
  assertions: assertions2,
13765
13782
  expectedAspectCount: rubrics.length,
13766
13783
  evaluatorRawRequest,
13784
+ graderTarget,
13767
13785
  details
13768
13786
  };
13769
13787
  }
@@ -13776,6 +13794,7 @@ ${outputSchema}`;
13776
13794
  assertions,
13777
13795
  expectedAspectCount: Math.max(assertions.length, 1),
13778
13796
  evaluatorRawRequest,
13797
+ graderTarget,
13779
13798
  details
13780
13799
  };
13781
13800
  } catch {
@@ -13790,6 +13809,7 @@ ${outputSchema}`;
13790
13809
  ],
13791
13810
  expectedAspectCount: 1,
13792
13811
  evaluatorRawRequest,
13812
+ graderTarget,
13793
13813
  details
13794
13814
  };
13795
13815
  }
@@ -18101,7 +18121,8 @@ async function runEvaluation(options) {
18101
18121
  streamCallbacks,
18102
18122
  typeRegistry,
18103
18123
  repoManager,
18104
- evalDir
18124
+ evalDir,
18125
+ verbose
18105
18126
  };
18106
18127
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
18107
18128
  if (totalBudgetUsd !== void 0) {
@@ -18181,7 +18202,8 @@ async function runEvaluation(options) {
18181
18202
  promptInputs,
18182
18203
  primaryProvider,
18183
18204
  "agent",
18184
- "provider_error"
18205
+ "provider_error",
18206
+ verbose
18185
18207
  );
18186
18208
  results.push(errorResult);
18187
18209
  if (onResult) {
@@ -18254,6 +18276,7 @@ async function runBatchEvaluation(options) {
18254
18276
  nowFn,
18255
18277
  onProgress,
18256
18278
  onResult,
18279
+ verbose,
18257
18280
  resolveGraderProvider,
18258
18281
  agentTimeoutMs,
18259
18282
  targetResolver,
@@ -18341,7 +18364,8 @@ async function runBatchEvaluation(options) {
18341
18364
  startTime,
18342
18365
  endTime,
18343
18366
  targetResolver,
18344
- availableTargets
18367
+ availableTargets,
18368
+ verbose
18345
18369
  });
18346
18370
  if (providerError) {
18347
18371
  result = {
@@ -18362,7 +18386,8 @@ async function runBatchEvaluation(options) {
18362
18386
  promptInputs,
18363
18387
  provider,
18364
18388
  "evaluator",
18365
- "evaluator_error"
18389
+ "evaluator_error",
18390
+ verbose
18366
18391
  );
18367
18392
  results.push(errorResult);
18368
18393
  if (onResult) {
@@ -18425,7 +18450,8 @@ async function runEvalCase(options) {
18425
18450
  suiteWorkspaceFile,
18426
18451
  typeRegistry: providedTypeRegistry,
18427
18452
  repoManager,
18428
- evalDir
18453
+ evalDir,
18454
+ verbose
18429
18455
  } = options;
18430
18456
  const setupDebug = process.env.AGENTV_SETUP_DEBUG === "1";
18431
18457
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -18462,7 +18488,8 @@ async function runEvalCase(options) {
18462
18488
  promptInputs,
18463
18489
  provider,
18464
18490
  "setup",
18465
- "template_error"
18491
+ "template_error",
18492
+ verbose
18466
18493
  );
18467
18494
  }
18468
18495
  if (caseWorkspaceFile && workspacePath) {
@@ -18491,7 +18518,8 @@ async function runEvalCase(options) {
18491
18518
  promptInputs,
18492
18519
  provider,
18493
18520
  "repo_setup",
18494
- "local_path_not_found"
18521
+ "local_path_not_found",
18522
+ verbose
18495
18523
  );
18496
18524
  }
18497
18525
  }
@@ -18517,7 +18545,8 @@ async function runEvalCase(options) {
18517
18545
  promptInputs,
18518
18546
  provider,
18519
18547
  "repo_setup",
18520
- "clone_error"
18548
+ "clone_error",
18549
+ verbose
18521
18550
  );
18522
18551
  }
18523
18552
  }
@@ -18543,7 +18572,8 @@ async function runEvalCase(options) {
18543
18572
  promptInputs,
18544
18573
  provider,
18545
18574
  "setup",
18546
- "file_copy_error"
18575
+ "file_copy_error",
18576
+ verbose
18547
18577
  );
18548
18578
  }
18549
18579
  }
@@ -18588,7 +18618,8 @@ async function runEvalCase(options) {
18588
18618
  promptInputs,
18589
18619
  provider,
18590
18620
  "setup",
18591
- "script_error"
18621
+ "script_error",
18622
+ verbose
18592
18623
  );
18593
18624
  }
18594
18625
  }
@@ -18619,7 +18650,8 @@ async function runEvalCase(options) {
18619
18650
  promptInputs,
18620
18651
  provider,
18621
18652
  "setup",
18622
- "script_error"
18653
+ "script_error",
18654
+ verbose
18623
18655
  );
18624
18656
  }
18625
18657
  }
@@ -18663,7 +18695,8 @@ async function runEvalCase(options) {
18663
18695
  promptInputs,
18664
18696
  provider,
18665
18697
  "agent",
18666
- "provider_error"
18698
+ "provider_error",
18699
+ verbose
18667
18700
  );
18668
18701
  if (workspacePath) {
18669
18702
  if (forceCleanup) {
@@ -18684,7 +18717,8 @@ async function runEvalCase(options) {
18684
18717
  promptInputs,
18685
18718
  provider,
18686
18719
  "agent",
18687
- "provider_error"
18720
+ "provider_error",
18721
+ verbose
18688
18722
  );
18689
18723
  if (workspacePath) {
18690
18724
  if (forceCleanup) {
@@ -18779,7 +18813,8 @@ async function runEvalCase(options) {
18779
18813
  targetResolver,
18780
18814
  availableTargets,
18781
18815
  fileChanges,
18782
- workspacePath
18816
+ workspacePath,
18817
+ verbose
18783
18818
  });
18784
18819
  const totalDurationMs = Date.now() - caseStartMs;
18785
18820
  const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
@@ -18834,7 +18869,8 @@ async function runEvalCase(options) {
18834
18869
  promptInputs,
18835
18870
  provider,
18836
18871
  "evaluator",
18837
- "evaluator_error"
18872
+ "evaluator_error",
18873
+ verbose
18838
18874
  );
18839
18875
  if (workspacePath && !isSharedWorkspace) {
18840
18876
  if (forceCleanup || (retainOnFailure ?? "keep") === "cleanup") {
@@ -18976,7 +19012,7 @@ async function evaluateCandidate(options) {
18976
19012
  let lmRequest;
18977
19013
  if (isAgentProvider(provider)) {
18978
19014
  agentRequest = {
18979
- question: promptInputs.question
19015
+ ...options.verbose ? { input: promptInputs.question } : {}
18980
19016
  };
18981
19017
  } else {
18982
19018
  if (promptInputs.chatPrompt) {
@@ -18990,8 +19026,9 @@ async function evaluateCandidate(options) {
18990
19026
  }
18991
19027
  }
18992
19028
  const evaluatorRequest = scores ? void 0 : score.evaluatorRawRequest;
18993
- const requests = agentRequest || lmRequest || evaluatorRequest ? {
18994
- ...agentRequest ? { agent: agentRequest } : {},
19029
+ const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
19030
+ const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
19031
+ ...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
18995
19032
  ...lmRequest ? { lm: lmRequest } : {},
18996
19033
  ...evaluatorRequest ? { evaluator: evaluatorRequest } : {}
18997
19034
  } : void 0;
@@ -19011,9 +19048,9 @@ async function evaluateCandidate(options) {
19011
19048
  endTime,
19012
19049
  requests,
19013
19050
  input,
19051
+ output: output ?? [{ role: "assistant", content: candidate }],
19014
19052
  scores,
19015
19053
  trace: trace2,
19016
- output: output ?? [{ role: "assistant", content: candidate }],
19017
19054
  fileChanges,
19018
19055
  executionStatus: classifyQualityStatus(score.score)
19019
19056
  };
@@ -19179,6 +19216,7 @@ async function runEvaluatorList(options) {
19179
19216
  verdict: score2.verdict,
19180
19217
  assertions: score2.assertions,
19181
19218
  input: score2.evaluatorRawRequest,
19219
+ target: score2.graderTarget,
19182
19220
  details: score2.details,
19183
19221
  scores: mapChildResults(score2.scores),
19184
19222
  tokenUsage: score2.tokenUsage,
@@ -19318,13 +19356,13 @@ async function invokeProvider(provider, options) {
19318
19356
  }
19319
19357
  }
19320
19358
  }
19321
- function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
19359
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode, verbose) {
19322
19360
  const message = error instanceof Error ? error.message : String(error);
19323
19361
  let agentRequest;
19324
19362
  let lmRequest;
19325
19363
  if (isAgentProvider(provider)) {
19326
19364
  agentRequest = {
19327
- question: promptInputs.question,
19365
+ ...verbose ? { input: promptInputs.question } : {},
19328
19366
  error: message
19329
19367
  };
19330
19368
  } else {
@@ -19352,10 +19390,10 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
19352
19390
  conversationId: evalCase.conversation_id,
19353
19391
  score: 0,
19354
19392
  assertions: [{ text: `Error: ${message}`, passed: false }],
19355
- output: [{ role: "assistant", content: `Error occurred: ${message}` }],
19356
19393
  target: targetName,
19357
19394
  requests,
19358
19395
  input,
19396
+ output: [{ role: "assistant", content: `Error occurred: ${message}` }],
19359
19397
  error: message,
19360
19398
  executionStatus: "execution_error",
19361
19399
  failureStage,