@agentv/core 3.9.1 → 3.9.2

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1196,6 +1196,8 @@ interface EvaluatorResult {
1196
1196
  readonly assertions: readonly AssertionEntry[];
1197
1197
  readonly rawRequest?: JsonObject;
1198
1198
  readonly input?: JsonObject;
1199
+ /** Target name used for grading (e.g., the LLM provider name). */
1200
+ readonly target?: string;
1199
1201
  readonly scores?: readonly EvaluatorResult[];
1200
1202
  /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
1201
1203
  readonly details?: JsonObject;
@@ -2057,6 +2059,8 @@ interface EvaluationScore {
2057
2059
  readonly details?: JsonObject;
2058
2060
  /** Token usage from LLM calls made by this evaluator (optional). */
2059
2061
  readonly tokenUsage?: TokenUsage;
2062
+ /** Target name used for grading (e.g., the LLM provider). */
2063
+ readonly graderTarget?: string;
2060
2064
  }
2061
2065
  interface ChildEvaluatorResult {
2062
2066
  readonly name: string;
@@ -2660,6 +2664,8 @@ interface RunEvalCaseOptions {
2660
2664
  readonly repoManager?: RepoManager;
2661
2665
  /** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
2662
2666
  readonly evalDir?: string;
2667
+ /** Include verbose request details in results (e.g. agent input text) */
2668
+ readonly verbose?: boolean;
2663
2669
  }
2664
2670
  interface ProgressEvent {
2665
2671
  readonly workerId: number;
package/dist/index.d.ts CHANGED
@@ -1196,6 +1196,8 @@ interface EvaluatorResult {
1196
1196
  readonly assertions: readonly AssertionEntry[];
1197
1197
  readonly rawRequest?: JsonObject;
1198
1198
  readonly input?: JsonObject;
1199
+ /** Target name used for grading (e.g., the LLM provider name). */
1200
+ readonly target?: string;
1199
1201
  readonly scores?: readonly EvaluatorResult[];
1200
1202
  /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
1201
1203
  readonly details?: JsonObject;
@@ -2057,6 +2059,8 @@ interface EvaluationScore {
2057
2059
  readonly details?: JsonObject;
2058
2060
  /** Token usage from LLM calls made by this evaluator (optional). */
2059
2061
  readonly tokenUsage?: TokenUsage;
2062
+ /** Target name used for grading (e.g., the LLM provider). */
2063
+ readonly graderTarget?: string;
2060
2064
  }
2061
2065
  interface ChildEvaluatorResult {
2062
2066
  readonly name: string;
@@ -2660,6 +2664,8 @@ interface RunEvalCaseOptions {
2660
2664
  readonly repoManager?: RepoManager;
2661
2665
  /** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
2662
2666
  readonly evalDir?: string;
2667
+ /** Include verbose request details in results (e.g. agent input text) */
2668
+ readonly verbose?: boolean;
2663
2669
  }
2664
2670
  interface ProgressEvent {
2665
2671
  readonly workerId: number;
package/dist/index.js CHANGED
@@ -19,7 +19,7 @@ import {
19
19
  readTextFile,
20
20
  resolveFileReference,
21
21
  resolveTargetDefinition
22
- } from "./chunk-PC5TLJF6.js";
22
+ } from "./chunk-K7JCJIXA.js";
23
23
  import {
24
24
  AgentvProvider
25
25
  } from "./chunk-W5YDZWT4.js";
@@ -10172,8 +10172,7 @@ ${context.fileChanges}`;
10172
10172
  }
10173
10173
  const evaluatorRawRequest = {
10174
10174
  userPrompt,
10175
- systemPrompt,
10176
- target: graderProvider.targetName
10175
+ systemPrompt
10177
10176
  };
10178
10177
  try {
10179
10178
  const { data, tokenUsage } = await this.runWithRetry({
@@ -10191,6 +10190,7 @@ ${context.fileChanges}`;
10191
10190
  assertions,
10192
10191
  expectedAspectCount: Math.max(assertions.length, 1),
10193
10192
  evaluatorRawRequest,
10193
+ graderTarget: graderProvider.targetName,
10194
10194
  tokenUsage
10195
10195
  };
10196
10196
  } catch (e) {
@@ -10202,7 +10202,8 @@ ${context.fileChanges}`;
10202
10202
  verdict: "skip",
10203
10203
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
10204
10204
  expectedAspectCount: 1,
10205
- evaluatorRawRequest
10205
+ evaluatorRawRequest,
10206
+ graderTarget: graderProvider.targetName
10206
10207
  };
10207
10208
  }
10208
10209
  }
@@ -10220,8 +10221,7 @@ ${context.fileChanges}`;
10220
10221
  const systemPrompt = buildRubricOutputSchema();
10221
10222
  const evaluatorRawRequest = {
10222
10223
  userPrompt: prompt,
10223
- systemPrompt,
10224
- target: graderProvider.targetName
10224
+ systemPrompt
10225
10225
  };
10226
10226
  try {
10227
10227
  const { data, tokenUsage } = await this.runWithRetry({
@@ -10238,6 +10238,7 @@ ${context.fileChanges}`;
10238
10238
  assertions,
10239
10239
  expectedAspectCount: rubrics.length,
10240
10240
  evaluatorRawRequest,
10241
+ graderTarget: graderProvider.targetName,
10241
10242
  tokenUsage
10242
10243
  };
10243
10244
  } catch (e) {
@@ -10249,7 +10250,8 @@ ${context.fileChanges}`;
10249
10250
  verdict: "skip",
10250
10251
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
10251
10252
  expectedAspectCount: rubrics.length,
10252
- evaluatorRawRequest
10253
+ evaluatorRawRequest,
10254
+ graderTarget: graderProvider.targetName
10253
10255
  };
10254
10256
  }
10255
10257
  }
@@ -10262,8 +10264,7 @@ ${context.fileChanges}`;
10262
10264
  const systemPrompt = buildScoreRangeOutputSchema();
10263
10265
  const evaluatorRawRequest = {
10264
10266
  userPrompt: prompt,
10265
- systemPrompt,
10266
- target: graderProvider.targetName
10267
+ systemPrompt
10267
10268
  };
10268
10269
  try {
10269
10270
  const { data, tokenUsage } = await this.runWithRetry({
@@ -10280,6 +10281,7 @@ ${context.fileChanges}`;
10280
10281
  assertions,
10281
10282
  expectedAspectCount: rubrics.length,
10282
10283
  evaluatorRawRequest,
10284
+ graderTarget: graderProvider.targetName,
10283
10285
  details,
10284
10286
  tokenUsage
10285
10287
  };
@@ -10292,7 +10294,8 @@ ${context.fileChanges}`;
10292
10294
  verdict: "skip",
10293
10295
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
10294
10296
  expectedAspectCount: rubrics.length,
10295
- evaluatorRawRequest
10297
+ evaluatorRawRequest,
10298
+ graderTarget: graderProvider.targetName
10296
10299
  };
10297
10300
  }
10298
10301
  }
@@ -10324,7 +10327,6 @@ ${context.fileChanges}`;
10324
10327
  mode: "built-in",
10325
10328
  systemPrompt,
10326
10329
  userPrompt,
10327
- target: graderProvider.targetName,
10328
10330
  maxSteps: this.maxSteps
10329
10331
  };
10330
10332
  try {
@@ -10342,7 +10344,13 @@ ${context.fileChanges}`;
10342
10344
  steps: steps.length,
10343
10345
  tool_calls: toolCallCount
10344
10346
  };
10345
- return this.parseAgentResult(text, rubrics, evaluatorRawRequest, details);
10347
+ return this.parseAgentResult(
10348
+ text,
10349
+ rubrics,
10350
+ evaluatorRawRequest,
10351
+ details,
10352
+ graderProvider.targetName
10353
+ );
10346
10354
  } catch (error) {
10347
10355
  const message = error instanceof Error ? error.message : String(error);
10348
10356
  return {
@@ -10351,6 +10359,7 @@ ${context.fileChanges}`;
10351
10359
  assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
10352
10360
  expectedAspectCount: 1,
10353
10361
  evaluatorRawRequest,
10362
+ graderTarget: graderProvider.targetName,
10354
10363
  details: { mode: "built-in", error: message }
10355
10364
  };
10356
10365
  }
@@ -10403,6 +10412,7 @@ ${context.fileChanges}`;
10403
10412
  ],
10404
10413
  expectedAspectCount: 1,
10405
10414
  evaluatorRawRequest,
10415
+ graderTarget: provider.targetName,
10406
10416
  details: { mode: modeLabel, grader_target: provider.targetName }
10407
10417
  };
10408
10418
  }
@@ -10412,7 +10422,13 @@ ${context.fileChanges}`;
10412
10422
  mode: modeLabel,
10413
10423
  grader_target: provider.targetName
10414
10424
  };
10415
- return this.parseAgentResult(assistantContent, rubrics, evaluatorRawRequest, details);
10425
+ return this.parseAgentResult(
10426
+ assistantContent,
10427
+ rubrics,
10428
+ evaluatorRawRequest,
10429
+ details,
10430
+ provider.targetName
10431
+ );
10416
10432
  } catch (error) {
10417
10433
  const message = error instanceof Error ? error.message : String(error);
10418
10434
  return {
@@ -10423,6 +10439,7 @@ ${context.fileChanges}`;
10423
10439
  ],
10424
10440
  expectedAspectCount: 1,
10425
10441
  evaluatorRawRequest,
10442
+ graderTarget: provider.targetName,
10426
10443
  details: {
10427
10444
  mode: modeLabel,
10428
10445
  grader_target: provider.targetName,
@@ -10567,7 +10584,7 @@ ${outputSchema}`;
10567
10584
  * Parse the agent's response text into an EvaluationScore.
10568
10585
  * Supports both freeform and rubric modes.
10569
10586
  */
10570
- parseAgentResult(text, rubrics, evaluatorRawRequest, details) {
10587
+ parseAgentResult(text, rubrics, evaluatorRawRequest, details, graderTarget) {
10571
10588
  try {
10572
10589
  const parsed = parseJsonFromText(text);
10573
10590
  if (rubrics && rubrics.length > 0) {
@@ -10579,6 +10596,7 @@ ${outputSchema}`;
10579
10596
  assertions: assertions2,
10580
10597
  expectedAspectCount: rubrics.length,
10581
10598
  evaluatorRawRequest,
10599
+ graderTarget,
10582
10600
  details
10583
10601
  };
10584
10602
  }
@@ -10591,6 +10609,7 @@ ${outputSchema}`;
10591
10609
  assertions,
10592
10610
  expectedAspectCount: Math.max(assertions.length, 1),
10593
10611
  evaluatorRawRequest,
10612
+ graderTarget,
10594
10613
  details
10595
10614
  };
10596
10615
  } catch {
@@ -10605,6 +10624,7 @@ ${outputSchema}`;
10605
10624
  ],
10606
10625
  expectedAspectCount: 1,
10607
10626
  evaluatorRawRequest,
10627
+ graderTarget,
10608
10628
  details
10609
10629
  };
10610
10630
  }
@@ -14916,7 +14936,8 @@ async function runEvaluation(options) {
14916
14936
  streamCallbacks,
14917
14937
  typeRegistry,
14918
14938
  repoManager,
14919
- evalDir
14939
+ evalDir,
14940
+ verbose
14920
14941
  };
14921
14942
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
14922
14943
  if (totalBudgetUsd !== void 0) {
@@ -14996,7 +15017,8 @@ async function runEvaluation(options) {
14996
15017
  promptInputs,
14997
15018
  primaryProvider,
14998
15019
  "agent",
14999
- "provider_error"
15020
+ "provider_error",
15021
+ verbose
15000
15022
  );
15001
15023
  results.push(errorResult);
15002
15024
  if (onResult) {
@@ -15069,6 +15091,7 @@ async function runBatchEvaluation(options) {
15069
15091
  nowFn,
15070
15092
  onProgress,
15071
15093
  onResult,
15094
+ verbose,
15072
15095
  resolveGraderProvider,
15073
15096
  agentTimeoutMs,
15074
15097
  targetResolver,
@@ -15156,7 +15179,8 @@ async function runBatchEvaluation(options) {
15156
15179
  startTime,
15157
15180
  endTime,
15158
15181
  targetResolver,
15159
- availableTargets
15182
+ availableTargets,
15183
+ verbose
15160
15184
  });
15161
15185
  if (providerError) {
15162
15186
  result = {
@@ -15177,7 +15201,8 @@ async function runBatchEvaluation(options) {
15177
15201
  promptInputs,
15178
15202
  provider,
15179
15203
  "evaluator",
15180
- "evaluator_error"
15204
+ "evaluator_error",
15205
+ verbose
15181
15206
  );
15182
15207
  results.push(errorResult);
15183
15208
  if (onResult) {
@@ -15240,7 +15265,8 @@ async function runEvalCase(options) {
15240
15265
  suiteWorkspaceFile,
15241
15266
  typeRegistry: providedTypeRegistry,
15242
15267
  repoManager,
15243
- evalDir
15268
+ evalDir,
15269
+ verbose
15244
15270
  } = options;
15245
15271
  const setupDebug = process.env.AGENTV_SETUP_DEBUG === "1";
15246
15272
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -15277,7 +15303,8 @@ async function runEvalCase(options) {
15277
15303
  promptInputs,
15278
15304
  provider,
15279
15305
  "setup",
15280
- "template_error"
15306
+ "template_error",
15307
+ verbose
15281
15308
  );
15282
15309
  }
15283
15310
  if (caseWorkspaceFile && workspacePath) {
@@ -15306,7 +15333,8 @@ async function runEvalCase(options) {
15306
15333
  promptInputs,
15307
15334
  provider,
15308
15335
  "repo_setup",
15309
- "local_path_not_found"
15336
+ "local_path_not_found",
15337
+ verbose
15310
15338
  );
15311
15339
  }
15312
15340
  }
@@ -15332,7 +15360,8 @@ async function runEvalCase(options) {
15332
15360
  promptInputs,
15333
15361
  provider,
15334
15362
  "repo_setup",
15335
- "clone_error"
15363
+ "clone_error",
15364
+ verbose
15336
15365
  );
15337
15366
  }
15338
15367
  }
@@ -15358,7 +15387,8 @@ async function runEvalCase(options) {
15358
15387
  promptInputs,
15359
15388
  provider,
15360
15389
  "setup",
15361
- "file_copy_error"
15390
+ "file_copy_error",
15391
+ verbose
15362
15392
  );
15363
15393
  }
15364
15394
  }
@@ -15403,7 +15433,8 @@ async function runEvalCase(options) {
15403
15433
  promptInputs,
15404
15434
  provider,
15405
15435
  "setup",
15406
- "script_error"
15436
+ "script_error",
15437
+ verbose
15407
15438
  );
15408
15439
  }
15409
15440
  }
@@ -15434,7 +15465,8 @@ async function runEvalCase(options) {
15434
15465
  promptInputs,
15435
15466
  provider,
15436
15467
  "setup",
15437
- "script_error"
15468
+ "script_error",
15469
+ verbose
15438
15470
  );
15439
15471
  }
15440
15472
  }
@@ -15478,7 +15510,8 @@ async function runEvalCase(options) {
15478
15510
  promptInputs,
15479
15511
  provider,
15480
15512
  "agent",
15481
- "provider_error"
15513
+ "provider_error",
15514
+ verbose
15482
15515
  );
15483
15516
  if (workspacePath) {
15484
15517
  if (forceCleanup) {
@@ -15499,7 +15532,8 @@ async function runEvalCase(options) {
15499
15532
  promptInputs,
15500
15533
  provider,
15501
15534
  "agent",
15502
- "provider_error"
15535
+ "provider_error",
15536
+ verbose
15503
15537
  );
15504
15538
  if (workspacePath) {
15505
15539
  if (forceCleanup) {
@@ -15594,7 +15628,8 @@ async function runEvalCase(options) {
15594
15628
  targetResolver,
15595
15629
  availableTargets,
15596
15630
  fileChanges,
15597
- workspacePath
15631
+ workspacePath,
15632
+ verbose
15598
15633
  });
15599
15634
  const totalDurationMs = Date.now() - caseStartMs;
15600
15635
  const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
@@ -15649,7 +15684,8 @@ async function runEvalCase(options) {
15649
15684
  promptInputs,
15650
15685
  provider,
15651
15686
  "evaluator",
15652
- "evaluator_error"
15687
+ "evaluator_error",
15688
+ verbose
15653
15689
  );
15654
15690
  if (workspacePath && !isSharedWorkspace) {
15655
15691
  if (forceCleanup || (retainOnFailure ?? "keep") === "cleanup") {
@@ -15791,7 +15827,7 @@ async function evaluateCandidate(options) {
15791
15827
  let lmRequest;
15792
15828
  if (isAgentProvider(provider)) {
15793
15829
  agentRequest = {
15794
- question: promptInputs.question
15830
+ ...options.verbose ? { input: promptInputs.question } : {}
15795
15831
  };
15796
15832
  } else {
15797
15833
  if (promptInputs.chatPrompt) {
@@ -15805,8 +15841,9 @@ async function evaluateCandidate(options) {
15805
15841
  }
15806
15842
  }
15807
15843
  const evaluatorRequest = scores ? void 0 : score.evaluatorRawRequest;
15808
- const requests = agentRequest || lmRequest || evaluatorRequest ? {
15809
- ...agentRequest ? { agent: agentRequest } : {},
15844
+ const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
15845
+ const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
15846
+ ...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
15810
15847
  ...lmRequest ? { lm: lmRequest } : {},
15811
15848
  ...evaluatorRequest ? { evaluator: evaluatorRequest } : {}
15812
15849
  } : void 0;
@@ -15826,9 +15863,9 @@ async function evaluateCandidate(options) {
15826
15863
  endTime,
15827
15864
  requests,
15828
15865
  input,
15866
+ output: output ?? [{ role: "assistant", content: candidate }],
15829
15867
  scores,
15830
15868
  trace,
15831
- output: output ?? [{ role: "assistant", content: candidate }],
15832
15869
  fileChanges,
15833
15870
  executionStatus: classifyQualityStatus(score.score)
15834
15871
  };
@@ -15994,6 +16031,7 @@ async function runEvaluatorList(options) {
15994
16031
  verdict: score2.verdict,
15995
16032
  assertions: score2.assertions,
15996
16033
  input: score2.evaluatorRawRequest,
16034
+ target: score2.graderTarget,
15997
16035
  details: score2.details,
15998
16036
  scores: mapChildResults(score2.scores),
15999
16037
  tokenUsage: score2.tokenUsage,
@@ -16133,13 +16171,13 @@ async function invokeProvider(provider, options) {
16133
16171
  }
16134
16172
  }
16135
16173
  }
16136
- function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
16174
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode, verbose) {
16137
16175
  const message = error instanceof Error ? error.message : String(error);
16138
16176
  let agentRequest;
16139
16177
  let lmRequest;
16140
16178
  if (isAgentProvider(provider)) {
16141
16179
  agentRequest = {
16142
- question: promptInputs.question,
16180
+ ...verbose ? { input: promptInputs.question } : {},
16143
16181
  error: message
16144
16182
  };
16145
16183
  } else {
@@ -16167,10 +16205,10 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
16167
16205
  conversationId: evalCase.conversation_id,
16168
16206
  score: 0,
16169
16207
  assertions: [{ text: `Error: ${message}`, passed: false }],
16170
- output: [{ role: "assistant", content: `Error occurred: ${message}` }],
16171
16208
  target: targetName,
16172
16209
  requests,
16173
16210
  input,
16211
+ output: [{ role: "assistant", content: `Error occurred: ${message}` }],
16174
16212
  error: message,
16175
16213
  executionStatus: "execution_error",
16176
16214
  failureStage,