@agentv/core 3.9.1 → 3.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-PC5TLJF6.js → chunk-K7JCJIXA.js} +1 -1
- package/dist/chunk-K7JCJIXA.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +73 -35
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +74 -36
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-PC5TLJF6.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1196,6 +1196,8 @@ interface EvaluatorResult {
|
|
|
1196
1196
|
readonly assertions: readonly AssertionEntry[];
|
|
1197
1197
|
readonly rawRequest?: JsonObject;
|
|
1198
1198
|
readonly input?: JsonObject;
|
|
1199
|
+
/** Target name used for grading (e.g., the LLM provider name). */
|
|
1200
|
+
readonly target?: string;
|
|
1199
1201
|
readonly scores?: readonly EvaluatorResult[];
|
|
1200
1202
|
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
|
|
1201
1203
|
readonly details?: JsonObject;
|
|
@@ -2057,6 +2059,8 @@ interface EvaluationScore {
|
|
|
2057
2059
|
readonly details?: JsonObject;
|
|
2058
2060
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
2059
2061
|
readonly tokenUsage?: TokenUsage;
|
|
2062
|
+
/** Target name used for grading (e.g., the LLM provider). */
|
|
2063
|
+
readonly graderTarget?: string;
|
|
2060
2064
|
}
|
|
2061
2065
|
interface ChildEvaluatorResult {
|
|
2062
2066
|
readonly name: string;
|
|
@@ -2660,6 +2664,8 @@ interface RunEvalCaseOptions {
|
|
|
2660
2664
|
readonly repoManager?: RepoManager;
|
|
2661
2665
|
/** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
|
|
2662
2666
|
readonly evalDir?: string;
|
|
2667
|
+
/** Include verbose request details in results (e.g. agent input text) */
|
|
2668
|
+
readonly verbose?: boolean;
|
|
2663
2669
|
}
|
|
2664
2670
|
interface ProgressEvent {
|
|
2665
2671
|
readonly workerId: number;
|
package/dist/index.d.ts
CHANGED
|
@@ -1196,6 +1196,8 @@ interface EvaluatorResult {
|
|
|
1196
1196
|
readonly assertions: readonly AssertionEntry[];
|
|
1197
1197
|
readonly rawRequest?: JsonObject;
|
|
1198
1198
|
readonly input?: JsonObject;
|
|
1199
|
+
/** Target name used for grading (e.g., the LLM provider name). */
|
|
1200
|
+
readonly target?: string;
|
|
1199
1201
|
readonly scores?: readonly EvaluatorResult[];
|
|
1200
1202
|
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
|
|
1201
1203
|
readonly details?: JsonObject;
|
|
@@ -2057,6 +2059,8 @@ interface EvaluationScore {
|
|
|
2057
2059
|
readonly details?: JsonObject;
|
|
2058
2060
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
2059
2061
|
readonly tokenUsage?: TokenUsage;
|
|
2062
|
+
/** Target name used for grading (e.g., the LLM provider). */
|
|
2063
|
+
readonly graderTarget?: string;
|
|
2060
2064
|
}
|
|
2061
2065
|
interface ChildEvaluatorResult {
|
|
2062
2066
|
readonly name: string;
|
|
@@ -2660,6 +2664,8 @@ interface RunEvalCaseOptions {
|
|
|
2660
2664
|
readonly repoManager?: RepoManager;
|
|
2661
2665
|
/** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
|
|
2662
2666
|
readonly evalDir?: string;
|
|
2667
|
+
/** Include verbose request details in results (e.g. agent input text) */
|
|
2668
|
+
readonly verbose?: boolean;
|
|
2663
2669
|
}
|
|
2664
2670
|
interface ProgressEvent {
|
|
2665
2671
|
readonly workerId: number;
|
package/dist/index.js
CHANGED
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
readTextFile,
|
|
20
20
|
resolveFileReference,
|
|
21
21
|
resolveTargetDefinition
|
|
22
|
-
} from "./chunk-PC5TLJF6.js";
|
|
22
|
+
} from "./chunk-K7JCJIXA.js";
|
|
23
23
|
import {
|
|
24
24
|
AgentvProvider
|
|
25
25
|
} from "./chunk-W5YDZWT4.js";
|
|
@@ -10172,8 +10172,7 @@ ${context.fileChanges}`;
|
|
|
10172
10172
|
}
|
|
10173
10173
|
const evaluatorRawRequest = {
|
|
10174
10174
|
userPrompt,
|
|
10175
|
-
systemPrompt,
|
|
10176
|
-
target: graderProvider.targetName
|
|
10175
|
+
systemPrompt
|
|
10177
10176
|
};
|
|
10178
10177
|
try {
|
|
10179
10178
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -10191,6 +10190,7 @@ ${context.fileChanges}`;
|
|
|
10191
10190
|
assertions,
|
|
10192
10191
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
10193
10192
|
evaluatorRawRequest,
|
|
10193
|
+
graderTarget: graderProvider.targetName,
|
|
10194
10194
|
tokenUsage
|
|
10195
10195
|
};
|
|
10196
10196
|
} catch (e) {
|
|
@@ -10202,7 +10202,8 @@ ${context.fileChanges}`;
|
|
|
10202
10202
|
verdict: "skip",
|
|
10203
10203
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
10204
10204
|
expectedAspectCount: 1,
|
|
10205
|
-
evaluatorRawRequest
|
|
10205
|
+
evaluatorRawRequest,
|
|
10206
|
+
graderTarget: graderProvider.targetName
|
|
10206
10207
|
};
|
|
10207
10208
|
}
|
|
10208
10209
|
}
|
|
@@ -10220,8 +10221,7 @@ ${context.fileChanges}`;
|
|
|
10220
10221
|
const systemPrompt = buildRubricOutputSchema();
|
|
10221
10222
|
const evaluatorRawRequest = {
|
|
10222
10223
|
userPrompt: prompt,
|
|
10223
|
-
systemPrompt,
|
|
10224
|
-
target: graderProvider.targetName
|
|
10224
|
+
systemPrompt
|
|
10225
10225
|
};
|
|
10226
10226
|
try {
|
|
10227
10227
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -10238,6 +10238,7 @@ ${context.fileChanges}`;
|
|
|
10238
10238
|
assertions,
|
|
10239
10239
|
expectedAspectCount: rubrics.length,
|
|
10240
10240
|
evaluatorRawRequest,
|
|
10241
|
+
graderTarget: graderProvider.targetName,
|
|
10241
10242
|
tokenUsage
|
|
10242
10243
|
};
|
|
10243
10244
|
} catch (e) {
|
|
@@ -10249,7 +10250,8 @@ ${context.fileChanges}`;
|
|
|
10249
10250
|
verdict: "skip",
|
|
10250
10251
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
10251
10252
|
expectedAspectCount: rubrics.length,
|
|
10252
|
-
evaluatorRawRequest
|
|
10253
|
+
evaluatorRawRequest,
|
|
10254
|
+
graderTarget: graderProvider.targetName
|
|
10253
10255
|
};
|
|
10254
10256
|
}
|
|
10255
10257
|
}
|
|
@@ -10262,8 +10264,7 @@ ${context.fileChanges}`;
|
|
|
10262
10264
|
const systemPrompt = buildScoreRangeOutputSchema();
|
|
10263
10265
|
const evaluatorRawRequest = {
|
|
10264
10266
|
userPrompt: prompt,
|
|
10265
|
-
systemPrompt,
|
|
10266
|
-
target: graderProvider.targetName
|
|
10267
|
+
systemPrompt
|
|
10267
10268
|
};
|
|
10268
10269
|
try {
|
|
10269
10270
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -10280,6 +10281,7 @@ ${context.fileChanges}`;
|
|
|
10280
10281
|
assertions,
|
|
10281
10282
|
expectedAspectCount: rubrics.length,
|
|
10282
10283
|
evaluatorRawRequest,
|
|
10284
|
+
graderTarget: graderProvider.targetName,
|
|
10283
10285
|
details,
|
|
10284
10286
|
tokenUsage
|
|
10285
10287
|
};
|
|
@@ -10292,7 +10294,8 @@ ${context.fileChanges}`;
|
|
|
10292
10294
|
verdict: "skip",
|
|
10293
10295
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
10294
10296
|
expectedAspectCount: rubrics.length,
|
|
10295
|
-
evaluatorRawRequest
|
|
10297
|
+
evaluatorRawRequest,
|
|
10298
|
+
graderTarget: graderProvider.targetName
|
|
10296
10299
|
};
|
|
10297
10300
|
}
|
|
10298
10301
|
}
|
|
@@ -10324,7 +10327,6 @@ ${context.fileChanges}`;
|
|
|
10324
10327
|
mode: "built-in",
|
|
10325
10328
|
systemPrompt,
|
|
10326
10329
|
userPrompt,
|
|
10327
|
-
target: graderProvider.targetName,
|
|
10328
10330
|
maxSteps: this.maxSteps
|
|
10329
10331
|
};
|
|
10330
10332
|
try {
|
|
@@ -10342,7 +10344,13 @@ ${context.fileChanges}`;
|
|
|
10342
10344
|
steps: steps.length,
|
|
10343
10345
|
tool_calls: toolCallCount
|
|
10344
10346
|
};
|
|
10345
|
-
return this.parseAgentResult(text, rubrics, evaluatorRawRequest, details);
|
|
10347
|
+
return this.parseAgentResult(
|
|
10348
|
+
text,
|
|
10349
|
+
rubrics,
|
|
10350
|
+
evaluatorRawRequest,
|
|
10351
|
+
details,
|
|
10352
|
+
graderProvider.targetName
|
|
10353
|
+
);
|
|
10346
10354
|
} catch (error) {
|
|
10347
10355
|
const message = error instanceof Error ? error.message : String(error);
|
|
10348
10356
|
return {
|
|
@@ -10351,6 +10359,7 @@ ${context.fileChanges}`;
|
|
|
10351
10359
|
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
10352
10360
|
expectedAspectCount: 1,
|
|
10353
10361
|
evaluatorRawRequest,
|
|
10362
|
+
graderTarget: graderProvider.targetName,
|
|
10354
10363
|
details: { mode: "built-in", error: message }
|
|
10355
10364
|
};
|
|
10356
10365
|
}
|
|
@@ -10403,6 +10412,7 @@ ${context.fileChanges}`;
|
|
|
10403
10412
|
],
|
|
10404
10413
|
expectedAspectCount: 1,
|
|
10405
10414
|
evaluatorRawRequest,
|
|
10415
|
+
graderTarget: provider.targetName,
|
|
10406
10416
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
10407
10417
|
};
|
|
10408
10418
|
}
|
|
@@ -10412,7 +10422,13 @@ ${context.fileChanges}`;
|
|
|
10412
10422
|
mode: modeLabel,
|
|
10413
10423
|
grader_target: provider.targetName
|
|
10414
10424
|
};
|
|
10415
|
-
return this.parseAgentResult(assistantContent, rubrics, evaluatorRawRequest, details);
|
|
10425
|
+
return this.parseAgentResult(
|
|
10426
|
+
assistantContent,
|
|
10427
|
+
rubrics,
|
|
10428
|
+
evaluatorRawRequest,
|
|
10429
|
+
details,
|
|
10430
|
+
provider.targetName
|
|
10431
|
+
);
|
|
10416
10432
|
} catch (error) {
|
|
10417
10433
|
const message = error instanceof Error ? error.message : String(error);
|
|
10418
10434
|
return {
|
|
@@ -10423,6 +10439,7 @@ ${context.fileChanges}`;
|
|
|
10423
10439
|
],
|
|
10424
10440
|
expectedAspectCount: 1,
|
|
10425
10441
|
evaluatorRawRequest,
|
|
10442
|
+
graderTarget: provider.targetName,
|
|
10426
10443
|
details: {
|
|
10427
10444
|
mode: modeLabel,
|
|
10428
10445
|
grader_target: provider.targetName,
|
|
@@ -10567,7 +10584,7 @@ ${outputSchema}`;
|
|
|
10567
10584
|
* Parse the agent's response text into an EvaluationScore.
|
|
10568
10585
|
* Supports both freeform and rubric modes.
|
|
10569
10586
|
*/
|
|
10570
|
-
parseAgentResult(text, rubrics, evaluatorRawRequest, details) {
|
|
10587
|
+
parseAgentResult(text, rubrics, evaluatorRawRequest, details, graderTarget) {
|
|
10571
10588
|
try {
|
|
10572
10589
|
const parsed = parseJsonFromText(text);
|
|
10573
10590
|
if (rubrics && rubrics.length > 0) {
|
|
@@ -10579,6 +10596,7 @@ ${outputSchema}`;
|
|
|
10579
10596
|
assertions: assertions2,
|
|
10580
10597
|
expectedAspectCount: rubrics.length,
|
|
10581
10598
|
evaluatorRawRequest,
|
|
10599
|
+
graderTarget,
|
|
10582
10600
|
details
|
|
10583
10601
|
};
|
|
10584
10602
|
}
|
|
@@ -10591,6 +10609,7 @@ ${outputSchema}`;
|
|
|
10591
10609
|
assertions,
|
|
10592
10610
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
10593
10611
|
evaluatorRawRequest,
|
|
10612
|
+
graderTarget,
|
|
10594
10613
|
details
|
|
10595
10614
|
};
|
|
10596
10615
|
} catch {
|
|
@@ -10605,6 +10624,7 @@ ${outputSchema}`;
|
|
|
10605
10624
|
],
|
|
10606
10625
|
expectedAspectCount: 1,
|
|
10607
10626
|
evaluatorRawRequest,
|
|
10627
|
+
graderTarget,
|
|
10608
10628
|
details
|
|
10609
10629
|
};
|
|
10610
10630
|
}
|
|
@@ -14916,7 +14936,8 @@ async function runEvaluation(options) {
|
|
|
14916
14936
|
streamCallbacks,
|
|
14917
14937
|
typeRegistry,
|
|
14918
14938
|
repoManager,
|
|
14919
|
-
evalDir
|
|
14939
|
+
evalDir,
|
|
14940
|
+
verbose
|
|
14920
14941
|
};
|
|
14921
14942
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
14922
14943
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -14996,7 +15017,8 @@ async function runEvaluation(options) {
|
|
|
14996
15017
|
promptInputs,
|
|
14997
15018
|
primaryProvider,
|
|
14998
15019
|
"agent",
|
|
14999
|
-
"provider_error"
|
|
15020
|
+
"provider_error",
|
|
15021
|
+
verbose
|
|
15000
15022
|
);
|
|
15001
15023
|
results.push(errorResult);
|
|
15002
15024
|
if (onResult) {
|
|
@@ -15069,6 +15091,7 @@ async function runBatchEvaluation(options) {
|
|
|
15069
15091
|
nowFn,
|
|
15070
15092
|
onProgress,
|
|
15071
15093
|
onResult,
|
|
15094
|
+
verbose,
|
|
15072
15095
|
resolveGraderProvider,
|
|
15073
15096
|
agentTimeoutMs,
|
|
15074
15097
|
targetResolver,
|
|
@@ -15156,7 +15179,8 @@ async function runBatchEvaluation(options) {
|
|
|
15156
15179
|
startTime,
|
|
15157
15180
|
endTime,
|
|
15158
15181
|
targetResolver,
|
|
15159
|
-
availableTargets
|
|
15182
|
+
availableTargets,
|
|
15183
|
+
verbose
|
|
15160
15184
|
});
|
|
15161
15185
|
if (providerError) {
|
|
15162
15186
|
result = {
|
|
@@ -15177,7 +15201,8 @@ async function runBatchEvaluation(options) {
|
|
|
15177
15201
|
promptInputs,
|
|
15178
15202
|
provider,
|
|
15179
15203
|
"evaluator",
|
|
15180
|
-
"evaluator_error"
|
|
15204
|
+
"evaluator_error",
|
|
15205
|
+
verbose
|
|
15181
15206
|
);
|
|
15182
15207
|
results.push(errorResult);
|
|
15183
15208
|
if (onResult) {
|
|
@@ -15240,7 +15265,8 @@ async function runEvalCase(options) {
|
|
|
15240
15265
|
suiteWorkspaceFile,
|
|
15241
15266
|
typeRegistry: providedTypeRegistry,
|
|
15242
15267
|
repoManager,
|
|
15243
|
-
evalDir
|
|
15268
|
+
evalDir,
|
|
15269
|
+
verbose
|
|
15244
15270
|
} = options;
|
|
15245
15271
|
const setupDebug = process.env.AGENTV_SETUP_DEBUG === "1";
|
|
15246
15272
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
@@ -15277,7 +15303,8 @@ async function runEvalCase(options) {
|
|
|
15277
15303
|
promptInputs,
|
|
15278
15304
|
provider,
|
|
15279
15305
|
"setup",
|
|
15280
|
-
"template_error"
|
|
15306
|
+
"template_error",
|
|
15307
|
+
verbose
|
|
15281
15308
|
);
|
|
15282
15309
|
}
|
|
15283
15310
|
if (caseWorkspaceFile && workspacePath) {
|
|
@@ -15306,7 +15333,8 @@ async function runEvalCase(options) {
|
|
|
15306
15333
|
promptInputs,
|
|
15307
15334
|
provider,
|
|
15308
15335
|
"repo_setup",
|
|
15309
|
-
"local_path_not_found"
|
|
15336
|
+
"local_path_not_found",
|
|
15337
|
+
verbose
|
|
15310
15338
|
);
|
|
15311
15339
|
}
|
|
15312
15340
|
}
|
|
@@ -15332,7 +15360,8 @@ async function runEvalCase(options) {
|
|
|
15332
15360
|
promptInputs,
|
|
15333
15361
|
provider,
|
|
15334
15362
|
"repo_setup",
|
|
15335
|
-
"clone_error"
|
|
15363
|
+
"clone_error",
|
|
15364
|
+
verbose
|
|
15336
15365
|
);
|
|
15337
15366
|
}
|
|
15338
15367
|
}
|
|
@@ -15358,7 +15387,8 @@ async function runEvalCase(options) {
|
|
|
15358
15387
|
promptInputs,
|
|
15359
15388
|
provider,
|
|
15360
15389
|
"setup",
|
|
15361
|
-
"file_copy_error"
|
|
15390
|
+
"file_copy_error",
|
|
15391
|
+
verbose
|
|
15362
15392
|
);
|
|
15363
15393
|
}
|
|
15364
15394
|
}
|
|
@@ -15403,7 +15433,8 @@ async function runEvalCase(options) {
|
|
|
15403
15433
|
promptInputs,
|
|
15404
15434
|
provider,
|
|
15405
15435
|
"setup",
|
|
15406
|
-
"script_error"
|
|
15436
|
+
"script_error",
|
|
15437
|
+
verbose
|
|
15407
15438
|
);
|
|
15408
15439
|
}
|
|
15409
15440
|
}
|
|
@@ -15434,7 +15465,8 @@ async function runEvalCase(options) {
|
|
|
15434
15465
|
promptInputs,
|
|
15435
15466
|
provider,
|
|
15436
15467
|
"setup",
|
|
15437
|
-
"script_error"
|
|
15468
|
+
"script_error",
|
|
15469
|
+
verbose
|
|
15438
15470
|
);
|
|
15439
15471
|
}
|
|
15440
15472
|
}
|
|
@@ -15478,7 +15510,8 @@ async function runEvalCase(options) {
|
|
|
15478
15510
|
promptInputs,
|
|
15479
15511
|
provider,
|
|
15480
15512
|
"agent",
|
|
15481
|
-
"provider_error"
|
|
15513
|
+
"provider_error",
|
|
15514
|
+
verbose
|
|
15482
15515
|
);
|
|
15483
15516
|
if (workspacePath) {
|
|
15484
15517
|
if (forceCleanup) {
|
|
@@ -15499,7 +15532,8 @@ async function runEvalCase(options) {
|
|
|
15499
15532
|
promptInputs,
|
|
15500
15533
|
provider,
|
|
15501
15534
|
"agent",
|
|
15502
|
-
"provider_error"
|
|
15535
|
+
"provider_error",
|
|
15536
|
+
verbose
|
|
15503
15537
|
);
|
|
15504
15538
|
if (workspacePath) {
|
|
15505
15539
|
if (forceCleanup) {
|
|
@@ -15594,7 +15628,8 @@ async function runEvalCase(options) {
|
|
|
15594
15628
|
targetResolver,
|
|
15595
15629
|
availableTargets,
|
|
15596
15630
|
fileChanges,
|
|
15597
|
-
workspacePath
|
|
15631
|
+
workspacePath,
|
|
15632
|
+
verbose
|
|
15598
15633
|
});
|
|
15599
15634
|
const totalDurationMs = Date.now() - caseStartMs;
|
|
15600
15635
|
const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
|
|
@@ -15649,7 +15684,8 @@ async function runEvalCase(options) {
|
|
|
15649
15684
|
promptInputs,
|
|
15650
15685
|
provider,
|
|
15651
15686
|
"evaluator",
|
|
15652
|
-
"evaluator_error"
|
|
15687
|
+
"evaluator_error",
|
|
15688
|
+
verbose
|
|
15653
15689
|
);
|
|
15654
15690
|
if (workspacePath && !isSharedWorkspace) {
|
|
15655
15691
|
if (forceCleanup || (retainOnFailure ?? "keep") === "cleanup") {
|
|
@@ -15791,7 +15827,7 @@ async function evaluateCandidate(options) {
|
|
|
15791
15827
|
let lmRequest;
|
|
15792
15828
|
if (isAgentProvider(provider)) {
|
|
15793
15829
|
agentRequest = {
|
|
15794
|
-
|
|
15830
|
+
...options.verbose ? { input: promptInputs.question } : {}
|
|
15795
15831
|
};
|
|
15796
15832
|
} else {
|
|
15797
15833
|
if (promptInputs.chatPrompt) {
|
|
@@ -15805,8 +15841,9 @@ async function evaluateCandidate(options) {
|
|
|
15805
15841
|
}
|
|
15806
15842
|
}
|
|
15807
15843
|
const evaluatorRequest = scores ? void 0 : score.evaluatorRawRequest;
|
|
15808
|
-
const
|
|
15809
|
-
|
|
15844
|
+
const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
|
|
15845
|
+
const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
|
|
15846
|
+
...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
|
|
15810
15847
|
...lmRequest ? { lm: lmRequest } : {},
|
|
15811
15848
|
...evaluatorRequest ? { evaluator: evaluatorRequest } : {}
|
|
15812
15849
|
} : void 0;
|
|
@@ -15826,9 +15863,9 @@ async function evaluateCandidate(options) {
|
|
|
15826
15863
|
endTime,
|
|
15827
15864
|
requests,
|
|
15828
15865
|
input,
|
|
15866
|
+
output: output ?? [{ role: "assistant", content: candidate }],
|
|
15829
15867
|
scores,
|
|
15830
15868
|
trace,
|
|
15831
|
-
output: output ?? [{ role: "assistant", content: candidate }],
|
|
15832
15869
|
fileChanges,
|
|
15833
15870
|
executionStatus: classifyQualityStatus(score.score)
|
|
15834
15871
|
};
|
|
@@ -15994,6 +16031,7 @@ async function runEvaluatorList(options) {
|
|
|
15994
16031
|
verdict: score2.verdict,
|
|
15995
16032
|
assertions: score2.assertions,
|
|
15996
16033
|
input: score2.evaluatorRawRequest,
|
|
16034
|
+
target: score2.graderTarget,
|
|
15997
16035
|
details: score2.details,
|
|
15998
16036
|
scores: mapChildResults(score2.scores),
|
|
15999
16037
|
tokenUsage: score2.tokenUsage,
|
|
@@ -16133,13 +16171,13 @@ async function invokeProvider(provider, options) {
|
|
|
16133
16171
|
}
|
|
16134
16172
|
}
|
|
16135
16173
|
}
|
|
16136
|
-
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
|
|
16174
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode, verbose) {
|
|
16137
16175
|
const message = error instanceof Error ? error.message : String(error);
|
|
16138
16176
|
let agentRequest;
|
|
16139
16177
|
let lmRequest;
|
|
16140
16178
|
if (isAgentProvider(provider)) {
|
|
16141
16179
|
agentRequest = {
|
|
16142
|
-
|
|
16180
|
+
...verbose ? { input: promptInputs.question } : {},
|
|
16143
16181
|
error: message
|
|
16144
16182
|
};
|
|
16145
16183
|
} else {
|
|
@@ -16167,10 +16205,10 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
16167
16205
|
conversationId: evalCase.conversation_id,
|
|
16168
16206
|
score: 0,
|
|
16169
16207
|
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
16170
|
-
output: [{ role: "assistant", content: `Error occurred: ${message}` }],
|
|
16171
16208
|
target: targetName,
|
|
16172
16209
|
requests,
|
|
16173
16210
|
input,
|
|
16211
|
+
output: [{ role: "assistant", content: `Error occurred: ${message}` }],
|
|
16174
16212
|
error: message,
|
|
16175
16213
|
executionStatus: "execution_error",
|
|
16176
16214
|
failureStage,
|