@agentv/core 3.9.1 → 3.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-PC5TLJF6.js → chunk-K7JCJIXA.js} +1 -1
- package/dist/chunk-K7JCJIXA.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +1 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +2 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +90 -46
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +91 -47
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-PC5TLJF6.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -8068,11 +8068,7 @@ var CopilotCliProvider = class {
|
|
|
8068
8068
|
}
|
|
8069
8069
|
}
|
|
8070
8070
|
if (sessionUpdate === "usage_update") {
|
|
8071
|
-
|
|
8072
|
-
tokenUsage = { input: update.used, output: tokenUsage.output };
|
|
8073
|
-
} else {
|
|
8074
|
-
tokenUsage = { input: update.used, output: 0 };
|
|
8075
|
-
}
|
|
8071
|
+
tokenUsage = { input: update.used, output: 0 };
|
|
8076
8072
|
if (update.cost && update.cost.currency === "USD") {
|
|
8077
8073
|
costUsd = (costUsd ?? 0) + update.cost.amount;
|
|
8078
8074
|
}
|
|
@@ -8106,21 +8102,32 @@ var CopilotCliProvider = class {
|
|
|
8106
8102
|
sessionId: session.sessionId,
|
|
8107
8103
|
prompt: promptMessages
|
|
8108
8104
|
});
|
|
8105
|
+
let promptResponse;
|
|
8109
8106
|
if (request.signal) {
|
|
8110
8107
|
const abortHandler = () => {
|
|
8111
8108
|
killProcess(agentProcess);
|
|
8112
8109
|
};
|
|
8113
8110
|
request.signal.addEventListener("abort", abortHandler, { once: true });
|
|
8114
8111
|
try {
|
|
8115
|
-
await this.raceWithTimeout(sendPromise, agentProcess);
|
|
8112
|
+
promptResponse = await this.raceWithTimeout(sendPromise, agentProcess);
|
|
8116
8113
|
} finally {
|
|
8117
8114
|
request.signal.removeEventListener("abort", abortHandler);
|
|
8118
8115
|
}
|
|
8119
8116
|
} else {
|
|
8120
|
-
await this.raceWithTimeout(sendPromise, agentProcess);
|
|
8117
|
+
promptResponse = await this.raceWithTimeout(sendPromise, agentProcess);
|
|
8121
8118
|
}
|
|
8122
8119
|
const endTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
8123
8120
|
const durationMs = Date.now() - startMs;
|
|
8121
|
+
const responseUsage = promptResponse.usage;
|
|
8122
|
+
if (responseUsage && responseUsage.totalTokens > 0) {
|
|
8123
|
+
tokenUsage = {
|
|
8124
|
+
input: responseUsage.inputTokens,
|
|
8125
|
+
output: responseUsage.outputTokens,
|
|
8126
|
+
...responseUsage.thoughtTokens != null ? { reasoning: responseUsage.thoughtTokens } : {},
|
|
8127
|
+
...responseUsage.cachedReadTokens != null ? { cached: responseUsage.cachedReadTokens } : {}
|
|
8128
|
+
};
|
|
8129
|
+
request.streamCallbacks?.onLlmCallEnd?.("copilot", tokenUsage);
|
|
8130
|
+
}
|
|
8124
8131
|
const rejectedCalls = completedToolCalls.filter((tc) => {
|
|
8125
8132
|
const out = tc.output;
|
|
8126
8133
|
return out && (out.code === "rejected" || out.code === "denied");
|
|
@@ -8178,8 +8185,7 @@ var CopilotCliProvider = class {
|
|
|
8178
8185
|
async raceWithTimeout(sendPromise, agentProcess) {
|
|
8179
8186
|
const timeoutMs = this.config.timeoutMs;
|
|
8180
8187
|
if (!timeoutMs) {
|
|
8181
|
-
|
|
8182
|
-
return;
|
|
8188
|
+
return sendPromise;
|
|
8183
8189
|
}
|
|
8184
8190
|
let timer;
|
|
8185
8191
|
const timeoutPromise = new Promise((_, reject) => {
|
|
@@ -8190,7 +8196,7 @@ var CopilotCliProvider = class {
|
|
|
8190
8196
|
timer.unref?.();
|
|
8191
8197
|
});
|
|
8192
8198
|
try {
|
|
8193
|
-
await Promise.race([sendPromise, timeoutPromise]);
|
|
8199
|
+
return await Promise.race([sendPromise, timeoutPromise]);
|
|
8194
8200
|
} finally {
|
|
8195
8201
|
if (timer) clearTimeout(timer);
|
|
8196
8202
|
}
|
|
@@ -12440,7 +12446,7 @@ async function readTargetDefinitions(filePath) {
|
|
|
12440
12446
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
12441
12447
|
}
|
|
12442
12448
|
const raw = await (0, import_promises24.readFile)(absolutePath, "utf8");
|
|
12443
|
-
const parsed = (0, import_yaml6.parse)(raw);
|
|
12449
|
+
const parsed = interpolateEnv((0, import_yaml6.parse)(raw), process.env);
|
|
12444
12450
|
if (!isRecord(parsed)) {
|
|
12445
12451
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
12446
12452
|
}
|
|
@@ -13357,8 +13363,7 @@ ${context2.fileChanges}`;
|
|
|
13357
13363
|
}
|
|
13358
13364
|
const evaluatorRawRequest = {
|
|
13359
13365
|
userPrompt,
|
|
13360
|
-
systemPrompt
|
|
13361
|
-
target: graderProvider.targetName
|
|
13366
|
+
systemPrompt
|
|
13362
13367
|
};
|
|
13363
13368
|
try {
|
|
13364
13369
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -13376,6 +13381,7 @@ ${context2.fileChanges}`;
|
|
|
13376
13381
|
assertions,
|
|
13377
13382
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
13378
13383
|
evaluatorRawRequest,
|
|
13384
|
+
graderTarget: graderProvider.targetName,
|
|
13379
13385
|
tokenUsage
|
|
13380
13386
|
};
|
|
13381
13387
|
} catch (e) {
|
|
@@ -13387,7 +13393,8 @@ ${context2.fileChanges}`;
|
|
|
13387
13393
|
verdict: "skip",
|
|
13388
13394
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
13389
13395
|
expectedAspectCount: 1,
|
|
13390
|
-
evaluatorRawRequest
|
|
13396
|
+
evaluatorRawRequest,
|
|
13397
|
+
graderTarget: graderProvider.targetName
|
|
13391
13398
|
};
|
|
13392
13399
|
}
|
|
13393
13400
|
}
|
|
@@ -13405,8 +13412,7 @@ ${context2.fileChanges}`;
|
|
|
13405
13412
|
const systemPrompt = buildRubricOutputSchema();
|
|
13406
13413
|
const evaluatorRawRequest = {
|
|
13407
13414
|
userPrompt: prompt,
|
|
13408
|
-
systemPrompt
|
|
13409
|
-
target: graderProvider.targetName
|
|
13415
|
+
systemPrompt
|
|
13410
13416
|
};
|
|
13411
13417
|
try {
|
|
13412
13418
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -13423,6 +13429,7 @@ ${context2.fileChanges}`;
|
|
|
13423
13429
|
assertions,
|
|
13424
13430
|
expectedAspectCount: rubrics.length,
|
|
13425
13431
|
evaluatorRawRequest,
|
|
13432
|
+
graderTarget: graderProvider.targetName,
|
|
13426
13433
|
tokenUsage
|
|
13427
13434
|
};
|
|
13428
13435
|
} catch (e) {
|
|
@@ -13434,7 +13441,8 @@ ${context2.fileChanges}`;
|
|
|
13434
13441
|
verdict: "skip",
|
|
13435
13442
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
13436
13443
|
expectedAspectCount: rubrics.length,
|
|
13437
|
-
evaluatorRawRequest
|
|
13444
|
+
evaluatorRawRequest,
|
|
13445
|
+
graderTarget: graderProvider.targetName
|
|
13438
13446
|
};
|
|
13439
13447
|
}
|
|
13440
13448
|
}
|
|
@@ -13447,8 +13455,7 @@ ${context2.fileChanges}`;
|
|
|
13447
13455
|
const systemPrompt = buildScoreRangeOutputSchema();
|
|
13448
13456
|
const evaluatorRawRequest = {
|
|
13449
13457
|
userPrompt: prompt,
|
|
13450
|
-
systemPrompt
|
|
13451
|
-
target: graderProvider.targetName
|
|
13458
|
+
systemPrompt
|
|
13452
13459
|
};
|
|
13453
13460
|
try {
|
|
13454
13461
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -13465,6 +13472,7 @@ ${context2.fileChanges}`;
|
|
|
13465
13472
|
assertions,
|
|
13466
13473
|
expectedAspectCount: rubrics.length,
|
|
13467
13474
|
evaluatorRawRequest,
|
|
13475
|
+
graderTarget: graderProvider.targetName,
|
|
13468
13476
|
details,
|
|
13469
13477
|
tokenUsage
|
|
13470
13478
|
};
|
|
@@ -13477,7 +13485,8 @@ ${context2.fileChanges}`;
|
|
|
13477
13485
|
verdict: "skip",
|
|
13478
13486
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
13479
13487
|
expectedAspectCount: rubrics.length,
|
|
13480
|
-
evaluatorRawRequest
|
|
13488
|
+
evaluatorRawRequest,
|
|
13489
|
+
graderTarget: graderProvider.targetName
|
|
13481
13490
|
};
|
|
13482
13491
|
}
|
|
13483
13492
|
}
|
|
@@ -13509,7 +13518,6 @@ ${context2.fileChanges}`;
|
|
|
13509
13518
|
mode: "built-in",
|
|
13510
13519
|
systemPrompt,
|
|
13511
13520
|
userPrompt,
|
|
13512
|
-
target: graderProvider.targetName,
|
|
13513
13521
|
maxSteps: this.maxSteps
|
|
13514
13522
|
};
|
|
13515
13523
|
try {
|
|
@@ -13527,7 +13535,13 @@ ${context2.fileChanges}`;
|
|
|
13527
13535
|
steps: steps.length,
|
|
13528
13536
|
tool_calls: toolCallCount
|
|
13529
13537
|
};
|
|
13530
|
-
return this.parseAgentResult(
|
|
13538
|
+
return this.parseAgentResult(
|
|
13539
|
+
text,
|
|
13540
|
+
rubrics,
|
|
13541
|
+
evaluatorRawRequest,
|
|
13542
|
+
details,
|
|
13543
|
+
graderProvider.targetName
|
|
13544
|
+
);
|
|
13531
13545
|
} catch (error) {
|
|
13532
13546
|
const message = error instanceof Error ? error.message : String(error);
|
|
13533
13547
|
return {
|
|
@@ -13536,6 +13550,7 @@ ${context2.fileChanges}`;
|
|
|
13536
13550
|
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
13537
13551
|
expectedAspectCount: 1,
|
|
13538
13552
|
evaluatorRawRequest,
|
|
13553
|
+
graderTarget: graderProvider.targetName,
|
|
13539
13554
|
details: { mode: "built-in", error: message }
|
|
13540
13555
|
};
|
|
13541
13556
|
}
|
|
@@ -13588,6 +13603,7 @@ ${context2.fileChanges}`;
|
|
|
13588
13603
|
],
|
|
13589
13604
|
expectedAspectCount: 1,
|
|
13590
13605
|
evaluatorRawRequest,
|
|
13606
|
+
graderTarget: provider.targetName,
|
|
13591
13607
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
13592
13608
|
};
|
|
13593
13609
|
}
|
|
@@ -13597,7 +13613,13 @@ ${context2.fileChanges}`;
|
|
|
13597
13613
|
mode: modeLabel,
|
|
13598
13614
|
grader_target: provider.targetName
|
|
13599
13615
|
};
|
|
13600
|
-
return this.parseAgentResult(
|
|
13616
|
+
return this.parseAgentResult(
|
|
13617
|
+
assistantContent,
|
|
13618
|
+
rubrics,
|
|
13619
|
+
evaluatorRawRequest,
|
|
13620
|
+
details,
|
|
13621
|
+
provider.targetName
|
|
13622
|
+
);
|
|
13601
13623
|
} catch (error) {
|
|
13602
13624
|
const message = error instanceof Error ? error.message : String(error);
|
|
13603
13625
|
return {
|
|
@@ -13608,6 +13630,7 @@ ${context2.fileChanges}`;
|
|
|
13608
13630
|
],
|
|
13609
13631
|
expectedAspectCount: 1,
|
|
13610
13632
|
evaluatorRawRequest,
|
|
13633
|
+
graderTarget: provider.targetName,
|
|
13611
13634
|
details: {
|
|
13612
13635
|
mode: modeLabel,
|
|
13613
13636
|
grader_target: provider.targetName,
|
|
@@ -13752,7 +13775,7 @@ ${outputSchema}`;
|
|
|
13752
13775
|
* Parse the agent's response text into an EvaluationScore.
|
|
13753
13776
|
* Supports both freeform and rubric modes.
|
|
13754
13777
|
*/
|
|
13755
|
-
parseAgentResult(text, rubrics, evaluatorRawRequest, details) {
|
|
13778
|
+
parseAgentResult(text, rubrics, evaluatorRawRequest, details, graderTarget) {
|
|
13756
13779
|
try {
|
|
13757
13780
|
const parsed = parseJsonFromText(text);
|
|
13758
13781
|
if (rubrics && rubrics.length > 0) {
|
|
@@ -13764,6 +13787,7 @@ ${outputSchema}`;
|
|
|
13764
13787
|
assertions: assertions2,
|
|
13765
13788
|
expectedAspectCount: rubrics.length,
|
|
13766
13789
|
evaluatorRawRequest,
|
|
13790
|
+
graderTarget,
|
|
13767
13791
|
details
|
|
13768
13792
|
};
|
|
13769
13793
|
}
|
|
@@ -13776,6 +13800,7 @@ ${outputSchema}`;
|
|
|
13776
13800
|
assertions,
|
|
13777
13801
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
13778
13802
|
evaluatorRawRequest,
|
|
13803
|
+
graderTarget,
|
|
13779
13804
|
details
|
|
13780
13805
|
};
|
|
13781
13806
|
} catch {
|
|
@@ -13790,6 +13815,7 @@ ${outputSchema}`;
|
|
|
13790
13815
|
],
|
|
13791
13816
|
expectedAspectCount: 1,
|
|
13792
13817
|
evaluatorRawRequest,
|
|
13818
|
+
graderTarget,
|
|
13793
13819
|
details
|
|
13794
13820
|
};
|
|
13795
13821
|
}
|
|
@@ -18101,7 +18127,8 @@ async function runEvaluation(options) {
|
|
|
18101
18127
|
streamCallbacks,
|
|
18102
18128
|
typeRegistry,
|
|
18103
18129
|
repoManager,
|
|
18104
|
-
evalDir
|
|
18130
|
+
evalDir,
|
|
18131
|
+
verbose
|
|
18105
18132
|
};
|
|
18106
18133
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
18107
18134
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -18181,7 +18208,8 @@ async function runEvaluation(options) {
|
|
|
18181
18208
|
promptInputs,
|
|
18182
18209
|
primaryProvider,
|
|
18183
18210
|
"agent",
|
|
18184
|
-
"provider_error"
|
|
18211
|
+
"provider_error",
|
|
18212
|
+
verbose
|
|
18185
18213
|
);
|
|
18186
18214
|
results.push(errorResult);
|
|
18187
18215
|
if (onResult) {
|
|
@@ -18254,6 +18282,7 @@ async function runBatchEvaluation(options) {
|
|
|
18254
18282
|
nowFn,
|
|
18255
18283
|
onProgress,
|
|
18256
18284
|
onResult,
|
|
18285
|
+
verbose,
|
|
18257
18286
|
resolveGraderProvider,
|
|
18258
18287
|
agentTimeoutMs,
|
|
18259
18288
|
targetResolver,
|
|
@@ -18341,7 +18370,8 @@ async function runBatchEvaluation(options) {
|
|
|
18341
18370
|
startTime,
|
|
18342
18371
|
endTime,
|
|
18343
18372
|
targetResolver,
|
|
18344
|
-
availableTargets
|
|
18373
|
+
availableTargets,
|
|
18374
|
+
verbose
|
|
18345
18375
|
});
|
|
18346
18376
|
if (providerError) {
|
|
18347
18377
|
result = {
|
|
@@ -18362,7 +18392,8 @@ async function runBatchEvaluation(options) {
|
|
|
18362
18392
|
promptInputs,
|
|
18363
18393
|
provider,
|
|
18364
18394
|
"evaluator",
|
|
18365
|
-
"evaluator_error"
|
|
18395
|
+
"evaluator_error",
|
|
18396
|
+
verbose
|
|
18366
18397
|
);
|
|
18367
18398
|
results.push(errorResult);
|
|
18368
18399
|
if (onResult) {
|
|
@@ -18425,7 +18456,8 @@ async function runEvalCase(options) {
|
|
|
18425
18456
|
suiteWorkspaceFile,
|
|
18426
18457
|
typeRegistry: providedTypeRegistry,
|
|
18427
18458
|
repoManager,
|
|
18428
|
-
evalDir
|
|
18459
|
+
evalDir,
|
|
18460
|
+
verbose
|
|
18429
18461
|
} = options;
|
|
18430
18462
|
const setupDebug = process.env.AGENTV_SETUP_DEBUG === "1";
|
|
18431
18463
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
@@ -18462,7 +18494,8 @@ async function runEvalCase(options) {
|
|
|
18462
18494
|
promptInputs,
|
|
18463
18495
|
provider,
|
|
18464
18496
|
"setup",
|
|
18465
|
-
"template_error"
|
|
18497
|
+
"template_error",
|
|
18498
|
+
verbose
|
|
18466
18499
|
);
|
|
18467
18500
|
}
|
|
18468
18501
|
if (caseWorkspaceFile && workspacePath) {
|
|
@@ -18491,7 +18524,8 @@ async function runEvalCase(options) {
|
|
|
18491
18524
|
promptInputs,
|
|
18492
18525
|
provider,
|
|
18493
18526
|
"repo_setup",
|
|
18494
|
-
"local_path_not_found"
|
|
18527
|
+
"local_path_not_found",
|
|
18528
|
+
verbose
|
|
18495
18529
|
);
|
|
18496
18530
|
}
|
|
18497
18531
|
}
|
|
@@ -18517,7 +18551,8 @@ async function runEvalCase(options) {
|
|
|
18517
18551
|
promptInputs,
|
|
18518
18552
|
provider,
|
|
18519
18553
|
"repo_setup",
|
|
18520
|
-
"clone_error"
|
|
18554
|
+
"clone_error",
|
|
18555
|
+
verbose
|
|
18521
18556
|
);
|
|
18522
18557
|
}
|
|
18523
18558
|
}
|
|
@@ -18543,7 +18578,8 @@ async function runEvalCase(options) {
|
|
|
18543
18578
|
promptInputs,
|
|
18544
18579
|
provider,
|
|
18545
18580
|
"setup",
|
|
18546
|
-
"file_copy_error"
|
|
18581
|
+
"file_copy_error",
|
|
18582
|
+
verbose
|
|
18547
18583
|
);
|
|
18548
18584
|
}
|
|
18549
18585
|
}
|
|
@@ -18588,7 +18624,8 @@ async function runEvalCase(options) {
|
|
|
18588
18624
|
promptInputs,
|
|
18589
18625
|
provider,
|
|
18590
18626
|
"setup",
|
|
18591
|
-
"script_error"
|
|
18627
|
+
"script_error",
|
|
18628
|
+
verbose
|
|
18592
18629
|
);
|
|
18593
18630
|
}
|
|
18594
18631
|
}
|
|
@@ -18619,7 +18656,8 @@ async function runEvalCase(options) {
|
|
|
18619
18656
|
promptInputs,
|
|
18620
18657
|
provider,
|
|
18621
18658
|
"setup",
|
|
18622
|
-
"script_error"
|
|
18659
|
+
"script_error",
|
|
18660
|
+
verbose
|
|
18623
18661
|
);
|
|
18624
18662
|
}
|
|
18625
18663
|
}
|
|
@@ -18663,7 +18701,8 @@ async function runEvalCase(options) {
|
|
|
18663
18701
|
promptInputs,
|
|
18664
18702
|
provider,
|
|
18665
18703
|
"agent",
|
|
18666
|
-
"provider_error"
|
|
18704
|
+
"provider_error",
|
|
18705
|
+
verbose
|
|
18667
18706
|
);
|
|
18668
18707
|
if (workspacePath) {
|
|
18669
18708
|
if (forceCleanup) {
|
|
@@ -18684,7 +18723,8 @@ async function runEvalCase(options) {
|
|
|
18684
18723
|
promptInputs,
|
|
18685
18724
|
provider,
|
|
18686
18725
|
"agent",
|
|
18687
|
-
"provider_error"
|
|
18726
|
+
"provider_error",
|
|
18727
|
+
verbose
|
|
18688
18728
|
);
|
|
18689
18729
|
if (workspacePath) {
|
|
18690
18730
|
if (forceCleanup) {
|
|
@@ -18779,7 +18819,8 @@ async function runEvalCase(options) {
|
|
|
18779
18819
|
targetResolver,
|
|
18780
18820
|
availableTargets,
|
|
18781
18821
|
fileChanges,
|
|
18782
|
-
workspacePath
|
|
18822
|
+
workspacePath,
|
|
18823
|
+
verbose
|
|
18783
18824
|
});
|
|
18784
18825
|
const totalDurationMs = Date.now() - caseStartMs;
|
|
18785
18826
|
const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
|
|
@@ -18834,7 +18875,8 @@ async function runEvalCase(options) {
|
|
|
18834
18875
|
promptInputs,
|
|
18835
18876
|
provider,
|
|
18836
18877
|
"evaluator",
|
|
18837
|
-
"evaluator_error"
|
|
18878
|
+
"evaluator_error",
|
|
18879
|
+
verbose
|
|
18838
18880
|
);
|
|
18839
18881
|
if (workspacePath && !isSharedWorkspace) {
|
|
18840
18882
|
if (forceCleanup || (retainOnFailure ?? "keep") === "cleanup") {
|
|
@@ -18976,7 +19018,7 @@ async function evaluateCandidate(options) {
|
|
|
18976
19018
|
let lmRequest;
|
|
18977
19019
|
if (isAgentProvider(provider)) {
|
|
18978
19020
|
agentRequest = {
|
|
18979
|
-
|
|
19021
|
+
...options.verbose ? { input: promptInputs.question } : {}
|
|
18980
19022
|
};
|
|
18981
19023
|
} else {
|
|
18982
19024
|
if (promptInputs.chatPrompt) {
|
|
@@ -18990,8 +19032,9 @@ async function evaluateCandidate(options) {
|
|
|
18990
19032
|
}
|
|
18991
19033
|
}
|
|
18992
19034
|
const evaluatorRequest = scores ? void 0 : score.evaluatorRawRequest;
|
|
18993
|
-
const
|
|
18994
|
-
|
|
19035
|
+
const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
|
|
19036
|
+
const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
|
|
19037
|
+
...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
|
|
18995
19038
|
...lmRequest ? { lm: lmRequest } : {},
|
|
18996
19039
|
...evaluatorRequest ? { evaluator: evaluatorRequest } : {}
|
|
18997
19040
|
} : void 0;
|
|
@@ -19011,9 +19054,9 @@ async function evaluateCandidate(options) {
|
|
|
19011
19054
|
endTime,
|
|
19012
19055
|
requests,
|
|
19013
19056
|
input,
|
|
19057
|
+
output: output ?? [{ role: "assistant", content: candidate }],
|
|
19014
19058
|
scores,
|
|
19015
19059
|
trace: trace2,
|
|
19016
|
-
output: output ?? [{ role: "assistant", content: candidate }],
|
|
19017
19060
|
fileChanges,
|
|
19018
19061
|
executionStatus: classifyQualityStatus(score.score)
|
|
19019
19062
|
};
|
|
@@ -19179,6 +19222,7 @@ async function runEvaluatorList(options) {
|
|
|
19179
19222
|
verdict: score2.verdict,
|
|
19180
19223
|
assertions: score2.assertions,
|
|
19181
19224
|
input: score2.evaluatorRawRequest,
|
|
19225
|
+
target: score2.graderTarget,
|
|
19182
19226
|
details: score2.details,
|
|
19183
19227
|
scores: mapChildResults(score2.scores),
|
|
19184
19228
|
tokenUsage: score2.tokenUsage,
|
|
@@ -19318,13 +19362,13 @@ async function invokeProvider(provider, options) {
|
|
|
19318
19362
|
}
|
|
19319
19363
|
}
|
|
19320
19364
|
}
|
|
19321
|
-
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
|
|
19365
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode, verbose) {
|
|
19322
19366
|
const message = error instanceof Error ? error.message : String(error);
|
|
19323
19367
|
let agentRequest;
|
|
19324
19368
|
let lmRequest;
|
|
19325
19369
|
if (isAgentProvider(provider)) {
|
|
19326
19370
|
agentRequest = {
|
|
19327
|
-
|
|
19371
|
+
...verbose ? { input: promptInputs.question } : {},
|
|
19328
19372
|
error: message
|
|
19329
19373
|
};
|
|
19330
19374
|
} else {
|
|
@@ -19352,10 +19396,10 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
19352
19396
|
conversationId: evalCase.conversation_id,
|
|
19353
19397
|
score: 0,
|
|
19354
19398
|
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
19355
|
-
output: [{ role: "assistant", content: `Error occurred: ${message}` }],
|
|
19356
19399
|
target: targetName,
|
|
19357
19400
|
requests,
|
|
19358
19401
|
input,
|
|
19402
|
+
output: [{ role: "assistant", content: `Error occurred: ${message}` }],
|
|
19359
19403
|
error: message,
|
|
19360
19404
|
executionStatus: "execution_error",
|
|
19361
19405
|
failureStage,
|