@agentv/core 3.9.1 → 3.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-PC5TLJF6.js → chunk-K7JCJIXA.js} +1 -1
- package/dist/chunk-K7JCJIXA.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +1 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +2 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +90 -46
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +91 -47
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-PC5TLJF6.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1196,6 +1196,8 @@ interface EvaluatorResult {
|
|
|
1196
1196
|
readonly assertions: readonly AssertionEntry[];
|
|
1197
1197
|
readonly rawRequest?: JsonObject;
|
|
1198
1198
|
readonly input?: JsonObject;
|
|
1199
|
+
/** Target name used for grading (e.g., the LLM provider name). */
|
|
1200
|
+
readonly target?: string;
|
|
1199
1201
|
readonly scores?: readonly EvaluatorResult[];
|
|
1200
1202
|
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
|
|
1201
1203
|
readonly details?: JsonObject;
|
|
@@ -2057,6 +2059,8 @@ interface EvaluationScore {
|
|
|
2057
2059
|
readonly details?: JsonObject;
|
|
2058
2060
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
2059
2061
|
readonly tokenUsage?: TokenUsage;
|
|
2062
|
+
/** Target name used for grading (e.g., the LLM provider). */
|
|
2063
|
+
readonly graderTarget?: string;
|
|
2060
2064
|
}
|
|
2061
2065
|
interface ChildEvaluatorResult {
|
|
2062
2066
|
readonly name: string;
|
|
@@ -2660,6 +2664,8 @@ interface RunEvalCaseOptions {
|
|
|
2660
2664
|
readonly repoManager?: RepoManager;
|
|
2661
2665
|
/** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
|
|
2662
2666
|
readonly evalDir?: string;
|
|
2667
|
+
/** Include verbose request details in results (e.g. agent input text) */
|
|
2668
|
+
readonly verbose?: boolean;
|
|
2663
2669
|
}
|
|
2664
2670
|
interface ProgressEvent {
|
|
2665
2671
|
readonly workerId: number;
|
package/dist/index.d.ts
CHANGED
|
@@ -1196,6 +1196,8 @@ interface EvaluatorResult {
|
|
|
1196
1196
|
readonly assertions: readonly AssertionEntry[];
|
|
1197
1197
|
readonly rawRequest?: JsonObject;
|
|
1198
1198
|
readonly input?: JsonObject;
|
|
1199
|
+
/** Target name used for grading (e.g., the LLM provider name). */
|
|
1200
|
+
readonly target?: string;
|
|
1199
1201
|
readonly scores?: readonly EvaluatorResult[];
|
|
1200
1202
|
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
|
|
1201
1203
|
readonly details?: JsonObject;
|
|
@@ -2057,6 +2059,8 @@ interface EvaluationScore {
|
|
|
2057
2059
|
readonly details?: JsonObject;
|
|
2058
2060
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
2059
2061
|
readonly tokenUsage?: TokenUsage;
|
|
2062
|
+
/** Target name used for grading (e.g., the LLM provider). */
|
|
2063
|
+
readonly graderTarget?: string;
|
|
2060
2064
|
}
|
|
2061
2065
|
interface ChildEvaluatorResult {
|
|
2062
2066
|
readonly name: string;
|
|
@@ -2660,6 +2664,8 @@ interface RunEvalCaseOptions {
|
|
|
2660
2664
|
readonly repoManager?: RepoManager;
|
|
2661
2665
|
/** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
|
|
2662
2666
|
readonly evalDir?: string;
|
|
2667
|
+
/** Include verbose request details in results (e.g. agent input text) */
|
|
2668
|
+
readonly verbose?: boolean;
|
|
2663
2669
|
}
|
|
2664
2670
|
interface ProgressEvent {
|
|
2665
2671
|
readonly workerId: number;
|
package/dist/index.js
CHANGED
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
readTextFile,
|
|
20
20
|
resolveFileReference,
|
|
21
21
|
resolveTargetDefinition
|
|
22
|
-
} from "./chunk-PC5TLJF6.js";
|
|
22
|
+
} from "./chunk-K7JCJIXA.js";
|
|
23
23
|
import {
|
|
24
24
|
AgentvProvider
|
|
25
25
|
} from "./chunk-W5YDZWT4.js";
|
|
@@ -6112,11 +6112,7 @@ var CopilotCliProvider = class {
|
|
|
6112
6112
|
}
|
|
6113
6113
|
}
|
|
6114
6114
|
if (sessionUpdate === "usage_update") {
|
|
6115
|
-
|
|
6116
|
-
tokenUsage = { input: update.used, output: tokenUsage.output };
|
|
6117
|
-
} else {
|
|
6118
|
-
tokenUsage = { input: update.used, output: 0 };
|
|
6119
|
-
}
|
|
6115
|
+
tokenUsage = { input: update.used, output: 0 };
|
|
6120
6116
|
if (update.cost && update.cost.currency === "USD") {
|
|
6121
6117
|
costUsd = (costUsd ?? 0) + update.cost.amount;
|
|
6122
6118
|
}
|
|
@@ -6150,21 +6146,32 @@ var CopilotCliProvider = class {
|
|
|
6150
6146
|
sessionId: session.sessionId,
|
|
6151
6147
|
prompt: promptMessages
|
|
6152
6148
|
});
|
|
6149
|
+
let promptResponse;
|
|
6153
6150
|
if (request.signal) {
|
|
6154
6151
|
const abortHandler = () => {
|
|
6155
6152
|
killProcess(agentProcess);
|
|
6156
6153
|
};
|
|
6157
6154
|
request.signal.addEventListener("abort", abortHandler, { once: true });
|
|
6158
6155
|
try {
|
|
6159
|
-
await this.raceWithTimeout(sendPromise, agentProcess);
|
|
6156
|
+
promptResponse = await this.raceWithTimeout(sendPromise, agentProcess);
|
|
6160
6157
|
} finally {
|
|
6161
6158
|
request.signal.removeEventListener("abort", abortHandler);
|
|
6162
6159
|
}
|
|
6163
6160
|
} else {
|
|
6164
|
-
await this.raceWithTimeout(sendPromise, agentProcess);
|
|
6161
|
+
promptResponse = await this.raceWithTimeout(sendPromise, agentProcess);
|
|
6165
6162
|
}
|
|
6166
6163
|
const endTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
6167
6164
|
const durationMs = Date.now() - startMs;
|
|
6165
|
+
const responseUsage = promptResponse.usage;
|
|
6166
|
+
if (responseUsage && responseUsage.totalTokens > 0) {
|
|
6167
|
+
tokenUsage = {
|
|
6168
|
+
input: responseUsage.inputTokens,
|
|
6169
|
+
output: responseUsage.outputTokens,
|
|
6170
|
+
...responseUsage.thoughtTokens != null ? { reasoning: responseUsage.thoughtTokens } : {},
|
|
6171
|
+
...responseUsage.cachedReadTokens != null ? { cached: responseUsage.cachedReadTokens } : {}
|
|
6172
|
+
};
|
|
6173
|
+
request.streamCallbacks?.onLlmCallEnd?.("copilot", tokenUsage);
|
|
6174
|
+
}
|
|
6168
6175
|
const rejectedCalls = completedToolCalls.filter((tc) => {
|
|
6169
6176
|
const out = tc.output;
|
|
6170
6177
|
return out && (out.code === "rejected" || out.code === "denied");
|
|
@@ -6222,8 +6229,7 @@ var CopilotCliProvider = class {
|
|
|
6222
6229
|
async raceWithTimeout(sendPromise, agentProcess) {
|
|
6223
6230
|
const timeoutMs = this.config.timeoutMs;
|
|
6224
6231
|
if (!timeoutMs) {
|
|
6225
|
-
|
|
6226
|
-
return;
|
|
6232
|
+
return sendPromise;
|
|
6227
6233
|
}
|
|
6228
6234
|
let timer;
|
|
6229
6235
|
const timeoutPromise = new Promise((_, reject) => {
|
|
@@ -6234,7 +6240,7 @@ var CopilotCliProvider = class {
|
|
|
6234
6240
|
timer.unref?.();
|
|
6235
6241
|
});
|
|
6236
6242
|
try {
|
|
6237
|
-
await Promise.race([sendPromise, timeoutPromise]);
|
|
6243
|
+
return await Promise.race([sendPromise, timeoutPromise]);
|
|
6238
6244
|
} finally {
|
|
6239
6245
|
if (timer) clearTimeout(timer);
|
|
6240
6246
|
}
|
|
@@ -9287,7 +9293,7 @@ async function readTargetDefinitions(filePath) {
|
|
|
9287
9293
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
9288
9294
|
}
|
|
9289
9295
|
const raw = await readFile9(absolutePath, "utf8");
|
|
9290
|
-
const parsed = parse4(raw);
|
|
9296
|
+
const parsed = interpolateEnv(parse4(raw), process.env);
|
|
9291
9297
|
if (!isRecord(parsed)) {
|
|
9292
9298
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
9293
9299
|
}
|
|
@@ -10172,8 +10178,7 @@ ${context.fileChanges}`;
|
|
|
10172
10178
|
}
|
|
10173
10179
|
const evaluatorRawRequest = {
|
|
10174
10180
|
userPrompt,
|
|
10175
|
-
systemPrompt,
|
|
10176
|
-
target: graderProvider.targetName
|
|
10181
|
+
systemPrompt
|
|
10177
10182
|
};
|
|
10178
10183
|
try {
|
|
10179
10184
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -10191,6 +10196,7 @@ ${context.fileChanges}`;
|
|
|
10191
10196
|
assertions,
|
|
10192
10197
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
10193
10198
|
evaluatorRawRequest,
|
|
10199
|
+
graderTarget: graderProvider.targetName,
|
|
10194
10200
|
tokenUsage
|
|
10195
10201
|
};
|
|
10196
10202
|
} catch (e) {
|
|
@@ -10202,7 +10208,8 @@ ${context.fileChanges}`;
|
|
|
10202
10208
|
verdict: "skip",
|
|
10203
10209
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
10204
10210
|
expectedAspectCount: 1,
|
|
10205
|
-
evaluatorRawRequest
|
|
10211
|
+
evaluatorRawRequest,
|
|
10212
|
+
graderTarget: graderProvider.targetName
|
|
10206
10213
|
};
|
|
10207
10214
|
}
|
|
10208
10215
|
}
|
|
@@ -10220,8 +10227,7 @@ ${context.fileChanges}`;
|
|
|
10220
10227
|
const systemPrompt = buildRubricOutputSchema();
|
|
10221
10228
|
const evaluatorRawRequest = {
|
|
10222
10229
|
userPrompt: prompt,
|
|
10223
|
-
systemPrompt,
|
|
10224
|
-
target: graderProvider.targetName
|
|
10230
|
+
systemPrompt
|
|
10225
10231
|
};
|
|
10226
10232
|
try {
|
|
10227
10233
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -10238,6 +10244,7 @@ ${context.fileChanges}`;
|
|
|
10238
10244
|
assertions,
|
|
10239
10245
|
expectedAspectCount: rubrics.length,
|
|
10240
10246
|
evaluatorRawRequest,
|
|
10247
|
+
graderTarget: graderProvider.targetName,
|
|
10241
10248
|
tokenUsage
|
|
10242
10249
|
};
|
|
10243
10250
|
} catch (e) {
|
|
@@ -10249,7 +10256,8 @@ ${context.fileChanges}`;
|
|
|
10249
10256
|
verdict: "skip",
|
|
10250
10257
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
10251
10258
|
expectedAspectCount: rubrics.length,
|
|
10252
|
-
evaluatorRawRequest
|
|
10259
|
+
evaluatorRawRequest,
|
|
10260
|
+
graderTarget: graderProvider.targetName
|
|
10253
10261
|
};
|
|
10254
10262
|
}
|
|
10255
10263
|
}
|
|
@@ -10262,8 +10270,7 @@ ${context.fileChanges}`;
|
|
|
10262
10270
|
const systemPrompt = buildScoreRangeOutputSchema();
|
|
10263
10271
|
const evaluatorRawRequest = {
|
|
10264
10272
|
userPrompt: prompt,
|
|
10265
|
-
systemPrompt,
|
|
10266
|
-
target: graderProvider.targetName
|
|
10273
|
+
systemPrompt
|
|
10267
10274
|
};
|
|
10268
10275
|
try {
|
|
10269
10276
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -10280,6 +10287,7 @@ ${context.fileChanges}`;
|
|
|
10280
10287
|
assertions,
|
|
10281
10288
|
expectedAspectCount: rubrics.length,
|
|
10282
10289
|
evaluatorRawRequest,
|
|
10290
|
+
graderTarget: graderProvider.targetName,
|
|
10283
10291
|
details,
|
|
10284
10292
|
tokenUsage
|
|
10285
10293
|
};
|
|
@@ -10292,7 +10300,8 @@ ${context.fileChanges}`;
|
|
|
10292
10300
|
verdict: "skip",
|
|
10293
10301
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
10294
10302
|
expectedAspectCount: rubrics.length,
|
|
10295
|
-
evaluatorRawRequest
|
|
10303
|
+
evaluatorRawRequest,
|
|
10304
|
+
graderTarget: graderProvider.targetName
|
|
10296
10305
|
};
|
|
10297
10306
|
}
|
|
10298
10307
|
}
|
|
@@ -10324,7 +10333,6 @@ ${context.fileChanges}`;
|
|
|
10324
10333
|
mode: "built-in",
|
|
10325
10334
|
systemPrompt,
|
|
10326
10335
|
userPrompt,
|
|
10327
|
-
target: graderProvider.targetName,
|
|
10328
10336
|
maxSteps: this.maxSteps
|
|
10329
10337
|
};
|
|
10330
10338
|
try {
|
|
@@ -10342,7 +10350,13 @@ ${context.fileChanges}`;
|
|
|
10342
10350
|
steps: steps.length,
|
|
10343
10351
|
tool_calls: toolCallCount
|
|
10344
10352
|
};
|
|
10345
|
-
return this.parseAgentResult(text, rubrics, evaluatorRawRequest, details);
|
|
10353
|
+
return this.parseAgentResult(
|
|
10354
|
+
text,
|
|
10355
|
+
rubrics,
|
|
10356
|
+
evaluatorRawRequest,
|
|
10357
|
+
details,
|
|
10358
|
+
graderProvider.targetName
|
|
10359
|
+
);
|
|
10346
10360
|
} catch (error) {
|
|
10347
10361
|
const message = error instanceof Error ? error.message : String(error);
|
|
10348
10362
|
return {
|
|
@@ -10351,6 +10365,7 @@ ${context.fileChanges}`;
|
|
|
10351
10365
|
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
10352
10366
|
expectedAspectCount: 1,
|
|
10353
10367
|
evaluatorRawRequest,
|
|
10368
|
+
graderTarget: graderProvider.targetName,
|
|
10354
10369
|
details: { mode: "built-in", error: message }
|
|
10355
10370
|
};
|
|
10356
10371
|
}
|
|
@@ -10403,6 +10418,7 @@ ${context.fileChanges}`;
|
|
|
10403
10418
|
],
|
|
10404
10419
|
expectedAspectCount: 1,
|
|
10405
10420
|
evaluatorRawRequest,
|
|
10421
|
+
graderTarget: provider.targetName,
|
|
10406
10422
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
10407
10423
|
};
|
|
10408
10424
|
}
|
|
@@ -10412,7 +10428,13 @@ ${context.fileChanges}`;
|
|
|
10412
10428
|
mode: modeLabel,
|
|
10413
10429
|
grader_target: provider.targetName
|
|
10414
10430
|
};
|
|
10415
|
-
return this.parseAgentResult(assistantContent, rubrics, evaluatorRawRequest, details);
|
|
10431
|
+
return this.parseAgentResult(
|
|
10432
|
+
assistantContent,
|
|
10433
|
+
rubrics,
|
|
10434
|
+
evaluatorRawRequest,
|
|
10435
|
+
details,
|
|
10436
|
+
provider.targetName
|
|
10437
|
+
);
|
|
10416
10438
|
} catch (error) {
|
|
10417
10439
|
const message = error instanceof Error ? error.message : String(error);
|
|
10418
10440
|
return {
|
|
@@ -10423,6 +10445,7 @@ ${context.fileChanges}`;
|
|
|
10423
10445
|
],
|
|
10424
10446
|
expectedAspectCount: 1,
|
|
10425
10447
|
evaluatorRawRequest,
|
|
10448
|
+
graderTarget: provider.targetName,
|
|
10426
10449
|
details: {
|
|
10427
10450
|
mode: modeLabel,
|
|
10428
10451
|
grader_target: provider.targetName,
|
|
@@ -10567,7 +10590,7 @@ ${outputSchema}`;
|
|
|
10567
10590
|
* Parse the agent's response text into an EvaluationScore.
|
|
10568
10591
|
* Supports both freeform and rubric modes.
|
|
10569
10592
|
*/
|
|
10570
|
-
parseAgentResult(text, rubrics, evaluatorRawRequest, details) {
|
|
10593
|
+
parseAgentResult(text, rubrics, evaluatorRawRequest, details, graderTarget) {
|
|
10571
10594
|
try {
|
|
10572
10595
|
const parsed = parseJsonFromText(text);
|
|
10573
10596
|
if (rubrics && rubrics.length > 0) {
|
|
@@ -10579,6 +10602,7 @@ ${outputSchema}`;
|
|
|
10579
10602
|
assertions: assertions2,
|
|
10580
10603
|
expectedAspectCount: rubrics.length,
|
|
10581
10604
|
evaluatorRawRequest,
|
|
10605
|
+
graderTarget,
|
|
10582
10606
|
details
|
|
10583
10607
|
};
|
|
10584
10608
|
}
|
|
@@ -10591,6 +10615,7 @@ ${outputSchema}`;
|
|
|
10591
10615
|
assertions,
|
|
10592
10616
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
10593
10617
|
evaluatorRawRequest,
|
|
10618
|
+
graderTarget,
|
|
10594
10619
|
details
|
|
10595
10620
|
};
|
|
10596
10621
|
} catch {
|
|
@@ -10605,6 +10630,7 @@ ${outputSchema}`;
|
|
|
10605
10630
|
],
|
|
10606
10631
|
expectedAspectCount: 1,
|
|
10607
10632
|
evaluatorRawRequest,
|
|
10633
|
+
graderTarget,
|
|
10608
10634
|
details
|
|
10609
10635
|
};
|
|
10610
10636
|
}
|
|
@@ -14916,7 +14942,8 @@ async function runEvaluation(options) {
|
|
|
14916
14942
|
streamCallbacks,
|
|
14917
14943
|
typeRegistry,
|
|
14918
14944
|
repoManager,
|
|
14919
|
-
evalDir
|
|
14945
|
+
evalDir,
|
|
14946
|
+
verbose
|
|
14920
14947
|
};
|
|
14921
14948
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
14922
14949
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -14996,7 +15023,8 @@ async function runEvaluation(options) {
|
|
|
14996
15023
|
promptInputs,
|
|
14997
15024
|
primaryProvider,
|
|
14998
15025
|
"agent",
|
|
14999
|
-
"provider_error"
|
|
15026
|
+
"provider_error",
|
|
15027
|
+
verbose
|
|
15000
15028
|
);
|
|
15001
15029
|
results.push(errorResult);
|
|
15002
15030
|
if (onResult) {
|
|
@@ -15069,6 +15097,7 @@ async function runBatchEvaluation(options) {
|
|
|
15069
15097
|
nowFn,
|
|
15070
15098
|
onProgress,
|
|
15071
15099
|
onResult,
|
|
15100
|
+
verbose,
|
|
15072
15101
|
resolveGraderProvider,
|
|
15073
15102
|
agentTimeoutMs,
|
|
15074
15103
|
targetResolver,
|
|
@@ -15156,7 +15185,8 @@ async function runBatchEvaluation(options) {
|
|
|
15156
15185
|
startTime,
|
|
15157
15186
|
endTime,
|
|
15158
15187
|
targetResolver,
|
|
15159
|
-
availableTargets
|
|
15188
|
+
availableTargets,
|
|
15189
|
+
verbose
|
|
15160
15190
|
});
|
|
15161
15191
|
if (providerError) {
|
|
15162
15192
|
result = {
|
|
@@ -15177,7 +15207,8 @@ async function runBatchEvaluation(options) {
|
|
|
15177
15207
|
promptInputs,
|
|
15178
15208
|
provider,
|
|
15179
15209
|
"evaluator",
|
|
15180
|
-
"evaluator_error"
|
|
15210
|
+
"evaluator_error",
|
|
15211
|
+
verbose
|
|
15181
15212
|
);
|
|
15182
15213
|
results.push(errorResult);
|
|
15183
15214
|
if (onResult) {
|
|
@@ -15240,7 +15271,8 @@ async function runEvalCase(options) {
|
|
|
15240
15271
|
suiteWorkspaceFile,
|
|
15241
15272
|
typeRegistry: providedTypeRegistry,
|
|
15242
15273
|
repoManager,
|
|
15243
|
-
evalDir
|
|
15274
|
+
evalDir,
|
|
15275
|
+
verbose
|
|
15244
15276
|
} = options;
|
|
15245
15277
|
const setupDebug = process.env.AGENTV_SETUP_DEBUG === "1";
|
|
15246
15278
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
@@ -15277,7 +15309,8 @@ async function runEvalCase(options) {
|
|
|
15277
15309
|
promptInputs,
|
|
15278
15310
|
provider,
|
|
15279
15311
|
"setup",
|
|
15280
|
-
"template_error"
|
|
15312
|
+
"template_error",
|
|
15313
|
+
verbose
|
|
15281
15314
|
);
|
|
15282
15315
|
}
|
|
15283
15316
|
if (caseWorkspaceFile && workspacePath) {
|
|
@@ -15306,7 +15339,8 @@ async function runEvalCase(options) {
|
|
|
15306
15339
|
promptInputs,
|
|
15307
15340
|
provider,
|
|
15308
15341
|
"repo_setup",
|
|
15309
|
-
"local_path_not_found"
|
|
15342
|
+
"local_path_not_found",
|
|
15343
|
+
verbose
|
|
15310
15344
|
);
|
|
15311
15345
|
}
|
|
15312
15346
|
}
|
|
@@ -15332,7 +15366,8 @@ async function runEvalCase(options) {
|
|
|
15332
15366
|
promptInputs,
|
|
15333
15367
|
provider,
|
|
15334
15368
|
"repo_setup",
|
|
15335
|
-
"clone_error"
|
|
15369
|
+
"clone_error",
|
|
15370
|
+
verbose
|
|
15336
15371
|
);
|
|
15337
15372
|
}
|
|
15338
15373
|
}
|
|
@@ -15358,7 +15393,8 @@ async function runEvalCase(options) {
|
|
|
15358
15393
|
promptInputs,
|
|
15359
15394
|
provider,
|
|
15360
15395
|
"setup",
|
|
15361
|
-
"file_copy_error"
|
|
15396
|
+
"file_copy_error",
|
|
15397
|
+
verbose
|
|
15362
15398
|
);
|
|
15363
15399
|
}
|
|
15364
15400
|
}
|
|
@@ -15403,7 +15439,8 @@ async function runEvalCase(options) {
|
|
|
15403
15439
|
promptInputs,
|
|
15404
15440
|
provider,
|
|
15405
15441
|
"setup",
|
|
15406
|
-
"script_error"
|
|
15442
|
+
"script_error",
|
|
15443
|
+
verbose
|
|
15407
15444
|
);
|
|
15408
15445
|
}
|
|
15409
15446
|
}
|
|
@@ -15434,7 +15471,8 @@ async function runEvalCase(options) {
|
|
|
15434
15471
|
promptInputs,
|
|
15435
15472
|
provider,
|
|
15436
15473
|
"setup",
|
|
15437
|
-
"script_error"
|
|
15474
|
+
"script_error",
|
|
15475
|
+
verbose
|
|
15438
15476
|
);
|
|
15439
15477
|
}
|
|
15440
15478
|
}
|
|
@@ -15478,7 +15516,8 @@ async function runEvalCase(options) {
|
|
|
15478
15516
|
promptInputs,
|
|
15479
15517
|
provider,
|
|
15480
15518
|
"agent",
|
|
15481
|
-
"provider_error"
|
|
15519
|
+
"provider_error",
|
|
15520
|
+
verbose
|
|
15482
15521
|
);
|
|
15483
15522
|
if (workspacePath) {
|
|
15484
15523
|
if (forceCleanup) {
|
|
@@ -15499,7 +15538,8 @@ async function runEvalCase(options) {
|
|
|
15499
15538
|
promptInputs,
|
|
15500
15539
|
provider,
|
|
15501
15540
|
"agent",
|
|
15502
|
-
"provider_error"
|
|
15541
|
+
"provider_error",
|
|
15542
|
+
verbose
|
|
15503
15543
|
);
|
|
15504
15544
|
if (workspacePath) {
|
|
15505
15545
|
if (forceCleanup) {
|
|
@@ -15594,7 +15634,8 @@ async function runEvalCase(options) {
|
|
|
15594
15634
|
targetResolver,
|
|
15595
15635
|
availableTargets,
|
|
15596
15636
|
fileChanges,
|
|
15597
|
-
workspacePath
|
|
15637
|
+
workspacePath,
|
|
15638
|
+
verbose
|
|
15598
15639
|
});
|
|
15599
15640
|
const totalDurationMs = Date.now() - caseStartMs;
|
|
15600
15641
|
const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
|
|
@@ -15649,7 +15690,8 @@ async function runEvalCase(options) {
|
|
|
15649
15690
|
promptInputs,
|
|
15650
15691
|
provider,
|
|
15651
15692
|
"evaluator",
|
|
15652
|
-
"evaluator_error"
|
|
15693
|
+
"evaluator_error",
|
|
15694
|
+
verbose
|
|
15653
15695
|
);
|
|
15654
15696
|
if (workspacePath && !isSharedWorkspace) {
|
|
15655
15697
|
if (forceCleanup || (retainOnFailure ?? "keep") === "cleanup") {
|
|
@@ -15791,7 +15833,7 @@ async function evaluateCandidate(options) {
|
|
|
15791
15833
|
let lmRequest;
|
|
15792
15834
|
if (isAgentProvider(provider)) {
|
|
15793
15835
|
agentRequest = {
|
|
15794
|
-
|
|
15836
|
+
...options.verbose ? { input: promptInputs.question } : {}
|
|
15795
15837
|
};
|
|
15796
15838
|
} else {
|
|
15797
15839
|
if (promptInputs.chatPrompt) {
|
|
@@ -15805,8 +15847,9 @@ async function evaluateCandidate(options) {
|
|
|
15805
15847
|
}
|
|
15806
15848
|
}
|
|
15807
15849
|
const evaluatorRequest = scores ? void 0 : score.evaluatorRawRequest;
|
|
15808
|
-
const
|
|
15809
|
-
|
|
15850
|
+
const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
|
|
15851
|
+
const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
|
|
15852
|
+
...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
|
|
15810
15853
|
...lmRequest ? { lm: lmRequest } : {},
|
|
15811
15854
|
...evaluatorRequest ? { evaluator: evaluatorRequest } : {}
|
|
15812
15855
|
} : void 0;
|
|
@@ -15826,9 +15869,9 @@ async function evaluateCandidate(options) {
|
|
|
15826
15869
|
endTime,
|
|
15827
15870
|
requests,
|
|
15828
15871
|
input,
|
|
15872
|
+
output: output ?? [{ role: "assistant", content: candidate }],
|
|
15829
15873
|
scores,
|
|
15830
15874
|
trace,
|
|
15831
|
-
output: output ?? [{ role: "assistant", content: candidate }],
|
|
15832
15875
|
fileChanges,
|
|
15833
15876
|
executionStatus: classifyQualityStatus(score.score)
|
|
15834
15877
|
};
|
|
@@ -15994,6 +16037,7 @@ async function runEvaluatorList(options) {
|
|
|
15994
16037
|
verdict: score2.verdict,
|
|
15995
16038
|
assertions: score2.assertions,
|
|
15996
16039
|
input: score2.evaluatorRawRequest,
|
|
16040
|
+
target: score2.graderTarget,
|
|
15997
16041
|
details: score2.details,
|
|
15998
16042
|
scores: mapChildResults(score2.scores),
|
|
15999
16043
|
tokenUsage: score2.tokenUsage,
|
|
@@ -16133,13 +16177,13 @@ async function invokeProvider(provider, options) {
|
|
|
16133
16177
|
}
|
|
16134
16178
|
}
|
|
16135
16179
|
}
|
|
16136
|
-
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
|
|
16180
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode, verbose) {
|
|
16137
16181
|
const message = error instanceof Error ? error.message : String(error);
|
|
16138
16182
|
let agentRequest;
|
|
16139
16183
|
let lmRequest;
|
|
16140
16184
|
if (isAgentProvider(provider)) {
|
|
16141
16185
|
agentRequest = {
|
|
16142
|
-
|
|
16186
|
+
...verbose ? { input: promptInputs.question } : {},
|
|
16143
16187
|
error: message
|
|
16144
16188
|
};
|
|
16145
16189
|
} else {
|
|
@@ -16167,10 +16211,10 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
16167
16211
|
conversationId: evalCase.conversation_id,
|
|
16168
16212
|
score: 0,
|
|
16169
16213
|
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
16170
|
-
output: [{ role: "assistant", content: `Error occurred: ${message}` }],
|
|
16171
16214
|
target: targetName,
|
|
16172
16215
|
requests,
|
|
16173
16216
|
input,
|
|
16217
|
+
output: [{ role: "assistant", content: `Error occurred: ${message}` }],
|
|
16174
16218
|
error: message,
|
|
16175
16219
|
executionStatus: "execution_error",
|
|
16176
16220
|
failureStage,
|