@agentv/core 3.9.0 → 3.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-PC5TLJF6.js → chunk-K7JCJIXA.js} +1 -1
- package/dist/chunk-K7JCJIXA.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +100 -40
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +17 -5
- package/dist/index.d.ts +17 -5
- package/dist/index.js +100 -41
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-PC5TLJF6.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
readTextFile,
|
|
20
20
|
resolveFileReference,
|
|
21
21
|
resolveTargetDefinition
|
|
22
|
-
} from "./chunk-PC5TLJF6.js";
|
|
22
|
+
} from "./chunk-K7JCJIXA.js";
|
|
23
23
|
import {
|
|
24
24
|
AgentvProvider
|
|
25
25
|
} from "./chunk-W5YDZWT4.js";
|
|
@@ -420,6 +420,17 @@ function extractTargetsFromSuite(suite) {
|
|
|
420
420
|
}
|
|
421
421
|
return void 0;
|
|
422
422
|
}
|
|
423
|
+
function extractWorkersFromSuite(suite) {
|
|
424
|
+
const execution = suite.execution;
|
|
425
|
+
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
426
|
+
return void 0;
|
|
427
|
+
}
|
|
428
|
+
const workers = execution.workers;
|
|
429
|
+
if (typeof workers === "number" && Number.isInteger(workers) && workers >= 1 && workers <= 50) {
|
|
430
|
+
return workers;
|
|
431
|
+
}
|
|
432
|
+
return void 0;
|
|
433
|
+
}
|
|
423
434
|
function extractTargetsFromTestCase(testCase) {
|
|
424
435
|
const execution = testCase.execution;
|
|
425
436
|
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
@@ -2720,6 +2731,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
2720
2731
|
tests,
|
|
2721
2732
|
trials: extractTrialsConfig(parsed),
|
|
2722
2733
|
targets: extractTargetsFromSuite(parsed),
|
|
2734
|
+
workers: extractWorkersFromSuite(parsed),
|
|
2723
2735
|
cacheConfig: extractCacheConfig(parsed),
|
|
2724
2736
|
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
2725
2737
|
...metadata !== void 0 && { metadata },
|
|
@@ -10160,8 +10172,7 @@ ${context.fileChanges}`;
|
|
|
10160
10172
|
}
|
|
10161
10173
|
const evaluatorRawRequest = {
|
|
10162
10174
|
userPrompt,
|
|
10163
|
-
systemPrompt,
|
|
10164
|
-
target: graderProvider.targetName
|
|
10175
|
+
systemPrompt
|
|
10165
10176
|
};
|
|
10166
10177
|
try {
|
|
10167
10178
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -10179,6 +10190,7 @@ ${context.fileChanges}`;
|
|
|
10179
10190
|
assertions,
|
|
10180
10191
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
10181
10192
|
evaluatorRawRequest,
|
|
10193
|
+
graderTarget: graderProvider.targetName,
|
|
10182
10194
|
tokenUsage
|
|
10183
10195
|
};
|
|
10184
10196
|
} catch (e) {
|
|
@@ -10190,7 +10202,8 @@ ${context.fileChanges}`;
|
|
|
10190
10202
|
verdict: "skip",
|
|
10191
10203
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
10192
10204
|
expectedAspectCount: 1,
|
|
10193
|
-
evaluatorRawRequest
|
|
10205
|
+
evaluatorRawRequest,
|
|
10206
|
+
graderTarget: graderProvider.targetName
|
|
10194
10207
|
};
|
|
10195
10208
|
}
|
|
10196
10209
|
}
|
|
@@ -10208,8 +10221,7 @@ ${context.fileChanges}`;
|
|
|
10208
10221
|
const systemPrompt = buildRubricOutputSchema();
|
|
10209
10222
|
const evaluatorRawRequest = {
|
|
10210
10223
|
userPrompt: prompt,
|
|
10211
|
-
systemPrompt,
|
|
10212
|
-
target: graderProvider.targetName
|
|
10224
|
+
systemPrompt
|
|
10213
10225
|
};
|
|
10214
10226
|
try {
|
|
10215
10227
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -10226,6 +10238,7 @@ ${context.fileChanges}`;
|
|
|
10226
10238
|
assertions,
|
|
10227
10239
|
expectedAspectCount: rubrics.length,
|
|
10228
10240
|
evaluatorRawRequest,
|
|
10241
|
+
graderTarget: graderProvider.targetName,
|
|
10229
10242
|
tokenUsage
|
|
10230
10243
|
};
|
|
10231
10244
|
} catch (e) {
|
|
@@ -10237,7 +10250,8 @@ ${context.fileChanges}`;
|
|
|
10237
10250
|
verdict: "skip",
|
|
10238
10251
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
10239
10252
|
expectedAspectCount: rubrics.length,
|
|
10240
|
-
evaluatorRawRequest
|
|
10253
|
+
evaluatorRawRequest,
|
|
10254
|
+
graderTarget: graderProvider.targetName
|
|
10241
10255
|
};
|
|
10242
10256
|
}
|
|
10243
10257
|
}
|
|
@@ -10250,8 +10264,7 @@ ${context.fileChanges}`;
|
|
|
10250
10264
|
const systemPrompt = buildScoreRangeOutputSchema();
|
|
10251
10265
|
const evaluatorRawRequest = {
|
|
10252
10266
|
userPrompt: prompt,
|
|
10253
|
-
systemPrompt,
|
|
10254
|
-
target: graderProvider.targetName
|
|
10267
|
+
systemPrompt
|
|
10255
10268
|
};
|
|
10256
10269
|
try {
|
|
10257
10270
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -10268,6 +10281,7 @@ ${context.fileChanges}`;
|
|
|
10268
10281
|
assertions,
|
|
10269
10282
|
expectedAspectCount: rubrics.length,
|
|
10270
10283
|
evaluatorRawRequest,
|
|
10284
|
+
graderTarget: graderProvider.targetName,
|
|
10271
10285
|
details,
|
|
10272
10286
|
tokenUsage
|
|
10273
10287
|
};
|
|
@@ -10280,7 +10294,8 @@ ${context.fileChanges}`;
|
|
|
10280
10294
|
verdict: "skip",
|
|
10281
10295
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
10282
10296
|
expectedAspectCount: rubrics.length,
|
|
10283
|
-
evaluatorRawRequest
|
|
10297
|
+
evaluatorRawRequest,
|
|
10298
|
+
graderTarget: graderProvider.targetName
|
|
10284
10299
|
};
|
|
10285
10300
|
}
|
|
10286
10301
|
}
|
|
@@ -10312,7 +10327,6 @@ ${context.fileChanges}`;
|
|
|
10312
10327
|
mode: "built-in",
|
|
10313
10328
|
systemPrompt,
|
|
10314
10329
|
userPrompt,
|
|
10315
|
-
target: graderProvider.targetName,
|
|
10316
10330
|
maxSteps: this.maxSteps
|
|
10317
10331
|
};
|
|
10318
10332
|
try {
|
|
@@ -10330,7 +10344,13 @@ ${context.fileChanges}`;
|
|
|
10330
10344
|
steps: steps.length,
|
|
10331
10345
|
tool_calls: toolCallCount
|
|
10332
10346
|
};
|
|
10333
|
-
return this.parseAgentResult(text, rubrics, evaluatorRawRequest, details);
|
|
10347
|
+
return this.parseAgentResult(
|
|
10348
|
+
text,
|
|
10349
|
+
rubrics,
|
|
10350
|
+
evaluatorRawRequest,
|
|
10351
|
+
details,
|
|
10352
|
+
graderProvider.targetName
|
|
10353
|
+
);
|
|
10334
10354
|
} catch (error) {
|
|
10335
10355
|
const message = error instanceof Error ? error.message : String(error);
|
|
10336
10356
|
return {
|
|
@@ -10339,6 +10359,7 @@ ${context.fileChanges}`;
|
|
|
10339
10359
|
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
10340
10360
|
expectedAspectCount: 1,
|
|
10341
10361
|
evaluatorRawRequest,
|
|
10362
|
+
graderTarget: graderProvider.targetName,
|
|
10342
10363
|
details: { mode: "built-in", error: message }
|
|
10343
10364
|
};
|
|
10344
10365
|
}
|
|
@@ -10391,6 +10412,7 @@ ${context.fileChanges}`;
|
|
|
10391
10412
|
],
|
|
10392
10413
|
expectedAspectCount: 1,
|
|
10393
10414
|
evaluatorRawRequest,
|
|
10415
|
+
graderTarget: provider.targetName,
|
|
10394
10416
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
10395
10417
|
};
|
|
10396
10418
|
}
|
|
@@ -10400,7 +10422,13 @@ ${context.fileChanges}`;
|
|
|
10400
10422
|
mode: modeLabel,
|
|
10401
10423
|
grader_target: provider.targetName
|
|
10402
10424
|
};
|
|
10403
|
-
return this.parseAgentResult(assistantContent, rubrics, evaluatorRawRequest, details);
|
|
10425
|
+
return this.parseAgentResult(
|
|
10426
|
+
assistantContent,
|
|
10427
|
+
rubrics,
|
|
10428
|
+
evaluatorRawRequest,
|
|
10429
|
+
details,
|
|
10430
|
+
provider.targetName
|
|
10431
|
+
);
|
|
10404
10432
|
} catch (error) {
|
|
10405
10433
|
const message = error instanceof Error ? error.message : String(error);
|
|
10406
10434
|
return {
|
|
@@ -10411,6 +10439,7 @@ ${context.fileChanges}`;
|
|
|
10411
10439
|
],
|
|
10412
10440
|
expectedAspectCount: 1,
|
|
10413
10441
|
evaluatorRawRequest,
|
|
10442
|
+
graderTarget: provider.targetName,
|
|
10414
10443
|
details: {
|
|
10415
10444
|
mode: modeLabel,
|
|
10416
10445
|
grader_target: provider.targetName,
|
|
@@ -10555,7 +10584,7 @@ ${outputSchema}`;
|
|
|
10555
10584
|
* Parse the agent's response text into an EvaluationScore.
|
|
10556
10585
|
* Supports both freeform and rubric modes.
|
|
10557
10586
|
*/
|
|
10558
|
-
parseAgentResult(text, rubrics, evaluatorRawRequest, details) {
|
|
10587
|
+
parseAgentResult(text, rubrics, evaluatorRawRequest, details, graderTarget) {
|
|
10559
10588
|
try {
|
|
10560
10589
|
const parsed = parseJsonFromText(text);
|
|
10561
10590
|
if (rubrics && rubrics.length > 0) {
|
|
@@ -10567,6 +10596,7 @@ ${outputSchema}`;
|
|
|
10567
10596
|
assertions: assertions2,
|
|
10568
10597
|
expectedAspectCount: rubrics.length,
|
|
10569
10598
|
evaluatorRawRequest,
|
|
10599
|
+
graderTarget,
|
|
10570
10600
|
details
|
|
10571
10601
|
};
|
|
10572
10602
|
}
|
|
@@ -10579,6 +10609,7 @@ ${outputSchema}`;
|
|
|
10579
10609
|
assertions,
|
|
10580
10610
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
10581
10611
|
evaluatorRawRequest,
|
|
10612
|
+
graderTarget,
|
|
10582
10613
|
details
|
|
10583
10614
|
};
|
|
10584
10615
|
} catch {
|
|
@@ -10593,6 +10624,7 @@ ${outputSchema}`;
|
|
|
10593
10624
|
],
|
|
10594
10625
|
expectedAspectCount: 1,
|
|
10595
10626
|
evaluatorRawRequest,
|
|
10627
|
+
graderTarget,
|
|
10596
10628
|
details
|
|
10597
10629
|
};
|
|
10598
10630
|
}
|
|
@@ -14606,14 +14638,22 @@ async function runEvaluation(options) {
|
|
|
14606
14638
|
const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
|
|
14607
14639
|
const resolvedRetainOnSuccess = retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
|
|
14608
14640
|
const resolvedRetainOnFailure = retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
|
|
14609
|
-
const
|
|
14610
|
-
const workers = hasSharedWorkspace && !usePool ? 1 : requestedWorkers;
|
|
14641
|
+
const workers = options.maxConcurrency ?? target.workers ?? 1;
|
|
14611
14642
|
setupLog(
|
|
14612
|
-
`sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool}
|
|
14643
|
+
`sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} workers=${workers}`
|
|
14613
14644
|
);
|
|
14614
|
-
if (hasSharedWorkspace && !usePool &&
|
|
14645
|
+
if (hasSharedWorkspace && !usePool && workers > 1) {
|
|
14615
14646
|
console.warn(
|
|
14616
|
-
|
|
14647
|
+
[
|
|
14648
|
+
`Warning: This eval uses a shared workspace with ${workers} workers.`,
|
|
14649
|
+
"If the agent under test makes file edits, concurrent runs may corrupt each other.",
|
|
14650
|
+
"To limit concurrency, add this to your eval YAML:",
|
|
14651
|
+
"",
|
|
14652
|
+
" execution:",
|
|
14653
|
+
" workers: 1",
|
|
14654
|
+
"",
|
|
14655
|
+
"Or pass --workers 1 on the command line."
|
|
14656
|
+
].join("\n")
|
|
14617
14657
|
);
|
|
14618
14658
|
}
|
|
14619
14659
|
const limit = pLimit(workers);
|
|
@@ -14896,7 +14936,8 @@ async function runEvaluation(options) {
|
|
|
14896
14936
|
streamCallbacks,
|
|
14897
14937
|
typeRegistry,
|
|
14898
14938
|
repoManager,
|
|
14899
|
-
evalDir
|
|
14939
|
+
evalDir,
|
|
14940
|
+
verbose
|
|
14900
14941
|
};
|
|
14901
14942
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
14902
14943
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -14976,7 +15017,8 @@ async function runEvaluation(options) {
|
|
|
14976
15017
|
promptInputs,
|
|
14977
15018
|
primaryProvider,
|
|
14978
15019
|
"agent",
|
|
14979
|
-
"provider_error"
|
|
15020
|
+
"provider_error",
|
|
15021
|
+
verbose
|
|
14980
15022
|
);
|
|
14981
15023
|
results.push(errorResult);
|
|
14982
15024
|
if (onResult) {
|
|
@@ -15049,6 +15091,7 @@ async function runBatchEvaluation(options) {
|
|
|
15049
15091
|
nowFn,
|
|
15050
15092
|
onProgress,
|
|
15051
15093
|
onResult,
|
|
15094
|
+
verbose,
|
|
15052
15095
|
resolveGraderProvider,
|
|
15053
15096
|
agentTimeoutMs,
|
|
15054
15097
|
targetResolver,
|
|
@@ -15136,7 +15179,8 @@ async function runBatchEvaluation(options) {
|
|
|
15136
15179
|
startTime,
|
|
15137
15180
|
endTime,
|
|
15138
15181
|
targetResolver,
|
|
15139
|
-
availableTargets
|
|
15182
|
+
availableTargets,
|
|
15183
|
+
verbose
|
|
15140
15184
|
});
|
|
15141
15185
|
if (providerError) {
|
|
15142
15186
|
result = {
|
|
@@ -15157,7 +15201,8 @@ async function runBatchEvaluation(options) {
|
|
|
15157
15201
|
promptInputs,
|
|
15158
15202
|
provider,
|
|
15159
15203
|
"evaluator",
|
|
15160
|
-
"evaluator_error"
|
|
15204
|
+
"evaluator_error",
|
|
15205
|
+
verbose
|
|
15161
15206
|
);
|
|
15162
15207
|
results.push(errorResult);
|
|
15163
15208
|
if (onResult) {
|
|
@@ -15220,7 +15265,8 @@ async function runEvalCase(options) {
|
|
|
15220
15265
|
suiteWorkspaceFile,
|
|
15221
15266
|
typeRegistry: providedTypeRegistry,
|
|
15222
15267
|
repoManager,
|
|
15223
|
-
evalDir
|
|
15268
|
+
evalDir,
|
|
15269
|
+
verbose
|
|
15224
15270
|
} = options;
|
|
15225
15271
|
const setupDebug = process.env.AGENTV_SETUP_DEBUG === "1";
|
|
15226
15272
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
@@ -15257,7 +15303,8 @@ async function runEvalCase(options) {
|
|
|
15257
15303
|
promptInputs,
|
|
15258
15304
|
provider,
|
|
15259
15305
|
"setup",
|
|
15260
|
-
"template_error"
|
|
15306
|
+
"template_error",
|
|
15307
|
+
verbose
|
|
15261
15308
|
);
|
|
15262
15309
|
}
|
|
15263
15310
|
if (caseWorkspaceFile && workspacePath) {
|
|
@@ -15286,7 +15333,8 @@ async function runEvalCase(options) {
|
|
|
15286
15333
|
promptInputs,
|
|
15287
15334
|
provider,
|
|
15288
15335
|
"repo_setup",
|
|
15289
|
-
"local_path_not_found"
|
|
15336
|
+
"local_path_not_found",
|
|
15337
|
+
verbose
|
|
15290
15338
|
);
|
|
15291
15339
|
}
|
|
15292
15340
|
}
|
|
@@ -15312,7 +15360,8 @@ async function runEvalCase(options) {
|
|
|
15312
15360
|
promptInputs,
|
|
15313
15361
|
provider,
|
|
15314
15362
|
"repo_setup",
|
|
15315
|
-
"clone_error"
|
|
15363
|
+
"clone_error",
|
|
15364
|
+
verbose
|
|
15316
15365
|
);
|
|
15317
15366
|
}
|
|
15318
15367
|
}
|
|
@@ -15338,7 +15387,8 @@ async function runEvalCase(options) {
|
|
|
15338
15387
|
promptInputs,
|
|
15339
15388
|
provider,
|
|
15340
15389
|
"setup",
|
|
15341
|
-
"file_copy_error"
|
|
15390
|
+
"file_copy_error",
|
|
15391
|
+
verbose
|
|
15342
15392
|
);
|
|
15343
15393
|
}
|
|
15344
15394
|
}
|
|
@@ -15383,7 +15433,8 @@ async function runEvalCase(options) {
|
|
|
15383
15433
|
promptInputs,
|
|
15384
15434
|
provider,
|
|
15385
15435
|
"setup",
|
|
15386
|
-
"script_error"
|
|
15436
|
+
"script_error",
|
|
15437
|
+
verbose
|
|
15387
15438
|
);
|
|
15388
15439
|
}
|
|
15389
15440
|
}
|
|
@@ -15414,7 +15465,8 @@ async function runEvalCase(options) {
|
|
|
15414
15465
|
promptInputs,
|
|
15415
15466
|
provider,
|
|
15416
15467
|
"setup",
|
|
15417
|
-
"script_error"
|
|
15468
|
+
"script_error",
|
|
15469
|
+
verbose
|
|
15418
15470
|
);
|
|
15419
15471
|
}
|
|
15420
15472
|
}
|
|
@@ -15458,7 +15510,8 @@ async function runEvalCase(options) {
|
|
|
15458
15510
|
promptInputs,
|
|
15459
15511
|
provider,
|
|
15460
15512
|
"agent",
|
|
15461
|
-
"provider_error"
|
|
15513
|
+
"provider_error",
|
|
15514
|
+
verbose
|
|
15462
15515
|
);
|
|
15463
15516
|
if (workspacePath) {
|
|
15464
15517
|
if (forceCleanup) {
|
|
@@ -15479,7 +15532,8 @@ async function runEvalCase(options) {
|
|
|
15479
15532
|
promptInputs,
|
|
15480
15533
|
provider,
|
|
15481
15534
|
"agent",
|
|
15482
|
-
"provider_error"
|
|
15535
|
+
"provider_error",
|
|
15536
|
+
verbose
|
|
15483
15537
|
);
|
|
15484
15538
|
if (workspacePath) {
|
|
15485
15539
|
if (forceCleanup) {
|
|
@@ -15574,7 +15628,8 @@ async function runEvalCase(options) {
|
|
|
15574
15628
|
targetResolver,
|
|
15575
15629
|
availableTargets,
|
|
15576
15630
|
fileChanges,
|
|
15577
|
-
workspacePath
|
|
15631
|
+
workspacePath,
|
|
15632
|
+
verbose
|
|
15578
15633
|
});
|
|
15579
15634
|
const totalDurationMs = Date.now() - caseStartMs;
|
|
15580
15635
|
const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
|
|
@@ -15629,7 +15684,8 @@ async function runEvalCase(options) {
|
|
|
15629
15684
|
promptInputs,
|
|
15630
15685
|
provider,
|
|
15631
15686
|
"evaluator",
|
|
15632
|
-
"evaluator_error"
|
|
15687
|
+
"evaluator_error",
|
|
15688
|
+
verbose
|
|
15633
15689
|
);
|
|
15634
15690
|
if (workspacePath && !isSharedWorkspace) {
|
|
15635
15691
|
if (forceCleanup || (retainOnFailure ?? "keep") === "cleanup") {
|
|
@@ -15771,7 +15827,7 @@ async function evaluateCandidate(options) {
|
|
|
15771
15827
|
let lmRequest;
|
|
15772
15828
|
if (isAgentProvider(provider)) {
|
|
15773
15829
|
agentRequest = {
|
|
15774
|
-
|
|
15830
|
+
...options.verbose ? { input: promptInputs.question } : {}
|
|
15775
15831
|
};
|
|
15776
15832
|
} else {
|
|
15777
15833
|
if (promptInputs.chatPrompt) {
|
|
@@ -15785,8 +15841,9 @@ async function evaluateCandidate(options) {
|
|
|
15785
15841
|
}
|
|
15786
15842
|
}
|
|
15787
15843
|
const evaluatorRequest = scores ? void 0 : score.evaluatorRawRequest;
|
|
15788
|
-
const
|
|
15789
|
-
|
|
15844
|
+
const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
|
|
15845
|
+
const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
|
|
15846
|
+
...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
|
|
15790
15847
|
...lmRequest ? { lm: lmRequest } : {},
|
|
15791
15848
|
...evaluatorRequest ? { evaluator: evaluatorRequest } : {}
|
|
15792
15849
|
} : void 0;
|
|
@@ -15806,9 +15863,9 @@ async function evaluateCandidate(options) {
|
|
|
15806
15863
|
endTime,
|
|
15807
15864
|
requests,
|
|
15808
15865
|
input,
|
|
15866
|
+
output: output ?? [{ role: "assistant", content: candidate }],
|
|
15809
15867
|
scores,
|
|
15810
15868
|
trace,
|
|
15811
|
-
output: output ?? [{ role: "assistant", content: candidate }],
|
|
15812
15869
|
fileChanges,
|
|
15813
15870
|
executionStatus: classifyQualityStatus(score.score)
|
|
15814
15871
|
};
|
|
@@ -15974,6 +16031,7 @@ async function runEvaluatorList(options) {
|
|
|
15974
16031
|
verdict: score2.verdict,
|
|
15975
16032
|
assertions: score2.assertions,
|
|
15976
16033
|
input: score2.evaluatorRawRequest,
|
|
16034
|
+
target: score2.graderTarget,
|
|
15977
16035
|
details: score2.details,
|
|
15978
16036
|
scores: mapChildResults(score2.scores),
|
|
15979
16037
|
tokenUsage: score2.tokenUsage,
|
|
@@ -16113,13 +16171,13 @@ async function invokeProvider(provider, options) {
|
|
|
16113
16171
|
}
|
|
16114
16172
|
}
|
|
16115
16173
|
}
|
|
16116
|
-
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
|
|
16174
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode, verbose) {
|
|
16117
16175
|
const message = error instanceof Error ? error.message : String(error);
|
|
16118
16176
|
let agentRequest;
|
|
16119
16177
|
let lmRequest;
|
|
16120
16178
|
if (isAgentProvider(provider)) {
|
|
16121
16179
|
agentRequest = {
|
|
16122
|
-
|
|
16180
|
+
...verbose ? { input: promptInputs.question } : {},
|
|
16123
16181
|
error: message
|
|
16124
16182
|
};
|
|
16125
16183
|
} else {
|
|
@@ -16147,10 +16205,10 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
16147
16205
|
conversationId: evalCase.conversation_id,
|
|
16148
16206
|
score: 0,
|
|
16149
16207
|
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
16150
|
-
output: [{ role: "assistant", content: `Error occurred: ${message}` }],
|
|
16151
16208
|
target: targetName,
|
|
16152
16209
|
requests,
|
|
16153
16210
|
input,
|
|
16211
|
+
output: [{ role: "assistant", content: `Error occurred: ${message}` }],
|
|
16154
16212
|
error: message,
|
|
16155
16213
|
executionStatus: "execution_error",
|
|
16156
16214
|
failureStage,
|
|
@@ -17219,6 +17277,7 @@ export {
|
|
|
17219
17277
|
extractTargetsFromSuite,
|
|
17220
17278
|
extractTargetsFromTestCase,
|
|
17221
17279
|
extractTrialsConfig,
|
|
17280
|
+
extractWorkersFromSuite,
|
|
17222
17281
|
fileExists,
|
|
17223
17282
|
findGitRoot,
|
|
17224
17283
|
freeformEvaluationSchema,
|