agentv 3.9.0 → 3.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-Q2YWV4QM.js → chunk-6ZAFWUBT.js} +29 -18
- package/dist/chunk-6ZAFWUBT.js.map +1 -0
- package/dist/{chunk-GC5P5HHZ.js → chunk-JGMJL2LV.js} +76 -42
- package/dist/chunk-JGMJL2LV.js.map +1 -0
- package/dist/{chunk-TXDPYXHY.js → chunk-OIVGGWJ3.js} +102 -43
- package/dist/chunk-OIVGGWJ3.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-PIOSPBKX.js → dist-PUPHGVKL.js} +4 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-3VTDK5NX.js → interactive-BD56NB23.js} +3 -3
- package/dist/templates/.agentv/config.yaml +4 -13
- package/dist/templates/.agentv/targets.yaml +0 -16
- package/dist/templates/{.agentv/.env.example → .env.example} +11 -9
- package/package.json +1 -1
- package/dist/chunk-GC5P5HHZ.js.map +0 -1
- package/dist/chunk-Q2YWV4QM.js.map +0 -1
- package/dist/chunk-TXDPYXHY.js.map +0 -1
- package/dist/{dist-PIOSPBKX.js.map → dist-PUPHGVKL.js.map} +0 -0
- package/dist/{interactive-3VTDK5NX.js.map → interactive-BD56NB23.js.map} +0 -0
|
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
|
|
|
301
301
|
}
|
|
302
302
|
});
|
|
303
303
|
|
|
304
|
-
// ../../packages/core/dist/chunk-
|
|
304
|
+
// ../../packages/core/dist/chunk-K7JCJIXA.js
|
|
305
305
|
import { constants } from "node:fs";
|
|
306
306
|
import { access, readFile } from "node:fs/promises";
|
|
307
307
|
import path from "node:path";
|
|
@@ -419,7 +419,7 @@ __export(external_exports2, {
|
|
|
419
419
|
void: () => voidType
|
|
420
420
|
});
|
|
421
421
|
|
|
422
|
-
// ../../packages/core/dist/chunk-
|
|
422
|
+
// ../../packages/core/dist/chunk-K7JCJIXA.js
|
|
423
423
|
import { readFile as readFile2 } from "node:fs/promises";
|
|
424
424
|
import path3 from "node:path";
|
|
425
425
|
import fg from "fast-glob";
|
|
@@ -14472,6 +14472,17 @@ function extractTargetsFromSuite(suite) {
|
|
|
14472
14472
|
}
|
|
14473
14473
|
return void 0;
|
|
14474
14474
|
}
|
|
14475
|
+
function extractWorkersFromSuite(suite) {
|
|
14476
|
+
const execution = suite.execution;
|
|
14477
|
+
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
14478
|
+
return void 0;
|
|
14479
|
+
}
|
|
14480
|
+
const workers = execution.workers;
|
|
14481
|
+
if (typeof workers === "number" && Number.isInteger(workers) && workers >= 1 && workers <= 50) {
|
|
14482
|
+
return workers;
|
|
14483
|
+
}
|
|
14484
|
+
return void 0;
|
|
14485
|
+
}
|
|
14475
14486
|
function extractTargetsFromTestCase(testCase) {
|
|
14476
14487
|
const execution = testCase.execution;
|
|
14477
14488
|
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
@@ -16735,6 +16746,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
16735
16746
|
tests,
|
|
16736
16747
|
trials: extractTrialsConfig(parsed),
|
|
16737
16748
|
targets: extractTargetsFromSuite(parsed),
|
|
16749
|
+
workers: extractWorkersFromSuite(parsed),
|
|
16738
16750
|
cacheConfig: extractCacheConfig(parsed),
|
|
16739
16751
|
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
16740
16752
|
...metadata !== void 0 && { metadata },
|
|
@@ -23966,8 +23978,7 @@ ${context2.fileChanges}`;
|
|
|
23966
23978
|
}
|
|
23967
23979
|
const evaluatorRawRequest = {
|
|
23968
23980
|
userPrompt,
|
|
23969
|
-
systemPrompt,
|
|
23970
|
-
target: graderProvider.targetName
|
|
23981
|
+
systemPrompt
|
|
23971
23982
|
};
|
|
23972
23983
|
try {
|
|
23973
23984
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -23985,6 +23996,7 @@ ${context2.fileChanges}`;
|
|
|
23985
23996
|
assertions,
|
|
23986
23997
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
23987
23998
|
evaluatorRawRequest,
|
|
23999
|
+
graderTarget: graderProvider.targetName,
|
|
23988
24000
|
tokenUsage
|
|
23989
24001
|
};
|
|
23990
24002
|
} catch (e) {
|
|
@@ -23996,7 +24008,8 @@ ${context2.fileChanges}`;
|
|
|
23996
24008
|
verdict: "skip",
|
|
23997
24009
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
23998
24010
|
expectedAspectCount: 1,
|
|
23999
|
-
evaluatorRawRequest
|
|
24011
|
+
evaluatorRawRequest,
|
|
24012
|
+
graderTarget: graderProvider.targetName
|
|
24000
24013
|
};
|
|
24001
24014
|
}
|
|
24002
24015
|
}
|
|
@@ -24014,8 +24027,7 @@ ${context2.fileChanges}`;
|
|
|
24014
24027
|
const systemPrompt = buildRubricOutputSchema();
|
|
24015
24028
|
const evaluatorRawRequest = {
|
|
24016
24029
|
userPrompt: prompt,
|
|
24017
|
-
systemPrompt,
|
|
24018
|
-
target: graderProvider.targetName
|
|
24030
|
+
systemPrompt
|
|
24019
24031
|
};
|
|
24020
24032
|
try {
|
|
24021
24033
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -24032,6 +24044,7 @@ ${context2.fileChanges}`;
|
|
|
24032
24044
|
assertions,
|
|
24033
24045
|
expectedAspectCount: rubrics.length,
|
|
24034
24046
|
evaluatorRawRequest,
|
|
24047
|
+
graderTarget: graderProvider.targetName,
|
|
24035
24048
|
tokenUsage
|
|
24036
24049
|
};
|
|
24037
24050
|
} catch (e) {
|
|
@@ -24043,7 +24056,8 @@ ${context2.fileChanges}`;
|
|
|
24043
24056
|
verdict: "skip",
|
|
24044
24057
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
24045
24058
|
expectedAspectCount: rubrics.length,
|
|
24046
|
-
evaluatorRawRequest
|
|
24059
|
+
evaluatorRawRequest,
|
|
24060
|
+
graderTarget: graderProvider.targetName
|
|
24047
24061
|
};
|
|
24048
24062
|
}
|
|
24049
24063
|
}
|
|
@@ -24056,8 +24070,7 @@ ${context2.fileChanges}`;
|
|
|
24056
24070
|
const systemPrompt = buildScoreRangeOutputSchema();
|
|
24057
24071
|
const evaluatorRawRequest = {
|
|
24058
24072
|
userPrompt: prompt,
|
|
24059
|
-
systemPrompt,
|
|
24060
|
-
target: graderProvider.targetName
|
|
24073
|
+
systemPrompt
|
|
24061
24074
|
};
|
|
24062
24075
|
try {
|
|
24063
24076
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
@@ -24074,6 +24087,7 @@ ${context2.fileChanges}`;
|
|
|
24074
24087
|
assertions,
|
|
24075
24088
|
expectedAspectCount: rubrics.length,
|
|
24076
24089
|
evaluatorRawRequest,
|
|
24090
|
+
graderTarget: graderProvider.targetName,
|
|
24077
24091
|
details,
|
|
24078
24092
|
tokenUsage
|
|
24079
24093
|
};
|
|
@@ -24086,7 +24100,8 @@ ${context2.fileChanges}`;
|
|
|
24086
24100
|
verdict: "skip",
|
|
24087
24101
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
24088
24102
|
expectedAspectCount: rubrics.length,
|
|
24089
|
-
evaluatorRawRequest
|
|
24103
|
+
evaluatorRawRequest,
|
|
24104
|
+
graderTarget: graderProvider.targetName
|
|
24090
24105
|
};
|
|
24091
24106
|
}
|
|
24092
24107
|
}
|
|
@@ -24118,7 +24133,6 @@ ${context2.fileChanges}`;
|
|
|
24118
24133
|
mode: "built-in",
|
|
24119
24134
|
systemPrompt,
|
|
24120
24135
|
userPrompt,
|
|
24121
|
-
target: graderProvider.targetName,
|
|
24122
24136
|
maxSteps: this.maxSteps
|
|
24123
24137
|
};
|
|
24124
24138
|
try {
|
|
@@ -24136,7 +24150,13 @@ ${context2.fileChanges}`;
|
|
|
24136
24150
|
steps: steps.length,
|
|
24137
24151
|
tool_calls: toolCallCount
|
|
24138
24152
|
};
|
|
24139
|
-
return this.parseAgentResult(text2, rubrics, evaluatorRawRequest, details);
|
|
24153
|
+
return this.parseAgentResult(
|
|
24154
|
+
text2,
|
|
24155
|
+
rubrics,
|
|
24156
|
+
evaluatorRawRequest,
|
|
24157
|
+
details,
|
|
24158
|
+
graderProvider.targetName
|
|
24159
|
+
);
|
|
24140
24160
|
} catch (error) {
|
|
24141
24161
|
const message = error instanceof Error ? error.message : String(error);
|
|
24142
24162
|
return {
|
|
@@ -24145,6 +24165,7 @@ ${context2.fileChanges}`;
|
|
|
24145
24165
|
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
24146
24166
|
expectedAspectCount: 1,
|
|
24147
24167
|
evaluatorRawRequest,
|
|
24168
|
+
graderTarget: graderProvider.targetName,
|
|
24148
24169
|
details: { mode: "built-in", error: message }
|
|
24149
24170
|
};
|
|
24150
24171
|
}
|
|
@@ -24197,6 +24218,7 @@ ${context2.fileChanges}`;
|
|
|
24197
24218
|
],
|
|
24198
24219
|
expectedAspectCount: 1,
|
|
24199
24220
|
evaluatorRawRequest,
|
|
24221
|
+
graderTarget: provider.targetName,
|
|
24200
24222
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
24201
24223
|
};
|
|
24202
24224
|
}
|
|
@@ -24206,7 +24228,13 @@ ${context2.fileChanges}`;
|
|
|
24206
24228
|
mode: modeLabel,
|
|
24207
24229
|
grader_target: provider.targetName
|
|
24208
24230
|
};
|
|
24209
|
-
return this.parseAgentResult(assistantContent, rubrics, evaluatorRawRequest, details);
|
|
24231
|
+
return this.parseAgentResult(
|
|
24232
|
+
assistantContent,
|
|
24233
|
+
rubrics,
|
|
24234
|
+
evaluatorRawRequest,
|
|
24235
|
+
details,
|
|
24236
|
+
provider.targetName
|
|
24237
|
+
);
|
|
24210
24238
|
} catch (error) {
|
|
24211
24239
|
const message = error instanceof Error ? error.message : String(error);
|
|
24212
24240
|
return {
|
|
@@ -24217,6 +24245,7 @@ ${context2.fileChanges}`;
|
|
|
24217
24245
|
],
|
|
24218
24246
|
expectedAspectCount: 1,
|
|
24219
24247
|
evaluatorRawRequest,
|
|
24248
|
+
graderTarget: provider.targetName,
|
|
24220
24249
|
details: {
|
|
24221
24250
|
mode: modeLabel,
|
|
24222
24251
|
grader_target: provider.targetName,
|
|
@@ -24361,7 +24390,7 @@ ${outputSchema2}`;
|
|
|
24361
24390
|
* Parse the agent's response text into an EvaluationScore.
|
|
24362
24391
|
* Supports both freeform and rubric modes.
|
|
24363
24392
|
*/
|
|
24364
|
-
parseAgentResult(text2, rubrics, evaluatorRawRequest, details) {
|
|
24393
|
+
parseAgentResult(text2, rubrics, evaluatorRawRequest, details, graderTarget) {
|
|
24365
24394
|
try {
|
|
24366
24395
|
const parsed = parseJsonFromText(text2);
|
|
24367
24396
|
if (rubrics && rubrics.length > 0) {
|
|
@@ -24373,6 +24402,7 @@ ${outputSchema2}`;
|
|
|
24373
24402
|
assertions: assertions2,
|
|
24374
24403
|
expectedAspectCount: rubrics.length,
|
|
24375
24404
|
evaluatorRawRequest,
|
|
24405
|
+
graderTarget,
|
|
24376
24406
|
details
|
|
24377
24407
|
};
|
|
24378
24408
|
}
|
|
@@ -24385,6 +24415,7 @@ ${outputSchema2}`;
|
|
|
24385
24415
|
assertions,
|
|
24386
24416
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
24387
24417
|
evaluatorRawRequest,
|
|
24418
|
+
graderTarget,
|
|
24388
24419
|
details
|
|
24389
24420
|
};
|
|
24390
24421
|
} catch {
|
|
@@ -24399,6 +24430,7 @@ ${outputSchema2}`;
|
|
|
24399
24430
|
],
|
|
24400
24431
|
expectedAspectCount: 1,
|
|
24401
24432
|
evaluatorRawRequest,
|
|
24433
|
+
graderTarget,
|
|
24402
24434
|
details
|
|
24403
24435
|
};
|
|
24404
24436
|
}
|
|
@@ -28331,14 +28363,22 @@ async function runEvaluation(options) {
|
|
|
28331
28363
|
const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
|
|
28332
28364
|
const resolvedRetainOnSuccess = retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
|
|
28333
28365
|
const resolvedRetainOnFailure = retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
|
|
28334
|
-
const
|
|
28335
|
-
const workers = hasSharedWorkspace && !usePool ? 1 : requestedWorkers;
|
|
28366
|
+
const workers = options.maxConcurrency ?? target.workers ?? 1;
|
|
28336
28367
|
setupLog(
|
|
28337
|
-
`sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool}
|
|
28368
|
+
`sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} workers=${workers}`
|
|
28338
28369
|
);
|
|
28339
|
-
if (hasSharedWorkspace && !usePool &&
|
|
28370
|
+
if (hasSharedWorkspace && !usePool && workers > 1) {
|
|
28340
28371
|
console.warn(
|
|
28341
|
-
|
|
28372
|
+
[
|
|
28373
|
+
`Warning: This eval uses a shared workspace with ${workers} workers.`,
|
|
28374
|
+
"If the agent under test makes file edits, concurrent runs may corrupt each other.",
|
|
28375
|
+
"To limit concurrency, add this to your eval YAML:",
|
|
28376
|
+
"",
|
|
28377
|
+
" execution:",
|
|
28378
|
+
" workers: 1",
|
|
28379
|
+
"",
|
|
28380
|
+
"Or pass --workers 1 on the command line."
|
|
28381
|
+
].join("\n")
|
|
28342
28382
|
);
|
|
28343
28383
|
}
|
|
28344
28384
|
const limit = pLimit(workers);
|
|
@@ -28621,7 +28661,8 @@ async function runEvaluation(options) {
|
|
|
28621
28661
|
streamCallbacks,
|
|
28622
28662
|
typeRegistry,
|
|
28623
28663
|
repoManager,
|
|
28624
|
-
evalDir
|
|
28664
|
+
evalDir,
|
|
28665
|
+
verbose
|
|
28625
28666
|
};
|
|
28626
28667
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
28627
28668
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -28701,7 +28742,8 @@ async function runEvaluation(options) {
|
|
|
28701
28742
|
promptInputs,
|
|
28702
28743
|
primaryProvider,
|
|
28703
28744
|
"agent",
|
|
28704
|
-
"provider_error"
|
|
28745
|
+
"provider_error",
|
|
28746
|
+
verbose
|
|
28705
28747
|
);
|
|
28706
28748
|
results.push(errorResult);
|
|
28707
28749
|
if (onResult) {
|
|
@@ -28774,6 +28816,7 @@ async function runBatchEvaluation(options) {
|
|
|
28774
28816
|
nowFn,
|
|
28775
28817
|
onProgress,
|
|
28776
28818
|
onResult,
|
|
28819
|
+
verbose,
|
|
28777
28820
|
resolveGraderProvider,
|
|
28778
28821
|
agentTimeoutMs,
|
|
28779
28822
|
targetResolver,
|
|
@@ -28861,7 +28904,8 @@ async function runBatchEvaluation(options) {
|
|
|
28861
28904
|
startTime,
|
|
28862
28905
|
endTime,
|
|
28863
28906
|
targetResolver,
|
|
28864
|
-
availableTargets
|
|
28907
|
+
availableTargets,
|
|
28908
|
+
verbose
|
|
28865
28909
|
});
|
|
28866
28910
|
if (providerError) {
|
|
28867
28911
|
result = {
|
|
@@ -28882,7 +28926,8 @@ async function runBatchEvaluation(options) {
|
|
|
28882
28926
|
promptInputs,
|
|
28883
28927
|
provider,
|
|
28884
28928
|
"evaluator",
|
|
28885
|
-
"evaluator_error"
|
|
28929
|
+
"evaluator_error",
|
|
28930
|
+
verbose
|
|
28886
28931
|
);
|
|
28887
28932
|
results.push(errorResult);
|
|
28888
28933
|
if (onResult) {
|
|
@@ -28945,7 +28990,8 @@ async function runEvalCase(options) {
|
|
|
28945
28990
|
suiteWorkspaceFile,
|
|
28946
28991
|
typeRegistry: providedTypeRegistry,
|
|
28947
28992
|
repoManager,
|
|
28948
|
-
evalDir
|
|
28993
|
+
evalDir,
|
|
28994
|
+
verbose
|
|
28949
28995
|
} = options;
|
|
28950
28996
|
const setupDebug = process.env.AGENTV_SETUP_DEBUG === "1";
|
|
28951
28997
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
@@ -28982,7 +29028,8 @@ async function runEvalCase(options) {
|
|
|
28982
29028
|
promptInputs,
|
|
28983
29029
|
provider,
|
|
28984
29030
|
"setup",
|
|
28985
|
-
"template_error"
|
|
29031
|
+
"template_error",
|
|
29032
|
+
verbose
|
|
28986
29033
|
);
|
|
28987
29034
|
}
|
|
28988
29035
|
if (caseWorkspaceFile && workspacePath) {
|
|
@@ -29011,7 +29058,8 @@ async function runEvalCase(options) {
|
|
|
29011
29058
|
promptInputs,
|
|
29012
29059
|
provider,
|
|
29013
29060
|
"repo_setup",
|
|
29014
|
-
"local_path_not_found"
|
|
29061
|
+
"local_path_not_found",
|
|
29062
|
+
verbose
|
|
29015
29063
|
);
|
|
29016
29064
|
}
|
|
29017
29065
|
}
|
|
@@ -29037,7 +29085,8 @@ async function runEvalCase(options) {
|
|
|
29037
29085
|
promptInputs,
|
|
29038
29086
|
provider,
|
|
29039
29087
|
"repo_setup",
|
|
29040
|
-
"clone_error"
|
|
29088
|
+
"clone_error",
|
|
29089
|
+
verbose
|
|
29041
29090
|
);
|
|
29042
29091
|
}
|
|
29043
29092
|
}
|
|
@@ -29063,7 +29112,8 @@ async function runEvalCase(options) {
|
|
|
29063
29112
|
promptInputs,
|
|
29064
29113
|
provider,
|
|
29065
29114
|
"setup",
|
|
29066
|
-
"file_copy_error"
|
|
29115
|
+
"file_copy_error",
|
|
29116
|
+
verbose
|
|
29067
29117
|
);
|
|
29068
29118
|
}
|
|
29069
29119
|
}
|
|
@@ -29108,7 +29158,8 @@ async function runEvalCase(options) {
|
|
|
29108
29158
|
promptInputs,
|
|
29109
29159
|
provider,
|
|
29110
29160
|
"setup",
|
|
29111
|
-
"script_error"
|
|
29161
|
+
"script_error",
|
|
29162
|
+
verbose
|
|
29112
29163
|
);
|
|
29113
29164
|
}
|
|
29114
29165
|
}
|
|
@@ -29139,7 +29190,8 @@ async function runEvalCase(options) {
|
|
|
29139
29190
|
promptInputs,
|
|
29140
29191
|
provider,
|
|
29141
29192
|
"setup",
|
|
29142
|
-
"script_error"
|
|
29193
|
+
"script_error",
|
|
29194
|
+
verbose
|
|
29143
29195
|
);
|
|
29144
29196
|
}
|
|
29145
29197
|
}
|
|
@@ -29183,7 +29235,8 @@ async function runEvalCase(options) {
|
|
|
29183
29235
|
promptInputs,
|
|
29184
29236
|
provider,
|
|
29185
29237
|
"agent",
|
|
29186
|
-
"provider_error"
|
|
29238
|
+
"provider_error",
|
|
29239
|
+
verbose
|
|
29187
29240
|
);
|
|
29188
29241
|
if (workspacePath) {
|
|
29189
29242
|
if (forceCleanup) {
|
|
@@ -29204,7 +29257,8 @@ async function runEvalCase(options) {
|
|
|
29204
29257
|
promptInputs,
|
|
29205
29258
|
provider,
|
|
29206
29259
|
"agent",
|
|
29207
|
-
"provider_error"
|
|
29260
|
+
"provider_error",
|
|
29261
|
+
verbose
|
|
29208
29262
|
);
|
|
29209
29263
|
if (workspacePath) {
|
|
29210
29264
|
if (forceCleanup) {
|
|
@@ -29299,7 +29353,8 @@ async function runEvalCase(options) {
|
|
|
29299
29353
|
targetResolver,
|
|
29300
29354
|
availableTargets,
|
|
29301
29355
|
fileChanges,
|
|
29302
|
-
workspacePath
|
|
29356
|
+
workspacePath,
|
|
29357
|
+
verbose
|
|
29303
29358
|
});
|
|
29304
29359
|
const totalDurationMs = Date.now() - caseStartMs;
|
|
29305
29360
|
const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
|
|
@@ -29354,7 +29409,8 @@ async function runEvalCase(options) {
|
|
|
29354
29409
|
promptInputs,
|
|
29355
29410
|
provider,
|
|
29356
29411
|
"evaluator",
|
|
29357
|
-
"evaluator_error"
|
|
29412
|
+
"evaluator_error",
|
|
29413
|
+
verbose
|
|
29358
29414
|
);
|
|
29359
29415
|
if (workspacePath && !isSharedWorkspace) {
|
|
29360
29416
|
if (forceCleanup || (retainOnFailure ?? "keep") === "cleanup") {
|
|
@@ -29496,7 +29552,7 @@ async function evaluateCandidate(options) {
|
|
|
29496
29552
|
let lmRequest;
|
|
29497
29553
|
if (isAgentProvider(provider)) {
|
|
29498
29554
|
agentRequest = {
|
|
29499
|
-
|
|
29555
|
+
...options.verbose ? { input: promptInputs.question } : {}
|
|
29500
29556
|
};
|
|
29501
29557
|
} else {
|
|
29502
29558
|
if (promptInputs.chatPrompt) {
|
|
@@ -29510,8 +29566,9 @@ async function evaluateCandidate(options) {
|
|
|
29510
29566
|
}
|
|
29511
29567
|
}
|
|
29512
29568
|
const evaluatorRequest = scores ? void 0 : score.evaluatorRawRequest;
|
|
29513
|
-
const
|
|
29514
|
-
|
|
29569
|
+
const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
|
|
29570
|
+
const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
|
|
29571
|
+
...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
|
|
29515
29572
|
...lmRequest ? { lm: lmRequest } : {},
|
|
29516
29573
|
...evaluatorRequest ? { evaluator: evaluatorRequest } : {}
|
|
29517
29574
|
} : void 0;
|
|
@@ -29531,9 +29588,9 @@ async function evaluateCandidate(options) {
|
|
|
29531
29588
|
endTime,
|
|
29532
29589
|
requests,
|
|
29533
29590
|
input,
|
|
29591
|
+
output: output ?? [{ role: "assistant", content: candidate }],
|
|
29534
29592
|
scores,
|
|
29535
29593
|
trace: trace2,
|
|
29536
|
-
output: output ?? [{ role: "assistant", content: candidate }],
|
|
29537
29594
|
fileChanges,
|
|
29538
29595
|
executionStatus: classifyQualityStatus(score.score)
|
|
29539
29596
|
};
|
|
@@ -29699,6 +29756,7 @@ async function runEvaluatorList(options) {
|
|
|
29699
29756
|
verdict: score2.verdict,
|
|
29700
29757
|
assertions: score2.assertions,
|
|
29701
29758
|
input: score2.evaluatorRawRequest,
|
|
29759
|
+
target: score2.graderTarget,
|
|
29702
29760
|
details: score2.details,
|
|
29703
29761
|
scores: mapChildResults(score2.scores),
|
|
29704
29762
|
tokenUsage: score2.tokenUsage,
|
|
@@ -29838,13 +29896,13 @@ async function invokeProvider(provider, options) {
|
|
|
29838
29896
|
}
|
|
29839
29897
|
}
|
|
29840
29898
|
}
|
|
29841
|
-
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
|
|
29899
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode, verbose) {
|
|
29842
29900
|
const message = error instanceof Error ? error.message : String(error);
|
|
29843
29901
|
let agentRequest;
|
|
29844
29902
|
let lmRequest;
|
|
29845
29903
|
if (isAgentProvider(provider)) {
|
|
29846
29904
|
agentRequest = {
|
|
29847
|
-
|
|
29905
|
+
...verbose ? { input: promptInputs.question } : {},
|
|
29848
29906
|
error: message
|
|
29849
29907
|
};
|
|
29850
29908
|
} else {
|
|
@@ -29872,10 +29930,10 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
29872
29930
|
conversationId: evalCase.conversation_id,
|
|
29873
29931
|
score: 0,
|
|
29874
29932
|
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
29875
|
-
output: [{ role: "assistant", content: `Error occurred: ${message}` }],
|
|
29876
29933
|
target: targetName,
|
|
29877
29934
|
requests,
|
|
29878
29935
|
input,
|
|
29936
|
+
output: [{ role: "assistant", content: `Error occurred: ${message}` }],
|
|
29879
29937
|
error: message,
|
|
29880
29938
|
executionStatus: "execution_error",
|
|
29881
29939
|
failureStage,
|
|
@@ -30878,6 +30936,7 @@ export {
|
|
|
30878
30936
|
loadConfig,
|
|
30879
30937
|
extractTargetFromSuite,
|
|
30880
30938
|
extractTargetsFromSuite,
|
|
30939
|
+
extractWorkersFromSuite,
|
|
30881
30940
|
extractTargetsFromTestCase,
|
|
30882
30941
|
extractTrialsConfig,
|
|
30883
30942
|
extractCacheConfig,
|
|
@@ -30992,4 +31051,4 @@ export {
|
|
|
30992
31051
|
OtelStreamingObserver,
|
|
30993
31052
|
createAgentKernel
|
|
30994
31053
|
};
|
|
30995
|
-
//# sourceMappingURL=chunk-TXDPYXHY.js.map
|
|
31054
|
+
//# sourceMappingURL=chunk-OIVGGWJ3.js.map
|