@agentv/core 3.10.3 → 3.11.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-VCFYWLFV.js → chunk-AVTN5AB7.js} +17 -12
- package/dist/chunk-AVTN5AB7.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +173 -135
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -5
- package/dist/index.d.ts +11 -5
- package/dist/index.js +158 -125
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-VCFYWLFV.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1854,6 +1854,64 @@ var import_node_path8 = __toESM(require("path"), 1);
|
|
|
1854
1854
|
var import_micromatch2 = __toESM(require("micromatch"), 1);
|
|
1855
1855
|
var import_yaml4 = require("yaml");
|
|
1856
1856
|
|
|
1857
|
+
// src/evaluation/input-message-utils.ts
|
|
1858
|
+
/**
 * Produces one flat list of content segments for a sequence of messages.
 *
 * Each message's `content` is converted by `extractContentSegments`, and the
 * per-message lists are concatenated in message order.
 *
 * @param {Array<{content: unknown}>} messages - Messages whose `content` is read.
 * @returns {Array<object>} All segments from all messages, in order.
 */
function flattenInputMessages(messages) {
  const segmentsPerMessage = messages.map((message) => extractContentSegments(message.content));
  // Flatten exactly one level: each entry is the segment list of one message.
  return segmentsPerMessage.flat();
}
|
|
1861
|
+
/**
 * Gathers the `resolvedPath` of every file segment found in the given messages.
 *
 * Only messages whose `content` is an array are inspected. A segment counts
 * when `isJsonObject` accepts it, its `type` is "file", and its
 * `resolvedPath` is a string.
 *
 * @param {Array<{content: unknown}>} messages - Messages to scan.
 * @returns {string[]} Resolved paths in encounter order (duplicates are kept).
 */
function collectResolvedInputFilePaths(messages) {
  const isResolvedFileSegment = (candidate) =>
    isJsonObject(candidate) && candidate.type === "file" && typeof candidate.resolvedPath === "string";

  const resolvedPaths = [];
  for (const message of messages) {
    // String or object content cannot carry file segments; only arrays can.
    if (Array.isArray(message.content)) {
      for (const candidate of message.content) {
        if (isResolvedFileSegment(candidate)) {
          resolvedPaths.push(candidate.resolvedPath);
        }
      }
    }
  }
  return resolvedPaths;
}
|
|
1875
|
+
/**
 * Normalizes a message `content` value into a list of segment objects.
 *
 * - string: one `{ type: "text", value }` segment, or none if the string is blank.
 * - value accepted by `isJsonObject`: pretty-printed with `JSON.stringify(…, null, 2)`
 *   and wrapped as a single text segment.
 * - array: a clone (via `cloneJsonObject`) of every entry accepted by
 *   `isJsonObject`; all other entries are dropped.
 * - anything else: an empty list.
 *
 * @param {unknown} content - The raw message content.
 * @returns {Array<object>} The extracted segments.
 */
function extractContentSegments(content) {
  // Wraps text as a single text segment unless it is empty or whitespace-only.
  const toTextSegments = (text) => (text.trim().length > 0 ? [{ type: "text", value: text }] : []);

  if (typeof content === "string") {
    return toTextSegments(content);
  }
  if (isJsonObject(content)) {
    return toTextSegments(JSON.stringify(content, null, 2));
  }
  if (!Array.isArray(content)) {
    return [];
  }

  const clonedSegments = [];
  for (const entry of content) {
    if (isJsonObject(entry)) {
      clonedSegments.push(cloneJsonObject(entry));
    }
  }
  return clonedSegments;
}
|
|
1895
|
+
/**
 * Creates a deep copy of a plain object by cloning each own enumerable
 * string-keyed property value with `cloneJsonValue`.
 *
 * @param {object} source - The object to copy.
 * @returns {object} A new object with recursively cloned values.
 */
function cloneJsonObject(source) {
  const clonedPairs = [];
  for (const [key, value] of Object.entries(source)) {
    clonedPairs.push([key, cloneJsonValue(value)]);
  }
  return Object.fromEntries(clonedPairs);
}

/**
 * Recursively clones a value: arrays are copied element by element, other
 * non-null objects are copied with `cloneJsonObject`, and everything else
 * (null, primitives, undefined, functions, ...) is returned unchanged.
 *
 * @param {unknown} value - The value to clone.
 * @returns {unknown} The cloned value.
 */
function cloneJsonValue(value) {
  if (Array.isArray(value)) {
    return value.map((element) => cloneJsonValue(element));
  }
  // `typeof null` is "object", so null must be excluded explicitly.
  if (value !== null && typeof value === "object") {
    return cloneJsonObject(value);
  }
  return value;
}
|
|
1914
|
+
|
|
1857
1915
|
// src/evaluation/interpolation.ts
|
|
1858
1916
|
var ENV_VAR_PATTERN = /\$\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*\}\}/g;
|
|
1859
1917
|
function interpolateEnv(value, env) {
|
|
@@ -1941,7 +1999,6 @@ function parseAgentSkillsEvals(parsed, source = "evals.json", baseDir) {
|
|
|
1941
1999
|
id: String(id),
|
|
1942
2000
|
question: prompt,
|
|
1943
2001
|
input: [{ role: "user", content: prompt }],
|
|
1944
|
-
input_segments: [{ type: "text", value: prompt }],
|
|
1945
2002
|
expected_output: evalCase.expected_output ? [{ role: "assistant", content: evalCase.expected_output }] : [],
|
|
1946
2003
|
reference_answer: evalCase.expected_output,
|
|
1947
2004
|
file_paths: filePaths,
|
|
@@ -2194,7 +2251,7 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
2194
2251
|
}
|
|
2195
2252
|
try {
|
|
2196
2253
|
const rawConfig = await (0, import_promises4.readFile)(configPath, "utf8");
|
|
2197
|
-
const parsed = (0, import_yaml2.parse)(rawConfig);
|
|
2254
|
+
const parsed = interpolateEnv((0, import_yaml2.parse)(rawConfig), process.env);
|
|
2198
2255
|
if (!isJsonObject(parsed)) {
|
|
2199
2256
|
logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
|
|
2200
2257
|
continue;
|
|
@@ -2412,6 +2469,27 @@ function parseExecutionDefaults(raw, configPath) {
|
|
|
2412
2469
|
} else if (otelFile !== void 0) {
|
|
2413
2470
|
logWarning(`Invalid execution.otel_file in ${configPath}, expected non-empty string`);
|
|
2414
2471
|
}
|
|
2472
|
+
if (typeof obj.export_otel === "boolean") {
|
|
2473
|
+
result.export_otel = obj.export_otel;
|
|
2474
|
+
} else if (obj.export_otel !== void 0) {
|
|
2475
|
+
logWarning(`Invalid execution.export_otel in ${configPath}, expected boolean`);
|
|
2476
|
+
}
|
|
2477
|
+
const otelBackend = obj.otel_backend;
|
|
2478
|
+
if (typeof otelBackend === "string" && otelBackend.trim().length > 0) {
|
|
2479
|
+
result.otel_backend = otelBackend.trim();
|
|
2480
|
+
} else if (otelBackend !== void 0) {
|
|
2481
|
+
logWarning(`Invalid execution.otel_backend in ${configPath}, expected non-empty string`);
|
|
2482
|
+
}
|
|
2483
|
+
if (typeof obj.otel_capture_content === "boolean") {
|
|
2484
|
+
result.otel_capture_content = obj.otel_capture_content;
|
|
2485
|
+
} else if (obj.otel_capture_content !== void 0) {
|
|
2486
|
+
logWarning(`Invalid execution.otel_capture_content in ${configPath}, expected boolean`);
|
|
2487
|
+
}
|
|
2488
|
+
if (typeof obj.otel_group_turns === "boolean") {
|
|
2489
|
+
result.otel_group_turns = obj.otel_group_turns;
|
|
2490
|
+
} else if (obj.otel_group_turns !== void 0) {
|
|
2491
|
+
logWarning(`Invalid execution.otel_group_turns in ${configPath}, expected boolean`);
|
|
2492
|
+
}
|
|
2415
2493
|
if (typeof obj.pool_workspaces === "boolean") {
|
|
2416
2494
|
result.pool_workspaces = obj.pool_workspaces;
|
|
2417
2495
|
} else if (obj.pool_workspaces !== void 0) {
|
|
@@ -3882,27 +3960,28 @@ var ANSI_YELLOW5 = "\x1B[33m";
|
|
|
3882
3960
|
var ANSI_RESET6 = "\x1B[0m";
|
|
3883
3961
|
async function processMessages(options) {
|
|
3884
3962
|
const { messages, searchRoots, repoRootPath, textParts, messageType, verbose } = options;
|
|
3885
|
-
const
|
|
3963
|
+
const processedMessages = [];
|
|
3886
3964
|
for (const message of messages) {
|
|
3887
3965
|
const content = message.content;
|
|
3888
3966
|
if (typeof content === "string") {
|
|
3889
|
-
segments.push({ type: "text", value: content });
|
|
3890
3967
|
if (textParts) {
|
|
3891
3968
|
textParts.push(content);
|
|
3892
3969
|
}
|
|
3970
|
+
processedMessages.push({ ...message, content });
|
|
3893
3971
|
continue;
|
|
3894
3972
|
}
|
|
3895
3973
|
if (isJsonObject(content)) {
|
|
3896
3974
|
const rendered = JSON.stringify(content, null, 2);
|
|
3897
|
-
segments.push({ type: "text", value: rendered });
|
|
3898
3975
|
if (textParts) {
|
|
3899
3976
|
textParts.push(rendered);
|
|
3900
3977
|
}
|
|
3978
|
+
processedMessages.push({ ...message, content: cloneJsonObject(content) });
|
|
3901
3979
|
continue;
|
|
3902
3980
|
}
|
|
3903
3981
|
if (!Array.isArray(content)) {
|
|
3904
3982
|
continue;
|
|
3905
3983
|
}
|
|
3984
|
+
const processedContent = [];
|
|
3906
3985
|
for (const rawSegment of content) {
|
|
3907
3986
|
if (!isJsonObject(rawSegment)) {
|
|
3908
3987
|
continue;
|
|
@@ -3925,8 +4004,8 @@ async function processMessages(options) {
|
|
|
3925
4004
|
}
|
|
3926
4005
|
try {
|
|
3927
4006
|
const fileContent = (await (0, import_promises6.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
3928
|
-
|
|
3929
|
-
|
|
4007
|
+
processedContent.push({
|
|
4008
|
+
...cloneJsonObject(rawSegment),
|
|
3930
4009
|
path: displayPath,
|
|
3931
4010
|
text: fileContent,
|
|
3932
4011
|
resolvedPath: import_node_path6.default.resolve(resolvedPath)
|
|
@@ -3943,37 +4022,19 @@ async function processMessages(options) {
|
|
|
3943
4022
|
continue;
|
|
3944
4023
|
}
|
|
3945
4024
|
const clonedSegment = cloneJsonObject(rawSegment);
|
|
3946
|
-
|
|
4025
|
+
processedContent.push(clonedSegment);
|
|
3947
4026
|
const inlineValue = clonedSegment.value;
|
|
3948
4027
|
if (typeof inlineValue === "string" && textParts) {
|
|
3949
4028
|
textParts.push(inlineValue);
|
|
3950
4029
|
}
|
|
3951
4030
|
}
|
|
4031
|
+
processedMessages.push({ ...message, content: processedContent });
|
|
3952
4032
|
}
|
|
3953
|
-
return
|
|
4033
|
+
return processedMessages;
|
|
3954
4034
|
}
|
|
3955
4035
|
/**
 * Narrows a value to a string.
 *
 * @param {unknown} value - Any value.
 * @returns {string|undefined} `value` itself when it is a string, otherwise `undefined`.
 */
function asString3(value) {
  if (typeof value !== "string") {
    return void 0;
  }
  return value;
}
|
|
3958
|
-
function cloneJsonObject(source) {
|
|
3959
|
-
const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
|
|
3960
|
-
return Object.fromEntries(entries);
|
|
3961
|
-
}
|
|
3962
|
-
function cloneJsonValue(value) {
|
|
3963
|
-
if (value === null) {
|
|
3964
|
-
return null;
|
|
3965
|
-
}
|
|
3966
|
-
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
|
|
3967
|
-
return value;
|
|
3968
|
-
}
|
|
3969
|
-
if (Array.isArray(value)) {
|
|
3970
|
-
return value.map((item) => cloneJsonValue(item));
|
|
3971
|
-
}
|
|
3972
|
-
if (typeof value === "object") {
|
|
3973
|
-
return cloneJsonObject(value);
|
|
3974
|
-
}
|
|
3975
|
-
return value;
|
|
3976
|
-
}
|
|
3977
4038
|
function logWarning3(message, details) {
|
|
3978
4039
|
if (details && details.length > 0) {
|
|
3979
4040
|
const detailBlock = details.join("\n");
|
|
@@ -4222,10 +4283,10 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4222
4283
|
);
|
|
4223
4284
|
}
|
|
4224
4285
|
}
|
|
4225
|
-
const
|
|
4286
|
+
const rawInputMessages = resolveInputMessages(evalcase);
|
|
4226
4287
|
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
4227
4288
|
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
|
|
4228
|
-
if (!id || !hasEvaluationSpec || !
|
|
4289
|
+
if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
|
|
4229
4290
|
logError2(
|
|
4230
4291
|
`Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
|
|
4231
4292
|
);
|
|
@@ -4233,8 +4294,8 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4233
4294
|
}
|
|
4234
4295
|
const hasExpectedMessages = expectedMessages.length > 0;
|
|
4235
4296
|
const inputTextParts = [];
|
|
4236
|
-
const
|
|
4237
|
-
messages:
|
|
4297
|
+
const inputMessages = await processMessages({
|
|
4298
|
+
messages: rawInputMessages,
|
|
4238
4299
|
searchRoots,
|
|
4239
4300
|
repoRootPath,
|
|
4240
4301
|
textParts: inputTextParts,
|
|
@@ -4280,19 +4341,13 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4280
4341
|
}
|
|
4281
4342
|
}
|
|
4282
4343
|
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
4283
|
-
const userFilePaths =
|
|
4284
|
-
for (const segment of inputSegments) {
|
|
4285
|
-
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
4286
|
-
userFilePaths.push(segment.resolvedPath);
|
|
4287
|
-
}
|
|
4288
|
-
}
|
|
4344
|
+
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
4289
4345
|
const testCase = {
|
|
4290
4346
|
id,
|
|
4291
4347
|
eval_set: evalSetName,
|
|
4292
4348
|
conversation_id: conversationId,
|
|
4293
4349
|
question,
|
|
4294
4350
|
input: inputMessages,
|
|
4295
|
-
input_segments: inputSegments,
|
|
4296
4351
|
expected_output: outputSegments,
|
|
4297
4352
|
reference_answer: referenceAnswer,
|
|
4298
4353
|
file_paths: userFilePaths,
|
|
@@ -4358,50 +4413,9 @@ function parseMetadata(suite) {
|
|
|
4358
4413
|
|
|
4359
4414
|
// src/evaluation/formatting/prompt-builder.ts
|
|
4360
4415
|
async function buildPromptInputs(testCase, mode = "lm") {
|
|
4361
|
-
const segmentsByMessage =
|
|
4362
|
-
|
|
4363
|
-
|
|
4364
|
-
if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
|
|
4365
|
-
fileContentsByPath.set(segment.path, segment.text);
|
|
4366
|
-
}
|
|
4367
|
-
}
|
|
4368
|
-
for (const message of testCase.input) {
|
|
4369
|
-
const messageSegments = [];
|
|
4370
|
-
if (typeof message.content === "string") {
|
|
4371
|
-
if (message.content.trim().length > 0) {
|
|
4372
|
-
messageSegments.push({ type: "text", value: message.content });
|
|
4373
|
-
}
|
|
4374
|
-
} else if (Array.isArray(message.content)) {
|
|
4375
|
-
for (const segment of message.content) {
|
|
4376
|
-
if (typeof segment === "string") {
|
|
4377
|
-
if (segment.trim().length > 0) {
|
|
4378
|
-
messageSegments.push({ type: "text", value: segment });
|
|
4379
|
-
}
|
|
4380
|
-
} else if (isJsonObject(segment)) {
|
|
4381
|
-
const type = asString5(segment.type);
|
|
4382
|
-
if (type === "file") {
|
|
4383
|
-
const value = asString5(segment.value);
|
|
4384
|
-
if (!value) continue;
|
|
4385
|
-
const fileText = fileContentsByPath.get(value);
|
|
4386
|
-
if (fileText !== void 0) {
|
|
4387
|
-
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
4388
|
-
}
|
|
4389
|
-
} else if (type === "text") {
|
|
4390
|
-
const textValue = asString5(segment.value);
|
|
4391
|
-
if (textValue && textValue.trim().length > 0) {
|
|
4392
|
-
messageSegments.push({ type: "text", value: textValue });
|
|
4393
|
-
}
|
|
4394
|
-
}
|
|
4395
|
-
}
|
|
4396
|
-
}
|
|
4397
|
-
} else if (isJsonObject(message.content)) {
|
|
4398
|
-
const rendered = JSON.stringify(message.content, null, 2);
|
|
4399
|
-
if (rendered.trim().length > 0) {
|
|
4400
|
-
messageSegments.push({ type: "text", value: rendered });
|
|
4401
|
-
}
|
|
4402
|
-
}
|
|
4403
|
-
segmentsByMessage.push(messageSegments);
|
|
4404
|
-
}
|
|
4416
|
+
const segmentsByMessage = testCase.input.map(
|
|
4417
|
+
(message) => extractContentSegments(message.content)
|
|
4418
|
+
);
|
|
4405
4419
|
const useRoleMarkers = needsRoleMarkers(testCase.input, segmentsByMessage);
|
|
4406
4420
|
let question;
|
|
4407
4421
|
if (useRoleMarkers) {
|
|
@@ -4429,7 +4443,7 @@ ${messageContent}`);
|
|
|
4429
4443
|
question = messageParts.join("\n\n");
|
|
4430
4444
|
} else {
|
|
4431
4445
|
const questionParts = [];
|
|
4432
|
-
for (const segment of testCase.
|
|
4446
|
+
for (const segment of flattenInputMessages(testCase.input)) {
|
|
4433
4447
|
const formattedContent = formatSegment(segment, mode);
|
|
4434
4448
|
if (formattedContent) {
|
|
4435
4449
|
questionParts.push(formattedContent);
|
|
@@ -4516,9 +4530,6 @@ function buildChatPromptFromSegments(options) {
|
|
|
4516
4530
|
}
|
|
4517
4531
|
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
4518
4532
|
}
|
|
4519
|
-
function asString5(value) {
|
|
4520
|
-
return typeof value === "string" ? value : void 0;
|
|
4521
|
-
}
|
|
4522
4533
|
|
|
4523
4534
|
// src/evaluation/yaml-parser.ts
|
|
4524
4535
|
var ANSI_YELLOW7 = "\x1B[33m";
|
|
@@ -4601,7 +4612,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4601
4612
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
4602
4613
|
}
|
|
4603
4614
|
const suite = interpolated;
|
|
4604
|
-
const evalSetNameFromSuite =
|
|
4615
|
+
const evalSetNameFromSuite = asString5(suite.name)?.trim();
|
|
4605
4616
|
const fallbackEvalSet = import_node_path8.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
4606
4617
|
const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
|
|
4607
4618
|
const rawTestcases = resolveTests(suite);
|
|
@@ -4620,7 +4631,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4620
4631
|
const suiteInputMessages = expandInputShorthand(suite.input);
|
|
4621
4632
|
const suiteInputFiles = suite.input_files;
|
|
4622
4633
|
const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
4623
|
-
const _globalTarget =
|
|
4634
|
+
const _globalTarget = asString5(rawGlobalExecution?.target) ?? asString5(suite.target);
|
|
4624
4635
|
const suiteAssertions = suite.assertions ?? suite.assert;
|
|
4625
4636
|
if (suite.assert !== void 0 && suite.assertions === void 0) {
|
|
4626
4637
|
logWarning5("'assert' is deprecated at the suite level. Use 'assertions' instead.");
|
|
@@ -4633,17 +4644,17 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4633
4644
|
continue;
|
|
4634
4645
|
}
|
|
4635
4646
|
const evalcase = rawEvalcase;
|
|
4636
|
-
const id =
|
|
4647
|
+
const id = asString5(evalcase.id);
|
|
4637
4648
|
if (filterPattern && (!id || !import_micromatch2.default.isMatch(id, filterPattern))) {
|
|
4638
4649
|
continue;
|
|
4639
4650
|
}
|
|
4640
|
-
const conversationId =
|
|
4641
|
-
let outcome =
|
|
4651
|
+
const conversationId = asString5(evalcase.conversation_id);
|
|
4652
|
+
let outcome = asString5(evalcase.criteria);
|
|
4642
4653
|
if (!outcome && evalcase.expected_outcome !== void 0) {
|
|
4643
|
-
outcome =
|
|
4654
|
+
outcome = asString5(evalcase.expected_outcome);
|
|
4644
4655
|
if (outcome) {
|
|
4645
4656
|
logWarning5(
|
|
4646
|
-
`Test '${
|
|
4657
|
+
`Test '${asString5(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
|
|
4647
4658
|
);
|
|
4648
4659
|
}
|
|
4649
4660
|
}
|
|
@@ -4660,10 +4671,9 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4660
4671
|
continue;
|
|
4661
4672
|
}
|
|
4662
4673
|
const effectiveSuiteInputMessages = suiteInputMessages && !skipDefaults ? suiteInputMessages : void 0;
|
|
4663
|
-
const inputMessages = effectiveSuiteInputMessages ? [...effectiveSuiteInputMessages, ...testInputMessages] : testInputMessages;
|
|
4664
4674
|
const hasExpectedMessages = expectedMessages.length > 0;
|
|
4665
4675
|
const inputTextParts = [];
|
|
4666
|
-
const
|
|
4676
|
+
const suiteResolvedInputMessages = effectiveSuiteInputMessages ? await processMessages({
|
|
4667
4677
|
messages: effectiveSuiteInputMessages,
|
|
4668
4678
|
searchRoots,
|
|
4669
4679
|
repoRootPath,
|
|
@@ -4671,7 +4681,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4671
4681
|
messageType: "input",
|
|
4672
4682
|
verbose
|
|
4673
4683
|
}) : [];
|
|
4674
|
-
const
|
|
4684
|
+
const testResolvedInputMessages = await processMessages({
|
|
4675
4685
|
messages: testInputMessages,
|
|
4676
4686
|
searchRoots,
|
|
4677
4687
|
repoRootPath,
|
|
@@ -4679,7 +4689,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4679
4689
|
messageType: "input",
|
|
4680
4690
|
verbose
|
|
4681
4691
|
});
|
|
4682
|
-
const
|
|
4692
|
+
const inputMessages = [...suiteResolvedInputMessages, ...testResolvedInputMessages];
|
|
4683
4693
|
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
4684
4694
|
messages: expectedMessages,
|
|
4685
4695
|
searchRoots,
|
|
@@ -4717,12 +4727,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4717
4727
|
}
|
|
4718
4728
|
}
|
|
4719
4729
|
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
4720
|
-
const userFilePaths =
|
|
4721
|
-
for (const segment of inputSegments) {
|
|
4722
|
-
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
4723
|
-
userFilePaths.push(segment.resolvedPath);
|
|
4724
|
-
}
|
|
4725
|
-
}
|
|
4730
|
+
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
4726
4731
|
const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
|
|
4727
4732
|
const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
|
|
4728
4733
|
const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
|
|
@@ -4733,7 +4738,6 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4733
4738
|
conversation_id: conversationId,
|
|
4734
4739
|
question,
|
|
4735
4740
|
input: inputMessages,
|
|
4736
|
-
input_segments: inputSegments,
|
|
4737
4741
|
expected_output: outputSegments,
|
|
4738
4742
|
reference_answer: referenceAnswer,
|
|
4739
4743
|
file_paths: userFilePaths,
|
|
@@ -4942,7 +4946,7 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
|
|
|
4942
4946
|
path: caseLevel.path ?? suiteLevel.path
|
|
4943
4947
|
};
|
|
4944
4948
|
}
|
|
4945
|
-
function
|
|
4949
|
+
/**
 * Narrows a value to a string.
 *
 * @param {unknown} value - Any value.
 * @returns {string|undefined} `value` itself when it is a string, otherwise `undefined`.
 */
function asString5(value) {
  if (typeof value !== "string") {
    return void 0;
  }
  return value;
}
|
|
4948
4952
|
function logWarning5(message, details) {
|
|
@@ -8813,7 +8817,7 @@ var PiAgentSdkProvider = class {
|
|
|
8813
8817
|
const { Agent, getModel, getEnvApiKey } = await loadPiModules();
|
|
8814
8818
|
const startTimeIso = (/* @__PURE__ */ new Date()).toISOString();
|
|
8815
8819
|
const startMs = Date.now();
|
|
8816
|
-
const providerName = this.config.
|
|
8820
|
+
const providerName = this.config.subprovider ?? "anthropic";
|
|
8817
8821
|
const modelId = this.config.model ?? "claude-sonnet-4-20250514";
|
|
8818
8822
|
const model = getModel(providerName, modelId);
|
|
8819
8823
|
const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
|
|
@@ -8925,7 +8929,7 @@ var PiAgentSdkProvider = class {
|
|
|
8925
8929
|
messages: agentMessages,
|
|
8926
8930
|
systemPrompt,
|
|
8927
8931
|
model: this.config.model,
|
|
8928
|
-
|
|
8932
|
+
subprovider: this.config.subprovider
|
|
8929
8933
|
},
|
|
8930
8934
|
output,
|
|
8931
8935
|
tokenUsage,
|
|
@@ -9161,8 +9165,8 @@ var PiCodingAgentProvider = class {
|
|
|
9161
9165
|
}
|
|
9162
9166
|
buildPiArgs(prompt, inputFiles, _captureFileChanges) {
|
|
9163
9167
|
const args = [];
|
|
9164
|
-
if (this.config.
|
|
9165
|
-
args.push("--provider", this.config.
|
|
9168
|
+
if (this.config.subprovider) {
|
|
9169
|
+
args.push("--provider", this.config.subprovider);
|
|
9166
9170
|
}
|
|
9167
9171
|
if (this.config.model) {
|
|
9168
9172
|
args.push("--model", this.config.model);
|
|
@@ -9220,7 +9224,7 @@ ${prompt}` : prompt;
|
|
|
9220
9224
|
buildEnv() {
|
|
9221
9225
|
const env = { ...process.env };
|
|
9222
9226
|
if (this.config.apiKey) {
|
|
9223
|
-
const provider = this.config.
|
|
9227
|
+
const provider = this.config.subprovider?.toLowerCase() ?? "google";
|
|
9224
9228
|
switch (provider) {
|
|
9225
9229
|
case "google":
|
|
9226
9230
|
case "gemini":
|
|
@@ -10531,7 +10535,7 @@ function normalizeCopilotLogFormat(value) {
|
|
|
10531
10535
|
}
|
|
10532
10536
|
function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
10533
10537
|
const executableSource = target.executable ?? target.command ?? target.binary;
|
|
10534
|
-
const
|
|
10538
|
+
const subproviderSource = target.subprovider;
|
|
10535
10539
|
const modelSource = target.model ?? target.pi_model ?? target.piModel;
|
|
10536
10540
|
const apiKeySource = target.api_key ?? target.apiKey;
|
|
10537
10541
|
const toolsSource = target.tools ?? target.pi_tools ?? target.piTools;
|
|
@@ -10547,10 +10551,15 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
10547
10551
|
allowLiteral: true,
|
|
10548
10552
|
optionalEnv: true
|
|
10549
10553
|
}) ?? "pi";
|
|
10550
|
-
const
|
|
10551
|
-
|
|
10552
|
-
|
|
10553
|
-
|
|
10554
|
+
const subprovider = resolveOptionalString(
|
|
10555
|
+
subproviderSource,
|
|
10556
|
+
env,
|
|
10557
|
+
`${target.name} pi subprovider`,
|
|
10558
|
+
{
|
|
10559
|
+
allowLiteral: true,
|
|
10560
|
+
optionalEnv: true
|
|
10561
|
+
}
|
|
10562
|
+
);
|
|
10554
10563
|
const model = resolveOptionalString(modelSource, env, `${target.name} pi model`, {
|
|
10555
10564
|
allowLiteral: true,
|
|
10556
10565
|
optionalEnv: true
|
|
@@ -10598,7 +10607,7 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
10598
10607
|
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
10599
10608
|
return {
|
|
10600
10609
|
executable,
|
|
10601
|
-
|
|
10610
|
+
subprovider,
|
|
10602
10611
|
model,
|
|
10603
10612
|
apiKey,
|
|
10604
10613
|
tools,
|
|
@@ -10613,15 +10622,15 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
10613
10622
|
};
|
|
10614
10623
|
}
|
|
10615
10624
|
function resolvePiAgentSdkConfig(target, env) {
|
|
10616
|
-
const
|
|
10625
|
+
const subproviderSource = target.subprovider;
|
|
10617
10626
|
const modelSource = target.model ?? target.pi_model ?? target.piModel;
|
|
10618
10627
|
const apiKeySource = target.api_key ?? target.apiKey;
|
|
10619
10628
|
const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
|
|
10620
10629
|
const systemPromptSource = target.system_prompt ?? target.systemPrompt;
|
|
10621
|
-
const
|
|
10622
|
-
|
|
10630
|
+
const subprovider = resolveOptionalString(
|
|
10631
|
+
subproviderSource,
|
|
10623
10632
|
env,
|
|
10624
|
-
`${target.name} pi-agent-sdk
|
|
10633
|
+
`${target.name} pi-agent-sdk subprovider`,
|
|
10625
10634
|
{
|
|
10626
10635
|
allowLiteral: true,
|
|
10627
10636
|
optionalEnv: true
|
|
@@ -10638,7 +10647,7 @@ function resolvePiAgentSdkConfig(target, env) {
|
|
|
10638
10647
|
const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi-agent-sdk timeout`);
|
|
10639
10648
|
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
10640
10649
|
return {
|
|
10641
|
-
|
|
10650
|
+
subprovider,
|
|
10642
10651
|
model,
|
|
10643
10652
|
apiKey,
|
|
10644
10653
|
timeoutMs,
|
|
@@ -13300,7 +13309,8 @@ var freeformEvaluationSchema = import_zod4.z.object({
|
|
|
13300
13309
|
passed: import_zod4.z.boolean().describe("Whether this aspect was satisfied"),
|
|
13301
13310
|
evidence: import_zod4.z.string().describe("Concise evidence (1-2 sentences)").optional()
|
|
13302
13311
|
})
|
|
13303
|
-
).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
|
|
13312
|
+
).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional(),
|
|
13313
|
+
details: import_zod4.z.record(import_zod4.z.unknown()).describe("Optional structured metadata for domain-specific metrics").optional()
|
|
13304
13314
|
});
|
|
13305
13315
|
var rubricCheckResultSchema = import_zod4.z.object({
|
|
13306
13316
|
id: import_zod4.z.string().describe("The ID of the rubric item being checked"),
|
|
@@ -13362,7 +13372,7 @@ var LlmGraderEvaluator = class {
|
|
|
13362
13372
|
async evaluateFreeform(context2, graderProvider) {
|
|
13363
13373
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
13364
13374
|
const variables = {
|
|
13365
|
-
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context2.evalCase.
|
|
13375
|
+
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context2.evalCase.input, null, 2),
|
|
13366
13376
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(
|
|
13367
13377
|
context2.evalCase.expected_output,
|
|
13368
13378
|
null,
|
|
@@ -13405,6 +13415,7 @@ ${context2.fileChanges}`;
|
|
|
13405
13415
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
13406
13416
|
evaluatorRawRequest,
|
|
13407
13417
|
graderTarget: graderProvider.targetName,
|
|
13418
|
+
details: data.details,
|
|
13408
13419
|
tokenUsage
|
|
13409
13420
|
};
|
|
13410
13421
|
} catch (e) {
|
|
@@ -13824,7 +13835,7 @@ ${outputSchema}`;
|
|
|
13824
13835
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
13825
13836
|
evaluatorRawRequest,
|
|
13826
13837
|
graderTarget,
|
|
13827
|
-
details
|
|
13838
|
+
details: data.details && Object.keys(data.details).length > 0 ? { ...details, ...data.details } : details
|
|
13828
13839
|
};
|
|
13829
13840
|
} catch {
|
|
13830
13841
|
return {
|
|
@@ -13971,7 +13982,8 @@ function buildOutputSchema() {
|
|
|
13971
13982
|
' "passed": <boolean>,',
|
|
13972
13983
|
' "evidence": "<concise evidence, 1-2 sentences, optional>"',
|
|
13973
13984
|
" }",
|
|
13974
|
-
" ]",
|
|
13985
|
+
" ],",
|
|
13986
|
+
' "details": {<optional object with domain-specific structured metrics>}',
|
|
13975
13987
|
"}"
|
|
13976
13988
|
].join("\n");
|
|
13977
13989
|
}
|
|
@@ -15335,7 +15347,7 @@ function assembleLlmGraderPrompt(input) {
|
|
|
15335
15347
|
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
|
|
15336
15348
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
15337
15349
|
const variables = {
|
|
15338
|
-
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.
|
|
15350
|
+
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2),
|
|
15339
15351
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
|
|
15340
15352
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
|
|
15341
15353
|
[TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
|
|
@@ -17616,6 +17628,18 @@ var QUALITY_PASS_THRESHOLD = 0.8;
|
|
|
17616
17628
|
/**
 * Maps a numeric score to a quality status label.
 *
 * @param {number} score - The score to classify.
 * @returns {"ok"|"quality_failure"} "ok" when the score is at least
 *   `QUALITY_PASS_THRESHOLD`, otherwise "quality_failure".
 */
function classifyQualityStatus(score) {
  if (score >= QUALITY_PASS_THRESHOLD) {
    return "ok";
  }
  return "quality_failure";
}
|
|
17631
|
+
/**
 * Builds one error message describing every evaluator whose verdict is "skip".
 *
 * For each skipped score the label is `score.name`, falling back to
 * `score.type`, and the message is the `text` of the first assertion that did
 * not pass, falling back to "Evaluator skipped".
 *
 * @param {Array<object>|null|undefined} scores - Evaluator score entries; each
 *   is read for `verdict`, `name`, `type` and `assertions`.
 * @returns {string|undefined} `undefined` when no score was skipped;
 *   "<label>: <message>" for a single skipped score; otherwise
 *   "Evaluators skipped: " followed by the entries joined with " | ".
 */
function buildSkippedEvaluatorError(scores) {
  const skippedScores = scores?.filter((score) => score.verdict === "skip") ?? [];
  if (skippedScores.length === 0) {
    return void 0;
  }
  const messages = skippedScores.map((score) => {
    const label = score.name || score.type;
    // A skipped score may have no `assertions` array; use the generic message
    // rather than throwing while building an error report.
    const assertionMessage = score.assertions?.find((assertion) => !assertion.passed)?.text ?? "Evaluator skipped";
    return `${label}: ${assertionMessage}`;
  });
  return messages.length === 1 ? messages[0] : `Evaluators skipped: ${messages.join(" | ")}`;
}
|
|
17619
17643
|
/**
 * Reports whether the provider is an agent provider (per `isAgentProvider`)
 * or has `kind === "cli"`.
 *
 * @param {{kind: string}} provider - The provider to inspect.
 * @returns {*} The truthy result of `isAgentProvider(provider)` when there is
 *   one; otherwise a boolean telling whether `provider.kind` is "cli".
 */
function usesFileReferencePrompt(provider) {
  const agentCheck = isAgentProvider(provider);
  if (agentCheck) {
    return agentCheck;
  }
  return provider.kind === "cli";
}
|
|
@@ -18880,7 +18904,8 @@ async function runEvalCase(options) {
|
|
|
18880
18904
|
durationMs: totalDurationMs,
|
|
18881
18905
|
...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
|
|
18882
18906
|
};
|
|
18883
|
-
const
|
|
18907
|
+
const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
|
|
18908
|
+
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score);
|
|
18884
18909
|
const finalResult = providerError ? {
|
|
18885
18910
|
...result,
|
|
18886
18911
|
evalRun,
|
|
@@ -18892,7 +18917,26 @@ async function runEvalCase(options) {
|
|
|
18892
18917
|
beforeAllOutput,
|
|
18893
18918
|
beforeEachOutput,
|
|
18894
18919
|
afterEachOutput
|
|
18895
|
-
} :
|
|
18920
|
+
} : skippedEvaluatorError ? {
|
|
18921
|
+
...result,
|
|
18922
|
+
score: 0,
|
|
18923
|
+
evalRun,
|
|
18924
|
+
error: skippedEvaluatorError,
|
|
18925
|
+
executionStatus,
|
|
18926
|
+
failureStage: "evaluator",
|
|
18927
|
+
failureReasonCode: "evaluator_error",
|
|
18928
|
+
executionError: { message: skippedEvaluatorError, stage: "evaluator" },
|
|
18929
|
+
beforeAllOutput,
|
|
18930
|
+
beforeEachOutput,
|
|
18931
|
+
afterEachOutput
|
|
18932
|
+
} : {
|
|
18933
|
+
...result,
|
|
18934
|
+
evalRun,
|
|
18935
|
+
executionStatus,
|
|
18936
|
+
beforeAllOutput,
|
|
18937
|
+
beforeEachOutput,
|
|
18938
|
+
afterEachOutput
|
|
18939
|
+
};
|
|
18896
18940
|
const isFailure = !!finalResult.error || finalResult.score < 0.5;
|
|
18897
18941
|
if (workspacePath && !isSharedWorkspace) {
|
|
18898
18942
|
if (forceCleanup) {
|
|
@@ -19637,11 +19681,6 @@ async function evaluate(config) {
|
|
|
19637
19681
|
evalCases = (config.tests ?? []).map((test) => {
|
|
19638
19682
|
const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
|
|
19639
19683
|
const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
|
|
19640
|
-
const inputSegments = input.map((m) => ({
|
|
19641
|
-
type: "text",
|
|
19642
|
-
value: typeof m.content === "string" ? m.content : JSON.stringify(m.content),
|
|
19643
|
-
messageIndex: 0
|
|
19644
|
-
}));
|
|
19645
19684
|
const expectedOutputValue = test.expectedOutput ?? test.expected_output;
|
|
19646
19685
|
const expectedOutput = expectedOutputValue ? [
|
|
19647
19686
|
{ role: "assistant", content: expectedOutputValue }
|
|
@@ -19670,7 +19709,6 @@ async function evaluate(config) {
|
|
|
19670
19709
|
criteria: test.criteria ?? "",
|
|
19671
19710
|
question: String(question),
|
|
19672
19711
|
input,
|
|
19673
|
-
input_segments: inputSegments,
|
|
19674
19712
|
expected_output: expectedOutput,
|
|
19675
19713
|
reference_answer: expectedOutputValue,
|
|
19676
19714
|
file_paths: [],
|