agentv 3.10.2 → 3.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-6UE665XI.js → chunk-ETMDLQ72.js} +1141 -60
- package/dist/chunk-ETMDLQ72.js.map +1 -0
- package/dist/{chunk-KGK5NUFG.js → chunk-EZGWZVVK.js} +377 -163
- package/dist/chunk-EZGWZVVK.js.map +1 -0
- package/dist/{chunk-F7LAJMTO.js → chunk-JEW3FEO7.js} +68 -32
- package/dist/chunk-JEW3FEO7.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-3QUJEJUT.js → dist-QERRYDSC.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-EO6AR2R3.js → interactive-AD4PRYDN.js} +3 -3
- package/package.json +3 -1
- package/dist/chunk-6UE665XI.js.map +0 -1
- package/dist/chunk-F7LAJMTO.js.map +0 -1
- package/dist/chunk-KGK5NUFG.js.map +0 -1
- package/dist/templates/.agents/skills/agentv-chat-to-eval/README.md +0 -84
- package/dist/templates/.agents/skills/agentv-chat-to-eval/SKILL.md +0 -144
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-json.md +0 -67
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-markdown.md +0 -101
- package/dist/templates/.agents/skills/agentv-eval-builder/SKILL.md +0 -458
- package/dist/templates/.agents/skills/agentv-eval-builder/references/config-schema.json +0 -36
- package/dist/templates/.agents/skills/agentv-eval-builder/references/custom-evaluators.md +0 -118
- package/dist/templates/.agents/skills/agentv-eval-builder/references/eval-schema.json +0 -12753
- package/dist/templates/.agents/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -77
- package/dist/templates/.agents/skills/agentv-eval-orchestrator/SKILL.md +0 -50
- package/dist/templates/.agents/skills/agentv-prompt-optimizer/SKILL.md +0 -78
- package/dist/templates/.agentv/.env.example +0 -25
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +0 -177
- package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +0 -316
- package/dist/templates/.claude/skills/agentv-eval-builder/references/compare-command.md +0 -137
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +0 -215
- package/dist/templates/.claude/skills/agentv-eval-builder/references/config-schema.json +0 -27
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +0 -115
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +0 -278
- package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +0 -333
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -79
- package/dist/templates/.claude/skills/agentv-eval-builder/references/structured-data-evaluators.md +0 -121
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +0 -298
- package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +0 -78
- package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +0 -5
- package/dist/templates/.github/prompts/agentv-optimize.prompt.md +0 -4
- /package/dist/{dist-3QUJEJUT.js.map → dist-QERRYDSC.js.map} +0 -0
- /package/dist/{interactive-EO6AR2R3.js.map → interactive-AD4PRYDN.js.map} +0 -0
|
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
|
|
|
301
301
|
}
|
|
302
302
|
});
|
|
303
303
|
|
|
304
|
-
// ../../packages/core/dist/chunk-
|
|
304
|
+
// ../../packages/core/dist/chunk-AVTN5AB7.js
|
|
305
305
|
import { constants } from "node:fs";
|
|
306
306
|
import { access, readFile } from "node:fs/promises";
|
|
307
307
|
import path from "node:path";
|
|
@@ -419,7 +419,7 @@ __export(external_exports2, {
|
|
|
419
419
|
void: () => voidType
|
|
420
420
|
});
|
|
421
421
|
|
|
422
|
-
// ../../packages/core/dist/chunk-
|
|
422
|
+
// ../../packages/core/dist/chunk-AVTN5AB7.js
|
|
423
423
|
import { readFile as readFile2 } from "node:fs/promises";
|
|
424
424
|
import path3 from "node:path";
|
|
425
425
|
import fg from "fast-glob";
|
|
@@ -1363,7 +1363,7 @@ function normalizeCopilotLogFormat(value) {
|
|
|
1363
1363
|
}
|
|
1364
1364
|
function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
1365
1365
|
const executableSource = target.executable ?? target.command ?? target.binary;
|
|
1366
|
-
const
|
|
1366
|
+
const subproviderSource = target.subprovider;
|
|
1367
1367
|
const modelSource = target.model ?? target.pi_model ?? target.piModel;
|
|
1368
1368
|
const apiKeySource = target.api_key ?? target.apiKey;
|
|
1369
1369
|
const toolsSource = target.tools ?? target.pi_tools ?? target.piTools;
|
|
@@ -1379,10 +1379,15 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
1379
1379
|
allowLiteral: true,
|
|
1380
1380
|
optionalEnv: true
|
|
1381
1381
|
}) ?? "pi";
|
|
1382
|
-
const
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1382
|
+
const subprovider = resolveOptionalString(
|
|
1383
|
+
subproviderSource,
|
|
1384
|
+
env,
|
|
1385
|
+
`${target.name} pi subprovider`,
|
|
1386
|
+
{
|
|
1387
|
+
allowLiteral: true,
|
|
1388
|
+
optionalEnv: true
|
|
1389
|
+
}
|
|
1390
|
+
);
|
|
1386
1391
|
const model = resolveOptionalString(modelSource, env, `${target.name} pi model`, {
|
|
1387
1392
|
allowLiteral: true,
|
|
1388
1393
|
optionalEnv: true
|
|
@@ -1430,7 +1435,7 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
1430
1435
|
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
1431
1436
|
return {
|
|
1432
1437
|
executable,
|
|
1433
|
-
|
|
1438
|
+
subprovider,
|
|
1434
1439
|
model,
|
|
1435
1440
|
apiKey,
|
|
1436
1441
|
tools,
|
|
@@ -1445,15 +1450,15 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
1445
1450
|
};
|
|
1446
1451
|
}
|
|
1447
1452
|
function resolvePiAgentSdkConfig(target, env) {
|
|
1448
|
-
const
|
|
1453
|
+
const subproviderSource = target.subprovider;
|
|
1449
1454
|
const modelSource = target.model ?? target.pi_model ?? target.piModel;
|
|
1450
1455
|
const apiKeySource = target.api_key ?? target.apiKey;
|
|
1451
1456
|
const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
|
|
1452
1457
|
const systemPromptSource = target.system_prompt ?? target.systemPrompt;
|
|
1453
|
-
const
|
|
1454
|
-
|
|
1458
|
+
const subprovider = resolveOptionalString(
|
|
1459
|
+
subproviderSource,
|
|
1455
1460
|
env,
|
|
1456
|
-
`${target.name} pi-agent-sdk
|
|
1461
|
+
`${target.name} pi-agent-sdk subprovider`,
|
|
1457
1462
|
{
|
|
1458
1463
|
allowLiteral: true,
|
|
1459
1464
|
optionalEnv: true
|
|
@@ -1470,7 +1475,7 @@ function resolvePiAgentSdkConfig(target, env) {
|
|
|
1470
1475
|
const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi-agent-sdk timeout`);
|
|
1471
1476
|
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
1472
1477
|
return {
|
|
1473
|
-
|
|
1478
|
+
subprovider,
|
|
1474
1479
|
model,
|
|
1475
1480
|
apiKey,
|
|
1476
1481
|
timeoutMs,
|
|
@@ -2039,7 +2044,7 @@ import path8 from "node:path";
|
|
|
2039
2044
|
import { parse as parse3 } from "yaml";
|
|
2040
2045
|
import { createOpenAI } from "@ai-sdk/openai";
|
|
2041
2046
|
|
|
2042
|
-
// ../../node_modules/.bun/@openrouter+ai-sdk-provider@2.3.
|
|
2047
|
+
// ../../node_modules/.bun/@openrouter+ai-sdk-provider@2.3.3+3ab978b6804fd9e7/node_modules/@openrouter/ai-sdk-provider/dist/index.mjs
|
|
2043
2048
|
var __defProp = Object.defineProperty;
|
|
2044
2049
|
var __defProps = Object.defineProperties;
|
|
2045
2050
|
var __getOwnPropDescs = Object.getOwnPropertyDescriptors;
|
|
@@ -4202,11 +4207,13 @@ function isDefinedOrNotNull(value) {
|
|
|
4202
4207
|
var ReasoningFormat = /* @__PURE__ */ ((ReasoningFormat2) => {
|
|
4203
4208
|
ReasoningFormat2["Unknown"] = "unknown";
|
|
4204
4209
|
ReasoningFormat2["OpenAIResponsesV1"] = "openai-responses-v1";
|
|
4210
|
+
ReasoningFormat2["AzureOpenAIResponsesV1"] = "azure-openai-responses-v1";
|
|
4205
4211
|
ReasoningFormat2["XAIResponsesV1"] = "xai-responses-v1";
|
|
4206
4212
|
ReasoningFormat2["AnthropicClaudeV1"] = "anthropic-claude-v1";
|
|
4207
4213
|
ReasoningFormat2["GoogleGeminiV1"] = "google-gemini-v1";
|
|
4208
4214
|
return ReasoningFormat2;
|
|
4209
4215
|
})(ReasoningFormat || {});
|
|
4216
|
+
var DEFAULT_REASONING_FORMAT = "anthropic-claude-v1";
|
|
4210
4217
|
var CommonReasoningDetailSchema = external_exports.object({
|
|
4211
4218
|
id: external_exports.string().nullish(),
|
|
4212
4219
|
format: external_exports.enum(ReasoningFormat).nullish(),
|
|
@@ -4360,7 +4367,11 @@ var OpenRouterProviderMetadataSchema = external_exports.object({
|
|
|
4360
4367
|
}).catchall(external_exports.any());
|
|
4361
4368
|
var OpenRouterProviderOptionsSchema = external_exports.object({
|
|
4362
4369
|
openrouter: external_exports.object({
|
|
4363
|
-
|
|
4370
|
+
// Use ReasoningDetailArraySchema (with unknown fallback) instead of
|
|
4371
|
+
// z.array(ReasoningDetailUnionSchema) so that a single malformed entry
|
|
4372
|
+
// (e.g., a future format not yet in the enum) is individually dropped
|
|
4373
|
+
// rather than causing the entire array to fail parsing.
|
|
4374
|
+
reasoning_details: ReasoningDetailArraySchema.optional(),
|
|
4364
4375
|
annotations: external_exports.array(FileAnnotationSchema).optional()
|
|
4365
4376
|
}).optional()
|
|
4366
4377
|
}).optional();
|
|
@@ -4758,8 +4769,24 @@ function convertToOpenRouterChatMessages(prompt) {
|
|
|
4758
4769
|
const candidateReasoningDetails = messageReasoningDetails && Array.isArray(messageReasoningDetails) && messageReasoningDetails.length > 0 ? messageReasoningDetails : findFirstReasoningDetails(content);
|
|
4759
4770
|
let finalReasoningDetails;
|
|
4760
4771
|
if (candidateReasoningDetails && candidateReasoningDetails.length > 0) {
|
|
4772
|
+
const validDetails = candidateReasoningDetails.filter((detail) => {
|
|
4773
|
+
var _a173;
|
|
4774
|
+
if (detail.type !== "reasoning.text") {
|
|
4775
|
+
return true;
|
|
4776
|
+
}
|
|
4777
|
+
const format = (_a173 = detail.format) != null ? _a173 : DEFAULT_REASONING_FORMAT;
|
|
4778
|
+
if (format !== "anthropic-claude-v1") {
|
|
4779
|
+
return true;
|
|
4780
|
+
}
|
|
4781
|
+
return !!detail.signature;
|
|
4782
|
+
});
|
|
4783
|
+
if (validDetails.length < candidateReasoningDetails.length) {
|
|
4784
|
+
console.warn(
|
|
4785
|
+
"[openrouter] Some reasoning_details entries were removed because they were missing signatures. See https://github.com/OpenRouterTeam/ai-sdk-provider/issues/423 for more details."
|
|
4786
|
+
);
|
|
4787
|
+
}
|
|
4761
4788
|
const uniqueDetails = [];
|
|
4762
|
-
for (const detail of
|
|
4789
|
+
for (const detail of validDetails) {
|
|
4763
4790
|
if (reasoningDetailsTracker.upsert(detail)) {
|
|
4764
4791
|
uniqueDetails.push(detail);
|
|
4765
4792
|
}
|
|
@@ -4808,20 +4835,135 @@ function getToolResultContent(input) {
|
|
|
4808
4835
|
return input.output.value;
|
|
4809
4836
|
case "json":
|
|
4810
4837
|
case "error-json":
|
|
4811
|
-
case "content":
|
|
4812
4838
|
return JSON.stringify(input.output.value);
|
|
4839
|
+
case "content":
|
|
4840
|
+
return mapToolResultContentParts(input.output.value);
|
|
4813
4841
|
case "execution-denied":
|
|
4814
4842
|
return (_a163 = input.output.reason) != null ? _a163 : "Tool execution denied";
|
|
4815
4843
|
}
|
|
4816
4844
|
}
|
|
4845
|
+
function mapToolResultContentParts(parts) {
|
|
4846
|
+
return parts.map((part) => {
|
|
4847
|
+
var _a163, _b162, _c;
|
|
4848
|
+
switch (part.type) {
|
|
4849
|
+
case "text":
|
|
4850
|
+
return { type: "text", text: part.text };
|
|
4851
|
+
case "image-data":
|
|
4852
|
+
return {
|
|
4853
|
+
type: "image_url",
|
|
4854
|
+
image_url: {
|
|
4855
|
+
url: buildFileDataUrl({
|
|
4856
|
+
data: part.data,
|
|
4857
|
+
mediaType: part.mediaType,
|
|
4858
|
+
defaultMediaType: "image/jpeg"
|
|
4859
|
+
})
|
|
4860
|
+
}
|
|
4861
|
+
};
|
|
4862
|
+
case "image-url":
|
|
4863
|
+
return {
|
|
4864
|
+
type: "image_url",
|
|
4865
|
+
image_url: { url: part.url }
|
|
4866
|
+
};
|
|
4867
|
+
case "file-data": {
|
|
4868
|
+
const dataUrl = buildFileDataUrl({
|
|
4869
|
+
data: part.data,
|
|
4870
|
+
mediaType: part.mediaType,
|
|
4871
|
+
defaultMediaType: "application/octet-stream"
|
|
4872
|
+
});
|
|
4873
|
+
if ((_a163 = part.mediaType) == null ? void 0 : _a163.startsWith("image/")) {
|
|
4874
|
+
return {
|
|
4875
|
+
type: "image_url",
|
|
4876
|
+
image_url: { url: dataUrl }
|
|
4877
|
+
};
|
|
4878
|
+
}
|
|
4879
|
+
if ((_b162 = part.mediaType) == null ? void 0 : _b162.startsWith("audio/")) {
|
|
4880
|
+
const rawFormat = part.mediaType.replace("audio/", "");
|
|
4881
|
+
const format = MIME_TO_FORMAT[rawFormat];
|
|
4882
|
+
if (format !== void 0) {
|
|
4883
|
+
return {
|
|
4884
|
+
type: "input_audio",
|
|
4885
|
+
input_audio: {
|
|
4886
|
+
data: getBase64FromDataUrl(dataUrl),
|
|
4887
|
+
format
|
|
4888
|
+
}
|
|
4889
|
+
};
|
|
4890
|
+
}
|
|
4891
|
+
}
|
|
4892
|
+
return {
|
|
4893
|
+
type: "file",
|
|
4894
|
+
file: {
|
|
4895
|
+
filename: (_c = part.filename) != null ? _c : "",
|
|
4896
|
+
file_data: dataUrl
|
|
4897
|
+
}
|
|
4898
|
+
};
|
|
4899
|
+
}
|
|
4900
|
+
case "file-url": {
|
|
4901
|
+
if (looksLikeImageUrl(part.url)) {
|
|
4902
|
+
return {
|
|
4903
|
+
type: "image_url",
|
|
4904
|
+
image_url: { url: part.url }
|
|
4905
|
+
};
|
|
4906
|
+
}
|
|
4907
|
+
return {
|
|
4908
|
+
type: "file",
|
|
4909
|
+
file: {
|
|
4910
|
+
filename: filenameFromUrl(part.url),
|
|
4911
|
+
file_data: part.url
|
|
4912
|
+
}
|
|
4913
|
+
};
|
|
4914
|
+
}
|
|
4915
|
+
case "file-id":
|
|
4916
|
+
case "image-file-id":
|
|
4917
|
+
case "custom":
|
|
4918
|
+
return { type: "text", text: JSON.stringify(part) };
|
|
4919
|
+
default: {
|
|
4920
|
+
const _exhaustiveCheck = part;
|
|
4921
|
+
return { type: "text", text: JSON.stringify(_exhaustiveCheck) };
|
|
4922
|
+
}
|
|
4923
|
+
}
|
|
4924
|
+
});
|
|
4925
|
+
}
|
|
4926
|
+
var IMAGE_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
4927
|
+
"jpg",
|
|
4928
|
+
"jpeg",
|
|
4929
|
+
"png",
|
|
4930
|
+
"gif",
|
|
4931
|
+
"webp",
|
|
4932
|
+
"svg",
|
|
4933
|
+
"bmp",
|
|
4934
|
+
"ico",
|
|
4935
|
+
"tif",
|
|
4936
|
+
"tiff",
|
|
4937
|
+
"avif"
|
|
4938
|
+
]);
|
|
4939
|
+
function looksLikeImageUrl(url) {
|
|
4940
|
+
var _a163;
|
|
4941
|
+
try {
|
|
4942
|
+
const pathname = new URL(url).pathname;
|
|
4943
|
+
const ext = (_a163 = pathname.split(".").pop()) == null ? void 0 : _a163.toLowerCase();
|
|
4944
|
+
return ext !== void 0 && IMAGE_EXTENSIONS.has(ext);
|
|
4945
|
+
} catch (e) {
|
|
4946
|
+
return false;
|
|
4947
|
+
}
|
|
4948
|
+
}
|
|
4949
|
+
function filenameFromUrl(url) {
|
|
4950
|
+
try {
|
|
4951
|
+
const pathname = new URL(url).pathname;
|
|
4952
|
+
const last = pathname.split("/").pop();
|
|
4953
|
+
return (last == null ? void 0 : last.includes(".")) ? last : "";
|
|
4954
|
+
} catch (e) {
|
|
4955
|
+
return "";
|
|
4956
|
+
}
|
|
4957
|
+
}
|
|
4817
4958
|
function findFirstReasoningDetails(content) {
|
|
4818
|
-
var _a163, _b162, _c;
|
|
4959
|
+
var _a163, _b162, _c, _d;
|
|
4819
4960
|
for (const part of content) {
|
|
4820
4961
|
if (part.type === "tool-call") {
|
|
4821
|
-
const
|
|
4822
|
-
|
|
4823
|
-
|
|
4824
|
-
|
|
4962
|
+
const parsed = OpenRouterProviderOptionsSchema.safeParse(
|
|
4963
|
+
part.providerOptions
|
|
4964
|
+
);
|
|
4965
|
+
if (parsed.success && ((_b162 = (_a163 = parsed.data) == null ? void 0 : _a163.openrouter) == null ? void 0 : _b162.reasoning_details) && parsed.data.openrouter.reasoning_details.length > 0) {
|
|
4966
|
+
return parsed.data.openrouter.reasoning_details;
|
|
4825
4967
|
}
|
|
4826
4968
|
}
|
|
4827
4969
|
}
|
|
@@ -4830,7 +4972,7 @@ function findFirstReasoningDetails(content) {
|
|
|
4830
4972
|
const parsed = OpenRouterProviderOptionsSchema.safeParse(
|
|
4831
4973
|
part.providerOptions
|
|
4832
4974
|
);
|
|
4833
|
-
if (parsed.success && ((
|
|
4975
|
+
if (parsed.success && ((_d = (_c = parsed.data) == null ? void 0 : _c.openrouter) == null ? void 0 : _d.reasoning_details) && parsed.data.openrouter.reasoning_details.length > 0) {
|
|
4834
4976
|
return parsed.data.openrouter.reasoning_details;
|
|
4835
4977
|
}
|
|
4836
4978
|
}
|
|
@@ -6490,7 +6632,7 @@ function withUserAgentSuffix22(headers, ...userAgentSuffixParts) {
|
|
|
6490
6632
|
"user-agent": userAgent
|
|
6491
6633
|
});
|
|
6492
6634
|
}
|
|
6493
|
-
var VERSION2 = false ? "0.0.0-test" : "2.3.
|
|
6635
|
+
var VERSION2 = false ? "0.0.0-test" : "2.3.3";
|
|
6494
6636
|
function createOpenRouter(options = {}) {
|
|
6495
6637
|
var _a163, _b162, _c;
|
|
6496
6638
|
const baseURL = (_b162 = withoutTrailingSlash2((_a163 = options.baseURL) != null ? _a163 : options.baseUrl)) != null ? _b162 : "https://openrouter.ai/api/v1";
|
|
@@ -14227,6 +14369,62 @@ function mergeExecutionMetrics(computed, metrics) {
|
|
|
14227
14369
|
endTime: metrics.endTime ?? computed.endTime
|
|
14228
14370
|
};
|
|
14229
14371
|
}
|
|
14372
|
+
function flattenInputMessages(messages) {
|
|
14373
|
+
return messages.flatMap((message) => extractContentSegments(message.content));
|
|
14374
|
+
}
|
|
14375
|
+
function collectResolvedInputFilePaths(messages) {
|
|
14376
|
+
const filePaths = [];
|
|
14377
|
+
for (const message of messages) {
|
|
14378
|
+
if (!Array.isArray(message.content)) {
|
|
14379
|
+
continue;
|
|
14380
|
+
}
|
|
14381
|
+
for (const segment of message.content) {
|
|
14382
|
+
if (isJsonObject(segment) && segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
14383
|
+
filePaths.push(segment.resolvedPath);
|
|
14384
|
+
}
|
|
14385
|
+
}
|
|
14386
|
+
}
|
|
14387
|
+
return filePaths;
|
|
14388
|
+
}
|
|
14389
|
+
function extractContentSegments(content) {
|
|
14390
|
+
if (typeof content === "string") {
|
|
14391
|
+
return content.trim().length > 0 ? [{ type: "text", value: content }] : [];
|
|
14392
|
+
}
|
|
14393
|
+
if (isJsonObject(content)) {
|
|
14394
|
+
const rendered = JSON.stringify(content, null, 2);
|
|
14395
|
+
return rendered.trim().length > 0 ? [{ type: "text", value: rendered }] : [];
|
|
14396
|
+
}
|
|
14397
|
+
if (!Array.isArray(content)) {
|
|
14398
|
+
return [];
|
|
14399
|
+
}
|
|
14400
|
+
const segments = [];
|
|
14401
|
+
for (const segment of content) {
|
|
14402
|
+
if (!isJsonObject(segment)) {
|
|
14403
|
+
continue;
|
|
14404
|
+
}
|
|
14405
|
+
segments.push(cloneJsonObject(segment));
|
|
14406
|
+
}
|
|
14407
|
+
return segments;
|
|
14408
|
+
}
|
|
14409
|
+
function cloneJsonObject(source) {
|
|
14410
|
+
const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
|
|
14411
|
+
return Object.fromEntries(entries);
|
|
14412
|
+
}
|
|
14413
|
+
function cloneJsonValue(value) {
|
|
14414
|
+
if (value === null) {
|
|
14415
|
+
return null;
|
|
14416
|
+
}
|
|
14417
|
+
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
|
|
14418
|
+
return value;
|
|
14419
|
+
}
|
|
14420
|
+
if (Array.isArray(value)) {
|
|
14421
|
+
return value.map((item) => cloneJsonValue(item));
|
|
14422
|
+
}
|
|
14423
|
+
if (typeof value === "object") {
|
|
14424
|
+
return cloneJsonObject(value);
|
|
14425
|
+
}
|
|
14426
|
+
return value;
|
|
14427
|
+
}
|
|
14230
14428
|
var ANSI_RED = "\x1B[31m";
|
|
14231
14429
|
var ANSI_RESET2 = "\x1B[0m";
|
|
14232
14430
|
function logError(msg) {
|
|
@@ -14292,7 +14490,6 @@ function parseAgentSkillsEvals(parsed, source = "evals.json", baseDir) {
|
|
|
14292
14490
|
id: String(id),
|
|
14293
14491
|
question: prompt,
|
|
14294
14492
|
input: [{ role: "user", content: prompt }],
|
|
14295
|
-
input_segments: [{ type: "text", value: prompt }],
|
|
14296
14493
|
expected_output: evalCase.expected_output ? [{ role: "assistant", content: evalCase.expected_output }] : [],
|
|
14297
14494
|
reference_answer: evalCase.expected_output,
|
|
14298
14495
|
file_paths: filePaths,
|
|
@@ -14414,7 +14611,7 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
14414
14611
|
}
|
|
14415
14612
|
try {
|
|
14416
14613
|
const rawConfig = await readFile22(configPath, "utf8");
|
|
14417
|
-
const parsed = parse(rawConfig);
|
|
14614
|
+
const parsed = interpolateEnv(parse(rawConfig), process.env);
|
|
14418
14615
|
if (!isJsonObject(parsed)) {
|
|
14419
14616
|
logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
|
|
14420
14617
|
continue;
|
|
@@ -14632,6 +14829,27 @@ function parseExecutionDefaults(raw, configPath) {
|
|
|
14632
14829
|
} else if (otelFile !== void 0) {
|
|
14633
14830
|
logWarning(`Invalid execution.otel_file in ${configPath}, expected non-empty string`);
|
|
14634
14831
|
}
|
|
14832
|
+
if (typeof obj.export_otel === "boolean") {
|
|
14833
|
+
result.export_otel = obj.export_otel;
|
|
14834
|
+
} else if (obj.export_otel !== void 0) {
|
|
14835
|
+
logWarning(`Invalid execution.export_otel in ${configPath}, expected boolean`);
|
|
14836
|
+
}
|
|
14837
|
+
const otelBackend = obj.otel_backend;
|
|
14838
|
+
if (typeof otelBackend === "string" && otelBackend.trim().length > 0) {
|
|
14839
|
+
result.otel_backend = otelBackend.trim();
|
|
14840
|
+
} else if (otelBackend !== void 0) {
|
|
14841
|
+
logWarning(`Invalid execution.otel_backend in ${configPath}, expected non-empty string`);
|
|
14842
|
+
}
|
|
14843
|
+
if (typeof obj.otel_capture_content === "boolean") {
|
|
14844
|
+
result.otel_capture_content = obj.otel_capture_content;
|
|
14845
|
+
} else if (obj.otel_capture_content !== void 0) {
|
|
14846
|
+
logWarning(`Invalid execution.otel_capture_content in ${configPath}, expected boolean`);
|
|
14847
|
+
}
|
|
14848
|
+
if (typeof obj.otel_group_turns === "boolean") {
|
|
14849
|
+
result.otel_group_turns = obj.otel_group_turns;
|
|
14850
|
+
} else if (obj.otel_group_turns !== void 0) {
|
|
14851
|
+
logWarning(`Invalid execution.otel_group_turns in ${configPath}, expected boolean`);
|
|
14852
|
+
}
|
|
14635
14853
|
if (typeof obj.pool_workspaces === "boolean") {
|
|
14636
14854
|
result.pool_workspaces = obj.pool_workspaces;
|
|
14637
14855
|
} else if (obj.pool_workspaces !== void 0) {
|
|
@@ -16076,27 +16294,28 @@ var ANSI_YELLOW4 = "\x1B[33m";
|
|
|
16076
16294
|
var ANSI_RESET5 = "\x1B[0m";
|
|
16077
16295
|
async function processMessages(options) {
|
|
16078
16296
|
const { messages, searchRoots, repoRootPath, textParts, messageType, verbose } = options;
|
|
16079
|
-
const
|
|
16297
|
+
const processedMessages = [];
|
|
16080
16298
|
for (const message of messages) {
|
|
16081
16299
|
const content = message.content;
|
|
16082
16300
|
if (typeof content === "string") {
|
|
16083
|
-
segments.push({ type: "text", value: content });
|
|
16084
16301
|
if (textParts) {
|
|
16085
16302
|
textParts.push(content);
|
|
16086
16303
|
}
|
|
16304
|
+
processedMessages.push({ ...message, content });
|
|
16087
16305
|
continue;
|
|
16088
16306
|
}
|
|
16089
16307
|
if (isJsonObject(content)) {
|
|
16090
16308
|
const rendered = JSON.stringify(content, null, 2);
|
|
16091
|
-
segments.push({ type: "text", value: rendered });
|
|
16092
16309
|
if (textParts) {
|
|
16093
16310
|
textParts.push(rendered);
|
|
16094
16311
|
}
|
|
16312
|
+
processedMessages.push({ ...message, content: cloneJsonObject(content) });
|
|
16095
16313
|
continue;
|
|
16096
16314
|
}
|
|
16097
16315
|
if (!Array.isArray(content)) {
|
|
16098
16316
|
continue;
|
|
16099
16317
|
}
|
|
16318
|
+
const processedContent = [];
|
|
16100
16319
|
for (const rawSegment of content) {
|
|
16101
16320
|
if (!isJsonObject(rawSegment)) {
|
|
16102
16321
|
continue;
|
|
@@ -16119,8 +16338,8 @@ async function processMessages(options) {
|
|
|
16119
16338
|
}
|
|
16120
16339
|
try {
|
|
16121
16340
|
const fileContent = (await readFile4(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
16122
|
-
|
|
16123
|
-
|
|
16341
|
+
processedContent.push({
|
|
16342
|
+
...cloneJsonObject(rawSegment),
|
|
16124
16343
|
path: displayPath,
|
|
16125
16344
|
text: fileContent,
|
|
16126
16345
|
resolvedPath: path5.resolve(resolvedPath)
|
|
@@ -16137,37 +16356,19 @@ async function processMessages(options) {
|
|
|
16137
16356
|
continue;
|
|
16138
16357
|
}
|
|
16139
16358
|
const clonedSegment = cloneJsonObject(rawSegment);
|
|
16140
|
-
|
|
16359
|
+
processedContent.push(clonedSegment);
|
|
16141
16360
|
const inlineValue = clonedSegment.value;
|
|
16142
16361
|
if (typeof inlineValue === "string" && textParts) {
|
|
16143
16362
|
textParts.push(inlineValue);
|
|
16144
16363
|
}
|
|
16145
16364
|
}
|
|
16365
|
+
processedMessages.push({ ...message, content: processedContent });
|
|
16146
16366
|
}
|
|
16147
|
-
return
|
|
16367
|
+
return processedMessages;
|
|
16148
16368
|
}
|
|
16149
16369
|
function asString3(value) {
|
|
16150
16370
|
return typeof value === "string" ? value : void 0;
|
|
16151
16371
|
}
|
|
16152
|
-
function cloneJsonObject(source) {
|
|
16153
|
-
const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
|
|
16154
|
-
return Object.fromEntries(entries);
|
|
16155
|
-
}
|
|
16156
|
-
function cloneJsonValue(value) {
|
|
16157
|
-
if (value === null) {
|
|
16158
|
-
return null;
|
|
16159
|
-
}
|
|
16160
|
-
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
|
|
16161
|
-
return value;
|
|
16162
|
-
}
|
|
16163
|
-
if (Array.isArray(value)) {
|
|
16164
|
-
return value.map((item) => cloneJsonValue(item));
|
|
16165
|
-
}
|
|
16166
|
-
if (typeof value === "object") {
|
|
16167
|
-
return cloneJsonObject(value);
|
|
16168
|
-
}
|
|
16169
|
-
return value;
|
|
16170
|
-
}
|
|
16171
16372
|
function logWarning3(message, details) {
|
|
16172
16373
|
if (details && details.length > 0) {
|
|
16173
16374
|
const detailBlock = details.join("\n");
|
|
@@ -16412,10 +16613,10 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
16412
16613
|
);
|
|
16413
16614
|
}
|
|
16414
16615
|
}
|
|
16415
|
-
const
|
|
16616
|
+
const rawInputMessages = resolveInputMessages(evalcase);
|
|
16416
16617
|
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
16417
16618
|
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
|
|
16418
|
-
if (!id || !hasEvaluationSpec || !
|
|
16619
|
+
if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
|
|
16419
16620
|
logError2(
|
|
16420
16621
|
`Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
|
|
16421
16622
|
);
|
|
@@ -16423,8 +16624,8 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
16423
16624
|
}
|
|
16424
16625
|
const hasExpectedMessages = expectedMessages.length > 0;
|
|
16425
16626
|
const inputTextParts = [];
|
|
16426
|
-
const
|
|
16427
|
-
messages:
|
|
16627
|
+
const inputMessages = await processMessages({
|
|
16628
|
+
messages: rawInputMessages,
|
|
16428
16629
|
searchRoots,
|
|
16429
16630
|
repoRootPath,
|
|
16430
16631
|
textParts: inputTextParts,
|
|
@@ -16470,19 +16671,13 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
16470
16671
|
}
|
|
16471
16672
|
}
|
|
16472
16673
|
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
16473
|
-
const userFilePaths =
|
|
16474
|
-
for (const segment of inputSegments) {
|
|
16475
|
-
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
16476
|
-
userFilePaths.push(segment.resolvedPath);
|
|
16477
|
-
}
|
|
16478
|
-
}
|
|
16674
|
+
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
16479
16675
|
const testCase = {
|
|
16480
16676
|
id,
|
|
16481
16677
|
eval_set: evalSetName,
|
|
16482
16678
|
conversation_id: conversationId,
|
|
16483
16679
|
question,
|
|
16484
16680
|
input: inputMessages,
|
|
16485
|
-
input_segments: inputSegments,
|
|
16486
16681
|
expected_output: outputSegments,
|
|
16487
16682
|
reference_answer: referenceAnswer,
|
|
16488
16683
|
file_paths: userFilePaths,
|
|
@@ -16543,50 +16738,9 @@ function parseMetadata(suite) {
|
|
|
16543
16738
|
});
|
|
16544
16739
|
}
|
|
16545
16740
|
async function buildPromptInputs(testCase, mode = "lm") {
|
|
16546
|
-
const segmentsByMessage =
|
|
16547
|
-
|
|
16548
|
-
|
|
16549
|
-
if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
|
|
16550
|
-
fileContentsByPath.set(segment.path, segment.text);
|
|
16551
|
-
}
|
|
16552
|
-
}
|
|
16553
|
-
for (const message of testCase.input) {
|
|
16554
|
-
const messageSegments = [];
|
|
16555
|
-
if (typeof message.content === "string") {
|
|
16556
|
-
if (message.content.trim().length > 0) {
|
|
16557
|
-
messageSegments.push({ type: "text", value: message.content });
|
|
16558
|
-
}
|
|
16559
|
-
} else if (Array.isArray(message.content)) {
|
|
16560
|
-
for (const segment of message.content) {
|
|
16561
|
-
if (typeof segment === "string") {
|
|
16562
|
-
if (segment.trim().length > 0) {
|
|
16563
|
-
messageSegments.push({ type: "text", value: segment });
|
|
16564
|
-
}
|
|
16565
|
-
} else if (isJsonObject(segment)) {
|
|
16566
|
-
const type = asString5(segment.type);
|
|
16567
|
-
if (type === "file") {
|
|
16568
|
-
const value = asString5(segment.value);
|
|
16569
|
-
if (!value) continue;
|
|
16570
|
-
const fileText = fileContentsByPath.get(value);
|
|
16571
|
-
if (fileText !== void 0) {
|
|
16572
|
-
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
16573
|
-
}
|
|
16574
|
-
} else if (type === "text") {
|
|
16575
|
-
const textValue = asString5(segment.value);
|
|
16576
|
-
if (textValue && textValue.trim().length > 0) {
|
|
16577
|
-
messageSegments.push({ type: "text", value: textValue });
|
|
16578
|
-
}
|
|
16579
|
-
}
|
|
16580
|
-
}
|
|
16581
|
-
}
|
|
16582
|
-
} else if (isJsonObject(message.content)) {
|
|
16583
|
-
const rendered = JSON.stringify(message.content, null, 2);
|
|
16584
|
-
if (rendered.trim().length > 0) {
|
|
16585
|
-
messageSegments.push({ type: "text", value: rendered });
|
|
16586
|
-
}
|
|
16587
|
-
}
|
|
16588
|
-
segmentsByMessage.push(messageSegments);
|
|
16589
|
-
}
|
|
16741
|
+
const segmentsByMessage = testCase.input.map(
|
|
16742
|
+
(message) => extractContentSegments(message.content)
|
|
16743
|
+
);
|
|
16590
16744
|
const useRoleMarkers = needsRoleMarkers(testCase.input, segmentsByMessage);
|
|
16591
16745
|
let question;
|
|
16592
16746
|
if (useRoleMarkers) {
|
|
@@ -16614,7 +16768,7 @@ ${messageContent}`);
|
|
|
16614
16768
|
question = messageParts.join("\n\n");
|
|
16615
16769
|
} else {
|
|
16616
16770
|
const questionParts = [];
|
|
16617
|
-
for (const segment of testCase.
|
|
16771
|
+
for (const segment of flattenInputMessages(testCase.input)) {
|
|
16618
16772
|
const formattedContent = formatSegment(segment, mode);
|
|
16619
16773
|
if (formattedContent) {
|
|
16620
16774
|
questionParts.push(formattedContent);
|
|
@@ -16701,9 +16855,6 @@ function buildChatPromptFromSegments(options) {
|
|
|
16701
16855
|
}
|
|
16702
16856
|
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
16703
16857
|
}
|
|
16704
|
-
function asString5(value) {
|
|
16705
|
-
return typeof value === "string" ? value : void 0;
|
|
16706
|
-
}
|
|
16707
16858
|
var ANSI_YELLOW6 = "\x1B[33m";
|
|
16708
16859
|
var ANSI_RED3 = "\x1B[31m";
|
|
16709
16860
|
var ANSI_RESET7 = "\x1B[0m";
|
|
@@ -16784,7 +16935,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
16784
16935
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
16785
16936
|
}
|
|
16786
16937
|
const suite = interpolated;
|
|
16787
|
-
const evalSetNameFromSuite =
|
|
16938
|
+
const evalSetNameFromSuite = asString5(suite.name)?.trim();
|
|
16788
16939
|
const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
16789
16940
|
const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
|
|
16790
16941
|
const rawTestcases = resolveTests(suite);
|
|
@@ -16803,7 +16954,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
16803
16954
|
const suiteInputMessages = expandInputShorthand(suite.input);
|
|
16804
16955
|
const suiteInputFiles = suite.input_files;
|
|
16805
16956
|
const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
16806
|
-
const _globalTarget =
|
|
16957
|
+
const _globalTarget = asString5(rawGlobalExecution?.target) ?? asString5(suite.target);
|
|
16807
16958
|
const suiteAssertions = suite.assertions ?? suite.assert;
|
|
16808
16959
|
if (suite.assert !== void 0 && suite.assertions === void 0) {
|
|
16809
16960
|
logWarning5("'assert' is deprecated at the suite level. Use 'assertions' instead.");
|
|
@@ -16816,17 +16967,17 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
16816
16967
|
continue;
|
|
16817
16968
|
}
|
|
16818
16969
|
const evalcase = rawEvalcase;
|
|
16819
|
-
const id =
|
|
16970
|
+
const id = asString5(evalcase.id);
|
|
16820
16971
|
if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
|
|
16821
16972
|
continue;
|
|
16822
16973
|
}
|
|
16823
|
-
const conversationId =
|
|
16824
|
-
let outcome =
|
|
16974
|
+
const conversationId = asString5(evalcase.conversation_id);
|
|
16975
|
+
let outcome = asString5(evalcase.criteria);
|
|
16825
16976
|
if (!outcome && evalcase.expected_outcome !== void 0) {
|
|
16826
|
-
outcome =
|
|
16977
|
+
outcome = asString5(evalcase.expected_outcome);
|
|
16827
16978
|
if (outcome) {
|
|
16828
16979
|
logWarning5(
|
|
16829
|
-
`Test '${
|
|
16980
|
+
`Test '${asString5(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
|
|
16830
16981
|
);
|
|
16831
16982
|
}
|
|
16832
16983
|
}
|
|
@@ -16843,10 +16994,9 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
16843
16994
|
continue;
|
|
16844
16995
|
}
|
|
16845
16996
|
const effectiveSuiteInputMessages = suiteInputMessages && !skipDefaults ? suiteInputMessages : void 0;
|
|
16846
|
-
const inputMessages = effectiveSuiteInputMessages ? [...effectiveSuiteInputMessages, ...testInputMessages] : testInputMessages;
|
|
16847
16997
|
const hasExpectedMessages = expectedMessages.length > 0;
|
|
16848
16998
|
const inputTextParts = [];
|
|
16849
|
-
const
|
|
16999
|
+
const suiteResolvedInputMessages = effectiveSuiteInputMessages ? await processMessages({
|
|
16850
17000
|
messages: effectiveSuiteInputMessages,
|
|
16851
17001
|
searchRoots,
|
|
16852
17002
|
repoRootPath,
|
|
@@ -16854,7 +17004,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
16854
17004
|
messageType: "input",
|
|
16855
17005
|
verbose
|
|
16856
17006
|
}) : [];
|
|
16857
|
-
const
|
|
17007
|
+
const testResolvedInputMessages = await processMessages({
|
|
16858
17008
|
messages: testInputMessages,
|
|
16859
17009
|
searchRoots,
|
|
16860
17010
|
repoRootPath,
|
|
@@ -16862,7 +17012,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
16862
17012
|
messageType: "input",
|
|
16863
17013
|
verbose
|
|
16864
17014
|
});
|
|
16865
|
-
const
|
|
17015
|
+
const inputMessages = [...suiteResolvedInputMessages, ...testResolvedInputMessages];
|
|
16866
17016
|
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
16867
17017
|
messages: expectedMessages,
|
|
16868
17018
|
searchRoots,
|
|
@@ -16900,12 +17050,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
16900
17050
|
}
|
|
16901
17051
|
}
|
|
16902
17052
|
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
16903
|
-
const userFilePaths =
|
|
16904
|
-
for (const segment of inputSegments) {
|
|
16905
|
-
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
16906
|
-
userFilePaths.push(segment.resolvedPath);
|
|
16907
|
-
}
|
|
16908
|
-
}
|
|
17053
|
+
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
16909
17054
|
const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
|
|
16910
17055
|
const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
|
|
16911
17056
|
const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
|
|
@@ -16916,7 +17061,6 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
16916
17061
|
conversation_id: conversationId,
|
|
16917
17062
|
question,
|
|
16918
17063
|
input: inputMessages,
|
|
16919
|
-
input_segments: inputSegments,
|
|
16920
17064
|
expected_output: outputSegments,
|
|
16921
17065
|
reference_answer: referenceAnswer,
|
|
16922
17066
|
file_paths: userFilePaths,
|
|
@@ -17125,7 +17269,7 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
|
|
|
17125
17269
|
path: caseLevel.path ?? suiteLevel.path
|
|
17126
17270
|
};
|
|
17127
17271
|
}
|
|
17128
|
-
function
|
|
17272
|
+
function asString5(value) {
|
|
17129
17273
|
return typeof value === "string" ? value : void 0;
|
|
17130
17274
|
}
|
|
17131
17275
|
function logWarning5(message, details) {
|
|
@@ -19552,7 +19696,7 @@ ${basePrompt}` : basePrompt;
|
|
|
19552
19696
|
if (itemType === "command_execution") {
|
|
19553
19697
|
completedToolCalls.push({
|
|
19554
19698
|
tool: "command_execution",
|
|
19555
|
-
input: item.command,
|
|
19699
|
+
input: { command: item.command },
|
|
19556
19700
|
output: item.aggregated_output,
|
|
19557
19701
|
id: item.id
|
|
19558
19702
|
});
|
|
@@ -20383,11 +20527,22 @@ async function loadCopilotSdk() {
|
|
|
20383
20527
|
try {
|
|
20384
20528
|
copilotSdkModule = await import("@github/copilot-sdk");
|
|
20385
20529
|
} catch (error) {
|
|
20530
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
20531
|
+
if (message.includes("vscode-jsonrpc")) {
|
|
20532
|
+
throw new Error(
|
|
20533
|
+
`Failed to load @github/copilot-sdk due to a known ESM compatibility issue with vscode-jsonrpc (https://github.com/github/copilot-sdk/issues/710).
|
|
20534
|
+
|
|
20535
|
+
Workarounds:
|
|
20536
|
+
- Use the copilot-cli target instead (recommended): set target type to "copilot-cli" in your eval YAML
|
|
20537
|
+
- If running under Node.js 24+: set NODE_OPTIONS="--experimental-specifier-resolution=node"
|
|
20538
|
+
- Wait for vscode-jsonrpc@9.0.0 stable to be released upstream`
|
|
20539
|
+
);
|
|
20540
|
+
}
|
|
20386
20541
|
throw new Error(
|
|
20387
20542
|
`Failed to load @github/copilot-sdk. Please install it:
|
|
20388
20543
|
npm install @github/copilot-sdk
|
|
20389
20544
|
|
|
20390
|
-
Original error: ${
|
|
20545
|
+
Original error: ${message}`
|
|
20391
20546
|
);
|
|
20392
20547
|
}
|
|
20393
20548
|
}
|
|
@@ -20781,7 +20936,7 @@ var PiAgentSdkProvider = class {
|
|
|
20781
20936
|
const { Agent, getModel, getEnvApiKey } = await loadPiModules();
|
|
20782
20937
|
const startTimeIso = (/* @__PURE__ */ new Date()).toISOString();
|
|
20783
20938
|
const startMs = Date.now();
|
|
20784
|
-
const providerName = this.config.
|
|
20939
|
+
const providerName = this.config.subprovider ?? "anthropic";
|
|
20785
20940
|
const modelId = this.config.model ?? "claude-sonnet-4-20250514";
|
|
20786
20941
|
const model = getModel(providerName, modelId);
|
|
20787
20942
|
const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
|
|
@@ -20893,7 +21048,7 @@ var PiAgentSdkProvider = class {
|
|
|
20893
21048
|
messages: agentMessages,
|
|
20894
21049
|
systemPrompt,
|
|
20895
21050
|
model: this.config.model,
|
|
20896
|
-
|
|
21051
|
+
subprovider: this.config.subprovider
|
|
20897
21052
|
},
|
|
20898
21053
|
output,
|
|
20899
21054
|
tokenUsage,
|
|
@@ -21117,8 +21272,8 @@ var PiCodingAgentProvider = class {
|
|
|
21117
21272
|
}
|
|
21118
21273
|
buildPiArgs(prompt, inputFiles, _captureFileChanges) {
|
|
21119
21274
|
const args = [];
|
|
21120
|
-
if (this.config.
|
|
21121
|
-
args.push("--provider", this.config.
|
|
21275
|
+
if (this.config.subprovider) {
|
|
21276
|
+
args.push("--provider", this.config.subprovider);
|
|
21122
21277
|
}
|
|
21123
21278
|
if (this.config.model) {
|
|
21124
21279
|
args.push("--model", this.config.model);
|
|
@@ -21176,7 +21331,7 @@ ${prompt}` : prompt;
|
|
|
21176
21331
|
buildEnv() {
|
|
21177
21332
|
const env = { ...process.env };
|
|
21178
21333
|
if (this.config.apiKey) {
|
|
21179
|
-
const provider = this.config.
|
|
21334
|
+
const provider = this.config.subprovider?.toLowerCase() ?? "google";
|
|
21180
21335
|
switch (provider) {
|
|
21181
21336
|
case "google":
|
|
21182
21337
|
case "gemini":
|
|
@@ -21592,6 +21747,13 @@ function extractToolCalls4(content) {
|
|
|
21592
21747
|
id: typeof p.id === "string" ? p.id : void 0
|
|
21593
21748
|
});
|
|
21594
21749
|
}
|
|
21750
|
+
if (p.type === "toolCall" && typeof p.name === "string") {
|
|
21751
|
+
toolCalls.push({
|
|
21752
|
+
tool: p.name,
|
|
21753
|
+
input: p.arguments,
|
|
21754
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
21755
|
+
});
|
|
21756
|
+
}
|
|
21595
21757
|
if (p.type === "tool_result" && typeof p.tool_use_id === "string") {
|
|
21596
21758
|
const existing = toolCalls.find((tc) => tc.id === p.tool_use_id);
|
|
21597
21759
|
if (existing) {
|
|
@@ -23903,7 +24065,8 @@ var freeformEvaluationSchema = external_exports2.object({
|
|
|
23903
24065
|
passed: external_exports2.boolean().describe("Whether this aspect was satisfied"),
|
|
23904
24066
|
evidence: external_exports2.string().describe("Concise evidence (1-2 sentences)").optional()
|
|
23905
24067
|
})
|
|
23906
|
-
).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
|
|
24068
|
+
).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional(),
|
|
24069
|
+
details: external_exports2.record(external_exports2.unknown()).describe("Optional structured metadata for domain-specific metrics").optional()
|
|
23907
24070
|
});
|
|
23908
24071
|
var rubricCheckResultSchema = external_exports2.object({
|
|
23909
24072
|
id: external_exports2.string().describe("The ID of the rubric item being checked"),
|
|
@@ -23965,7 +24128,7 @@ var LlmGraderEvaluator = class {
|
|
|
23965
24128
|
async evaluateFreeform(context2, graderProvider) {
|
|
23966
24129
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
23967
24130
|
const variables = {
|
|
23968
|
-
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context2.evalCase.
|
|
24131
|
+
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context2.evalCase.input, null, 2),
|
|
23969
24132
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(
|
|
23970
24133
|
context2.evalCase.expected_output,
|
|
23971
24134
|
null,
|
|
@@ -24008,6 +24171,7 @@ ${context2.fileChanges}`;
|
|
|
24008
24171
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
24009
24172
|
evaluatorRawRequest,
|
|
24010
24173
|
graderTarget: graderProvider.targetName,
|
|
24174
|
+
details: data.details,
|
|
24011
24175
|
tokenUsage
|
|
24012
24176
|
};
|
|
24013
24177
|
} catch (e) {
|
|
@@ -24427,7 +24591,7 @@ ${outputSchema2}`;
|
|
|
24427
24591
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
24428
24592
|
evaluatorRawRequest,
|
|
24429
24593
|
graderTarget,
|
|
24430
|
-
details
|
|
24594
|
+
details: data.details && Object.keys(data.details).length > 0 ? { ...details, ...data.details } : details
|
|
24431
24595
|
};
|
|
24432
24596
|
} catch {
|
|
24433
24597
|
return {
|
|
@@ -24574,7 +24738,8 @@ function buildOutputSchema() {
|
|
|
24574
24738
|
' "passed": <boolean>,',
|
|
24575
24739
|
' "evidence": "<concise evidence, 1-2 sentences, optional>"',
|
|
24576
24740
|
" }",
|
|
24577
|
-
" ]",
|
|
24741
|
+
" ],",
|
|
24742
|
+
' "details": {<optional object with domain-specific structured metrics>}',
|
|
24578
24743
|
"}"
|
|
24579
24744
|
].join("\n");
|
|
24580
24745
|
}
|
|
@@ -25778,12 +25943,31 @@ var COPILOT_MATCHER = {
|
|
|
25778
25943
|
readToolPrefixes: ["Viewing "],
|
|
25779
25944
|
readInputFields: ["file_path", "path"]
|
|
25780
25945
|
};
|
|
25946
|
+
var PI_CODING_AGENT_MATCHER = {
|
|
25947
|
+
skillTools: [],
|
|
25948
|
+
skillInputField: "skill",
|
|
25949
|
+
readTools: ["read"],
|
|
25950
|
+
readInputField: "path",
|
|
25951
|
+
readInputFields: ["path", "file_path", "filePath"]
|
|
25952
|
+
};
|
|
25953
|
+
var CODEX_MATCHER = {
|
|
25954
|
+
skillTools: [],
|
|
25955
|
+
skillInputField: "skill",
|
|
25956
|
+
readTools: ["command_execution"],
|
|
25957
|
+
readInputField: "command",
|
|
25958
|
+
skillToolPrefixes: ["mcp:"],
|
|
25959
|
+
readToolPrefixes: ["mcp:"],
|
|
25960
|
+
readInputFields: ["command", "path", "file_path", "filePath"]
|
|
25961
|
+
};
|
|
25781
25962
|
var PROVIDER_TOOL_SEMANTICS = {
|
|
25782
25963
|
claude: CLAUDE_MATCHER,
|
|
25783
25964
|
"claude-cli": CLAUDE_MATCHER,
|
|
25784
25965
|
"claude-sdk": CLAUDE_MATCHER,
|
|
25785
|
-
|
|
25786
|
-
"pi-agent
|
|
25966
|
+
codex: CODEX_MATCHER,
|
|
25967
|
+
"pi-coding-agent": PI_CODING_AGENT_MATCHER,
|
|
25968
|
+
// pi-agent-sdk has no tools, so skill detection is a no-op. Kept for completeness.
|
|
25969
|
+
// TODO: consider removing pi-agent-sdk provider entirely.
|
|
25970
|
+
"pi-agent-sdk": PI_CODING_AGENT_MATCHER,
|
|
25787
25971
|
"copilot-cli": COPILOT_MATCHER,
|
|
25788
25972
|
"copilot-sdk": COPILOT_MATCHER,
|
|
25789
25973
|
vscode: COPILOT_MATCHER,
|
|
@@ -25807,33 +25991,37 @@ var SkillTriggerEvaluator = class {
|
|
|
25807
25991
|
const shouldTrigger = this.config.should_trigger !== false;
|
|
25808
25992
|
const providerKind = context2.provider?.kind;
|
|
25809
25993
|
const matcher = this.resolveMatcher(providerKind);
|
|
25810
|
-
const
|
|
25994
|
+
const allToolCalls = (context2.output ?? []).flatMap((msg) => msg.toolCalls ?? []);
|
|
25811
25995
|
let triggered = false;
|
|
25812
25996
|
let evidence = "";
|
|
25813
|
-
|
|
25814
|
-
const input =
|
|
25815
|
-
if (matcher.skillTools.includes(
|
|
25997
|
+
for (const toolCall of allToolCalls) {
|
|
25998
|
+
const input = toolCall.input ?? {};
|
|
25999
|
+
if (matcher.skillTools.includes(toolCall.tool)) {
|
|
25816
26000
|
const skillArg = String(input[matcher.skillInputField] ?? "");
|
|
25817
26001
|
if (skillArg.includes(skillName)) {
|
|
25818
26002
|
triggered = true;
|
|
25819
26003
|
evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
|
|
26004
|
+
break;
|
|
25820
26005
|
}
|
|
25821
26006
|
} else if (matcher.skillToolPrefixes?.some(
|
|
25822
|
-
(prefix) =>
|
|
26007
|
+
(prefix) => toolCall.tool.startsWith(prefix) && toolCall.tool.includes(skillName)
|
|
25823
26008
|
)) {
|
|
25824
26009
|
triggered = true;
|
|
25825
|
-
evidence = `Skill tool invoked via tool name "${
|
|
25826
|
-
|
|
26010
|
+
evidence = `Skill tool invoked via tool name "${toolCall.tool}"`;
|
|
26011
|
+
break;
|
|
26012
|
+
} else if (matcher.readTools.includes(toolCall.tool)) {
|
|
25827
26013
|
const filePath = this.readPathFromInput(input, matcher);
|
|
25828
26014
|
if (filePath.includes(skillName)) {
|
|
25829
26015
|
triggered = true;
|
|
25830
26016
|
evidence = `Read tool loaded skill file: ${filePath}`;
|
|
26017
|
+
break;
|
|
25831
26018
|
}
|
|
25832
26019
|
} else if (matcher.readToolPrefixes?.some(
|
|
25833
|
-
(prefix) =>
|
|
26020
|
+
(prefix) => toolCall.tool.startsWith(prefix) && toolCall.tool.includes(skillName)
|
|
25834
26021
|
)) {
|
|
25835
26022
|
triggered = true;
|
|
25836
|
-
evidence = `Read tool loaded skill file via tool name "${
|
|
26023
|
+
evidence = `Read tool loaded skill file via tool name "${toolCall.tool}"`;
|
|
26024
|
+
break;
|
|
25837
26025
|
}
|
|
25838
26026
|
}
|
|
25839
26027
|
const pass = triggered === shouldTrigger;
|
|
@@ -25855,7 +26043,7 @@ var SkillTriggerEvaluator = class {
|
|
|
25855
26043
|
verdict: "fail",
|
|
25856
26044
|
assertions: [
|
|
25857
26045
|
{
|
|
25858
|
-
text: shouldTrigger ?
|
|
26046
|
+
text: shouldTrigger ? allToolCalls.length > 0 ? `Skill "${skillName}" not found in ${allToolCalls.length} tool call(s)` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
|
|
25859
26047
|
passed: false
|
|
25860
26048
|
}
|
|
25861
26049
|
],
|
|
@@ -25901,7 +26089,7 @@ function assembleLlmGraderPrompt(input) {
|
|
|
25901
26089
|
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
|
|
25902
26090
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
25903
26091
|
const variables = {
|
|
25904
|
-
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.
|
|
26092
|
+
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2),
|
|
25905
26093
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
|
|
25906
26094
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
|
|
25907
26095
|
[TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
|
|
@@ -28115,6 +28303,18 @@ var QUALITY_PASS_THRESHOLD = 0.8;
|
|
|
28115
28303
|
function classifyQualityStatus(score) {
|
|
28116
28304
|
return score >= QUALITY_PASS_THRESHOLD ? "ok" : "quality_failure";
|
|
28117
28305
|
}
|
|
28306
|
+
function buildSkippedEvaluatorError(scores) {
|
|
28307
|
+
const skippedScores = scores?.filter((score) => score.verdict === "skip") ?? [];
|
|
28308
|
+
if (skippedScores.length === 0) {
|
|
28309
|
+
return void 0;
|
|
28310
|
+
}
|
|
28311
|
+
const messages = skippedScores.map((score) => {
|
|
28312
|
+
const label = score.name || score.type;
|
|
28313
|
+
const assertionMessage = score.assertions.find((assertion) => !assertion.passed)?.text ?? "Evaluator skipped";
|
|
28314
|
+
return `${label}: ${assertionMessage}`;
|
|
28315
|
+
});
|
|
28316
|
+
return messages.length === 1 ? messages[0] : `Evaluators skipped: ${messages.join(" | ")}`;
|
|
28317
|
+
}
|
|
28118
28318
|
function usesFileReferencePrompt(provider) {
|
|
28119
28319
|
return isAgentProvider(provider) || provider.kind === "cli";
|
|
28120
28320
|
}
|
|
@@ -29379,7 +29579,8 @@ async function runEvalCase(options) {
|
|
|
29379
29579
|
durationMs: totalDurationMs,
|
|
29380
29580
|
...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
|
|
29381
29581
|
};
|
|
29382
|
-
const
|
|
29582
|
+
const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
|
|
29583
|
+
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score);
|
|
29383
29584
|
const finalResult = providerError ? {
|
|
29384
29585
|
...result,
|
|
29385
29586
|
evalRun,
|
|
@@ -29391,7 +29592,26 @@ async function runEvalCase(options) {
|
|
|
29391
29592
|
beforeAllOutput,
|
|
29392
29593
|
beforeEachOutput,
|
|
29393
29594
|
afterEachOutput
|
|
29394
|
-
} :
|
|
29595
|
+
} : skippedEvaluatorError ? {
|
|
29596
|
+
...result,
|
|
29597
|
+
score: 0,
|
|
29598
|
+
evalRun,
|
|
29599
|
+
error: skippedEvaluatorError,
|
|
29600
|
+
executionStatus,
|
|
29601
|
+
failureStage: "evaluator",
|
|
29602
|
+
failureReasonCode: "evaluator_error",
|
|
29603
|
+
executionError: { message: skippedEvaluatorError, stage: "evaluator" },
|
|
29604
|
+
beforeAllOutput,
|
|
29605
|
+
beforeEachOutput,
|
|
29606
|
+
afterEachOutput
|
|
29607
|
+
} : {
|
|
29608
|
+
...result,
|
|
29609
|
+
evalRun,
|
|
29610
|
+
executionStatus,
|
|
29611
|
+
beforeAllOutput,
|
|
29612
|
+
beforeEachOutput,
|
|
29613
|
+
afterEachOutput
|
|
29614
|
+
};
|
|
29395
29615
|
const isFailure = !!finalResult.error || finalResult.score < 0.5;
|
|
29396
29616
|
if (workspacePath && !isSharedWorkspace) {
|
|
29397
29617
|
if (forceCleanup) {
|
|
@@ -30128,11 +30348,6 @@ async function evaluate(config) {
|
|
|
30128
30348
|
evalCases = (config.tests ?? []).map((test) => {
|
|
30129
30349
|
const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
|
|
30130
30350
|
const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
|
|
30131
|
-
const inputSegments = input.map((m) => ({
|
|
30132
|
-
type: "text",
|
|
30133
|
-
value: typeof m.content === "string" ? m.content : JSON.stringify(m.content),
|
|
30134
|
-
messageIndex: 0
|
|
30135
|
-
}));
|
|
30136
30351
|
const expectedOutputValue = test.expectedOutput ?? test.expected_output;
|
|
30137
30352
|
const expectedOutput = expectedOutputValue ? [
|
|
30138
30353
|
{ role: "assistant", content: expectedOutputValue }
|
|
@@ -30161,7 +30376,6 @@ async function evaluate(config) {
|
|
|
30161
30376
|
criteria: test.criteria ?? "",
|
|
30162
30377
|
question: String(question),
|
|
30163
30378
|
input,
|
|
30164
|
-
input_segments: inputSegments,
|
|
30165
30379
|
expected_output: expectedOutput,
|
|
30166
30380
|
reference_answer: expectedOutputValue,
|
|
30167
30381
|
file_paths: [],
|
|
@@ -31062,4 +31276,4 @@ export {
|
|
|
31062
31276
|
OtelStreamingObserver,
|
|
31063
31277
|
createAgentKernel
|
|
31064
31278
|
};
|
|
31065
|
-
//# sourceMappingURL=chunk-
|
|
31279
|
+
//# sourceMappingURL=chunk-EZGWZVVK.js.map
|