agentv 3.10.2 → 3.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/dist/{chunk-6UE665XI.js → chunk-ETMDLQ72.js} +1141 -60
  2. package/dist/chunk-ETMDLQ72.js.map +1 -0
  3. package/dist/{chunk-KGK5NUFG.js → chunk-EZGWZVVK.js} +377 -163
  4. package/dist/chunk-EZGWZVVK.js.map +1 -0
  5. package/dist/{chunk-F7LAJMTO.js → chunk-JEW3FEO7.js} +68 -32
  6. package/dist/chunk-JEW3FEO7.js.map +1 -0
  7. package/dist/cli.js +3 -3
  8. package/dist/{dist-3QUJEJUT.js → dist-QERRYDSC.js} +2 -2
  9. package/dist/index.js +3 -3
  10. package/dist/{interactive-EO6AR2R3.js → interactive-AD4PRYDN.js} +3 -3
  11. package/package.json +3 -1
  12. package/dist/chunk-6UE665XI.js.map +0 -1
  13. package/dist/chunk-F7LAJMTO.js.map +0 -1
  14. package/dist/chunk-KGK5NUFG.js.map +0 -1
  15. package/dist/templates/.agents/skills/agentv-chat-to-eval/README.md +0 -84
  16. package/dist/templates/.agents/skills/agentv-chat-to-eval/SKILL.md +0 -144
  17. package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-json.md +0 -67
  18. package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-markdown.md +0 -101
  19. package/dist/templates/.agents/skills/agentv-eval-builder/SKILL.md +0 -458
  20. package/dist/templates/.agents/skills/agentv-eval-builder/references/config-schema.json +0 -36
  21. package/dist/templates/.agents/skills/agentv-eval-builder/references/custom-evaluators.md +0 -118
  22. package/dist/templates/.agents/skills/agentv-eval-builder/references/eval-schema.json +0 -12753
  23. package/dist/templates/.agents/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -77
  24. package/dist/templates/.agents/skills/agentv-eval-orchestrator/SKILL.md +0 -50
  25. package/dist/templates/.agents/skills/agentv-prompt-optimizer/SKILL.md +0 -78
  26. package/dist/templates/.agentv/.env.example +0 -25
  27. package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +0 -177
  28. package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +0 -316
  29. package/dist/templates/.claude/skills/agentv-eval-builder/references/compare-command.md +0 -137
  30. package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +0 -215
  31. package/dist/templates/.claude/skills/agentv-eval-builder/references/config-schema.json +0 -27
  32. package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +0 -115
  33. package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +0 -278
  34. package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +0 -333
  35. package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -79
  36. package/dist/templates/.claude/skills/agentv-eval-builder/references/structured-data-evaluators.md +0 -121
  37. package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +0 -298
  38. package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +0 -78
  39. package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +0 -5
  40. package/dist/templates/.github/prompts/agentv-optimize.prompt.md +0 -4
  41. /package/dist/{dist-3QUJEJUT.js.map → dist-QERRYDSC.js.map} +0 -0
  42. /package/dist/{interactive-EO6AR2R3.js.map → interactive-AD4PRYDN.js.map} +0 -0
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
301
301
  }
302
302
  });
303
303
 
304
- // ../../packages/core/dist/chunk-VCFYWLFV.js
304
+ // ../../packages/core/dist/chunk-AVTN5AB7.js
305
305
  import { constants } from "node:fs";
306
306
  import { access, readFile } from "node:fs/promises";
307
307
  import path from "node:path";
@@ -419,7 +419,7 @@ __export(external_exports2, {
419
419
  void: () => voidType
420
420
  });
421
421
 
422
- // ../../packages/core/dist/chunk-VCFYWLFV.js
422
+ // ../../packages/core/dist/chunk-AVTN5AB7.js
423
423
  import { readFile as readFile2 } from "node:fs/promises";
424
424
  import path3 from "node:path";
425
425
  import fg from "fast-glob";
@@ -1363,7 +1363,7 @@ function normalizeCopilotLogFormat(value) {
1363
1363
  }
1364
1364
  function resolvePiCodingAgentConfig(target, env, evalFilePath) {
1365
1365
  const executableSource = target.executable ?? target.command ?? target.binary;
1366
- const providerSource = target.pi_provider ?? target.piProvider ?? target.llm_provider;
1366
+ const subproviderSource = target.subprovider;
1367
1367
  const modelSource = target.model ?? target.pi_model ?? target.piModel;
1368
1368
  const apiKeySource = target.api_key ?? target.apiKey;
1369
1369
  const toolsSource = target.tools ?? target.pi_tools ?? target.piTools;
@@ -1379,10 +1379,15 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
1379
1379
  allowLiteral: true,
1380
1380
  optionalEnv: true
1381
1381
  }) ?? "pi";
1382
- const provider = resolveOptionalString(providerSource, env, `${target.name} pi provider`, {
1383
- allowLiteral: true,
1384
- optionalEnv: true
1385
- });
1382
+ const subprovider = resolveOptionalString(
1383
+ subproviderSource,
1384
+ env,
1385
+ `${target.name} pi subprovider`,
1386
+ {
1387
+ allowLiteral: true,
1388
+ optionalEnv: true
1389
+ }
1390
+ );
1386
1391
  const model = resolveOptionalString(modelSource, env, `${target.name} pi model`, {
1387
1392
  allowLiteral: true,
1388
1393
  optionalEnv: true
@@ -1430,7 +1435,7 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
1430
1435
  const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
1431
1436
  return {
1432
1437
  executable,
1433
- provider,
1438
+ subprovider,
1434
1439
  model,
1435
1440
  apiKey,
1436
1441
  tools,
@@ -1445,15 +1450,15 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
1445
1450
  };
1446
1451
  }
1447
1452
  function resolvePiAgentSdkConfig(target, env) {
1448
- const providerSource = target.pi_provider ?? target.piProvider ?? target.llm_provider;
1453
+ const subproviderSource = target.subprovider;
1449
1454
  const modelSource = target.model ?? target.pi_model ?? target.piModel;
1450
1455
  const apiKeySource = target.api_key ?? target.apiKey;
1451
1456
  const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
1452
1457
  const systemPromptSource = target.system_prompt ?? target.systemPrompt;
1453
- const provider = resolveOptionalString(
1454
- providerSource,
1458
+ const subprovider = resolveOptionalString(
1459
+ subproviderSource,
1455
1460
  env,
1456
- `${target.name} pi-agent-sdk provider`,
1461
+ `${target.name} pi-agent-sdk subprovider`,
1457
1462
  {
1458
1463
  allowLiteral: true,
1459
1464
  optionalEnv: true
@@ -1470,7 +1475,7 @@ function resolvePiAgentSdkConfig(target, env) {
1470
1475
  const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi-agent-sdk timeout`);
1471
1476
  const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
1472
1477
  return {
1473
- provider,
1478
+ subprovider,
1474
1479
  model,
1475
1480
  apiKey,
1476
1481
  timeoutMs,
@@ -2039,7 +2044,7 @@ import path8 from "node:path";
2039
2044
  import { parse as parse3 } from "yaml";
2040
2045
  import { createOpenAI } from "@ai-sdk/openai";
2041
2046
 
2042
- // ../../node_modules/.bun/@openrouter+ai-sdk-provider@2.3.1+3ab978b6804fd9e7/node_modules/@openrouter/ai-sdk-provider/dist/index.mjs
2047
+ // ../../node_modules/.bun/@openrouter+ai-sdk-provider@2.3.3+3ab978b6804fd9e7/node_modules/@openrouter/ai-sdk-provider/dist/index.mjs
2043
2048
  var __defProp = Object.defineProperty;
2044
2049
  var __defProps = Object.defineProperties;
2045
2050
  var __getOwnPropDescs = Object.getOwnPropertyDescriptors;
@@ -4202,11 +4207,13 @@ function isDefinedOrNotNull(value) {
4202
4207
  var ReasoningFormat = /* @__PURE__ */ ((ReasoningFormat2) => {
4203
4208
  ReasoningFormat2["Unknown"] = "unknown";
4204
4209
  ReasoningFormat2["OpenAIResponsesV1"] = "openai-responses-v1";
4210
+ ReasoningFormat2["AzureOpenAIResponsesV1"] = "azure-openai-responses-v1";
4205
4211
  ReasoningFormat2["XAIResponsesV1"] = "xai-responses-v1";
4206
4212
  ReasoningFormat2["AnthropicClaudeV1"] = "anthropic-claude-v1";
4207
4213
  ReasoningFormat2["GoogleGeminiV1"] = "google-gemini-v1";
4208
4214
  return ReasoningFormat2;
4209
4215
  })(ReasoningFormat || {});
4216
+ var DEFAULT_REASONING_FORMAT = "anthropic-claude-v1";
4210
4217
  var CommonReasoningDetailSchema = external_exports.object({
4211
4218
  id: external_exports.string().nullish(),
4212
4219
  format: external_exports.enum(ReasoningFormat).nullish(),
@@ -4360,7 +4367,11 @@ var OpenRouterProviderMetadataSchema = external_exports.object({
4360
4367
  }).catchall(external_exports.any());
4361
4368
  var OpenRouterProviderOptionsSchema = external_exports.object({
4362
4369
  openrouter: external_exports.object({
4363
- reasoning_details: external_exports.array(ReasoningDetailUnionSchema).optional(),
4370
+ // Use ReasoningDetailArraySchema (with unknown fallback) instead of
4371
+ // z.array(ReasoningDetailUnionSchema) so that a single malformed entry
4372
+ // (e.g., a future format not yet in the enum) is individually dropped
4373
+ // rather than causing the entire array to fail parsing.
4374
+ reasoning_details: ReasoningDetailArraySchema.optional(),
4364
4375
  annotations: external_exports.array(FileAnnotationSchema).optional()
4365
4376
  }).optional()
4366
4377
  }).optional();
@@ -4758,8 +4769,24 @@ function convertToOpenRouterChatMessages(prompt) {
4758
4769
  const candidateReasoningDetails = messageReasoningDetails && Array.isArray(messageReasoningDetails) && messageReasoningDetails.length > 0 ? messageReasoningDetails : findFirstReasoningDetails(content);
4759
4770
  let finalReasoningDetails;
4760
4771
  if (candidateReasoningDetails && candidateReasoningDetails.length > 0) {
4772
+ const validDetails = candidateReasoningDetails.filter((detail) => {
4773
+ var _a173;
4774
+ if (detail.type !== "reasoning.text") {
4775
+ return true;
4776
+ }
4777
+ const format = (_a173 = detail.format) != null ? _a173 : DEFAULT_REASONING_FORMAT;
4778
+ if (format !== "anthropic-claude-v1") {
4779
+ return true;
4780
+ }
4781
+ return !!detail.signature;
4782
+ });
4783
+ if (validDetails.length < candidateReasoningDetails.length) {
4784
+ console.warn(
4785
+ "[openrouter] Some reasoning_details entries were removed because they were missing signatures. See https://github.com/OpenRouterTeam/ai-sdk-provider/issues/423 for more details."
4786
+ );
4787
+ }
4761
4788
  const uniqueDetails = [];
4762
- for (const detail of candidateReasoningDetails) {
4789
+ for (const detail of validDetails) {
4763
4790
  if (reasoningDetailsTracker.upsert(detail)) {
4764
4791
  uniqueDetails.push(detail);
4765
4792
  }
@@ -4808,20 +4835,135 @@ function getToolResultContent(input) {
4808
4835
  return input.output.value;
4809
4836
  case "json":
4810
4837
  case "error-json":
4811
- case "content":
4812
4838
  return JSON.stringify(input.output.value);
4839
+ case "content":
4840
+ return mapToolResultContentParts(input.output.value);
4813
4841
  case "execution-denied":
4814
4842
  return (_a163 = input.output.reason) != null ? _a163 : "Tool execution denied";
4815
4843
  }
4816
4844
  }
4845
+ function mapToolResultContentParts(parts) {
4846
+ return parts.map((part) => {
4847
+ var _a163, _b162, _c;
4848
+ switch (part.type) {
4849
+ case "text":
4850
+ return { type: "text", text: part.text };
4851
+ case "image-data":
4852
+ return {
4853
+ type: "image_url",
4854
+ image_url: {
4855
+ url: buildFileDataUrl({
4856
+ data: part.data,
4857
+ mediaType: part.mediaType,
4858
+ defaultMediaType: "image/jpeg"
4859
+ })
4860
+ }
4861
+ };
4862
+ case "image-url":
4863
+ return {
4864
+ type: "image_url",
4865
+ image_url: { url: part.url }
4866
+ };
4867
+ case "file-data": {
4868
+ const dataUrl = buildFileDataUrl({
4869
+ data: part.data,
4870
+ mediaType: part.mediaType,
4871
+ defaultMediaType: "application/octet-stream"
4872
+ });
4873
+ if ((_a163 = part.mediaType) == null ? void 0 : _a163.startsWith("image/")) {
4874
+ return {
4875
+ type: "image_url",
4876
+ image_url: { url: dataUrl }
4877
+ };
4878
+ }
4879
+ if ((_b162 = part.mediaType) == null ? void 0 : _b162.startsWith("audio/")) {
4880
+ const rawFormat = part.mediaType.replace("audio/", "");
4881
+ const format = MIME_TO_FORMAT[rawFormat];
4882
+ if (format !== void 0) {
4883
+ return {
4884
+ type: "input_audio",
4885
+ input_audio: {
4886
+ data: getBase64FromDataUrl(dataUrl),
4887
+ format
4888
+ }
4889
+ };
4890
+ }
4891
+ }
4892
+ return {
4893
+ type: "file",
4894
+ file: {
4895
+ filename: (_c = part.filename) != null ? _c : "",
4896
+ file_data: dataUrl
4897
+ }
4898
+ };
4899
+ }
4900
+ case "file-url": {
4901
+ if (looksLikeImageUrl(part.url)) {
4902
+ return {
4903
+ type: "image_url",
4904
+ image_url: { url: part.url }
4905
+ };
4906
+ }
4907
+ return {
4908
+ type: "file",
4909
+ file: {
4910
+ filename: filenameFromUrl(part.url),
4911
+ file_data: part.url
4912
+ }
4913
+ };
4914
+ }
4915
+ case "file-id":
4916
+ case "image-file-id":
4917
+ case "custom":
4918
+ return { type: "text", text: JSON.stringify(part) };
4919
+ default: {
4920
+ const _exhaustiveCheck = part;
4921
+ return { type: "text", text: JSON.stringify(_exhaustiveCheck) };
4922
+ }
4923
+ }
4924
+ });
4925
+ }
4926
+ var IMAGE_EXTENSIONS = /* @__PURE__ */ new Set([
4927
+ "jpg",
4928
+ "jpeg",
4929
+ "png",
4930
+ "gif",
4931
+ "webp",
4932
+ "svg",
4933
+ "bmp",
4934
+ "ico",
4935
+ "tif",
4936
+ "tiff",
4937
+ "avif"
4938
+ ]);
4939
+ function looksLikeImageUrl(url) {
4940
+ var _a163;
4941
+ try {
4942
+ const pathname = new URL(url).pathname;
4943
+ const ext = (_a163 = pathname.split(".").pop()) == null ? void 0 : _a163.toLowerCase();
4944
+ return ext !== void 0 && IMAGE_EXTENSIONS.has(ext);
4945
+ } catch (e) {
4946
+ return false;
4947
+ }
4948
+ }
4949
+ function filenameFromUrl(url) {
4950
+ try {
4951
+ const pathname = new URL(url).pathname;
4952
+ const last = pathname.split("/").pop();
4953
+ return (last == null ? void 0 : last.includes(".")) ? last : "";
4954
+ } catch (e) {
4955
+ return "";
4956
+ }
4957
+ }
4817
4958
  function findFirstReasoningDetails(content) {
4818
- var _a163, _b162, _c;
4959
+ var _a163, _b162, _c, _d;
4819
4960
  for (const part of content) {
4820
4961
  if (part.type === "tool-call") {
4821
- const openrouter2 = (_a163 = part.providerOptions) == null ? void 0 : _a163.openrouter;
4822
- const details = openrouter2 == null ? void 0 : openrouter2.reasoning_details;
4823
- if (Array.isArray(details) && details.length > 0) {
4824
- return details;
4962
+ const parsed = OpenRouterProviderOptionsSchema.safeParse(
4963
+ part.providerOptions
4964
+ );
4965
+ if (parsed.success && ((_b162 = (_a163 = parsed.data) == null ? void 0 : _a163.openrouter) == null ? void 0 : _b162.reasoning_details) && parsed.data.openrouter.reasoning_details.length > 0) {
4966
+ return parsed.data.openrouter.reasoning_details;
4825
4967
  }
4826
4968
  }
4827
4969
  }
@@ -4830,7 +4972,7 @@ function findFirstReasoningDetails(content) {
4830
4972
  const parsed = OpenRouterProviderOptionsSchema.safeParse(
4831
4973
  part.providerOptions
4832
4974
  );
4833
- if (parsed.success && ((_c = (_b162 = parsed.data) == null ? void 0 : _b162.openrouter) == null ? void 0 : _c.reasoning_details) && parsed.data.openrouter.reasoning_details.length > 0) {
4975
+ if (parsed.success && ((_d = (_c = parsed.data) == null ? void 0 : _c.openrouter) == null ? void 0 : _d.reasoning_details) && parsed.data.openrouter.reasoning_details.length > 0) {
4834
4976
  return parsed.data.openrouter.reasoning_details;
4835
4977
  }
4836
4978
  }
@@ -6490,7 +6632,7 @@ function withUserAgentSuffix22(headers, ...userAgentSuffixParts) {
6490
6632
  "user-agent": userAgent
6491
6633
  });
6492
6634
  }
6493
- var VERSION2 = false ? "0.0.0-test" : "2.3.1";
6635
+ var VERSION2 = false ? "0.0.0-test" : "2.3.3";
6494
6636
  function createOpenRouter(options = {}) {
6495
6637
  var _a163, _b162, _c;
6496
6638
  const baseURL = (_b162 = withoutTrailingSlash2((_a163 = options.baseURL) != null ? _a163 : options.baseUrl)) != null ? _b162 : "https://openrouter.ai/api/v1";
@@ -14227,6 +14369,62 @@ function mergeExecutionMetrics(computed, metrics) {
14227
14369
  endTime: metrics.endTime ?? computed.endTime
14228
14370
  };
14229
14371
  }
14372
+ function flattenInputMessages(messages) {
14373
+ return messages.flatMap((message) => extractContentSegments(message.content));
14374
+ }
14375
+ function collectResolvedInputFilePaths(messages) {
14376
+ const filePaths = [];
14377
+ for (const message of messages) {
14378
+ if (!Array.isArray(message.content)) {
14379
+ continue;
14380
+ }
14381
+ for (const segment of message.content) {
14382
+ if (isJsonObject(segment) && segment.type === "file" && typeof segment.resolvedPath === "string") {
14383
+ filePaths.push(segment.resolvedPath);
14384
+ }
14385
+ }
14386
+ }
14387
+ return filePaths;
14388
+ }
14389
+ function extractContentSegments(content) {
14390
+ if (typeof content === "string") {
14391
+ return content.trim().length > 0 ? [{ type: "text", value: content }] : [];
14392
+ }
14393
+ if (isJsonObject(content)) {
14394
+ const rendered = JSON.stringify(content, null, 2);
14395
+ return rendered.trim().length > 0 ? [{ type: "text", value: rendered }] : [];
14396
+ }
14397
+ if (!Array.isArray(content)) {
14398
+ return [];
14399
+ }
14400
+ const segments = [];
14401
+ for (const segment of content) {
14402
+ if (!isJsonObject(segment)) {
14403
+ continue;
14404
+ }
14405
+ segments.push(cloneJsonObject(segment));
14406
+ }
14407
+ return segments;
14408
+ }
14409
+ function cloneJsonObject(source) {
14410
+ const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
14411
+ return Object.fromEntries(entries);
14412
+ }
14413
+ function cloneJsonValue(value) {
14414
+ if (value === null) {
14415
+ return null;
14416
+ }
14417
+ if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
14418
+ return value;
14419
+ }
14420
+ if (Array.isArray(value)) {
14421
+ return value.map((item) => cloneJsonValue(item));
14422
+ }
14423
+ if (typeof value === "object") {
14424
+ return cloneJsonObject(value);
14425
+ }
14426
+ return value;
14427
+ }
14230
14428
  var ANSI_RED = "\x1B[31m";
14231
14429
  var ANSI_RESET2 = "\x1B[0m";
14232
14430
  function logError(msg) {
@@ -14292,7 +14490,6 @@ function parseAgentSkillsEvals(parsed, source = "evals.json", baseDir) {
14292
14490
  id: String(id),
14293
14491
  question: prompt,
14294
14492
  input: [{ role: "user", content: prompt }],
14295
- input_segments: [{ type: "text", value: prompt }],
14296
14493
  expected_output: evalCase.expected_output ? [{ role: "assistant", content: evalCase.expected_output }] : [],
14297
14494
  reference_answer: evalCase.expected_output,
14298
14495
  file_paths: filePaths,
@@ -14414,7 +14611,7 @@ async function loadConfig(evalFilePath, repoRoot) {
14414
14611
  }
14415
14612
  try {
14416
14613
  const rawConfig = await readFile22(configPath, "utf8");
14417
- const parsed = parse(rawConfig);
14614
+ const parsed = interpolateEnv(parse(rawConfig), process.env);
14418
14615
  if (!isJsonObject(parsed)) {
14419
14616
  logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
14420
14617
  continue;
@@ -14632,6 +14829,27 @@ function parseExecutionDefaults(raw, configPath) {
14632
14829
  } else if (otelFile !== void 0) {
14633
14830
  logWarning(`Invalid execution.otel_file in ${configPath}, expected non-empty string`);
14634
14831
  }
14832
+ if (typeof obj.export_otel === "boolean") {
14833
+ result.export_otel = obj.export_otel;
14834
+ } else if (obj.export_otel !== void 0) {
14835
+ logWarning(`Invalid execution.export_otel in ${configPath}, expected boolean`);
14836
+ }
14837
+ const otelBackend = obj.otel_backend;
14838
+ if (typeof otelBackend === "string" && otelBackend.trim().length > 0) {
14839
+ result.otel_backend = otelBackend.trim();
14840
+ } else if (otelBackend !== void 0) {
14841
+ logWarning(`Invalid execution.otel_backend in ${configPath}, expected non-empty string`);
14842
+ }
14843
+ if (typeof obj.otel_capture_content === "boolean") {
14844
+ result.otel_capture_content = obj.otel_capture_content;
14845
+ } else if (obj.otel_capture_content !== void 0) {
14846
+ logWarning(`Invalid execution.otel_capture_content in ${configPath}, expected boolean`);
14847
+ }
14848
+ if (typeof obj.otel_group_turns === "boolean") {
14849
+ result.otel_group_turns = obj.otel_group_turns;
14850
+ } else if (obj.otel_group_turns !== void 0) {
14851
+ logWarning(`Invalid execution.otel_group_turns in ${configPath}, expected boolean`);
14852
+ }
14635
14853
  if (typeof obj.pool_workspaces === "boolean") {
14636
14854
  result.pool_workspaces = obj.pool_workspaces;
14637
14855
  } else if (obj.pool_workspaces !== void 0) {
@@ -16076,27 +16294,28 @@ var ANSI_YELLOW4 = "\x1B[33m";
16076
16294
  var ANSI_RESET5 = "\x1B[0m";
16077
16295
  async function processMessages(options) {
16078
16296
  const { messages, searchRoots, repoRootPath, textParts, messageType, verbose } = options;
16079
- const segments = [];
16297
+ const processedMessages = [];
16080
16298
  for (const message of messages) {
16081
16299
  const content = message.content;
16082
16300
  if (typeof content === "string") {
16083
- segments.push({ type: "text", value: content });
16084
16301
  if (textParts) {
16085
16302
  textParts.push(content);
16086
16303
  }
16304
+ processedMessages.push({ ...message, content });
16087
16305
  continue;
16088
16306
  }
16089
16307
  if (isJsonObject(content)) {
16090
16308
  const rendered = JSON.stringify(content, null, 2);
16091
- segments.push({ type: "text", value: rendered });
16092
16309
  if (textParts) {
16093
16310
  textParts.push(rendered);
16094
16311
  }
16312
+ processedMessages.push({ ...message, content: cloneJsonObject(content) });
16095
16313
  continue;
16096
16314
  }
16097
16315
  if (!Array.isArray(content)) {
16098
16316
  continue;
16099
16317
  }
16318
+ const processedContent = [];
16100
16319
  for (const rawSegment of content) {
16101
16320
  if (!isJsonObject(rawSegment)) {
16102
16321
  continue;
@@ -16119,8 +16338,8 @@ async function processMessages(options) {
16119
16338
  }
16120
16339
  try {
16121
16340
  const fileContent = (await readFile4(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
16122
- segments.push({
16123
- type: "file",
16341
+ processedContent.push({
16342
+ ...cloneJsonObject(rawSegment),
16124
16343
  path: displayPath,
16125
16344
  text: fileContent,
16126
16345
  resolvedPath: path5.resolve(resolvedPath)
@@ -16137,37 +16356,19 @@ async function processMessages(options) {
16137
16356
  continue;
16138
16357
  }
16139
16358
  const clonedSegment = cloneJsonObject(rawSegment);
16140
- segments.push(clonedSegment);
16359
+ processedContent.push(clonedSegment);
16141
16360
  const inlineValue = clonedSegment.value;
16142
16361
  if (typeof inlineValue === "string" && textParts) {
16143
16362
  textParts.push(inlineValue);
16144
16363
  }
16145
16364
  }
16365
+ processedMessages.push({ ...message, content: processedContent });
16146
16366
  }
16147
- return segments;
16367
+ return processedMessages;
16148
16368
  }
16149
16369
  function asString3(value) {
16150
16370
  return typeof value === "string" ? value : void 0;
16151
16371
  }
16152
- function cloneJsonObject(source) {
16153
- const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
16154
- return Object.fromEntries(entries);
16155
- }
16156
- function cloneJsonValue(value) {
16157
- if (value === null) {
16158
- return null;
16159
- }
16160
- if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
16161
- return value;
16162
- }
16163
- if (Array.isArray(value)) {
16164
- return value.map((item) => cloneJsonValue(item));
16165
- }
16166
- if (typeof value === "object") {
16167
- return cloneJsonObject(value);
16168
- }
16169
- return value;
16170
- }
16171
16372
  function logWarning3(message, details) {
16172
16373
  if (details && details.length > 0) {
16173
16374
  const detailBlock = details.join("\n");
@@ -16412,10 +16613,10 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
16412
16613
  );
16413
16614
  }
16414
16615
  }
16415
- const inputMessages = resolveInputMessages(evalcase);
16616
+ const rawInputMessages = resolveInputMessages(evalcase);
16416
16617
  const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
16417
16618
  const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
16418
- if (!id || !hasEvaluationSpec || !inputMessages || inputMessages.length === 0) {
16619
+ if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
16419
16620
  logError2(
16420
16621
  `Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
16421
16622
  );
@@ -16423,8 +16624,8 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
16423
16624
  }
16424
16625
  const hasExpectedMessages = expectedMessages.length > 0;
16425
16626
  const inputTextParts = [];
16426
- const inputSegments = await processMessages({
16427
- messages: inputMessages,
16627
+ const inputMessages = await processMessages({
16628
+ messages: rawInputMessages,
16428
16629
  searchRoots,
16429
16630
  repoRootPath,
16430
16631
  textParts: inputTextParts,
@@ -16470,19 +16671,13 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
16470
16671
  }
16471
16672
  }
16472
16673
  warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
16473
- const userFilePaths = [];
16474
- for (const segment of inputSegments) {
16475
- if (segment.type === "file" && typeof segment.resolvedPath === "string") {
16476
- userFilePaths.push(segment.resolvedPath);
16477
- }
16478
- }
16674
+ const userFilePaths = collectResolvedInputFilePaths(inputMessages);
16479
16675
  const testCase = {
16480
16676
  id,
16481
16677
  eval_set: evalSetName,
16482
16678
  conversation_id: conversationId,
16483
16679
  question,
16484
16680
  input: inputMessages,
16485
- input_segments: inputSegments,
16486
16681
  expected_output: outputSegments,
16487
16682
  reference_answer: referenceAnswer,
16488
16683
  file_paths: userFilePaths,
@@ -16543,50 +16738,9 @@ function parseMetadata(suite) {
16543
16738
  });
16544
16739
  }
16545
16740
  async function buildPromptInputs(testCase, mode = "lm") {
16546
- const segmentsByMessage = [];
16547
- const fileContentsByPath = /* @__PURE__ */ new Map();
16548
- for (const segment of testCase.input_segments) {
16549
- if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
16550
- fileContentsByPath.set(segment.path, segment.text);
16551
- }
16552
- }
16553
- for (const message of testCase.input) {
16554
- const messageSegments = [];
16555
- if (typeof message.content === "string") {
16556
- if (message.content.trim().length > 0) {
16557
- messageSegments.push({ type: "text", value: message.content });
16558
- }
16559
- } else if (Array.isArray(message.content)) {
16560
- for (const segment of message.content) {
16561
- if (typeof segment === "string") {
16562
- if (segment.trim().length > 0) {
16563
- messageSegments.push({ type: "text", value: segment });
16564
- }
16565
- } else if (isJsonObject(segment)) {
16566
- const type = asString5(segment.type);
16567
- if (type === "file") {
16568
- const value = asString5(segment.value);
16569
- if (!value) continue;
16570
- const fileText = fileContentsByPath.get(value);
16571
- if (fileText !== void 0) {
16572
- messageSegments.push({ type: "file", text: fileText, path: value });
16573
- }
16574
- } else if (type === "text") {
16575
- const textValue = asString5(segment.value);
16576
- if (textValue && textValue.trim().length > 0) {
16577
- messageSegments.push({ type: "text", value: textValue });
16578
- }
16579
- }
16580
- }
16581
- }
16582
- } else if (isJsonObject(message.content)) {
16583
- const rendered = JSON.stringify(message.content, null, 2);
16584
- if (rendered.trim().length > 0) {
16585
- messageSegments.push({ type: "text", value: rendered });
16586
- }
16587
- }
16588
- segmentsByMessage.push(messageSegments);
16589
- }
16741
+ const segmentsByMessage = testCase.input.map(
16742
+ (message) => extractContentSegments(message.content)
16743
+ );
16590
16744
  const useRoleMarkers = needsRoleMarkers(testCase.input, segmentsByMessage);
16591
16745
  let question;
16592
16746
  if (useRoleMarkers) {
@@ -16614,7 +16768,7 @@ ${messageContent}`);
16614
16768
  question = messageParts.join("\n\n");
16615
16769
  } else {
16616
16770
  const questionParts = [];
16617
- for (const segment of testCase.input_segments) {
16771
+ for (const segment of flattenInputMessages(testCase.input)) {
16618
16772
  const formattedContent = formatSegment(segment, mode);
16619
16773
  if (formattedContent) {
16620
16774
  questionParts.push(formattedContent);
@@ -16701,9 +16855,6 @@ function buildChatPromptFromSegments(options) {
16701
16855
  }
16702
16856
  return chatPrompt.length > 0 ? chatPrompt : void 0;
16703
16857
  }
16704
- function asString5(value) {
16705
- return typeof value === "string" ? value : void 0;
16706
- }
16707
16858
  var ANSI_YELLOW6 = "\x1B[33m";
16708
16859
  var ANSI_RED3 = "\x1B[31m";
16709
16860
  var ANSI_RESET7 = "\x1B[0m";
@@ -16784,7 +16935,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
16784
16935
  throw new Error(`Invalid test file format: ${evalFilePath}`);
16785
16936
  }
16786
16937
  const suite = interpolated;
16787
- const evalSetNameFromSuite = asString6(suite.name)?.trim();
16938
+ const evalSetNameFromSuite = asString5(suite.name)?.trim();
16788
16939
  const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
16789
16940
  const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
16790
16941
  const rawTestcases = resolveTests(suite);
@@ -16803,7 +16954,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
16803
16954
  const suiteInputMessages = expandInputShorthand(suite.input);
16804
16955
  const suiteInputFiles = suite.input_files;
16805
16956
  const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
16806
- const _globalTarget = asString6(rawGlobalExecution?.target) ?? asString6(suite.target);
16957
+ const _globalTarget = asString5(rawGlobalExecution?.target) ?? asString5(suite.target);
16807
16958
  const suiteAssertions = suite.assertions ?? suite.assert;
16808
16959
  if (suite.assert !== void 0 && suite.assertions === void 0) {
16809
16960
  logWarning5("'assert' is deprecated at the suite level. Use 'assertions' instead.");
@@ -16816,17 +16967,17 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
16816
16967
  continue;
16817
16968
  }
16818
16969
  const evalcase = rawEvalcase;
16819
- const id = asString6(evalcase.id);
16970
+ const id = asString5(evalcase.id);
16820
16971
  if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
16821
16972
  continue;
16822
16973
  }
16823
- const conversationId = asString6(evalcase.conversation_id);
16824
- let outcome = asString6(evalcase.criteria);
16974
+ const conversationId = asString5(evalcase.conversation_id);
16975
+ let outcome = asString5(evalcase.criteria);
16825
16976
  if (!outcome && evalcase.expected_outcome !== void 0) {
16826
- outcome = asString6(evalcase.expected_outcome);
16977
+ outcome = asString5(evalcase.expected_outcome);
16827
16978
  if (outcome) {
16828
16979
  logWarning5(
16829
- `Test '${asString6(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
16980
+ `Test '${asString5(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
16830
16981
  );
16831
16982
  }
16832
16983
  }
@@ -16843,10 +16994,9 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
16843
16994
  continue;
16844
16995
  }
16845
16996
  const effectiveSuiteInputMessages = suiteInputMessages && !skipDefaults ? suiteInputMessages : void 0;
16846
- const inputMessages = effectiveSuiteInputMessages ? [...effectiveSuiteInputMessages, ...testInputMessages] : testInputMessages;
16847
16997
  const hasExpectedMessages = expectedMessages.length > 0;
16848
16998
  const inputTextParts = [];
16849
- const suiteInputSegments = effectiveSuiteInputMessages ? await processMessages({
16999
+ const suiteResolvedInputMessages = effectiveSuiteInputMessages ? await processMessages({
16850
17000
  messages: effectiveSuiteInputMessages,
16851
17001
  searchRoots,
16852
17002
  repoRootPath,
@@ -16854,7 +17004,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
16854
17004
  messageType: "input",
16855
17005
  verbose
16856
17006
  }) : [];
16857
- const testInputSegments = await processMessages({
17007
+ const testResolvedInputMessages = await processMessages({
16858
17008
  messages: testInputMessages,
16859
17009
  searchRoots,
16860
17010
  repoRootPath,
@@ -16862,7 +17012,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
16862
17012
  messageType: "input",
16863
17013
  verbose
16864
17014
  });
16865
- const inputSegments = [...suiteInputSegments, ...testInputSegments];
17015
+ const inputMessages = [...suiteResolvedInputMessages, ...testResolvedInputMessages];
16866
17016
  const outputSegments = hasExpectedMessages ? await processExpectedMessages({
16867
17017
  messages: expectedMessages,
16868
17018
  searchRoots,
@@ -16900,12 +17050,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
16900
17050
  }
16901
17051
  }
16902
17052
  warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
16903
- const userFilePaths = [];
16904
- for (const segment of inputSegments) {
16905
- if (segment.type === "file" && typeof segment.resolvedPath === "string") {
16906
- userFilePaths.push(segment.resolvedPath);
16907
- }
16908
- }
17053
+ const userFilePaths = collectResolvedInputFilePaths(inputMessages);
16909
17054
  const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
16910
17055
  const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
16911
17056
  const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
@@ -16916,7 +17061,6 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
16916
17061
  conversation_id: conversationId,
16917
17062
  question,
16918
17063
  input: inputMessages,
16919
- input_segments: inputSegments,
16920
17064
  expected_output: outputSegments,
16921
17065
  reference_answer: referenceAnswer,
16922
17066
  file_paths: userFilePaths,
@@ -17125,7 +17269,7 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
17125
17269
  path: caseLevel.path ?? suiteLevel.path
17126
17270
  };
17127
17271
  }
17128
- function asString6(value) {
17272
+ function asString5(value) {
17129
17273
  return typeof value === "string" ? value : void 0;
17130
17274
  }
17131
17275
  function logWarning5(message, details) {
@@ -19552,7 +19696,7 @@ ${basePrompt}` : basePrompt;
19552
19696
  if (itemType === "command_execution") {
19553
19697
  completedToolCalls.push({
19554
19698
  tool: "command_execution",
19555
- input: item.command,
19699
+ input: { command: item.command },
19556
19700
  output: item.aggregated_output,
19557
19701
  id: item.id
19558
19702
  });
@@ -20383,11 +20527,22 @@ async function loadCopilotSdk() {
20383
20527
  try {
20384
20528
  copilotSdkModule = await import("@github/copilot-sdk");
20385
20529
  } catch (error) {
20530
+ const message = error instanceof Error ? error.message : String(error);
20531
+ if (message.includes("vscode-jsonrpc")) {
20532
+ throw new Error(
20533
+ `Failed to load @github/copilot-sdk due to a known ESM compatibility issue with vscode-jsonrpc (https://github.com/github/copilot-sdk/issues/710).
20534
+
20535
+ Workarounds:
20536
+ - Use the copilot-cli target instead (recommended): set target type to "copilot-cli" in your eval YAML
20537
+ - If running under Node.js 24+: set NODE_OPTIONS="--experimental-specifier-resolution=node"
20538
+ - Wait for vscode-jsonrpc@9.0.0 stable to be released upstream`
20539
+ );
20540
+ }
20386
20541
  throw new Error(
20387
20542
  `Failed to load @github/copilot-sdk. Please install it:
20388
20543
  npm install @github/copilot-sdk
20389
20544
 
20390
- Original error: ${error instanceof Error ? error.message : String(error)}`
20545
+ Original error: ${message}`
20391
20546
  );
20392
20547
  }
20393
20548
  }
@@ -20781,7 +20936,7 @@ var PiAgentSdkProvider = class {
20781
20936
  const { Agent, getModel, getEnvApiKey } = await loadPiModules();
20782
20937
  const startTimeIso = (/* @__PURE__ */ new Date()).toISOString();
20783
20938
  const startMs = Date.now();
20784
- const providerName = this.config.provider ?? "anthropic";
20939
+ const providerName = this.config.subprovider ?? "anthropic";
20785
20940
  const modelId = this.config.model ?? "claude-sonnet-4-20250514";
20786
20941
  const model = getModel(providerName, modelId);
20787
20942
  const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
@@ -20893,7 +21048,7 @@ var PiAgentSdkProvider = class {
20893
21048
  messages: agentMessages,
20894
21049
  systemPrompt,
20895
21050
  model: this.config.model,
20896
- provider: this.config.provider
21051
+ subprovider: this.config.subprovider
20897
21052
  },
20898
21053
  output,
20899
21054
  tokenUsage,
@@ -21117,8 +21272,8 @@ var PiCodingAgentProvider = class {
21117
21272
  }
21118
21273
  buildPiArgs(prompt, inputFiles, _captureFileChanges) {
21119
21274
  const args = [];
21120
- if (this.config.provider) {
21121
- args.push("--provider", this.config.provider);
21275
+ if (this.config.subprovider) {
21276
+ args.push("--provider", this.config.subprovider);
21122
21277
  }
21123
21278
  if (this.config.model) {
21124
21279
  args.push("--model", this.config.model);
@@ -21176,7 +21331,7 @@ ${prompt}` : prompt;
21176
21331
  buildEnv() {
21177
21332
  const env = { ...process.env };
21178
21333
  if (this.config.apiKey) {
21179
- const provider = this.config.provider?.toLowerCase() ?? "google";
21334
+ const provider = this.config.subprovider?.toLowerCase() ?? "google";
21180
21335
  switch (provider) {
21181
21336
  case "google":
21182
21337
  case "gemini":
@@ -21592,6 +21747,13 @@ function extractToolCalls4(content) {
21592
21747
  id: typeof p.id === "string" ? p.id : void 0
21593
21748
  });
21594
21749
  }
21750
+ if (p.type === "toolCall" && typeof p.name === "string") {
21751
+ toolCalls.push({
21752
+ tool: p.name,
21753
+ input: p.arguments,
21754
+ id: typeof p.id === "string" ? p.id : void 0
21755
+ });
21756
+ }
21595
21757
  if (p.type === "tool_result" && typeof p.tool_use_id === "string") {
21596
21758
  const existing = toolCalls.find((tc) => tc.id === p.tool_use_id);
21597
21759
  if (existing) {
@@ -23903,7 +24065,8 @@ var freeformEvaluationSchema = external_exports2.object({
23903
24065
  passed: external_exports2.boolean().describe("Whether this aspect was satisfied"),
23904
24066
  evidence: external_exports2.string().describe("Concise evidence (1-2 sentences)").optional()
23905
24067
  })
23906
- ).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
24068
+ ).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional(),
24069
+ details: external_exports2.record(external_exports2.unknown()).describe("Optional structured metadata for domain-specific metrics").optional()
23907
24070
  });
23908
24071
  var rubricCheckResultSchema = external_exports2.object({
23909
24072
  id: external_exports2.string().describe("The ID of the rubric item being checked"),
@@ -23965,7 +24128,7 @@ var LlmGraderEvaluator = class {
23965
24128
  async evaluateFreeform(context2, graderProvider) {
23966
24129
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
23967
24130
  const variables = {
23968
- [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context2.evalCase.input_segments, null, 2),
24131
+ [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context2.evalCase.input, null, 2),
23969
24132
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(
23970
24133
  context2.evalCase.expected_output,
23971
24134
  null,
@@ -24008,6 +24171,7 @@ ${context2.fileChanges}`;
24008
24171
  expectedAspectCount: Math.max(assertions.length, 1),
24009
24172
  evaluatorRawRequest,
24010
24173
  graderTarget: graderProvider.targetName,
24174
+ details: data.details,
24011
24175
  tokenUsage
24012
24176
  };
24013
24177
  } catch (e) {
@@ -24427,7 +24591,7 @@ ${outputSchema2}`;
24427
24591
  expectedAspectCount: Math.max(assertions.length, 1),
24428
24592
  evaluatorRawRequest,
24429
24593
  graderTarget,
24430
- details
24594
+ details: data.details && Object.keys(data.details).length > 0 ? { ...details, ...data.details } : details
24431
24595
  };
24432
24596
  } catch {
24433
24597
  return {
@@ -24574,7 +24738,8 @@ function buildOutputSchema() {
24574
24738
  ' "passed": <boolean>,',
24575
24739
  ' "evidence": "<concise evidence, 1-2 sentences, optional>"',
24576
24740
  " }",
24577
- " ]",
24741
+ " ],",
24742
+ ' "details": {<optional object with domain-specific structured metrics>}',
24578
24743
  "}"
24579
24744
  ].join("\n");
24580
24745
  }
@@ -25778,12 +25943,31 @@ var COPILOT_MATCHER = {
25778
25943
  readToolPrefixes: ["Viewing "],
25779
25944
  readInputFields: ["file_path", "path"]
25780
25945
  };
25946
+ var PI_CODING_AGENT_MATCHER = {
25947
+ skillTools: [],
25948
+ skillInputField: "skill",
25949
+ readTools: ["read"],
25950
+ readInputField: "path",
25951
+ readInputFields: ["path", "file_path", "filePath"]
25952
+ };
25953
+ var CODEX_MATCHER = {
25954
+ skillTools: [],
25955
+ skillInputField: "skill",
25956
+ readTools: ["command_execution"],
25957
+ readInputField: "command",
25958
+ skillToolPrefixes: ["mcp:"],
25959
+ readToolPrefixes: ["mcp:"],
25960
+ readInputFields: ["command", "path", "file_path", "filePath"]
25961
+ };
25781
25962
  var PROVIDER_TOOL_SEMANTICS = {
25782
25963
  claude: CLAUDE_MATCHER,
25783
25964
  "claude-cli": CLAUDE_MATCHER,
25784
25965
  "claude-sdk": CLAUDE_MATCHER,
25785
- "pi-coding-agent": CLAUDE_MATCHER,
25786
- "pi-agent-sdk": CLAUDE_MATCHER,
25966
+ codex: CODEX_MATCHER,
25967
+ "pi-coding-agent": PI_CODING_AGENT_MATCHER,
25968
+ // pi-agent-sdk has no tools, so skill detection is a no-op. Kept for completeness.
25969
+ // TODO: consider removing pi-agent-sdk provider entirely.
25970
+ "pi-agent-sdk": PI_CODING_AGENT_MATCHER,
25787
25971
  "copilot-cli": COPILOT_MATCHER,
25788
25972
  "copilot-sdk": COPILOT_MATCHER,
25789
25973
  vscode: COPILOT_MATCHER,
@@ -25807,33 +25991,37 @@ var SkillTriggerEvaluator = class {
25807
25991
  const shouldTrigger = this.config.should_trigger !== false;
25808
25992
  const providerKind = context2.provider?.kind;
25809
25993
  const matcher = this.resolveMatcher(providerKind);
25810
- const firstTool = (context2.output ?? []).flatMap((msg) => msg.toolCalls ?? [])[0];
25994
+ const allToolCalls = (context2.output ?? []).flatMap((msg) => msg.toolCalls ?? []);
25811
25995
  let triggered = false;
25812
25996
  let evidence = "";
25813
- if (firstTool) {
25814
- const input = firstTool.input ?? {};
25815
- if (matcher.skillTools.includes(firstTool.tool)) {
25997
+ for (const toolCall of allToolCalls) {
25998
+ const input = toolCall.input ?? {};
25999
+ if (matcher.skillTools.includes(toolCall.tool)) {
25816
26000
  const skillArg = String(input[matcher.skillInputField] ?? "");
25817
26001
  if (skillArg.includes(skillName)) {
25818
26002
  triggered = true;
25819
26003
  evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
26004
+ break;
25820
26005
  }
25821
26006
  } else if (matcher.skillToolPrefixes?.some(
25822
- (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
26007
+ (prefix) => toolCall.tool.startsWith(prefix) && toolCall.tool.includes(skillName)
25823
26008
  )) {
25824
26009
  triggered = true;
25825
- evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
25826
- } else if (matcher.readTools.includes(firstTool.tool)) {
26010
+ evidence = `Skill tool invoked via tool name "${toolCall.tool}"`;
26011
+ break;
26012
+ } else if (matcher.readTools.includes(toolCall.tool)) {
25827
26013
  const filePath = this.readPathFromInput(input, matcher);
25828
26014
  if (filePath.includes(skillName)) {
25829
26015
  triggered = true;
25830
26016
  evidence = `Read tool loaded skill file: ${filePath}`;
26017
+ break;
25831
26018
  }
25832
26019
  } else if (matcher.readToolPrefixes?.some(
25833
- (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
26020
+ (prefix) => toolCall.tool.startsWith(prefix) && toolCall.tool.includes(skillName)
25834
26021
  )) {
25835
26022
  triggered = true;
25836
- evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
26023
+ evidence = `Read tool loaded skill file via tool name "${toolCall.tool}"`;
26024
+ break;
25837
26025
  }
25838
26026
  }
25839
26027
  const pass = triggered === shouldTrigger;
@@ -25855,7 +26043,7 @@ var SkillTriggerEvaluator = class {
25855
26043
  verdict: "fail",
25856
26044
  assertions: [
25857
26045
  {
25858
- text: shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
26046
+ text: shouldTrigger ? allToolCalls.length > 0 ? `Skill "${skillName}" not found in ${allToolCalls.length} tool call(s)` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
25859
26047
  passed: false
25860
26048
  }
25861
26049
  ],
@@ -25901,7 +26089,7 @@ function assembleLlmGraderPrompt(input) {
25901
26089
  function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
25902
26090
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
25903
26091
  const variables = {
25904
- [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
26092
+ [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2),
25905
26093
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
25906
26094
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
25907
26095
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
@@ -28115,6 +28303,18 @@ var QUALITY_PASS_THRESHOLD = 0.8;
28115
28303
  function classifyQualityStatus(score) {
28116
28304
  return score >= QUALITY_PASS_THRESHOLD ? "ok" : "quality_failure";
28117
28305
  }
28306
+ function buildSkippedEvaluatorError(scores) {
28307
+ const skippedScores = scores?.filter((score) => score.verdict === "skip") ?? [];
28308
+ if (skippedScores.length === 0) {
28309
+ return void 0;
28310
+ }
28311
+ const messages = skippedScores.map((score) => {
28312
+ const label = score.name || score.type;
28313
+ const assertionMessage = score.assertions.find((assertion) => !assertion.passed)?.text ?? "Evaluator skipped";
28314
+ return `${label}: ${assertionMessage}`;
28315
+ });
28316
+ return messages.length === 1 ? messages[0] : `Evaluators skipped: ${messages.join(" | ")}`;
28317
+ }
28118
28318
  function usesFileReferencePrompt(provider) {
28119
28319
  return isAgentProvider(provider) || provider.kind === "cli";
28120
28320
  }
@@ -29379,7 +29579,8 @@ async function runEvalCase(options) {
29379
29579
  durationMs: totalDurationMs,
29380
29580
  ...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
29381
29581
  };
29382
- const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
29582
+ const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
29583
+ const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score);
29383
29584
  const finalResult = providerError ? {
29384
29585
  ...result,
29385
29586
  evalRun,
@@ -29391,7 +29592,26 @@ async function runEvalCase(options) {
29391
29592
  beforeAllOutput,
29392
29593
  beforeEachOutput,
29393
29594
  afterEachOutput
29394
- } : { ...result, evalRun, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
29595
+ } : skippedEvaluatorError ? {
29596
+ ...result,
29597
+ score: 0,
29598
+ evalRun,
29599
+ error: skippedEvaluatorError,
29600
+ executionStatus,
29601
+ failureStage: "evaluator",
29602
+ failureReasonCode: "evaluator_error",
29603
+ executionError: { message: skippedEvaluatorError, stage: "evaluator" },
29604
+ beforeAllOutput,
29605
+ beforeEachOutput,
29606
+ afterEachOutput
29607
+ } : {
29608
+ ...result,
29609
+ evalRun,
29610
+ executionStatus,
29611
+ beforeAllOutput,
29612
+ beforeEachOutput,
29613
+ afterEachOutput
29614
+ };
29395
29615
  const isFailure = !!finalResult.error || finalResult.score < 0.5;
29396
29616
  if (workspacePath && !isSharedWorkspace) {
29397
29617
  if (forceCleanup) {
@@ -30128,11 +30348,6 @@ async function evaluate(config) {
30128
30348
  evalCases = (config.tests ?? []).map((test) => {
30129
30349
  const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
30130
30350
  const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
30131
- const inputSegments = input.map((m) => ({
30132
- type: "text",
30133
- value: typeof m.content === "string" ? m.content : JSON.stringify(m.content),
30134
- messageIndex: 0
30135
- }));
30136
30351
  const expectedOutputValue = test.expectedOutput ?? test.expected_output;
30137
30352
  const expectedOutput = expectedOutputValue ? [
30138
30353
  { role: "assistant", content: expectedOutputValue }
@@ -30161,7 +30376,6 @@ async function evaluate(config) {
30161
30376
  criteria: test.criteria ?? "",
30162
30377
  question: String(question),
30163
30378
  input,
30164
- input_segments: inputSegments,
30165
30379
  expected_output: expectedOutput,
30166
30380
  reference_answer: expectedOutputValue,
30167
30381
  file_paths: [],
@@ -31062,4 +31276,4 @@ export {
31062
31276
  OtelStreamingObserver,
31063
31277
  createAgentKernel
31064
31278
  };
31065
- //# sourceMappingURL=chunk-KGK5NUFG.js.map
31279
+ //# sourceMappingURL=chunk-EZGWZVVK.js.map