@agentv/core 0.2.8 → 0.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -202,6 +202,15 @@ interface Provider {
202
202
  readonly kind: ProviderKind;
203
203
  readonly targetName: string;
204
204
  invoke(request: ProviderRequest): Promise<ProviderResponse>;
205
+ /**
206
+ * Optional capability marker for provider-managed batching (single session handling multiple requests).
207
+ */
208
+ readonly supportsBatch?: boolean;
209
+ /**
210
+ * Optional batch invocation hook. When defined alongside supportsBatch=true,
211
+ * the orchestrator may send multiple requests in a single provider session.
212
+ */
213
+ invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
205
214
  }
206
215
  type EnvLookup = Readonly<Record<string, string | undefined>>;
207
216
  interface TargetDefinition {
@@ -251,30 +260,35 @@ type ResolvedTarget = {
251
260
  readonly name: string;
252
261
  readonly judgeTarget?: string;
253
262
  readonly workers?: number;
263
+ readonly providerBatching?: boolean;
254
264
  readonly config: AzureResolvedConfig;
255
265
  } | {
256
266
  readonly kind: "anthropic";
257
267
  readonly name: string;
258
268
  readonly judgeTarget?: string;
259
269
  readonly workers?: number;
270
+ readonly providerBatching?: boolean;
260
271
  readonly config: AnthropicResolvedConfig;
261
272
  } | {
262
273
  readonly kind: "gemini";
263
274
  readonly name: string;
264
275
  readonly judgeTarget?: string;
265
276
  readonly workers?: number;
277
+ readonly providerBatching?: boolean;
266
278
  readonly config: GeminiResolvedConfig;
267
279
  } | {
268
280
  readonly kind: "mock";
269
281
  readonly name: string;
270
282
  readonly judgeTarget?: string;
271
283
  readonly workers?: number;
284
+ readonly providerBatching?: boolean;
272
285
  readonly config: MockResolvedConfig;
273
286
  } | {
274
287
  readonly kind: "vscode" | "vscode-insiders";
275
288
  readonly name: string;
276
289
  readonly judgeTarget?: string;
277
290
  readonly workers?: number;
291
+ readonly providerBatching?: boolean;
278
292
  readonly config: VSCodeResolvedConfig;
279
293
  };
280
294
  declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup): ResolvedTarget;
package/dist/index.d.ts CHANGED
@@ -202,6 +202,15 @@ interface Provider {
202
202
  readonly kind: ProviderKind;
203
203
  readonly targetName: string;
204
204
  invoke(request: ProviderRequest): Promise<ProviderResponse>;
205
+ /**
206
+ * Optional capability marker for provider-managed batching (single session handling multiple requests).
207
+ */
208
+ readonly supportsBatch?: boolean;
209
+ /**
210
+ * Optional batch invocation hook. When defined alongside supportsBatch=true,
211
+ * the orchestrator may send multiple requests in a single provider session.
212
+ */
213
+ invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
205
214
  }
206
215
  type EnvLookup = Readonly<Record<string, string | undefined>>;
207
216
  interface TargetDefinition {
@@ -251,30 +260,35 @@ type ResolvedTarget = {
251
260
  readonly name: string;
252
261
  readonly judgeTarget?: string;
253
262
  readonly workers?: number;
263
+ readonly providerBatching?: boolean;
254
264
  readonly config: AzureResolvedConfig;
255
265
  } | {
256
266
  readonly kind: "anthropic";
257
267
  readonly name: string;
258
268
  readonly judgeTarget?: string;
259
269
  readonly workers?: number;
270
+ readonly providerBatching?: boolean;
260
271
  readonly config: AnthropicResolvedConfig;
261
272
  } | {
262
273
  readonly kind: "gemini";
263
274
  readonly name: string;
264
275
  readonly judgeTarget?: string;
265
276
  readonly workers?: number;
277
+ readonly providerBatching?: boolean;
266
278
  readonly config: GeminiResolvedConfig;
267
279
  } | {
268
280
  readonly kind: "mock";
269
281
  readonly name: string;
270
282
  readonly judgeTarget?: string;
271
283
  readonly workers?: number;
284
+ readonly providerBatching?: boolean;
272
285
  readonly config: MockResolvedConfig;
273
286
  } | {
274
287
  readonly kind: "vscode" | "vscode-insiders";
275
288
  readonly name: string;
276
289
  readonly judgeTarget?: string;
277
290
  readonly workers?: number;
291
+ readonly providerBatching?: boolean;
278
292
  readonly config: VSCodeResolvedConfig;
279
293
  };
280
294
  declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup): ResolvedTarget;
package/dist/index.js CHANGED
@@ -5,7 +5,7 @@ import {
5
5
  fileExists,
6
6
  findGitRoot,
7
7
  resolveFileReference
8
- } from "./chunk-XXNQA4EW.js";
8
+ } from "./chunk-P4GOYWYH.js";
9
9
 
10
10
  // src/evaluation/types.ts
11
11
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -735,6 +735,9 @@ function normalizeAzureApiVersion(value) {
735
735
  function resolveTargetDefinition(definition, env = process.env) {
736
736
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
737
737
  const provider = parsed.provider.toLowerCase();
738
+ const providerBatching = resolveOptionalBoolean(
739
+ parsed.settings?.provider_batching ?? parsed.settings?.providerBatching
740
+ );
738
741
  switch (provider) {
739
742
  case "azure":
740
743
  case "azure-openai":
@@ -743,6 +746,7 @@ function resolveTargetDefinition(definition, env = process.env) {
743
746
  name: parsed.name,
744
747
  judgeTarget: parsed.judge_target,
745
748
  workers: parsed.workers,
749
+ providerBatching,
746
750
  config: resolveAzureConfig(parsed, env)
747
751
  };
748
752
  case "anthropic":
@@ -751,6 +755,7 @@ function resolveTargetDefinition(definition, env = process.env) {
751
755
  name: parsed.name,
752
756
  judgeTarget: parsed.judge_target,
753
757
  workers: parsed.workers,
758
+ providerBatching,
754
759
  config: resolveAnthropicConfig(parsed, env)
755
760
  };
756
761
  case "gemini":
@@ -761,6 +766,7 @@ function resolveTargetDefinition(definition, env = process.env) {
761
766
  name: parsed.name,
762
767
  judgeTarget: parsed.judge_target,
763
768
  workers: parsed.workers,
769
+ providerBatching,
764
770
  config: resolveGeminiConfig(parsed, env)
765
771
  };
766
772
  case "mock":
@@ -769,6 +775,7 @@ function resolveTargetDefinition(definition, env = process.env) {
769
775
  name: parsed.name,
770
776
  judgeTarget: parsed.judge_target,
771
777
  workers: parsed.workers,
778
+ providerBatching,
772
779
  config: resolveMockConfig(parsed)
773
780
  };
774
781
  case "vscode":
@@ -778,6 +785,7 @@ function resolveTargetDefinition(definition, env = process.env) {
778
785
  name: parsed.name,
779
786
  judgeTarget: parsed.judge_target,
780
787
  workers: parsed.workers,
788
+ providerBatching,
781
789
  config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
782
790
  };
783
791
  default:
@@ -964,11 +972,17 @@ function isLikelyEnvReference(value) {
964
972
  // src/evaluation/providers/vscode.ts
965
973
  import { readFile as readFile2 } from "node:fs/promises";
966
974
  import path2 from "node:path";
967
- import { dispatchAgentSession, getSubagentRoot, provisionSubagents } from "subagent";
975
+ import {
976
+ dispatchAgentSession,
977
+ dispatchBatchAgent,
978
+ getSubagentRoot,
979
+ provisionSubagents
980
+ } from "subagent";
968
981
  var VSCodeProvider = class {
969
982
  id;
970
983
  kind;
971
984
  targetName;
985
+ supportsBatch = true;
972
986
  config;
973
987
  constructor(targetName, config, kind) {
974
988
  this.id = `${kind}:${targetName}`;
@@ -1015,38 +1029,102 @@ var VSCodeProvider = class {
1015
1029
  }
1016
1030
  };
1017
1031
  }
1032
+ async invokeBatch(requests) {
1033
+ if (requests.length === 0) {
1034
+ return [];
1035
+ }
1036
+ const normalizedRequests = requests.map((req) => ({
1037
+ request: req,
1038
+ attachments: normalizeAttachments(req.attachments)
1039
+ }));
1040
+ const combinedAttachments = mergeAttachments(
1041
+ normalizedRequests.map(({ attachments }) => attachments)
1042
+ );
1043
+ const userQueries = normalizedRequests.map(
1044
+ ({ request, attachments }) => buildPromptDocument(request, attachments, request.guideline_patterns)
1045
+ );
1046
+ const session = await dispatchBatchAgent({
1047
+ userQueries,
1048
+ extraAttachments: combinedAttachments,
1049
+ wait: this.config.waitForResponse,
1050
+ dryRun: this.config.dryRun,
1051
+ vscodeCmd: this.config.command,
1052
+ subagentRoot: this.config.subagentRoot,
1053
+ workspaceTemplate: this.config.workspaceTemplate,
1054
+ silent: true
1055
+ });
1056
+ if (session.exitCode !== 0 || !session.responseFiles) {
1057
+ const failure = session.error ?? "VS Code subagent did not produce batch responses";
1058
+ throw new Error(failure);
1059
+ }
1060
+ if (this.config.dryRun) {
1061
+ return normalizedRequests.map(({ attachments }) => ({
1062
+ text: "",
1063
+ raw: {
1064
+ session,
1065
+ attachments,
1066
+ allAttachments: combinedAttachments
1067
+ }
1068
+ }));
1069
+ }
1070
+ if (session.responseFiles.length !== requests.length) {
1071
+ throw new Error(
1072
+ `VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
1073
+ );
1074
+ }
1075
+ const responses = [];
1076
+ for (const [index, responseFile] of session.responseFiles.entries()) {
1077
+ const responseText = await readFile2(responseFile, "utf8");
1078
+ responses.push({
1079
+ text: responseText,
1080
+ raw: {
1081
+ session,
1082
+ attachments: normalizedRequests[index]?.attachments,
1083
+ allAttachments: combinedAttachments,
1084
+ responseFile
1085
+ }
1086
+ });
1087
+ }
1088
+ return responses;
1089
+ }
1018
1090
  };
1019
1091
  function buildPromptDocument(request, attachments, guidelinePatterns) {
1020
1092
  const parts = [];
1021
1093
  const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
1022
- if (guidelineFiles.length > 0) {
1023
- parts.push("\n", buildMandatoryPrereadBlock(guidelineFiles));
1094
+ const attachmentFiles = collectAttachmentFiles(attachments);
1095
+ const nonGuidelineAttachments = attachmentFiles.filter(
1096
+ (file) => !guidelineFiles.includes(file)
1097
+ );
1098
+ const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineAttachments);
1099
+ if (prereadBlock.length > 0) {
1100
+ parts.push("\n", prereadBlock);
1024
1101
  }
1025
1102
  parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
1026
1103
  return parts.join("\n").trim();
1027
1104
  }
1028
- function buildMandatoryPrereadBlock(guidelineFiles) {
1029
- if (guidelineFiles.length === 0) {
1105
+ function buildMandatoryPrereadBlock(guidelineFiles, attachmentFiles) {
1106
+ if (guidelineFiles.length === 0 && attachmentFiles.length === 0) {
1030
1107
  return "";
1031
1108
  }
1032
- const fileList = [];
1033
- let counter = 0;
1034
- for (const absolutePath of guidelineFiles) {
1035
- counter += 1;
1109
+ const buildList = (files) => files.map((absolutePath) => {
1036
1110
  const fileName = path2.basename(absolutePath);
1037
1111
  const fileUri = pathToFileUri(absolutePath);
1038
- fileList.push(`* [${fileName}](${fileUri})`);
1039
- }
1040
- const filesText = fileList.join("\n");
1041
- const instruction = [
1042
- `Read all guideline files:
1043
- ${filesText}.
1044
- `,
1045
- `If any file is missing, fail with ERROR: missing-file <filename> and stop.
1046
- `,
1047
- `Then apply system_instructions on the user query below.`
1048
- ].join("");
1049
- return `${instruction}`;
1112
+ return `* [${fileName}](${fileUri})`;
1113
+ });
1114
+ const sections = [];
1115
+ if (guidelineFiles.length > 0) {
1116
+ sections.push(`Read all guideline files:
1117
+ ${buildList(guidelineFiles).join("\n")}.`);
1118
+ }
1119
+ if (attachmentFiles.length > 0) {
1120
+ sections.push(`Read all attachment files:
1121
+ ${buildList(attachmentFiles).join("\n")}.`);
1122
+ }
1123
+ sections.push(
1124
+ "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
1125
+ "Then apply system_instructions on the user query below."
1126
+ );
1127
+ return sections.join("\n");
1050
1128
  }
1051
1129
  function collectGuidelineFiles(attachments, guidelinePatterns) {
1052
1130
  if (!attachments || attachments.length === 0) {
@@ -1064,6 +1142,19 @@ function collectGuidelineFiles(attachments, guidelinePatterns) {
1064
1142
  }
1065
1143
  return Array.from(unique.values());
1066
1144
  }
1145
+ function collectAttachmentFiles(attachments) {
1146
+ if (!attachments || attachments.length === 0) {
1147
+ return [];
1148
+ }
1149
+ const unique = /* @__PURE__ */ new Map();
1150
+ for (const attachment of attachments) {
1151
+ const absolutePath = path2.resolve(attachment);
1152
+ if (!unique.has(absolutePath)) {
1153
+ unique.set(absolutePath, absolutePath);
1154
+ }
1155
+ }
1156
+ return Array.from(unique.values());
1157
+ }
1067
1158
  function pathToFileUri(filePath) {
1068
1159
  const absolutePath = path2.isAbsolute(filePath) ? filePath : path2.resolve(filePath);
1069
1160
  const normalizedPath = absolutePath.replace(/\\/g, "/");
@@ -1082,6 +1173,16 @@ function normalizeAttachments(attachments) {
1082
1173
  }
1083
1174
  return Array.from(deduped);
1084
1175
  }
1176
+ function mergeAttachments(all) {
1177
+ const deduped = /* @__PURE__ */ new Set();
1178
+ for (const list of all) {
1179
+ if (!list) continue;
1180
+ for (const attachment of list) {
1181
+ deduped.add(path2.resolve(attachment));
1182
+ }
1183
+ }
1184
+ return deduped.size > 0 ? Array.from(deduped) : void 0;
1185
+ }
1085
1186
  async function ensureVSCodeSubagents(options) {
1086
1187
  const { kind, count, verbose = false } = options;
1087
1188
  const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
@@ -1811,6 +1912,12 @@ async function runEvaluation(options) {
1811
1912
  };
1812
1913
  const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
1813
1914
  const primaryProvider = getOrCreateProvider(target);
1915
+ const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
1916
+ if (target.providerBatching && !providerSupportsBatch && verbose) {
1917
+ console.warn(
1918
+ `Provider batching requested for target '${target.name}', but provider does not advertise batch support. Using per-case dispatch.`
1919
+ );
1920
+ }
1814
1921
  if (onProgress && filteredEvalCases.length > 0) {
1815
1922
  for (let i = 0; i < filteredEvalCases.length; i++) {
1816
1923
  await onProgress({
@@ -1820,6 +1927,27 @@ async function runEvaluation(options) {
1820
1927
  });
1821
1928
  }
1822
1929
  }
1930
+ if (providerSupportsBatch) {
1931
+ try {
1932
+ return await runBatchEvaluation({
1933
+ evalCases: filteredEvalCases,
1934
+ provider: primaryProvider,
1935
+ target,
1936
+ graderRegistry,
1937
+ promptDumpDir,
1938
+ nowFn: now ?? (() => /* @__PURE__ */ new Date()),
1939
+ onProgress,
1940
+ onResult,
1941
+ verbose,
1942
+ resolveJudgeProvider
1943
+ });
1944
+ } catch (error) {
1945
+ if (verbose) {
1946
+ const message = error instanceof Error ? error.message : String(error);
1947
+ console.warn(`Provider batch execution failed, falling back to per-case dispatch: ${message}`);
1948
+ }
1949
+ }
1950
+ }
1823
1951
  const workers = options.maxConcurrency ?? target.workers ?? 1;
1824
1952
  const limit = pLimit(workers);
1825
1953
  let nextWorkerId = 1;
@@ -1903,6 +2031,137 @@ async function runEvaluation(options) {
1903
2031
  }
1904
2032
  return results;
1905
2033
  }
2034
+ async function runBatchEvaluation(options) {
2035
+ const {
2036
+ evalCases,
2037
+ provider,
2038
+ target,
2039
+ graderRegistry,
2040
+ promptDumpDir,
2041
+ nowFn,
2042
+ onProgress,
2043
+ onResult,
2044
+ resolveJudgeProvider
2045
+ } = options;
2046
+ const promptInputsList = [];
2047
+ for (const evalCase of evalCases) {
2048
+ const promptInputs = await buildPromptInputs(evalCase);
2049
+ if (promptDumpDir) {
2050
+ await dumpPrompt(promptDumpDir, evalCase, promptInputs);
2051
+ }
2052
+ promptInputsList.push(promptInputs);
2053
+ }
2054
+ const batchRequests = evalCases.map((evalCase, index) => {
2055
+ const promptInputs = promptInputsList[index];
2056
+ return {
2057
+ prompt: promptInputs.request,
2058
+ guidelines: promptInputs.guidelines,
2059
+ guideline_patterns: evalCase.guideline_patterns,
2060
+ attachments: evalCase.file_paths,
2061
+ evalCaseId: evalCase.id,
2062
+ metadata: {
2063
+ systemPrompt: promptInputs.systemMessage ?? ""
2064
+ }
2065
+ };
2066
+ });
2067
+ const batchResponse = await provider.invokeBatch?.(batchRequests);
2068
+ if (!Array.isArray(batchResponse)) {
2069
+ throw new Error("Provider batching failed: invokeBatch did not return an array");
2070
+ }
2071
+ if (batchResponse.length !== evalCases.length) {
2072
+ throw new Error(
2073
+ `Provider batching failed: expected ${evalCases.length} responses, received ${batchResponse.length}`
2074
+ );
2075
+ }
2076
+ if (onProgress) {
2077
+ const startedAt = Date.now();
2078
+ for (let i = 0; i < evalCases.length; i++) {
2079
+ await onProgress({
2080
+ workerId: 1,
2081
+ evalId: evalCases[i].id,
2082
+ status: "running",
2083
+ startedAt
2084
+ });
2085
+ }
2086
+ }
2087
+ const results = [];
2088
+ for (let i = 0; i < evalCases.length; i++) {
2089
+ const evalCase = evalCases[i];
2090
+ const promptInputs = promptInputsList[i];
2091
+ const providerResponse = batchResponse[i];
2092
+ const now = nowFn();
2093
+ const graderKind = evalCase.grader ?? "heuristic";
2094
+ const activeGrader = graderRegistry[graderKind] ?? graderRegistry.heuristic;
2095
+ if (!activeGrader) {
2096
+ throw new Error(`No grader registered for kind '${graderKind}'`);
2097
+ }
2098
+ let grade;
2099
+ try {
2100
+ grade = await activeGrader.grade({
2101
+ evalCase,
2102
+ candidate: providerResponse.text ?? "",
2103
+ target,
2104
+ provider,
2105
+ attempt: 0,
2106
+ promptInputs,
2107
+ now,
2108
+ judgeProvider: await resolveJudgeProvider(target)
2109
+ });
2110
+ } catch (error) {
2111
+ const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
2112
+ results.push(errorResult);
2113
+ if (onResult) {
2114
+ await onResult(errorResult);
2115
+ }
2116
+ if (onProgress) {
2117
+ await onProgress({
2118
+ workerId: 1,
2119
+ evalId: evalCase.id,
2120
+ status: "failed",
2121
+ completedAt: Date.now(),
2122
+ error: error instanceof Error ? error.message : String(error)
2123
+ });
2124
+ }
2125
+ continue;
2126
+ }
2127
+ const completedAt = nowFn();
2128
+ const rawRequest = {
2129
+ request: promptInputs.request,
2130
+ guidelines: promptInputs.guidelines,
2131
+ guideline_paths: evalCase.guideline_paths,
2132
+ system_message: promptInputs.systemMessage ?? ""
2133
+ };
2134
+ const result = {
2135
+ eval_id: evalCase.id,
2136
+ conversation_id: evalCase.conversation_id,
2137
+ score: grade.score,
2138
+ hits: grade.hits,
2139
+ misses: grade.misses,
2140
+ model_answer: providerResponse.text ?? "",
2141
+ expected_aspect_count: grade.expectedAspectCount,
2142
+ target: target.name,
2143
+ timestamp: completedAt.toISOString(),
2144
+ reasoning: grade.reasoning,
2145
+ raw_aspects: grade.rawAspects,
2146
+ raw_request: rawRequest,
2147
+ grader_raw_request: grade.graderRawRequest
2148
+ };
2149
+ results.push(result);
2150
+ if (onResult) {
2151
+ await onResult(result);
2152
+ }
2153
+ if (onProgress) {
2154
+ await onProgress({
2155
+ workerId: 1,
2156
+ evalId: evalCase.id,
2157
+ status: "completed",
2158
+ startedAt: 0,
2159
+ completedAt: Date.now()
2160
+ });
2161
+ }
2162
+ }
2163
+ return results;
2164
+ }
1906
2165
  async function runEvalCase(options) {
1907
2166
  const {
1908
2167
  evalCase,