@agentv/core 0.2.8 → 0.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-P4GOYWYH.js +140 -0
- package/dist/chunk-P4GOYWYH.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +274 -20
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +14 -0
- package/dist/index.d.ts +14 -0
- package/dist/index.js +281 -22
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/dist/index.cjs
CHANGED
|
@@ -900,6 +900,9 @@ function normalizeAzureApiVersion(value) {
|
|
|
900
900
|
function resolveTargetDefinition(definition, env = process.env) {
|
|
901
901
|
const parsed = BASE_TARGET_SCHEMA.parse(definition);
|
|
902
902
|
const provider = parsed.provider.toLowerCase();
|
|
903
|
+
const providerBatching = resolveOptionalBoolean(
|
|
904
|
+
parsed.settings?.provider_batching ?? parsed.settings?.providerBatching
|
|
905
|
+
);
|
|
903
906
|
switch (provider) {
|
|
904
907
|
case "azure":
|
|
905
908
|
case "azure-openai":
|
|
@@ -908,6 +911,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
908
911
|
name: parsed.name,
|
|
909
912
|
judgeTarget: parsed.judge_target,
|
|
910
913
|
workers: parsed.workers,
|
|
914
|
+
providerBatching,
|
|
911
915
|
config: resolveAzureConfig(parsed, env)
|
|
912
916
|
};
|
|
913
917
|
case "anthropic":
|
|
@@ -916,6 +920,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
916
920
|
name: parsed.name,
|
|
917
921
|
judgeTarget: parsed.judge_target,
|
|
918
922
|
workers: parsed.workers,
|
|
923
|
+
providerBatching,
|
|
919
924
|
config: resolveAnthropicConfig(parsed, env)
|
|
920
925
|
};
|
|
921
926
|
case "gemini":
|
|
@@ -926,6 +931,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
926
931
|
name: parsed.name,
|
|
927
932
|
judgeTarget: parsed.judge_target,
|
|
928
933
|
workers: parsed.workers,
|
|
934
|
+
providerBatching,
|
|
929
935
|
config: resolveGeminiConfig(parsed, env)
|
|
930
936
|
};
|
|
931
937
|
case "mock":
|
|
@@ -934,6 +940,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
934
940
|
name: parsed.name,
|
|
935
941
|
judgeTarget: parsed.judge_target,
|
|
936
942
|
workers: parsed.workers,
|
|
943
|
+
providerBatching,
|
|
937
944
|
config: resolveMockConfig(parsed)
|
|
938
945
|
};
|
|
939
946
|
case "vscode":
|
|
@@ -943,6 +950,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
943
950
|
name: parsed.name,
|
|
944
951
|
judgeTarget: parsed.judge_target,
|
|
945
952
|
workers: parsed.workers,
|
|
953
|
+
providerBatching,
|
|
946
954
|
config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
|
|
947
955
|
};
|
|
948
956
|
default:
|
|
@@ -1134,6 +1142,7 @@ var VSCodeProvider = class {
|
|
|
1134
1142
|
id;
|
|
1135
1143
|
kind;
|
|
1136
1144
|
targetName;
|
|
1145
|
+
supportsBatch = true;
|
|
1137
1146
|
config;
|
|
1138
1147
|
constructor(targetName, config, kind) {
|
|
1139
1148
|
this.id = `${kind}:${targetName}`;
|
|
@@ -1180,38 +1189,102 @@ var VSCodeProvider = class {
|
|
|
1180
1189
|
}
|
|
1181
1190
|
};
|
|
1182
1191
|
}
|
|
1192
|
+
async invokeBatch(requests) {
|
|
1193
|
+
if (requests.length === 0) {
|
|
1194
|
+
return [];
|
|
1195
|
+
}
|
|
1196
|
+
const normalizedRequests = requests.map((req) => ({
|
|
1197
|
+
request: req,
|
|
1198
|
+
attachments: normalizeAttachments(req.attachments)
|
|
1199
|
+
}));
|
|
1200
|
+
const combinedAttachments = mergeAttachments(
|
|
1201
|
+
normalizedRequests.map(({ attachments }) => attachments)
|
|
1202
|
+
);
|
|
1203
|
+
const userQueries = normalizedRequests.map(
|
|
1204
|
+
({ request, attachments }) => buildPromptDocument(request, attachments, request.guideline_patterns)
|
|
1205
|
+
);
|
|
1206
|
+
const session = await (0, import_subagent.dispatchBatchAgent)({
|
|
1207
|
+
userQueries,
|
|
1208
|
+
extraAttachments: combinedAttachments,
|
|
1209
|
+
wait: this.config.waitForResponse,
|
|
1210
|
+
dryRun: this.config.dryRun,
|
|
1211
|
+
vscodeCmd: this.config.command,
|
|
1212
|
+
subagentRoot: this.config.subagentRoot,
|
|
1213
|
+
workspaceTemplate: this.config.workspaceTemplate,
|
|
1214
|
+
silent: true
|
|
1215
|
+
});
|
|
1216
|
+
if (session.exitCode !== 0 || !session.responseFiles) {
|
|
1217
|
+
const failure = session.error ?? "VS Code subagent did not produce batch responses";
|
|
1218
|
+
throw new Error(failure);
|
|
1219
|
+
}
|
|
1220
|
+
if (this.config.dryRun) {
|
|
1221
|
+
return normalizedRequests.map(({ attachments }) => ({
|
|
1222
|
+
text: "",
|
|
1223
|
+
raw: {
|
|
1224
|
+
session,
|
|
1225
|
+
attachments,
|
|
1226
|
+
allAttachments: combinedAttachments
|
|
1227
|
+
}
|
|
1228
|
+
}));
|
|
1229
|
+
}
|
|
1230
|
+
if (session.responseFiles.length !== requests.length) {
|
|
1231
|
+
throw new Error(
|
|
1232
|
+
`VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
|
|
1233
|
+
);
|
|
1234
|
+
}
|
|
1235
|
+
const responses = [];
|
|
1236
|
+
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
1237
|
+
const responseText = await (0, import_promises3.readFile)(responseFile, "utf8");
|
|
1238
|
+
responses.push({
|
|
1239
|
+
text: responseText,
|
|
1240
|
+
raw: {
|
|
1241
|
+
session,
|
|
1242
|
+
attachments: normalizedRequests[index]?.attachments,
|
|
1243
|
+
allAttachments: combinedAttachments,
|
|
1244
|
+
responseFile
|
|
1245
|
+
}
|
|
1246
|
+
});
|
|
1247
|
+
}
|
|
1248
|
+
return responses;
|
|
1249
|
+
}
|
|
1183
1250
|
};
|
|
1184
1251
|
function buildPromptDocument(request, attachments, guidelinePatterns) {
|
|
1185
1252
|
const parts = [];
|
|
1186
1253
|
const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
|
|
1187
|
-
|
|
1188
|
-
|
|
1254
|
+
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
1255
|
+
const nonGuidelineAttachments = attachmentFiles.filter(
|
|
1256
|
+
(file) => !guidelineFiles.includes(file)
|
|
1257
|
+
);
|
|
1258
|
+
const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineAttachments);
|
|
1259
|
+
if (prereadBlock.length > 0) {
|
|
1260
|
+
parts.push("\n", prereadBlock);
|
|
1189
1261
|
}
|
|
1190
1262
|
parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
|
|
1191
1263
|
return parts.join("\n").trim();
|
|
1192
1264
|
}
|
|
1193
|
-
function buildMandatoryPrereadBlock(guidelineFiles) {
|
|
1194
|
-
if (guidelineFiles.length === 0) {
|
|
1265
|
+
function buildMandatoryPrereadBlock(guidelineFiles, attachmentFiles) {
|
|
1266
|
+
if (guidelineFiles.length === 0 && attachmentFiles.length === 0) {
|
|
1195
1267
|
return "";
|
|
1196
1268
|
}
|
|
1197
|
-
const
|
|
1198
|
-
let counter = 0;
|
|
1199
|
-
for (const absolutePath of guidelineFiles) {
|
|
1200
|
-
counter += 1;
|
|
1269
|
+
const buildList = (files) => files.map((absolutePath) => {
|
|
1201
1270
|
const fileName = import_node_path3.default.basename(absolutePath);
|
|
1202
1271
|
const fileUri = pathToFileUri(absolutePath);
|
|
1203
|
-
|
|
1204
|
-
}
|
|
1205
|
-
const
|
|
1206
|
-
|
|
1207
|
-
`Read all guideline files:
|
|
1208
|
-
${
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1272
|
+
return `* [${fileName}](${fileUri})`;
|
|
1273
|
+
});
|
|
1274
|
+
const sections = [];
|
|
1275
|
+
if (guidelineFiles.length > 0) {
|
|
1276
|
+
sections.push(`Read all guideline files:
|
|
1277
|
+
${buildList(guidelineFiles).join("\n")}.`);
|
|
1278
|
+
}
|
|
1279
|
+
if (attachmentFiles.length > 0) {
|
|
1280
|
+
sections.push(`Read all attachment files:
|
|
1281
|
+
${buildList(attachmentFiles).join("\n")}.`);
|
|
1282
|
+
}
|
|
1283
|
+
sections.push(
|
|
1284
|
+
"If any file is missing, fail with ERROR: missing-file <filename> and stop.",
|
|
1285
|
+
"Then apply system_instructions on the user query below."
|
|
1286
|
+
);
|
|
1287
|
+
return sections.join("\n");
|
|
1215
1288
|
}
|
|
1216
1289
|
function collectGuidelineFiles(attachments, guidelinePatterns) {
|
|
1217
1290
|
if (!attachments || attachments.length === 0) {
|
|
@@ -1229,6 +1302,19 @@ function collectGuidelineFiles(attachments, guidelinePatterns) {
|
|
|
1229
1302
|
}
|
|
1230
1303
|
return Array.from(unique.values());
|
|
1231
1304
|
}
|
|
1305
|
+
function collectAttachmentFiles(attachments) {
|
|
1306
|
+
if (!attachments || attachments.length === 0) {
|
|
1307
|
+
return [];
|
|
1308
|
+
}
|
|
1309
|
+
const unique = /* @__PURE__ */ new Map();
|
|
1310
|
+
for (const attachment of attachments) {
|
|
1311
|
+
const absolutePath = import_node_path3.default.resolve(attachment);
|
|
1312
|
+
if (!unique.has(absolutePath)) {
|
|
1313
|
+
unique.set(absolutePath, absolutePath);
|
|
1314
|
+
}
|
|
1315
|
+
}
|
|
1316
|
+
return Array.from(unique.values());
|
|
1317
|
+
}
|
|
1232
1318
|
function pathToFileUri(filePath) {
|
|
1233
1319
|
const absolutePath = import_node_path3.default.isAbsolute(filePath) ? filePath : import_node_path3.default.resolve(filePath);
|
|
1234
1320
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
@@ -1247,6 +1333,16 @@ function normalizeAttachments(attachments) {
|
|
|
1247
1333
|
}
|
|
1248
1334
|
return Array.from(deduped);
|
|
1249
1335
|
}
|
|
1336
|
+
function mergeAttachments(all) {
|
|
1337
|
+
const deduped = /* @__PURE__ */ new Set();
|
|
1338
|
+
for (const list of all) {
|
|
1339
|
+
if (!list) continue;
|
|
1340
|
+
for (const attachment of list) {
|
|
1341
|
+
deduped.add(import_node_path3.default.resolve(attachment));
|
|
1342
|
+
}
|
|
1343
|
+
}
|
|
1344
|
+
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
1345
|
+
}
|
|
1250
1346
|
async function ensureVSCodeSubagents(options) {
|
|
1251
1347
|
const { kind, count, verbose = false } = options;
|
|
1252
1348
|
const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
|
|
@@ -1981,6 +2077,12 @@ async function runEvaluation(options) {
|
|
|
1981
2077
|
};
|
|
1982
2078
|
const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
|
|
1983
2079
|
const primaryProvider = getOrCreateProvider(target);
|
|
2080
|
+
const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
|
|
2081
|
+
if (target.providerBatching && !providerSupportsBatch && verbose) {
|
|
2082
|
+
console.warn(
|
|
2083
|
+
`Provider batching requested for target '${target.name}', but provider does not advertise batch support. Using per-case dispatch.`
|
|
2084
|
+
);
|
|
2085
|
+
}
|
|
1984
2086
|
if (onProgress && filteredEvalCases.length > 0) {
|
|
1985
2087
|
for (let i = 0; i < filteredEvalCases.length; i++) {
|
|
1986
2088
|
await onProgress({
|
|
@@ -1990,6 +2092,27 @@ async function runEvaluation(options) {
|
|
|
1990
2092
|
});
|
|
1991
2093
|
}
|
|
1992
2094
|
}
|
|
2095
|
+
if (providerSupportsBatch) {
|
|
2096
|
+
try {
|
|
2097
|
+
return await runBatchEvaluation({
|
|
2098
|
+
evalCases: filteredEvalCases,
|
|
2099
|
+
provider: primaryProvider,
|
|
2100
|
+
target,
|
|
2101
|
+
graderRegistry,
|
|
2102
|
+
promptDumpDir,
|
|
2103
|
+
nowFn: now ?? (() => /* @__PURE__ */ new Date()),
|
|
2104
|
+
onProgress,
|
|
2105
|
+
onResult,
|
|
2106
|
+
verbose,
|
|
2107
|
+
resolveJudgeProvider
|
|
2108
|
+
});
|
|
2109
|
+
} catch (error) {
|
|
2110
|
+
if (verbose) {
|
|
2111
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
2112
|
+
console.warn(`Provider batch execution failed, falling back to per-case dispatch: ${message}`);
|
|
2113
|
+
}
|
|
2114
|
+
}
|
|
2115
|
+
}
|
|
1993
2116
|
const workers = options.maxConcurrency ?? target.workers ?? 1;
|
|
1994
2117
|
const limit = pLimit(workers);
|
|
1995
2118
|
let nextWorkerId = 1;
|
|
@@ -2073,6 +2196,137 @@ async function runEvaluation(options) {
|
|
|
2073
2196
|
}
|
|
2074
2197
|
return results;
|
|
2075
2198
|
}
|
|
2199
|
+
async function runBatchEvaluation(options) {
|
|
2200
|
+
const {
|
|
2201
|
+
evalCases,
|
|
2202
|
+
provider,
|
|
2203
|
+
target,
|
|
2204
|
+
graderRegistry,
|
|
2205
|
+
promptDumpDir,
|
|
2206
|
+
nowFn,
|
|
2207
|
+
onProgress,
|
|
2208
|
+
onResult,
|
|
2209
|
+
resolveJudgeProvider
|
|
2210
|
+
} = options;
|
|
2211
|
+
const promptInputsList = [];
|
|
2212
|
+
for (const evalCase of evalCases) {
|
|
2213
|
+
const promptInputs = await buildPromptInputs(evalCase);
|
|
2214
|
+
if (promptDumpDir) {
|
|
2215
|
+
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
2216
|
+
}
|
|
2217
|
+
promptInputsList.push(promptInputs);
|
|
2218
|
+
}
|
|
2219
|
+
const batchRequests = evalCases.map((evalCase, index) => {
|
|
2220
|
+
const promptInputs = promptInputsList[index];
|
|
2221
|
+
return {
|
|
2222
|
+
prompt: promptInputs.request,
|
|
2223
|
+
guidelines: promptInputs.guidelines,
|
|
2224
|
+
guideline_patterns: evalCase.guideline_patterns,
|
|
2225
|
+
attachments: evalCase.file_paths,
|
|
2226
|
+
evalCaseId: evalCase.id,
|
|
2227
|
+
metadata: {
|
|
2228
|
+
systemPrompt: promptInputs.systemMessage ?? ""
|
|
2229
|
+
}
|
|
2230
|
+
};
|
|
2231
|
+
});
|
|
2232
|
+
const batchResponse = await provider.invokeBatch?.(batchRequests);
|
|
2233
|
+
if (!Array.isArray(batchResponse)) {
|
|
2234
|
+
throw new Error("Provider batching failed: invokeBatch did not return an array");
|
|
2235
|
+
}
|
|
2236
|
+
if (batchResponse.length !== evalCases.length) {
|
|
2237
|
+
throw new Error(
|
|
2238
|
+
`Provider batching failed: expected ${evalCases.length} responses, received ${batchResponse.length}`
|
|
2239
|
+
);
|
|
2240
|
+
}
|
|
2241
|
+
if (onProgress) {
|
|
2242
|
+
const startedAt = Date.now();
|
|
2243
|
+
for (let i = 0; i < evalCases.length; i++) {
|
|
2244
|
+
await onProgress({
|
|
2245
|
+
workerId: 1,
|
|
2246
|
+
evalId: evalCases[i].id,
|
|
2247
|
+
status: "running",
|
|
2248
|
+
startedAt
|
|
2249
|
+
});
|
|
2250
|
+
}
|
|
2251
|
+
}
|
|
2252
|
+
const results = [];
|
|
2253
|
+
for (let i = 0; i < evalCases.length; i++) {
|
|
2254
|
+
const evalCase = evalCases[i];
|
|
2255
|
+
const promptInputs = promptInputsList[i];
|
|
2256
|
+
const providerResponse = batchResponse[i];
|
|
2257
|
+
const now = nowFn();
|
|
2258
|
+
const graderKind = evalCase.grader ?? "heuristic";
|
|
2259
|
+
const activeGrader = graderRegistry[graderKind] ?? graderRegistry.heuristic;
|
|
2260
|
+
if (!activeGrader) {
|
|
2261
|
+
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
2262
|
+
}
|
|
2263
|
+
let grade;
|
|
2264
|
+
try {
|
|
2265
|
+
grade = await activeGrader.grade({
|
|
2266
|
+
evalCase,
|
|
2267
|
+
candidate: providerResponse.text ?? "",
|
|
2268
|
+
target,
|
|
2269
|
+
provider,
|
|
2270
|
+
attempt: 0,
|
|
2271
|
+
promptInputs,
|
|
2272
|
+
now,
|
|
2273
|
+
judgeProvider: await resolveJudgeProvider(target)
|
|
2274
|
+
});
|
|
2275
|
+
} catch (error) {
|
|
2276
|
+
const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
2277
|
+
results.push(errorResult);
|
|
2278
|
+
if (onResult) {
|
|
2279
|
+
await onResult(errorResult);
|
|
2280
|
+
}
|
|
2281
|
+
if (onProgress) {
|
|
2282
|
+
await onProgress({
|
|
2283
|
+
workerId: 1,
|
|
2284
|
+
evalId: evalCase.id,
|
|
2285
|
+
status: "failed",
|
|
2286
|
+
completedAt: Date.now(),
|
|
2287
|
+
error: error instanceof Error ? error.message : String(error)
|
|
2288
|
+
});
|
|
2289
|
+
}
|
|
2290
|
+
continue;
|
|
2291
|
+
}
|
|
2292
|
+
const completedAt = nowFn();
|
|
2293
|
+
const rawRequest = {
|
|
2294
|
+
request: promptInputs.request,
|
|
2295
|
+
guidelines: promptInputs.guidelines,
|
|
2296
|
+
guideline_paths: evalCase.guideline_paths,
|
|
2297
|
+
system_message: promptInputs.systemMessage ?? ""
|
|
2298
|
+
};
|
|
2299
|
+
const result = {
|
|
2300
|
+
eval_id: evalCase.id,
|
|
2301
|
+
conversation_id: evalCase.conversation_id,
|
|
2302
|
+
score: grade.score,
|
|
2303
|
+
hits: grade.hits,
|
|
2304
|
+
misses: grade.misses,
|
|
2305
|
+
model_answer: providerResponse.text ?? "",
|
|
2306
|
+
expected_aspect_count: grade.expectedAspectCount,
|
|
2307
|
+
target: target.name,
|
|
2308
|
+
timestamp: completedAt.toISOString(),
|
|
2309
|
+
reasoning: grade.reasoning,
|
|
2310
|
+
raw_aspects: grade.rawAspects,
|
|
2311
|
+
raw_request: rawRequest,
|
|
2312
|
+
grader_raw_request: grade.graderRawRequest
|
|
2313
|
+
};
|
|
2314
|
+
results.push(result);
|
|
2315
|
+
if (onResult) {
|
|
2316
|
+
await onResult(result);
|
|
2317
|
+
}
|
|
2318
|
+
if (onProgress) {
|
|
2319
|
+
await onProgress({
|
|
2320
|
+
workerId: 1,
|
|
2321
|
+
evalId: evalCase.id,
|
|
2322
|
+
status: "completed",
|
|
2323
|
+
startedAt: 0,
|
|
2324
|
+
completedAt: Date.now()
|
|
2325
|
+
});
|
|
2326
|
+
}
|
|
2327
|
+
}
|
|
2328
|
+
return results;
|
|
2329
|
+
}
|
|
2076
2330
|
async function runEvalCase(options) {
|
|
2077
2331
|
const {
|
|
2078
2332
|
evalCase,
|