@agentv/core 0.2.8 → 0.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-P4GOYWYH.js +140 -0
- package/dist/chunk-P4GOYWYH.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +274 -20
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +14 -0
- package/dist/index.d.ts +14 -0
- package/dist/index.js +281 -22
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/dist/index.d.cts
CHANGED
|
@@ -202,6 +202,15 @@ interface Provider {
|
|
|
202
202
|
readonly kind: ProviderKind;
|
|
203
203
|
readonly targetName: string;
|
|
204
204
|
invoke(request: ProviderRequest): Promise<ProviderResponse>;
|
|
205
|
+
/**
|
|
206
|
+
* Optional capability marker for provider-managed batching (single session handling multiple requests).
|
|
207
|
+
*/
|
|
208
|
+
readonly supportsBatch?: boolean;
|
|
209
|
+
/**
|
|
210
|
+
* Optional batch invocation hook. When defined alongside supportsBatch=true,
|
|
211
|
+
* the orchestrator may send multiple requests in a single provider session.
|
|
212
|
+
*/
|
|
213
|
+
invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
|
|
205
214
|
}
|
|
206
215
|
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
207
216
|
interface TargetDefinition {
|
|
@@ -251,30 +260,35 @@ type ResolvedTarget = {
|
|
|
251
260
|
readonly name: string;
|
|
252
261
|
readonly judgeTarget?: string;
|
|
253
262
|
readonly workers?: number;
|
|
263
|
+
readonly providerBatching?: boolean;
|
|
254
264
|
readonly config: AzureResolvedConfig;
|
|
255
265
|
} | {
|
|
256
266
|
readonly kind: "anthropic";
|
|
257
267
|
readonly name: string;
|
|
258
268
|
readonly judgeTarget?: string;
|
|
259
269
|
readonly workers?: number;
|
|
270
|
+
readonly providerBatching?: boolean;
|
|
260
271
|
readonly config: AnthropicResolvedConfig;
|
|
261
272
|
} | {
|
|
262
273
|
readonly kind: "gemini";
|
|
263
274
|
readonly name: string;
|
|
264
275
|
readonly judgeTarget?: string;
|
|
265
276
|
readonly workers?: number;
|
|
277
|
+
readonly providerBatching?: boolean;
|
|
266
278
|
readonly config: GeminiResolvedConfig;
|
|
267
279
|
} | {
|
|
268
280
|
readonly kind: "mock";
|
|
269
281
|
readonly name: string;
|
|
270
282
|
readonly judgeTarget?: string;
|
|
271
283
|
readonly workers?: number;
|
|
284
|
+
readonly providerBatching?: boolean;
|
|
272
285
|
readonly config: MockResolvedConfig;
|
|
273
286
|
} | {
|
|
274
287
|
readonly kind: "vscode" | "vscode-insiders";
|
|
275
288
|
readonly name: string;
|
|
276
289
|
readonly judgeTarget?: string;
|
|
277
290
|
readonly workers?: number;
|
|
291
|
+
readonly providerBatching?: boolean;
|
|
278
292
|
readonly config: VSCodeResolvedConfig;
|
|
279
293
|
};
|
|
280
294
|
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup): ResolvedTarget;
|
package/dist/index.d.ts
CHANGED
|
@@ -202,6 +202,15 @@ interface Provider {
|
|
|
202
202
|
readonly kind: ProviderKind;
|
|
203
203
|
readonly targetName: string;
|
|
204
204
|
invoke(request: ProviderRequest): Promise<ProviderResponse>;
|
|
205
|
+
/**
|
|
206
|
+
* Optional capability marker for provider-managed batching (single session handling multiple requests).
|
|
207
|
+
*/
|
|
208
|
+
readonly supportsBatch?: boolean;
|
|
209
|
+
/**
|
|
210
|
+
* Optional batch invocation hook. When defined alongside supportsBatch=true,
|
|
211
|
+
* the orchestrator may send multiple requests in a single provider session.
|
|
212
|
+
*/
|
|
213
|
+
invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
|
|
205
214
|
}
|
|
206
215
|
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
207
216
|
interface TargetDefinition {
|
|
@@ -251,30 +260,35 @@ type ResolvedTarget = {
|
|
|
251
260
|
readonly name: string;
|
|
252
261
|
readonly judgeTarget?: string;
|
|
253
262
|
readonly workers?: number;
|
|
263
|
+
readonly providerBatching?: boolean;
|
|
254
264
|
readonly config: AzureResolvedConfig;
|
|
255
265
|
} | {
|
|
256
266
|
readonly kind: "anthropic";
|
|
257
267
|
readonly name: string;
|
|
258
268
|
readonly judgeTarget?: string;
|
|
259
269
|
readonly workers?: number;
|
|
270
|
+
readonly providerBatching?: boolean;
|
|
260
271
|
readonly config: AnthropicResolvedConfig;
|
|
261
272
|
} | {
|
|
262
273
|
readonly kind: "gemini";
|
|
263
274
|
readonly name: string;
|
|
264
275
|
readonly judgeTarget?: string;
|
|
265
276
|
readonly workers?: number;
|
|
277
|
+
readonly providerBatching?: boolean;
|
|
266
278
|
readonly config: GeminiResolvedConfig;
|
|
267
279
|
} | {
|
|
268
280
|
readonly kind: "mock";
|
|
269
281
|
readonly name: string;
|
|
270
282
|
readonly judgeTarget?: string;
|
|
271
283
|
readonly workers?: number;
|
|
284
|
+
readonly providerBatching?: boolean;
|
|
272
285
|
readonly config: MockResolvedConfig;
|
|
273
286
|
} | {
|
|
274
287
|
readonly kind: "vscode" | "vscode-insiders";
|
|
275
288
|
readonly name: string;
|
|
276
289
|
readonly judgeTarget?: string;
|
|
277
290
|
readonly workers?: number;
|
|
291
|
+
readonly providerBatching?: boolean;
|
|
278
292
|
readonly config: VSCodeResolvedConfig;
|
|
279
293
|
};
|
|
280
294
|
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup): ResolvedTarget;
|
package/dist/index.js
CHANGED
|
@@ -5,7 +5,7 @@ import {
|
|
|
5
5
|
fileExists,
|
|
6
6
|
findGitRoot,
|
|
7
7
|
resolveFileReference
|
|
8
|
-
} from "./chunk-
|
|
8
|
+
} from "./chunk-P4GOYWYH.js";
|
|
9
9
|
|
|
10
10
|
// src/evaluation/types.ts
|
|
11
11
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -735,6 +735,9 @@ function normalizeAzureApiVersion(value) {
|
|
|
735
735
|
function resolveTargetDefinition(definition, env = process.env) {
|
|
736
736
|
const parsed = BASE_TARGET_SCHEMA.parse(definition);
|
|
737
737
|
const provider = parsed.provider.toLowerCase();
|
|
738
|
+
const providerBatching = resolveOptionalBoolean(
|
|
739
|
+
parsed.settings?.provider_batching ?? parsed.settings?.providerBatching
|
|
740
|
+
);
|
|
738
741
|
switch (provider) {
|
|
739
742
|
case "azure":
|
|
740
743
|
case "azure-openai":
|
|
@@ -743,6 +746,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
743
746
|
name: parsed.name,
|
|
744
747
|
judgeTarget: parsed.judge_target,
|
|
745
748
|
workers: parsed.workers,
|
|
749
|
+
providerBatching,
|
|
746
750
|
config: resolveAzureConfig(parsed, env)
|
|
747
751
|
};
|
|
748
752
|
case "anthropic":
|
|
@@ -751,6 +755,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
751
755
|
name: parsed.name,
|
|
752
756
|
judgeTarget: parsed.judge_target,
|
|
753
757
|
workers: parsed.workers,
|
|
758
|
+
providerBatching,
|
|
754
759
|
config: resolveAnthropicConfig(parsed, env)
|
|
755
760
|
};
|
|
756
761
|
case "gemini":
|
|
@@ -761,6 +766,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
761
766
|
name: parsed.name,
|
|
762
767
|
judgeTarget: parsed.judge_target,
|
|
763
768
|
workers: parsed.workers,
|
|
769
|
+
providerBatching,
|
|
764
770
|
config: resolveGeminiConfig(parsed, env)
|
|
765
771
|
};
|
|
766
772
|
case "mock":
|
|
@@ -769,6 +775,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
769
775
|
name: parsed.name,
|
|
770
776
|
judgeTarget: parsed.judge_target,
|
|
771
777
|
workers: parsed.workers,
|
|
778
|
+
providerBatching,
|
|
772
779
|
config: resolveMockConfig(parsed)
|
|
773
780
|
};
|
|
774
781
|
case "vscode":
|
|
@@ -778,6 +785,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
778
785
|
name: parsed.name,
|
|
779
786
|
judgeTarget: parsed.judge_target,
|
|
780
787
|
workers: parsed.workers,
|
|
788
|
+
providerBatching,
|
|
781
789
|
config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
|
|
782
790
|
};
|
|
783
791
|
default:
|
|
@@ -964,11 +972,17 @@ function isLikelyEnvReference(value) {
|
|
|
964
972
|
// src/evaluation/providers/vscode.ts
|
|
965
973
|
import { readFile as readFile2 } from "node:fs/promises";
|
|
966
974
|
import path2 from "node:path";
|
|
967
|
-
import {
|
|
975
|
+
import {
|
|
976
|
+
dispatchAgentSession,
|
|
977
|
+
dispatchBatchAgent,
|
|
978
|
+
getSubagentRoot,
|
|
979
|
+
provisionSubagents
|
|
980
|
+
} from "subagent";
|
|
968
981
|
var VSCodeProvider = class {
|
|
969
982
|
id;
|
|
970
983
|
kind;
|
|
971
984
|
targetName;
|
|
985
|
+
supportsBatch = true;
|
|
972
986
|
config;
|
|
973
987
|
constructor(targetName, config, kind) {
|
|
974
988
|
this.id = `${kind}:${targetName}`;
|
|
@@ -1015,38 +1029,102 @@ var VSCodeProvider = class {
|
|
|
1015
1029
|
}
|
|
1016
1030
|
};
|
|
1017
1031
|
}
|
|
1032
|
+
async invokeBatch(requests) {
|
|
1033
|
+
if (requests.length === 0) {
|
|
1034
|
+
return [];
|
|
1035
|
+
}
|
|
1036
|
+
const normalizedRequests = requests.map((req) => ({
|
|
1037
|
+
request: req,
|
|
1038
|
+
attachments: normalizeAttachments(req.attachments)
|
|
1039
|
+
}));
|
|
1040
|
+
const combinedAttachments = mergeAttachments(
|
|
1041
|
+
normalizedRequests.map(({ attachments }) => attachments)
|
|
1042
|
+
);
|
|
1043
|
+
const userQueries = normalizedRequests.map(
|
|
1044
|
+
({ request, attachments }) => buildPromptDocument(request, attachments, request.guideline_patterns)
|
|
1045
|
+
);
|
|
1046
|
+
const session = await dispatchBatchAgent({
|
|
1047
|
+
userQueries,
|
|
1048
|
+
extraAttachments: combinedAttachments,
|
|
1049
|
+
wait: this.config.waitForResponse,
|
|
1050
|
+
dryRun: this.config.dryRun,
|
|
1051
|
+
vscodeCmd: this.config.command,
|
|
1052
|
+
subagentRoot: this.config.subagentRoot,
|
|
1053
|
+
workspaceTemplate: this.config.workspaceTemplate,
|
|
1054
|
+
silent: true
|
|
1055
|
+
});
|
|
1056
|
+
if (session.exitCode !== 0 || !session.responseFiles) {
|
|
1057
|
+
const failure = session.error ?? "VS Code subagent did not produce batch responses";
|
|
1058
|
+
throw new Error(failure);
|
|
1059
|
+
}
|
|
1060
|
+
if (this.config.dryRun) {
|
|
1061
|
+
return normalizedRequests.map(({ attachments }) => ({
|
|
1062
|
+
text: "",
|
|
1063
|
+
raw: {
|
|
1064
|
+
session,
|
|
1065
|
+
attachments,
|
|
1066
|
+
allAttachments: combinedAttachments
|
|
1067
|
+
}
|
|
1068
|
+
}));
|
|
1069
|
+
}
|
|
1070
|
+
if (session.responseFiles.length !== requests.length) {
|
|
1071
|
+
throw new Error(
|
|
1072
|
+
`VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
|
|
1073
|
+
);
|
|
1074
|
+
}
|
|
1075
|
+
const responses = [];
|
|
1076
|
+
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
1077
|
+
const responseText = await readFile2(responseFile, "utf8");
|
|
1078
|
+
responses.push({
|
|
1079
|
+
text: responseText,
|
|
1080
|
+
raw: {
|
|
1081
|
+
session,
|
|
1082
|
+
attachments: normalizedRequests[index]?.attachments,
|
|
1083
|
+
allAttachments: combinedAttachments,
|
|
1084
|
+
responseFile
|
|
1085
|
+
}
|
|
1086
|
+
});
|
|
1087
|
+
}
|
|
1088
|
+
return responses;
|
|
1089
|
+
}
|
|
1018
1090
|
};
|
|
1019
1091
|
function buildPromptDocument(request, attachments, guidelinePatterns) {
|
|
1020
1092
|
const parts = [];
|
|
1021
1093
|
const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
|
|
1022
|
-
|
|
1023
|
-
|
|
1094
|
+
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
1095
|
+
const nonGuidelineAttachments = attachmentFiles.filter(
|
|
1096
|
+
(file) => !guidelineFiles.includes(file)
|
|
1097
|
+
);
|
|
1098
|
+
const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineAttachments);
|
|
1099
|
+
if (prereadBlock.length > 0) {
|
|
1100
|
+
parts.push("\n", prereadBlock);
|
|
1024
1101
|
}
|
|
1025
1102
|
parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
|
|
1026
1103
|
return parts.join("\n").trim();
|
|
1027
1104
|
}
|
|
1028
|
-
function buildMandatoryPrereadBlock(guidelineFiles) {
|
|
1029
|
-
if (guidelineFiles.length === 0) {
|
|
1105
|
+
function buildMandatoryPrereadBlock(guidelineFiles, attachmentFiles) {
|
|
1106
|
+
if (guidelineFiles.length === 0 && attachmentFiles.length === 0) {
|
|
1030
1107
|
return "";
|
|
1031
1108
|
}
|
|
1032
|
-
const
|
|
1033
|
-
let counter = 0;
|
|
1034
|
-
for (const absolutePath of guidelineFiles) {
|
|
1035
|
-
counter += 1;
|
|
1109
|
+
const buildList = (files) => files.map((absolutePath) => {
|
|
1036
1110
|
const fileName = path2.basename(absolutePath);
|
|
1037
1111
|
const fileUri = pathToFileUri(absolutePath);
|
|
1038
|
-
|
|
1039
|
-
}
|
|
1040
|
-
const
|
|
1041
|
-
|
|
1042
|
-
`Read all guideline files:
|
|
1043
|
-
${
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1112
|
+
return `* [${fileName}](${fileUri})`;
|
|
1113
|
+
});
|
|
1114
|
+
const sections = [];
|
|
1115
|
+
if (guidelineFiles.length > 0) {
|
|
1116
|
+
sections.push(`Read all guideline files:
|
|
1117
|
+
${buildList(guidelineFiles).join("\n")}.`);
|
|
1118
|
+
}
|
|
1119
|
+
if (attachmentFiles.length > 0) {
|
|
1120
|
+
sections.push(`Read all attachment files:
|
|
1121
|
+
${buildList(attachmentFiles).join("\n")}.`);
|
|
1122
|
+
}
|
|
1123
|
+
sections.push(
|
|
1124
|
+
"If any file is missing, fail with ERROR: missing-file <filename> and stop.",
|
|
1125
|
+
"Then apply system_instructions on the user query below."
|
|
1126
|
+
);
|
|
1127
|
+
return sections.join("\n");
|
|
1050
1128
|
}
|
|
1051
1129
|
function collectGuidelineFiles(attachments, guidelinePatterns) {
|
|
1052
1130
|
if (!attachments || attachments.length === 0) {
|
|
@@ -1064,6 +1142,19 @@ function collectGuidelineFiles(attachments, guidelinePatterns) {
|
|
|
1064
1142
|
}
|
|
1065
1143
|
return Array.from(unique.values());
|
|
1066
1144
|
}
|
|
1145
|
+
function collectAttachmentFiles(attachments) {
|
|
1146
|
+
if (!attachments || attachments.length === 0) {
|
|
1147
|
+
return [];
|
|
1148
|
+
}
|
|
1149
|
+
const unique = /* @__PURE__ */ new Map();
|
|
1150
|
+
for (const attachment of attachments) {
|
|
1151
|
+
const absolutePath = path2.resolve(attachment);
|
|
1152
|
+
if (!unique.has(absolutePath)) {
|
|
1153
|
+
unique.set(absolutePath, absolutePath);
|
|
1154
|
+
}
|
|
1155
|
+
}
|
|
1156
|
+
return Array.from(unique.values());
|
|
1157
|
+
}
|
|
1067
1158
|
function pathToFileUri(filePath) {
|
|
1068
1159
|
const absolutePath = path2.isAbsolute(filePath) ? filePath : path2.resolve(filePath);
|
|
1069
1160
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
@@ -1082,6 +1173,16 @@ function normalizeAttachments(attachments) {
|
|
|
1082
1173
|
}
|
|
1083
1174
|
return Array.from(deduped);
|
|
1084
1175
|
}
|
|
1176
|
+
function mergeAttachments(all) {
|
|
1177
|
+
const deduped = /* @__PURE__ */ new Set();
|
|
1178
|
+
for (const list of all) {
|
|
1179
|
+
if (!list) continue;
|
|
1180
|
+
for (const attachment of list) {
|
|
1181
|
+
deduped.add(path2.resolve(attachment));
|
|
1182
|
+
}
|
|
1183
|
+
}
|
|
1184
|
+
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
1185
|
+
}
|
|
1085
1186
|
async function ensureVSCodeSubagents(options) {
|
|
1086
1187
|
const { kind, count, verbose = false } = options;
|
|
1087
1188
|
const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
|
|
@@ -1811,6 +1912,12 @@ async function runEvaluation(options) {
|
|
|
1811
1912
|
};
|
|
1812
1913
|
const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
|
|
1813
1914
|
const primaryProvider = getOrCreateProvider(target);
|
|
1915
|
+
const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
|
|
1916
|
+
if (target.providerBatching && !providerSupportsBatch && verbose) {
|
|
1917
|
+
console.warn(
|
|
1918
|
+
`Provider batching requested for target '${target.name}', but provider does not advertise batch support. Using per-case dispatch.`
|
|
1919
|
+
);
|
|
1920
|
+
}
|
|
1814
1921
|
if (onProgress && filteredEvalCases.length > 0) {
|
|
1815
1922
|
for (let i = 0; i < filteredEvalCases.length; i++) {
|
|
1816
1923
|
await onProgress({
|
|
@@ -1820,6 +1927,27 @@ async function runEvaluation(options) {
|
|
|
1820
1927
|
});
|
|
1821
1928
|
}
|
|
1822
1929
|
}
|
|
1930
|
+
if (providerSupportsBatch) {
|
|
1931
|
+
try {
|
|
1932
|
+
return await runBatchEvaluation({
|
|
1933
|
+
evalCases: filteredEvalCases,
|
|
1934
|
+
provider: primaryProvider,
|
|
1935
|
+
target,
|
|
1936
|
+
graderRegistry,
|
|
1937
|
+
promptDumpDir,
|
|
1938
|
+
nowFn: now ?? (() => /* @__PURE__ */ new Date()),
|
|
1939
|
+
onProgress,
|
|
1940
|
+
onResult,
|
|
1941
|
+
verbose,
|
|
1942
|
+
resolveJudgeProvider
|
|
1943
|
+
});
|
|
1944
|
+
} catch (error) {
|
|
1945
|
+
if (verbose) {
|
|
1946
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1947
|
+
console.warn(`Provider batch execution failed, falling back to per-case dispatch: ${message}`);
|
|
1948
|
+
}
|
|
1949
|
+
}
|
|
1950
|
+
}
|
|
1823
1951
|
const workers = options.maxConcurrency ?? target.workers ?? 1;
|
|
1824
1952
|
const limit = pLimit(workers);
|
|
1825
1953
|
let nextWorkerId = 1;
|
|
@@ -1903,6 +2031,137 @@ async function runEvaluation(options) {
|
|
|
1903
2031
|
}
|
|
1904
2032
|
return results;
|
|
1905
2033
|
}
|
|
2034
|
+
async function runBatchEvaluation(options) {
|
|
2035
|
+
const {
|
|
2036
|
+
evalCases,
|
|
2037
|
+
provider,
|
|
2038
|
+
target,
|
|
2039
|
+
graderRegistry,
|
|
2040
|
+
promptDumpDir,
|
|
2041
|
+
nowFn,
|
|
2042
|
+
onProgress,
|
|
2043
|
+
onResult,
|
|
2044
|
+
resolveJudgeProvider
|
|
2045
|
+
} = options;
|
|
2046
|
+
const promptInputsList = [];
|
|
2047
|
+
for (const evalCase of evalCases) {
|
|
2048
|
+
const promptInputs = await buildPromptInputs(evalCase);
|
|
2049
|
+
if (promptDumpDir) {
|
|
2050
|
+
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
2051
|
+
}
|
|
2052
|
+
promptInputsList.push(promptInputs);
|
|
2053
|
+
}
|
|
2054
|
+
const batchRequests = evalCases.map((evalCase, index) => {
|
|
2055
|
+
const promptInputs = promptInputsList[index];
|
|
2056
|
+
return {
|
|
2057
|
+
prompt: promptInputs.request,
|
|
2058
|
+
guidelines: promptInputs.guidelines,
|
|
2059
|
+
guideline_patterns: evalCase.guideline_patterns,
|
|
2060
|
+
attachments: evalCase.file_paths,
|
|
2061
|
+
evalCaseId: evalCase.id,
|
|
2062
|
+
metadata: {
|
|
2063
|
+
systemPrompt: promptInputs.systemMessage ?? ""
|
|
2064
|
+
}
|
|
2065
|
+
};
|
|
2066
|
+
});
|
|
2067
|
+
const batchResponse = await provider.invokeBatch?.(batchRequests);
|
|
2068
|
+
if (!Array.isArray(batchResponse)) {
|
|
2069
|
+
throw new Error("Provider batching failed: invokeBatch did not return an array");
|
|
2070
|
+
}
|
|
2071
|
+
if (batchResponse.length !== evalCases.length) {
|
|
2072
|
+
throw new Error(
|
|
2073
|
+
`Provider batching failed: expected ${evalCases.length} responses, received ${batchResponse.length}`
|
|
2074
|
+
);
|
|
2075
|
+
}
|
|
2076
|
+
if (onProgress) {
|
|
2077
|
+
const startedAt = Date.now();
|
|
2078
|
+
for (let i = 0; i < evalCases.length; i++) {
|
|
2079
|
+
await onProgress({
|
|
2080
|
+
workerId: 1,
|
|
2081
|
+
evalId: evalCases[i].id,
|
|
2082
|
+
status: "running",
|
|
2083
|
+
startedAt
|
|
2084
|
+
});
|
|
2085
|
+
}
|
|
2086
|
+
}
|
|
2087
|
+
const results = [];
|
|
2088
|
+
for (let i = 0; i < evalCases.length; i++) {
|
|
2089
|
+
const evalCase = evalCases[i];
|
|
2090
|
+
const promptInputs = promptInputsList[i];
|
|
2091
|
+
const providerResponse = batchResponse[i];
|
|
2092
|
+
const now = nowFn();
|
|
2093
|
+
const graderKind = evalCase.grader ?? "heuristic";
|
|
2094
|
+
const activeGrader = graderRegistry[graderKind] ?? graderRegistry.heuristic;
|
|
2095
|
+
if (!activeGrader) {
|
|
2096
|
+
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
2097
|
+
}
|
|
2098
|
+
let grade;
|
|
2099
|
+
try {
|
|
2100
|
+
grade = await activeGrader.grade({
|
|
2101
|
+
evalCase,
|
|
2102
|
+
candidate: providerResponse.text ?? "",
|
|
2103
|
+
target,
|
|
2104
|
+
provider,
|
|
2105
|
+
attempt: 0,
|
|
2106
|
+
promptInputs,
|
|
2107
|
+
now,
|
|
2108
|
+
judgeProvider: await resolveJudgeProvider(target)
|
|
2109
|
+
});
|
|
2110
|
+
} catch (error) {
|
|
2111
|
+
const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
2112
|
+
results.push(errorResult);
|
|
2113
|
+
if (onResult) {
|
|
2114
|
+
await onResult(errorResult);
|
|
2115
|
+
}
|
|
2116
|
+
if (onProgress) {
|
|
2117
|
+
await onProgress({
|
|
2118
|
+
workerId: 1,
|
|
2119
|
+
evalId: evalCase.id,
|
|
2120
|
+
status: "failed",
|
|
2121
|
+
completedAt: Date.now(),
|
|
2122
|
+
error: error instanceof Error ? error.message : String(error)
|
|
2123
|
+
});
|
|
2124
|
+
}
|
|
2125
|
+
continue;
|
|
2126
|
+
}
|
|
2127
|
+
const completedAt = nowFn();
|
|
2128
|
+
const rawRequest = {
|
|
2129
|
+
request: promptInputs.request,
|
|
2130
|
+
guidelines: promptInputs.guidelines,
|
|
2131
|
+
guideline_paths: evalCase.guideline_paths,
|
|
2132
|
+
system_message: promptInputs.systemMessage ?? ""
|
|
2133
|
+
};
|
|
2134
|
+
const result = {
|
|
2135
|
+
eval_id: evalCase.id,
|
|
2136
|
+
conversation_id: evalCase.conversation_id,
|
|
2137
|
+
score: grade.score,
|
|
2138
|
+
hits: grade.hits,
|
|
2139
|
+
misses: grade.misses,
|
|
2140
|
+
model_answer: providerResponse.text ?? "",
|
|
2141
|
+
expected_aspect_count: grade.expectedAspectCount,
|
|
2142
|
+
target: target.name,
|
|
2143
|
+
timestamp: completedAt.toISOString(),
|
|
2144
|
+
reasoning: grade.reasoning,
|
|
2145
|
+
raw_aspects: grade.rawAspects,
|
|
2146
|
+
raw_request: rawRequest,
|
|
2147
|
+
grader_raw_request: grade.graderRawRequest
|
|
2148
|
+
};
|
|
2149
|
+
results.push(result);
|
|
2150
|
+
if (onResult) {
|
|
2151
|
+
await onResult(result);
|
|
2152
|
+
}
|
|
2153
|
+
if (onProgress) {
|
|
2154
|
+
await onProgress({
|
|
2155
|
+
workerId: 1,
|
|
2156
|
+
evalId: evalCase.id,
|
|
2157
|
+
status: "completed",
|
|
2158
|
+
startedAt: 0,
|
|
2159
|
+
completedAt: Date.now()
|
|
2160
|
+
});
|
|
2161
|
+
}
|
|
2162
|
+
}
|
|
2163
|
+
return results;
|
|
2164
|
+
}
|
|
1906
2165
|
async function runEvalCase(options) {
|
|
1907
2166
|
const {
|
|
1908
2167
|
evalCase,
|