@agentv/core 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +78 -0
- package/dist/chunk-5REK5RSI.js +86 -0
- package/dist/chunk-5REK5RSI.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +617 -0
- package/dist/evaluation/validation/index.cjs.map +1 -0
- package/dist/evaluation/validation/index.d.cts +56 -0
- package/dist/evaluation/validation/index.d.ts +56 -0
- package/dist/evaluation/validation/index.js +499 -0
- package/dist/evaluation/validation/index.js.map +1 -0
- package/dist/index.cjs +2204 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +406 -0
- package/dist/index.d.ts +406 -0
- package/dist/index.js +2079 -0
- package/dist/index.js.map +1 -0
- package/package.json +49 -0
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,2204 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Cached references to the Object built-ins used by the interop helpers below.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Installs one enumerable accessor on `target` for every key enumerated in `all`.
 * Each value in `all` is used directly as the getter function.
 */
var __export = (target, all) => {
  for (var exportName in all) {
    __defProp(target, exportName, { enumerable: true, get: all[exportName] });
  }
};

/**
 * Mirrors the own properties of `from` onto `to` as live getters.
 * Keys already owned by `to`, and the key named by `except`, are left alone.
 * Enumerability follows the source descriptor (enumerable when none is found).
 * Returns `to`; a `from` that is neither an object nor a function is ignored.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopyable = from && typeof from === "object" || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};

/**
 * Wraps a required module in an ESM-shaped namespace object whose prototype
 * matches the module's prototype (or a plain object when `mod` is nullish).
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  if (needsDefault) {
    __defProp(target, "default", { value: mod, enumerable: true });
  }
  return __copyProps(target, mod);
};

/**
 * Builds a CommonJS exports object: a fresh object flagged with a
 * non-enumerable `__esModule: true` that mirrors every own property of `mod`.
 */
var __toCommonJS = (mod) => {
  const exportsObject = __defProp({}, "__esModule", { value: true });
  return __copyProps(exportsObject, mod);
};
|
|
29
|
+
|
|
30
|
+
// src/index.ts
|
|
31
|
+
var index_exports = {};
|
|
32
|
+
__export(index_exports, {
|
|
33
|
+
GRADER_KINDS: () => GRADER_KINDS,
|
|
34
|
+
HeuristicGrader: () => HeuristicGrader,
|
|
35
|
+
QualityGrader: () => QualityGrader,
|
|
36
|
+
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
37
|
+
buildPromptInputs: () => buildPromptInputs,
|
|
38
|
+
calculateHits: () => calculateHits,
|
|
39
|
+
calculateMisses: () => calculateMisses,
|
|
40
|
+
createAgentKernel: () => createAgentKernel,
|
|
41
|
+
createProvider: () => createProvider,
|
|
42
|
+
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
43
|
+
extractAspects: () => extractAspects,
|
|
44
|
+
extractCodeBlocks: () => extractCodeBlocks,
|
|
45
|
+
getHitCount: () => getHitCount,
|
|
46
|
+
isErrorLike: () => isErrorLike,
|
|
47
|
+
isGraderKind: () => isGraderKind,
|
|
48
|
+
isGuidelineFile: () => isGuidelineFile,
|
|
49
|
+
isJsonObject: () => isJsonObject,
|
|
50
|
+
isJsonValue: () => isJsonValue,
|
|
51
|
+
isTestMessage: () => isTestMessage,
|
|
52
|
+
isTestMessageRole: () => isTestMessageRole,
|
|
53
|
+
listTargetNames: () => listTargetNames,
|
|
54
|
+
loadTestCases: () => loadTestCases,
|
|
55
|
+
readTargetDefinitions: () => readTargetDefinitions,
|
|
56
|
+
resolveAndCreateProvider: () => resolveAndCreateProvider,
|
|
57
|
+
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
58
|
+
runEvaluation: () => runEvaluation,
|
|
59
|
+
runTestCase: () => runTestCase,
|
|
60
|
+
scoreCandidateResponse: () => scoreCandidateResponse
|
|
61
|
+
});
|
|
62
|
+
module.exports = __toCommonJS(index_exports);
|
|
63
|
+
|
|
64
|
+
// src/evaluation/types.ts
|
|
65
|
+
// Roles a test message may carry. The exported list and the lookup set are
// both derived from this single array.
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);

/**
 * Type guard for message roles.
 * @param {unknown} value - Candidate role.
 * @returns {boolean} True when `value` is a string listed in TEST_MESSAGE_ROLE_VALUES.
 */
function isTestMessageRole(value) {
  if (typeof value !== "string") {
    return false;
  }
  return TEST_MESSAGE_ROLE_SET.has(value);
}
|
|
71
|
+
/**
 * Checks that `value` is a non-null, non-array object whose own enumerable
 * property values all satisfy {@link isJsonValue}.
 * @param {unknown} value - Candidate value.
 * @returns {boolean}
 */
function isJsonObject(value) {
  const isObjectContainer = value !== null && typeof value === "object" && !Array.isArray(value);
  return isObjectContainer && Object.values(value).every(isJsonValue);
}

/**
 * Checks that `value` is JSON-shaped: null, a string, number or boolean,
 * an array of JSON values, or an object accepted by {@link isJsonObject}.
 * @param {unknown} value - Candidate value.
 * @returns {boolean}
 */
function isJsonValue(value) {
  switch (typeof value) {
    case "string":
    case "number":
    case "boolean":
      return true;
    case "object":
      if (value === null) {
        return true;
      }
      return Array.isArray(value) ? value.every(isJsonValue) : isJsonObject(value);
    default:
      // undefined, functions, symbols and bigints are not JSON values.
      return false;
  }
}
|
|
89
|
+
/**
 * Type guard for a test message: a non-null object with a recognised `role`
 * and a `content` that is either a string or an array of JSON objects.
 * @param {unknown} value - Candidate message.
 * @returns {boolean}
 */
function isTestMessage(value) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  if (!isTestMessageRole(value.role)) {
    return false;
  }
  const body = value.content;
  return typeof body === "string" || (Array.isArray(body) && body.every(isJsonObject));
}
|
|
105
|
+
// Grader kinds recognised by the evaluator. The exported list and the lookup
// set are both derived from this single array.
var GRADER_KIND_VALUES = ["heuristic", "llm_judge"];
var GRADER_KINDS = GRADER_KIND_VALUES;
var GRADER_KIND_SET = new Set(GRADER_KIND_VALUES);

/**
 * Type guard for grader kinds.
 * @param {unknown} value - Candidate grader kind.
 * @returns {boolean} True when `value` is a string listed in GRADER_KIND_VALUES.
 */
function isGraderKind(value) {
  if (typeof value !== "string") {
    return false;
  }
  return GRADER_KIND_SET.has(value);
}
|
|
111
|
+
/**
 * Returns the number of entries in `result.hits`.
 * @param {{ hits: { length: number } }} result - Object exposing a `hits` collection.
 * @returns {number}
 */
function getHitCount(result) {
  const { hits } = result;
  return hits.length;
}
|
|
114
|
+
|
|
115
|
+
// src/evaluation/yaml-parser.ts
|
|
116
|
+
var import_node_fs2 = require("fs");
|
|
117
|
+
var import_promises2 = require("fs/promises");
|
|
118
|
+
var import_node_path2 = __toESM(require("path"), 1);
|
|
119
|
+
var import_node_url = require("url");
|
|
120
|
+
var import_yaml = require("yaml");
|
|
121
|
+
|
|
122
|
+
// src/evaluation/file-utils.ts
|
|
123
|
+
var import_node_fs = require("fs");
|
|
124
|
+
var import_promises = require("fs/promises");
|
|
125
|
+
var import_node_path = __toESM(require("path"), 1);
|
|
126
|
+
/**
 * Reports whether `filePath` can be accessed (F_OK check).
 * Any failure of the access call is reported as "does not exist".
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>}
 */
async function fileExists(filePath) {
  let exists = true;
  try {
    await (0, import_promises.access)(filePath, import_node_fs.constants.F_OK);
  } catch {
    exists = false;
  }
  return exists;
}
|
|
134
|
+
/**
 * Builds the ordered, de-duplicated list of directories in which file
 * references should be looked up.
 *
 * Order: the directory of `evalPath`, then each ancestor up to and including
 * `repoRoot` (or the filesystem root when `repoRoot` is not an ancestor),
 * then `repoRoot` itself, then the current working directory.
 *
 * `repoRoot` is normalized before it is compared with the walked directories,
 * so a root written with a trailing separator or redundant segments still
 * stops the upward walk instead of letting it escape to the filesystem root.
 *
 * @param {string} evalPath - Path of the eval file whose directory starts the walk.
 * @param {string} repoRoot - Repository root that bounds the walk.
 * @returns {string[]} Absolute, unique search roots in priority order.
 */
function buildSearchRoots(evalPath, repoRoot) {
  const normalizedRepoRoot = import_node_path.default.resolve(repoRoot);
  // A Set keeps insertion order and drops duplicates.
  const uniqueRoots = /* @__PURE__ */ new Set();
  const addRoot = (root) => {
    uniqueRoots.add(import_node_path.default.resolve(root));
  };
  let currentDir = import_node_path.default.dirname(evalPath);
  for (;;) {
    addRoot(currentDir);
    if (import_node_path.default.resolve(currentDir) === normalizedRepoRoot) {
      break;
    }
    const parentDir = import_node_path.default.dirname(currentDir);
    if (parentDir === currentDir) {
      // dirname() is a fixed point at the top of the tree: nothing left to climb.
      break;
    }
    currentDir = parentDir;
  }
  addRoot(normalizedRepoRoot);
  addRoot(process.cwd());
  return [...uniqueRoots];
}
|
|
157
|
+
/**
 * Removes leading `/` and `\` characters from `value`.
 * When nothing would remain, the original string is returned unchanged.
 * @param {string} value - Raw path-like string.
 * @returns {string}
 */
function trimLeadingSeparators(value) {
  const stripped = value.replace(/^[/\\]+/, "");
  if (stripped.length === 0) {
    return value;
  }
  return stripped;
}
|
|
161
|
+
/**
 * Locates the file referenced by `rawValue`.
 *
 * Candidates are tried in order: `rawValue` itself when it is absolute, then
 * the separator-trimmed value resolved against each search root. Duplicate
 * candidates are probed only once.
 *
 * @param {string} rawValue - File reference as written in the eval file.
 * @param {Iterable<string>} searchRoots - Directories to resolve against, in priority order.
 * @returns {Promise<{displayPath: string, resolvedPath?: string, attempted: string[]}>}
 *   `resolvedPath` is present only when an existing candidate was found;
 *   `attempted` lists every absolute path probed so far.
 */
async function resolveFileReference(rawValue, searchRoots) {
  const displayPath = trimLeadingSeparators(rawValue);
  const queue = import_node_path.default.isAbsolute(rawValue) ? [import_node_path.default.normalize(rawValue)] : [];
  for (const base of searchRoots) {
    queue.push(import_node_path.default.resolve(base, displayPath));
  }
  // `attempted` doubles as the record of what has already been probed.
  const attempted = [];
  for (const entry of queue) {
    const absoluteCandidate = import_node_path.default.resolve(entry);
    if (attempted.includes(absoluteCandidate)) {
      continue;
    }
    attempted.push(absoluteCandidate);
    const found = await fileExists(absoluteCandidate);
    if (found) {
      return { displayPath, resolvedPath: absoluteCandidate, attempted };
    }
  }
  return { displayPath, attempted };
}
|
|
185
|
+
|
|
186
|
+
// src/evaluation/yaml-parser.ts
|
|
187
|
+
// Matches fenced code blocks (triple backticks), non-greedy, across lines.
var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
// ANSI escape sequences used to colour warnings.
var ANSI_YELLOW = "\x1B[33m";
var ANSI_RESET = "\x1B[0m";
// The only `$schema` value accepted in eval files.
var SCHEMA_EVAL_V2 = "agentv-eval-v2";

/**
 * Decides whether a path refers to a guideline file: one ending in
 * `.instructions.md` or `.prompt.md`, or located under an `/instructions/`
 * or `/prompts/` directory. Backslashes are treated as forward slashes.
 * @param {string} filePath - Path to classify.
 * @returns {boolean}
 */
function isGuidelineFile(filePath) {
  const posixStyle = filePath.replace(/\\/g, "/");
  const guidelineSuffixes = [".instructions.md", ".prompt.md"];
  const guidelineFolders = ["/instructions/", "/prompts/"];
  return guidelineSuffixes.some((suffix) => posixStyle.endsWith(suffix)) || guidelineFolders.some((folder) => posixStyle.includes(folder));
}
|
|
195
|
+
/**
 * Collects every fenced code block found in the text segments.
 * Only segments with `type === "text"` and a string `value` are scanned.
 * @param {Iterable<Record<string, unknown>>} segments - Message segments.
 * @returns {string[]} Fenced blocks (fences included) in encounter order.
 */
function extractCodeBlocks(segments) {
  const found = [];
  for (const segment of segments) {
    if (segment["type"] !== "text") {
      continue;
    }
    const text = segment["value"];
    if (typeof text !== "string") {
      continue;
    }
    found.push(...(text.match(CODE_BLOCK_PATTERN) ?? []));
  }
  return found;
}
|
|
213
|
+
/**
 * Produces warning text for a caught value without assuming it is an Error.
 * @param {unknown} error - Value caught from a failed operation.
 * @returns {string}
 */
function describeLoadFailure(error) {
  return typeof error?.message === "string" ? error.message : String(error);
}

/**
 * Walks the user messages of one test case and splits their content into
 * prompt segments, guideline file paths and plain text parts.
 *
 * File segments are resolved against `searchRoots`. Guideline files are
 * recorded by path only; other files are inlined as
 * `{ type: "file", path, text }`. Unresolvable or unreadable files produce a
 * warning and are skipped.
 *
 * @param {Array<{content: string | object[]}>} userMessages - Validated user messages.
 * @param {string[]} searchRoots - Directories used to resolve file references.
 * @param {boolean} verbose - When true, logs each resolved file.
 * @returns {Promise<{userSegments: object[], guidelinePaths: string[], userTextParts: string[]}>}
 */
async function collectUserSegments(userMessages, searchRoots, verbose) {
  const userSegments = [];
  const guidelinePaths = [];
  const userTextParts = [];
  for (const userMessage of userMessages) {
    const content = userMessage.content;
    if (typeof content === "string") {
      userSegments.push({ type: "text", value: content });
      userTextParts.push(content);
      continue;
    }
    for (const rawSegment of content) {
      if (!isJsonObject(rawSegment)) {
        continue;
      }
      if (asString(rawSegment.type) !== "file") {
        // Non-file segments are copied so later mutation cannot reach the parsed YAML.
        const clonedSegment = cloneJsonObject(rawSegment);
        userSegments.push(clonedSegment);
        const inlineValue = clonedSegment.value;
        if (typeof inlineValue === "string") {
          userTextParts.push(inlineValue);
        }
        continue;
      }
      const rawValue = asString(rawSegment.value);
      if (!rawValue) {
        continue;
      }
      const { displayPath, resolvedPath, attempted } = await resolveFileReference(rawValue, searchRoots);
      if (!resolvedPath) {
        const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
        logWarning(`File not found: ${displayPath}`, attempts);
        continue;
      }
      try {
        // Reading up front also verifies that guideline files are readable.
        const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
        if (isGuidelineFile(displayPath)) {
          guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
          if (verbose) {
            console.log(` [Guideline] Found: ${displayPath}`);
            console.log(` Resolved to: ${resolvedPath}`);
          }
        } else {
          userSegments.push({
            type: "file",
            path: displayPath,
            text: fileContent
          });
          if (verbose) {
            console.log(` [File] Found: ${displayPath}`);
            console.log(` Resolved to: ${resolvedPath}`);
          }
        }
      } catch (error) {
        logWarning(`Could not read file ${resolvedPath}: ${describeLoadFailure(error)}`);
      }
    }
  }
  return { userSegments, guidelinePaths, userTextParts };
}

/**
 * Loads and validates the eval cases defined in a YAML test file.
 *
 * The file must parse to a JSON-shaped object whose `$schema` equals
 * SCHEMA_EVAL_V2 and whose `evalcases` field is an array. Entries that are
 * malformed or incomplete are skipped with a warning rather than failing the
 * whole load.
 *
 * @param {string} testFilePath - Path to the YAML eval file.
 * @param {string | URL} repoRoot - Repository root bounding file-reference lookup.
 * @param {{verbose?: boolean}} [options] - `verbose` enables progress logging.
 * @returns {Promise<object[]>} One record per accepted eval case.
 * @throws {Error} When the file is missing, is not an object, has a missing or
 *   wrong `$schema`, or lacks an `evalcases` array.
 */
async function loadTestCases(testFilePath, repoRoot, options) {
  const verbose = options?.verbose ?? false;
  const absoluteTestPath = import_node_path2.default.resolve(testFilePath);
  if (!(await fileExists2(absoluteTestPath))) {
    throw new Error(`Test file not found: ${testFilePath}`);
  }
  const repoRootPath = resolveToAbsolutePath(repoRoot);
  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
  const rawFile = await (0, import_promises2.readFile)(absoluteTestPath, "utf8");
  const suite = (0, import_yaml.parse)(rawFile);
  if (!isJsonObject(suite)) {
    throw new Error(`Invalid test file format: ${testFilePath}`);
  }
  const schema = suite.$schema;
  if (schema !== SCHEMA_EVAL_V2) {
    if (typeof schema === "string") {
      throw new Error(`Invalid $schema value '${schema}' in ${testFilePath}. Expected '${SCHEMA_EVAL_V2}'`);
    }
    throw new Error(`Missing required field '$schema' in ${testFilePath}.\nPlease add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`);
  }
  const rawTestcases = suite.evalcases;
  if (!Array.isArray(rawTestcases)) {
    throw new Error(`Invalid test file format: ${testFilePath} - missing 'evalcases' field`);
  }
  // Suite-level grader applies to every case that does not name its own.
  const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
  const results = [];
  for (const testcase of rawTestcases) {
    if (!isJsonObject(testcase)) {
      logWarning("Skipping invalid test case entry (expected object)");
      continue;
    }
    const id = asString(testcase.id);
    const conversationId = asString(testcase.conversation_id);
    const outcome = asString(testcase.outcome);
    const inputMessagesValue = testcase.input_messages;
    const expectedMessagesValue = testcase.expected_messages;
    if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
      logWarning(`Skipping incomplete test case: ${id ?? "unknown"}`);
      continue;
    }
    if (!Array.isArray(expectedMessagesValue)) {
      logWarning(`Test case '${id}' missing expected_messages array`);
      continue;
    }
    const userMessages = inputMessagesValue.filter((msg) => isTestMessage(msg) && msg.role === "user");
    const assistantMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg) && msg.role === "assistant");
    if (assistantMessages.length === 0) {
      logWarning(`No assistant message found for test case: ${id}`);
      continue;
    }
    if (assistantMessages.length > 1) {
      logWarning(`Multiple assistant messages found for test case: ${id}, using first`);
    }
    const { userSegments, guidelinePaths, userTextParts } = await collectUserSegments(userMessages, searchRoots, verbose);
    const codeSnippets = extractCodeBlocks(userSegments);
    const expectedAssistantRaw = normalizeAssistantContent(assistantMessages[0]?.content);
    const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
    const testCaseGrader = coerceGrader(testcase.grader) ?? globalGrader;
    const testCase = {
      id,
      conversation_id: conversationId,
      task: userTextPrompt,
      user_segments: userSegments,
      expected_assistant_raw: expectedAssistantRaw,
      guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
      code_snippets: codeSnippets,
      outcome,
      grader: testCaseGrader
    };
    if (verbose) {
      console.log(`\n[Test Case: ${id}]`);
      if (testCase.guideline_paths.length > 0) {
        console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
        for (const guidelinePath of testCase.guideline_paths) {
          console.log(` - ${guidelinePath}`);
        }
      } else {
        console.log(" No guidelines found");
      }
    }
    results.push(testCase);
  }
  return results;
}
|
|
362
|
+
/**
 * Assembles the request text and guideline text for one test case.
 *
 * Guidelines: every readable file in `testCase.guideline_paths`, each headed
 * by `=== <basename> ===`. Missing or unreadable files are skipped with a warning.
 * Request: file segments headed by `=== <path> ===`, string values of all
 * other segments, and finally the case's code snippets joined by newlines.
 * Parts are trimmed, empty parts dropped, and the rest joined by blank lines.
 *
 * @param {{guideline_paths: string[], user_segments: object[], code_snippets: string[]}} testCase
 * @returns {Promise<{request: string, guidelines: string}>}
 */
async function buildPromptInputs(testCase) {
  const joinNonEmpty = (parts) => parts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");

  const guidelineContents = [];
  for (const rawPath of testCase.guideline_paths) {
    const absolutePath = import_node_path2.default.resolve(rawPath);
    const present = await fileExists2(absolutePath);
    if (!present) {
      logWarning(`Could not read guideline file ${absolutePath}: file does not exist`);
      continue;
    }
    try {
      const rawText = await (0, import_promises2.readFile)(absolutePath, "utf8");
      const content = rawText.replace(/\r\n/g, "\n");
      guidelineContents.push(`=== ${import_node_path2.default.basename(absolutePath)} ===\n${content}`);
    } catch (error) {
      logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
    }
  }

  const requestParts = [];
  for (const segment of testCase.user_segments) {
    if (segment.type === "file") {
      const label = typeof segment.path === "string" ? segment.path : "file";
      const body = typeof segment.text === "string" ? segment.text : "";
      requestParts.push(`=== ${label} ===\n${body}`);
    } else if (typeof segment.value === "string") {
      // Text segments and any other segment kind contribute their string value.
      requestParts.push(segment.value);
    }
  }
  if (testCase.code_snippets.length > 0) {
    requestParts.push(testCase.code_snippets.join("\n"));
  }

  return {
    request: joinNonEmpty(requestParts),
    guidelines: joinNonEmpty(guidelineContents)
  };
}
|
|
409
|
+
/**
 * Reports whether `absolutePath` can be accessed (F_OK check).
 * Any failure of the access call is reported as "does not exist".
 * @param {string} absolutePath - Path to probe.
 * @returns {Promise<boolean>}
 */
async function fileExists2(absolutePath) {
  let exists = true;
  try {
    await (0, import_promises2.access)(absolutePath, import_node_fs2.constants.F_OK);
  } catch {
    exists = false;
  }
  return exists;
}
|
|
417
|
+
/**
 * Converts a repo-root value into an absolute filesystem path.
 * Accepts a URL object, a `file://` URL string, or a plain path string
 * (resolved against the current working directory).
 * @param {string | URL} candidate - Repo root as supplied by the caller.
 * @returns {string} Absolute filesystem path.
 * @throws {TypeError} When `candidate` is neither a string nor a URL.
 */
function resolveToAbsolutePath(candidate) {
  if (candidate instanceof URL) {
    return (0, import_node_url.fileURLToPath)(candidate);
  }
  if (typeof candidate !== "string") {
    throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
  }
  return candidate.startsWith("file://") ? (0, import_node_url.fileURLToPath)(new URL(candidate)) : import_node_path2.default.resolve(candidate);
}
|
|
429
|
+
/**
 * Narrows an unknown value to a string.
 * @param {unknown} value - Candidate value.
 * @returns {string | undefined} `value` itself when it is a string, otherwise undefined.
 */
function asString(value) {
  if (typeof value === "string") {
    return value;
  }
  return void 0;
}
|
|
432
|
+
/**
 * Deep-copies an object by cloning each own enumerable property value.
 * @param {Record<string, unknown>} source - Object to copy.
 * @returns {Record<string, unknown>} A new object with cloned values.
 */
function cloneJsonObject(source) {
  const clonedPairs = [];
  for (const [key, value] of Object.entries(source)) {
    clonedPairs.push([key, cloneJsonValue(value)]);
  }
  return Object.fromEntries(clonedPairs);
}

/**
 * Deep-copies a JSON value: primitives and null are returned as-is, arrays
 * are mapped element by element, everything else goes through
 * {@link cloneJsonObject}.
 * @param {unknown} value - Value to copy.
 * @returns {unknown}
 */
function cloneJsonValue(value) {
  switch (typeof value) {
    case "string":
    case "number":
    case "boolean":
      return value;
    default:
      if (value === null) {
        return null;
      }
      return Array.isArray(value) ? value.map((item) => cloneJsonValue(item)) : cloneJsonObject(value);
  }
}
|
|
448
|
+
/**
 * Flattens assistant message content into a single string.
 *
 * A string is returned unchanged and any other falsy value yields "".
 * Otherwise each entry contributes, in order of preference: itself when it is
 * a string, its string `text`, its string `value`, or its JSON serialization.
 * Contributions are joined with a single space.
 *
 * @param {string | Iterable<unknown> | null | undefined} content - Message content.
 * @returns {string}
 */
function normalizeAssistantContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!content) {
    return "";
  }
  const pieces = [];
  for (const entry of content) {
    if (typeof entry === "string") {
      pieces.push(entry);
      continue;
    }
    const textValue = asString(entry["text"]);
    if (textValue !== void 0) {
      pieces.push(textValue);
      continue;
    }
    const valueValue = asString(entry["value"]);
    pieces.push(valueValue !== void 0 ? valueValue : JSON.stringify(entry));
  }
  return pieces.join(" ");
}
|
|
475
|
+
/**
 * Validates a grader name read from an eval file.
 * Non-string input is ignored silently; an unrecognised string triggers a warning.
 * @param {unknown} candidate - Grader value from the suite or test case.
 * @returns {string | undefined} The grader kind when recognised, otherwise undefined.
 */
function coerceGrader(candidate) {
  if (typeof candidate === "string") {
    if (isGraderKind(candidate)) {
      return candidate;
    }
    logWarning(`Unknown grader '${candidate}', falling back to default`);
  }
  return void 0;
}
|
|
485
|
+
/**
 * Writes a yellow "Warning: ..." line to stderr via console.warn.
 * When `details` has entries they are appended on following lines inside the
 * same coloured block.
 * @param {string} message - Warning text.
 * @param {string[]} [details] - Optional extra lines.
 * @returns {void}
 */
function logWarning(message, details) {
  const lines = [`Warning: ${message}`];
  if (details && details.length > 0) {
    lines.push(details.join("\n"));
  }
  console.warn(`${ANSI_YELLOW}${lines.join("\n")}${ANSI_RESET}`);
}
|
|
494
|
+
|
|
495
|
+
// src/evaluation/providers/ax.ts
|
|
496
|
+
var import_ax = require("@ax-llm/ax");
|
|
497
|
+
// System prompt used when a request supplies neither guidelines nor a metadata system prompt.
var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";

/**
 * Builds the chat message list for a provider request.
 *
 * A truthy `request.chatPrompt` is returned untouched. Otherwise the result
 * is a system message followed by a user message. The system message joins
 * the trimmed guidelines (prefixed with "Guidelines:") and the trimmed
 * `metadata.systemPrompt` with a blank line, falling back to
 * DEFAULT_SYSTEM_PROMPT when both are empty. The user message is the trimmed prompt.
 *
 * @param {{chatPrompt?: unknown, guidelines?: string, metadata?: {systemPrompt?: unknown}, prompt: string}} request
 * @returns {unknown} The preset chat prompt, or a `[system, user]` message pair.
 */
function buildChatPrompt(request) {
  const { chatPrompt, guidelines, metadata, prompt } = request;
  if (chatPrompt) {
    return chatPrompt;
  }

  const systemSegments = [];
  const trimmedGuidelines = guidelines ? guidelines.trim() : "";
  if (trimmedGuidelines.length > 0) {
    systemSegments.push(`Guidelines:\n${trimmedGuidelines}`);
  }
  const rawSystemPrompt = metadata?.systemPrompt;
  const trimmedSystemPrompt = typeof rawSystemPrompt === "string" ? rawSystemPrompt.trim() : "";
  if (trimmedSystemPrompt.length > 0) {
    systemSegments.push(trimmedSystemPrompt);
  }

  let systemContent = DEFAULT_SYSTEM_PROMPT;
  if (systemSegments.length > 0) {
    systemContent = systemSegments.join("\n\n");
  }
  return [
    { role: "system", content: systemContent },
    { role: "user", content: prompt.trim() }
  ];
}
|
|
525
|
+
/**
 * Resolves per-call model settings, preferring request values over defaults.
 * `maxOutputTokens` is emitted under the key `maxTokens`.
 * @param {{temperature?: number, maxOutputTokens?: number}} request - Per-call overrides.
 * @param {{temperature?: number, maxOutputTokens?: number}} defaults - Provider defaults.
 * @returns {{temperature?: number, maxTokens?: number} | undefined}
 *   Only the settings that resolved to something other than undefined, or
 *   undefined when none did.
 */
function extractModelConfig(request, defaults) {
  const resolved = {
    temperature: request.temperature ?? defaults.temperature,
    maxTokens: request.maxOutputTokens ?? defaults.maxOutputTokens
  };
  const definedEntries = Object.entries(resolved).filter(([, setting]) => setting !== void 0);
  if (definedEntries.length === 0) {
    return void 0;
  }
  return Object.fromEntries(definedEntries);
}
|
|
537
|
+
/**
 * Converts a provider chat response into the evaluator's response shape.
 * Only the first entry of `response.results` is consulted.
 * @param {{results: Array<{content?: unknown, thought?: unknown, thoughtBlock?: {data?: unknown}}>, modelUsage?: unknown}} response
 * @returns {{text: string, reasoning: unknown, raw: object, usage: unknown}}
 *   `text` is "" when the first result has no string content; `reasoning`
 *   prefers `thought` over `thoughtBlock.data`; `usage` is a JSON copy of
 *   `modelUsage`.
 */
function mapResponse(response) {
  const primary = response.results[0];
  const content = primary?.content;
  return {
    text: typeof content === "string" ? content : "",
    reasoning: primary?.thought ?? primary?.thoughtBlock?.data,
    raw: response,
    usage: toJsonObject(response.modelUsage)
  };
}
|
|
549
|
+
/**
 * Produces a JSON round-tripped copy of an object.
 * @param {unknown} value - Value to copy.
 * @returns {unknown} The copy, or undefined when `value` is not an object or
 *   cannot be serialized and parsed back.
 */
function toJsonObject(value) {
  if (value === null || typeof value !== "object") {
    return void 0;
  }
  let copy;
  try {
    copy = JSON.parse(JSON.stringify(value));
  } catch {
    // Serialization failures (e.g. cycles) are treated as "no usable data".
    copy = void 0;
  }
  return copy;
}
|
|
559
|
+
/**
 * Asserts that a provider result is a non-streaming chat response.
 * @param {unknown} result - Value returned by the provider's chat call.
 * @returns {object} `result` itself, once it is known to be an object with a `results` property.
 * @throws {Error} When `result` is a ReadableStream, or is not an object carrying `results`.
 */
function ensureChatResponse(result) {
  // ReadableStream may be absent from the runtime, hence the typeof guard.
  const isStream = typeof ReadableStream !== "undefined" && result instanceof ReadableStream;
  if (isStream) {
    throw new Error("Streaming responses are not supported for this provider");
  }
  const hasResults = Boolean(result) && typeof result === "object" && "results" in result;
  if (!hasResults) {
    throw new Error("Unexpected response type from AxAI provider");
  }
  return result;
}
|
|
568
|
+
/**
 * Provider that sends chat requests to Azure OpenAI through an AxAI client.
 * The configured deployment name is used as the model for every call.
 */
var AzureProvider = class {
  id;
  kind = "azure";
  targetName;
  ai;
  defaults;

  /**
   * @param {string} targetName - Name of the target; also used to build `id`.
   * @param {object} config - Azure settings: apiKey, resourceName, deploymentName,
   *   version, and optional temperature / maxOutputTokens defaults.
   */
  constructor(targetName, config) {
    this.config = config;
    this.id = `azure:${targetName}`;
    this.targetName = targetName;
    const { temperature, maxOutputTokens } = config;
    this.defaults = { temperature, maxOutputTokens };
    this.ai = import_ax.AxAI.create({
      name: "azure-openai",
      apiKey: config.apiKey,
      resourceName: config.resourceName,
      deploymentName: config.deploymentName,
      version: config.version,
      config: {
        stream: false
      }
    });
  }

  /**
   * Runs one chat completion and maps the result to the evaluator's shape.
   * @param {object} request - Provider request; `signal`, when present, is forwarded as the abort signal.
   * @returns {Promise<object>} Mapped response.
   */
  async invoke(request) {
    const chatPrompt = buildChatPrompt(request);
    const modelConfig = extractModelConfig(request, this.defaults);
    const chatRequest = { chatPrompt, model: this.config.deploymentName };
    if (modelConfig) {
      chatRequest.modelConfig = modelConfig;
    }
    const callOptions = request.signal ? { abortSignal: request.signal } : void 0;
    const response = await this.ai.chat(chatRequest, callOptions);
    return mapResponse(ensureChatResponse(response));
  }
};
|
|
607
|
+
/**
 * Provider that sends chat requests to Anthropic through an AxAI client.
 * The configured `model` is used for every call.
 */
var AnthropicProvider = class {
  id;
  kind = "anthropic";
  targetName;
  ai;
  defaults;

  /**
   * @param {string} targetName - Name of the target; also used to build `id`.
   * @param {object} config - Anthropic settings: apiKey, model, and optional
   *   temperature / maxOutputTokens / thinkingBudget defaults.
   */
  constructor(targetName, config) {
    this.config = config;
    this.id = `anthropic:${targetName}`;
    this.targetName = targetName;
    // NOTE(review): thinkingBudget is stored here, but nothing in this class reads it back; confirm where it is consumed.
    const { temperature, maxOutputTokens, thinkingBudget } = config;
    this.defaults = { temperature, maxOutputTokens, thinkingBudget };
    this.ai = import_ax.AxAI.create({
      name: "anthropic",
      apiKey: config.apiKey
    });
  }

  /**
   * Runs one chat completion and maps the result to the evaluator's shape.
   * @param {object} request - Provider request; `signal`, when present, is forwarded as the abort signal.
   * @returns {Promise<object>} Mapped response.
   */
  async invoke(request) {
    const chatPrompt = buildChatPrompt(request);
    const modelConfig = extractModelConfig(request, this.defaults);
    const chatRequest = { chatPrompt, model: this.config.model };
    if (modelConfig) {
      chatRequest.modelConfig = modelConfig;
    }
    const callOptions = request.signal ? { abortSignal: request.signal } : void 0;
    const response = await this.ai.chat(chatRequest, callOptions);
    return mapResponse(ensureChatResponse(response));
  }
};
|
|
641
|
+
/**
 * Provider that sends chat requests to Google Gemini through an AxAI client.
 * The configured `model` is used for every call.
 */
var GeminiProvider = class {
  id;
  kind = "gemini";
  targetName;
  ai;
  defaults;

  /**
   * @param {string} targetName - Name of the target; also used to build `id`.
   * @param {object} config - Gemini settings: apiKey, model, and optional
   *   temperature / maxOutputTokens defaults.
   */
  constructor(targetName, config) {
    this.config = config;
    this.id = `gemini:${targetName}`;
    this.targetName = targetName;
    const { temperature, maxOutputTokens } = config;
    this.defaults = { temperature, maxOutputTokens };
    this.ai = import_ax.AxAI.create({
      name: "google-gemini",
      apiKey: config.apiKey
    });
  }

  /**
   * Runs one chat completion and maps the result to the evaluator's shape.
   * @param {object} request - Provider request; `signal`, when present, is forwarded as the abort signal.
   * @returns {Promise<object>} Mapped response.
   */
  async invoke(request) {
    const chatPrompt = buildChatPrompt(request);
    const modelConfig = extractModelConfig(request, this.defaults);
    const chatRequest = { chatPrompt, model: this.config.model };
    if (modelConfig) {
      chatRequest.modelConfig = modelConfig;
    }
    const callOptions = request.signal ? { abortSignal: request.signal } : void 0;
    const response = await this.ai.chat(chatRequest, callOptions);
    return mapResponse(ensureChatResponse(response));
  }
};
|
|
674
|
+
|
|
675
|
+
// src/evaluation/providers/mock.ts
|
|
676
|
+
// Canned reply returned by the mock provider when the target config supplies none.
var DEFAULT_MOCK_RESPONSE = '{"answer":"Mock provider response. Configure targets.yaml to supply a custom value."}';

/**
 * Provider that returns a fixed response, optionally after an artificial delay.
 */
var MockProvider = class {
  id;
  kind = "mock";
  targetName;
  cannedResponse;
  delayMs;
  delayMinMs;
  delayMaxMs;

  /**
   * @param {string} targetName - Name of the target; also used to build `id`.
   * @param {{response?: string, delayMs?: number, delayMinMs?: number, delayMaxMs?: number}} config
   *   Missing values fall back to DEFAULT_MOCK_RESPONSE and zero delays.
   */
  constructor(targetName, config) {
    this.id = `mock:${targetName}`;
    this.targetName = targetName;
    this.cannedResponse = config.response ?? DEFAULT_MOCK_RESPONSE;
    this.delayMs = config.delayMs ?? 0;
    this.delayMinMs = config.delayMinMs ?? 0;
    this.delayMaxMs = config.delayMaxMs ?? 0;
  }

  /**
   * Waits for the computed delay (if any), then returns the canned response.
   * @param {{prompt?: unknown, guidelines?: unknown}} request - Echoed back under `raw`.
   * @returns {Promise<{text: string, raw: {prompt: unknown, guidelines: unknown}}>}
   */
  async invoke(request) {
    const pauseMs = this.calculateDelay();
    if (pauseMs > 0) {
      await new Promise((resolve) => setTimeout(resolve, pauseMs));
    }
    const { prompt, guidelines } = request;
    return {
      text: this.cannedResponse,
      raw: { prompt, guidelines }
    };
  }

  /**
   * Picks the delay for one call.
   * When either range bound is positive, returns a random integer in
   * [max(0, delayMinMs), max(that lower bound, delayMaxMs)]; otherwise the fixed `delayMs`.
   * @returns {number} Delay in milliseconds.
   */
  calculateDelay() {
    const hasRange = this.delayMinMs > 0 || this.delayMaxMs > 0;
    if (!hasRange) {
      return this.delayMs;
    }
    const lower = Math.max(0, this.delayMinMs);
    const upper = Math.max(lower, this.delayMaxMs);
    return Math.floor(Math.random() * (upper - lower + 1)) + lower;
  }
};
|
|
715
|
+
|
|
716
|
+
// src/evaluation/providers/targets.ts
|
|
717
|
+
var import_zod = require("zod");
|
|
718
|
+
// Zod schema for the fields every target definition shares, regardless of provider.
var BASE_TARGET_SCHEMA = import_zod.z.object({
  // Non-empty target name.
  name: import_zod.z.string().min(1, "target name is required"),
  // Non-empty provider identifier.
  provider: import_zod.z.string().min(1, "provider is required"),
  // Optional free-form settings record; values are not validated at this level.
  settings: import_zod.z.record(import_zod.z.unknown()).optional(),
  // Optional judge target name.
  judge_target: import_zod.z.string().optional(),
  // Optional worker count: an integer of at least 1.
  workers: import_zod.z.number().int().min(1).optional()
});
|
|
725
|
+
// API version used when a target does not specify a usable one.
var DEFAULT_AZURE_API_VERSION = "2024-10-01-preview";

/**
 * Cleans up an Azure API version string.
 * Surrounding whitespace and a leading `api-version=` / `api_version=` /
 * `apiversion=` prefix (case-insensitive) are removed.
 * @param {string | null | undefined} value - Raw version value.
 * @returns {string} The cleaned version, or DEFAULT_AZURE_API_VERSION when nothing usable remains.
 */
function normalizeAzureApiVersion(value) {
  if (!value) {
    return DEFAULT_AZURE_API_VERSION;
  }
  const cleaned = value.trim().replace(/^api[-_]?version\s*=\s*/i, "").trim();
  if (cleaned.length === 0) {
    return DEFAULT_AZURE_API_VERSION;
  }
  return cleaned;
}
|
|
737
|
+
/**
 * Validates a raw target definition and resolves its provider-specific config.
 *
 * @param {object} definition - Raw target entry (name, provider, settings, ...).
 * @param {object} [env=process.env] - Environment used to resolve references.
 * @returns {{kind: string, name: string, judgeTarget: (string|undefined),
 *   workers: (number|undefined), config: object}} The resolved target.
 * @throws {Error} When the definition fails schema validation, a setting
 *   cannot be resolved, or the provider is not supported.
 */
function resolveTargetDefinition(definition, env = process.env) {
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
  const provider = parsed.provider.toLowerCase();
  // Every resolved target shares the same envelope around its config.
  const envelope = (kind, config) => ({
    kind,
    name: parsed.name,
    judgeTarget: parsed.judge_target,
    workers: parsed.workers,
    config
  });

  if (provider === "azure" || provider === "azure-openai") {
    return envelope("azure", resolveAzureConfig(parsed, env));
  }
  if (provider === "anthropic") {
    return envelope("anthropic", resolveAnthropicConfig(parsed, env));
  }
  if (provider === "gemini" || provider === "google" || provider === "google-gemini") {
    return envelope("gemini", resolveGeminiConfig(parsed, env));
  }
  if (provider === "mock") {
    return envelope("mock", resolveMockConfig(parsed));
  }
  if (provider === "vscode" || provider === "vscode-insiders") {
    return envelope(provider, resolveVSCodeConfig(parsed, env, provider === "vscode-insiders"));
  }
  throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
}
|
|
789
|
+
/**
 * Resolves the Azure OpenAI settings of a target.
 *
 * Endpoint, API key and deployment are mandatory; API version, temperature
 * and max output tokens are optional. Several setting aliases are accepted.
 *
 * @param {object} target - Parsed target definition.
 * @param {object} env - Environment used to resolve references.
 * @returns {object} Resolved Azure configuration.
 * @throws {Error} When a mandatory setting is missing or a value is malformed.
 */
function resolveAzureConfig(target, env) {
  const settings = target.settings ?? {};
  const label = (suffix) => `${target.name} ${suffix}`;

  const resourceName = resolveString(
    settings.endpoint ?? settings.resource ?? settings.resourceName,
    env,
    label("endpoint")
  );
  const apiKey = resolveString(settings.api_key ?? settings.apiKey, env, label("api key"));
  const deploymentName = resolveString(
    settings.deployment ?? settings.deploymentName ?? settings.model,
    env,
    label("deployment")
  );
  const rawVersion = resolveOptionalString(
    settings.version ?? settings.api_version,
    env,
    label("api version")
  );
  const version = normalizeAzureApiVersion(rawVersion);
  const temperature = resolveOptionalNumber(settings.temperature, label("temperature"));
  const maxOutputTokens = resolveOptionalNumber(
    settings.max_output_tokens ?? settings.maxTokens,
    label("max output tokens")
  );

  return { resourceName, deploymentName, apiKey, version, temperature, maxOutputTokens };
}
|
|
817
|
+
/**
 * Resolves the Anthropic settings of a target.
 *
 * API key and model are mandatory; temperature, max output tokens and
 * thinking budget are optional numbers.
 *
 * @param {object} target - Parsed target definition.
 * @param {object} env - Environment used to resolve references.
 * @returns {object} Resolved Anthropic configuration.
 * @throws {Error} When a mandatory setting is missing or a value is malformed.
 */
function resolveAnthropicConfig(target, env) {
  const settings = target.settings ?? {};
  const { name } = target;

  const apiKey = resolveString(settings.api_key ?? settings.apiKey, env, `${name} Anthropic api key`);
  const model = resolveString(
    settings.model ?? settings.deployment ?? settings.variant,
    env,
    `${name} Anthropic model`
  );
  const temperature = resolveOptionalNumber(settings.temperature, `${name} temperature`);
  const maxOutputTokens = resolveOptionalNumber(
    settings.max_output_tokens ?? settings.maxTokens,
    `${name} max output tokens`
  );
  const thinkingBudget = resolveOptionalNumber(
    settings.thinking_budget ?? settings.thinkingBudget,
    `${name} thinking budget`
  );

  return { apiKey, model, temperature, maxOutputTokens, thinkingBudget };
}
|
|
834
|
+
/**
 * Resolves the Google Gemini settings of a target.
 *
 * The API key is mandatory. The model may be a literal or an environment
 * reference and defaults to "gemini-2.5-flash" when it resolves to nothing.
 *
 * @param {object} target - Parsed target definition.
 * @param {object} env - Environment used to resolve references.
 * @returns {object} Resolved Gemini configuration.
 * @throws {Error} When the API key is missing or a value is malformed.
 */
function resolveGeminiConfig(target, env) {
  const settings = target.settings ?? {};
  const { name } = target;

  const apiKey = resolveString(settings.api_key ?? settings.apiKey, env, `${name} Google API key`);
  const configuredModel = resolveOptionalString(
    settings.model ?? settings.deployment ?? settings.variant,
    env,
    `${name} Gemini model`,
    { allowLiteral: true, optionalEnv: true }
  );
  const model = configuredModel ?? "gemini-2.5-flash";
  const temperature = resolveOptionalNumber(settings.temperature, `${name} temperature`);
  const maxOutputTokens = resolveOptionalNumber(
    settings.max_output_tokens ?? settings.maxTokens,
    `${name} max output tokens`
  );

  return { apiKey, model, temperature, maxOutputTokens };
}
|
|
852
|
+
/**
 * Resolves the mock provider settings of a target.
 *
 * @param {object} target - Parsed target definition.
 * @returns {{response: (string|undefined)}} The canned response when it is a
 *   string, otherwise `undefined`.
 */
function resolveMockConfig(target) {
  const candidate = (target.settings ?? {}).response;
  if (typeof candidate === "string") {
    return { response: candidate };
  }
  return { response: undefined };
}
|
|
857
|
+
/**
 * Resolves the VS Code provider settings of a target.
 *
 * The workspace template setting names an environment variable (literals are
 * rejected) and resolves to `undefined` when that variable is unset. The
 * command defaults to `code` or `code-insiders` depending on `insiders`.
 *
 * @param {object} target - Parsed target definition.
 * @param {object} env - Environment used to resolve references.
 * @param {boolean} insiders - Whether the target is the Insiders build.
 * @returns {object} Resolved VS Code configuration.
 * @throws {Error} When a setting has the wrong type.
 */
function resolveVSCodeConfig(target, env, insiders) {
  const settings = target.settings ?? {};

  const templateVarName = resolveOptionalLiteralString(
    settings.workspace_template ?? settings.workspaceTemplate
  );
  let workspaceTemplate;
  if (templateVarName) {
    workspaceTemplate = resolveOptionalString(
      templateVarName,
      env,
      `${target.name} workspace template path`,
      { allowLiteral: false, optionalEnv: true }
    );
  }

  const fallbackCommand = insiders ? "code-insiders" : "code";
  const command = resolveOptionalLiteralString(settings.vscode_cmd ?? settings.command) ?? fallbackCommand;
  const waitForResponse = resolveOptionalBoolean(settings.wait) ?? true;
  const dryRun = resolveOptionalBoolean(settings.dry_run ?? settings.dryRun) ?? false;
  const subagentRoot = resolveOptionalString(
    settings.subagent_root ?? settings.subagentRoot,
    env,
    `${target.name} subagent root`,
    { allowLiteral: true, optionalEnv: true }
  );

  return { command, waitForResponse, dryRun, subagentRoot, workspaceTemplate };
}
|
|
881
|
+
/**
 * Resolves a mandatory string setting.
 *
 * @param {unknown} source - Raw setting value (literal or env reference).
 * @param {object} env - Environment used to resolve references.
 * @param {string} description - Human-readable setting name for errors.
 * @param {boolean} [allowLiteral=false] - Accept values that look like an
 *   environment variable name even when no such variable is set.
 * @returns {string} The resolved value.
 * @throws {Error} When the setting resolves to nothing or is invalid.
 */
function resolveString(source, env, description, allowLiteral = false) {
  const resolved = resolveOptionalString(source, env, description, {
    allowLiteral,
    optionalEnv: false
  });
  if (resolved !== undefined) {
    return resolved;
  }
  throw new Error(`${description} is required`);
}
|
|
891
|
+
/**
 * Resolves an optional string setting that may be a literal or the name of
 * an environment variable.
 *
 * Resolution order:
 *  1. `undefined`, `null` and blank strings resolve to `undefined`.
 *  2. If `env` holds a string under the trimmed value, that string is returned.
 *  3. Otherwise, a value that looks like an env variable name is an error
 *     (or `undefined` with `optionalEnv`) unless `allowLiteral` is set.
 *  4. Anything else is returned as a trimmed literal.
 *
 * @param {unknown} source - Raw setting value.
 * @param {object} env - Environment map (e.g. `process.env`).
 * @param {string} description - Human-readable setting name for errors.
 * @param {{allowLiteral?: boolean, optionalEnv?: boolean}} [options]
 * @returns {string | undefined} The resolved value, if any.
 * @throws {Error} When `source` is not a string, the referenced variable is
 *   blank, or a required variable is not set.
 */
function resolveOptionalString(source, env, description, options) {
  if (source === void 0 || source === null) {
    return void 0;
  }
  if (typeof source !== "string") {
    throw new Error(`${description} must be a string`);
  }
  const trimmed = source.trim();
  if (trimmed.length === 0) {
    return void 0;
  }
  const envValue = env[trimmed];
  // Only real string entries count as environment hits. A plain lookup also
  // finds inherited members such as `toString` or `constructor`, which made
  // literals with those names crash on `.trim()`.
  if (typeof envValue === "string") {
    if (envValue.trim().length === 0) {
      throw new Error(`Environment variable '${trimmed}' for ${description} is empty`);
    }
    return envValue;
  }
  const allowLiteral = options?.allowLiteral ?? false;
  const optionalEnv = options?.optionalEnv ?? false;
  if (!allowLiteral && isLikelyEnvReference(trimmed)) {
    if (optionalEnv) {
      return void 0;
    }
    throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
  }
  return trimmed;
}
|
|
919
|
+
/**
 * Reads an optional literal string setting (no environment lookup).
 *
 * @param {unknown} source - Raw setting value.
 * @returns {string | undefined} The trimmed string, or `undefined` when the
 *   value is nullish or blank.
 * @throws {Error} When the value is present but not a string.
 */
function resolveOptionalLiteralString(source) {
  if (source === undefined || source === null) {
    return undefined;
  }
  if (typeof source !== "string") {
    throw new Error("expected string value");
  }
  const text = source.trim();
  if (text.length === 0) {
    return undefined;
  }
  return text;
}
|
|
929
|
+
/**
 * Reads an optional numeric setting.
 *
 * Accepts finite numbers and numeric strings. Nullish values, blank strings
 * and non-finite numbers resolve to `undefined`.
 *
 * @param {unknown} source - Raw setting value.
 * @param {string} description - Human-readable setting name for errors.
 * @returns {number | undefined} The parsed number, if any.
 * @throws {Error} When the value is present but not numeric.
 */
function resolveOptionalNumber(source, description) {
  if (source === void 0 || source === null) {
    return void 0;
  }
  if (typeof source === "number") {
    return Number.isFinite(source) ? source : void 0;
  }
  if (typeof source === "string") {
    const trimmed = source.trim();
    // `Number("   ")` is 0, so a whitespace-only value must be treated as
    // "not set" (like "") rather than silently becoming zero.
    if (trimmed.length === 0) {
      return void 0;
    }
    const numeric = Number(trimmed);
    if (Number.isFinite(numeric)) {
      return numeric;
    }
  }
  throw new Error(`${description} must be a number`);
}
|
|
944
|
+
/**
 * Reads an optional boolean setting.
 *
 * Accepts booleans and the strings "true"/"1"/"false"/"0" (case-insensitive,
 * surrounding whitespace ignored). Nullish values and "" yield `undefined`.
 *
 * @param {unknown} source - Raw setting value.
 * @returns {boolean | undefined} The parsed flag, if any.
 * @throws {Error} When the value is present but not a recognised boolean.
 */
function resolveOptionalBoolean(source) {
  if (source === undefined || source === null || source === "") {
    return undefined;
  }
  if (typeof source === "boolean") {
    return source;
  }
  if (typeof source === "string") {
    switch (source.trim().toLowerCase()) {
      case "true":
      case "1":
        return true;
      case "false":
      case "0":
        return false;
      default:
        break;
    }
  }
  throw new Error("expected boolean value");
}
|
|
962
|
+
/**
 * Tells whether a value looks like an environment variable name, i.e. it
 * consists solely of upper-case ASCII letters, digits and underscores.
 *
 * @param {string} value - Candidate text.
 * @returns {boolean} True when the value matches the pattern.
 */
function isLikelyEnvReference(value) {
  const envNamePattern = /^[A-Z0-9_]+$/;
  return envNamePattern.exec(value) !== null;
}
|
|
965
|
+
|
|
966
|
+
// src/evaluation/providers/vscode.ts
|
|
967
|
+
var import_promises3 = require("fs/promises");
|
|
968
|
+
var import_node_os = require("os");
|
|
969
|
+
var import_node_path3 = __toESM(require("path"), 1);
|
|
970
|
+
var import_subagent = require("subagent");
|
|
971
|
+
// Prefix of the per-request temporary directory that holds the prompt file.
var PROMPT_FILE_PREFIX = "bbeval-vscode-";

/**
 * Provider that hands a request to a VS Code subagent session and reads the
 * response file it produces.
 */
var VSCodeProvider = class {
  id;
  kind;
  targetName;
  config;

  /**
   * @param {string} targetName - Name of the target this provider serves.
   * @param {object} config - Resolved VS Code configuration (command,
   *   waitForResponse, dryRun, subagentRoot, workspaceTemplate).
   * @param {string} kind - Provider kind; used as the prefix of the id.
   */
  constructor(targetName, config, kind) {
    this.id = `${kind}:${targetName}`;
    this.kind = kind;
    this.targetName = targetName;
    this.config = config;
  }

  /**
   * Writes the prompt to a temporary file, dispatches a subagent session and
   * returns the response text. The temporary directory is always removed.
   *
   * @param {object} request - Provider request (prompt, guidelines,
   *   attachments, testCaseId, metadata, signal).
   * @returns {Promise<{text: string, raw: object}>} Response text (empty in
   *   dry-run mode) plus the session details.
   * @throws {Error} When the request was already aborted, or the session
   *   failed or produced no response file.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("VS Code provider request was aborted before dispatch");
    }
    const { mkdtemp, writeFile, readFile, rm } = import_promises3;
    const attachments = normalizeAttachments(request.attachments);
    const promptContent = buildPromptDocument(request, attachments);
    const directory = await mkdtemp(
      import_node_path3.default.join((0, import_node_os.tmpdir)(), PROMPT_FILE_PREFIX)
    );
    // The test case id becomes part of a file name. Path separators and other
    // characters that are illegal in file names are replaced so the prompt
    // file is always created inside the temporary directory.
    const promptStem = String(request.testCaseId ?? "request").replace(/[\\/:*?"<>|\u0000-\u001f]/g, "_");
    const promptPath = import_node_path3.default.join(directory, `${promptStem}.prompt.md`);
    try {
      await writeFile(promptPath, promptContent, "utf8");
      const session = await (0, import_subagent.dispatchAgentSession)({
        userQuery: composeUserQuery(request),
        promptFile: promptPath,
        extraAttachments: attachments,
        wait: this.config.waitForResponse,
        dryRun: this.config.dryRun,
        vscodeCmd: this.config.command,
        subagentRoot: this.config.subagentRoot,
        workspaceTemplate: this.config.workspaceTemplate,
        silent: true
      });
      if (session.exitCode !== 0 || !session.responseFile) {
        const failure = session.error ?? "VS Code subagent did not produce a response";
        throw new Error(failure);
      }
      const raw = { session, promptFile: promptPath, attachments };
      if (this.config.dryRun) {
        // Dry runs never read the response file.
        return { text: "", raw };
      }
      const responseText = await readFile(session.responseFile, "utf8");
      return { text: responseText, raw };
    } finally {
      await rm(directory, { recursive: true, force: true });
    }
  }
};
|
|
1032
|
+
/**
 * Renders the Markdown prompt document handed to the VS Code subagent.
 *
 * Sections, in order: optional mandatory pre-read block, request header with
 * test case / target, the task, optional guidelines, optional attachments.
 *
 * @param {object} request - Provider request.
 * @param {string[] | undefined} attachments - Normalized attachment paths.
 * @returns {string} The trimmed document text.
 */
function buildPromptDocument(request, attachments) {
  const sections = [];

  const prereadFiles = collectInstructionFiles(attachments);
  if (prereadFiles.length > 0) {
    sections.push(buildMandatoryPrereadBlock(prereadFiles));
  }

  sections.push(`# BbEval Request`);
  const { testCaseId, metadata, guidelines } = request;
  if (testCaseId) {
    sections.push(`- Test Case: ${testCaseId}`);
  }
  if (metadata?.target) {
    sections.push(`- Target: ${String(metadata.target)}`);
  }

  sections.push("\n## Task\n", request.prompt.trim());

  const guidelineText = guidelines ? guidelines.trim() : "";
  if (guidelineText.length > 0) {
    sections.push("\n## Guidelines\n", guidelineText);
  }

  if (attachments && attachments.length > 0) {
    const bullets = attachments.map((item) => `- ${item}`);
    sections.push("\n## Attachments\n", bullets.join("\n"));
  }

  return sections.join("\n").trim();
}
|
|
1055
|
+
/**
 * Builds the "mandatory pre-read" preamble that tells the agent to read each
 * instruction file and echo a token containing its SHA256 hash.
 *
 * @param {string[]} instructionFiles - Absolute paths of instruction files.
 * @returns {string} The preamble block, or "" when there are no files.
 */
function buildMandatoryPrereadBlock(instructionFiles) {
  if (instructionFiles.length === 0) {
    return "";
  }
  const baseName = (absolutePath) => import_node_path3.default.basename(absolutePath);
  const filesText = instructionFiles
    .map((absolutePath) => `[${baseName(absolutePath)}](${pathToFileUri(absolutePath)})`)
    .join(", ");
  // Tokens are numbered starting at 1, in the order the files were given.
  const tokensText = instructionFiles
    .map((absolutePath, position) => `INSTRUCTIONS_READ: \`${baseName(absolutePath)}\` i=${position + 1} SHA256=<hex>`)
    .join("\n");
  const sentences = [
    `Read all instruction files: ${filesText}.`,
    `After reading each file, compute its SHA256 hash using this PowerShell command:`,
    "`Get-FileHash -Algorithm SHA256 -LiteralPath '<file-path>' | Select-Object -ExpandProperty Hash`.",
    "Then include, at the top of your reply, these exact tokens on separate lines:\n",
    tokensText,
    "\nReplace `<hex>` with the actual SHA256 hash value computed from the PowerShell command.",
    "If any file is missing, fail with ERROR: missing-file <filename> and stop.\n",
    `Then fetch all documentation required by the instructions before proceeding with your task.`
  ];
  const instruction = sentences.join(" ");
  return "[[ ## mandatory_pre_read ## ]]\n\n" + instruction + "\n\n";
}
|
|
1090
|
+
/**
 * Selects the attachments that are instruction/prompt files and returns
 * their absolute paths without duplicates, in first-seen order.
 *
 * @param {string[] | undefined} attachments - Attachment paths.
 * @returns {string[]} Unique absolute instruction file paths.
 */
function collectInstructionFiles(attachments) {
  if (!attachments || attachments.length === 0) {
    return [];
  }
  const seen = new Set();
  for (const attachment of attachments) {
    if (isInstructionPath(attachment)) {
      seen.add(import_node_path3.default.resolve(attachment));
    }
  }
  return [...seen];
}
|
|
1106
|
+
/**
 * Tells whether a path denotes an instruction or prompt file: it ends in
 * `.instructions.md` / `.prompt.md`, or lies under an `instructions/` or
 * `prompts/` directory.
 *
 * @param {string} filePath - Path using the platform separator.
 * @returns {boolean} True for instruction-like paths.
 */
function isInstructionPath(filePath) {
  const posixStyle = filePath.split(import_node_path3.default.sep).join("/");
  const suffixes = [".instructions.md", ".prompt.md"];
  const folders = ["/instructions/", "/prompts/"];
  return (
    suffixes.some((suffix) => posixStyle.endsWith(suffix)) ||
    folders.some((folder) => posixStyle.includes(folder))
  );
}
|
|
1110
|
+
/**
 * Converts a file system path into a `file://` URI.
 *
 * Relative paths are resolved against the current working directory and
 * backslashes are converted to forward slashes. The path is percent-encoded
 * so that spaces, `#`, `?`, `%` and non-ASCII characters yield a valid URI
 * (unencoded spaces also break the Markdown links built from it).
 *
 * @param {string} filePath - Absolute or relative path.
 * @returns {string} The file URI (`file:///C:/...` for drive-letter paths,
 *   `file:///...` for POSIX absolute paths).
 */
function pathToFileUri(filePath) {
  const absolutePath = import_node_path3.default.isAbsolute(filePath) ? filePath : import_node_path3.default.resolve(filePath);
  const normalizedPath = absolutePath.replace(/\\/g, "/");
  // encodeURI keeps "/" and ":" intact but also leaves "?" and "#" alone,
  // which would start a query or fragment, so those are escaped explicitly.
  const encodedPath = encodeURI(normalizedPath).replace(
    /[?#]/g,
    (character) => `%${character.charCodeAt(0).toString(16).toUpperCase()}`
  );
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
    return `file:///${encodedPath}`;
  }
  return `file://${encodedPath}`;
}
|
|
1118
|
+
/**
 * Builds the plain-text user query sent with the subagent session: the
 * trimmed prompt, followed by a "Guidelines:" section when guidelines are
 * present and non-blank.
 *
 * @param {object} request - Provider request with `prompt` and optional
 *   `guidelines`.
 * @returns {string} The trimmed query text.
 */
function composeUserQuery(request) {
  const pieces = [request.prompt.trim()];
  const { guidelines } = request;
  const guidelineText = guidelines ? guidelines.trim() : "";
  if (guidelineText.length > 0) {
    pieces.push("\nGuidelines:\n", guidelineText);
  }
  return pieces.join("\n").trim();
}
|
|
1126
|
+
/**
 * Resolves attachment paths to absolute paths and removes duplicates while
 * keeping first-seen order.
 *
 * @param {string[] | undefined} attachments - Attachment paths.
 * @returns {string[] | undefined} Unique absolute paths, or `undefined` when
 *   there are no attachments.
 */
function normalizeAttachments(attachments) {
  if (!attachments || attachments.length === 0) {
    return undefined;
  }
  const absolutePaths = [...attachments].map((attachment) => import_node_path3.default.resolve(attachment));
  return [...new Set(absolutePaths)];
}
|
|
1136
|
+
/**
 * Provisions the requested number of VS Code subagents.
 *
 * Provisioning failures are reported through the result (and a warning when
 * `verbose`), never thrown.
 *
 * @param {{kind: string, count: number, verbose?: boolean}} options
 * @returns {Promise<{provisioned: boolean, message: string}>} Outcome summary.
 */
async function ensureVSCodeSubagents(options) {
  const { kind, count, verbose = false } = options;
  const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
  const subagentRoot = (0, import_subagent.getSubagentRoot)(vscodeCmd);
  try {
    if (verbose) {
      console.log(`Provisioning ${count} subagent(s) via: subagent ${vscodeCmd} provision`);
    }
    const { created, skippedExisting } = await (0, import_subagent.provisionSubagents)({
      targetRoot: subagentRoot,
      subagents: count,
      dryRun: false
    });
    const createdCount = created.length;
    const reusedCount = skippedExisting.length;
    if (verbose) {
      if (createdCount > 0) {
        console.log(`Created ${createdCount} new subagent(s)`);
      }
      if (reusedCount > 0) {
        console.log(`Reusing ${reusedCount} existing unlocked subagent(s)`);
      }
      console.log(`\ntotal unlocked subagents available: ${createdCount + reusedCount}`);
    }
    return {
      provisioned: true,
      message: `Provisioned ${count} subagent(s): ${createdCount} created, ${reusedCount} reused`
    };
  } catch (error) {
    // Best effort: callers continue even when provisioning fails.
    const errorMessage = error instanceof Error ? error.message : String(error);
    if (verbose) {
      console.warn(`Provisioning failed (continuing anyway): ${errorMessage}`);
    }
    return {
      provisioned: false,
      message: `Provisioning failed: ${errorMessage}`
    };
  }
}
|
|
1174
|
+
|
|
1175
|
+
// src/evaluation/providers/targets-file.ts
|
|
1176
|
+
var import_node_fs3 = require("fs");
|
|
1177
|
+
var import_promises4 = require("fs/promises");
|
|
1178
|
+
var import_node_path4 = __toESM(require("path"), 1);
|
|
1179
|
+
var import_yaml2 = require("yaml");
|
|
1180
|
+
/**
 * Tells whether a value is a non-null, non-array object.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} True for plain-object-like values.
 */
function isRecord(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
1183
|
+
/**
 * Verifies that a parsed targets.yaml declares a supported format version.
 *
 * @param {object} parsed - Parsed YAML document.
 * @param {string} absolutePath - File path, used in error messages.
 * @throws {Error} When the version is missing, not numeric, or below 2.
 */
function checkVersion(parsed, absolutePath) {
  const rawVersion = parsed.version;
  let version;
  if (typeof rawVersion === "number") {
    version = rawVersion;
  } else if (typeof rawVersion === "string") {
    version = parseFloat(rawVersion);
  }
  if (version === void 0) {
    throw new Error(
      `Missing version field in targets.yaml at ${absolutePath}.
Please add 'version: 2.0' at the top of the file.`
    );
  }
  // A non-numeric string parses to NaN, and `NaN < 2` is false, so without
  // this guard an unparseable version would be accepted silently.
  if (Number.isNaN(version)) {
    throw new Error(
      `Invalid version '${String(rawVersion)}' in targets.yaml at ${absolutePath}.
Please use a numeric version such as 'version: 2.0'.`
    );
  }
  if (version < 2) {
    throw new Error(
      `Outdated targets.yaml format (version ${version}) at ${absolutePath}.
Please update to version 2.0 format with 'targets' array.`
    );
  }
}
|
|
1198
|
+
/**
 * Returns the `targets` array of a parsed targets.yaml document.
 *
 * @param {object} parsed - Parsed YAML document.
 * @param {string} absolutePath - File path, used in the error message.
 * @returns {unknown[]} The raw target entries.
 * @throws {Error} When `targets` is not an array.
 */
function extractTargetsArray(parsed, absolutePath) {
  const { targets } = parsed;
  if (Array.isArray(targets)) {
    return targets;
  }
  throw new Error(`targets.yaml at ${absolutePath} must have a 'targets' array`);
}
|
|
1205
|
+
/**
 * Validates one raw targets.yaml entry and narrows it to a target definition.
 *
 * @param {unknown} value - Raw entry from the `targets` array.
 * @param {number} index - Position of the entry, used in error messages.
 * @param {string} filePath - File path, used in error messages.
 * @returns {object} Definition with `name`, `provider`, `settings` (when an
 *   object), `judge_target` (when a string) and `workers` (when a number).
 * @throws {Error} When the entry is not an object or lacks a valid
 *   `name` / `provider`.
 */
function assertTargetDefinition(value, index, filePath) {
  if (!isRecord(value)) {
    throw new Error(`targets.yaml entry at index ${index} in ${filePath} must be an object`);
  }
  const name = value.name;
  const provider = value.provider;
  const settings = value.settings;
  const judgeTarget = value.judge_target;
  const workers = value.workers;
  if (typeof name !== "string" || name.trim().length === 0) {
    throw new Error(`targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`);
  }
  if (typeof provider !== "string" || provider.trim().length === 0) {
    throw new Error(`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider'`);
  }
  return {
    name,
    provider,
    settings: isRecord(settings) ? settings : void 0,
    judge_target: typeof judgeTarget === "string" ? judgeTarget : void 0,
    // The target schema accepts a per-target `workers` count; forward it so
    // the value from targets.yaml is not lost. The key is only added when a
    // number is present, leaving the shape of other definitions unchanged.
    ...(typeof workers === "number" ? { workers } : {})
  };
}
|
|
1226
|
+
/**
 * Checks whether a path exists.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} True when the path is accessible, false when
 *   the access check fails for any reason.
 */
async function fileExists3(filePath) {
  let exists = true;
  try {
    await (0, import_promises4.access)(filePath, import_node_fs3.constants.F_OK);
  } catch {
    exists = false;
  }
  return exists;
}
|
|
1234
|
+
/**
 * Loads and validates the target definitions of a targets.yaml file.
 *
 * @param {string} filePath - Path to the YAML file (resolved to absolute).
 * @returns {Promise<object[]>} The validated target definitions.
 * @throws {Error} When the file is missing, is not a YAML object, has an
 *   unsupported version, or contains an invalid entry.
 */
async function readTargetDefinitions(filePath) {
  const absolutePath = import_node_path4.default.resolve(filePath);
  const present = await fileExists3(absolutePath);
  if (!present) {
    throw new Error(`targets.yaml not found at ${absolutePath}`);
  }
  const yamlText = await (0, import_promises4.readFile)(absolutePath, "utf8");
  const document = (0, import_yaml2.parse)(yamlText);
  if (!isRecord(document)) {
    throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with 'version' and 'targets' fields`);
  }
  checkVersion(document, absolutePath);
  return extractTargetsArray(document, absolutePath).map(
    (entry, index) => assertTargetDefinition(entry, index, absolutePath)
  );
}
|
|
1249
|
+
/**
 * Lists the names of the given target definitions, in order.
 *
 * @param {Array<{name: string}>} definitions - Target definitions.
 * @returns {string[]} The target names.
 */
function listTargetNames(definitions) {
  return definitions.map(({ name }) => name);
}
|
|
1252
|
+
|
|
1253
|
+
// src/evaluation/providers/index.ts
|
|
1254
|
+
/**
 * Instantiates the provider that matches a resolved target.
 *
 * @param {{kind: string, name: string, config: object}} target - Resolved
 *   target.
 * @returns {object} The provider instance.
 * @throws {Error} When the target kind is not supported.
 */
function createProvider(target) {
  const { kind, name, config } = target;
  if (kind === "azure") {
    return new AzureProvider(name, config);
  }
  if (kind === "anthropic") {
    return new AnthropicProvider(name, config);
  }
  if (kind === "gemini") {
    return new GeminiProvider(name, config);
  }
  if (kind === "mock") {
    return new MockProvider(name, config);
  }
  if (kind === "vscode" || kind === "vscode-insiders") {
    return new VSCodeProvider(name, config, kind);
  }
  throw new Error(`Unsupported provider kind ${kind}`);
}
|
|
1273
|
+
/**
 * Resolves a raw target definition and creates its provider in one step.
 *
 * @param {object} definition - Raw target definition.
 * @param {object} [env=process.env] - Environment used to resolve references.
 * @returns {object} The provider instance.
 */
function resolveAndCreateProvider(definition, env = process.env) {
  return createProvider(resolveTargetDefinition(definition, env));
}
|
|
1277
|
+
|
|
1278
|
+
// src/evaluation/scoring.ts
|
|
1279
|
+
// Minimum share of an aspect's key terms that must occur in the candidate
// for the aspect to count as a hit.
var KEY_TERM_MATCH_THRESHOLD = 0.5;

// Leading words that mark a non-bulleted line as an expected aspect.
var ACTION_WORDS = new Set("use avoid prefer replace consider ensure remove add".split(" "));

// Words ignored when extracting the key terms of an aspect.
var STOP_WORDS = new Set(
  [
    "the a an and or but in on at to",
    "for of with by is are was were be been",
    "being have has had do does did will would could",
    "should"
  ].join(" ").split(" ")
);

// Lower-case prefixes that identify a response as an error message.
var ERROR_PREFIXES = [
  "error:",
  "err:",
  "vs code command failed",
  "exception",
  "traceback",
  "no response file was generated",
  "timed out",
  "cli not found"
];
|
|
1333
|
+
/**
 * Extracts the expected aspects from a reference answer.
 *
 * A line contributes an aspect when it is a bullet (`-`, `*`, `•` or `1.`)
 * or when it starts with one of the ACTION_WORDS. Aspects are normalized and
 * empty results are dropped.
 *
 * @param {string} expectedResponse - Reference answer text.
 * @returns {string[]} Normalized aspects in document order.
 */
function extractAspects(expectedResponse) {
  const bulletPattern = /^([-*•]|[0-9]+\.)\s*(.+)$/;
  const collected = [];
  for (const rawLine of expectedResponse.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line.length === 0) {
      continue;
    }
    const bullet = bulletPattern.exec(line);
    let aspectSource;
    if (bullet) {
      aspectSource = bullet[2];
    } else {
      const lowered = line.toLowerCase();
      const startsWithAction = Array.from(ACTION_WORDS).some((word) => lowered.startsWith(word));
      if (!startsWithAction) {
        continue;
      }
      aspectSource = line;
    }
    const normalized = normalizeAspect(aspectSource);
    if (normalized.length > 0) {
      collected.push(normalized);
    }
  }
  return collected;
}
|
|
1358
|
+
/**
 * Returns the expected aspects that the candidate response covers.
 *
 * @param {string} candidateResponse - Response under evaluation.
 * @param {string[]} expectedAspects - Normalized expected aspects.
 * @returns {string[]} The matched aspects, in their original order.
 */
function calculateHits(candidateResponse, expectedAspects) {
  const { normalizedText, words } = normalizeCandidate(candidateResponse);
  return expectedAspects.filter((aspect) => matchesAspect(aspect, normalizedText, words));
}
|
|
1368
|
+
/**
 * Returns the expected aspects that the candidate response does not cover.
 *
 * @param {string} candidateResponse - Response under evaluation.
 * @param {string[]} expectedAspects - Normalized expected aspects.
 * @param {string[]} [resolvedHits] - Precomputed hits; computed from the
 *   candidate when nullish.
 * @returns {string[]} The unmatched aspects, in their original order.
 */
function calculateMisses(candidateResponse, expectedAspects, resolvedHits) {
  const hitList = resolvedHits ?? calculateHits(candidateResponse, expectedAspects);
  const matched = new Set(hitList);
  return expectedAspects.filter((aspect) => !matched.has(aspect));
}
|
|
1372
|
+
/**
 * Scores a candidate response against the expected aspects.
 *
 * With no expected aspects, the score is 0 for an error-like response (with
 * an explanatory miss) and 1 otherwise. With aspects, the score is the share
 * of aspects that were hit.
 *
 * @param {string} candidateResponse - Response under evaluation.
 * @param {string[]} expectedAspects - Normalized expected aspects.
 * @returns {{score: number, hits: string[], misses: string[],
 *   hitCount: number, totalAspects: number, rawAspects: string[]}}
 */
function scoreCandidateResponse(candidateResponse, expectedAspects) {
  const total = expectedAspects.length;
  if (total === 0) {
    const failed = isErrorLike(candidateResponse);
    return {
      score: failed ? 0 : 1,
      hits: [],
      misses: failed ? ["Model produced an error instead of an answer."] : [],
      hitCount: 0,
      totalAspects: 0,
      rawAspects: []
    };
  }
  const hits = calculateHits(candidateResponse, expectedAspects);
  const misses = expectedAspects.filter((aspect) => !hits.includes(aspect));
  return {
    score: hits.length / total,
    hits,
    misses,
    hitCount: hits.length,
    totalAspects: total,
    rawAspects: expectedAspects
  };
}
|
|
1405
|
+
/**
 * Tells whether a response looks like an error message, i.e. its trimmed,
 * lower-cased text starts with one of the ERROR_PREFIXES.
 *
 * @param {string | undefined} text - Response text.
 * @returns {boolean} True for error-like text; false for empty input.
 */
function isErrorLike(text) {
  if (!text) {
    return false;
  }
  const normalized = text.trim().toLowerCase();
  for (const prefix of ERROR_PREFIXES) {
    if (normalized.startsWith(prefix)) {
      return true;
    }
  }
  return false;
}
|
|
1412
|
+
/**
 * Normalizes an aspect for matching: lower-cases it, turns punctuation into
 * spaces, collapses whitespace runs and trims the result.
 *
 * @param {string} aspect - Raw aspect text.
 * @returns {string} The normalized aspect.
 */
function normalizeAspect(aspect) {
  const lowered = aspect.toLowerCase();
  const withoutPunctuation = lowered.replace(/[^\w\s]/g, " ");
  const singleSpaced = withoutPunctuation.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
|
|
1416
|
+
/**
 * Prepares a candidate response for matching.
 *
 * @param {string} candidate - Response text.
 * @returns {{normalizedText: string, words: Set<string>}} The lower-cased
 *   text with punctuation replaced by spaces, and the set of its words.
 */
function normalizeCandidate(candidate) {
  const normalizedText = candidate.toLowerCase().replace(/[^\w\s]/g, " ");
  const words = new Set();
  for (const token of normalizedText.split(/\s+/)) {
    if (token.length > 0) {
      words.add(token);
    }
  }
  return { normalizedText, words };
}
|
|
1422
|
+
/**
 * Decides whether a candidate response covers an expected aspect.
 *
 * The aspect matches when enough of its key terms occur as words in the
 * candidate (KEY_TERM_MATCH_THRESHOLD), or when any two consecutive words of
 * the aspect occur verbatim in the candidate text. Aspects without key terms
 * never match.
 *
 * @param {string} aspect - Normalized aspect.
 * @param {string} candidateNormalized - Normalized candidate text.
 * @param {Set<string>} candidateWords - Words of the candidate.
 * @returns {boolean} True when the aspect is considered covered.
 */
function matchesAspect(aspect, candidateNormalized, candidateWords) {
  const keyTerms = extractKeyTerms(aspect);
  if (keyTerms.length === 0) {
    return false;
  }
  let matched = 0;
  for (const term of keyTerms) {
    if (candidateWords.has(term)) {
      matched += 1;
    }
  }
  if (matched / keyTerms.length >= KEY_TERM_MATCH_THRESHOLD) {
    return true;
  }
  // Fallback: look for any adjacent word pair of the aspect in the text.
  const tokens = aspect.split(" ");
  return tokens
    .slice(0, -1)
    .some((token, position) => candidateNormalized.includes(`${token} ${tokens[position + 1]}`));
}
|
|
1443
|
+
/**
 * Picks the significant words of an aspect: words longer than two characters
 * that are not stop words, in order, stopping once `maxTerms` is reached.
 *
 * @param {string} aspect - Normalized, space-separated aspect.
 * @param {number} [maxTerms=5] - Cap checked after each collected term.
 * @returns {string[]} The key terms.
 */
function extractKeyTerms(aspect, maxTerms = 5) {
  const collected = [];
  for (const word of aspect.split(" ")) {
    const significant = word.length > 2 && !STOP_WORDS.has(word);
    if (!significant) {
      continue;
    }
    collected.push(word);
    if (collected.length >= maxTerms) {
      break;
    }
  }
  return collected;
}
|
|
1460
|
+
|
|
1461
|
+
// src/evaluation/grading.ts
|
|
1462
|
+
var import_node_crypto = require("crypto");
|
|
1463
|
+
/**
 * Grader that scores a candidate by keyword overlap with the aspects
 * extracted from the expected answer.
 */
var HeuristicGrader = class {
  kind = "heuristic";

  /**
   * Grades a candidate response.
   *
   * @param {{candidate: string, testCase: {expected_assistant_raw: string}}} context
   * @returns {{score: number, hits: string[], misses: string[],
   *   expectedAspectCount: number, rawAspects: string[]}} Grading result.
   */
  grade(context) {
    const { candidate, testCase } = context;
    const expectedAspects = extractAspects(testCase.expected_assistant_raw);
    const {
      score,
      hits,
      misses: scoredMisses,
      totalAspects,
      rawAspects
    } = scoreCandidateResponse(candidate, expectedAspects);

    const misses = [...scoredMisses];
    // With nothing to match against, surface the first line of an
    // error-like response as the leading miss.
    if (expectedAspects.length === 0 && isErrorLike(candidate)) {
      const firstLine = candidate.split(/\r?\n/)[0]?.trim();
      if (firstLine && !misses.includes(firstLine)) {
        misses.unshift(firstLine);
      }
    }

    return {
      score,
      hits,
      misses,
      expectedAspectCount: totalAspects,
      rawAspects
    };
  }
};
|
|
1484
|
+
/**
 * Grader that asks a judge provider to score the candidate and parses the
 * judge's JSON reply.
 */
var QualityGrader = class {
  kind = "llm_judge";
  resolveJudgeProvider;
  maxOutputTokens;
  temperature;

  /**
   * @param {object} options
   * @param {Function} options.resolveJudgeProvider - Called with the grading context;
   *   its awaited result is used as the judge provider.
   * @param {*} [options.maxOutputTokens] - Forwarded unchanged to the judge invocation.
   * @param {*} [options.temperature] - Forwarded unchanged to the judge invocation.
   */
  constructor(options) {
    const { resolveJudgeProvider, maxOutputTokens, temperature } = options;
    this.resolveJudgeProvider = resolveJudgeProvider;
    this.maxOutputTokens = maxOutputTokens;
    this.temperature = temperature;
  }

  /**
   * @param {object} context - Grading context; reads `testCase`, `candidate`,
   *   `attempt` and `target.name`.
   * @returns {Promise<object>} Score (clamped by `clampScore`), up to four hits and
   *   four misses, reasoning, and a record of the request sent to the judge.
   * @throws {Error} When no judge provider can be resolved.
   */
  async grade(context) {
    const judgeProvider = await this.resolveJudgeProvider(context);
    if (!judgeProvider) {
      throw new Error("No judge provider available for LLM grading");
    }

    const prompt = buildQualityPrompt(context.testCase, context.candidate);
    const response = await judgeProvider.invoke({
      prompt,
      metadata: { systemPrompt: QUALITY_SYSTEM_PROMPT },
      testCaseId: context.testCase.id,
      attempt: context.attempt,
      maxOutputTokens: this.maxOutputTokens,
      temperature: this.temperature
    });

    const parsed = parseQualityResponse(response);
    // Keep only non-blank strings and cap each list at four entries.
    const firstFour = (list) => {
      if (!Array.isArray(list)) {
        return [];
      }
      return list.filter(isNonEmptyString).slice(0, 4);
    };
    const score = clampScore(parsed.score ?? 0);
    const hits = firstFour(parsed.hits);
    const misses = firstFour(parsed.misses);
    const reasoning = parsed.reasoning ?? response.reasoning;

    const graderRawRequest = {
      id: (0, import_node_crypto.randomUUID)(),
      provider: judgeProvider.id,
      prompt,
      systemPrompt: QUALITY_SYSTEM_PROMPT,
      target: context.target.name
    };

    // Report at least one expected aspect even when the judge listed none.
    const listedAspects = hits.length + misses.length;
    return {
      score,
      hits,
      misses,
      expectedAspectCount: listedAspects || 1,
      reasoning,
      graderRawRequest
    };
  }
};
|
|
1533
|
+
// System prompt for the judge model: the lines below are joined with "\n".
// It asks for a single JSON object with `score`, `hits`, `misses` and `reasoning`.
var QUALITY_SYSTEM_PROMPT = [
  "You are an expert evaluator. Your goal is to grade the generated_answer based on how well it achieves the expected_outcome for the original task.",
  "",
  "Use the reference_answer as a gold standard for a high-quality response. The generated_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
  "",
  "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
  "",
  "You must respond with a single JSON object matching this schema:",
  "",
  "{",
  ' "score": <number between 0.0 and 1.0>,',
  ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
  ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
  ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
  "}"
].join("\n");
|
|
1549
|
+
/**
 * Builds the user prompt sent to the judge: four labelled sections (expected
 * outcome, request, reference answer, generated answer), each followed by a
 * blank line, then a closing instruction.
 *
 * @param {object} testCase - Reads `outcome`, `task` and `expected_assistant_raw`.
 * @param {string} candidate - The generated answer being graded.
 * @returns {string} The prompt, with lines joined by "\n".
 */
function buildQualityPrompt(testCase, candidate) {
  const sections = [
    ["[[ ## expected_outcome ## ]]", testCase.outcome],
    ["[[ ## request ## ]]", testCase.task],
    ["[[ ## reference_answer ## ]]", testCase.expected_assistant_raw],
    ["[[ ## generated_answer ## ]]", candidate]
  ];

  const lines = [];
  for (const [header, body] of sections) {
    lines.push(header, body, "");
  }
  lines.push("Respond with a single JSON object matching the schema described in the system prompt.");

  // Array#join renders undefined/null bodies as empty strings.
  return lines.join("\n");
}
|
|
1567
|
+
/**
 * Restricts a score to the range [0, 1].
 *
 * @param {*} value - Candidate score.
 * @returns {number} 0 for anything that is not a finite number (NaN, ±Infinity,
 *   non-numbers) or is below 0; 1 for values above 1; otherwise `value` itself.
 */
function clampScore(value) {
  // Number.isFinite is false for NaN, ±Infinity and every non-number.
  if (!Number.isFinite(value)) {
    return 0;
  }
  return value < 0 ? 0 : value > 1 ? 1 : value;
}
|
|
1579
|
+
/**
 * Extracts the judge's structured verdict from a provider response.
 *
 * The trimmed `response.text` is parsed as JSON first; if that does not yield a
 * valid verdict, the span returned by `extractJsonBlob` is tried.
 *
 * @param {object} response - Provider response; only `response.text` is read.
 * @returns {object} The first parse result accepted by `validateQualityJson`,
 *   or `{}` when the text is missing, blank, or nothing validates.
 */
function parseQualityResponse(response) {
  const text = typeof response.text === "string" ? response.text.trim() : "";
  if (text.length === 0) {
    return {};
  }

  // Candidates are produced lazily so the blob is only extracted when needed.
  const candidateSources = [() => text, () => extractJsonBlob(text)];
  for (const produce of candidateSources) {
    const raw = produce();
    if (!raw) {
      continue;
    }
    const verdict = attemptParseJson(raw);
    if (verdict && validateQualityJson(verdict)) {
      return verdict;
    }
  }
  return {};
}
|
|
1597
|
+
/**
 * Parses `text` as JSON and picks out the verdict fields.
 *
 * @param {string} text - Raw JSON text.
 * @returns {{score: (number|undefined), hits: *, misses: *, reasoning: (string|undefined)}|undefined}
 *   `score` is kept only when it is a number and `reasoning` only when it is a
 *   string; `hits` and `misses` are passed through unchecked. Returns
 *   `undefined` when parsing or field access throws (e.g. invalid JSON, or a
 *   JSON `null`).
 */
function attemptParseJson(text) {
  try {
    // Destructuring stays inside the try so a `null` payload maps to undefined.
    const { score: rawScore, hits, misses, reasoning: rawReasoning } = JSON.parse(text);
    return {
      score: typeof rawScore === "number" ? rawScore : void 0,
      hits,
      misses,
      reasoning: typeof rawReasoning === "string" ? rawReasoning : void 0
    };
  } catch {
    return void 0;
  }
}
|
|
1609
|
+
/**
 * Checks that a parsed verdict has the expected shape.
 *
 * @param {object} parsed - Object with optional `score`, `hits`, `misses`, `reasoning`.
 * @returns {boolean} `true` when `score` is a finite number in [0, 1], `hits`
 *   and `misses` are each either `undefined` or an array of strings, and
 *   `reasoning` is either `undefined` or a string.
 */
function validateQualityJson(parsed) {
  const { score, hits, misses, reasoning } = parsed;

  const isOptionalStringList = (list) =>
    list === void 0 || (Array.isArray(list) && list.every((item) => typeof item === "string"));

  const scoreIsValid =
    typeof score === "number" && Number.isFinite(score) && score >= 0 && score <= 1;
  if (!scoreIsValid) {
    return false;
  }
  if (!isOptionalStringList(hits) || !isOptionalStringList(misses)) {
    return false;
  }
  return reasoning === void 0 || typeof reasoning === "string";
}
|
|
1640
|
+
/**
 * Returns the widest brace-delimited span of `text`: everything from the first
 * "{" through the last "}".
 *
 * @param {string} text - Text that may embed a JSON object.
 * @returns {string|undefined} The span, or `undefined` when there is no "{"
 *   followed (anywhere later) by a "}".
 */
function extractJsonBlob(text) {
  const open = text.indexOf("{");
  const close = text.lastIndexOf("}");
  if (open === -1 || close < open) {
    return void 0;
  }
  return text.slice(open, close + 1);
}
|
|
1644
|
+
/**
 * @param {*} value - Any value.
 * @returns {boolean} `true` only for strings that contain a non-whitespace character.
 */
function isNonEmptyString(value) {
  if (typeof value !== "string") {
    return false;
  }
  return value.trim() !== "";
}
|
|
1647
|
+
|
|
1648
|
+
// src/evaluation/orchestrator.ts
|
|
1649
|
+
var import_node_crypto2 = require("crypto");
|
|
1650
|
+
var import_promises5 = require("fs/promises");
|
|
1651
|
+
var import_node_path5 = __toESM(require("path"), 1);
|
|
1652
|
+
|
|
1653
|
+
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
1654
|
+
// Singly linked list cell: holds one value and a reference to the next cell.
var Node = class {
  value;
  next = void 0;

  constructor(payload) {
    this.value = payload;
  }
};
|
|
1661
|
+
// FIFO queue built on a singly linked list of `Node` cells, tracking its own size.
var Queue = class {
  #head;
  #tail;
  #size;

  constructor() {
    this.clear();
  }

  /** Appends `value` at the tail. */
  enqueue(value) {
    const cell = new Node(value);
    if (this.#head) {
      this.#tail.next = cell;
    } else {
      this.#head = cell;
    }
    this.#tail = cell;
    this.#size++;
  }

  /** Removes and returns the head value, or `undefined` when the queue is empty. */
  dequeue() {
    const first = this.#head;
    if (!first) {
      return;
    }
    this.#head = first.next;
    this.#size--;
    return first.value;
  }

  /** Returns the head value without removing it, or `undefined` when empty. */
  peek() {
    return this.#head?.value;
  }

  /** Drops every element and resets the size to 0. */
  clear() {
    this.#head = void 0;
    this.#tail = void 0;
    this.#size = 0;
  }

  get size() {
    return this.#size;
  }

  /** Yields the values head-to-tail without removing them. */
  *[Symbol.iterator]() {
    for (let cursor = this.#head; cursor; cursor = cursor.next) {
      yield cursor.value;
    }
  }

  /** Yields values head-to-tail, dequeuing each one as it is produced. */
  *drain() {
    while (this.#head) {
      yield this.dequeue();
    }
  }
};
|
|
1715
|
+
|
|
1716
|
+
// ../../node_modules/.pnpm/p-limit@6.2.0/node_modules/p-limit/index.js
|
|
1717
|
+
/**
 * Creates a limiter that runs at most `concurrency` wrapped functions at once
 * (bundled copy of p-limit).
 *
 * @param {number} concurrency - Validated by `validateConcurrency` (throws a
 *   TypeError when rejected).
 * @returns {Function} `limit(fn, ...args)`: returns a promise that is resolved
 *   with the (promise of the) result of `fn(...args)`. The function also
 *   exposes `activeCount`, `pendingCount`, `clearQueue()` and a settable
 *   `concurrency`.
 */
function pLimit(concurrency) {
  validateConcurrency(concurrency);
  const queue = new Queue();
  let activeCount = 0;
  // Starts the next queued task if a slot is free: the dequeued value is the
  // `internalResolve` callback stored by `enqueue`, and calling it releases
  // that task's `run`.
  const resumeNext = () => {
    if (activeCount < concurrency && queue.size > 0) {
      queue.dequeue()();
      activeCount++;
    }
  };
  // Called when a task settles: frees its slot and tries to start another.
  const next = () => {
    activeCount--;
    resumeNext();
  };
  const run = async (function_, resolve, arguments_) => {
    // The async wrapper turns a synchronous throw from `function_` into a rejection.
    const result = (async () => function_(...arguments_))();
    // Hand the result promise to the caller right away; its outcome reaches
    // the caller through `resolve`.
    resolve(result);
    try {
      await result;
    } catch {
      // Intentionally ignored here: the caller observes the rejection via `result`.
    }
    next();
  };
  const enqueue = (function_, resolve, arguments_) => {
    // Park the task: `run` only fires once `internalResolve` is called by `resumeNext`.
    new Promise((internalResolve) => {
      queue.enqueue(internalResolve);
    }).then(
      run.bind(void 0, function_, resolve, arguments_)
    );
    (async () => {
      // Wait one microtask before reading `activeCount`.
      // NOTE(review): presumably so the count reflects tasks started in the
      // same tick — confirm against upstream p-limit.
      await Promise.resolve();
      if (activeCount < concurrency) {
        resumeNext();
      }
    })();
  };
  const generator = (function_, ...arguments_) => new Promise((resolve) => {
    enqueue(function_, resolve, arguments_);
  });
  Object.defineProperties(generator, {
    // Number of tasks currently occupying a slot.
    activeCount: {
      get: () => activeCount
    },
    // Number of tasks still waiting in the queue.
    pendingCount: {
      get: () => queue.size
    },
    // Empties the queue of tasks that have not started.
    clearQueue: {
      value() {
        queue.clear();
      }
    },
    concurrency: {
      get: () => concurrency,
      set(newConcurrency) {
        validateConcurrency(newConcurrency);
        concurrency = newConcurrency;
        // After the limit changes, start queued tasks until the new limit is reached.
        queueMicrotask(() => {
          while (activeCount < concurrency && queue.size > 0) {
            resumeNext();
          }
        });
      }
    }
  });
  return generator;
}
|
|
1783
|
+
/**
 * Accepts positive integers and `Infinity`; rejects everything else.
 *
 * @param {*} concurrency - Value to check.
 * @throws {TypeError} When `concurrency` is not a positive integer or `Infinity`.
 */
function validateConcurrency(concurrency) {
  const isWholeOrUnbounded =
    Number.isInteger(concurrency) || concurrency === Number.POSITIVE_INFINITY;
  if (isWholeOrUnbounded && concurrency > 0) {
    return;
  }
  throw new TypeError("Expected `concurrency` to be a number from 1 and up");
}
|
|
1788
|
+
|
|
1789
|
+
// src/evaluation/orchestrator.ts
|
|
1790
|
+
/**
 * Loads the test cases of one file, runs them against `target` with bounded
 * concurrency, and returns one result per test case in test-case order.
 *
 * A test case whose run rejects is converted into an error result (via
 * `buildErrorResult`) instead of failing the whole evaluation.
 *
 * @param {object} options
 * @param {string} options.testFilePath - Passed to `loadTestCases`.
 * @param {string} options.repoRoot - Passed to `loadTestCases`.
 * @param {object} options.target - Resolved primary target; `name` is read, and
 *   `workers` / `judgeTarget` when present.
 * @param {object[]} [options.targets] - Target definitions, indexed by `name`, used
 *   to resolve a judge target on demand.
 * @param {object} [options.env] - Environment used when resolving target
 *   definitions; defaults to `process.env`.
 * @param {Function} [options.providerFactory] - Replaces `createProvider`.
 * @param {object} [options.graders] - Grader overrides for `buildGraderRegistry`.
 * @param {number} [options.maxConcurrency] - Worker count; falls back to
 *   `target.workers`, then 1.
 * @param {string} [options.testId] - When set, only the test case with this id runs.
 * @param {Function} [options.onResult] - Awaited with every result, including error results.
 * @param {Function} [options.onProgress] - Awaited with "pending", "running",
 *   "completed" and "failed" status events.
 * @param {*} [options.maxRetries] - Forwarded to `runTestCase`.
 * @param {*} [options.agentTimeoutMs] - Forwarded to `runTestCase`.
 * @param {*} [options.promptDumpDir] - Forwarded to `runTestCase`.
 * @param {*} [options.cache] - Forwarded to `runTestCase`.
 * @param {*} [options.useCache] - Forwarded to `runTestCase`.
 * @param {Function} [options.now] - Clock returning a Date; forwarded to
 *   `runTestCase` and used to timestamp error results.
 * @param {*} [options.verbose] - Forwarded to `loadTestCases`.
 * @returns {Promise<object[]>} Results; empty when the file has no test cases.
 * @throws {Error} When `testId` is given but matches no test case.
 */
async function runEvaluation(options) {
  const {
    testFilePath,
    repoRoot,
    target,
    targets,
    env,
    providerFactory,
    graders,
    maxRetries,
    agentTimeoutMs,
    promptDumpDir,
    cache,
    useCache,
    now,
    testId,
    verbose,
    onResult,
    onProgress
  } = options;
  const testCases = await loadTestCases(testFilePath, repoRoot, { verbose });
  const filteredTestCases = filterTestCases(testCases, testId);
  if (filteredTestCases.length === 0) {
    if (testId) {
      throw new Error(`Test case with id '${testId}' not found in ${testFilePath}`);
    }
    return [];
  }

  // Resolved targets are memoised by name; the primary target is pre-seeded.
  const resolvedTargetsByName = /* @__PURE__ */ new Map();
  resolvedTargetsByName.set(target.name, target);
  const targetDefinitions = /* @__PURE__ */ new Map();
  for (const definition of targets ?? []) {
    targetDefinitions.set(definition.name, definition);
  }
  const envLookup = env ?? process.env;

  // One provider instance per target name.
  const providerCache = /* @__PURE__ */ new Map();
  const getOrCreateProvider = (resolved) => {
    const existing = providerCache.get(resolved.name);
    if (existing) {
      return existing;
    }
    const factory = providerFactory ?? createProvider;
    const instance = factory(resolved);
    providerCache.set(resolved.name, instance);
    return instance;
  };
  const resolveTargetByName = (name) => {
    if (resolvedTargetsByName.has(name)) {
      return resolvedTargetsByName.get(name);
    }
    const definition = targetDefinitions.get(name);
    if (!definition) {
      return void 0;
    }
    const resolved = resolveTargetDefinition(definition, envLookup);
    resolvedTargetsByName.set(name, resolved);
    return resolved;
  };
  // The judge is the target named by `judgeTarget` (or the target itself);
  // when that name cannot be resolved, the target's own provider is used.
  const resolveJudgeProvider = async (targetContext) => {
    const judgeName = targetContext.judgeTarget ?? targetContext.name;
    const resolvedJudge = resolveTargetByName(judgeName);
    if (!resolvedJudge) {
      return getOrCreateProvider(targetContext);
    }
    return getOrCreateProvider(resolvedJudge);
  };
  const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
  const primaryProvider = getOrCreateProvider(target);

  // Announce every test case as pending before any of them starts.
  if (onProgress) {
    for (let i = 0; i < filteredTestCases.length; i++) {
      await onProgress({
        workerId: i + 1,
        testId: filteredTestCases[i].id,
        status: "pending"
      });
    }
  }

  const workers = options.maxConcurrency ?? target.workers ?? 1;
  const limit = pLimit(workers);
  let nextWorkerId = 1;
  const promises = filteredTestCases.map(
    (testCase) => limit(async () => {
      const workerId = nextWorkerId++;
      if (onProgress) {
        await onProgress({
          workerId,
          testId: testCase.id,
          status: "running",
          startedAt: Date.now()
        });
      }
      try {
        const judgeProvider = await resolveJudgeProvider(target);
        const result = await runTestCase({
          testCase,
          provider: primaryProvider,
          target,
          graders: graderRegistry,
          maxRetries,
          agentTimeoutMs,
          promptDumpDir,
          cache,
          useCache,
          now,
          judgeProvider
        });
        if (onProgress) {
          await onProgress({
            workerId,
            testId: testCase.id,
            status: "completed",
            startedAt: 0,
            // Not used for completed status
            completedAt: Date.now()
          });
        }
        if (onResult) {
          await onResult(result);
        }
        return result;
      } catch (error) {
        if (onProgress) {
          await onProgress({
            workerId,
            testId: testCase.id,
            status: "failed",
            completedAt: Date.now(),
            error: error instanceof Error ? error.message : String(error)
          });
        }
        throw error;
      }
    })
  );

  const settled = await Promise.allSettled(promises);
  const results = [];
  for (let i = 0; i < settled.length; i++) {
    const outcome = settled[i];
    if (outcome.status === "fulfilled") {
      results.push(outcome.value);
      continue;
    }
    const testCase = filteredTestCases[i];
    let promptInputs;
    try {
      promptInputs = await buildPromptInputs(testCase);
    } catch {
      // Building the prompt may be the very step that failed for this test
      // case. Fall back to empty inputs so one broken case is reported as an
      // error result instead of rejecting the whole run; the original failure
      // is still reported through `outcome.reason`.
      promptInputs = { request: "", guidelines: "" };
    }
    const errorResult = buildErrorResult(
      testCase,
      target.name,
      (now ?? (() => /* @__PURE__ */ new Date()))(),
      outcome.reason,
      promptInputs
    );
    results.push(errorResult);
    if (onResult) {
      await onResult(errorResult);
    }
  }
  return results;
}
|
|
1951
|
+
/**
 * Runs a single test case: builds its prompt, obtains a provider response
 * (from the cache when enabled, otherwise by invoking the provider with
 * retries), grades it, and assembles the result record.
 *
 * Provider failures and grader failures are returned as error results (see
 * `buildErrorResult`) rather than thrown.
 *
 * @param {object} options
 * @param {object} options.testCase - Reads `id`, `conversation_id`, `grader`, `guideline_paths`.
 * @param {object} options.provider - Provider to invoke.
 * @param {object} options.target - Target descriptor; `name` is recorded in the result.
 * @param {object} options.graders - Registry keyed by grader kind; falls back to `heuristic`.
 * @param {Function} [options.now] - Clock returning a Date; defaults to `new Date()`.
 * @param {number} [options.maxRetries] - Extra attempts allowed after the first (default 0).
 * @param {number} [options.agentTimeoutMs] - Per-invocation timeout, forwarded to `invokeProvider`.
 * @param {string} [options.promptDumpDir] - When set, the prompt is written there first.
 * @param {object} [options.cache] - Object with async `get(key)` / `set(key, value)`.
 * @param {boolean} [options.useCache] - Enables computing the cache key.
 * @param {AbortSignal} [options.signal] - Forwarded to `invokeProvider`.
 * @param {object} [options.judgeProvider] - Forwarded to the grader.
 * @returns {Promise<object>} The result record for this test case.
 * @throws {Error} When no grader is registered for the requested kind and no
 *   `heuristic` grader exists either.
 */
async function runTestCase(options) {
  const {
    testCase,
    provider,
    target,
    graders,
    now,
    maxRetries,
    agentTimeoutMs,
    promptDumpDir,
    cache,
    useCache,
    signal,
    judgeProvider
  } = options;
  const promptInputs = await buildPromptInputs(testCase);
  if (promptDumpDir) {
    await dumpPrompt(promptDumpDir, testCase, promptInputs);
  }
  const cacheKey = useCache ? createCacheKey(provider, target, testCase, promptInputs) : void 0;
  let cachedResponse;
  if (cacheKey && cache) {
    cachedResponse = await cache.get(cacheKey);
  }
  const nowFn = now ?? (() => /* @__PURE__ */ new Date());
  const attemptBudget = (maxRetries ?? 0) + 1;
  let attempt = 0;
  let providerResponse = cachedResponse;
  let lastError;
  while (!providerResponse && attempt < attemptBudget) {
    try {
      providerResponse = await invokeProvider(provider, {
        testCase,
        target,
        promptInputs,
        attempt,
        agentTimeoutMs,
        signal
      });
      if (!providerResponse) {
        // An empty response consumes an attempt. Without this the loop
        // condition never changes and a provider that keeps resolving with a
        // falsy value would be invoked forever.
        attempt += 1;
      }
    } catch (error) {
      lastError = error;
      // Only timeout-like failures are retried, and only while attempts remain.
      if (isTimeoutLike(error) && attempt + 1 < attemptBudget) {
        attempt += 1;
        continue;
      }
      return buildErrorResult(testCase, target.name, nowFn(), error, promptInputs);
    }
  }
  if (!providerResponse) {
    return buildErrorResult(
      testCase,
      target.name,
      nowFn(),
      lastError ?? new Error("Provider did not return a response"),
      promptInputs
    );
  }
  // Store only freshly obtained responses; a cache hit is not written back.
  if (cacheKey && cache && !cachedResponse) {
    await cache.set(cacheKey, providerResponse);
  }
  const graderKind = testCase.grader ?? "heuristic";
  const activeGrader = graders[graderKind] ?? graders.heuristic;
  if (!activeGrader) {
    throw new Error(`No grader registered for kind '${graderKind}'`);
  }
  let grade;
  try {
    const gradeTimestamp = nowFn();
    grade = await activeGrader.grade({
      testCase,
      candidate: providerResponse.text ?? "",
      target,
      provider,
      attempt,
      promptInputs,
      now: gradeTimestamp,
      judgeProvider
    });
  } catch (error) {
    return buildErrorResult(testCase, target.name, nowFn(), error, promptInputs);
  }
  const completedAt = nowFn();
  const rawRequest = {
    request: promptInputs.request,
    guidelines: promptInputs.guidelines,
    guideline_paths: testCase.guideline_paths
  };
  return {
    test_id: testCase.id,
    conversation_id: testCase.conversation_id,
    score: grade.score,
    hits: grade.hits,
    misses: grade.misses,
    model_answer: providerResponse.text ?? "",
    expected_aspect_count: grade.expectedAspectCount,
    target: target.name,
    timestamp: completedAt.toISOString(),
    reasoning: grade.reasoning,
    raw_aspects: grade.rawAspects,
    raw_request: rawRequest,
    grader_raw_request: grade.graderRawRequest
  };
}
|
|
2054
|
+
/**
 * @param {object[]} testCases - All loaded test cases.
 * @param {string} [testId] - Id to select.
 * @returns {object[]} The same `testCases` array when `testId` is falsy;
 *   otherwise a new array of the cases whose `id` strictly equals `testId`.
 */
function filterTestCases(testCases, testId) {
  return testId ? testCases.filter(({ id }) => id === testId) : testCases;
}
|
|
2060
|
+
/**
 * Builds the grader registry: caller-supplied graders plus defaults for the
 * `heuristic` and `llm_judge` kinds when those are not overridden.
 *
 * @param {object} [overrides] - Graders keyed by kind.
 * @param {Function} resolveJudgeProvider - Called with `context.target` by the
 *   default `llm_judge` grader when the context carries no `judgeProvider`.
 * @returns {object} Registry containing every override plus `heuristic` and `llm_judge`.
 */
function buildGraderRegistry(overrides, resolveJudgeProvider) {
  // Prefer a judge already attached to the grading context.
  const judgeFromContext = async (context) =>
    context.judgeProvider || resolveJudgeProvider(context.target);

  const registry = { ...overrides };
  registry.heuristic = overrides?.heuristic ?? new HeuristicGrader();
  registry.llm_judge =
    overrides?.llm_judge ?? new QualityGrader({ resolveJudgeProvider: judgeFromContext });
  return registry;
}
|
|
2076
|
+
/**
 * Writes the prompt of one test case to `<directory>/<timestamp>_<id>.json`,
 * creating the directory when needed.
 *
 * @param {string} directory - Destination directory.
 * @param {object} testCase - Reads `id` and `guideline_paths`.
 * @param {object} promptInputs - Reads `request` and `guidelines`.
 * @returns {Promise<void>}
 */
async function dumpPrompt(directory, testCase, promptInputs) {
  const pathApi = import_node_path5.default;
  // ISO timestamp with ":" and "." replaced by "-".
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const destination = pathApi.resolve(
    directory,
    `${stamp}_${sanitizeFilename(testCase.id)}.json`
  );
  await (0, import_promises5.mkdir)(pathApi.dirname(destination), { recursive: true });

  const serialized = JSON.stringify(
    {
      test_id: testCase.id,
      request: promptInputs.request,
      guidelines: promptInputs.guidelines,
      guideline_paths: testCase.guideline_paths
    },
    null,
    2
  );
  await (0, import_promises5.writeFile)(destination, serialized, "utf8");
}
|
|
2089
|
+
/**
 * Turns a test id into a file-name fragment.
 *
 * @param {string} [value] - Raw id.
 * @returns {string} "prompt" for a falsy id; otherwise the id with every run of
 *   characters outside `A-Za-z0-9._-` replaced by "_". A random UUID is
 *   returned if that replacement ever produces an empty string.
 */
function sanitizeFilename(value) {
  if (!value) {
    return "prompt";
  }
  const safe = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  if (safe.length > 0) {
    return safe;
  }
  return (0, import_node_crypto2.randomUUID)();
}
|
|
2096
|
+
/**
 * Invokes `provider` for one attempt of a test case, with an abort signal that
 * fires on timeout or when the caller's `signal` aborts.
 *
 * @param {object} provider - Object with an async `invoke(request)` method.
 * @param {object} options
 * @param {object} options.testCase - Reads `id`, `guideline_paths`, `grader`.
 * @param {object} options.target - Reads `name`.
 * @param {object} options.promptInputs - Reads `request` and `guidelines`.
 * @param {number} options.attempt - Attempt index, forwarded to the provider.
 * @param {number} [options.agentTimeoutMs] - When truthy, the request signal is
 *   aborted after this many milliseconds.
 * @param {AbortSignal} [options.signal] - Caller's signal; its abort is
 *   propagated to the request signal, including when it is already aborted.
 * @returns {Promise<*>} Whatever `provider.invoke` resolves with.
 */
async function invokeProvider(provider, options) {
  const { testCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
  const controller = new AbortController();
  const abortRequest = () => controller.abort();
  const timeout = agentTimeoutMs ? setTimeout(abortRequest, agentTimeoutMs) : void 0;
  if (signal) {
    if (signal.aborted) {
      // The "abort" event does not fire for a signal that is already aborted,
      // so propagate that state directly.
      abortRequest();
    } else {
      signal.addEventListener("abort", abortRequest, { once: true });
    }
  }
  try {
    return await provider.invoke({
      prompt: promptInputs.request,
      guidelines: promptInputs.guidelines,
      attachments: testCase.guideline_paths,
      testCaseId: testCase.id,
      attempt,
      metadata: {
        target: target.name,
        grader: testCase.grader
      },
      signal: controller.signal
    });
  } finally {
    if (timeout !== void 0) {
      clearTimeout(timeout);
    }
    // Detach from the caller's signal so a signal shared across many
    // invocations does not accumulate listeners.
    signal?.removeEventListener("abort", abortRequest);
  }
}
|
|
2122
|
+
/**
 * Builds the zero-score result record used when a test case fails.
 *
 * @param {object} testCase - Reads `id`, `conversation_id`, `guideline_paths`.
 * @param {string} targetName - Recorded as `target`.
 * @param {Date} timestamp - Recorded as an ISO string.
 * @param {*} error - An Error (its `message` is used) or any value (stringified).
 * @param {object} promptInputs - Reads `request` and `guidelines`.
 * @returns {object} Result with score 0, no hits, one miss describing the error,
 *   and the error message echoed in `model_answer` and `raw_request.error`.
 */
function buildErrorResult(testCase, targetName, timestamp, error, promptInputs) {
  let message;
  if (error instanceof Error) {
    message = error.message;
  } else {
    message = String(error);
  }

  const { request, guidelines } = promptInputs;
  const rawRequest = {
    request,
    guidelines,
    guideline_paths: testCase.guideline_paths,
    error: message
  };

  return {
    test_id: testCase.id,
    conversation_id: testCase.conversation_id,
    score: 0,
    hits: [],
    misses: [`Error: ${message}`],
    model_answer: `Error occurred: ${message}`,
    expected_aspect_count: 0,
    target: targetName,
    timestamp: timestamp.toISOString(),
    raw_aspects: [],
    raw_request: rawRequest
  };
}
|
|
2144
|
+
/**
 * Derives the cache key for a provider response: the SHA-256 hex digest of the
 * provider id, target name, test-case id, request text and guidelines text,
 * fed to the hash in that order.
 *
 * @param {object} provider - Reads `id`.
 * @param {object} target - Reads `name`.
 * @param {object} testCase - Reads `id`.
 * @param {object} promptInputs - Reads `request` and `guidelines`.
 * @returns {string} Hex digest.
 */
function createCacheKey(provider, target, testCase, promptInputs) {
  const digest = (0, import_node_crypto2.createHash)("sha256");
  const components = [
    provider.id,
    target.name,
    testCase.id,
    promptInputs.request,
    promptInputs.guidelines
  ];
  for (const component of components) {
    digest.update(component);
  }
  return digest.digest("hex");
}
|
|
2153
|
+
/**
 * Classifies a failure as timeout-like.
 *
 * @param {*} error - Thrown value of any type.
 * @returns {boolean} `true` for a `DOMException` named "AbortError", for an
 *   Error whose name or message contains "timeout" (case-insensitive), or for
 *   any other value whose string form contains "timeout"; `false` otherwise,
 *   including for falsy values.
 */
function isTimeoutLike(error) {
  if (!error) {
    return false;
  }
  if (typeof DOMException !== "undefined" && error instanceof DOMException && error.name === "AbortError") {
    return true;
  }
  if (error instanceof Error) {
    // `name` and `message` can be overwritten with non-strings; treat those as
    // empty so this classifier never throws while handling another error.
    const name = typeof error.name === "string" ? error.name.toLowerCase() : "";
    const message = typeof error.message === "string" ? error.message.toLowerCase() : "";
    return name.includes("timeout") || message.includes("timeout");
  }
  return String(error).toLowerCase().includes("timeout");
}
|
|
2168
|
+
|
|
2169
|
+
// src/index.ts
|
|
2170
|
+
/**
 * Placeholder factory.
 *
 * @returns {{status: string}} A new object whose `status` is "stub".
 */
function createAgentKernel() {
  const kernel = { status: "stub" };
  return kernel;
}
|
|
2173
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
2174
|
+
0 && (module.exports = {
|
|
2175
|
+
GRADER_KINDS,
|
|
2176
|
+
HeuristicGrader,
|
|
2177
|
+
QualityGrader,
|
|
2178
|
+
TEST_MESSAGE_ROLES,
|
|
2179
|
+
buildPromptInputs,
|
|
2180
|
+
calculateHits,
|
|
2181
|
+
calculateMisses,
|
|
2182
|
+
createAgentKernel,
|
|
2183
|
+
createProvider,
|
|
2184
|
+
ensureVSCodeSubagents,
|
|
2185
|
+
extractAspects,
|
|
2186
|
+
extractCodeBlocks,
|
|
2187
|
+
getHitCount,
|
|
2188
|
+
isErrorLike,
|
|
2189
|
+
isGraderKind,
|
|
2190
|
+
isGuidelineFile,
|
|
2191
|
+
isJsonObject,
|
|
2192
|
+
isJsonValue,
|
|
2193
|
+
isTestMessage,
|
|
2194
|
+
isTestMessageRole,
|
|
2195
|
+
listTargetNames,
|
|
2196
|
+
loadTestCases,
|
|
2197
|
+
readTargetDefinitions,
|
|
2198
|
+
resolveAndCreateProvider,
|
|
2199
|
+
resolveTargetDefinition,
|
|
2200
|
+
runEvaluation,
|
|
2201
|
+
runTestCase,
|
|
2202
|
+
scoreCandidateResponse
|
|
2203
|
+
});
|
|
2204
|
+
//# sourceMappingURL=index.cjs.map
|