@agentv/core 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,2204 @@
1
+ "use strict";
2
+ var __create = Object.create;
3
+ var __defProp = Object.defineProperty;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
7
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
8
+ var __export = (target, all) => {
9
+ for (var name in all)
10
+ __defProp(target, name, { get: all[name], enumerable: true });
11
+ };
12
+ var __copyProps = (to, from, except, desc) => {
13
+ if (from && typeof from === "object" || typeof from === "function") {
14
+ for (let key of __getOwnPropNames(from))
15
+ if (!__hasOwnProp.call(to, key) && key !== except)
16
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
17
+ }
18
+ return to;
19
+ };
20
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
28
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
+
30
+ // src/index.ts
31
+ var index_exports = {};
32
+ __export(index_exports, {
33
+ GRADER_KINDS: () => GRADER_KINDS,
34
+ HeuristicGrader: () => HeuristicGrader,
35
+ QualityGrader: () => QualityGrader,
36
+ TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
37
+ buildPromptInputs: () => buildPromptInputs,
38
+ calculateHits: () => calculateHits,
39
+ calculateMisses: () => calculateMisses,
40
+ createAgentKernel: () => createAgentKernel,
41
+ createProvider: () => createProvider,
42
+ ensureVSCodeSubagents: () => ensureVSCodeSubagents,
43
+ extractAspects: () => extractAspects,
44
+ extractCodeBlocks: () => extractCodeBlocks,
45
+ getHitCount: () => getHitCount,
46
+ isErrorLike: () => isErrorLike,
47
+ isGraderKind: () => isGraderKind,
48
+ isGuidelineFile: () => isGuidelineFile,
49
+ isJsonObject: () => isJsonObject,
50
+ isJsonValue: () => isJsonValue,
51
+ isTestMessage: () => isTestMessage,
52
+ isTestMessageRole: () => isTestMessageRole,
53
+ listTargetNames: () => listTargetNames,
54
+ loadTestCases: () => loadTestCases,
55
+ readTargetDefinitions: () => readTargetDefinitions,
56
+ resolveAndCreateProvider: () => resolveAndCreateProvider,
57
+ resolveTargetDefinition: () => resolveTargetDefinition,
58
+ runEvaluation: () => runEvaluation,
59
+ runTestCase: () => runTestCase,
60
+ scoreCandidateResponse: () => scoreCandidateResponse
61
+ });
62
+ module.exports = __toCommonJS(index_exports);
63
+
64
+ // src/evaluation/types.ts
65
/** Message roles accepted in eval files, in declaration order. */
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
/** Exported alias; refers to the same array instance as the values list. */
var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
/** Set form of the role list, used for membership checks. */
var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);

/**
 * Type guard for message roles.
 *
 * @param {unknown} value - Candidate role.
 * @returns {boolean} True only when `value` is a string listed in TEST_MESSAGE_ROLE_VALUES.
 */
function isTestMessageRole(value) {
  if (typeof value !== "string") {
    return false;
  }
  return TEST_MESSAGE_ROLE_SET.has(value);
}
71
/**
 * Type guard for JSON-style objects.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when `value` is a non-null, non-array object whose
 *   own enumerable values all satisfy {@link isJsonValue}.
 */
function isJsonObject(value) {
  const isNonArrayObject = typeof value === "object" && value !== null && !Array.isArray(value);
  return isNonArrayObject && Object.values(value).every(isJsonValue);
}

/**
 * Type guard for JSON-style values: null, string, number, boolean, arrays of
 * such values, and objects accepted by {@link isJsonObject}.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} False for undefined, functions, symbols and bigints.
 */
function isJsonValue(value) {
  // typeof null is "object", so handle it before dispatching on the type tag.
  if (value === null) {
    return true;
  }
  switch (typeof value) {
    case "string":
    case "number":
    case "boolean":
      return true;
    case "object":
      return Array.isArray(value) ? value.every(isJsonValue) : isJsonObject(value);
    default:
      return false;
  }
}
89
/**
 * Type guard for a test message: an object with a valid `role` and a
 * `content` that is either a string or an array of JSON objects.
 *
 * @param {unknown} value - Candidate message.
 * @returns {boolean} True when the shape described above is met.
 */
function isTestMessage(value) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  if (!isTestMessageRole(value.role)) {
    return false;
  }
  const { content } = value;
  if (typeof content === "string") {
    return true;
  }
  return Array.isArray(content) && content.every(isJsonObject);
}
105
/** Grader identifiers recognised by the evaluator. */
var GRADER_KIND_VALUES = ["heuristic", "llm_judge"];
/** Exported alias; refers to the same array instance as the values list. */
var GRADER_KINDS = GRADER_KIND_VALUES;
/** Set form of the grader list, used for membership checks. */
var GRADER_KIND_SET = new Set(GRADER_KIND_VALUES);

/**
 * Type guard for grader kinds.
 *
 * @param {unknown} value - Candidate grader name.
 * @returns {boolean} True only when `value` is a string listed in GRADER_KIND_VALUES.
 */
function isGraderKind(value) {
  if (typeof value !== "string") {
    return false;
  }
  return GRADER_KIND_SET.has(value);
}
111
/**
 * Number of hits recorded on a result.
 *
 * @param {{ hits: ArrayLike<unknown> }} result - Object exposing a `hits` collection.
 * @returns {number} `result.hits.length`.
 */
function getHitCount(result) {
  const { hits } = result;
  return hits.length;
}
114
+
115
+ // src/evaluation/yaml-parser.ts
116
+ var import_node_fs2 = require("fs");
117
+ var import_promises2 = require("fs/promises");
118
+ var import_node_path2 = __toESM(require("path"), 1);
119
+ var import_node_url = require("url");
120
+ var import_yaml = require("yaml");
121
+
122
+ // src/evaluation/file-utils.ts
123
+ var import_node_fs = require("fs");
124
+ var import_promises = require("fs/promises");
125
+ var import_node_path = __toESM(require("path"), 1);
126
/**
 * Check whether a path can be accessed on disk.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} True when `access(filePath, F_OK)` succeeds;
 *   any failure is reported as false rather than thrown.
 */
async function fileExists(filePath) {
  let reachable = true;
  try {
    await (0, import_promises.access)(filePath, import_node_fs.constants.F_OK);
  } catch {
    reachable = false;
  }
  return reachable;
}
134
/**
 * Build the ordered, de-duplicated list of directories used to resolve file
 * references from an eval file.
 *
 * The order is: the eval file's directory, then each ancestor up to and
 * including `repoRoot` (or the filesystem root if `repoRoot` is never
 * reached), then `repoRoot` itself, then the current working directory.
 *
 * @param {string} evalPath - Path of the eval file.
 * @param {string} repoRoot - Directory at which the upward walk stops.
 * @returns {string[]} Absolute directories, each appearing once.
 */
function buildSearchRoots(evalPath, repoRoot) {
  const roots = [];
  const remember = (dir) => {
    const absolute = import_node_path.default.resolve(dir);
    if (roots.indexOf(absolute) === -1) {
      roots.push(absolute);
    }
  };
  let dir = import_node_path.default.dirname(evalPath);
  for (;;) {
    remember(dir);
    const parent = import_node_path.default.dirname(dir);
    // Stop at the repo root, or when dirname() no longer moves upward.
    if (dir === repoRoot || parent === dir) {
      break;
    }
    dir = parent;
  }
  remember(repoRoot);
  remember(process.cwd());
  return roots;
}
157
/**
 * Strip leading `/` and `\` characters from a path-like string.
 *
 * @param {string} value - Raw path text.
 * @returns {string} The stripped text, or the original `value` when
 *   stripping would leave nothing.
 */
function trimLeadingSeparators(value) {
  const stripped = value.replace(/^[/\\]+/, "");
  if (stripped.length === 0) {
    return value;
  }
  return stripped;
}
161
/**
 * Locate a referenced file by probing candidate locations in order.
 *
 * Candidates are the normalized `rawValue` itself when it is absolute,
 * followed by `rawValue` (leading separators removed) resolved against each
 * search root. Duplicate candidates are probed once.
 *
 * @param {string} rawValue - File reference as written in the eval file.
 * @param {string[]} searchRoots - Directories to resolve against, in priority order.
 * @returns {Promise<{displayPath: string, resolvedPath?: string, attempted: string[]}>}
 *   `resolvedPath` is present only when a candidate exists; `attempted` lists
 *   the absolute paths probed up to and including the match.
 */
async function resolveFileReference(rawValue, searchRoots) {
  const pathApi = import_node_path.default;
  const displayPath = trimLeadingSeparators(rawValue);
  const candidates = pathApi.isAbsolute(rawValue) ? [pathApi.normalize(rawValue)] : [];
  for (const root of searchRoots) {
    candidates.push(pathApi.resolve(root, displayPath));
  }
  const attempted = [];
  const visited = /* @__PURE__ */ new Set();
  for (const candidate of candidates) {
    const absolute = pathApi.resolve(candidate);
    if (visited.has(absolute)) {
      continue;
    }
    visited.add(absolute);
    attempted.push(absolute);
    const found = await fileExists(absolute);
    if (found) {
      return { displayPath, resolvedPath: absolute, attempted };
    }
  }
  return { displayPath, attempted };
}
185
+
186
+ // src/evaluation/yaml-parser.ts
187
/** Matches fenced code blocks (triple backticks), non-greedy, across lines. */
var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
/** ANSI escape that switches the terminal foreground to yellow. */
var ANSI_YELLOW = "\x1B[33m";
/** ANSI escape that resets terminal attributes. */
var ANSI_RESET = "\x1B[0m";
/** Required `$schema` value for eval files. */
var SCHEMA_EVAL_V2 = "agentv-eval-v2";

/**
 * Decide whether a file reference points at a guideline file.
 *
 * A path qualifies when it ends in `.instructions.md` or `.prompt.md`, or
 * when any of its directory segments is named `instructions` or `prompts`.
 * Backslashes are treated as separators.
 *
 * Segments are compared rather than searching for "/instructions/", so a
 * relative path that starts with the directory (e.g. "prompts/review.md")
 * is recognised too. Callers pass display paths with leading separators
 * removed, which made that case reachable.
 *
 * @param {string} filePath - Path to classify.
 * @returns {boolean} True for guideline files.
 */
function isGuidelineFile(filePath) {
  const normalized = filePath.split("\\").join("/");
  if (normalized.endsWith(".instructions.md") || normalized.endsWith(".prompt.md")) {
    return true;
  }
  // Drop the final segment: only directories count, not the file name itself.
  const directories = normalized.split("/").slice(0, -1);
  return directories.includes("instructions") || directories.includes("prompts");
}
195
/**
 * Collect every fenced code block found in the text segments.
 *
 * Only segments with `type === "text"` and a string `value` are scanned;
 * everything else is ignored.
 *
 * @param {Array<Record<string, unknown>>} segments - Message segments.
 * @returns {string[]} Matched blocks (fences included) in encounter order.
 */
function extractCodeBlocks(segments) {
  return segments.flatMap((segment) => {
    if (segment["type"] !== "text") {
      return [];
    }
    const text = segment["value"];
    if (typeof text !== "string") {
      return [];
    }
    return text.match(CODE_BLOCK_PATTERN) ?? [];
  });
}
213
/**
 * Render a caught value for a warning message. Anything can be thrown, so
 * fall back to String() when the value is not an Error instance.
 */
function describeLoadError(error) {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Walk the user messages of one eval case and gather its prompt material.
 *
 * String content becomes a text segment. For array content, `file` segments
 * are resolved and read from disk: guideline files contribute only their
 * path, other files become `{type: "file", path, text}` segments. Every
 * other segment is deep-copied as-is. Missing or unreadable files are
 * reported through logWarning and skipped.
 *
 * @returns {Promise<{userSegments: object[], guidelinePaths: string[], userTextParts: string[]}>}
 */
async function collectUserSegmentsForCase(userMessages, searchRoots, verbose) {
  const userSegments = [];
  const guidelinePaths = [];
  const userTextParts = [];
  for (const userMessage of userMessages) {
    const content = userMessage.content;
    if (typeof content === "string") {
      userSegments.push({ type: "text", value: content });
      userTextParts.push(content);
      continue;
    }
    for (const rawSegment of content) {
      if (!isJsonObject(rawSegment)) {
        continue;
      }
      if (asString(rawSegment.type) !== "file") {
        const clonedSegment = cloneJsonObject(rawSegment);
        userSegments.push(clonedSegment);
        const inlineValue = clonedSegment.value;
        if (typeof inlineValue === "string") {
          userTextParts.push(inlineValue);
        }
        continue;
      }
      const rawValue = asString(rawSegment.value);
      if (!rawValue) {
        continue;
      }
      const { displayPath, resolvedPath, attempted } = await resolveFileReference(
        rawValue,
        searchRoots
      );
      if (!resolvedPath) {
        const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
        logWarning(`File not found: ${displayPath}`, attempts);
        continue;
      }
      try {
        // Normalize Windows line endings so prompts are platform independent.
        const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
        if (isGuidelineFile(displayPath)) {
          guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
          if (verbose) {
            console.log(` [Guideline] Found: ${displayPath}`);
            console.log(` Resolved to: ${resolvedPath}`);
          }
        } else {
          userSegments.push({
            type: "file",
            path: displayPath,
            text: fileContent
          });
          if (verbose) {
            console.log(` [File] Found: ${displayPath}`);
            console.log(` Resolved to: ${resolvedPath}`);
          }
        }
      } catch (error) {
        logWarning(`Could not read file ${resolvedPath}: ${describeLoadError(error)}`);
      }
    }
  }
  return { userSegments, guidelinePaths, userTextParts };
}

/**
 * Convert one raw `evalcases` entry into a test case.
 *
 * @returns {Promise<object | undefined>} The test case, or undefined when the
 *   entry is skipped (a warning has been logged in that situation).
 */
async function parseEvalCaseEntry(rawTestcase, { searchRoots, globalGrader, verbose }) {
  if (!isJsonObject(rawTestcase)) {
    logWarning("Skipping invalid test case entry (expected object)");
    return void 0;
  }
  const id = asString(rawTestcase.id);
  const conversationId = asString(rawTestcase.conversation_id);
  const outcome = asString(rawTestcase.outcome);
  const inputMessagesValue = rawTestcase.input_messages;
  const expectedMessagesValue = rawTestcase.expected_messages;
  if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
    logWarning(`Skipping incomplete test case: ${id ?? "unknown"}`);
    return void 0;
  }
  if (!Array.isArray(expectedMessagesValue)) {
    logWarning(`Test case '${id}' missing expected_messages array`);
    return void 0;
  }
  const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
  const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
  const assistantMessages = expectedMessages.filter((message) => message.role === "assistant");
  const userMessages = inputMessages.filter((message) => message.role === "user");
  if (assistantMessages.length === 0) {
    logWarning(`No assistant message found for test case: ${id}`);
    return void 0;
  }
  if (assistantMessages.length > 1) {
    logWarning(`Multiple assistant messages found for test case: ${id}, using first`);
  }
  const { userSegments, guidelinePaths, userTextParts } = await collectUserSegmentsForCase(
    userMessages,
    searchRoots,
    verbose
  );
  const codeSnippets = extractCodeBlocks(userSegments);
  const expectedAssistantRaw = normalizeAssistantContent(assistantMessages[0].content);
  const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
  const testCaseGrader = coerceGrader(rawTestcase.grader) ?? globalGrader;
  const testCase = {
    id,
    conversation_id: conversationId,
    task: userTextPrompt,
    user_segments: userSegments,
    expected_assistant_raw: expectedAssistantRaw,
    // Paths were already made absolute when collected.
    guideline_paths: guidelinePaths,
    code_snippets: codeSnippets,
    outcome,
    grader: testCaseGrader
  };
  if (verbose) {
    console.log(`\n[Test Case: ${id}]`);
    if (testCase.guideline_paths.length > 0) {
      console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
      for (const guidelinePath of testCase.guideline_paths) {
        console.log(` - ${guidelinePath}`);
      }
    } else {
      console.log(" No guidelines found");
    }
  }
  return testCase;
}

/**
 * Load an eval YAML file and turn its `evalcases` into test cases.
 *
 * The file must parse to an object whose `$schema` equals SCHEMA_EVAL_V2 and
 * whose `evalcases` is an array. Entries that are malformed or incomplete are
 * skipped with a warning instead of failing the whole load. The suite-level
 * `grader` (default "llm_judge") applies to cases that do not set their own.
 *
 * @param {string} testFilePath - Path to the eval file.
 * @param {string | URL} repoRoot - Repository root, as a path, file URL string or URL.
 * @param {{ verbose?: boolean }} [options] - `verbose` enables progress logging.
 * @returns {Promise<object[]>} Test cases in file order.
 * @throws {Error} When the file is missing, is not an object, has a missing
 *   or wrong `$schema`, or lacks an `evalcases` array.
 */
async function loadTestCases(testFilePath, repoRoot, options) {
  const verbose = options?.verbose ?? false;
  const absoluteTestPath = import_node_path2.default.resolve(testFilePath);
  if (!await fileExists2(absoluteTestPath)) {
    throw new Error(`Test file not found: ${testFilePath}`);
  }
  const repoRootPath = resolveToAbsolutePath(repoRoot);
  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
  const rawFile = await (0, import_promises2.readFile)(absoluteTestPath, "utf8");
  const suite = (0, import_yaml.parse)(rawFile);
  if (!isJsonObject(suite)) {
    throw new Error(`Invalid test file format: ${testFilePath}`);
  }
  const schema = suite.$schema;
  if (schema !== SCHEMA_EVAL_V2) {
    const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${testFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${testFilePath}.\nPlease add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
    throw new Error(message);
  }
  const rawTestcases = suite.evalcases;
  if (!Array.isArray(rawTestcases)) {
    throw new Error(`Invalid test file format: ${testFilePath} - missing 'evalcases' field`);
  }
  const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
  const context = { searchRoots, globalGrader, verbose };
  const results = [];
  // Sequential on purpose: keeps warnings and verbose output in file order.
  for (const rawTestcase of rawTestcases) {
    const testCase = await parseEvalCaseEntry(rawTestcase, context);
    if (testCase) {
      results.push(testCase);
    }
  }
  return results;
}
362
/**
 * Assemble the request text and guideline text for a test case.
 *
 * Guidelines: each readable file in `guideline_paths` is rendered as
 * `=== <basename> ===` followed by its content (CRLF normalized to LF).
 * Missing or unreadable files are reported through logWarning and skipped.
 *
 * Request: `file` segments are rendered as `=== <path> ===` plus their text;
 * any other segment contributes its `value` when that is a string. The
 * case's code snippets are appended as one newline-joined part.
 *
 * In both outputs the parts are trimmed, empty parts dropped, and the rest
 * joined with a blank line.
 *
 * @param {{guideline_paths: string[], user_segments: object[], code_snippets: string[]}} testCase
 * @returns {Promise<{request: string, guidelines: string}>}
 */
async function buildPromptInputs(testCase) {
  const joinNonEmpty = (parts) => parts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");

  const guidelineContents = [];
  for (const rawPath of testCase.guideline_paths) {
    const absolutePath = import_node_path2.default.resolve(rawPath);
    const present = await fileExists2(absolutePath);
    if (!present) {
      logWarning(`Could not read guideline file ${absolutePath}: file does not exist`);
      continue;
    }
    try {
      const raw = await (0, import_promises2.readFile)(absolutePath, "utf8");
      const content = raw.replace(/\r\n/g, "\n");
      guidelineContents.push(`=== ${import_node_path2.default.basename(absolutePath)} ===\n${content}`);
    } catch (error) {
      logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
    }
  }

  const requestParts = [];
  for (const segment of testCase.user_segments) {
    if (segment.type === "file") {
      const label = typeof segment.path === "string" ? segment.path : "file";
      const body = typeof segment.text === "string" ? segment.text : "";
      requestParts.push(`=== ${label} ===\n${body}`);
    } else if (typeof segment.value === "string") {
      // Text segments and unknown segment types are handled alike.
      requestParts.push(segment.value);
    }
  }
  if (testCase.code_snippets.length > 0) {
    requestParts.push(testCase.code_snippets.join("\n"));
  }

  return {
    request: joinNonEmpty(requestParts),
    guidelines: joinNonEmpty(guidelineContents)
  };
}
409
/**
 * Check whether a path can be accessed on disk.
 *
 * @param {string} absolutePath - Path to probe.
 * @returns {Promise<boolean>} True when `access(absolutePath, F_OK)`
 *   succeeds; any failure is reported as false rather than thrown.
 */
async function fileExists2(absolutePath) {
  let reachable = true;
  try {
    await (0, import_promises2.access)(absolutePath, import_node_fs2.constants.F_OK);
  } catch {
    reachable = false;
  }
  return reachable;
}
417
/**
 * Convert a repo-root reference into an absolute filesystem path.
 *
 * @param {string | URL} candidate - A URL object, a `file://` URL string, or
 *   a filesystem path (resolved against the current working directory).
 * @returns {string} Absolute path.
 * @throws {TypeError} When `candidate` is neither a string nor a URL.
 */
function resolveToAbsolutePath(candidate) {
  if (candidate instanceof URL) {
    return (0, import_node_url.fileURLToPath)(candidate);
  }
  if (typeof candidate !== "string") {
    throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
  }
  return candidate.startsWith("file://")
    ? (0, import_node_url.fileURLToPath)(new URL(candidate))
    : import_node_path2.default.resolve(candidate);
}
429
/**
 * Narrow an unknown value to a string.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string | undefined} `value` when it is a string, otherwise undefined.
 */
function asString(value) {
  if (typeof value === "string") {
    return value;
  }
  return void 0;
}
432
/**
 * Deep-copy a JSON-style object.
 *
 * @param {Record<string, unknown>} source - Object to copy.
 * @returns {Record<string, unknown>} New object whose own enumerable entries
 *   are copies produced by {@link cloneJsonValue}.
 */
function cloneJsonObject(source) {
  return Object.fromEntries(
    Object.keys(source).map((key) => [key, cloneJsonValue(source[key])])
  );
}

/**
 * Deep-copy a JSON-style value. Primitives and null are returned as-is,
 * arrays are copied element by element, and anything else is handed to
 * {@link cloneJsonObject}.
 *
 * @param {unknown} value - Value to copy.
 * @returns {unknown} The copy.
 */
function cloneJsonValue(value) {
  switch (typeof value) {
    case "string":
    case "number":
    case "boolean":
      return value;
  }
  if (value === null) {
    return null;
  }
  return Array.isArray(value) ? value.map((item) => cloneJsonValue(item)) : cloneJsonObject(value);
}
448
/**
 * Flatten an assistant message's content into one string.
 *
 * Strings are returned unchanged and falsy content yields "". For iterable
 * content each entry contributes, in order of preference: the entry itself
 * when it is a string, its string `text`, its string `value`, or its JSON
 * serialization. Contributions are joined with single spaces.
 *
 * @param {string | Iterable<unknown> | null | undefined} content
 * @returns {string} Flattened text.
 */
function normalizeAssistantContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!content) {
    return "";
  }
  const toText = (entry) => {
    if (typeof entry === "string") {
      return entry;
    }
    return asString(entry["text"]) ?? asString(entry["value"]) ?? JSON.stringify(entry);
  };
  const parts = [];
  for (const entry of content) {
    parts.push(toText(entry));
  }
  return parts.join(" ");
}
475
/**
 * Validate a grader name read from an eval file.
 *
 * @param {unknown} candidate - Raw `grader` value.
 * @returns {string | undefined} The grader kind when recognised. Undefined
 *   for non-strings, and for unknown names after logging a warning, so the
 *   caller can apply its default.
 */
function coerceGrader(candidate) {
  if (typeof candidate === "string") {
    if (isGraderKind(candidate)) {
      return candidate;
    }
    logWarning(`Unknown grader '${candidate}', falling back to default`);
  }
  return void 0;
}
485
/**
 * Print a yellow "Warning: ..." line through console.warn.
 *
 * @param {string} message - Warning text.
 * @param {string[]} [details] - Extra lines; when non-empty they are placed
 *   on the lines after the message, newline-separated.
 * @returns {void}
 */
function logWarning(message, details) {
  const hasDetails = Boolean(details && details.length > 0);
  const body = hasDetails ? `${message}\n${details.join("\n")}` : message;
  console.warn(`${ANSI_YELLOW}Warning: ${body}${ANSI_RESET}`);
}
494
+
495
+ // src/evaluation/providers/ax.ts
496
+ var import_ax = require("@ax-llm/ax");
497
/** System prompt used when a request supplies neither guidelines nor a metadata system prompt. */
var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";

/**
 * Build the chat message list for a provider request.
 *
 * A truthy `request.chatPrompt` is returned untouched. Otherwise the result
 * is a system message followed by a user message. The system message joins
 * the trimmed guidelines (prefixed with "Guidelines:") and the trimmed
 * `metadata.systemPrompt` with a blank line, falling back to
 * DEFAULT_SYSTEM_PROMPT when both are empty. The user message is the trimmed
 * `request.prompt`.
 *
 * @param {{chatPrompt?: unknown, guidelines?: string, metadata?: {systemPrompt?: unknown}, prompt: string}} request
 * @returns {unknown} The supplied chat prompt, or a two-message array.
 */
function buildChatPrompt(request) {
  if (request.chatPrompt) {
    return request.chatPrompt;
  }
  const guidelineText = request.guidelines ? request.guidelines.trim() : "";
  const metadataPrompt = request.metadata?.systemPrompt;
  const extraSystemText = typeof metadataPrompt === "string" ? metadataPrompt.trim() : "";

  const systemSegments = [];
  if (guidelineText.length > 0) {
    systemSegments.push(`Guidelines:\n${guidelineText}`);
  }
  if (extraSystemText.length > 0) {
    systemSegments.push(extraSystemText);
  }

  return [
    {
      role: "system",
      content: systemSegments.length > 0 ? systemSegments.join("\n\n") : DEFAULT_SYSTEM_PROMPT
    },
    {
      role: "user",
      content: request.prompt.trim()
    }
  ];
}
525
/**
 * Merge per-request sampling options with provider defaults.
 *
 * Request values win; a nullish request value falls back to the default.
 * `maxOutputTokens` is emitted under the key `maxTokens`.
 *
 * @param {{temperature?: number, maxOutputTokens?: number}} request
 * @param {{temperature?: number, maxOutputTokens?: number}} defaults
 * @returns {{temperature?: number, maxTokens?: number} | undefined}
 *   Undefined when neither option resolves to a defined value.
 */
function extractModelConfig(request, defaults) {
  const candidates = {
    temperature: request.temperature ?? defaults.temperature,
    maxTokens: request.maxOutputTokens ?? defaults.maxOutputTokens
  };
  const config = Object.fromEntries(
    Object.entries(candidates).filter(([, value]) => value !== void 0)
  );
  return Object.keys(config).length > 0 ? config : void 0;
}
537
/**
 * Convert a chat response into the provider result shape.
 *
 * Only the first entry of `response.results` is used.
 *
 * @param {{results: Array<{content?: unknown, thought?: unknown, thoughtBlock?: {data?: unknown}}>, modelUsage?: unknown}} response
 * @returns {{text: string, reasoning: unknown, raw: unknown, usage: unknown}}
 *   `text` is the first result's string content or ""; `reasoning` is its
 *   `thought`, else `thoughtBlock.data`; `raw` is the response itself;
 *   `usage` is a JSON copy of `modelUsage`.
 */
function mapResponse(response) {
  const primary = response.results[0];
  const content = primary?.content;
  return {
    text: typeof content === "string" ? content : "",
    reasoning: primary?.thought ?? primary?.thoughtBlock?.data,
    raw: response,
    usage: toJsonObject(response.modelUsage)
  };
}
549
/**
 * Produce a JSON-only copy of an object via a stringify/parse round trip.
 *
 * @param {unknown} value - Value to copy.
 * @returns {unknown} The copy, or undefined when `value` is null, not an
 *   object, or cannot be serialized (for example, circular references).
 */
function toJsonObject(value) {
  if (value === null || typeof value !== "object") {
    return void 0;
  }
  let snapshot;
  try {
    snapshot = JSON.parse(JSON.stringify(value));
  } catch {
    snapshot = void 0;
  }
  return snapshot;
}
559
/**
 * Assert that a provider call returned a non-streaming chat response.
 *
 * The `results` property must be an array, not merely present. Downstream
 * code indexes `results[0]`, so a value such as `{ results: null }` would
 * otherwise surface later as an unrelated TypeError instead of this
 * function's descriptive error.
 *
 * @param {unknown} result - Value returned by the chat call.
 * @returns {object} `result` unchanged.
 * @throws {Error} When `result` is a ReadableStream (streaming is not
 *   supported) or is not an object with an array `results` property.
 */
function ensureChatResponse(result) {
  // ReadableStream is not a global in every runtime; guard before instanceof.
  if (typeof ReadableStream !== "undefined" && result instanceof ReadableStream) {
    throw new Error("Streaming responses are not supported for this provider");
  }
  if (!result || typeof result !== "object" || !Array.isArray(result.results)) {
    throw new Error("Unexpected response type from AxAI provider");
  }
  return result;
}
568
/**
 * Provider that sends chat requests to an Azure OpenAI deployment through AxAI.
 */
var AzureProvider = class {
  /** Provider id of the form `azure:<targetName>`. */
  id;
  kind = "azure";
  targetName;
  /** AxAI client created in the constructor. */
  ai;
  /** Default temperature / maxOutputTokens applied when a request omits them. */
  defaults;

  /**
   * @param {string} targetName - Name of the configured target.
   * @param {{apiKey: string, resourceName: string, deploymentName: string, version: string, temperature?: number, maxOutputTokens?: number}} config
   */
  constructor(targetName, config) {
    this.config = config;
    this.id = `azure:${targetName}`;
    this.targetName = targetName;
    const { temperature, maxOutputTokens } = config;
    this.defaults = { temperature, maxOutputTokens };
    this.ai = import_ax.AxAI.create({
      name: "azure-openai",
      apiKey: config.apiKey,
      resourceName: config.resourceName,
      deploymentName: config.deploymentName,
      version: config.version,
      config: {
        stream: false
      }
    });
  }

  /**
   * Send one request to the deployment and map the reply.
   *
   * @param {object} request - Provider request; `signal`, when present, is
   *   forwarded as the abort signal.
   * @returns {Promise<object>} Result produced by mapResponse.
   */
  async invoke(request) {
    const chatRequest = {
      chatPrompt: buildChatPrompt(request),
      model: this.config.deploymentName
    };
    const modelConfig = extractModelConfig(request, this.defaults);
    if (modelConfig) {
      chatRequest.modelConfig = modelConfig;
    }
    const { signal } = request;
    const response = await this.ai.chat(chatRequest, signal ? { abortSignal: signal } : void 0);
    return mapResponse(ensureChatResponse(response));
  }
};
607
/**
 * Provider that sends chat requests to Anthropic models through AxAI.
 */
var AnthropicProvider = class {
  /** Provider id of the form `anthropic:<targetName>`. */
  id;
  kind = "anthropic";
  targetName;
  /** AxAI client created in the constructor. */
  ai;
  /**
   * Defaults captured from the target config.
   * NOTE(review): `thinkingBudget` is stored here, but extractModelConfig
   * only reads temperature and maxOutputTokens, so it is not forwarded by
   * invoke(). Confirm whether that is intended.
   */
  defaults;

  /**
   * @param {string} targetName - Name of the configured target.
   * @param {{apiKey: string, model: string, temperature?: number, maxOutputTokens?: number, thinkingBudget?: number}} config
   */
  constructor(targetName, config) {
    this.config = config;
    this.id = `anthropic:${targetName}`;
    this.targetName = targetName;
    const { temperature, maxOutputTokens, thinkingBudget } = config;
    this.defaults = { temperature, maxOutputTokens, thinkingBudget };
    this.ai = import_ax.AxAI.create({
      name: "anthropic",
      apiKey: config.apiKey
    });
  }

  /**
   * Send one request to the configured model and map the reply.
   *
   * @param {object} request - Provider request; `signal`, when present, is
   *   forwarded as the abort signal.
   * @returns {Promise<object>} Result produced by mapResponse.
   */
  async invoke(request) {
    const chatRequest = {
      chatPrompt: buildChatPrompt(request),
      model: this.config.model
    };
    const modelConfig = extractModelConfig(request, this.defaults);
    if (modelConfig) {
      chatRequest.modelConfig = modelConfig;
    }
    const { signal } = request;
    const response = await this.ai.chat(chatRequest, signal ? { abortSignal: signal } : void 0);
    return mapResponse(ensureChatResponse(response));
  }
};
641
/**
 * Provider that sends chat requests to Google Gemini models through AxAI.
 */
var GeminiProvider = class {
  /** Provider id of the form `gemini:<targetName>`. */
  id;
  kind = "gemini";
  targetName;
  /** AxAI client created in the constructor. */
  ai;
  /** Default temperature / maxOutputTokens applied when a request omits them. */
  defaults;

  /**
   * @param {string} targetName - Name of the configured target.
   * @param {{apiKey: string, model: string, temperature?: number, maxOutputTokens?: number}} config
   */
  constructor(targetName, config) {
    this.config = config;
    this.id = `gemini:${targetName}`;
    this.targetName = targetName;
    const { temperature, maxOutputTokens } = config;
    this.defaults = { temperature, maxOutputTokens };
    this.ai = import_ax.AxAI.create({
      name: "google-gemini",
      apiKey: config.apiKey
    });
  }

  /**
   * Send one request to the configured model and map the reply.
   *
   * @param {object} request - Provider request; `signal`, when present, is
   *   forwarded as the abort signal.
   * @returns {Promise<object>} Result produced by mapResponse.
   */
  async invoke(request) {
    const chatRequest = {
      chatPrompt: buildChatPrompt(request),
      model: this.config.model
    };
    const modelConfig = extractModelConfig(request, this.defaults);
    if (modelConfig) {
      chatRequest.modelConfig = modelConfig;
    }
    const { signal } = request;
    const response = await this.ai.chat(chatRequest, signal ? { abortSignal: signal } : void 0);
    return mapResponse(ensureChatResponse(response));
  }
};
674
+
675
+ // src/evaluation/providers/mock.ts
676
/** Canned reply used when the mock target config supplies no `response`. */
var DEFAULT_MOCK_RESPONSE = '{"answer":"Mock provider response. Configure targets.yaml to supply a custom value."}';

/**
 * Provider that returns a fixed response, optionally after a delay.
 */
var MockProvider = class {
  /** Provider id of the form `mock:<targetName>`. */
  id;
  kind = "mock";
  targetName;
  cannedResponse;
  /** Fixed delay in ms, used when no min/max range is configured. */
  delayMs;
  delayMinMs;
  delayMaxMs;

  /**
   * @param {string} targetName - Name of the configured target.
   * @param {{response?: string, delayMs?: number, delayMinMs?: number, delayMaxMs?: number}} config
   *   Missing values default to DEFAULT_MOCK_RESPONSE and 0 respectively.
   */
  constructor(targetName, config) {
    const { response, delayMs, delayMinMs, delayMaxMs } = config;
    this.id = `mock:${targetName}`;
    this.targetName = targetName;
    this.cannedResponse = response ?? DEFAULT_MOCK_RESPONSE;
    this.delayMs = delayMs ?? 0;
    this.delayMinMs = delayMinMs ?? 0;
    this.delayMaxMs = delayMaxMs ?? 0;
  }

  /**
   * Return the canned response, echoing the prompt and guidelines in `raw`.
   *
   * @param {{prompt: unknown, guidelines: unknown}} request
   * @returns {Promise<{text: string, raw: {prompt: unknown, guidelines: unknown}}>}
   */
  async invoke(request) {
    const waitMs = this.calculateDelay();
    if (waitMs > 0) {
      await new Promise((resolve) => setTimeout(resolve, waitMs));
    }
    const { prompt, guidelines } = request;
    return {
      text: this.cannedResponse,
      raw: { prompt, guidelines }
    };
  }

  /**
   * Pick the delay for the next call.
   *
   * @returns {number} A random integer in [min, max] when either bound is
   *   positive (max is raised to at least min); otherwise the fixed delay.
   */
  calculateDelay() {
    const usesRange = this.delayMinMs > 0 || this.delayMaxMs > 0;
    if (!usesRange) {
      return this.delayMs;
    }
    const min = Math.max(0, this.delayMinMs);
    const max = Math.max(min, this.delayMaxMs);
    return Math.floor(Math.random() * (max - min + 1)) + min;
  }
};
715
+
716
+ // src/evaluation/providers/targets.ts
717
+ var import_zod = require("zod");
718
/**
 * Zod schema for the provider-independent part of a target definition:
 * non-empty `name` and `provider`, optional free-form `settings`, optional
 * `judge_target`, and optional positive integer `workers`.
 */
var BASE_TARGET_SCHEMA = (() => {
  const { z } = import_zod;
  const requiredText = (message) => z.string().min(1, message);
  return z.object({
    name: requiredText("target name is required"),
    provider: requiredText("provider is required"),
    settings: z.record(z.unknown()).optional(),
    judge_target: z.string().optional(),
    workers: z.number().int().min(1).optional()
  });
})();
725
/** Azure OpenAI API version used when a target does not specify one. */
var DEFAULT_AZURE_API_VERSION = "2024-10-01-preview";

/**
 * Clean up a user-supplied Azure API version.
 *
 * Surrounding whitespace is removed, as is a leading `api-version=` style
 * prefix (case-insensitive; `-`, `_` or no separator). Falsy or empty input
 * yields DEFAULT_AZURE_API_VERSION.
 *
 * @param {string | undefined} value - Raw version text.
 * @returns {string} Normalized version.
 */
function normalizeAzureApiVersion(value) {
  if (!value) {
    return DEFAULT_AZURE_API_VERSION;
  }
  const version = value.trim().replace(/^api[-_]?version\s*=\s*/i, "").trim();
  return version.length > 0 ? version : DEFAULT_AZURE_API_VERSION;
}
737
/**
 * Validate a raw target definition and resolve it into a provider-specific
 * target description.
 *
 * The provider name is matched case-insensitively. Aliases: "azure-openai"
 * maps to azure; "google" and "google-gemini" map to gemini. "vscode" and
 * "vscode-insiders" keep their own kind.
 *
 * @param {unknown} definition - Raw target entry; validated with BASE_TARGET_SCHEMA.
 * @param {Record<string, string | undefined>} [env=process.env] - Environment
 *   passed to the provider-specific config resolvers.
 * @returns {{kind: string, name: string, judgeTarget?: string, workers?: number, config: object}}
 * @throws {Error} When validation fails or the provider is not supported.
 */
function resolveTargetDefinition(definition, env = process.env) {
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
  const provider = parsed.provider.toLowerCase();
  const describe = (kind, config) => ({
    kind,
    name: parsed.name,
    judgeTarget: parsed.judge_target,
    workers: parsed.workers,
    config
  });

  if (provider === "azure" || provider === "azure-openai") {
    return describe("azure", resolveAzureConfig(parsed, env));
  }
  if (provider === "anthropic") {
    return describe("anthropic", resolveAnthropicConfig(parsed, env));
  }
  if (provider === "gemini" || provider === "google" || provider === "google-gemini") {
    return describe("gemini", resolveGeminiConfig(parsed, env));
  }
  if (provider === "mock") {
    return describe("mock", resolveMockConfig(parsed));
  }
  if (provider === "vscode" || provider === "vscode-insiders") {
    return describe(provider, resolveVSCodeConfig(parsed, env, provider === "vscode-insiders"));
  }
  throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
}
789
/**
 * Resolve the Azure OpenAI settings of a target.
 *
 * Accepted setting names: endpoint | resource | resourceName;
 * api_key | apiKey; deployment | deploymentName | model;
 * version | api_version; temperature; max_output_tokens | maxTokens.
 * Endpoint, API key and deployment are required. The version is optional
 * and is normalized with normalizeAzureApiVersion.
 *
 * @param {{name: string, settings?: Record<string, unknown>}} target
 * @param {Record<string, string | undefined>} env - Environment used by the string resolvers.
 * @returns {{resourceName: string, deploymentName: string, apiKey: string, version: string, temperature?: number, maxOutputTokens?: number}}
 */
function resolveAzureConfig(target, env) {
  const settings = target.settings ?? {};
  const label = target.name;
  // Resolution order is kept so the first failing setting reports first.
  const resourceName = resolveString(
    settings.endpoint ?? settings.resource ?? settings.resourceName,
    env,
    `${label} endpoint`
  );
  const apiKey = resolveString(settings.api_key ?? settings.apiKey, env, `${label} api key`);
  const deploymentName = resolveString(
    settings.deployment ?? settings.deploymentName ?? settings.model,
    env,
    `${label} deployment`
  );
  const rawVersion = resolveOptionalString(
    settings.version ?? settings.api_version,
    env,
    `${label} api version`
  );
  const version = normalizeAzureApiVersion(rawVersion);
  const temperature = resolveOptionalNumber(settings.temperature, `${label} temperature`);
  const maxOutputTokens = resolveOptionalNumber(
    settings.max_output_tokens ?? settings.maxTokens,
    `${label} max output tokens`
  );
  return { resourceName, deploymentName, apiKey, version, temperature, maxOutputTokens };
}
817
/**
 * Resolve the Anthropic settings of a target.
 *
 * Accepted setting names: api_key | apiKey; model | deployment | variant;
 * temperature; max_output_tokens | maxTokens;
 * thinking_budget | thinkingBudget. API key and model are required.
 *
 * @param {{name: string, settings?: Record<string, unknown>}} target
 * @param {Record<string, string | undefined>} env - Environment used by the string resolvers.
 * @returns {{apiKey: string, model: string, temperature?: number, maxOutputTokens?: number, thinkingBudget?: number}}
 */
function resolveAnthropicConfig(target, env) {
  const settings = target.settings ?? {};
  const label = target.name;
  const apiKey = resolveString(settings.api_key ?? settings.apiKey, env, `${label} Anthropic api key`);
  const model = resolveString(
    settings.model ?? settings.deployment ?? settings.variant,
    env,
    `${label} Anthropic model`
  );
  const temperature = resolveOptionalNumber(settings.temperature, `${label} temperature`);
  const maxOutputTokens = resolveOptionalNumber(
    settings.max_output_tokens ?? settings.maxTokens,
    `${label} max output tokens`
  );
  const thinkingBudget = resolveOptionalNumber(
    settings.thinking_budget ?? settings.thinkingBudget,
    `${label} thinking budget`
  );
  return { apiKey, model, temperature, maxOutputTokens, thinkingBudget };
}
834
/**
 * Resolve the Google Gemini settings of a target.
 *
 * Accepted setting names: api_key | apiKey; model | deployment | variant;
 * temperature; max_output_tokens | maxTokens. The API key is required. The
 * model is optional and falls back to "gemini-2.5-flash".
 *
 * @param {{name: string, settings?: Record<string, unknown>}} target
 * @param {Record<string, string | undefined>} env - Environment used by the string resolvers.
 * @returns {{apiKey: string, model: string, temperature?: number, maxOutputTokens?: number}}
 */
function resolveGeminiConfig(target, env) {
  const settings = target.settings ?? {};
  const label = target.name;
  const apiKey = resolveString(settings.api_key ?? settings.apiKey, env, `${label} Google API key`);
  const configuredModel = resolveOptionalString(
    settings.model ?? settings.deployment ?? settings.variant,
    env,
    `${label} Gemini model`,
    {
      allowLiteral: true,
      optionalEnv: true
    }
  );
  const model = configuredModel ?? "gemini-2.5-flash";
  const temperature = resolveOptionalNumber(settings.temperature, `${label} temperature`);
  const maxOutputTokens = resolveOptionalNumber(
    settings.max_output_tokens ?? settings.maxTokens,
    `${label} max output tokens`
  );
  return { apiKey, model, temperature, maxOutputTokens };
}
852
/**
 * Resolve the mock provider settings of a target.
 *
 * @param {{settings?: Record<string, unknown>}} target
 * @returns {{response: string | undefined}} `settings.response` when it is a
 *   string, otherwise undefined.
 */
function resolveMockConfig(target) {
  const { response: candidate } = target.settings ?? {};
  const response = typeof candidate === "string" ? candidate : void 0;
  return { response };
}
857
/**
 * Builds the VS Code provider configuration from a target definition.
 *
 * @param {object} target - Target definition with `name` and optional `settings`.
 * @param {object} env - Environment lookup passed to the string resolvers.
 * @param {boolean} insiders - Selects "code-insiders" instead of "code" as the
 *   default command when no command is configured.
 * @returns {{command: string, waitForResponse: boolean, dryRun: boolean, subagentRoot: (string|undefined), workspaceTemplate: (string|undefined)}}
 */
function resolveVSCodeConfig(target, env, insiders) {
  const cfg = target.settings ?? {};
  const stringOptions = { allowLiteral: true, optionalEnv: true };

  // The workspace template is resolved with allowLiteral disabled, so an
  // env-style name that is not set yields undefined instead of a literal.
  const templateRef = resolveOptionalLiteralString(cfg.workspace_template ?? cfg.workspaceTemplate);
  let workspaceTemplate = void 0;
  if (templateRef) {
    workspaceTemplate = resolveOptionalString(templateRef, env, `${target.name} workspace template path`, {
      allowLiteral: false,
      optionalEnv: true
    });
  }

  const fallbackCommand = insiders ? "code-insiders" : "code";
  const command = resolveOptionalLiteralString(cfg.vscode_cmd ?? cfg.command) ?? fallbackCommand;
  const waitForResponse = resolveOptionalBoolean(cfg.wait) ?? true;
  const dryRun = resolveOptionalBoolean(cfg.dry_run ?? cfg.dryRun) ?? false;
  const subagentRoot = resolveOptionalString(
    cfg.subagent_root ?? cfg.subagentRoot,
    env,
    `${target.name} subagent root`,
    stringOptions
  );

  return { command, waitForResponse, dryRun, subagentRoot, workspaceTemplate };
}
881
/**
 * Resolves a mandatory string setting.
 *
 * @param {unknown} source - Raw setting value (env variable name or literal).
 * @param {object} env - Environment lookup.
 * @param {string} description - Human-readable label used in error messages.
 * @param {boolean} [allowLiteral=false] - Whether env-looking values may be used literally.
 * @returns {string} The resolved value.
 * @throws {Error} When the setting resolves to nothing.
 */
function resolveString(source, env, description, allowLiteral = false) {
  const resolved = resolveOptionalString(source, env, description, {
    allowLiteral,
    optionalEnv: false
  });
  if (resolved !== void 0) {
    return resolved;
  }
  throw new Error(`${description} is required`);
}
891
/**
 * Resolves an optional string setting that may name an environment variable.
 *
 * Resolution order:
 *  1. `undefined`, `null` and blank strings resolve to `undefined`.
 *  2. If the trimmed value is a key of `env` holding a string, that value wins.
 *  3. Otherwise the trimmed value is returned as a literal, unless it looks
 *     like an env reference and literals are not allowed.
 *
 * @param {unknown} source - Raw setting value.
 * @param {object} env - Environment lookup (e.g. `process.env`).
 * @param {string} description - Label used in error messages.
 * @param {{allowLiteral?: boolean, optionalEnv?: boolean}} [options]
 *   `allowLiteral` accepts env-looking values verbatim; `optionalEnv` turns a
 *   missing env variable into `undefined` instead of an error.
 * @returns {string|undefined}
 * @throws {Error} When `source` is not a string, the env variable is empty,
 *   or a required env variable is not set.
 */
function resolveOptionalString(source, env, description, options) {
  if (source === void 0 || source === null) {
    return void 0;
  }
  if (typeof source !== "string") {
    throw new Error(`${description} must be a string`);
  }
  const trimmed = source.trim();
  if (trimmed.length === 0) {
    return void 0;
  }
  const envValue = env[trimmed];
  // Only string values count as an env hit. A plain-object lookup can surface
  // inherited members such as `toString` or `constructor`, which are not
  // environment values and previously crashed on `.trim()`.
  if (typeof envValue === "string") {
    if (envValue.trim().length === 0) {
      throw new Error(`Environment variable '${trimmed}' for ${description} is empty`);
    }
    return envValue;
  }
  const allowLiteral = options?.allowLiteral ?? false;
  const optionalEnv = options?.optionalEnv ?? false;
  if (!allowLiteral && isLikelyEnvReference(trimmed)) {
    if (optionalEnv) {
      return void 0;
    }
    throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
  }
  return trimmed;
}
919
/**
 * Normalizes an optional literal string setting (no env lookup).
 *
 * @param {unknown} source - Raw setting value.
 * @returns {string|undefined} The trimmed string, or `undefined` when the
 *   value is missing or blank.
 * @throws {Error} When a non-string value is supplied.
 */
function resolveOptionalLiteralString(source) {
  if (source === null || source === void 0) {
    return void 0;
  }
  if (typeof source === "string") {
    const text = source.trim();
    return text.length === 0 ? void 0 : text;
  }
  throw new Error("expected string value");
}
929
/**
 * Resolves an optional numeric setting.
 *
 * Accepts numbers and numeric strings. Missing values, blank strings and
 * non-finite numbers resolve to `undefined`.
 *
 * @param {unknown} source - Raw setting value.
 * @param {string} description - Label used in the error message.
 * @returns {number|undefined}
 * @throws {Error} When the value cannot be interpreted as a finite number.
 */
function resolveOptionalNumber(source, description) {
  if (source === void 0 || source === null) {
    return void 0;
  }
  if (typeof source === "number") {
    return Number.isFinite(source) ? source : void 0;
  }
  if (typeof source === "string") {
    // Trim first: Number("   ") is 0, which would silently turn a blank
    // setting into an explicit zero instead of "not configured".
    const text = source.trim();
    if (text.length === 0) {
      return void 0;
    }
    const numeric = Number(text);
    if (Number.isFinite(numeric)) {
      return numeric;
    }
  }
  throw new Error(`${description} must be a number`);
}
944
/**
 * Resolves an optional boolean setting.
 *
 * Accepts booleans and the strings "true"/"1"/"false"/"0" (case-insensitive,
 * surrounding whitespace ignored). `undefined`, `null` and `""` mean "unset".
 *
 * @param {unknown} source - Raw setting value.
 * @returns {boolean|undefined}
 * @throws {Error} When the value is not a recognised boolean form.
 */
function resolveOptionalBoolean(source) {
  const isUnset = source === void 0 || source === null || source === "";
  if (isUnset) {
    return void 0;
  }
  if (typeof source === "boolean") {
    return source;
  }
  if (typeof source === "string") {
    switch (source.trim().toLowerCase()) {
      case "true":
      case "1":
        return true;
      case "false":
      case "0":
        return false;
      default:
        break;
    }
  }
  throw new Error("expected boolean value");
}
962
/**
 * Heuristic: a value made only of upper-case letters, digits and underscores
 * is treated as the name of an environment variable.
 *
 * @param {string} value - Candidate setting value.
 * @returns {boolean}
 */
function isLikelyEnvReference(value) {
  const envNamePattern = /^[A-Z0-9_]+$/;
  return envNamePattern.test(value);
}
965
+
966
+ // src/evaluation/providers/vscode.ts
967
+ var import_promises3 = require("fs/promises");
968
+ var import_node_os = require("os");
969
+ var import_node_path3 = __toESM(require("path"), 1);
970
+ var import_subagent = require("subagent");
971
// Prefix of the per-request temporary directory that holds the prompt file.
var PROMPT_FILE_PREFIX = "bbeval-vscode-";
/**
 * Provider that delegates a request to a VS Code subagent session.
 *
 * For each request it writes a prompt document into a fresh temporary
 * directory, dispatches a subagent session pointing at that file, reads the
 * response file reported by the session, and finally removes the directory.
 */
var VSCodeProvider = class {
  id;
  kind;
  targetName;
  config;
  /**
   * @param {string} targetName - Name of the configured target.
   * @param {object} config - Resolved VS Code config (command, waitForResponse,
   *   dryRun, subagentRoot, workspaceTemplate).
   * @param {string} kind - Provider kind, used as the prefix of `id`.
   */
  constructor(targetName, config, kind) {
    this.id = `${kind}:${targetName}`;
    this.kind = kind;
    this.targetName = targetName;
    this.config = config;
  }
  /**
   * Dispatches one request to the subagent.
   *
   * @param {object} request - Request with `prompt` and optional `guidelines`,
   *   `attachments`, `testCaseId`, `metadata`, `signal`.
   * @returns {Promise<{text: string, raw: object}>} Response text (empty in
   *   dry-run mode) plus the raw session details.
   * @throws {Error} When the request was already aborted, or the session
   *   reports a non-zero exit code or no response file.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("VS Code provider request was aborted before dispatch");
    }
    const attachments = normalizeAttachments(request.attachments);
    const promptContent = buildPromptDocument(request, attachments);
    const directory = await (0, import_promises3.mkdtemp)(import_node_path3.default.join((0, import_node_os.tmpdir)(), PROMPT_FILE_PREFIX));
    try {
      // The test case id becomes a file name: neutralise path separators and
      // characters that are illegal in file names so an id such as
      // "suite/case-1" or "../x" cannot fail the write or leave the temp dir.
      const fileStem = String(request.testCaseId ?? "request").replace(/[\\/:*?"<>|]/g, "_") || "request";
      const promptPath = import_node_path3.default.join(directory, `${fileStem}.prompt.md`);
      await (0, import_promises3.writeFile)(promptPath, promptContent, "utf8");
      const session = await (0, import_subagent.dispatchAgentSession)({
        userQuery: composeUserQuery(request),
        promptFile: promptPath,
        extraAttachments: attachments,
        wait: this.config.waitForResponse,
        dryRun: this.config.dryRun,
        vscodeCmd: this.config.command,
        subagentRoot: this.config.subagentRoot,
        workspaceTemplate: this.config.workspaceTemplate,
        silent: true
      });
      if (session.exitCode !== 0 || !session.responseFile) {
        throw new Error(session.error ?? "VS Code subagent did not produce a response");
      }
      // Dry runs skip reading the response file and report empty text.
      const text = this.config.dryRun ? "" : await (0, import_promises3.readFile)(session.responseFile, "utf8");
      return {
        text,
        raw: {
          session,
          // This path lies inside the temp directory removed in `finally`.
          promptFile: promptPath,
          attachments
        }
      };
    } finally {
      await (0, import_promises3.rm)(directory, { recursive: true, force: true });
    }
  }
};
1032
/**
 * Renders the markdown prompt document written to the prompt file.
 *
 * Layout: optional mandatory pre-read block, "# BbEval Request" header with
 * optional test case / target bullets, the task, optional guidelines and an
 * optional attachment list.
 *
 * @param {object} request - Request with `prompt` and optional `testCaseId`,
 *   `metadata.target`, `guidelines`.
 * @param {string[]|undefined} attachments - Attachment paths to list.
 * @returns {string} The trimmed document.
 */
function buildPromptDocument(request, attachments) {
  const sections = [];

  const prereadFiles = collectInstructionFiles(attachments);
  if (prereadFiles.length > 0) {
    sections.push(buildMandatoryPrereadBlock(prereadFiles));
  }

  sections.push(`# BbEval Request`);
  if (request.testCaseId) {
    sections.push(`- Test Case: ${request.testCaseId}`);
  }
  if (request.metadata?.target) {
    sections.push(`- Target: ${String(request.metadata.target)}`);
  }

  sections.push("\n## Task\n", request.prompt.trim());

  const guidelineText = request.guidelines ? request.guidelines.trim() : "";
  if (guidelineText.length > 0) {
    sections.push("\n## Guidelines\n", guidelineText);
  }

  if (attachments && attachments.length > 0) {
    const bullets = attachments.map((item) => `- ${item}`);
    sections.push("\n## Attachments\n", bullets.join("\n"));
  }

  return sections.join("\n").trim();
}
1055
/**
 * Builds the "mandatory_pre_read" preamble that tells the agent to read every
 * instruction file and echo one `INSTRUCTIONS_READ` token per file.
 *
 * @param {string[]} instructionFiles - Absolute paths of instruction files.
 * @returns {string} The preamble block, or "" when there are no files.
 */
function buildMandatoryPrereadBlock(instructionFiles) {
  if (instructionFiles.length === 0) {
    return "";
  }
  const baseNames = instructionFiles.map((filePath) => import_node_path3.default.basename(filePath));
  const links = instructionFiles
    .map((filePath, position) => `[${baseNames[position]}](${pathToFileUri(filePath)})`)
    .join(", ");
  // Token counters are 1-based.
  const tokens = baseNames
    .map((baseName, position) => `INSTRUCTIONS_READ: \`${baseName}\` i=${position + 1} SHA256=<hex>`)
    .join("\n");
  const sentences = [
    `Read all instruction files: ${links}.`,
    "After reading each file, compute its SHA256 hash using this PowerShell command:",
    "`Get-FileHash -Algorithm SHA256 -LiteralPath '<file-path>' | Select-Object -ExpandProperty Hash`.",
    "Then include, at the top of your reply, these exact tokens on separate lines:\n",
    tokens,
    "\nReplace `<hex>` with the actual SHA256 hash value computed from the PowerShell command.",
    "If any file is missing, fail with ERROR: missing-file <filename> and stop.\n",
    "Then fetch all documentation required by the instructions before proceeding with your task."
  ];
  return `[[ ## mandatory_pre_read ## ]]\n\n${sentences.join(" ")}\n\n`;
}
1090
/**
 * Picks the attachments that look like instruction/prompt files and returns
 * their resolved absolute paths, de-duplicated in first-seen order.
 *
 * @param {string[]|undefined} attachments - Attachment paths.
 * @returns {string[]}
 */
function collectInstructionFiles(attachments) {
  if (!attachments || attachments.length === 0) {
    return [];
  }
  const seen = /* @__PURE__ */ new Set();
  for (const candidate of attachments) {
    if (isInstructionPath(candidate)) {
      seen.add(import_node_path3.default.resolve(candidate));
    }
  }
  return [...seen];
}
1106
/**
 * Tells whether a path denotes an instruction or prompt file, judged by its
 * suffix (".instructions.md", ".prompt.md") or by an "/instructions/" or
 * "/prompts/" directory segment.
 *
 * @param {string} filePath - Path using the platform separator.
 * @returns {boolean}
 */
function isInstructionPath(filePath) {
  const forwardSlashed = filePath.split(import_node_path3.default.sep).join("/");
  const suffixes = [".instructions.md", ".prompt.md"];
  const folders = ["/instructions/", "/prompts/"];
  return suffixes.some((suffix) => forwardSlashed.endsWith(suffix)) || folders.some((folder) => forwardSlashed.includes(folder));
}
1110
/**
 * Converts a file system path into a `file://` URI.
 *
 * Relative paths are resolved first. Backslashes become forward slashes, and
 * drive-letter paths ("C:/...") get the extra leading slash. The path is
 * percent-encoded so that spaces, "#", "?" and "%" produce a valid URI; the
 * result is embedded in markdown links, where a raw space breaks the link.
 *
 * @param {string} filePath - Absolute or relative path.
 * @returns {string} The file URI.
 */
function pathToFileUri(filePath) {
  const absolutePath = import_node_path3.default.isAbsolute(filePath) ? filePath : import_node_path3.default.resolve(filePath);
  const normalizedPath = absolutePath.replace(/\\/g, "/");
  let encodedPath;
  try {
    // encodeURI keeps "/" and ":" intact but also leaves "?" and "#" alone;
    // those would start a query/fragment, so escape them explicitly.
    encodedPath = encodeURI(normalizedPath).replace(/[?#]/g, (character) => encodeURIComponent(character));
  } catch {
    // encodeURI throws on lone surrogates; fall back to the raw path rather
    // than failing prompt generation.
    encodedPath = normalizedPath;
  }
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
    return `file:///${encodedPath}`;
  }
  return `file://${encodedPath}`;
}
1118
/**
 * Composes the user query text: the trimmed prompt, followed by a
 * "Guidelines:" section when non-blank guidelines are present.
 *
 * @param {{prompt: string, guidelines?: string}} request
 * @returns {string}
 */
function composeUserQuery(request) {
  const promptText = request.prompt.trim();
  const guidelineText = request.guidelines ? request.guidelines.trim() : "";
  if (guidelineText.length === 0) {
    return promptText;
  }
  return `${promptText}\n\nGuidelines:\n\n${guidelineText}`.trim();
}
1126
/**
 * Resolves attachment paths to absolute paths and removes duplicates,
 * keeping first-seen order.
 *
 * @param {string[]|undefined} attachments
 * @returns {string[]|undefined} `undefined` when there are no attachments.
 */
function normalizeAttachments(attachments) {
  if (!attachments || attachments.length === 0) {
    return void 0;
  }
  const absolutePaths = Array.from(attachments, (entry) => import_node_path3.default.resolve(entry));
  return [...new Set(absolutePaths)];
}
1136
/**
 * Provisions subagents for the given VS Code flavour, best effort.
 *
 * Provisioning errors are reported through the return value, not thrown.
 *
 * @param {{kind: string, count: number, verbose?: boolean}} options
 *   `kind` "vscode-insiders" selects the "code-insiders" command, anything
 *   else selects "code".
 * @returns {Promise<{provisioned: boolean, message: string}>}
 */
async function ensureVSCodeSubagents(options) {
  const { kind, count, verbose = false } = options;
  const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
  const subagentRoot = (0, import_subagent.getSubagentRoot)(vscodeCmd);
  try {
    if (verbose) {
      console.log(`Provisioning ${count} subagent(s) via: subagent ${vscodeCmd} provision`);
    }
    const outcome = await (0, import_subagent.provisionSubagents)({
      targetRoot: subagentRoot,
      subagents: count,
      dryRun: false
    });
    const createdCount = outcome.created.length;
    const reusedCount = outcome.skippedExisting.length;
    if (verbose) {
      if (createdCount > 0) {
        console.log(`Created ${createdCount} new subagent(s)`);
      }
      if (reusedCount > 0) {
        console.log(`Reusing ${reusedCount} existing unlocked subagent(s)`);
      }
      console.log(`\ntotal unlocked subagents available: ${createdCount + reusedCount}`);
    }
    return {
      provisioned: true,
      message: `Provisioned ${count} subagent(s): ${createdCount} created, ${reusedCount} reused`
    };
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    if (verbose) {
      console.warn(`Provisioning failed (continuing anyway): ${errorMessage}`);
    }
    return {
      provisioned: false,
      message: `Provisioning failed: ${errorMessage}`
    };
  }
}
1174
+
1175
+ // src/evaluation/providers/targets-file.ts
1176
+ var import_node_fs3 = require("fs");
1177
+ var import_promises4 = require("fs/promises");
1178
+ var import_node_path4 = __toESM(require("path"), 1);
1179
+ var import_yaml2 = require("yaml");
1180
/**
 * Type guard for a non-null, non-array object.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isRecord(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
1183
/**
 * Validates the `version` field of a parsed targets.yaml document.
 *
 * Numbers are used as-is and strings are parsed with `parseFloat`; any other
 * type counts as missing. Versions below 2 are rejected as outdated.
 *
 * @param {object} parsed - Parsed YAML document.
 * @param {string} absolutePath - Path of the file, used in error messages.
 * @returns {void}
 * @throws {Error} When the version is missing, not numeric, or below 2.
 */
function checkVersion(parsed, absolutePath) {
  const rawVersion = parsed.version;
  let version;
  if (typeof rawVersion === "number") {
    version = rawVersion;
  } else if (typeof rawVersion === "string") {
    version = Number.parseFloat(rawVersion);
  }
  if (version === void 0) {
    throw new Error(
      `Missing version field in targets.yaml at ${absolutePath}.\nPlease add 'version: 2.0' at the top of the file.`
    );
  }
  // A non-numeric string parses to NaN, and `NaN < 2` is false, so without
  // this guard an invalid version would pass validation silently.
  if (Number.isNaN(version)) {
    throw new Error(
      `Invalid version field (${JSON.stringify(rawVersion)}) in targets.yaml at ${absolutePath}.\nPlease set 'version: 2.0' at the top of the file.`
    );
  }
  if (version < 2) {
    throw new Error(
      `Outdated targets.yaml format (version ${version}) at ${absolutePath}.\nPlease update to version 2.0 format with 'targets' array.`
    );
  }
}
1198
/**
 * Returns the `targets` array of a parsed targets.yaml document.
 *
 * @param {object} parsed - Parsed YAML document.
 * @param {string} absolutePath - Path of the file, used in the error message.
 * @returns {unknown[]}
 * @throws {Error} When `targets` is not an array.
 */
function extractTargetsArray(parsed, absolutePath) {
  const { targets } = parsed;
  if (Array.isArray(targets)) {
    return targets;
  }
  throw new Error(`targets.yaml at ${absolutePath} must have a 'targets' array`);
}
1205
/**
 * Validates one entry of the `targets` array and returns it in normalized form.
 *
 * @param {unknown} value - Raw entry.
 * @param {number} index - Position of the entry, used in error messages.
 * @param {string} filePath - Path of the file, used in error messages.
 * @returns {{name: string, provider: string, settings: (object|undefined), judge_target: (string|undefined)}}
 * @throws {Error} When the entry is not an object or lacks a non-blank
 *   `name` or `provider`.
 */
function assertTargetDefinition(value, index, filePath) {
  if (!isRecord(value)) {
    throw new Error(`targets.yaml entry at index ${index} in ${filePath} must be an object`);
  }
  const { name, provider, settings, judge_target: judgeTarget } = value;
  const isBlank = (text) => typeof text !== "string" || text.trim().length === 0;
  if (isBlank(name)) {
    throw new Error(`targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`);
  }
  if (isBlank(provider)) {
    throw new Error(`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider'`);
  }
  return {
    name,
    provider,
    // Non-object settings and non-string judge targets are dropped silently.
    settings: isRecord(settings) ? settings : void 0,
    judge_target: typeof judgeTarget === "string" ? judgeTarget : void 0
  };
}
1226
/**
 * Checks whether a path is accessible (F_OK).
 *
 * @param {string} filePath
 * @returns {Promise<boolean>} `false` on any access error.
 */
async function fileExists3(filePath) {
  let reachable = true;
  try {
    await (0, import_promises4.access)(filePath, import_node_fs3.constants.F_OK);
  } catch {
    reachable = false;
  }
  return reachable;
}
1234
/**
 * Loads and validates the target definitions of a targets.yaml file.
 *
 * @param {string} filePath - Path to the file (resolved to absolute).
 * @returns {Promise<object[]>} Normalized target definitions.
 * @throws {Error} When the file is missing, is not a YAML object, fails the
 *   version check, has no `targets` array, or contains an invalid entry.
 */
async function readTargetDefinitions(filePath) {
  const absolutePath = import_node_path4.default.resolve(filePath);
  const present = await fileExists3(absolutePath);
  if (!present) {
    throw new Error(`targets.yaml not found at ${absolutePath}`);
  }
  const contents = await (0, import_promises4.readFile)(absolutePath, "utf8");
  const document = (0, import_yaml2.parse)(contents);
  if (!isRecord(document)) {
    throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with 'version' and 'targets' fields`);
  }
  checkVersion(document, absolutePath);
  return extractTargetsArray(document, absolutePath).map(
    (entry, position) => assertTargetDefinition(entry, position, absolutePath)
  );
}
1249
/**
 * Extracts the names of the given target definitions, in order.
 *
 * @param {Array<{name: string}>} definitions
 * @returns {string[]}
 */
function listTargetNames(definitions) {
  return definitions.map(({ name }) => name);
}
1252
+
1253
+ // src/evaluation/providers/index.ts
1254
/**
 * Instantiates the provider matching a resolved target.
 *
 * @param {{kind: string, name: string, config: object}} target - Resolved target.
 * @returns {object} Provider instance.
 * @throws {Error} When `kind` is not a supported provider kind.
 */
function createProvider(target) {
  const { kind, name, config } = target;
  if (kind === "azure") {
    return new AzureProvider(name, config);
  }
  if (kind === "anthropic") {
    return new AnthropicProvider(name, config);
  }
  if (kind === "gemini") {
    return new GeminiProvider(name, config);
  }
  if (kind === "mock") {
    return new MockProvider(name, config);
  }
  if (kind === "vscode" || kind === "vscode-insiders") {
    // Both VS Code flavours share one provider class; the kind is passed on.
    return new VSCodeProvider(name, config, kind);
  }
  throw new Error(`Unsupported provider kind ${kind}`);
}
1273
/**
 * Resolves a target definition against the environment and creates its provider.
 *
 * @param {object} definition - Raw target definition.
 * @param {object} [env=process.env] - Environment lookup.
 * @returns {object} Provider instance.
 */
function resolveAndCreateProvider(definition, env = process.env) {
  return createProvider(resolveTargetDefinition(definition, env));
}
1277
+
1278
+ // src/evaluation/scoring.ts
1279
// Minimum share of an aspect's key terms that must occur in the candidate
// for the aspect to count as matched.
var KEY_TERM_MATCH_THRESHOLD = 0.5;
// A line starting with one of these words is treated as an expected aspect.
var ACTION_WORDS = /* @__PURE__ */ new Set([
  "use", "avoid", "prefer", "replace",
  "consider", "ensure", "remove", "add"
]);
// Words ignored when extracting key terms from an aspect.
var STOP_WORDS = /* @__PURE__ */ new Set([
  "the", "a", "an",
  "and", "or", "but",
  "in", "on", "at", "to", "for", "of", "with", "by",
  "is", "are", "was", "were", "be", "been", "being",
  "have", "has", "had",
  "do", "does", "did",
  "will", "would", "could", "should"
]);
// Lower-case prefixes that mark a candidate response as an error message.
var ERROR_PREFIXES = [
  "error:",
  "err:",
  "vs code command failed",
  "exception",
  "traceback",
  "no response file was generated",
  "timed out",
  "cli not found"
];
1333
/**
 * Extracts the expected "aspects" from a reference answer.
 *
 * A line contributes an aspect when it is a bullet or numbered item, or when
 * it starts with one of the ACTION_WORDS. Each aspect is normalized and blank
 * results are dropped.
 *
 * @param {string} expectedResponse - Reference answer text.
 * @returns {string[]} Normalized aspects in document order.
 */
function extractAspects(expectedResponse) {
  const bulletPattern = /^([-*•]|[0-9]+\.)\s*(.+)$/;
  const actionWords = Array.from(ACTION_WORDS);
  const aspects = [];
  for (const rawLine of expectedResponse.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line.length === 0) {
      continue;
    }
    const bullet = bulletPattern.exec(line);
    let candidate;
    if (bullet) {
      candidate = bullet[2];
    } else {
      const lowered = line.toLowerCase();
      // NOTE(review): this is a plain prefix test, so e.g. "user ..." also
      // matches the action word "use" — confirm this is intended.
      if (!actionWords.some((word) => lowered.startsWith(word))) {
        continue;
      }
      candidate = line;
    }
    const normalized = normalizeAspect(candidate);
    if (normalized.length > 0) {
      aspects.push(normalized);
    }
  }
  return aspects;
}
1358
/**
 * Returns the expected aspects that the candidate response covers.
 *
 * @param {string} candidateResponse - Model output.
 * @param {string[]} expectedAspects - Normalized aspects.
 * @returns {string[]} Matched aspects, in their original order.
 */
function calculateHits(candidateResponse, expectedAspects) {
  const { normalizedText, words } = normalizeCandidate(candidateResponse);
  return expectedAspects.filter((aspect) => matchesAspect(aspect, normalizedText, words));
}
1368
/**
 * Returns the expected aspects that the candidate response does not cover.
 *
 * @param {string} candidateResponse - Model output.
 * @param {string[]} expectedAspects - Normalized aspects.
 * @param {string[]} [resolvedHits] - Precomputed hits; computed when omitted.
 * @returns {string[]}
 */
function calculateMisses(candidateResponse, expectedAspects, resolvedHits) {
  const matched = new Set(resolvedHits ?? calculateHits(candidateResponse, expectedAspects));
  const isMissed = (aspect) => !matched.has(aspect);
  return expectedAspects.filter(isMissed);
}
1372
/**
 * Scores a candidate response against the expected aspects.
 *
 * Without aspects the score is 1, unless the candidate looks like an error
 * message (score 0 with an explanatory miss). With aspects the score is the
 * fraction of aspects matched.
 *
 * @param {string} candidateResponse - Model output.
 * @param {string[]} expectedAspects - Normalized aspects.
 * @returns {{score: number, hits: string[], misses: string[], hitCount: number, totalAspects: number, rawAspects: string[]}}
 */
function scoreCandidateResponse(candidateResponse, expectedAspects) {
  const total = expectedAspects.length;
  if (total === 0) {
    const failed = isErrorLike(candidateResponse);
    return {
      score: failed ? 0 : 1,
      hits: [],
      misses: failed ? ["Model produced an error instead of an answer."] : [],
      hitCount: 0,
      totalAspects: 0,
      rawAspects: []
    };
  }
  const hits = calculateHits(candidateResponse, expectedAspects);
  const matched = new Set(hits);
  const misses = expectedAspects.filter((aspect) => !matched.has(aspect));
  return {
    score: hits.length / total,
    hits,
    misses,
    hitCount: hits.length,
    totalAspects: total,
    rawAspects: expectedAspects
  };
}
1405
/**
 * Tells whether a response text starts with one of the known error prefixes
 * (case-insensitive, leading whitespace ignored).
 *
 * @param {string|undefined} text
 * @returns {boolean} `false` for empty or missing text.
 */
function isErrorLike(text) {
  if (!text) {
    return false;
  }
  const probe = text.trim().toLowerCase();
  for (const prefix of ERROR_PREFIXES) {
    if (probe.startsWith(prefix)) {
      return true;
    }
  }
  return false;
}
1412
/**
 * Normalizes an aspect: lower-case, punctuation replaced by spaces,
 * whitespace runs collapsed, ends trimmed.
 *
 * @param {string} aspect
 * @returns {string}
 */
function normalizeAspect(aspect) {
  const withoutPunctuation = aspect.toLowerCase().replace(/[^\w\s]/g, " ");
  const singleSpaced = withoutPunctuation.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
1416
/**
 * Prepares a candidate response for matching.
 *
 * @param {string} candidate - Model output.
 * @returns {{normalizedText: string, words: Set<string>}} Lower-cased text
 *   with punctuation replaced by spaces (whitespace is not collapsed), and
 *   the set of its distinct words.
 */
function normalizeCandidate(candidate) {
  const normalizedText = candidate.toLowerCase().replace(/[^\w\s]/g, " ");
  const tokens = normalizedText.split(/\s+/).filter((token) => token.length > 0);
  return { normalizedText, words: new Set(tokens) };
}
1422
/**
 * Decides whether a candidate covers an aspect.
 *
 * An aspect matches when at least KEY_TERM_MATCH_THRESHOLD of its key terms
 * occur as words in the candidate, or when any two adjacent words of the
 * aspect appear as a phrase in the normalized candidate text. Aspects
 * without key terms never match.
 *
 * @param {string} aspect - Normalized aspect.
 * @param {string} candidateNormalized - Normalized candidate text.
 * @param {Set<string>} candidateWords - Distinct words of the candidate.
 * @returns {boolean}
 */
function matchesAspect(aspect, candidateNormalized, candidateWords) {
  const keyTerms = extractKeyTerms(aspect);
  if (keyTerms.length === 0) {
    return false;
  }
  let found = 0;
  for (const term of keyTerms) {
    if (candidateWords.has(term)) {
      found += 1;
    }
  }
  if (found / keyTerms.length >= KEY_TERM_MATCH_THRESHOLD) {
    return true;
  }
  // Fallback: look for any adjacent word pair of the aspect.
  const tokens = aspect.split(" ");
  return tokens
    .slice(0, -1)
    .some((token, position) => candidateNormalized.includes(`${token} ${tokens[position + 1]}`));
}
1443
/**
 * Picks the leading significant words of an aspect: longer than two
 * characters and not a stop word.
 *
 * @param {string} aspect - Normalized, space-separated aspect.
 * @param {number} [maxTerms=5] - Collection stops once this many terms are gathered.
 * @returns {string[]}
 */
function extractKeyTerms(aspect, maxTerms = 5) {
  const selected = [];
  for (const token of aspect.split(" ")) {
    const significant = token.length > 2 && !STOP_WORDS.has(token);
    if (!significant) {
      continue;
    }
    selected.push(token);
    // The limit is checked after pushing, so at least one term is kept.
    if (selected.length >= maxTerms) {
      break;
    }
  }
  return selected;
}
1460
+
1461
+ // src/evaluation/grading.ts
1462
+ var import_node_crypto = require("crypto");
1463
/**
 * Grader that scores a candidate by keyword overlap with the aspects
 * extracted from the reference answer.
 */
var HeuristicGrader = class {
  kind = "heuristic";
  /**
   * @param {{candidate: string, testCase: {expected_assistant_raw: string}}} context
   * @returns {{score: number, hits: string[], misses: string[], expectedAspectCount: number, rawAspects: string[]}}
   */
  grade(context) {
    const aspects = extractAspects(context.testCase.expected_assistant_raw);
    const scored = scoreCandidateResponse(context.candidate, aspects);
    const misses = scored.misses.slice();
    const candidateIsError = aspects.length === 0 && isErrorLike(context.candidate);
    if (candidateIsError) {
      // Put the first line of the error output at the top of the misses.
      const headline = context.candidate.split(/\r?\n/)[0]?.trim();
      if (headline && !misses.includes(headline)) {
        misses.unshift(headline);
      }
    }
    return {
      score: scored.score,
      hits: scored.hits,
      misses,
      expectedAspectCount: scored.totalAspects,
      rawAspects: scored.rawAspects
    };
  }
};
1484
/**
 * Grader that asks a judge provider (an LLM) to rate the candidate and parses
 * its JSON verdict.
 */
var QualityGrader = class {
  kind = "llm_judge";
  resolveJudgeProvider;
  maxOutputTokens;
  temperature;
  /**
   * @param {{resolveJudgeProvider: Function, maxOutputTokens?: number, temperature?: number}} options
   */
  constructor(options) {
    const { resolveJudgeProvider, maxOutputTokens, temperature } = options;
    this.resolveJudgeProvider = resolveJudgeProvider;
    this.maxOutputTokens = maxOutputTokens;
    this.temperature = temperature;
  }
  /**
   * @param {object} context - Grading context (`testCase`, `candidate`,
   *   `attempt`, `target`).
   * @returns {Promise<object>} Score clamped to [0, 1], up to four hits and
   *   four misses, reasoning, and the raw grader request.
   * @throws {Error} When no judge provider can be resolved.
   */
  async grade(context) {
    const judge = await this.resolveJudgeProvider(context);
    if (!judge) {
      throw new Error("No judge provider available for LLM grading");
    }
    const prompt = buildQualityPrompt(context.testCase, context.candidate);
    const response = await judge.invoke({
      prompt,
      metadata: { systemPrompt: QUALITY_SYSTEM_PROMPT },
      testCaseId: context.testCase.id,
      attempt: context.attempt,
      maxOutputTokens: this.maxOutputTokens,
      temperature: this.temperature
    });
    const verdict = parseQualityResponse(response);
    // An unparseable verdict yields no score; treat that as 0.
    const score = clampScore(verdict.score ?? 0);
    const takeFeedback = (items) => Array.isArray(items) ? items.filter(isNonEmptyString).slice(0, 4) : [];
    const hits = takeFeedback(verdict.hits);
    const misses = takeFeedback(verdict.misses);
    const reasoning = verdict.reasoning ?? response.reasoning;
    const graderRawRequest = {
      id: (0, import_node_crypto.randomUUID)(),
      provider: judge.id,
      prompt,
      systemPrompt: QUALITY_SYSTEM_PROMPT,
      target: context.target.name
    };
    return {
      score,
      hits,
      misses,
      // Falls back to 1 when the judge returned no hits and no misses.
      expectedAspectCount: hits.length + misses.length || 1,
      reasoning,
      graderRawRequest
    };
  }
};
1533
// System prompt for the LLM judge: instruction paragraphs separated by blank
// lines, followed by the JSON schema the judge must answer with.
var QUALITY_SYSTEM_PROMPT = [
  [
    "You are an expert evaluator. Your goal is to grade the generated_answer based on how well it achieves the expected_outcome for the original task.",
    "Use the reference_answer as a gold standard for a high-quality response. The generated_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
    "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
    "You must respond with a single JSON object matching this schema:"
  ].join("\n\n"),
  [
    "{",
    ' "score": <number between 0.0 and 1.0>,',
    ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
    ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
    ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
    "}"
  ].join("\n")
].join("\n\n");
1549
/**
 * Builds the judge's user prompt: four tagged sections (expected outcome,
 * request, reference answer, generated answer) and a closing instruction.
 *
 * @param {{outcome: string, task: string, expected_assistant_raw: string}} testCase
 * @param {string} candidate - Generated answer to grade.
 * @returns {string}
 */
function buildQualityPrompt(testCase, candidate) {
  const sections = [
    ["expected_outcome", testCase.outcome],
    ["request", testCase.task],
    ["reference_answer", testCase.expected_assistant_raw],
    ["generated_answer", candidate]
  ];
  // Values stay array elements so join() renders null/undefined as "".
  const lines = sections.flatMap(([tag, body]) => [`[[ ## ${tag} ## ]]`, body, ""]);
  lines.push("Respond with a single JSON object matching the schema described in the system prompt.");
  return lines.join("\n");
}
1567
/**
 * Clamps a score to the range [0, 1].
 *
 * @param {number} value
 * @returns {number} 0 for anything that is not a finite number.
 */
function clampScore(value) {
  if (!Number.isFinite(value)) {
    return 0;
  }
  if (value < 0) {
    return 0;
  }
  return value > 1 ? 1 : value;
}
1579
/**
 * Extracts the judge's verdict from a provider response.
 *
 * The whole text is tried as JSON first, then the outermost `{...}` span
 * found in it. Anything that fails parsing or validation yields `{}`.
 *
 * @param {{text?: unknown}} response - Provider response.
 * @returns {{score?: number, hits?: string[], misses?: string[], reasoning?: string}}
 */
function parseQualityResponse(response) {
  const text = typeof response.text === "string" ? response.text.trim() : "";
  if (text.length === 0) {
    return {};
  }
  const parseValidated = (payload) => {
    const verdict = attemptParseJson(payload);
    return verdict && validateQualityJson(verdict) ? verdict : void 0;
  };
  const direct = parseValidated(text);
  if (direct) {
    return direct;
  }
  const blob = extractJsonBlob(text);
  const embedded = blob ? parseValidated(blob) : void 0;
  return embedded ?? {};
}
1597
/**
 * Parses text as JSON and picks the verdict fields.
 *
 * `score` and `reasoning` are kept only when they have the expected
 * primitive type; `hits` and `misses` are passed through unchecked.
 *
 * @param {string} text
 * @returns {{score: (number|undefined), hits: unknown, misses: unknown, reasoning: (string|undefined)}|undefined}
 *   `undefined` when the text is not valid JSON or parses to `null`.
 */
function attemptParseJson(text) {
  try {
    // Destructuring a parsed `null` throws and lands in the catch below.
    const { score, hits, misses, reasoning } = JSON.parse(text);
    return {
      score: typeof score === "number" ? score : void 0,
      hits,
      misses,
      reasoning: typeof reasoning === "string" ? reasoning : void 0
    };
  } catch {
    return void 0;
  }
}
1609
/**
 * Validates a parsed judge verdict.
 *
 * Requires a finite `score` within [0, 1]. `hits` and `misses`, when present,
 * must be arrays of strings; `reasoning`, when present, must be a string.
 *
 * @param {object} parsed - Candidate verdict.
 * @returns {boolean}
 */
function validateQualityJson(parsed) {
  const { score, hits, misses, reasoning } = parsed;
  const scoreValid = typeof score === "number" && Number.isFinite(score) && score >= 0 && score <= 1;
  if (!scoreValid) {
    return false;
  }
  const isOptionalStringList = (list) => list === void 0 || Array.isArray(list) && list.every((item) => typeof item === "string");
  if (!isOptionalStringList(hits) || !isOptionalStringList(misses)) {
    return false;
  }
  return reasoning === void 0 || typeof reasoning === "string";
}
1640
/**
 * Extracts the span from the first "{" to the last "}" in the text.
 *
 * @param {string} text
 * @returns {string|undefined} The span, or `undefined` when there is none.
 */
function extractJsonBlob(text) {
  const found = text.match(/\{[\s\S]*\}/);
  if (found === null) {
    return void 0;
  }
  return found[0];
}
1644
/**
 * Tells whether a value is a string with non-whitespace content.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isNonEmptyString(value) {
  if (typeof value !== "string") {
    return false;
  }
  return value.trim().length !== 0;
}
1647
+
1648
+ // src/evaluation/orchestrator.ts
1649
+ var import_node_crypto2 = require("crypto");
1650
+ var import_promises5 = require("fs/promises");
1651
+ var import_node_path5 = __toESM(require("path"), 1);
1652
+
1653
+ // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
1654
// Singly linked list cell used by Queue: a value plus a link to the next cell.
var Node = class {
  value;
  next;
  constructor(payload) {
    this.value = payload;
  }
};
1661
/**
 * FIFO queue backed by a singly linked list of Node cells.
 * Iterating the queue does not remove items; `drain()` does.
 */
var Queue = class {
  #front;
  #back;
  #length;
  constructor() {
    this.clear();
  }
  /** Appends a value at the back. */
  enqueue(value) {
    const cell = new Node(value);
    if (this.#front) {
      this.#back.next = cell;
    } else {
      this.#front = cell;
    }
    this.#back = cell;
    this.#length++;
  }
  /** Removes and returns the front value, or `undefined` when empty. */
  dequeue() {
    const leaving = this.#front;
    if (!leaving) {
      return;
    }
    this.#front = leaving.next;
    this.#length--;
    return leaving.value;
  }
  /** Returns the front value without removing it, or `undefined` when empty. */
  peek() {
    return this.#front ? this.#front.value : void 0;
  }
  /** Drops all items. */
  clear() {
    this.#front = void 0;
    this.#back = void 0;
    this.#length = 0;
  }
  /** Number of queued items. */
  get size() {
    return this.#length;
  }
  /** Yields the values front to back without consuming them. */
  *[Symbol.iterator]() {
    for (let cursor = this.#front; cursor; cursor = cursor.next) {
      yield cursor.value;
    }
  }
  /** Yields the values front to back, dequeuing each one. */
  *drain() {
    while (this.#front) {
      yield this.dequeue();
    }
  }
};
1715
+
1716
+ // ../../node_modules/.pnpm/p-limit@6.2.0/node_modules/p-limit/index.js
1717
/**
 * Creates a limiter that runs at most `concurrency` tasks at the same time.
 *
 * The returned function takes `(fn, ...args)` and returns a promise for the
 * result of `fn(...args)`. It also exposes `activeCount`, `pendingCount`,
 * `clearQueue()` and a settable `concurrency`.
 *
 * @param {number} concurrency - Positive integer or `Infinity`.
 * @returns {Function} The limiter.
 * @throws {TypeError} When `concurrency` is invalid.
 */
function pLimit(concurrency) {
  validateConcurrency(concurrency);
  const pending = new Queue();
  let running = 0;
  // Starts the next queued task if a slot is free.
  const startNext = () => {
    if (running < concurrency && pending.size > 0) {
      pending.dequeue()();
      running++;
    }
  };
  // Runs one task, forwards its result promise, then frees the slot.
  const execute = async (task, settle, taskArguments) => {
    const outcome = (async () => task(...taskArguments))();
    settle(outcome);
    try {
      await outcome;
    } catch {
      // The rejection is delivered through `settle`; here we only wait.
    }
    running--;
    startNext();
  };
  const schedule = (task, settle, taskArguments) => {
    // The queued entry is the resolver of a gate promise; releasing it
    // triggers `execute` for this task.
    new Promise((release) => {
      pending.enqueue(release);
    }).then(
      () => execute(task, settle, taskArguments)
    );
    // The capacity check is deferred by one microtask before starting.
    (async () => {
      await Promise.resolve();
      if (running < concurrency) {
        startNext();
      }
    })();
  };
  const limiter = (task, ...taskArguments) => new Promise((settle) => {
    schedule(task, settle, taskArguments);
  });
  Object.defineProperties(limiter, {
    activeCount: {
      get: () => running
    },
    pendingCount: {
      get: () => pending.size
    },
    clearQueue: {
      value() {
        pending.clear();
      }
    },
    concurrency: {
      get: () => concurrency,
      set(newConcurrency) {
        validateConcurrency(newConcurrency);
        concurrency = newConcurrency;
        // Raising the limit may free slots; fill them on the next microtask.
        queueMicrotask(() => {
          while (running < concurrency && pending.size > 0) {
            startNext();
          }
        });
      }
    }
  });
  return limiter;
}
1783
/**
 * Ensures a concurrency limit is a positive integer or `Infinity`.
 *
 * @param {unknown} concurrency
 * @returns {void}
 * @throws {TypeError} When the value is not acceptable.
 */
function validateConcurrency(concurrency) {
  const wholeOrUnbounded = Number.isInteger(concurrency) || concurrency === Number.POSITIVE_INFINITY;
  if (wholeOrUnbounded && concurrency > 0) {
    return;
  }
  throw new TypeError("Expected `concurrency` to be a number from 1 and up");
}
1788
+
1789
+ // src/evaluation/orchestrator.ts
1790
/**
 * Loads the test cases from `options.testFilePath`, runs the selected cases
 * against the primary `target` with bounded concurrency, and returns one
 * result object per selected case (in test-case order).
 *
 * Progress is reported through the optional `onProgress` callback
 * ("pending" for every case up front, then "running" and "completed"/"failed"
 * per case) and every result is forwarded to the optional `onResult` callback.
 * A case whose task rejects is converted into an error result instead of
 * failing the whole run.
 *
 * @param {object} options - Evaluation options (see the destructured fields below);
 *   `options.maxConcurrency` overrides `target.workers` as the concurrency limit.
 * @returns {Promise<object[]>} Result objects; empty when the file yields no test cases.
 * @throws {Error} When `testId` is given but no test case has that id.
 */
async function runEvaluation(options) {
  const {
    testFilePath,
    repoRoot,
    target,
    targets,
    env,
    providerFactory,
    graders,
    maxRetries,
    agentTimeoutMs,
    promptDumpDir,
    cache,
    useCache,
    now,
    testId,
    verbose,
    onResult,
    onProgress
  } = options;
  const testCases = await loadTestCases(testFilePath, repoRoot, { verbose });
  const filteredTestCases = filterTestCases(testCases, testId);
  if (filteredTestCases.length === 0) {
    if (testId) {
      throw new Error(`Test case with id '${testId}' not found in ${testFilePath}`);
    }
    return [];
  }
  // Targets that are already resolved, keyed by name; seeded with the primary target.
  const resolvedTargetsByName = new Map();
  resolvedTargetsByName.set(target.name, target);
  // Raw target definitions that are resolved lazily, only when a judge refers to them.
  const targetDefinitions = new Map();
  for (const definition of targets ?? []) {
    targetDefinitions.set(definition.name, definition);
  }
  const envLookup = env ?? process.env;
  // One provider instance per target name.
  const providerCache = new Map();
  const getOrCreateProvider = (resolved) => {
    const existing = providerCache.get(resolved.name);
    if (existing) {
      return existing;
    }
    const factory = providerFactory ?? createProvider;
    const instance = factory(resolved);
    providerCache.set(resolved.name, instance);
    return instance;
  };
  const resolveTargetByName = (name) => {
    if (resolvedTargetsByName.has(name)) {
      return resolvedTargetsByName.get(name);
    }
    const definition = targetDefinitions.get(name);
    if (!definition) {
      return void 0;
    }
    const resolved = resolveTargetDefinition(definition, envLookup);
    resolvedTargetsByName.set(name, resolved);
    return resolved;
  };
  // Falls back to the provider of the target itself when its judge target is unknown.
  const resolveJudgeProvider = async (targetContext) => {
    const judgeName = targetContext.judgeTarget ?? targetContext.name;
    const resolvedJudge = resolveTargetByName(judgeName);
    if (!resolvedJudge) {
      return getOrCreateProvider(targetContext);
    }
    return getOrCreateProvider(resolvedJudge);
  };
  const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
  const primaryProvider = getOrCreateProvider(target);
  if (onProgress) {
    // The empty case already returned above, so every listed case is announced here.
    for (let i = 0; i < filteredTestCases.length; i++) {
      await onProgress({
        workerId: i + 1,
        testId: filteredTestCases[i].id,
        status: "pending"
      });
    }
  }
  const workers = options.maxConcurrency ?? target.workers ?? 1;
  const limit = pLimit(workers);
  // Worker ids are handed out in the order tasks actually start.
  let nextWorkerId = 1;
  const promises = filteredTestCases.map(
    (testCase) => limit(async () => {
      const workerId = nextWorkerId++;
      if (onProgress) {
        await onProgress({
          workerId,
          testId: testCase.id,
          status: "running",
          startedAt: Date.now()
        });
      }
      try {
        const judgeProvider = await resolveJudgeProvider(target);
        const result = await runTestCase({
          testCase,
          provider: primaryProvider,
          target,
          graders: graderRegistry,
          maxRetries,
          agentTimeoutMs,
          promptDumpDir,
          cache,
          useCache,
          now,
          judgeProvider
        });
        if (onProgress) {
          await onProgress({
            workerId,
            testId: testCase.id,
            status: "completed",
            startedAt: 0,
            // Not used for completed status
            completedAt: Date.now()
          });
        }
        if (onResult) {
          await onResult(result);
        }
        return result;
      } catch (error) {
        if (onProgress) {
          await onProgress({
            workerId,
            testId: testCase.id,
            status: "failed",
            completedAt: Date.now(),
            error: error instanceof Error ? error.message : String(error)
          });
        }
        throw error;
      }
    })
  );
  const settled = await Promise.allSettled(promises);
  const results = [];
  for (let i = 0; i < settled.length; i++) {
    const outcome = settled[i];
    if (outcome.status === "fulfilled") {
      results.push(outcome.value);
      continue;
    }
    const testCase = filteredTestCases[i];
    let promptInputs;
    try {
      promptInputs = await buildPromptInputs(testCase);
    } catch {
      // Building the prompt may be the very step that made this case fail.
      // Report the original failure with an empty request rather than
      // rejecting the whole run and discarding every other result.
      promptInputs = { request: "", guidelines: "" };
    }
    const errorResult = buildErrorResult(
      testCase,
      target.name,
      (now ?? (() => new Date()))(),
      outcome.reason,
      promptInputs
    );
    results.push(errorResult);
    if (onResult) {
      await onResult(errorResult);
    }
  }
  return results;
}
1951
/**
 * Runs a single test case: builds the prompt, obtains a provider response
 * (from the cache when enabled, otherwise by invoking the provider), grades
 * it, and returns the result record.
 *
 * Only timeout-like provider errors are retried, for at most
 * `maxRetries + 1` attempts; any other provider or grader error is turned
 * into an error result rather than thrown. A provider that resolves without
 * a response is reported as an error result as well.
 *
 * @param {object} options - See the destructured fields below.
 * @returns {Promise<object>} The graded result or an error result.
 * @throws {Error} When neither the requested grader kind nor the heuristic grader is registered.
 */
async function runTestCase(options) {
  const {
    testCase,
    provider,
    target,
    graders,
    now,
    maxRetries,
    agentTimeoutMs,
    promptDumpDir,
    cache,
    useCache,
    signal,
    judgeProvider
  } = options;
  const promptInputs = await buildPromptInputs(testCase);
  if (promptDumpDir) {
    await dumpPrompt(promptDumpDir, testCase, promptInputs);
  }
  const cacheKey = useCache ? createCacheKey(provider, target, testCase, promptInputs) : void 0;
  let cachedResponse;
  if (cacheKey && cache) {
    cachedResponse = await cache.get(cacheKey);
  }
  const nowFn = now ?? (() => new Date());
  const attemptBudget = (maxRetries ?? 0) + 1;
  let attempt = 0;
  let providerResponse = cachedResponse;
  let lastError;
  while (!providerResponse && attempt < attemptBudget) {
    try {
      providerResponse = await invokeProvider(provider, {
        testCase,
        target,
        promptInputs,
        attempt,
        agentTimeoutMs,
        signal
      });
    } catch (error) {
      lastError = error;
      if (isTimeoutLike(error) && attempt + 1 < attemptBudget) {
        attempt += 1;
        continue;
      }
      return buildErrorResult(testCase, target.name, nowFn(), error, promptInputs);
    }
    if (!providerResponse) {
      // `attempt` only advances on a timeout, so an empty response would keep
      // this loop spinning forever. Stop and report it instead.
      lastError = new Error("Provider did not return a response");
      break;
    }
  }
  if (!providerResponse) {
    return buildErrorResult(
      testCase,
      target.name,
      nowFn(),
      lastError ?? new Error("Provider did not return a response"),
      promptInputs
    );
  }
  // Only responses that were freshly produced are written back to the cache.
  if (cacheKey && cache && !cachedResponse) {
    await cache.set(cacheKey, providerResponse);
  }
  const graderKind = testCase.grader ?? "heuristic";
  const activeGrader = graders[graderKind] ?? graders.heuristic;
  if (!activeGrader) {
    throw new Error(`No grader registered for kind '${graderKind}'`);
  }
  let grade;
  try {
    const gradeTimestamp = nowFn();
    grade = await activeGrader.grade({
      testCase,
      candidate: providerResponse.text ?? "",
      target,
      provider,
      attempt,
      promptInputs,
      now: gradeTimestamp,
      judgeProvider
    });
  } catch (error) {
    return buildErrorResult(testCase, target.name, nowFn(), error, promptInputs);
  }
  const completedAt = nowFn();
  const rawRequest = {
    request: promptInputs.request,
    guidelines: promptInputs.guidelines,
    guideline_paths: testCase.guideline_paths
  };
  return {
    test_id: testCase.id,
    conversation_id: testCase.conversation_id,
    score: grade.score,
    hits: grade.hits,
    misses: grade.misses,
    model_answer: providerResponse.text ?? "",
    expected_aspect_count: grade.expectedAspectCount,
    target: target.name,
    timestamp: completedAt.toISOString(),
    reasoning: grade.reasoning,
    raw_aspects: grade.rawAspects,
    raw_request: rawRequest,
    grader_raw_request: grade.graderRawRequest
  };
}
2054
/**
 * Narrows the test cases to the one(s) whose `id` equals `testId`.
 *
 * @param {Array<{id: string}>} testCases - All loaded test cases.
 * @param {string} [testId] - Id to select; when falsy the input array is returned as-is.
 * @returns {Array<{id: string}>} The original array, or a new array of the matching cases.
 */
function filterTestCases(testCases, testId) {
  return testId ? testCases.filter(({ id }) => id === testId) : testCases;
}
2060
/**
 * Assembles the grader lookup table.
 *
 * Every entry of `overrides` is kept; the `heuristic` and `llm_judge` slots
 * are filled with default grader instances when the overrides do not supply
 * them. The default LLM judge uses the judge provider carried by the grading
 * context when present and otherwise resolves one from the context's target.
 *
 * @param {object} [overrides] - Caller-supplied graders keyed by kind.
 * @param {(target: object) => Promise<object>} resolveJudgeProvider - Resolver for a target's judge provider.
 * @returns {object} Registry that always has `heuristic` and `llm_judge` entries.
 */
function buildGraderRegistry(overrides, resolveJudgeProvider) {
  const registry = { ...overrides };
  registry.heuristic = overrides?.heuristic ?? new HeuristicGrader();
  registry.llm_judge = overrides?.llm_judge ?? new QualityGrader({
    resolveJudgeProvider: async (context) => context.judgeProvider ? context.judgeProvider : resolveJudgeProvider(context.target)
  });
  return registry;
}
2076
/**
 * Writes the prompt of a test case to a JSON file inside `directory`.
 *
 * The file is named `<ISO timestamp with ":" and "." replaced by "-">_<sanitized test id>.json`;
 * the target directory is created (recursively) when missing.
 *
 * @param {string} directory - Directory that receives the dump.
 * @param {object} testCase - Test case; `id` and `guideline_paths` are read.
 * @param {object} promptInputs - Prompt inputs; `request` and `guidelines` are read.
 * @returns {Promise<void>}
 */
async function dumpPrompt(directory, testCase, promptInputs) {
  const stamp = new Date().toISOString().replace(/[:.]/g, "-");
  const filePath = import_node_path5.default.resolve(
    directory,
    `${stamp}_${sanitizeFilename(testCase.id)}.json`
  );
  await (0, import_promises5.mkdir)(import_node_path5.default.dirname(filePath), { recursive: true });
  const { request, guidelines } = promptInputs;
  const serialized = JSON.stringify(
    {
      test_id: testCase.id,
      request,
      guidelines,
      guideline_paths: testCase.guideline_paths
    },
    null,
    2
  );
  await (0, import_promises5.writeFile)(filePath, serialized, "utf8");
}
2089
/**
 * Turns a test id into a string that is safe to embed in a file name.
 *
 * Every run of characters outside `A-Z a-z 0-9 . _ -` collapses into a single
 * underscore. Non-string ids (for example numeric ids) are converted with
 * `String()` first instead of crashing on a missing `replace` method.
 *
 * @param {unknown} value - Raw identifier.
 * @returns {string} `"prompt"` for a falsy value, the sanitized text otherwise,
 *   or a random UUID when the value stringifies to an empty string.
 */
function sanitizeFilename(value) {
  if (!value) {
    return "prompt";
  }
  const sanitized = String(value).replace(/[^A-Za-z0-9._-]+/g, "_");
  // Only truthy values whose string form is empty (e.g. an empty array) end up here.
  return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
}
2096
/**
 * Invokes the provider for one attempt of a test case.
 *
 * The provider receives a fresh abort signal that fires when
 * `agentTimeoutMs` elapses or when the caller's `signal` aborts (including
 * the case where it is already aborted on entry). The timer and the listener
 * on the caller's signal are always released when the call settles.
 *
 * @param {object} provider - Object exposing `invoke(request)`.
 * @param {object} options - `testCase`, `target`, `promptInputs`, `attempt`,
 *   optional `agentTimeoutMs` (falsy disables the timer) and optional `signal`.
 * @returns {Promise<unknown>} Whatever `provider.invoke` resolves to; its rejections propagate.
 */
async function invokeProvider(provider, options) {
  const { testCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
  const controller = new AbortController();
  const abortInvocation = () => controller.abort();
  const timeout = agentTimeoutMs ? setTimeout(abortInvocation, agentTimeoutMs) : void 0;
  if (signal) {
    if (signal.aborted) {
      // An "abort" event never fires again for a signal that is already aborted.
      controller.abort();
    } else {
      signal.addEventListener("abort", abortInvocation, { once: true });
    }
  }
  try {
    return await provider.invoke({
      prompt: promptInputs.request,
      guidelines: promptInputs.guidelines,
      attachments: testCase.guideline_paths,
      testCaseId: testCase.id,
      attempt,
      metadata: {
        target: target.name,
        grader: testCase.grader
      },
      signal: controller.signal
    });
  } finally {
    if (timeout !== void 0) {
      clearTimeout(timeout);
    }
    if (signal) {
      // Avoid piling up one listener per invocation on a long-lived caller signal.
      signal.removeEventListener("abort", abortInvocation);
    }
  }
}
2122
/**
 * Produces the result record for a test case that could not be evaluated.
 *
 * The record has a score of 0, no hits, and carries the error message in
 * `misses`, `model_answer` and `raw_request.error`.
 *
 * @param {object} testCase - Test case; `id`, `conversation_id` and `guideline_paths` are read.
 * @param {string} targetName - Name of the target that was evaluated.
 * @param {Date} timestamp - Time to stamp on the result.
 * @param {unknown} error - Failure; the message of an `Error`, otherwise its string form, is used.
 * @param {object} promptInputs - Prompt inputs; `request` and `guidelines` are read.
 * @returns {object} Error result record.
 */
function buildErrorResult(testCase, targetName, timestamp, error, promptInputs) {
  let message;
  if (error instanceof Error) {
    message = error.message;
  } else {
    message = String(error);
  }
  const { request, guidelines } = promptInputs;
  return {
    test_id: testCase.id,
    conversation_id: testCase.conversation_id,
    score: 0,
    hits: [],
    misses: [`Error: ${message}`],
    model_answer: `Error occurred: ${message}`,
    expected_aspect_count: 0,
    target: targetName,
    timestamp: timestamp.toISOString(),
    raw_aspects: [],
    raw_request: {
      request,
      guidelines,
      guideline_paths: testCase.guideline_paths,
      error: message
    }
  };
}
2144
/**
 * Derives the cache key for a provider response.
 *
 * The key is the hex SHA-256 digest of, in order: the provider id, the target
 * name, the test case id, the prompt request and the prompt guidelines.
 *
 * @param {{id: string}} provider - Provider whose `id` is hashed.
 * @param {{name: string}} target - Target whose `name` is hashed.
 * @param {{id: string}} testCase - Test case whose `id` is hashed.
 * @param {{request: string, guidelines: string}} promptInputs - Prompt text that is hashed.
 * @returns {string} Hex-encoded SHA-256 digest.
 */
function createCacheKey(provider, target, testCase, promptInputs) {
  return (0, import_node_crypto2.createHash)("sha256")
    .update(provider.id)
    .update(target.name)
    .update(testCase.id)
    .update(promptInputs.request)
    .update(promptInputs.guidelines)
    .digest("hex");
}
2153
/**
 * Decides whether an error looks like a timeout or abort.
 *
 * Returns true for any `AbortError` (a `DOMException` or a plain `Error` with
 * that name), for an `Error` whose name or message contains "timeout"
 * (case-insensitive), and for any other value whose string form contains
 * "timeout". Non-string `name`/`message` values are ignored instead of
 * raising a TypeError.
 *
 * @param {unknown} error - Value that was thrown.
 * @returns {boolean} True when the failure is timeout-like.
 */
function isTimeoutLike(error) {
  if (!error) {
    return false;
  }
  if (typeof DOMException !== "undefined" && error instanceof DOMException && error.name === "AbortError") {
    return true;
  }
  if (error instanceof Error) {
    if (error.name === "AbortError") {
      // Some APIs signal an abort with a plain Error named "AbortError" rather than a DOMException.
      return true;
    }
    const name = typeof error.name === "string" ? error.name.toLowerCase() : "";
    const message = typeof error.message === "string" ? error.message.toLowerCase() : "";
    return name.includes("timeout") || message.includes("timeout");
  }
  return String(error).toLowerCase().includes("timeout");
}
2168
+
2169
+ // src/index.ts
2170
/**
 * Placeholder factory for the agent kernel.
 *
 * @returns {{status: string}} A new object whose `status` is `"stub"`.
 */
function createAgentKernel() {
  const kernel = { status: "stub" };
  return kernel;
}
2173
+ // Annotate the CommonJS export names for ESM import in node:
2174
+ 0 && (module.exports = {
2175
+ GRADER_KINDS,
2176
+ HeuristicGrader,
2177
+ QualityGrader,
2178
+ TEST_MESSAGE_ROLES,
2179
+ buildPromptInputs,
2180
+ calculateHits,
2181
+ calculateMisses,
2182
+ createAgentKernel,
2183
+ createProvider,
2184
+ ensureVSCodeSubagents,
2185
+ extractAspects,
2186
+ extractCodeBlocks,
2187
+ getHitCount,
2188
+ isErrorLike,
2189
+ isGraderKind,
2190
+ isGuidelineFile,
2191
+ isJsonObject,
2192
+ isJsonValue,
2193
+ isTestMessage,
2194
+ isTestMessageRole,
2195
+ listTargetNames,
2196
+ loadTestCases,
2197
+ readTargetDefinitions,
2198
+ resolveAndCreateProvider,
2199
+ resolveTargetDefinition,
2200
+ runEvaluation,
2201
+ runTestCase,
2202
+ scoreCandidateResponse
2203
+ });
2204
+ //# sourceMappingURL=index.cjs.map