@agtlantis/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,3998 @@
1
+ "use strict";
2
// Short aliases for the reflection primitives used by the interop helpers below.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Defines every key of `all` on `target` as an enumerable accessor whose
 * getter is the corresponding function in `all`.
 */
var __export = (target, all) => {
  for (var exportName in all) {
    __defProp(target, exportName, { get: all[exportName], enumerable: true });
  }
};

/**
 * Copies the own properties of `from` onto `to` as live getters, skipping keys
 * `to` already owns and the single key named by `except`. Enumerability follows
 * the source descriptor. Returns `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopySource = from && typeof from === "object" || typeof from === "function";
  if (!isCopySource) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, { get: () => from[key], enumerable: !desc || desc.enumerable });
  }
  return to;
};

/**
 * Wraps a CommonJS module value in an ESM-shaped namespace object.
 *
 * If the importer is in node compatibility mode, or the module was not marked
 * with "__esModule" (i.e. it is not an ESM file converted to CommonJS by a
 * Babel-compatible transform), "default" is set to the module value itself.
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  const namespace = needsDefault ? __defProp(target, "default", { value: mod, enumerable: true }) : target;
  return __copyProps(namespace, mod);
};

/** Produces a fresh object flagged with a non-enumerable `__esModule: true` that mirrors `mod`. */
var __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
29
+
30
+ // src/index.ts
31
+ var index_exports = {};
32
+ __export(index_exports, {
33
+ ANTHROPIC_PRICING: () => import_core10.ANTHROPIC_PRICING,
34
+ CompositeReporter: () => CompositeReporter,
35
+ ConsoleReporter: () => ConsoleReporter,
36
+ DEFAULT_PRICING_CONFIG: () => import_core10.DEFAULT_PRICING_CONFIG,
37
+ EvalError: () => EvalError,
38
+ EvalErrorCode: () => EvalErrorCode,
39
+ GOOGLE_PRICING: () => import_core10.GOOGLE_PRICING,
40
+ JsonReporter: () => JsonReporter,
41
+ MarkdownReporter: () => MarkdownReporter,
42
+ MockProvider: () => import_testing.MockProvider,
43
+ OPENAI_PRICING: () => import_core10.OPENAI_PRICING,
44
+ TestCaseCollection: () => TestCaseCollection,
45
+ accuracy: () => accuracy,
46
+ addCostsToResults: () => addCostsToResults,
47
+ afterTurns: () => afterTurns,
48
+ aggregateIterationResults: () => aggregateIterationResults,
49
+ aiUser: () => aiUser,
50
+ and: () => and,
51
+ applyPromptSuggestions: () => applyPromptSuggestions,
52
+ bumpVersion: () => bumpVersion,
53
+ calculateAvgPassRate: () => calculateAvgPassRate,
54
+ calculateAvgStdDev: () => calculateAvgStdDev,
55
+ calculateCostFromUsage: () => import_core10.calculateCostFromUsage,
56
+ calculateIterationStats: () => calculateIterationStats,
57
+ calculateMultiTurnIterationStats: () => calculateMultiTurnIterationStats,
58
+ calculateReportCosts: () => calculateReportCosts,
59
+ calculateResultCost: () => calculateResultCost,
60
+ checkCondition: () => checkCondition,
61
+ checkCycleCondition: () => checkCycleCondition,
62
+ checkCycleTermination: () => checkCycleTermination,
63
+ checkTermination: () => checkTermination,
64
+ compareReports: () => compareReports,
65
+ compileTemplate: () => import_core9.compileTemplate,
66
+ consistency: () => consistency,
67
+ createCompositeReporter: () => createCompositeReporter,
68
+ createConsoleReporter: () => createConsoleReporter,
69
+ createDefaultReporter: () => createDefaultReporter,
70
+ createEvalSuite: () => createEvalSuite,
71
+ createFilePromptRepository: () => import_core9.createFilePromptRepository,
72
+ createImprover: () => createImprover,
73
+ createJsonReporter: () => createJsonReporter,
74
+ createJudge: () => createJudge,
75
+ createMarkdownReporter: () => createMarkdownReporter,
76
+ createMockAgent: () => createMockAgent,
77
+ createMockImprover: () => createMockImprover,
78
+ createMockJudge: () => createMockJudge,
79
+ createReportRunner: () => createReportRunner,
80
+ createSession: () => createSession,
81
+ customCondition: () => customCondition,
82
+ cycleAnd: () => and2,
83
+ cycleNot: () => not2,
84
+ cycleOr: () => or2,
85
+ cycleToMarkdown: () => cycleToMarkdown,
86
+ defaultHistoryStorage: () => defaultHistoryStorage,
87
+ defineConfig: () => defineConfig,
88
+ deserializePrompt: () => deserializePrompt,
89
+ discoverEvalFiles: () => discoverEvalFiles,
90
+ executeMultiTurnTestCase: () => executeMultiTurnTestCase,
91
+ executeTestCase: () => executeTestCase,
92
+ fieldEquals: () => fieldEquals,
93
+ fieldIsSet: () => fieldIsSet,
94
+ getFieldValue: () => getFieldValue,
95
+ getFileSourceDisplayInfo: () => import_core8.getFileSourceDisplayInfo,
96
+ getFileSourcesDisplayInfo: () => import_core8.getFileSourcesDisplayInfo,
97
+ inferMediaType: () => import_core8.inferMediaType,
98
+ isCustomCondition: () => isCustomCondition,
99
+ isCustomCycleCondition: () => isCustomCycleCondition,
100
+ isCycleTerminated: () => isCycleTerminated,
101
+ isFieldSetCondition: () => isFieldSetCondition,
102
+ isFieldValueCondition: () => isFieldValueCondition,
103
+ isFileSource: () => import_core8.isFileSource,
104
+ isFileSourceBase64: () => import_core8.isFileSourceBase64,
105
+ isFileSourceData: () => import_core8.isFileSourceData,
106
+ isFileSourcePath: () => import_core8.isFileSourcePath,
107
+ isFileSourceUrl: () => import_core8.isFileSourceUrl,
108
+ isIteratedResult: () => isIteratedResult,
109
+ isMaxCostCondition: () => isMaxCostCondition,
110
+ isMaxRoundsCondition: () => isMaxRoundsCondition,
111
+ isMaxTurnsCondition: () => isMaxTurnsCondition,
112
+ isMultiTurnResult: () => isMultiTurnResult,
113
+ isMultiTurnTestCase: () => isMultiTurnTestCase,
114
+ isNoImprovementCondition: () => isNoImprovementCondition,
115
+ isSingleTurnResult: () => isSingleTurnResult,
116
+ isTargetScoreCondition: () => isTargetScoreCondition,
117
+ isTerminated: () => isTerminated,
118
+ loadHistory: () => loadHistory,
119
+ logCycle: () => logCycle,
120
+ maxCost: () => maxCost,
121
+ maxRounds: () => maxRounds,
122
+ mock: () => import_testing.mock,
123
+ naturalLanguage: () => naturalLanguage,
124
+ noImprovement: () => noImprovement,
125
+ not: () => not,
126
+ or: () => or,
127
+ relevance: () => relevance,
128
+ reportToMarkdown: () => reportToMarkdown,
129
+ resolveFileSource: () => import_core8.resolveFileSource,
130
+ resolveFileSourcesInInput: () => import_core8.resolveFileSourcesInInput,
131
+ resumeSession: () => resumeSession,
132
+ runImprovementCycle: () => runImprovementCycle,
133
+ runImprovementCycleAuto: () => runImprovementCycleAuto,
134
+ runWithConcurrency: () => runWithConcurrency,
135
+ saveCycleJson: () => saveCycleJson,
136
+ saveCycleMarkdown: () => saveCycleMarkdown,
137
+ saveHistory: () => saveHistory,
138
+ saveReportMarkdown: () => saveReportMarkdown,
139
+ scanForFileSources: () => import_core8.scanForFileSources,
140
+ schema: () => schema,
141
+ selectRepresentativeResult: () => selectRepresentativeResult,
142
+ serializePrompt: () => serializePrompt,
143
+ suggestionDiff: () => suggestionDiff,
144
+ suggestionPreview: () => suggestionPreview,
145
+ suggestionSummary: () => suggestionSummary,
146
+ targetScore: () => targetScore,
147
+ testCase: () => testCase,
148
+ testCases: () => testCases,
149
+ toEvalAgent: () => toEvalAgent
150
+ });
151
+ module.exports = __toCommonJS(index_exports);
152
+
153
+ // src/core/runner.ts
154
+ var import_core2 = require("@agtlantis/core");
155
+
156
+ // src/multi-turn/types.ts
157
/** Returns true when the condition's `type` is exactly "maxTurns". */
function isMaxTurnsCondition(condition) {
  const kind = condition.type;
  return kind === "maxTurns";
}
160
/** Returns true when the condition's `type` is exactly "fieldSet". */
function isFieldSetCondition(condition) {
  const kind = condition.type;
  return kind === "fieldSet";
}
163
/** Returns true when the condition's `type` is exactly "fieldValue". */
function isFieldValueCondition(condition) {
  const kind = condition.type;
  return kind === "fieldValue";
}
166
/** Returns true when the condition's `type` is exactly "custom". */
function isCustomCondition(condition) {
  const kind = condition.type;
  return kind === "custom";
}
169
/**
 * Returns true when the test case has a "multiTurn" key (own or inherited),
 * regardless of that key's value.
 */
function isMultiTurnTestCase(testCase2) {
  const hasMultiTurnKey = "multiTurn" in testCase2;
  return hasMultiTurnKey;
}
172
/** Returns true only when `result.terminated` is strictly the boolean `true`. */
function isTerminated(result) {
  const flag = result.terminated;
  return flag === true;
}
175
+
176
+ // src/core/errors.ts
177
/**
 * String enum of error codes attached to `EvalError` instances.
 * Every member maps its own name to the identical string value.
 */
var EvalErrorCode = /* @__PURE__ */ (() => {
  const codeNames = [
    "LLM_API_ERROR",
    "LLM_RATE_LIMIT",
    "LLM_TIMEOUT",
    "JSON_PARSE_ERROR",
    "VERDICT_PARSE_ERROR",
    "TEMPLATE_COMPILE_ERROR",
    "AGENT_EXECUTION_ERROR",
    "INVALID_CONFIG",
    "MISSING_API_KEY",
    "PROMPT_NOT_FOUND",
    "PROMPT_INVALID_FORMAT",
    "PROMPT_WRITE_ERROR",
    "PROMPT_READ_ERROR",
    "SUGGESTION_APPLY_ERROR",
    "SCHEMA_VALIDATION_ERROR",
    "SCHEMA_GENERATION_ERROR",
    "FILE_READ_ERROR",
    "FILE_WRITE_ERROR",
    "FILE_TOO_LARGE",
    "CONCURRENT_MODIFICATION",
    "UNKNOWN_ERROR"
  ];
  const codes = {};
  for (const codeName of codeNames) {
    codes[codeName] = codeName;
  }
  return codes;
})();
201
/**
 * Error type carrying a machine-readable `code`, an optional underlying
 * `cause`, and free-form `context` describing where the failure occurred.
 */
var EvalError = class _EvalError extends Error {
  code;
  cause;
  context;
  /**
   * @param {string} message - Human-readable description.
   * @param {{ code?: string, cause?: Error, context?: object }} [options] -
   *   Error details. May be omitted; a missing `code` falls back to
   *   "UNKNOWN_ERROR" so the instance is always well-formed.
   */
  constructor(message, options) {
    super(message);
    // Tolerate a missing options bag: dereferencing it unconditionally would
    // raise a TypeError that hides the message the caller wanted to report.
    const details = options ?? {};
    this.name = "EvalError";
    this.code = details.code ?? "UNKNOWN_ERROR" /* UNKNOWN_ERROR */;
    this.cause = details.cause;
    this.context = details.context;
    if (Error.captureStackTrace) {
      // Drop the constructor frame from the captured stack where supported.
      Error.captureStackTrace(this, _EvalError);
    }
  }
  /**
   * Creates an EvalError from an unknown error with a specific code.
   * An existing EvalError is returned unchanged (its code and context are
   * kept); any other value is wrapped, non-Error values via `String(error)`.
   */
  static from(error, code, context) {
    if (error instanceof _EvalError) {
      return error;
    }
    const cause = error instanceof Error ? error : new Error(String(error));
    return new _EvalError(cause.message, { code, cause, context });
  }
  /** Serializable view of the error; the cause is reduced to its message. */
  toJSON() {
    return {
      name: this.name,
      message: this.message,
      code: this.code,
      context: this.context,
      cause: this.cause?.message
    };
  }
};
235
+
236
+ // src/multi-turn/termination.ts
237
/**
 * Reads a nested value from `obj` by following a dot-separated `fieldPath`
 * (for example "a.b.c").
 *
 * Returns `undefined` when `obj` is null/undefined, or as soon as a step of the
 * path lands on null, undefined, or a non-object value.
 */
function getFieldValue(obj, fieldPath) {
  if (obj == null) {
    return void 0;
  }
  let cursor = obj;
  for (const segment of fieldPath.split(".")) {
    const canDescend = cursor != null && typeof cursor === "object";
    if (!canDescend) {
      return void 0;
    }
    cursor = cursor[segment];
  }
  return cursor;
}
254
/** Returns true for every value except `null` and `undefined`. */
function isSet(value) {
  const isMissing = value === null || value === void 0;
  return !isMissing;
}
257
/**
 * Evaluates a "maxTurns" condition: terminates once `context.currentTurn`
 * has reached `condition.count`.
 *
 * @returns A termination result; when terminated it carries
 *   `terminationType: "maxTurns"` and the matched condition.
 */
function checkMaxTurns(condition, context) {
  const limitReached = context.currentTurn >= condition.count;
  if (!limitReached) {
    return {
      terminated: false,
      reason: `Turn ${context.currentTurn} of ${condition.count}`
    };
  }
  return {
    terminated: true,
    terminationType: "maxTurns",
    matchedCondition: condition,
    reason: `Maximum turns reached (${condition.count})`
  };
}
272
/**
 * Evaluates a "fieldSet" condition: terminates when the field at
 * `condition.fieldPath` of `context.lastOutput` is neither null nor undefined.
 *
 * The value is rendered into the reason text defensively: values that
 * `JSON.stringify` rejects (BigInt, circular structures) must not turn a
 * satisfied condition into a thrown error.
 *
 * @returns A termination result; when terminated it carries
 *   `terminationType: "condition"` and the matched condition.
 */
function checkFieldSet(condition, context) {
  const fieldValue = getFieldValue(context.lastOutput, condition.fieldPath);
  if (!isSet(fieldValue)) {
    return {
      terminated: false,
      reason: `Field "${condition.fieldPath}" is not set`
    };
  }
  let renderedValue;
  try {
    renderedValue = JSON.stringify(fieldValue);
  } catch {
    // Only the human-readable reason depends on serialization; fall back to a
    // coarse description instead of failing the check itself.
    renderedValue = typeof fieldValue === "bigint" ? String(fieldValue) : Object.prototype.toString.call(fieldValue);
  }
  return {
    terminated: true,
    terminationType: "condition",
    matchedCondition: condition,
    reason: `Field "${condition.fieldPath}" is set (value: ${renderedValue})`
  };
}
288
/**
 * Evaluates a "fieldValue" condition: terminates when the field at
 * `condition.fieldPath` of `context.lastOutput` is strictly equal (`===`) to
 * `condition.expectedValue`.
 *
 * On a mismatch the actual value is rendered into the reason text defensively,
 * so values `JSON.stringify` rejects (BigInt, circular structures) produce a
 * normal "not terminated" result instead of an exception.
 *
 * @returns A termination result; when terminated it carries
 *   `terminationType: "condition"` and the matched condition.
 */
function checkFieldValue(condition, context) {
  const fieldValue = getFieldValue(context.lastOutput, condition.fieldPath);
  if (fieldValue === condition.expectedValue) {
    return {
      terminated: true,
      terminationType: "condition",
      matchedCondition: condition,
      reason: `Field "${condition.fieldPath}" equals expected value`
    };
  }
  let renderedValue;
  try {
    renderedValue = JSON.stringify(fieldValue);
  } catch {
    // Serialization only feeds the diagnostic text; never let it abort the check.
    renderedValue = typeof fieldValue === "bigint" ? String(fieldValue) : Object.prototype.toString.call(fieldValue);
  }
  return {
    terminated: false,
    reason: `Field "${condition.fieldPath}" does not equal expected value (got: ${renderedValue})`
  };
}
304
/**
 * Evaluates a "custom" condition by awaiting `condition.check(context)`.
 *
 * A truthy result terminates with `terminationType: "condition"`. A check that
 * throws or rejects is reported as not terminated, with the failure message in
 * the reason. `condition.description` labels the reason and defaults to
 * "Custom condition".
 */
async function checkCustom(condition, context) {
  const label = condition.description ?? "Custom condition";
  try {
    const isMet = await condition.check(context);
    return isMet ? {
      terminated: true,
      terminationType: "condition",
      matchedCondition: condition,
      reason: `${label} met`
    } : {
      terminated: false,
      reason: `${label} not met`
    };
  } catch (failure) {
    const detail = failure instanceof Error ? failure.message : String(failure);
    return {
      terminated: false,
      reason: `${label} failed: ${detail}`
    };
  }
}
328
/**
 * Dispatches a single termination condition to the checker matching its
 * `type` ("maxTurns", "fieldValue", "fieldSet" or "custom").
 *
 * @throws {EvalError} With code UNKNOWN_ERROR when the type is not recognized.
 */
async function checkCondition(condition, context) {
  switch (condition.type) {
    case "maxTurns":
      return checkMaxTurns(condition, context);
    case "fieldValue":
      return checkFieldValue(condition, context);
    case "fieldSet":
      return checkFieldSet(condition, context);
    case "custom":
      return checkCustom(condition, context);
  }
  // Reaching this point means the condition matched none of the known kinds.
  const unrecognized = condition;
  throw new EvalError(`Unknown condition type: ${JSON.stringify(unrecognized)}`, {
    code: "UNKNOWN_ERROR" /* UNKNOWN_ERROR */,
    context: { condition: unrecognized }
  });
}
347
/**
 * Checks the conditions one at a time, in order, and returns the first result
 * that reports termination (OR semantics). Later conditions are not evaluated
 * once one has matched.
 *
 * @returns The matching termination result, or a non-terminated result whose
 *   reason says whether the list was empty or simply unmatched.
 */
async function checkTermination(conditions, context) {
  if (conditions.length === 0) {
    return { terminated: false, reason: "No termination conditions specified" };
  }
  for (let index = 0; index < conditions.length; index++) {
    const outcome = await checkCondition(conditions[index], context);
    if (outcome.terminated) {
      return outcome;
    }
  }
  return { terminated: false, reason: "No termination conditions met" };
}
365
+
366
+ // src/utils/json.ts
367
/**
 * Shortens `str` to its first `maxLength` characters followed by "..." when it
 * is longer than `maxLength`; shorter strings are returned untouched and falsy
 * input yields the empty string.
 */
function truncate(str, maxLength) {
  if (!str) {
    return "";
  }
  const fits = str.length <= maxLength;
  return fits ? str : str.slice(0, maxLength) + "...";
}
376
+
377
+ // src/utils/condition-composites.ts
378
/**
 * Builds an async predicate that is true only when `checkFn` reports
 * termination for every condition. Conditions are evaluated sequentially and
 * evaluation stops at the first one that does not terminate.
 */
function createAndCheck(conditions, checkFn) {
  return async (context) => {
    for (let index = 0; index < conditions.length; index++) {
      const outcome = await checkFn(conditions[index], context);
      if (!outcome.terminated) {
        return false;
      }
    }
    return true;
  };
}
389
/**
 * Builds an async predicate that is true as soon as `checkFn` reports
 * termination for any condition. Conditions are evaluated sequentially and
 * evaluation stops at the first one that terminates.
 */
function createOrCheck(conditions, checkFn) {
  return async (context) => {
    for (let index = 0; index < conditions.length; index++) {
      const outcome = await checkFn(conditions[index], context);
      if (outcome.terminated) {
        return true;
      }
    }
    return false;
  };
}
400
/**
 * Builds an async predicate that inverts the `terminated` flag `checkFn`
 * reports for the given condition.
 */
function createNotCheck(condition, checkFn) {
  return async (context) => {
    const outcome = await checkFn(condition, context);
    const wouldTerminate = outcome.terminated;
    return !wouldTerminate;
  };
}
406
/**
 * Renders a label for a composite condition, e.g. "and(maxTurns, custom)".
 * An empty list is described as never terminating.
 */
function formatCompositeDescription(type, conditions) {
  if (conditions.length !== 0) {
    const memberTypes = conditions.map((member) => member.type);
    return `${type}(${memberTypes.join(", ")})`;
  }
  return `${type}() - empty, never terminates`;
}
412
+
413
+ // src/multi-turn/conditions.ts
414
/**
 * Creates a custom termination condition that asks an LLM whether the
 * conversation should stop.
 *
 * @param options.provider - Object exposing `simpleExecution(fn)`; `fn`
 *   receives a session with `generateText({ messages })`.
 * @param options.prompt - Natural-language description of the stop condition.
 * @param options.systemPrompt - Optional replacement for the built-in system
 *   prompt.
 * @returns A `{ type: "custom", check, description }` condition. `check`
 *   resolves to true when the model's reply starts with the word "yes",
 *   ignoring case, surrounding whitespace and leading quotes or markdown. It
 *   rejects when the execution fails or is canceled.
 */
function naturalLanguage(options) {
  const { provider, prompt, systemPrompt } = options;
  const defaultSystemPrompt = `You are an assistant that evaluates whether a conversation should terminate.
Analyze the conversation history and determine if the specified condition is met.
Respond with ONLY "yes" or "no" - nothing else.`;
  return {
    type: "custom",
    check: async (context) => {
      const historyText = context.history.map(
        (h) => `Turn ${h.turn}:
Input: ${JSON.stringify(h.input)}
Output: ${JSON.stringify(h.output)}`
      ).join("\n\n");
      const userPrompt = `## Termination Condition
${prompt}

## Conversation History
${historyText || "(No history yet)"}

## Current Turn
Turn: ${context.currentTurn}
Last Output: ${JSON.stringify(context.lastOutput)}

Should the conversation terminate based on the condition above? Answer "yes" or "no" only.`;
      const execution = provider.simpleExecution(async (session) => {
        const result = await session.generateText({
          messages: [
            { role: "system", content: systemPrompt ?? defaultSystemPrompt },
            { role: "user", content: userPrompt }
          ]
        });
        return result.text;
      });
      const executionResult = await execution.result();
      if (executionResult.status !== "succeeded") {
        throw executionResult.status === "failed" ? executionResult.error : new Error("Execution was canceled");
      }
      const responseText = executionResult.value;
      // Models often echo the quoted form from the prompt ("yes") or wrap the
      // word in markdown, so drop any leading non-letters before matching. The
      // word boundary keeps replies such as "yesterday ..." from counting.
      const answer = responseText.toLowerCase().trim().replace(/^[^a-z]+/, "");
      return /^yes\b/.test(answer);
    },
    description: `NL: ${truncate(prompt, 50)}`
  };
}
458
/**
 * Combines conditions into one custom condition that terminates only when all
 * of them do. With no conditions the result never terminates.
 */
function and(...conditions) {
  const isEmpty = conditions.length === 0;
  const check = isEmpty ? () => false : createAndCheck(conditions, checkCondition);
  const description = formatCompositeDescription("and", isEmpty ? [] : conditions);
  return { type: "custom", check, description };
}
472
/**
 * Combines conditions into one custom condition that terminates as soon as any
 * of them does. With no conditions the result never terminates.
 */
function or(...conditions) {
  const isEmpty = conditions.length === 0;
  const check = isEmpty ? () => false : createOrCheck(conditions, checkCondition);
  const description = formatCompositeDescription("or", isEmpty ? [] : conditions);
  return { type: "custom", check, description };
}
486
/**
 * Wraps a condition in a custom condition that terminates exactly when the
 * wrapped one does not.
 */
function not(condition) {
  const check = createNotCheck(condition, checkCondition);
  const description = `not(${condition.type})`;
  return { type: "custom", check, description };
}
493
/**
 * Creates a custom condition that terminates once `context.currentTurn` is at
 * least `count`.
 */
function afterTurns(count) {
  const hasReachedTurn = (context) => context.currentTurn >= count;
  return {
    type: "custom",
    check: hasReachedTurn,
    description: `afterTurns(${count})`
  };
}
500
/**
 * Creates a custom condition that terminates when the field at `fieldPath`
 * equals `expectedValue`, by delegating to a "fieldValue" condition check.
 */
function fieldEquals(fieldPath, expectedValue) {
  const check = async (context) => {
    const delegate = { type: "fieldValue", fieldPath, expectedValue };
    const outcome = await checkCondition(delegate, context);
    return outcome.terminated;
  };
  const description = `fieldEquals(${fieldPath}, ${JSON.stringify(expectedValue)})`;
  return { type: "custom", check, description };
}
513
/**
 * Creates a custom condition that terminates when the field at `fieldPath`
 * is set, by delegating to a "fieldSet" condition check.
 */
function fieldIsSet(fieldPath) {
  const check = async (context) => {
    const delegate = { type: "fieldSet", fieldPath };
    const outcome = await checkCondition(delegate, context);
    return outcome.terminated;
  };
  return { type: "custom", check, description: `fieldIsSet(${fieldPath})` };
}
523
+
524
+ // src/multi-turn/runner.ts
525
+ var import_core = require("@agtlantis/core");
526
/** Turn limit applied when a multi-turn test case does not set `maxTurns`. */
var DEFAULT_MAX_TURNS = 10;

/** Outcome used when a termination condition is met and `onConditionMet` is not set. */
var DEFAULT_ON_CONDITION_MET = "pass";

/** Outcome used when the turn limit is hit and `onMaxTurnsReached` is not set. */
var DEFAULT_ON_MAX_TURNS_REACHED = "fail";
529
/**
 * Sums `inputTokens`, `outputTokens` and `totalTokens` across the given usage
 * records and returns the totals as a new object (all zeros for an empty list).
 */
function aggregateTokenUsage(usages) {
  let inputTokens = 0;
  let outputTokens = 0;
  let totalTokens = 0;
  usages.forEach((usage) => {
    inputTokens = inputTokens + usage.inputTokens;
    outputTokens = outputTokens + usage.outputTokens;
    totalTokens = totalTokens + usage.totalTokens;
  });
  return { inputTokens, outputTokens, totalTokens };
}
539
/**
 * Determines the turn cap for a run: the smaller of `safetyLimit` and the
 * `count` of the first "maxTurns" condition, or `safetyLimit` alone when no
 * such condition is listed.
 */
function getEffectiveMaxTurns(conditions, safetyLimit) {
  for (const candidate of conditions) {
    if (candidate.type === "maxTurns") {
      return Math.min(candidate.count, safetyLimit);
    }
  }
  return safetyLimit;
}
546
/**
 * Produces the concrete input for a follow-up turn. A static `input` is
 * returned as is; a function `input` is called with the conversation context
 * and its (possibly asynchronous) result is returned.
 */
async function resolveInput(followUpInput, context) {
  const source = followUpInput.input;
  if (typeof source !== "function") {
    return source;
  }
  const produced = source(context);
  if (produced instanceof Promise) {
    return await produced;
  }
  return produced;
}
554
/**
 * Assembles the context handed to conditions and dynamic inputs: the current
 * turn number, the history so far, and the output of the most recent history
 * entry (`undefined` when the history is empty).
 */
function buildContext(currentTurn, history) {
  let lastOutput = void 0;
  if (history.length > 0) {
    lastOutput = history[history.length - 1].output;
  }
  return { currentTurn, history, lastOutput };
}
561
/**
 * Maps a zero-based follow-up index onto the follow-up definition covering it.
 *
 * Each definition occupies `turns` consecutive indices (default 1). A
 * definition whose `turns` is not finite covers every index from its start
 * onward. Returns `null` when the index lies beyond all definitions.
 */
function getFollowUpInput(followUpInputs, followUpIndex) {
  let rangeStart = 0;
  for (const candidate of followUpInputs) {
    const span = candidate.turns ?? 1;
    const isOpenEnded = !Number.isFinite(span) && followUpIndex >= rangeStart;
    if (isOpenEnded || followUpIndex < rangeStart + span) {
      return candidate;
    }
    rangeStart += span;
  }
  return null;
}
575
/**
 * Validates the `turns` setting of every follow-up input.
 *
 * Entries without `turns` are accepted. Otherwise `turns` must be a number
 * that is at least 1 (Infinity allowed, NaN rejected), and an entry with
 * non-finite `turns` must be the last one, since anything after it could
 * never be reached.
 *
 * @throws {EvalError} With code INVALID_CONFIG on the first invalid entry.
 */
function validateFollowUpInputs(followUpInputs) {
  for (let i = 0; i < followUpInputs.length; i++) {
    const followUp = followUpInputs[i];
    const { turns } = followUp;
    if (turns === void 0) {
      continue;
    }
    // NaN is typeof "number" and fails every `<` comparison, so it needs an
    // explicit check; otherwise it slips through and is later treated like
    // Infinity when follow-ups are selected.
    if (typeof turns !== "number" || Number.isNaN(turns) || turns < 1) {
      throw new EvalError("turns must be a positive number or Infinity", {
        code: "INVALID_CONFIG" /* INVALID_CONFIG */,
        context: {
          description: followUp.description,
          turns
        }
      });
    }
    const isLast = i === followUpInputs.length - 1;
    if (!Number.isFinite(turns) && !isLast) {
      throw new EvalError(
        "turns: Infinity must be the last followUpInput (subsequent items would be unreachable)",
        {
          code: "INVALID_CONFIG" /* INVALID_CONFIG */,
          context: {
            description: followUp.description,
            position: i,
            totalItems: followUpInputs.length
          }
        }
      );
    }
  }
}
605
/**
 * Resolves the input for a given 1-based turn.
 *
 * Turn 1 uses the test case's own input. Later turns use the follow-up
 * definition at index `turn - 2`, resolved against the conversation so far.
 *
 * @returns `{ type: "success", input }`, or `{ type: "exhausted" }` when no
 *   follow-up covers the turn.
 */
async function getTurnInput(turn, testCaseInput, followUpInputs, conversationHistory) {
  const isFirstTurn = turn === 1;
  if (isFirstTurn) {
    return { type: "success", input: testCaseInput };
  }
  const followUp = getFollowUpInput(followUpInputs, turn - 2);
  if (!followUp) {
    return { type: "exhausted" };
  }
  const input = await resolveInput(followUp, buildContext(turn, conversationHistory));
  return { type: "success", input };
}
618
/** Returns true when a turn result is the `{ type: "fileResolutionError" }` marker object. */
function isFileResolutionError(result) {
  if (!("type" in result)) {
    return false;
  }
  return result.type === "fileResolutionError";
}
621
/**
 * Runs one agent turn.
 *
 * File sources in `input` are resolved first, relative to the current working
 * directory. If that fails, a `{ type: "fileResolutionError", reason }` marker
 * is returned and the agent is not called. Otherwise the agent is executed and
 * timed; an execution failure is captured as an EvalError in `error` rather
 * than thrown.
 *
 * @returns `{ output, metadata, latencyMs, error }` or the file-resolution marker.
 */
async function executeSingleTurn(input, agent, testCaseId, turn) {
  let resolvedInput;
  try {
    resolvedInput = await (0, import_core.resolveFileSourcesInInput)(input, {
      basePath: process.cwd()
    });
  } catch (resolutionFailure) {
    const detail = resolutionFailure instanceof Error ? resolutionFailure.message : String(resolutionFailure);
    return {
      type: "fileResolutionError",
      reason: `FileSource resolution failed on turn ${turn}: ${detail}`
    };
  }
  let output;
  let metadata;
  let error;
  const startedAt = performance.now();
  try {
    const agentResult = await agent.execute(resolvedInput);
    output = agentResult.result;
    metadata = agentResult.metadata;
  } catch (executionFailure) {
    const errorContext = { testCaseId, turn, agentName: agent.config.name };
    error = EvalError.from(executionFailure, "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */, errorContext);
  }
  // Latency covers the agent call only, whether it succeeded or failed.
  const latencyMs = performance.now() - startedAt;
  return { output, metadata, latencyMs, error };
}
651
/**
 * Translates how a conversation ended into a pass/fail flag.
 *
 * - not terminated: pass
 * - "error" or "exhausted": fail
 * - "maxTurns": pass only when `onMaxTurnsReached` is "pass"
 * - "condition": pass only when `onConditionMet` is "pass"
 * - any other termination type: pass
 */
function determinePassFromTermination(termination, onConditionMet, onMaxTurnsReached) {
  if (!isTerminated(termination)) {
    return true;
  }
  const endedBy = termination.terminationType;
  if (endedBy === "error" || endedBy === "exhausted") {
    return false;
  }
  if (endedBy === "maxTurns") {
    return onMaxTurnsReached === "pass";
  }
  if (endedBy === "condition") {
    return onConditionMet === "pass";
  }
  return true;
}
667
/**
 * Runs a multi-turn test case against an agent and has the judge evaluate the
 * final output.
 *
 * Turns are executed sequentially until one of these happens: a termination
 * condition matches, the follow-up inputs run out, file resolution or the
 * agent fails, or the effective turn limit is reached. The judge is then
 * called with the original test input and the last recorded output, and the
 * case passes only when both the termination outcome and the judge pass.
 *
 * @param testCase2 - Test case with a `multiTurn` configuration.
 * @param context - `{ agent, judge, agentDescription }`.
 * @param options - Optional `{ signal }`; an aborted signal is detected at the
 *   start of each turn.
 * @throws {EvalError} AGENT_EXECUTION_ERROR when aborted, INVALID_CONFIG when
 *   the follow-up inputs are invalid.
 */
async function executeMultiTurnTestCase(testCase2, context, options) {
  const { agent, judge, agentDescription } = context;
  const { multiTurn } = testCase2;
  const signal = options?.signal;
  const turnLimit = getEffectiveMaxTurns(
    multiTurn.terminateWhen,
    multiTurn.maxTurns ?? DEFAULT_MAX_TURNS
  );
  const onConditionMet = multiTurn.onConditionMet ?? DEFAULT_ON_CONDITION_MET;
  const onMaxTurnsReached = multiTurn.onMaxTurnsReached ?? DEFAULT_ON_MAX_TURNS_REACHED;
  const followUpInputs = multiTurn.followUpInputs ?? [];
  validateFollowUpInputs(followUpInputs);

  const conversationHistory = [];
  const perTurnUsage = [];
  let totalLatencyMs = 0;
  let termination = {
    terminated: false,
    reason: "Execution not started"
  };

  let turn = 1;
  while (turn <= turnLimit) {
    if (signal?.aborted) {
      throw new EvalError("Multi-turn test execution aborted", {
        code: "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */,
        context: { testCaseId: testCase2.id, turn, reason: "aborted" }
      });
    }

    const inputResult = await getTurnInput(turn, testCase2.input, followUpInputs, conversationHistory);
    if (inputResult.type === "exhausted") {
      termination = {
        terminated: true,
        terminationType: "exhausted",
        reason: "All follow-up inputs exhausted"
      };
      break;
    }

    const turnInput = inputResult.input;
    const turnResult = await executeSingleTurn(turnInput, agent, testCase2.id ?? "unknown", turn);
    if (isFileResolutionError(turnResult)) {
      // The turn never reached the agent, so nothing is recorded for it.
      termination = {
        terminated: true,
        terminationType: "error",
        reason: turnResult.reason
      };
      break;
    }

    const agentMetadata = turnResult.metadata;
    const agentError = turnResult.error;
    totalLatencyMs += turnResult.latencyMs;
    perTurnUsage.push(
      agentMetadata?.tokenUsage ?? { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
    );
    // A failed agent turn is still recorded (with an undefined output).
    conversationHistory.push({
      turn,
      input: turnInput,
      output: turnResult.output,
      metadata: agentMetadata
    });

    if (agentError) {
      termination = {
        terminated: true,
        terminationType: "error",
        reason: `Agent execution failed on turn ${turn}: ${agentError.message}`
      };
      break;
    }

    termination = await checkTermination(multiTurn.terminateWhen, buildContext(turn, conversationHistory));
    if (termination.terminated) {
      break;
    }

    if (turn >= turnLimit) {
      termination = {
        terminated: true,
        terminationType: "maxTurns",
        matchedCondition: { type: "maxTurns", count: turnLimit },
        reason: `Maximum turns reached (${turnLimit})`
      };
      break;
    }
    turn++;
  }

  const metrics = {
    latencyMs: totalLatencyMs,
    tokenUsage: aggregateTokenUsage(perTurnUsage)
  };
  const finalOutput = conversationHistory[conversationHistory.length - 1]?.output;
  const judgeResult = await judge.evaluate({
    input: testCase2.input,
    output: finalOutput,
    agentDescription,
    files: testCase2.files
  });
  const passedTermination = determinePassFromTermination(termination, onConditionMet, onMaxTurnsReached);
  return {
    testCase: testCase2,
    output: finalOutput,
    metrics,
    verdicts: judgeResult.verdicts,
    overallScore: judgeResult.overallScore,
    passed: passedTermination && judgeResult.passed,
    judgeMetadata: judgeResult.metadata,
    conversationHistory,
    termination,
    totalTurns: conversationHistory.length
  };
}
791
+
792
+ // src/multi-turn/ai-user.ts
793
/**
 * System prompt used by `aiUser` when the caller supplies none. It instructs
 * the model to play a realistic end user and to reply with the user's message
 * only. Built from one entry per line; empty entries are blank lines.
 */
var DEFAULT_SYSTEM_PROMPT = [
  "You are simulating a realistic user in a conversation with an AI assistant.",
  "",
  "## Your Role",
  "Generate natural, context-appropriate user messages based on the conversation history.",
  "",
  "## Guidelines",
  "",
  "1. **Stay in Character**: Respond as a real user would - with natural language, occasional typos, or casual phrasing when appropriate.",
  "",
  "2. **Be Goal-Oriented**: Users have objectives. Pursue them logically based on the conversation context:",
  "- If the assistant asks a question, provide a reasonable answer",
  "- If clarification is needed, ask for it naturally",
  "- If a task is progressing, guide it toward completion",
  "",
  "3. **React Appropriately**: Respond to what the assistant says:",
  "- Acknowledge when the assistant is helpful",
  "- Express confusion if the response is unclear",
  "- Correct misunderstandings if they occur",
  "",
  "4. **Keep It Realistic**: Real users:",
  "- Don't always provide perfect information upfront",
  "- May change their mind or add requirements",
  "- Sometimes need time to think or decide",
  "",
  "## Output Format",
  "Respond with ONLY the user's message. No additional formatting, explanation, or meta-commentary."
].join("\n");
819
/**
 * Creates a dynamic follow-up input function that has an LLM play the user.
 *
 * @param options.provider - Object exposing `simpleExecution(fn)`; `fn`
 *   receives a session with `generateText({ messages })`.
 * @param options.systemPrompt - Optional string, or function of the context,
 *   replacing the default system prompt.
 * @param options.formatHistory - Optional function rendering the context's
 *   history as text; by default each entry is rendered as a numbered turn with
 *   JSON-encoded user input and assistant output.
 * @param options.buildInput - Called with the generated text and the context;
 *   its return value becomes the next agent input.
 * @returns An async function of the conversation context. It rejects when the
 *   execution fails or is canceled.
 */
function aiUser(options) {
  const { provider, systemPrompt, formatHistory, buildInput } = options;
  const defaultFormatHistory = (ctx) => {
    const blocks = ctx.history.map(
      (entry, index) => [
        `[Turn ${index + 1}]`,
        `User: ${JSON.stringify(entry.input)}`,
        `Assistant: ${JSON.stringify(entry.output)}`
      ].join("\n")
    );
    return blocks.join("\n\n");
  };
  return async (context) => {
    const renderHistory = formatHistory ?? defaultFormatHistory;
    const historyText = renderHistory(context);
    let resolvedSystemPrompt;
    if (typeof systemPrompt === "function") {
      resolvedSystemPrompt = systemPrompt(context);
    } else {
      resolvedSystemPrompt = systemPrompt ?? DEFAULT_SYSTEM_PROMPT;
    }
    // Without any history the model is asked to open the conversation instead.
    const userPrompt = historyText ? [
      "## Conversation History",
      `${historyText}`,
      "",
      "## Your Task",
      "Generate the next user message based on the conversation above:"
    ].join("\n") : [
      "## Your Task",
      "This is the start of a new conversation. Generate an appropriate opening message from the user:"
    ].join("\n");
    const messages = [
      { role: "system", content: resolvedSystemPrompt },
      { role: "user", content: userPrompt }
    ];
    const execution = provider.simpleExecution(async (session) => {
      const generated = await session.generateText({ messages });
      return generated.text;
    });
    const executionResult = await execution.result();
    if (executionResult.status !== "succeeded") {
      throw executionResult.status === "failed" ? executionResult.error : new Error("Execution was canceled");
    }
    return buildInput(executionResult.value, context);
  };
}
852
+
853
+ // src/utils/semaphore.ts
854
/**
 * Creates a counting semaphore that lets at most `limit` holders run at once.
 *
 * `acquire()` resolves immediately while a slot is free and otherwise waits in
 * FIFO order. `release()` frees the caller's slot and, if someone is waiting,
 * hands that slot straight to the oldest waiter.
 */
function createSemaphore(limit) {
  let active = 0;
  const queue = [];
  return {
    async acquire() {
      const slotFree = active < limit;
      if (!slotFree) {
        return new Promise((wake) => {
          queue.push(wake);
        });
      }
      active++;
    },
    release() {
      active--;
      const wakeNext = queue.shift();
      if (!wakeNext) {
        return;
      }
      // The freed slot is re-taken on the waiter's behalf before waking it.
      active++;
      wakeNext();
    }
  };
}
877
+
878
+ // src/core/constants.ts
879
/** Score scale bounds and the pass thresholds used during evaluation. */
var SCORE = {
  // Lowest score that can be assigned.
  MIN: 0,
  // Highest score that can be assigned.
  MAX: 100,
  // Score an evaluation must reach to pass when no threshold is configured.
  DEFAULT_PASS_THRESHOLD: 70,
  // Fraction (50%) used for majority-based pass decisions.
  MAJORITY_PASS_THRESHOLD: 0.5
};

/** Token usage record with every counter at zero, used when no usage is reported. */
var ZERO_TOKEN_USAGE = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
894
+
895
+ // src/core/runner.ts
896
/**
 * Runs one single-turn test case: resolves file sources in the input,
 * executes the agent, then has the judge evaluate the output.
 *
 * @param testCase2 Test case (`id`, `input`, optional `files`).
 * @param context `{ agent, judge, agentDescription }`.
 * @param signal Optional abort signal, checked before the run and before judging.
 * @returns A "single-turn" result. File-resolution and agent failures are
 *   reported as failed results (score 0) rather than thrown.
 * @throws EvalError with context reason "aborted" when the signal is aborted.
 */
async function executeTestCase(testCase2, context, signal) {
  const { agent, judge, agentDescription } = context;
  const abortedError = (message) => new EvalError(message, {
    code: "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */,
    context: { testCaseId: testCase2.id, reason: "aborted" }
  });

  if (signal?.aborted) {
    throw abortedError("Test execution aborted");
  }

  let resolvedInput;
  try {
    resolvedInput = await (0, import_core2.resolveFileSourcesInInput)(testCase2.input, {
      basePath: process.cwd()
    });
  } catch (cause) {
    const fileError = EvalError.from(cause, "FILE_READ_ERROR" /* FILE_READ_ERROR */, {
      testCaseId: testCase2.id,
      agentName: agent.config.name
    });
    return createFailedResult(testCase2, fileError);
  }

  // Latency covers only the agent call, not file resolution or judging.
  const startedAt = performance.now();
  let output;
  let tokenUsage = ZERO_TOKEN_USAGE;
  let error;
  try {
    const agentResult = await agent.execute(resolvedInput);
    output = agentResult.result;
    tokenUsage = agentResult.metadata?.tokenUsage || tokenUsage;
  } catch (cause) {
    error = EvalError.from(cause, "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */, {
      testCaseId: testCase2.id,
      agentName: agent.config.name
    });
    output = void 0;
  }
  const metrics = { latencyMs: performance.now() - startedAt, tokenUsage };
  const base = { kind: "single-turn", testCase: testCase2, output, metrics, error };

  if (error) {
    // The agent failed, so there is nothing to judge.
    return { ...base, verdicts: [], overallScore: 0, passed: false, judgeMetadata: void 0 };
  }
  if (signal?.aborted) {
    throw abortedError("Test execution aborted before evaluation");
  }

  // The judge receives the test case's original input, not the resolved one.
  const judgeResult = await judge.evaluate({
    input: testCase2.input,
    output,
    agentDescription,
    files: testCase2.files
  });
  return {
    ...base,
    verdicts: judgeResult.verdicts,
    overallScore: judgeResult.overallScore,
    passed: judgeResult.passed,
    judgeMetadata: judgeResult.metadata
  };
}
967
/**
 * Builds the single-turn result for a test case that failed before the agent ran.
 *
 * @param testCase2 The test case that could not be executed.
 * @param error The error to attach to the result.
 * @returns A failed result with no output, zero latency/tokens and score 0.
 */
function createFailedResult(testCase2, error) {
  const metrics = { latencyMs: 0, tokenUsage: ZERO_TOKEN_USAGE };
  const failed = {
    kind: "single-turn",
    testCase: testCase2,
    output: void 0,
    metrics,
    error,
    verdicts: [],
    overallScore: 0,
    passed: false,
    judgeMetadata: void 0
  };
  return failed;
}
980
/**
 * Converts a multi-turn execution result into the "multi-turn" result shape.
 * Only the listed fields are copied; `terminationReason` is taken from
 * `termination.reason`.
 *
 * @param result Result produced by the multi-turn executor.
 * @returns A new result object tagged `kind: "multi-turn"`.
 */
function toMultiTurnResult(result) {
  const {
    testCase,
    output,
    metrics,
    verdicts,
    overallScore,
    passed,
    judgeMetadata,
    conversationHistory,
    totalTurns,
    termination
  } = result;
  return {
    kind: "multi-turn",
    testCase,
    output,
    metrics,
    verdicts,
    overallScore,
    passed,
    judgeMetadata,
    conversationHistory,
    totalTurns,
    terminationReason: termination.reason,
    termination
  };
}
996
/**
 * Executes test cases with at most `concurrency` running at the same time.
 *
 * @param testCases2 Test cases to execute.
 * @param context Execution context passed to every test case.
 * @param options `concurrency` (default 1), `stopOnFirstFailure` (default false)
 *   and an optional external abort `signal`.
 * @returns Results of the test cases that completed, in test-case order.
 *   Cases that were skipped or aborted are omitted, so the array can be
 *   shorter than `testCases2`.
 * @throws EvalError (INVALID_CONFIG) when `concurrency` is NaN or below 1.
 * @throws The first non-abort error raised while executing a test case.
 */
async function runWithConcurrency(testCases2, context, options = {}) {
  const { concurrency = 1, stopOnFirstFailure = false, signal } = options;
  // `!(x >= 1)` also rejects NaN. With a NaN limit the semaphore never grants
  // a slot, so every test case would wait forever.
  if (!(concurrency >= 1)) {
    throw new EvalError("Concurrency must be at least 1", {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { concurrency }
    });
  }
  if (testCases2.length === 0) {
    return [];
  }
  const semaphore = createSemaphore(concurrency);
  const results = [];
  let shouldStop = false;
  let firstError;
  // Internal controller: aborted on external abort, on the first failure
  // (when stopOnFirstFailure is set) and on any thrown error.
  const internalAbort = new AbortController();
  const propagateExternalAbort = () => {
    shouldStop = true;
    internalAbort.abort();
  };
  signal?.addEventListener("abort", propagateExternalAbort);
  if (signal?.aborted) {
    // Already aborted before we started: the listener will not fire.
    shouldStop = true;
  }
  try {
    const executeOne = async (testCase2, index) => {
      if (shouldStop) return;
      await semaphore.acquire();
      try {
        // Re-check: the stop flag may have been raised while waiting for a slot.
        if (shouldStop) return;
        const result = await executeTestCaseByType(testCase2, context, internalAbort.signal);
        results[index] = result;
        if (stopOnFirstFailure && !result.passed) {
          shouldStop = true;
          internalAbort.abort();
        }
      } catch (e) {
        // Abort errors are a consequence of stopping, not a failure to report.
        if (!firstError && !isAbortError(e)) {
          firstError = e instanceof Error ? e : new Error(String(e));
        }
        shouldStop = true;
        internalAbort.abort();
      } finally {
        semaphore.release();
      }
    };
    const promises = testCases2.map((tc, i) => executeOne(tc, i));
    await Promise.allSettled(promises);
    if (firstError) {
      throw firstError;
    }
    // Drop the slots of test cases that never produced a result.
    return results.filter((r) => r !== void 0);
  } finally {
    signal?.removeEventListener("abort", propagateExternalAbort);
  }
}
1052
/**
 * Tells whether a thrown value represents a cancellation rather than a failure.
 *
 * Recognised forms:
 * - a `DOMException` named "AbortError" (checked only where the global exists),
 * - any `Error` named "AbortError" (aborts that are not `DOMException` instances),
 * - an `EvalError` whose `context.reason` is "aborted".
 *
 * @param e The caught value.
 * @returns `true` when `e` is an abort error.
 */
function isAbortError(e) {
  const isDomAbort = typeof DOMException !== "undefined" && e instanceof DOMException && e.name === "AbortError";
  const isNamedAbort = e instanceof Error && e.name === "AbortError";
  const isEvalAbort = e instanceof EvalError && e.context?.reason === "aborted";
  return isDomAbort || isNamedAbort || isEvalAbort;
}
1055
/**
 * Dispatches a test case to the single-turn or multi-turn executor.
 *
 * @param testCase2 The test case to run.
 * @param context Execution context forwarded to the executor.
 * @param signal Abort signal forwarded to the executor.
 * @returns The executor's result; multi-turn results are converted with toMultiTurnResult.
 */
async function executeTestCaseByType(testCase2, context, signal) {
  if (!isMultiTurnTestCase(testCase2)) {
    return executeTestCase(testCase2, context, signal);
  }
  const rawResult = await executeMultiTurnTestCase(testCase2, context, { signal });
  return toMultiTurnResult(rawResult);
}
1062
+
1063
+ // src/core/types.ts
1064
/**
 * Wraps an agent in a minimal evaluation-facing shape.
 *
 * @param agent Object with `config.name`, `config.description`, `prompt` and `execute`.
 * @returns `{ config: { name, description }, prompt, execute }`, where `execute`
 *   forwards `(input, options)` to the agent and returns only `result` and `metadata`.
 */
function toEvalAgent(agent) {
  const { name, description } = agent.config;

  async function execute(input, options) {
    const outcome = await agent.execute(input, options);
    return { result: outcome.result, metadata: outcome.metadata };
  }

  return {
    config: { name, description },
    prompt: agent.prompt,
    execute
  };
}
1080
/** Returns true for results whose kind is "single-turn" or "single-turn-iterated". */
function isSingleTurnResult(result) {
  switch (result.kind) {
    case "single-turn":
    case "single-turn-iterated":
      return true;
    default:
      return false;
  }
}
1083
/** Returns true for results whose kind is "multi-turn" or "multi-turn-iterated". */
function isMultiTurnResult(result) {
  switch (result.kind) {
    case "multi-turn":
    case "multi-turn-iterated":
      return true;
    default:
      return false;
  }
}
1086
/** Returns true for results whose kind is "single-turn-iterated" or "multi-turn-iterated". */
function isIteratedResult(result) {
  switch (result.kind) {
    case "single-turn-iterated":
    case "multi-turn-iterated":
      return true;
    default:
      return false;
  }
}
1089
+
1090
+ // src/core/iteration.ts
1091
/**
 * Summarises the scores of repeated runs of one test case.
 *
 * @param results Results exposing `overallScore` and `passed`.
 * @returns `{ iterations, scores, mean, stdDev, min, max, passRate, passCount }`;
 *   every field is zero/empty for an empty input. `stdDev` is the population
 *   standard deviation (divides by the number of scores).
 */
function calculateIterationStats(results) {
  const iterations = results.length;
  if (iterations === 0) {
    return { iterations: 0, scores: [], mean: 0, stdDev: 0, min: 0, max: 0, passRate: 0, passCount: 0 };
  }

  const scores = [];
  let passCount = 0;
  let scoreTotal = 0;
  for (const result of results) {
    const score = result.overallScore;
    scores.push(score);
    scoreTotal += score;
    if (result.passed) {
      passCount += 1;
    }
  }
  const mean = scoreTotal / scores.length;

  let squaredDeviationTotal = 0;
  for (const score of scores) {
    squaredDeviationTotal += Math.pow(score - mean, 2);
  }
  const stdDev = Math.sqrt(squaredDeviationTotal / scores.length);

  return {
    iterations,
    scores,
    mean,
    stdDev,
    min: Math.min(...scores),
    max: Math.max(...scores),
    passRate: passCount / iterations,
    passCount
  };
}
1120
/**
 * Extends the base iteration statistics with multi-turn figures.
 *
 * @param results Multi-turn results exposing `totalTurns` and `termination.terminationType`.
 * @returns The base stats plus `avgTurns`, `minTurns`, `maxTurns` (all 0 for an
 *   empty input) and `terminationCounts`, a tally keyed by termination type.
 *   Results with a falsy termination type are not counted.
 */
function calculateMultiTurnIterationStats(results) {
  const baseStats = calculateIterationStats(results);
  const turns = results.map((r) => r.totalTurns);

  const terminationCounts = {};
  results.forEach((r) => {
    const type = r.termination.terminationType;
    if (!type) {
      return;
    }
    terminationCounts[type] = (terminationCounts[type] || 0) + 1;
  });

  let avgTurns = 0;
  let minTurns = 0;
  let maxTurns = 0;
  if (turns.length > 0) {
    avgTurns = turns.reduce((a, b) => a + b, 0) / turns.length;
    minTurns = Math.min(...turns);
    maxTurns = Math.max(...turns);
  }

  return { ...baseStats, avgTurns, minTurns, maxTurns, terminationCounts };
}
1138
/**
 * Picks the result whose `overallScore` is closest to `mean`.
 * On ties the earliest result wins.
 *
 * @param results Non-empty list of results.
 * @param mean Target score.
 * @returns The closest result (the same object, not a copy).
 * @throws EvalError (UNKNOWN_ERROR) when `results` is empty.
 */
function selectRepresentativeResult(results, mean) {
  if (results.length === 0) {
    throw new EvalError("Cannot select representative result from empty array", {
      code: "UNKNOWN_ERROR" /* UNKNOWN_ERROR */
    });
  }
  let closest = results[0];
  for (let i = 1; i < results.length; i++) {
    const candidate = results[i];
    const closestDiff = Math.abs(closest.overallScore - mean);
    const candidateDiff = Math.abs(candidate.overallScore - mean);
    if (candidateDiff < closestDiff) {
      closest = candidate;
    }
  }
  return closest;
}
1150
/**
 * Merges the per-iteration result lists into one iterated result per test case.
 *
 * Results are grouped by position: entry `i` of every iteration is treated as
 * the same test case. Iterations may have different lengths, because
 * runWithConcurrency omits test cases that were skipped or aborted. Missing
 * entries are ignored, and a position with no results at all is skipped.
 * NOTE(review): since runWithConcurrency compacts its output, positions may
 * not refer to the same test case across iterations after an early stop.
 * Confirm whether grouping by test case would be preferable.
 *
 * @param allIterationResults One array of results per iteration.
 * @returns One "single-turn-iterated" or "multi-turn-iterated" result per
 *   position. `overallScore` is the mean score, `passed` is decided by the
 *   majority threshold, and the remaining fields come from the result closest
 *   to the mean.
 */
function aggregateIterationResults(allIterationResults) {
  if (allIterationResults.length === 0) {
    return [];
  }
  // Use the longest iteration so a short first run does not hide later results.
  const testCount = Math.max(...allIterationResults.map((iteration) => iteration.length));
  const aggregated = [];
  for (let i = 0; i < testCount; i++) {
    // Shorter iterations contribute nothing at this position.
    const resultsForTestCase = allIterationResults.map((iteration) => iteration[i]).filter((r) => r !== void 0);
    if (resultsForTestCase.length === 0) {
      continue;
    }
    const stats = calculateIterationStats(resultsForTestCase);
    const representative = selectRepresentativeResult(resultsForTestCase, stats.mean);
    const isMultiTurn = resultsForTestCase.some((r) => isMultiTurnResult(r));
    const passedByMajority = stats.passRate >= SCORE.MAJORITY_PASS_THRESHOLD;
    if (isMultiTurn) {
      const multiTurnResults = resultsForTestCase.filter(
        (r) => isMultiTurnResult(r)
      );
      const multiTurnRep = representative;
      const aggregatedResult = {
        kind: "multi-turn-iterated",
        testCase: multiTurnRep.testCase,
        output: multiTurnRep.output,
        metrics: multiTurnRep.metrics,
        verdicts: multiTurnRep.verdicts,
        error: multiTurnRep.error,
        overallScore: stats.mean,
        passed: passedByMajority,
        iterationStats: stats,
        iterationResults: resultsForTestCase,
        conversationHistory: multiTurnRep.conversationHistory,
        totalTurns: multiTurnRep.totalTurns,
        terminationReason: multiTurnRep.terminationReason,
        termination: multiTurnRep.termination,
        multiTurnIterationStats: calculateMultiTurnIterationStats(multiTurnResults)
      };
      aggregated.push(aggregatedResult);
    } else {
      const aggregatedResult = {
        kind: "single-turn-iterated",
        testCase: representative.testCase,
        output: representative.output,
        metrics: representative.metrics,
        verdicts: representative.verdicts,
        error: representative.error,
        overallScore: stats.mean,
        passed: passedByMajority,
        iterationStats: stats,
        iterationResults: resultsForTestCase
      };
      aggregated.push(aggregatedResult);
    }
  }
  return aggregated;
}
1203
/** Keeps only results whose kind is "single-turn-iterated" or "multi-turn-iterated". */
function filterIteratedResults(results) {
  const isIterated = ({ kind }) => kind === "single-turn-iterated" || kind === "multi-turn-iterated";
  return results.filter((r) => isIterated(r));
}
1208
/**
 * Averages one iteration statistic over all iterated results.
 *
 * @param results Mixed list of results; non-iterated ones are ignored.
 * @param selector Picks the number to average from a result's `iterationStats`.
 * @returns The average, or `undefined` when there are no iterated results.
 */
function averageIterationStat(results, selector) {
  const iteratedResults = filterIteratedResults(results);
  const count = iteratedResults.length;
  if (count === 0) {
    return void 0;
  }
  let total = 0;
  for (const r of iteratedResults) {
    total += selector(r.iterationStats);
  }
  return total / count;
}
1216
/** Average `stdDev` across iterated results; `undefined` when there are none. */
function calculateAvgStdDev(results) {
  const pickStdDev = (stats) => stats.stdDev;
  return averageIterationStat(results, pickStdDev);
}
1219
/** Average `passRate` across iterated results; `undefined` when there are none. */
function calculateAvgPassRate(results) {
  const pickPassRate = (stats) => stats.passRate;
  return averageIterationStat(results, pickPassRate);
}
1222
+
1223
+ // src/core/suite.ts
1224
/**
 * Aggregates latency and token usage over a list of results.
 *
 * @param results Results exposing `metrics.latencyMs` and `metrics.tokenUsage.totalTokens`.
 * @returns `{ avgLatencyMs, totalTokens }`; both are 0 for an empty list.
 */
function calculateAggregatedMetrics(results) {
  const count = results.length;
  if (count === 0) {
    return { avgLatencyMs: 0, totalTokens: 0 };
  }
  const latencyOf = (r) => r.metrics.latencyMs;
  const tokensOf = (r) => r.metrics.tokenUsage.totalTokens;
  const totalLatencyMs = sumBy(results, latencyOf);
  const totalTokens = sumBy(results, tokensOf);
  return { avgLatencyMs: totalLatencyMs / count, totalTokens };
}
1235
/**
 * Sums the numbers a selector extracts from each item.
 *
 * @param items Items to visit.
 * @param selector Returns the number contributed by an item.
 * @returns The total, or 0 for an empty list.
 */
function sumBy(items, selector) {
  let total = 0;
  for (const item of items) {
    total += selector(item);
  }
  return total;
}
1238
/**
 * Builds the report summary for a set of results.
 *
 * @param results Final results (one per test case).
 * @param iterations Number of iterations run; iteration fields are added only when > 1.
 * @returns `{ totalTests, passed, failed, avgScore, metrics }`, plus
 *   `iterations`, `avgStdDev` and `avgPassRate` for multi-iteration runs.
 */
function calculateSummary(results, iterations) {
  const metrics = calculateAggregatedMetrics(results);
  const totalTests = results.length;

  let passedCount = 0;
  for (const r of results) {
    if (r.passed) {
      passedCount += 1;
    }
  }

  let avgScore = 0;
  if (totalTests > 0) {
    avgScore = sumBy(results, (r) => r.overallScore) / totalTests;
  }

  const summary = {
    totalTests,
    passed: passedCount,
    failed: totalTests - passedCount,
    avgScore,
    metrics
  };

  if (iterations && iterations > 1) {
    summary.iterations = iterations;
    summary.avgStdDev = calculateAvgStdDev(results);
    summary.avgPassRate = calculateAvgPassRate(results);
  }
  return summary;
}
1258
/**
 * Creates an evaluation suite bound to an agent and a judge.
 *
 * @param config `{ agent, judge, agentDescription?, improver? }`.
 * @returns A suite with:
 *   - `run(testCases, options)`: executes the test cases (once, or
 *     `options.iterations` times) and returns
 *     `{ summary, results, suggestions, generatedAt, promptVersion }`;
 *   - `withAgent(newAgent)`: a new suite with the same config but another
 *     agent. `agentDescription` is cleared so it is derived from the new agent.
 */
function createEvalSuite(config) {
  const { agent, agentDescription, judge, improver } = config;
  // Description fallback chain: explicit value, agent description, agent name.
  const description = agentDescription ?? agent.config.description ?? agent.config.name;

  async function run(testCases2, options) {
    const iterations = options?.iterations ?? 1;
    validateIterations(iterations);
    const executeContext = { agent, judge, agentDescription: description };

    let results;
    if (iterations <= 1) {
      results = await runWithConcurrency(testCases2, executeContext, options);
    } else {
      results = await runMultipleIterations(testCases2, executeContext, options, iterations);
    }

    const summary = calculateSummary(results, iterations > 1 ? iterations : void 0);

    let suggestions = [];
    if (improver) {
      const improvement = await improver.improve(agent.prompt, results);
      suggestions = improvement.suggestions;
    }

    return {
      summary,
      results,
      suggestions,
      generatedAt: /* @__PURE__ */ new Date(),
      promptVersion: agent.prompt.version
    };
  }

  function withAgent(newAgent) {
    return createEvalSuite({ ...config, agent: newAgent, agentDescription: void 0 });
  }

  const suite = { run, withAgent };
  return suite;
}
1287
/**
 * Ensures the iteration count is a positive integer.
 *
 * @param iterations Value to check.
 * @throws EvalError (INVALID_CONFIG) when the value is not an integer >= 1.
 */
function validateIterations(iterations) {
  const isPositiveInteger = Number.isInteger(iterations) && iterations >= 1;
  if (isPositiveInteger) {
    return;
  }
  throw new EvalError(
    `Invalid iterations value: ${iterations}. Must be a positive integer.`,
    { code: "INVALID_CONFIG" /* INVALID_CONFIG */, context: { iterations } }
  );
}
1295
/**
 * Runs the whole test-case list `iterations` times, one run after another,
 * and aggregates the runs into iterated results.
 *
 * @param testCases2 Test cases to execute in every iteration.
 * @param executeContext Context forwarded to runWithConcurrency.
 * @param options Run options; `iterations` is blanked out before forwarding.
 * @param iterations Number of runs.
 * @returns The output of aggregateIterationResults over all runs.
 */
async function runMultipleIterations(testCases2, executeContext, options, iterations) {
  const allIterationResults = [];
  let completed = 0;
  while (completed < iterations) {
    const perRunOptions = { ...options, iterations: void 0 };
    // Runs are awaited one at a time; each run applies its own concurrency.
    const runResults = await runWithConcurrency(testCases2, executeContext, perRunOptions);
    allIterationResults.push(runResults);
    completed += 1;
  }
  return aggregateIterationResults(allIterationResults);
}
1307
+
1308
+ // src/index.ts
1309
+ var import_core8 = require("@agtlantis/core");
1310
+
1311
+ // src/judge/llm-judge.ts
1312
+ var import_ai = require("ai");
1313
+ var import_zod = require("zod");
1314
+
1315
+ // src/judge/prompts/default.ts
1316
+ var defaultJudgePrompt = {
1317
+ id: "default-judge",
1318
+ version: "2.0.0",
1319
+ system: `You are an expert evaluator specializing in assessing AI Agent outputs.
1320
+
1321
+ Your role is to fairly and thoroughly evaluate the agent's output against the provided criteria.
1322
+
1323
+ ## Evaluation Principles
1324
+
1325
+ 1. **Scoring**: Assign a score between 0-100 for each criterion
1326
+ - 90-100: Exceptional - Exceeds expectations with no significant issues
1327
+ - 70-89: Good - Meets expectations with minor issues
1328
+ - 50-69: Acceptable - Partially meets expectations, notable issues present
1329
+ - 30-49: Poor - Falls short of expectations, significant issues
1330
+ - 0-29: Failing - Does not meet minimum requirements
1331
+
1332
+ 2. **Reasoning**: Always provide specific, evidence-based reasoning
1333
+ - Quote or reference specific parts of the output
1334
+ - Explain both strengths and weaknesses
1335
+ - Be constructive and actionable in feedback
1336
+
1337
+ 3. **Objectivity**: Evaluate based solely on the criteria provided
1338
+ - Avoid personal preferences or unstated requirements
1339
+ - Consider the agent's intended purpose and context
1340
+ - Weight severity of issues proportionally
1341
+
1342
+ ## Response Format
1343
+
1344
+ You MUST respond with valid JSON only. No additional text or explanation outside the JSON structure.
1345
+
1346
+ {
1347
+ "verdicts": [
1348
+ {
1349
+ "criterionId": "criterion-id",
1350
+ "score": 0-100,
1351
+ "reasoning": "Detailed explanation with specific evidence from the output",
1352
+ "passed": true/false
1353
+ }
1354
+ ]
1355
+ }`,
1356
// Renders the user message for the judge: agent description, input JSON,
// optional reference files, output JSON and one bullet per criterion.
// Surrounding whitespace is trimmed from the final text.
renderUserPrompt: (ctx) => {
  const fileSection = buildFileSection(ctx.files);
  const describeCriterion = (c) => `- **${c.name}** (id: ${c.id}, weight: ${c.weight ?? 1}): ${c.description}`;
  const prompt = `
## Agent Under Evaluation
${ctx.agentDescription}

## Input Provided to Agent
\`\`\`json
${JSON.stringify(ctx.input, null, 2)}
\`\`\`
${fileSection}
## Agent Output
\`\`\`json
${JSON.stringify(ctx.output, null, 2)}
\`\`\`

## Evaluation Criteria
${ctx.criteria.map(describeCriterion).join("\n")}

Please evaluate the agent's output against each criterion listed above.`;
  return prompt.trim();
}
1377
+ };
1378
/**
 * Renders the "Reference Files" section of the judge prompt.
 *
 * @param files Optional list of `{ path, content }`.
 * @returns An empty string when there are no files. Otherwise a block that
 *   starts and ends with a newline and holds one fenced section per file.
 */
function buildFileSection(files) {
  if (!files || files.length === 0) {
    return "";
  }
  const fence = "```";
  const sections = files.map((file) => `### ${file.path}\n${fence}\n${file.content}\n${fence}`);
  return `\n## Reference Files\n${sections.join("\n\n")}\n`;
}
1390
+
1391
+ // src/judge/llm-judge.ts
1392
/**
 * Normalises a usage object whose token counts may be missing.
 *
 * @param usage Object with optional `inputTokens`, `outputTokens`, `totalTokens`.
 * @returns `{ inputTokens, outputTokens, totalTokens }` with numbers only.
 *   Missing input/output counts become 0. A missing total becomes the sum of
 *   input and output, so known usage is not reported as 0.
 */
function toEvalTokenUsage(usage) {
  const inputTokens = usage.inputTokens ?? 0;
  const outputTokens = usage.outputTokens ?? 0;
  return {
    inputTokens,
    outputTokens,
    totalTokens: usage.totalTokens ?? inputTokens + outputTokens
  };
}
1399
/** Returns true when the criterion carries a `validator` property that is a function. */
function hasValidator(criterion) {
  if (!("validator" in criterion)) {
    return false;
  }
  return typeof criterion.validator === "function";
}
1402
// Shape the judge model must return: a list of verdicts, each with a
// criterion id, a score within [SCORE.MIN, SCORE.MAX], reasoning text and an
// optional explicit pass flag.
var JudgeResponseSchema = (() => {
  const verdictSchema = import_zod.z.object({
    criterionId: import_zod.z.string(),
    score: import_zod.z.number().min(SCORE.MIN).max(SCORE.MAX),
    reasoning: import_zod.z.string(),
    passed: import_zod.z.boolean().optional()
  });
  return import_zod.z.object({
    verdicts: import_zod.z.array(verdictSchema)
  });
})();
1412
/**
 * Verifies that every expected criterion id has at least one verdict.
 *
 * @param verdicts Verdicts exposing `criterionId`.
 * @param criteriaIds Ids that must be covered.
 * @throws EvalError (VERDICT_PARSE_ERROR) listing the missing and provided ids.
 */
function validateAllCriteriaHaveVerdicts(verdicts, criteriaIds) {
  const providedIds = new Set();
  for (const verdict of verdicts) {
    providedIds.add(verdict.criterionId);
  }

  const missingIds = [];
  for (const id of criteriaIds) {
    if (!providedIds.has(id)) {
      missingIds.push(id);
    }
  }

  if (missingIds.length === 0) {
    return;
  }
  throw new EvalError("Judge response missing verdicts for some criteria", {
    code: "VERDICT_PARSE_ERROR" /* VERDICT_PARSE_ERROR */,
    context: { missingCriteriaIds: missingIds, providedIds: [...providedIds] }
  });
}
1422
/**
 * Computes the weighted average of verdict scores, rounded to two decimals.
 *
 * @param verdicts Verdicts exposing `criterionId` and `score`.
 * @param criteriaWeights Map from criterion id to weight; unknown ids weigh 1.
 * @returns The weighted average, or 0 when the total weight is 0.
 */
function calculateOverallScore(verdicts, criteriaWeights) {
  let weightedSum = 0;
  let totalWeight = 0;
  verdicts.forEach(({ criterionId, score }) => {
    const weight = criteriaWeights.get(criterionId) ?? 1;
    weightedSum += score * weight;
    totalWeight += weight;
  });
  if (totalWeight === 0) {
    return 0;
  }
  const average = weightedSum / totalWeight;
  return Math.round(average * 100) / 100;
}
1435
/**
 * Evaluates the programmatic (validator-based) criteria against an output.
 *
 * @param validatorCriteria Criteria exposing `id`, `name` and `validator(output)`.
 * @param output The agent output to validate.
 * @returns One verdict per criterion: score 100 and passed when the validator
 *   reports `valid`, otherwise score 0 with the validator's `errorSummary`
 *   (or a default message) in the reasoning.
 */
function runValidatorCriteria(validatorCriteria, output) {
  const toVerdict = (criterion) => {
    const outcome = criterion.validator(output);
    if (!outcome.valid) {
      const detail = outcome.errorSummary ?? "\uC720\uD6A8\uC131 \uAC80\uC99D \uC624\uB958";
      return {
        criterionId: criterion.id,
        score: 0,
        reasoning: `${criterion.name} \uC2E4\uD328:\n${detail}`,
        passed: false
      };
    }
    return {
      criterionId: criterion.id,
      score: 100,
      reasoning: `${criterion.name} \uD1B5\uACFC`,
      passed: true
    };
  };
  return validatorCriteria.map((criterion) => toVerdict(criterion));
}
1455
/**
 * Asks the judge model for verdicts on the LLM-evaluated criteria.
 *
 * @param provider Provider exposing `simpleExecution(fn)`.
 * @param prompt Judge prompt (`id`, `version`, `system`, `renderUserPrompt`).
 * @param context Context handed to `prompt.renderUserPrompt`.
 * @param llmCriteriaIds Criterion ids that must each receive a verdict.
 * @param passThreshold Score at or above which a verdict passes when the
 *   model did not return an explicit `passed` flag.
 * @returns `{ verdicts, usage }`, where `usage` is the execution's total LLM usage.
 * @throws EvalError (LLM_API_ERROR) when the execution fails or is canceled.
 * @throws EvalError (VERDICT_PARSE_ERROR) when a criterion has no verdict.
 */
async function runLLMEvaluation(provider, prompt, context, llmCriteriaIds, passThreshold) {
  const messages = [
    { role: "system", content: prompt.system },
    { role: "user", content: prompt.renderUserPrompt(context) }
  ];

  const askJudge = async (session) => {
    const generation = await session.generateText({
      messages,
      output: import_ai.Output.object({ schema: JudgeResponseSchema })
    });
    return generation.output;
  };

  let response;
  let usage;
  try {
    const executionResult = await provider.simpleExecution(askJudge).result();
    if (executionResult.status === "failed") {
      throw executionResult.error;
    }
    if (executionResult.status !== "succeeded") {
      throw new Error("Execution was canceled");
    }
    response = executionResult.value;
    usage = executionResult.summary.totalLLMUsage;
  } catch (cause) {
    throw EvalError.from(cause, "LLM_API_ERROR" /* LLM_API_ERROR */, {
      promptId: prompt.id,
      promptVersion: prompt.version
    });
  }

  validateAllCriteriaHaveVerdicts(response.verdicts, llmCriteriaIds);

  const verdicts = response.verdicts.map(({ criterionId, score, reasoning, passed }) => ({
    criterionId,
    score,
    reasoning,
    // An explicit flag from the model wins; otherwise derive it from the score.
    passed: passed ?? score >= passThreshold
  }));
  return { verdicts, usage };
}
1491
/**
 * Creates a judge that scores agent output against a set of criteria.
 *
 * Criteria with a `validator` function are checked programmatically; all
 * others are sent to the LLM in a single evaluation call.
 *
 * @param config `{ provider, criteria, prompt?, passThreshold?, model? }`.
 *   `prompt` defaults to defaultJudgePrompt, `passThreshold` to
 *   SCORE.DEFAULT_PASS_THRESHOLD.
 * @returns Object with `evaluate(evalContext)` resolving to
 *   `{ verdicts, overallScore, passed, metadata }`. `metadata` is set only
 *   when the LLM call reported usage.
 */
function createJudge(config) {
  const {
    provider,
    prompt = defaultJudgePrompt,
    criteria,
    passThreshold = SCORE.DEFAULT_PASS_THRESHOLD,
    model
  } = config;

  // Split the criteria once, up front; weights cover both groups.
  const criteriaWeights = /* @__PURE__ */ new Map();
  const validatorCriteria = [];
  const llmCriteria = [];
  const llmCriteriaIds = [];
  for (const criterion of criteria) {
    criteriaWeights.set(criterion.id, criterion.weight ?? 1);
    if (hasValidator(criterion)) {
      validatorCriteria.push(criterion);
      continue;
    }
    llmCriteria.push(criterion);
    llmCriteriaIds.push(criterion.id);
  }

  async function evaluate(evalContext) {
    const { input, output, agentDescription, files } = evalContext;
    const validatorVerdicts = runValidatorCriteria(validatorCriteria, output);

    let llmVerdicts = [];
    let llmUsage;
    if (llmCriteria.length > 0) {
      // The LLM is called only when at least one criterion needs it.
      const llmResult = await runLLMEvaluation(
        provider,
        prompt,
        { agentDescription, input, output, criteria: llmCriteria, files },
        llmCriteriaIds,
        passThreshold
      );
      llmVerdicts = llmResult.verdicts;
      llmUsage = llmResult.usage;
    }

    const verdicts = [...validatorVerdicts, ...llmVerdicts];
    const overallScore = calculateOverallScore(verdicts, criteriaWeights);
    let metadata;
    if (llmUsage) {
      metadata = { tokenUsage: toEvalTokenUsage(llmUsage), model };
    }
    return {
      verdicts,
      overallScore,
      passed: overallScore >= passThreshold,
      metadata
    };
  }

  return { evaluate };
}
1549
+
1550
+ // src/judge/criteria/validate-schema.ts
1551
/**
 * Formats validation issues as a bullet list, one issue per line.
 *
 * Reads `error.issues` first and falls back to `error.errors`, so error
 * objects that expose only one of the two properties are both supported.
 * Path segments are stringified individually so symbol keys do not break
 * the join.
 *
 * @param error Error object carrying a list of `{ path, message }` issues.
 * @returns Lines of the form `- a.b: message` (or `- message` for an empty
 *   path) joined with newlines; an empty string when there are no issues.
 */
function formatZodErrors(error) {
  const issues = error.issues ?? error.errors ?? [];
  return issues.map((issue) => {
    const location = issue.path.length > 0 ? `${issue.path.map((segment) => String(segment)).join(".")}: ` : "";
    return `- ${location}${issue.message}`;
  }).join("\n");
}
1557
/**
 * Creates a programmatic criterion that validates the output against a schema.
 *
 * @param options `{ schema, id?, weight?, name?, description? }`. `schema`
 *   must expose `safeParse(output)`. `id`, `name` and `description` fall back
 *   to built-in defaults.
 * @returns A criterion whose `validator(output)` returns `{ valid: true }`, or
 *   `{ valid: false, errors, errorSummary }` on failure. `errors` is the
 *   issue list, read from `error.issues` with `error.errors` as a fallback
 *   for error objects that only expose the latter.
 */
function schema(options) {
  const { schema: schema2, id, weight, name, description } = options;
  return {
    id: id ?? "schema-validation",
    name: name ?? "\uC2A4\uD0A4\uB9C8 \uC720\uD6A8\uC131",
    description: description ?? "\uCD9C\uB825\uC774 \uC9C0\uC815\uB41C \uC2A4\uD0A4\uB9C8(Zod)\uB97C \uC900\uC218\uD558\uB294\uC9C0 \uD504\uB85C\uADF8\uB798\uBC0D \uBC29\uC2DD\uC73C\uB85C \uAC80\uC99D\uD569\uB2C8\uB2E4.",
    weight,
    validator: (output) => {
      const result = schema2.safeParse(output);
      if (result.success) {
        return { valid: true };
      }
      return {
        valid: false,
        // `issues` is the canonical list; `errors` is kept as a fallback.
        errors: result.error.issues ?? result.error.errors,
        errorSummary: formatZodErrors(result.error)
      };
    }
  };
}
1577
+
1578
+ // src/judge/criteria/index.ts
1579
/**
 * Built-in LLM criterion: factual correctness of the output.
 *
 * @param options Optional `{ weight }`.
 * @returns Criterion with id "accuracy".
 */
function accuracy(options) {
  const weight = options?.weight;
  const criterion = {
    id: "accuracy",
    name: "Accuracy",
    description: "Evaluates whether the output is factually correct, free from errors, and avoids hallucinations. Check for incorrect facts, made-up information, or misrepresentation of the input data.",
    weight
  };
  return criterion;
}
1587
/**
 * Built-in LLM criterion: internal coherence of the output.
 *
 * @param options Optional `{ weight }`.
 * @returns Criterion with id "consistency".
 */
function consistency(options) {
  const weight = options?.weight;
  const criterion = {
    id: "consistency",
    name: "Consistency",
    description: "Evaluates whether the output is internally coherent and logically consistent. Check for self-contradictions, conflicting statements, or logical inconsistencies within the response.",
    weight
  };
  return criterion;
}
1595
/**
 * Built-in LLM criterion: how directly the output addresses the input.
 *
 * @param options Optional `{ weight }`.
 * @returns Criterion with id "relevance".
 */
function relevance(options) {
  const weight = options?.weight;
  const criterion = {
    id: "relevance",
    name: "Relevance",
    description: "Evaluates whether the output directly addresses the input and fulfills the user intent. Check for off-topic content, missing key requirements, or responses that fail to answer the actual question.",
    weight
  };
  return criterion;
}
1603
+
1604
+ // src/reporter/markdown.ts
1605
+ var import_promises = require("fs/promises");
1606
+ var import_core3 = require("@agtlantis/core");
1607
// Icons marking passed / failed items in the markdown report.
var PASS_ICON = "\u2705";
var FAIL_ICON = "\u274C";
// Sort rank of suggestion priorities: lower values are listed first.
var PRIORITY_ORDER = {
  high: 0,
  medium: 1,
  low: 2
};
1610
/**
 * Renders an evaluation report as markdown.
 *
 * Layout: title and metadata, a summary table, failed tests, passed tests
 * (collapsed in a `<details>` element unless `expandPassedTests` is set) and
 * improvement suggestions ordered by priority.
 *
 * @param report `{ summary, results, suggestions, generatedAt, promptVersion }`.
 * @param options `expandPassedTests` (default false), `includeRawOutput`
 *   (default false), `outputPreviewLength` (default 200).
 * @returns The markdown document as a single string.
 */
function reportToMarkdown(report, options = {}) {
  const {
    expandPassedTests = false,
    includeRawOutput = false,
    outputPreviewLength = 200
  } = options;
  const { summary, results, suggestions, generatedAt, promptVersion } = report;

  const tableRow = (label, value) => `| ${label} | ${value} |`;
  const renderResult = (result) => formatTestResult(result, outputPreviewLength, includeRawOutput);

  let passRate = "0.0";
  if (summary.totalTests > 0) {
    passRate = (summary.passed / summary.totalTests * 100).toFixed(1);
  }

  // Header and summary table.
  const lines = [
    "# Evaluation Report",
    "",
    `> Generated: ${generatedAt.toISOString()}`,
    `> Prompt Version: ${promptVersion}`,
    "",
    "## Summary",
    "",
    tableRow("Metric", "Value"),
    "|--------|-------|",
    tableRow("Total Tests", summary.totalTests)
  ];
  if (summary.iterations && summary.iterations > 1) {
    lines.push(tableRow("**Iterations**", `**${summary.iterations}**`));
  }
  lines.push(tableRow("Passed", `${summary.passed} (${passRate}%)`));
  lines.push(tableRow("Failed", summary.failed));
  if (summary.avgStdDev !== void 0) {
    lines.push(tableRow("Average Score", `${summary.avgScore.toFixed(1)} \xB1 ${summary.avgStdDev.toFixed(1)}`));
  } else {
    lines.push(tableRow("Average Score", summary.avgScore.toFixed(1)));
  }
  if (summary.avgPassRate !== void 0) {
    lines.push(tableRow("Avg Pass Rate", `${(summary.avgPassRate * 100).toFixed(1)}%`));
  }
  lines.push(tableRow("Avg Latency", `${summary.metrics.avgLatencyMs.toFixed(0)}ms`));
  lines.push(tableRow("Total Tokens", summary.metrics.totalTokens));
  if (summary.costSummary?.total !== void 0) {
    lines.push(tableRow("Est. Cost", `$${summary.costSummary.total.toFixed(4)}`));
  }
  lines.push("");

  // Failed tests are always shown expanded.
  const failedResults = results.filter((r) => !r.passed);
  if (failedResults.length > 0) {
    lines.push(`## ${FAIL_ICON} Failed Tests`, "");
    lines.push(...failedResults.map(renderResult));
  }

  // Passed tests are collapsed by default.
  const passedResults = results.filter((r) => r.passed);
  if (passedResults.length > 0) {
    lines.push(`## ${PASS_ICON} Passed Tests`, "");
    if (!expandPassedTests) {
      lines.push("<details>", "<summary>Click to expand passed tests</summary>", "");
    }
    lines.push(...passedResults.map(renderResult));
    if (!expandPassedTests) {
      lines.push("</details>", "");
    }
  }

  if (suggestions.length > 0) {
    lines.push("## \u{1F4A1} Improvement Suggestions", "");
    const byPriority = (a, b) => PRIORITY_ORDER[a.priority] - PRIORITY_ORDER[b.priority];
    // Sort a copy so the caller's suggestion list keeps its order.
    for (const suggestion of [...suggestions].sort(byPriority)) {
      lines.push(formatSuggestion(suggestion));
    }
  }
  return lines.join("\n");
}
1689
/**
 * Renders a report to markdown and writes it to disk as UTF-8.
 *
 * @param report Report to render.
 * @param path3 Destination file path.
 * @param options Rendering options forwarded to reportToMarkdown.
 */
async function saveReportMarkdown(report, path3, options) {
  const contents = reportToMarkdown(report, options);
  await (0, import_promises.writeFile)(path3, contents, "utf-8");
}
1693
/**
 * Wraps a value, serialised as pretty-printed JSON, in a fenced `json` block.
 *
 * `JSON.stringify` returns `undefined` for values it cannot represent
 * (`undefined`, functions, symbols). Those are rendered as the text
 * "undefined", so the content line is always a string and `truncate`
 * never receives a non-string.
 *
 * @param value Value to serialise.
 * @param maxLength Optional maximum content length; when given, the content
 *   is passed through `truncate`.
 * @returns Three lines: opening fence, content, closing fence.
 */
function jsonCodeBlock(value, maxLength) {
  const json = JSON.stringify(value, null, 2) ?? "undefined";
  const content = maxLength !== void 0 ? truncate(json, maxLength) : json;
  return ["```json", content, "```"];
}
1698
/** Returns the pass icon for a truthy flag and the fail icon otherwise. */
function passFailIcon(passed) {
  if (passed) {
    return PASS_ICON;
  }
  return FAIL_ICON;
}
1701
/**
 * Renders one test result as a markdown section.
 *
 * Sections, in order: heading with score, optional description, file
 * sources found in the input, multi-turn summary, iteration statistics,
 * either the conversation history or the single-turn input/output, the
 * verdict list, and optionally the raw output.
 *
 * @param result Result to render (single-turn, multi-turn or iterated).
 * @param previewLength Maximum length of input/output previews.
 * @param includeRaw When true, appends the untruncated output in a `<details>` block.
 * @returns The section as a single string.
 */
function formatTestResult(result, previewLength, includeRaw) {
  const out = [];
  const { testCase, iterationStats, iterationResults, conversationHistory } = result;

  // Heading: iterated results show "mean ± stdDev".
  const testId = testCase.id ?? "unnamed";
  const scoreText = result.overallScore.toFixed(1);
  const scoreDisplay = iterationStats ? `${scoreText} \xB1 ${iterationStats.stdDev.toFixed(1)}` : scoreText;
  out.push(`### ${testId} (Score: ${scoreDisplay})`, "");

  if (testCase.description) {
    out.push(`> ${testCase.description}`, "");
  }

  const fileDisplayInfos = (0, import_core3.getFileSourcesDisplayInfo)(testCase.input);
  if (fileDisplayInfos.length > 0) {
    out.push("**Files:**");
    for (const info of fileDisplayInfos) {
      const namePrefix = info.filename ? `${info.filename} - ` : "";
      out.push(`- ${namePrefix}${info.source}: ${info.description} (${info.mediaType})`);
    }
    out.push("");
  }

  if (result.totalTurns !== void 0) {
    out.push(
      `**Multi-turn:** ${result.totalTurns} turns | Termination: ${result.terminationReason ?? "unknown"}`,
      ""
    );
  }
  if (result.multiTurnIterationStats) {
    out.push(...formatMultiTurnIterationStats(result.multiTurnIterationStats));
  }
  if (iterationStats && iterationResults) {
    out.push(...formatIterationResults(iterationStats, iterationResults));
  }

  // A non-empty history replaces the single input/output pair.
  const hasHistory = conversationHistory && conversationHistory.length > 0;
  if (hasHistory) {
    out.push(...formatConversationHistory(conversationHistory, previewLength));
  } else {
    out.push(...formatSingleTurnInputOutput(testCase.input, result.output, previewLength));
  }

  out.push("**Verdicts:**");
  for (const { passed, criterionId, score, reasoning } of result.verdicts) {
    out.push(`- ${passFailIcon(passed)} **${criterionId}**: ${score} - ${reasoning}`);
  }
  out.push("");

  if (includeRaw) {
    out.push("<details>", "<summary>Raw Output</summary>", "");
    out.push(...jsonCodeBlock(result.output));
    out.push("</details>", "");
  }
  return out.join("\n");
}
1756
/**
 * Renders aggregated multi-turn iteration statistics as a Markdown table.
 *
 * @param {object} stats - Aggregate exposing `avgTurns`, `minTurns`, `maxTurns`
 *   and a `terminationCounts` record (termination type -> count).
 * @returns {string[]} Markdown lines: heading, table and a trailing blank line.
 */
function formatMultiTurnIterationStats(stats) {
  const distribution = [];
  for (const [type, count] of Object.entries(stats.terminationCounts)) {
    distribution.push(`${type}: ${count}`);
  }
  // An empty record joins to "", which is replaced by the literal "none".
  const terminationSummary = distribution.join(", ") || "none";

  const heading = ["**Multi-turn Iteration Statistics:**", ""];
  const table = [
    "| Metric | Value |",
    "|--------|-------|",
    `| Avg Turns | ${stats.avgTurns.toFixed(1)} |`,
    `| Min/Max Turns | ${stats.minTurns} / ${stats.maxTurns} |`,
    `| Termination Distribution | ${terminationSummary} |`
  ];
  return [...heading, ...table, ""];
}
1769
/**
 * Renders the per-iteration result table followed by a one-line statistics summary.
 *
 * @param {object} stats - Aggregate with `mean`, `stdDev`, `min`, `max` and `passRate` (0..1).
 * @param {object[]} results - Iteration results (`overallScore`, `passed`, `metrics.latencyMs`).
 * @returns {string[]} Markdown lines.
 */
function formatIterationResults(stats, results) {
  const rows = results.map(
    (iter, idx) => `| ${idx + 1} | ${iter.overallScore.toFixed(1)} | ${passFailIcon(iter.passed)} | ${iter.metrics.latencyMs.toFixed(0)}ms |`
  );
  const passRatePercent = (stats.passRate * 100).toFixed(0);
  const statsLine = `**Stats:** ${stats.mean.toFixed(1)} \xB1 ${stats.stdDev.toFixed(1)} (min: ${stats.min.toFixed(0)}, max: ${stats.max.toFixed(0)}, pass rate: ${passRatePercent}%)`;
  return [
    "**Iteration Results:**",
    "",
    "| # | Score | Passed | Latency |",
    "|---|-------|--------|---------|",
    ...rows,
    "",
    statsLine,
    ""
  ];
}
1788
/**
 * Renders every conversation turn as a collapsible `<details>` section
 * containing the turn's input and output as JSON code blocks.
 *
 * @param {Iterable<object>} history - Turns with `turn`, `input` and `output`.
 * @param {number} [previewLength] - Forwarded to `jsonCodeBlock`.
 * @returns {string[]} Markdown lines.
 */
function formatConversationHistory(history, previewLength) {
  const out = ["**Conversation History:**", ""];
  for (const entry of history) {
    out.push("<details>", `<summary>Turn ${entry.turn}</summary>`, "", "**Input:**");
    out.push(...jsonCodeBlock(entry.input, previewLength));
    out.push("", "**Output:**");
    out.push(...jsonCodeBlock(entry.output, previewLength));
    out.push("</details>", "");
  }
  return out;
}
1804
/**
 * Renders a single-turn test's input and output as two labelled JSON code blocks.
 *
 * @param {*} input - Test case input.
 * @param {*} output - Agent output.
 * @param {number} [previewLength] - Forwarded to `jsonCodeBlock`.
 * @returns {string[]} Markdown lines.
 */
function formatSingleTurnInputOutput(input, output, previewLength) {
  const section = ["**Input:**"];
  section.push(...jsonCodeBlock(input, previewLength));
  section.push("", "**Output:**");
  section.push(...jsonCodeBlock(output, previewLength));
  section.push("");
  return section;
}
1814
/**
 * Renders one improvement suggestion as a Markdown section: a heading with a
 * priority icon, the reasoning, the expected improvement and a `diff` block.
 *
 * @param {object} suggestion - Suggestion with `priority`, `type`, `reasoning`,
 *   `expectedImprovement`, `currentValue` and `suggestedValue`.
 * @returns {string} Markdown text ending with a newline.
 */
function formatSuggestion(suggestion) {
  const iconByPriority = { high: "\u{1F534}", medium: "\u{1F7E1}", low: "\u{1F7E2}" };
  // Unrecognised priorities fall back to a white circle.
  const priorityIcon = iconByPriority[suggestion.priority] ?? "\u26AA";

  // Prefix every line of a (possibly multi-line) value with a diff marker.
  const markLines = (text, marker) => text.split("\n").map((line) => `${marker} ${line}`).join("\n");

  return [
    `### ${priorityIcon} [${suggestion.priority.toUpperCase()}] ${suggestion.type}`,
    "",
    `**Reasoning:** ${suggestion.reasoning}`,
    "",
    `**Expected Improvement:** ${suggestion.expectedImprovement}`,
    "",
    "**Diff:**",
    "```diff",
    markLines(suggestion.currentValue, "-"),
    markLines(suggestion.suggestedValue, "+"),
    "```",
    ""
  ].join("\n");
}
1831
/**
 * Compares two evaluation reports.
 *
 * @param {object} before - Baseline report (`summary`, `results`).
 * @param {object} after - Report to compare against the baseline.
 * @returns {{scoreDelta: number, passRateDelta: number,
 *   metricsDelta: {latencyMs: number, tokenUsage: number},
 *   improved: string[], regressed: string[], removed: string[]}}
 *   Deltas are `after - before`. `improved`/`regressed` list test ids present
 *   in both reports whose score went up/down; `removed` lists ids that only
 *   the baseline contains.
 */
function compareReports(before, after) {
  // Pass rate of a summary; a report without tests counts as 0.
  const passRateOf = (summary) => summary.totalTests > 0 ? summary.passed / summary.totalTests : 0;

  const scoreDelta = after.summary.avgScore - before.summary.avgScore;
  const beforePassRate = passRateOf(before.summary);
  const afterPassRate = passRateOf(after.summary);
  const passRateDelta = afterPassRate - beforePassRate;
  const metricsDelta = {
    latencyMs: after.summary.metrics.avgLatencyMs - before.summary.metrics.avgLatencyMs,
    tokenUsage: after.summary.metrics.totalTokens - before.summary.metrics.totalTokens
  };

  const baseline = buildScoreMap(before.results);
  const current = buildScoreMap(after.results);
  const improved = [];
  const regressed = [];
  current.forEach((score, id) => {
    const previous = baseline.get(id);
    if (previous === void 0) {
      return; // test did not exist in the baseline
    }
    if (score > previous) {
      improved.push(id);
    } else if (score < previous) {
      regressed.push(id);
    }
  });
  const removed = Array.from(baseline.keys()).filter((id) => !current.has(id));

  return { scoreDelta, passRateDelta, metricsDelta, improved, regressed, removed };
}
1863
/**
 * Indexes overall scores by test-case id.
 *
 * Results without an id share the key "unnamed", so a later one overwrites
 * an earlier one.
 *
 * @param {Iterable<object>} results - Results with `testCase.id` and `overallScore`.
 * @returns {Map<string, number>} id -> overall score.
 */
function buildScoreMap(results) {
  const entries = Array.from(results, (entry) => [entry.testCase.id ?? "unnamed", entry.overallScore]);
  return new Map(entries);
}
1870
+
1871
+ // src/reporter/json-reporter.ts
1872
+ var import_node_fs2 = require("fs");
1873
+
1874
+ // src/reporter/cost-helpers.ts
1875
+ var import_core4 = require("@agtlantis/core");
1876
/**
 * Copies the three token counters of a usage record into a fresh object,
 * dropping any other properties.
 *
 * @param {{inputTokens: number, outputTokens: number, totalTokens: number}} usage
 * @returns {{inputTokens: number, outputTokens: number, totalTokens: number}}
 */
function toLanguageModelUsage(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  return { inputTokens, outputTokens, totalTokens };
}
1883
// Maps provider names to the provider keys used when looking up pricing.
// "gemini" is an alias of "google"; the other entries map to themselves.
// Consulted by normalizeProvider(), which passes unknown names through as-is.
+ var PROVIDER_MAPPING = {
1884
+ gemini: "google",
1885
+ openai: "openai",
1886
+ anthropic: "anthropic",
1887
+ google: "google"
1888
+ };
1889
/**
 * Guesses the pricing provider from a model identifier.
 *
 * OpenAI is recognised by the `gpt-` prefix or by the o-series naming scheme
 * (`o` followed by a digit: `o1`, `o3-mini`, `o4-mini`, ...), so o-series
 * generations beyond `o1`/`o3` are also priced as OpenAI models.
 *
 * @param {string} [model] - Model identifier, e.g. "gpt-4o" or "claude-3-haiku".
 * @returns {"openai"|"google"|"anthropic"} Detected provider; "google" when the
 *   model is missing or not recognised.
 */
function detectProvider(model) {
  if (!model) return "google";
  if (model.startsWith("gpt-") || /^o\d/.test(model)) {
    return "openai";
  }
  if (model.startsWith("gemini-")) {
    return "google";
  }
  if (model.startsWith("claude-")) {
    return "anthropic";
  }
  // Unknown models keep the historical default.
  return "google";
}
1902
/**
 * Translates a provider name through PROVIDER_MAPPING.
 *
 * @param {string} [provider] - Provider name as reported by agent/judge metadata.
 * @returns {string} The mapped key, the input itself when it has no mapping,
 *   or "google" when no provider is given.
 */
function normalizeProvider(provider) {
  if (provider) {
    const mapped = PROVIDER_MAPPING[provider];
    return mapped ?? provider;
  }
  return "google";
}
1906
/**
 * Computes the cost of one component (agent or judge) from its token usage.
 *
 * @param {object} [tokenUsage] - Token counters; when falsy no cost is computed.
 * @param {string} [model] - Model id; "unknown" is passed on when absent.
 * @param {string} [provider] - Explicit provider; otherwise detected from `model`.
 * @param {object} [config] - Pricing config; `providerPricing[provider]` is
 *   forwarded to the core cost calculator.
 * @returns {number|undefined} Total cost, or `undefined` without token usage.
 */
function calculateComponentCost(tokenUsage, model, provider, config) {
  if (!tokenUsage) {
    return void 0;
  }
  let normalizedProvider;
  if (provider) {
    normalizedProvider = normalizeProvider(provider);
  } else {
    normalizedProvider = detectProvider(model);
  }
  const providerPricing = config?.providerPricing?.[normalizedProvider];
  const usage = toLanguageModelUsage(tokenUsage);
  const { total } = (0, import_core4.calculateCostFromUsage)(
    usage,
    model ?? "unknown",
    normalizedProvider,
    providerPricing
  );
  return total;
}
1918
/**
 * Adds a `total` to a per-component cost record.
 *
 * @param {{agent?: number, judge?: number, improver?: number}} costs
 * @returns {object} Copy of `costs` plus `total`: the sum of the three
 *   components (missing ones count as 0), or `undefined` when that sum is not
 *   greater than zero.
 */
function buildCostBreakdown(costs) {
  const components = [costs.agent, costs.judge, costs.improver];
  let sum = 0;
  for (const part of components) {
    sum += part ?? 0;
  }
  const total = sum > 0 ? sum : void 0;
  return { ...costs, total };
}
1925
/**
 * Computes the agent/judge cost breakdown for a single test result.
 *
 * @param {object} result - Result with `metrics.tokenUsage` and optional
 *   `agentMetadata` / `judgeMetadata` (`model`, `provider`, `tokenUsage`).
 * @param {object} [config] - Pricing configuration.
 * @returns {object} Breakdown built by `buildCostBreakdown`; `judge` is
 *   `undefined` when the judge reported no token usage.
 */
function calculateResultCost(result, config) {
  const agentMeta = result.agentMetadata;
  const agent = calculateComponentCost(
    result.metrics.tokenUsage,
    agentMeta?.model,
    agentMeta?.provider,
    config
  );

  let judge = void 0;
  if (result.judgeMetadata?.tokenUsage) {
    const judgeMeta = result.judgeMetadata;
    judge = calculateComponentCost(judgeMeta.tokenUsage, judgeMeta.model, judgeMeta.provider, config);
  }

  return buildCostBreakdown({ agent, judge });
}
1943
/**
 * Sums agent and judge costs over every result of a report.
 *
 * @param {object} report - Report with a `results` iterable.
 * @param {object} [config] - Pricing configuration.
 * @returns {{total: number, byComponent: {agent: number, judge: number}}}
 */
function calculateReportCosts(report, config) {
  const byComponent = { agent: 0, judge: 0 };
  for (const result of report.results) {
    const { agent, judge } = calculateResultCost(result, config);
    byComponent.agent += agent ?? 0;
    byComponent.judge += judge ?? 0;
  }
  return {
    total: byComponent.agent + byComponent.judge,
    byComponent
  };
}
1959
/**
 * Returns copies of the results whose `metrics` additionally carry a
 * `costBreakdown`.
 *
 * Only `testCase`, `output`, `metrics` (latency, token usage, cost), `error`,
 * `verdicts`, `overallScore` and `passed` are carried over to each copy.
 *
 * @param {object[]} results - Evaluated test results.
 * @param {object} [config] - Pricing configuration.
 * @returns {object[]} New result objects; the inputs are not modified.
 */
function addCostsToResults(results, config) {
  const attachCost = (result) => {
    const costBreakdown = calculateResultCost(result, config);
    const { testCase, output, metrics, error, verdicts, overallScore, passed } = result;
    return {
      testCase,
      output,
      metrics: {
        latencyMs: metrics.latencyMs,
        tokenUsage: metrics.tokenUsage,
        costBreakdown
      },
      error,
      verdicts,
      overallScore,
      passed
    };
  };
  return results.map((result) => attachCost(result));
}
1978
+
1979
+ // src/reporter/format-utils.ts
1980
+ var import_node_fs = require("fs");
1981
+ var import_node_path = __toESM(require("path"), 1);
1982
/**
 * Formats a score difference with an explicit sign and one decimal place.
 *
 * @param {number|null|undefined} delta - Score change; `null`/`undefined`
 *   means "no previous score to compare against".
 * @returns {string} "-" when there is no delta, otherwise e.g. "+1.5" or "-2.0"
 *   (zero is rendered as "+0.0").
 */
function formatScoreDelta(delta) {
  // `== null` covers both null and undefined, so a delta that was never set
  // (or was dropped by a JSON round-trip) does not crash on `.toFixed`.
  if (delta == null) {
    return "-";
  }
  const sign = delta >= 0 ? "+" : "";
  return `${sign}${delta.toFixed(1)}`;
}
1989
/**
 * Ensures the output directory exists and builds the path of a report file.
 *
 * @param {string} outputDir - Directory to create (recursively) if missing.
 * @param {string} name - Base file name without extension.
 * @param {string} extension - File extension without the leading dot.
 * @param {boolean} addTimestamp - When truthy, `-<Date.now()>` is appended to the name.
 * @returns {string} `<outputDir>/<name>[-<timestamp>].<extension>`.
 */
function buildOutputPath(outputDir, name, extension, addTimestamp) {
  (0, import_node_fs.mkdirSync)(outputDir, { recursive: true });
  const stem = addTimestamp ? `${name}-${Date.now()}` : name;
  return import_node_path.default.join(outputDir, `${stem}.${extension}`);
}
1994
/**
 * Converts `Date` instances to ISO-8601 strings and returns anything else unchanged.
 *
 * @param {*} value - A Date or any other value (e.g. an already serialised string).
 * @returns {*} ISO string for dates, otherwise `value` itself.
 */
function toISOStringIfDate(value) {
  if (value instanceof Date) {
    return value.toISOString();
  }
  return value;
}
1997
+
1998
+ // src/reporter/json-reporter.ts
1999
/**
 * Reporter that writes an evaluation report to a pretty-printed JSON file.
 */
var JsonReporter = class {
  outputDir;
  pricing;
  addTimestamp;
  /**
   * @param {object} options
   * @param {string} options.outputDir - Directory the file is written to.
   * @param {object} [options.pricing] - When set, a `costs` section is added.
   * @param {boolean} [options.addTimestamp=true] - Append a timestamp to the file name.
   */
  constructor(options) {
    this.outputDir = options.outputDir;
    this.pricing = options.pricing;
    this.addTimestamp = options.addTimestamp ?? true;
  }
  /**
   * Serialises the report and writes it synchronously.
   *
   * @param {object} report - Evaluation report.
   * @param {string} name - Base file name (without extension).
   * @returns {string} Path of the written file.
   */
  save(report, name) {
    const filepath = buildOutputPath(this.outputDir, name, "json", this.addTimestamp);
    const costs = this.pricing ? calculateReportCosts(report, this.pricing) : void 0;
    const output = {
      summary: report.summary,
      results: report.results,
      suggestions: report.suggestions,
      // Reports restored from JSON carry `generatedAt` as a string already;
      // use the same helper as the cycle writers instead of assuming a Date.
      generatedAt: toISOStringIfDate(report.generatedAt),
      promptVersion: report.promptVersion,
      ...costs && { costs }
    };
    (0, import_node_fs2.writeFileSync)(filepath, JSON.stringify(output, null, 2));
    return filepath;
  }
};
2023
+
2024
+ // src/reporter/markdown-reporter.ts
2025
+ var import_node_fs3 = require("fs");
2026
/**
 * Reporter that renders an evaluation report to Markdown and writes it to disk.
 */
var MarkdownReporter = class {
  outputDir;
  addTimestamp;
  markdownOptions;
  /**
   * @param {object} options
   * @param {string} options.outputDir - Directory the file is written to.
   * @param {boolean} [options.addTimestamp=true] - Append a timestamp to the file name.
   * @param {object} [options.markdown={}] - Options forwarded to `reportToMarkdown`.
   */
  constructor(options) {
    const { outputDir, addTimestamp, markdown } = options;
    this.outputDir = outputDir;
    this.addTimestamp = addTimestamp ?? true;
    this.markdownOptions = markdown ?? {};
  }
  /**
   * Renders and synchronously writes the report.
   *
   * @param {object} report - Evaluation report.
   * @param {string} name - Base file name (without extension).
   * @returns {string} Path of the written `.md` file.
   */
  save(report, name) {
    const target = buildOutputPath(this.outputDir, name, "md", this.addTimestamp);
    (0, import_node_fs3.writeFileSync)(target, reportToMarkdown(report, this.markdownOptions));
    return target;
  }
};
2042
+
2043
+ // src/reporter/console-reporter.ts
2044
/**
 * Reporter that prints an evaluation report to the console.
 *
 * Verbosity "summary" prints only the headline (and cost when pricing is
 * configured). Any other verbosity also prints one line per test, and "full"
 * additionally prints truncated input/output previews.
 */
var ConsoleReporter = class {
  verbosity;
  pricing;
  /**
   * @param {object} [options]
   * @param {string} [options.verbosity="summary"] - "summary", "full" or any other level.
   * @param {object} [options.pricing] - Enables the cost line when provided.
   */
  constructor(options = {}) {
    this.verbosity = options.verbosity ?? "summary";
    this.pricing = options.pricing;
  }
  /**
   * Prints the report according to the configured verbosity.
   *
   * @param {object} report - Evaluation report (`summary`, `results`).
   * @returns {void}
   */
  log(report) {
    const { summary } = report;
    const passRate = summary.totalTests > 0 ? summary.passed / summary.totalTests : 0;
    console.log(`\n\u{1F4CA} Eval Report: ${summary.totalTests} tests`);
    console.log(` Score: ${summary.avgScore.toFixed(1)} | Pass Rate: ${(passRate * 100).toFixed(0)}%`);
    if (this.verbosity === "summary") {
      this.logCostIfAvailable(report);
      return;
    }

    // Strings are shown verbatim; structured outputs are serialised so they
    // do not degrade to "[object Object]". Values JSON cannot represent
    // (cycles, BigInt, undefined, functions) fall back to String().
    const describeOutput = (value) => {
      if (typeof value === "string") return value;
      try {
        return JSON.stringify(value) ?? String(value);
      } catch {
        return String(value);
      }
    };

    console.log("");
    for (const result of report.results) {
      const testId = result.testCase.id || "unknown";
      const status = result.passed ? "\u2713" : "\u2717";
      console.log(` ${status} [${testId}] Score: ${result.overallScore.toFixed(1)}`);
      if (this.verbosity === "full") {
        console.log(` Input: ${truncate(JSON.stringify(result.testCase.input), 80)}`);
        console.log(` Output: ${truncate(describeOutput(result.output), 80)}`);
      }
    }
    this.logCostIfAvailable(report);
  }
  /**
   * Prints the total report cost when pricing has been configured.
   *
   * @param {object} report - Evaluation report.
   * @returns {void}
   */
  logCostIfAvailable(report) {
    if (this.pricing) {
      const costs = calculateReportCosts(report, this.pricing);
      console.log(`\n\u{1F4B0} Cost: $${costs.total.toFixed(4)}`);
    }
  }
};
2081
+
2082
+ // src/reporter/composite-reporter.ts
2083
/**
 * Reporter that fans a report out to several underlying reporters.
 */
var CompositeReporter = class {
  /**
   * @param {object[]} reporters - Reporters exposing `save` and/or `log`.
   */
  constructor(reporters) {
    this.reporters = reporters;
  }
  /**
   * Saves through every reporter that implements `save()` and calls `log()`
   * on every reporter that implements it. A failing `save()` does not stop
   * the remaining reporters.
   *
   * @param {object} report - Evaluation report.
   * @param {string} name - Base file name passed to each `save()`.
   * @returns {string} The first truthy path returned by a `save()`.
   * @throws {Error} When no reporter produced a path; the message lists the
   *   collected save errors, or states that no reporter supports saving.
   */
  save(report, name) {
    const failures = [];
    let firstPath;
    for (const reporter of this.reporters) {
      if (reporter.save) {
        try {
          const savedPath = reporter.save(report, name);
          firstPath = firstPath || savedPath;
        } catch (error) {
          failures.push({ reporter: reporter.constructor.name, error });
        }
      }
      // Logging happens whether or not the reporter could save.
      reporter.log?.(report);
    }
    if (firstPath) {
      return firstPath;
    }
    let details = "No reporters support save()";
    if (failures.length > 0) {
      details = failures.map((e) => `${e.reporter}: ${e.error.message}`).join(", ");
    }
    throw new Error(`No reporter saved successfully. ${details}`);
  }
  /**
   * Calls `log()` on every reporter that implements it.
   *
   * @param {object} report - Evaluation report.
   * @returns {void}
   */
  log(report) {
    this.reporters.forEach((reporter) => {
      reporter.log?.(report);
    });
  }
};
2122
+
2123
+ // src/reporter/factory.ts
2124
/**
 * Creates a JsonReporter for the given directory.
 *
 * @param {string} outputDir - Target directory.
 * @param {object} [options] - Extra JsonReporter options; these are spread
 *   after `outputDir`, so an `outputDir` key inside them takes precedence.
 * @returns {JsonReporter}
 */
function createJsonReporter(outputDir, options) {
  const reporterOptions = { outputDir, ...options };
  return new JsonReporter(reporterOptions);
}
2127
/**
 * Creates a MarkdownReporter for the given directory.
 *
 * @param {string} outputDir - Target directory.
 * @param {object} [options] - Extra MarkdownReporter options; these are spread
 *   after `outputDir`, so an `outputDir` key inside them takes precedence.
 * @returns {MarkdownReporter}
 */
function createMarkdownReporter(outputDir, options) {
  const reporterOptions = { outputDir, ...options };
  return new MarkdownReporter(reporterOptions);
}
2130
/**
 * Creates a ConsoleReporter.
 *
 * @param {object} [options] - Passed to the ConsoleReporter constructor unchanged.
 * @returns {ConsoleReporter}
 */
function createConsoleReporter(options) {
  const reporter = new ConsoleReporter(options);
  return reporter;
}
2133
/**
 * Creates a CompositeReporter wrapping the given reporters.
 *
 * @param {object[]} reporters - Reporters to combine.
 * @returns {CompositeReporter}
 */
function createCompositeReporter(reporters) {
  const composite = new CompositeReporter(reporters);
  return composite;
}
2136
/**
 * Creates the default reporter: a JsonReporter followed by a ConsoleReporter,
 * combined in a CompositeReporter.
 *
 * @param {string} outputDir - Directory for the JSON report.
 * @param {object} [options] - Optional `pricing`, `addTimestamp` and `verbosity`.
 * @returns {CompositeReporter}
 */
function createDefaultReporter(outputDir, options) {
  const pricing = options?.pricing;
  const jsonReporter = new JsonReporter({
    outputDir,
    pricing,
    addTimestamp: options?.addTimestamp
  });
  const consoleReporter = new ConsoleReporter({
    verbosity: options?.verbosity,
    pricing
  });
  return new CompositeReporter([jsonReporter, consoleReporter]);
}
2149
+
2150
+ // src/reporter/runner.ts
2151
/**
 * Builds a runner that executes a suite, optionally logs the report to the
 * console and always saves it as JSON.
 *
 * @param {object} options
 * @param {string} options.outputDir - Directory for the JSON report.
 * @param {object} [options.pricing] - Pricing configuration for both reporters.
 * @param {string|false} [options.verbosity] - `false` disables console output;
 *   any other falsy value means "summary".
 * @returns {(suite: object, testCases: object[], name: string) =>
 *   Promise<{report: object, savedPath: string}>}
 */
function createReportRunner(options) {
  const { outputDir, pricing, verbosity } = options;
  const jsonReporter = new JsonReporter({ outputDir, pricing });
  let consoleReporter = null;
  if (verbosity !== false) {
    consoleReporter = new ConsoleReporter({ verbosity: verbosity || "summary", pricing });
  }
  const runAndReport = async (suite, testCases2, name) => {
    const report = await suite.run(testCases2);
    if (consoleReporter) {
      consoleReporter.log(report);
    }
    const savedPath = jsonReporter.save(report, name);
    return { report, savedPath };
  };
  return runAndReport;
}
2162
+
2163
+ // src/reporter/cycle-json.ts
2164
+ var import_node_fs4 = require("fs");
2165
+ var import_node_path2 = __toESM(require("path"), 1);
2166
/**
 * Writes an improvement-cycle result to a directory: a cycle summary and,
 * unless disabled, one report file per round.
 *
 * @param {object} result - Cycle result (`rounds`, `terminationReason`, ...).
 * @param {object} options
 * @param {string} [options.directory] - Explicit target directory.
 * @param {string} [options.outputDir] - Parent directory (used together with `name`).
 * @param {string} [options.name] - Cycle name (used together with `outputDir`).
 * @param {boolean} [options.saveRounds=true] - Also write per-round reports.
 * @returns {string} The directory the files were written to.
 */
function saveCycleJson(result, options) {
  const { saveRounds = true } = options;
  const targetDir = resolveCycleDirectory(options.outputDir, options.name, options.directory);
  (0, import_node_fs4.mkdirSync)(targetDir, { recursive: true });
  saveCycleSummary(targetDir, result);
  if (saveRounds) {
    saveRoundReports(targetDir, result.rounds);
  }
  return targetDir;
}
2176
/**
 * Determines the directory a cycle is saved to.
 *
 * @param {string} [outputDir] - Parent directory.
 * @param {string} [name] - Cycle name.
 * @param {string} [directory] - Explicit directory; wins when truthy.
 * @returns {string} `directory`, or `<outputDir>/<name>-<Date.now()>`.
 * @throws {Error} When neither `directory` nor both `outputDir` and `name` are given.
 */
function resolveCycleDirectory(outputDir, name, directory) {
  if (directory) {
    return directory;
  }
  if (!outputDir || !name) {
    throw new Error('saveCycleJson requires either "directory" or both "outputDir" and "name"');
  }
  const stampedName = `${name}-${Date.now()}`;
  return import_node_path2.default.join(outputDir, stampedName);
}
2185
/**
 * Writes `cycle-summary.json`: a condensed per-round overview plus the cycle
 * totals (termination reason, total cost, round count, initial/final score).
 *
 * @param {string} cycleDir - Existing directory to write into.
 * @param {object} result - Cycle result with `rounds`, `terminationReason`, `totalCost`.
 * @returns {void}
 */
function saveCycleSummary(cycleDir, result) {
  const { rounds } = result;
  // One compact row per round; suggestion lists are reduced to their counts.
  const roundSummaries = rounds.map((round) => ({
    round: round.round,
    completedAt: toISOStringIfDate(round.completedAt),
    score: round.report.summary.avgScore,
    scoreDelta: round.scoreDelta,
    cost: round.cost,
    suggestionsGenerated: round.suggestionsGenerated.length,
    suggestionsApproved: round.suggestionsApproved.length,
    promptVersionAfter: round.promptVersionAfter
  }));
  const firstRound = rounds[0];
  const lastRound = rounds[rounds.length - 1];
  const summary = {
    rounds: roundSummaries,
    terminationReason: result.terminationReason,
    totalCost: result.totalCost,
    roundCount: rounds.length,
    // null (rather than undefined) so the keys survive JSON serialisation.
    initialScore: firstRound?.report.summary.avgScore ?? null,
    finalScore: lastRound?.report.summary.avgScore ?? null
  };
  const summaryPath = import_node_path2.default.join(cycleDir, "cycle-summary.json");
  (0, import_node_fs4.writeFileSync)(summaryPath, JSON.stringify(summary, null, 2));
}
2206
/**
 * Writes one `round-<n>-report.json` file per round, containing the full
 * report, the suggestions, the prompt snapshot, the cost and the score delta.
 *
 * @param {string} cycleDir - Existing directory to write into.
 * @param {Iterable<object>} rounds - Rounds of an improvement cycle.
 * @returns {void}
 */
function saveRoundReports(cycleDir, rounds) {
  for (const round of rounds) {
    // Dates are serialised explicitly; already-serialised strings pass through.
    const serializedReport = {
      ...round.report,
      generatedAt: toISOStringIfDate(round.report.generatedAt)
    };
    const payload = {
      round: round.round,
      completedAt: toISOStringIfDate(round.completedAt),
      report: serializedReport,
      suggestionsGenerated: round.suggestionsGenerated,
      suggestionsApproved: round.suggestionsApproved,
      promptSnapshot: round.promptSnapshot,
      cost: round.cost,
      scoreDelta: round.scoreDelta
    };
    const target = import_node_path2.default.join(cycleDir, `round-${round.round}-report.json`);
    (0, import_node_fs4.writeFileSync)(target, JSON.stringify(payload, null, 2));
  }
}
2225
+
2226
+ // src/reporter/cycle-console.ts
2227
/**
 * Prints an improvement-cycle summary to the console and, optionally, the
 * report of every round.
 *
 * @param {object} result - Cycle result (`rounds`, `terminationReason`, `totalCost`).
 * @param {object} [options]
 * @param {string} [options.verbosity="summary"] - Verbosity for per-round reports.
 * @param {boolean} [options.showRounds=false] - Also print each round's report.
 * @returns {void}
 */
function logCycle(result, options = {}) {
  const { verbosity = "summary", showRounds = false } = options;
  const { rounds } = result;
  console.log("\n\u{1F504} Improvement Cycle Complete");
  console.log(` Rounds: ${rounds.length}`);
  console.log(` Termination: ${result.terminationReason}`);
  console.log(` Total Cost: $${result.totalCost.toFixed(4)}`);
  if (rounds.length > 0) {
    const firstScore = rounds[0].report.summary.avgScore;
    const lastScore = rounds[rounds.length - 1].report.summary.avgScore;
    const improvement = formatScoreDelta(lastScore - firstScore);
    console.log(` Score: ${firstScore.toFixed(1)} -> ${lastScore.toFixed(1)} (${improvement})`);
  }
  if (!showRounds) {
    return;
  }
  const roundReporter = new ConsoleReporter({ verbosity });
  for (const round of result.rounds) {
    console.log(`\n-- Round ${round.round} --`);
    roundReporter.log(round.report);
  }
}
2248
+
2249
+ // src/reporter/cycle-markdown.ts
2250
+ var import_node_fs5 = require("fs");
2251
/**
 * Renders an improvement-cycle result as a Markdown document: a summary
 * table, the score progression, optional per-round reports and an optional
 * "Prompt Evolution" section showing the initial and final prompt templates.
 *
 * @param {object} result - Cycle result (`rounds`, `terminationReason`,
 *   `totalCost`, `finalPrompt`).
 * @param {object} [options]
 * @param {boolean} [options.includeRoundDetails=true] - Embed each round's full report.
 * @param {boolean} [options.showPromptEvolution=false] - Append initial/final prompts.
 * @returns {string} Markdown text.
 */
function cycleToMarkdown(result, options = {}) {
  const { includeRoundDetails = true, showPromptEvolution = false } = options;

  // Prompts built from a render function have no editable template text.
  // Use one placeholder for both the initial and the final prompt, so a
  // missing template never produces an empty code fence.
  const templateOrPlaceholder = (prompt) => {
    const hasTemplate = prompt !== null && typeof prompt === "object" && typeof prompt.userTemplate === "string";
    return hasTemplate ? prompt.userTemplate : "[Compiled prompt - template not available]";
  };

  const lines = [];
  lines.push("# Improvement Cycle Report");
  lines.push("");
  lines.push("## Summary");
  lines.push("");
  lines.push("| Metric | Value |");
  lines.push("|--------|-------|");
  lines.push(`| Rounds | ${result.rounds.length} |`);
  lines.push(`| Termination | ${result.terminationReason} |`);
  lines.push(`| Total Cost | $${result.totalCost.toFixed(4)} |`);
  if (result.rounds.length > 0) {
    const first = result.rounds[0].report.summary.avgScore;
    const last = result.rounds[result.rounds.length - 1].report.summary.avgScore;
    lines.push(`| Initial Score | ${first.toFixed(1)} |`);
    lines.push(`| Final Score | ${last.toFixed(1)} |`);
    lines.push(`| Improvement | ${formatScoreDelta(last - first)} |`);
  }
  lines.push("");
  lines.push("## Score Progression");
  lines.push("");
  lines.push("| Round | Score | Delta | Cost |");
  lines.push("|-------|-------|-------|------|");
  for (const round of result.rounds) {
    const delta = formatScoreDelta(round.scoreDelta);
    lines.push(
      `| ${round.round} | ${round.report.summary.avgScore.toFixed(1)} | ${delta} | $${round.cost.total.toFixed(4)} |`
    );
  }
  lines.push("");
  if (includeRoundDetails) {
    lines.push("## Round Details");
    lines.push("");
    for (const round of result.rounds) {
      lines.push(`### Round ${round.round}`);
      lines.push("");
      lines.push(reportToMarkdown(round.report));
      lines.push("");
    }
  }
  if (showPromptEvolution && result.rounds.length > 0) {
    lines.push("## Prompt Evolution");
    lines.push("");
    lines.push("### Initial Prompt");
    lines.push("");
    lines.push("```");
    lines.push(templateOrPlaceholder(result.rounds[0].promptSnapshot));
    lines.push("```");
    lines.push("");
    lines.push("### Final Prompt");
    lines.push("");
    lines.push("```");
    lines.push(templateOrPlaceholder(result.finalPrompt));
    lines.push("```");
  }
  return lines.join("\n");
}
2314
/**
 * Renders an improvement-cycle result to Markdown and writes it synchronously.
 *
 * @param {object} result - Cycle result.
 * @param {string} filePath - Destination file path.
 * @param {object} [options] - Options forwarded to `cycleToMarkdown`.
 * @returns {void}
 */
function saveCycleMarkdown(result, filePath, options) {
  (0, import_node_fs5.writeFileSync)(filePath, cycleToMarkdown(result, options));
}
2318
+
2319
+ // src/improver/utils.ts
2320
+ var import_core5 = require("@agtlantis/core");
2321
/**
 * Builds a diff-like text for a suggestion: a two-line header, a blank line,
 * every current line prefixed with "- " and every suggested line with "+ ".
 * No line matching is performed; all old lines precede all new lines.
 *
 * @param {object} suggestion - Suggestion with `type`, `currentValue`, `suggestedValue`.
 * @returns {string} The diff text.
 */
function suggestionDiff(suggestion) {
  const removed = suggestion.currentValue.split("\n").map((line) => `- ${line}`);
  const added = suggestion.suggestedValue.split("\n").map((line) => `+ ${line}`);
  const header = [
    `--- ${suggestion.type} (current)`,
    `+++ ${suggestion.type} (suggested)`,
    ""
  ];
  return [...header, ...removed, ...added].join("\n");
}
2336
/**
 * Builds a plain-text preview of a suggestion: type, priority, reasoning,
 * expected improvement, then the full current and suggested values.
 *
 * @param {object} suggestion - Suggestion with `type`, `priority`, `reasoning`,
 *   `expectedImprovement`, `currentValue` and `suggestedValue`.
 * @returns {string} Multi-line preview text.
 */
function suggestionPreview(suggestion) {
  const { type, priority, reasoning, expectedImprovement, currentValue, suggestedValue } = suggestion;
  return [
    `=== Suggestion Preview ===`,
    `Type: ${type}`,
    `Priority: ${priority}`,
    ``,
    `Reasoning: ${reasoning}`,
    ``,
    `Expected Improvement: ${expectedImprovement}`,
    ``,
    `--- Current Value ---`,
    currentValue,
    ``,
    `--- Suggested Value ---`,
    suggestedValue
  ].join("\n");
}
2353
/**
 * Builds a one-line summary: `[PRIORITY] type: <reasoning passed through truncate(…, 60)>`.
 *
 * @param {object} suggestion - Suggestion with `priority`, `type` and `reasoning`.
 * @returns {string} The summary line.
 */
function suggestionSummary(suggestion) {
  const tag = "[" + suggestion.priority.toUpperCase() + "]";
  const shortReasoning = truncate(suggestion.reasoning, 60);
  return [tag, `${suggestion.type}:`, shortReasoning].join(" ");
}
2357
/**
 * Replaces the first match of `search` in `str` with `replacement`, inserting
 * the replacement literally.
 *
 * The replacement is supplied through a replacer function so that `$&`, `$1`,
 * `$$` and similar sequences in it are not interpreted as substitution patterns.
 *
 * @param {string} str - Text to modify.
 * @param {string|RegExp} search - Pattern handed to `String.prototype.replace`.
 * @param {string} replacement - Text inserted verbatim.
 * @returns {string} The resulting text.
 */
function safeReplace(str, search, replacement) {
  const insertVerbatim = () => replacement;
  return str.replace(search, insertVerbatim);
}
2360
/**
 * Increments one component of a plain `x.y.z` version string.
 *
 * Validation is strict: each of the three segments must consist of digits
 * only, so inputs such as "1.2.3-beta" or "1.2.3abc" are rejected rather than
 * having their suffix silently discarded.
 *
 * @param {string} version - Current version in `x.y.z` form.
 * @param {"major"|"minor"|"patch"} bump - Component to increment; lower
 *   components are reset to 0.
 * @returns {string} The bumped version.
 * @throws {EvalError} With code "SUGGESTION_APPLY_ERROR" when `version` is not
 *   `x.y.z` or `bump` is not a known bump kind.
 */
function bumpVersion(version, bump) {
  const segments = version.split(".");
  const isPlainSemver = segments.length === 3 && segments.every((segment) => /^\d+$/.test(segment));
  if (!isPlainSemver) {
    throw new EvalError(
      `Invalid version format: "${version}". Expected semver format (x.y.z)`,
      {
        code: "SUGGESTION_APPLY_ERROR" /* SUGGESTION_APPLY_ERROR */,
        context: { version, expectedFormat: "x.y.z" }
      }
    );
  }
  const [major, minor, patch] = segments.map((segment) => Number.parseInt(segment, 10));
  switch (bump) {
    case "major":
      return `${major + 1}.0.0`;
    case "minor":
      return `${major}.${minor + 1}.0`;
    case "patch":
      return `${major}.${minor}.${patch + 1}`;
    default:
      // Returning undefined here would end up as the prompt's new version.
      throw new EvalError(
        `Invalid bump type: "${bump}". Expected "major", "minor" or "patch"`,
        {
          code: "SUGGESTION_APPLY_ERROR" /* SUGGESTION_APPLY_ERROR */,
          context: { version, bump }
        }
      );
  }
}
2381
/**
 * Applies every approved suggestion to a prompt, in order.
 *
 * @param {object} currentPrompt - Prompt to improve; it is not mutated.
 * @param {object[]} suggestions - Suggestions; only those with a truthy
 *   `approved` flag are considered.
 * @param {object} [options]
 * @param {"major"|"minor"|"patch"} [options.bumpVersion] - When set and at
 *   least one suggestion was applied, the version is bumped accordingly.
 * @returns {{prompt: object, appliedCount: number,
 *   skipped: Array<{suggestion: object, reason: string}>}}
 *   When nothing is approved, `prompt` is the original object itself.
 */
function applyPromptSuggestions(currentPrompt, suggestions, options) {
  const approved = suggestions.filter((s) => s.approved);
  if (approved.length === 0) {
    return { prompt: currentPrompt, appliedCount: 0, skipped: [] };
  }

  const skipped = [];
  let appliedCount = 0;
  let workingPrompt = { ...currentPrompt };
  approved.forEach((suggestion) => {
    const outcome = applySingleSuggestion(workingPrompt, suggestion);
    if (!outcome.success) {
      skipped.push({ suggestion, reason: outcome.reason });
      return;
    }
    workingPrompt = outcome.prompt;
    appliedCount += 1;
  });

  const shouldBump = options?.bumpVersion && appliedCount > 0;
  if (shouldBump) {
    // The bump is always computed from the original prompt's version.
    const version = bumpVersion(currentPrompt.version, options.bumpVersion);
    workingPrompt = { ...workingPrompt, version };
  }
  return { prompt: workingPrompt, appliedCount, skipped };
}
2414
// Prompt fields that a "parameters" suggestion must never rewrite:
// applySingleSuggestion() skips these keys when searching for currentValue.
+ var AGENT_PROMPT_CORE_FIELDS = [
2415
+ "id",
2416
+ "version",
2417
+ "system",
2418
+ "renderUserPrompt",
2419
+ "userTemplate"
2420
+ ];
2421
/**
 * Applies one suggestion to a prompt without mutating the prompt.
 *
 * - "system_prompt": replaces the first occurrence of `currentValue` in `prompt.system`.
 * - "user_prompt": replaces it in `prompt.userTemplate` and recompiles `renderUserPrompt`.
 * - "parameters": replaces it in the first non-core string field that contains it.
 *
 * @param {object} prompt - Prompt to update.
 * @param {object} suggestion - Suggestion with `type`, `currentValue`, `suggestedValue`.
 * @returns {{success: true, prompt: object} | {success: false, reason: string}}
 *   The updated prompt copy, or the reason the suggestion could not be applied.
 * @throws {EvalError} With code "SUGGESTION_APPLY_ERROR" when a "user_prompt"
 *   suggestion targets a prompt without a string `userTemplate`.
 */
function applySingleSuggestion(prompt, suggestion) {
  const kind = suggestion.type;

  if (kind === "system_prompt") {
    if (prompt.system.includes(suggestion.currentValue)) {
      const system = safeReplace(prompt.system, suggestion.currentValue, suggestion.suggestedValue);
      return { success: true, prompt: { ...prompt, system } };
    }
    return {
      success: false,
      reason: `currentValue not found in system prompt: "${truncate(suggestion.currentValue, 50)}"`
    };
  }

  if (kind === "user_prompt") {
    const userTemplate = prompt.userTemplate;
    if (typeof userTemplate !== "string") {
      throw new EvalError(
        `Cannot apply user_prompt suggestion: prompt does not have a userTemplate field. The renderUserPrompt is a function and cannot be modified directly.`,
        {
          code: "SUGGESTION_APPLY_ERROR" /* SUGGESTION_APPLY_ERROR */,
          context: {
            suggestionType: suggestion.type,
            hasUserTemplate: "userTemplate" in prompt
          }
        }
      );
    }
    if (!userTemplate.includes(suggestion.currentValue)) {
      return {
        success: false,
        reason: `currentValue not found in userTemplate: "${truncate(suggestion.currentValue, 50)}"`
      };
    }
    const newTemplate = safeReplace(userTemplate, suggestion.currentValue, suggestion.suggestedValue);
    // The render function is derived from the template, so rebuild it too.
    const renderUserPrompt = (0, import_core5.compileTemplate)(newTemplate, prompt.id);
    return {
      success: true,
      prompt: { ...prompt, userTemplate: newTemplate, renderUserPrompt }
    };
  }

  if (kind === "parameters") {
    const updatedPrompt = { ...prompt };
    // Only the first matching custom string field is rewritten.
    const match = Object.entries(updatedPrompt).find(
      ([key, value]) => !AGENT_PROMPT_CORE_FIELDS.includes(key) && typeof value === "string" && value.includes(suggestion.currentValue)
    );
    if (!match) {
      return {
        success: false,
        reason: `currentValue not found in any parameter field: "${truncate(suggestion.currentValue, 50)}"`
      };
    }
    const [field, text] = match;
    updatedPrompt[field] = safeReplace(text, suggestion.currentValue, suggestion.suggestedValue);
    return { success: true, prompt: updatedPrompt };
  }

  return {
    success: false,
    reason: `Unknown suggestion type: ${suggestion.type}`
  };
}
2515
+
2516
+ // src/improver/llm-improver.ts
2517
+ var import_ai2 = require("ai");
2518
+ var import_zod2 = require("zod");
2519
+
2520
+ // src/improver/prompts/default.ts
2521
// Default prompt used by createImprover() when no custom prompt is supplied.
// - `system` instructs the model to act as a prompt engineer and to answer
//   with JSON only: a `suggestions` array whose items carry type, priority,
//   currentValue, suggestedValue, reasoning and expectedImprovement.
// - `renderUserPrompt(ctx)` builds the user message from the agent's system
//   prompt, pass/fail counts, aggregated latency/token metrics and the
//   details of failed or low-scoring cases (see buildFailedCaseDetails).
// NOTE: both template literals are runtime text sent to the model; changing
// their wording or whitespace changes the prompt.
+ var defaultImproverPrompt = {
2522
+ id: "default-improver",
2523
+ version: "2.0.0",
2524
+ system: `You are an expert prompt engineer specializing in optimizing AI Agent prompts.
2525
+
2526
+ Your role is to analyze test results and evaluation feedback to propose targeted improvements.
2527
+
2528
+ ## Improvement Principles
2529
+
2530
+ 1. **Focus on Impact**: Prioritize changes that address the lowest-scoring criteria
2531
+ - Target specific failure patterns, not general improvements
2532
+ - One well-crafted change is better than many superficial ones
2533
+
2534
+ 2. **Be Specific and Actionable**: Provide concrete changes, not vague suggestions
2535
+ - Show exact text to add, modify, or remove
2536
+ - Explain the mechanism by which the change will help
2537
+
2538
+ 3. **Consider Trade-offs**: Evaluate side effects of each change
2539
+ - Will this fix break other test cases?
2540
+ - Does it increase prompt length/cost significantly?
2541
+ - Could it introduce new failure modes?
2542
+
2543
+ 4. **Maintain Prompt Quality**: Preserve clarity and structure
2544
+ - Keep prompts readable and maintainable
2545
+ - Avoid over-engineering or excessive constraints
2546
+ - Ensure changes align with the agent's core purpose
2547
+
2548
+ ## Suggestion Priority Levels
2549
+ - **high**: Critical issues causing test failures, should be addressed immediately
2550
+ - **medium**: Issues affecting quality scores, recommended for next iteration
2551
+ - **low**: Minor optimizations, nice-to-have improvements
2552
+
2553
+ ## Response Format
2554
+
2555
+ You MUST respond with valid JSON only. No additional text outside the JSON structure.
2556
+
2557
+ {
2558
+ "suggestions": [
2559
+ {
2560
+ "type": "system_prompt" | "user_prompt" | "parameters",
2561
+ "priority": "high" | "medium" | "low",
2562
+ "currentValue": "The specific text or value being changed",
2563
+ "suggestedValue": "The proposed replacement text or value",
2564
+ "reasoning": "Why this change addresses the identified issue",
2565
+ "expectedImprovement": "Predicted impact on scores and behavior"
2566
+ }
2567
+ ]
2568
+ }`,
2569
+ renderUserPrompt: (ctx) => {
2570
+ const failedDetails = buildFailedCaseDetails(ctx.evaluatedResults);
2571
+ return `
2572
+ ## Current Agent Prompt
2573
+
2574
+ ### System Prompt
2575
+ \`\`\`
2576
+ ${ctx.agentPrompt.system}
2577
+ \`\`\`
2578
+
2579
+ ## Test Results Summary
2580
+ - Total tests: ${ctx.evaluatedResults.length}
2581
+ - Passed: ${ctx.evaluatedResults.filter((r) => r.passed).length}
2582
+ - Failed: ${ctx.evaluatedResults.filter((r) => !r.passed).length}
2583
+
2584
+ ## Performance Metrics
2585
+ - Average latency: ${ctx.aggregatedMetrics.avgLatencyMs}ms
2586
+ - Total tokens used: ${ctx.aggregatedMetrics.totalTokens}
2587
+
2588
+ ## Failed/Low-Score Cases Details
2589
+ ${failedDetails}
2590
+
2591
+ Based on the above results, please propose specific prompt improvements.`.trim();
2592
+ }
2593
+ };
2594
/**
 * Describes every failed or low-scoring result (not passed, or score below
 * 70) for inclusion in the improver's user prompt.
 *
 * @param {object[]} results - Evaluated results (`testCase`, `output`,
 *   `overallScore`, `passed`, `verdicts`).
 * @returns {string} One Markdown section per selected result, or a fixed
 *   notice when no result needs attention.
 */
function buildFailedCaseDetails(results) {
  const needsAttention = (r) => !r.passed || r.overallScore < 70;
  const selected = results.filter(needsAttention);
  if (selected.length === 0) {
    return "(None - all tests passed with acceptable scores)";
  }
  const sections = selected.map((r) => {
    const verdictLines = r.verdicts.map((v) => `- ${v.criterionId}: ${v.score}/100 - ${v.reasoning}`).join("\n");
    // Each section starts with an empty line, so sections are separated by a blank line.
    return [
      "",
      `### ${r.testCase.id ?? "unnamed"} (Score: ${r.overallScore})`,
      `**Input:** ${truncate(JSON.stringify(r.testCase.input), 200)}`,
      `**Output:** ${truncate(JSON.stringify(r.output), 200)}`,
      "**Evaluation:**",
      verdictLines
    ].join("\n");
  });
  return sections.join("\n");
}
2608
+
2609
+ // src/improver/llm-improver.ts
2610
/**
 * Converts an LLM usage record into the eval token-usage shape, replacing
 * missing (null/undefined) counters with 0.
 *
 * @param {{inputTokens?: number, outputTokens?: number, totalTokens?: number}} usage
 * @returns {{inputTokens: number, outputTokens: number, totalTokens: number}}
 */
function toEvalTokenUsage2(usage) {
  const orZero = (count) => count ?? 0;
  return {
    inputTokens: orZero(usage.inputTokens),
    outputTokens: orZero(usage.outputTokens),
    totalTokens: orZero(usage.totalTokens)
  };
}
2617
// Zod schema for the improver's structured LLM response: an object with a
// `suggestions` array. Each item names the suggestion `type` and `priority`
// (fixed enums) and carries four free-text fields. createImprover() passes
// this schema to the LLM call as the expected output shape.
+ var ImproverResponseSchema = import_zod2.z.object({
2618
+ suggestions: import_zod2.z.array(
2619
+ import_zod2.z.object({
2620
+ type: import_zod2.z.enum(["system_prompt", "user_prompt", "parameters"]),
2621
+ priority: import_zod2.z.enum(["high", "medium", "low"]),
2622
+ currentValue: import_zod2.z.string(),
2623
+ suggestedValue: import_zod2.z.string(),
2624
+ reasoning: import_zod2.z.string(),
2625
+ expectedImprovement: import_zod2.z.string()
2626
+ })
2627
+ )
2628
+ });
2629
/**
 * Aggregates latency and token usage over a set of evaluated results.
 *
 * Results without `metrics.tokenUsage` contribute 0 tokens instead of
 * aborting the aggregation.
 *
 * @param {object[]} results - Results with `metrics.latencyMs` and optional
 *   `metrics.tokenUsage.totalTokens`.
 * @returns {{avgLatencyMs: number, totalTokens: number}} Rounded mean latency
 *   and summed tokens; both 0 for an empty input.
 */
function aggregateMetrics(results) {
  if (results.length === 0) {
    return {
      avgLatencyMs: 0,
      totalTokens: 0
    };
  }
  let totalLatency = 0;
  let totalTokens = 0;
  for (const { metrics } of results) {
    totalLatency += metrics.latencyMs;
    // Token usage is treated as optional elsewhere (calculateComponentCost).
    totalTokens += metrics.tokenUsage?.totalTokens ?? 0;
  }
  return {
    avgLatencyMs: Math.round(totalLatency / results.length),
    totalTokens
  };
}
2647
/**
 * Creates an LLM-backed improver.
 *
 * @param {object} config - `provider` (exposes `simpleExecution`), optional
 *   `prompt` (defaults to `defaultImproverPrompt`) and optional `model`.
 * @returns {{improve: Function}} Object whose `improve(agentPrompt, results)`
 *   resolves to `{ suggestions, metadata }`.
 */
function createImprover(config) {
  const { provider, prompt = defaultImproverPrompt, model } = config;

  // Runs one structured generation and unwraps the execution result.
  const requestSuggestions = async (messages) => {
    const execution = provider.simpleExecution(async (session) => {
      const { output } = await session.generateText({
        messages,
        output: import_ai2.Output.object({ schema: ImproverResponseSchema })
      });
      return output;
    });
    const outcome = await execution.result();
    if (outcome.status === "succeeded") {
      return {
        response: outcome.value,
        llmUsage: outcome.summary.totalLLMUsage
      };
    }
    if (outcome.status === "failed") {
      throw outcome.error;
    }
    throw new Error("Execution was canceled");
  };

  return {
    async improve(agentPrompt, results) {
      const context = {
        agentPrompt,
        evaluatedResults: results,
        aggregatedMetrics: aggregateMetrics(results)
      };
      const messages = [
        { role: "system", content: prompt.system },
        { role: "user", content: prompt.renderUserPrompt(context) }
      ];

      let call;
      try {
        call = await requestSuggestions(messages);
      } catch (cause) {
        // Any failure of the LLM round-trip is reported as LLM_API_ERROR.
        throw EvalError.from(cause, "LLM_API_ERROR" /* LLM_API_ERROR */, {
          promptId: prompt.id,
          promptVersion: prompt.version
        });
      }

      const { response, llmUsage } = call;
      // Review fields start unset; they are filled in by a later decision step.
      const suggestions = response.suggestions.map((s) => ({
        ...s,
        approved: void 0,
        modified: void 0
      }));
      const metadata = llmUsage ? { tokenUsage: toEvalTokenUsage2(llmUsage), model } : void 0;
      return { suggestions, metadata };
    }
  };
}
2692
+
2693
+ // src/index.ts
2694
+ var import_testing = require("@agtlantis/core/testing");
2695
+
2696
+ // src/testing/mock-agent.ts
2697
/**
 * Builds a configurable fake agent for tests.
 *
 * When `executeFn` is supplied it fully replaces the default behavior, so
 * `delay`, `shouldError` and `response` are not consulted.
 *
 * @param {object} [config] - Optional overrides: `name`, `description`,
 *   `response`, `tokenUsage`, `delay` (ms), `shouldError`, `errorMessage`,
 *   `executeFn`.
 * @returns {object} Agent with `config`, `prompt` and an async `execute`.
 */
function createMockAgent(config = {}) {
  const {
    name = "MockAgent",
    description = "A mock agent for testing",
    response = {},
    tokenUsage = { inputTokens: 10, outputTokens: 20, totalTokens: 30 },
    delay = 0,
    shouldError = false,
    errorMessage = "Mock agent execution failed",
    executeFn
  } = config;

  const pause = (ms) => new Promise((resolve2) => setTimeout(resolve2, ms));

  async function execute(input) {
    if (executeFn) {
      return executeFn(input);
    }
    if (delay > 0) {
      await pause(delay);
    }
    if (shouldError) {
      throw new Error(errorMessage);
    }
    return { result: response, metadata: { tokenUsage } };
  }

  const prompt = {
    id: "mock-prompt",
    version: "1.0.0",
    system: "You are a mock agent",
    renderUserPrompt: (input) => JSON.stringify(input)
  };

  return { config: { name, description }, prompt, execute };
}
2733
/**
 * Builds a configurable fake judge for tests.
 *
 * @param {object} [config] - Optional overrides: `score`, `passed`,
 *   `verdicts`, `metadata`, `shouldError`, `errorMessage`, `evaluateFn`
 *   (replaces the default behavior entirely when supplied).
 * @returns {{evaluate: Function}} Judge whose async `evaluate(context)`
 *   resolves to `{ verdicts, overallScore, passed, metadata }`.
 */
function createMockJudge(config = {}) {
  const {
    score = 80,
    passed = true,
    verdicts = [
      { criterionId: "default", score: 80, reasoning: "Default verdict", passed: true }
    ],
    metadata,
    shouldError = false,
    errorMessage = "Mock judge evaluation failed",
    evaluateFn
  } = config;

  async function evaluate(context) {
    if (evaluateFn) {
      return evaluateFn(context);
    }
    if (shouldError) {
      throw new Error(errorMessage);
    }
    return { verdicts, overallScore: score, passed, metadata };
  }

  return { evaluate };
}
2762
/**
 * Builds a configurable fake improver for tests.
 *
 * @param {object} [config] - Optional overrides: `suggestions`,
 *   `shouldError`, `errorMessage`, `improveFn` (replaces the default
 *   behavior entirely when supplied).
 * @returns {{improve: Function}} Improver whose async
 *   `improve(agentPrompt, results)` resolves to `{ suggestions }`.
 */
function createMockImprover(config = {}) {
  const {
    suggestions = [],
    shouldError = false,
    errorMessage = "Mock improver failed",
    improveFn
  } = config;

  async function improve(agentPrompt, results) {
    if (improveFn) {
      return improveFn(agentPrompt, results);
    }
    if (shouldError) {
      throw new Error(errorMessage);
    }
    return { suggestions };
  }

  return { improve };
}
2781
+
2782
+ // src/index.ts
2783
+ var import_core9 = require("@agtlantis/core");
2784
+ var import_core10 = require("@agtlantis/core");
2785
+
2786
+ // src/cli/config/types.ts
2787
/**
 * Identity helper for authoring an eval config; the argument is returned
 * unchanged (same reference, no validation).
 *
 * @param {object} config - The configuration object.
 * @returns {object} `config` itself.
 */
function defineConfig(config) {
  const sameConfig = config;
  return sameConfig;
}
2790
+
2791
+ // src/cli/config/loader.ts
2792
+ var import_node_fs6 = require("fs");
2793
+ var import_node_path3 = require("path");
2794
+ var import_node_url = require("url");
2795
+ var import_bundle_require = require("bundle-require");
2796
+ var import_fast_glob = __toESM(require("fast-glob"), 1);
2797
+
2798
+ // src/cli/config/schema.ts
2799
+ var import_zod3 = require("zod");
2800
// LLM connection settings; only the "openai" and "gemini" providers are accepted.
var llmConfigSchema = import_zod3.z.object({
  provider: import_zod3.z.enum(["openai", "gemini"], {
    errorMap: () => ({
      message: "provider must be 'openai' or 'gemini'"
    })
  }),
  apiKey: import_zod3.z.string().optional(),
  defaultModel: import_zod3.z.string().optional(),
  reasoningEffort: import_zod3.z.enum(["minimal", "low", "medium", "high"]).optional(),
  defaultResponseFormat: import_zod3.z.object({
    type: import_zod3.z.enum(["json_object", "text"])
  }).optional()
});

// A single judging criterion; `weight` must be positive when present.
var criterionSchema = import_zod3.z.object({
  id: import_zod3.z.string().min(1, "Criterion id is required"),
  name: import_zod3.z.string().min(1, "Criterion name is required"),
  description: import_zod3.z.string().min(1, "Criterion description is required"),
  weight: import_zod3.z.number().positive().optional(),
  validator: import_zod3.z.function().optional()
});

// Judge settings: at least one criterion, pass threshold within 0..100.
var judgeConfigSchema = import_zod3.z.object({
  llm: llmConfigSchema.optional(),
  criteria: import_zod3.z.array(criterionSchema).min(1, "At least one criterion is required"),
  passThreshold: import_zod3.z.number().min(0).max(100).optional(),
  prompt: import_zod3.z.any().optional()
});

// Optional improver settings.
var improverConfigSchema = import_zod3.z.object({
  llm: llmConfigSchema.optional(),
  prompt: import_zod3.z.any().optional()
}).optional();

// Optional report output settings.
var outputConfigSchema = import_zod3.z.object({
  dir: import_zod3.z.string().optional(),
  filename: import_zod3.z.string().optional(),
  verbose: import_zod3.z.boolean().optional()
}).optional();

// Optional run settings; numeric fields are positive integers.
var runConfigSchema = import_zod3.z.object({
  concurrency: import_zod3.z.number().int().positive().optional(),
  iterations: import_zod3.z.number().int().positive().optional(),
  stopOnFirstFailure: import_zod3.z.boolean().optional()
}).optional();

// Multi-turn termination conditions, distinguished by their literal `type`.
var maxTurnsConditionSchema = import_zod3.z.object({
  type: import_zod3.z.literal("maxTurns"),
  count: import_zod3.z.number().int().positive()
});
var fieldSetConditionSchema = import_zod3.z.object({
  type: import_zod3.z.literal("fieldSet"),
  fieldPath: import_zod3.z.string().min(1)
});
var fieldValueConditionSchema = import_zod3.z.object({
  type: import_zod3.z.literal("fieldValue"),
  fieldPath: import_zod3.z.string().min(1),
  expectedValue: import_zod3.z.unknown()
});
var customConditionSchema = import_zod3.z.object({
  type: import_zod3.z.literal("custom"),
  check: import_zod3.z.function(),
  description: import_zod3.z.string().optional()
});
var terminationConditionSchema = import_zod3.z.union([
  maxTurnsConditionSchema,
  fieldSetConditionSchema,
  fieldValueConditionSchema,
  customConditionSchema
]);

// A follow-up input used in multi-turn test cases.
var followUpInputSchema = import_zod3.z.object({
  input: import_zod3.z.unknown(),
  description: import_zod3.z.string().optional(),
  turns: import_zod3.z.number().optional()
});

// Multi-turn settings; at least one termination condition is mandatory.
var multiTurnConfigSchema = import_zod3.z.object({
  followUpInputs: import_zod3.z.array(followUpInputSchema).optional(),
  terminateWhen: import_zod3.z.array(terminationConditionSchema).min(1, "At least one termination condition is required"),
  maxTurns: import_zod3.z.number().int().positive().optional(),
  onConditionMet: import_zod3.z.enum(["pass", "fail"]).optional(),
  onMaxTurnsReached: import_zod3.z.enum(["pass", "fail"]).optional()
});

// An inline test case.
var testCaseSchema = import_zod3.z.object({
  id: import_zod3.z.string().optional(),
  input: import_zod3.z.unknown(),
  tags: import_zod3.z.array(import_zod3.z.string()).optional(),
  description: import_zod3.z.string().optional(),
  expectedOutput: import_zod3.z.unknown().optional(),
  files: import_zod3.z.array(import_zod3.z.any()).optional(),
  multiTurn: multiTurnConfigSchema.optional()
});

// Structural shape of an agent: descriptive config, prompt, and execute function.
var agentSchema = import_zod3.z.object({
  config: import_zod3.z.object({
    name: import_zod3.z.string(),
    description: import_zod3.z.string().optional()
  }),
  prompt: import_zod3.z.object({
    id: import_zod3.z.string(),
    version: import_zod3.z.string(),
    system: import_zod3.z.string(),
    renderUserPrompt: import_zod3.z.function()
  }),
  execute: import_zod3.z.function()
});

// Top-level eval config. The refinement requires a test source: a non-empty
// `testCases` array or a non-empty `include` pattern list.
var evalConfigSchema = import_zod3.z.object({
  name: import_zod3.z.string().optional(),
  agentDescription: import_zod3.z.string().optional(),
  agent: agentSchema,
  llm: llmConfigSchema,
  judge: judgeConfigSchema,
  improver: improverConfigSchema,
  testCases: import_zod3.z.array(testCaseSchema).optional(),
  output: outputConfigSchema,
  run: runConfigSchema,
  include: import_zod3.z.array(import_zod3.z.string().min(1, "Include pattern cannot be empty")).min(1, "Include array must have at least one pattern").optional(),
  agents: import_zod3.z.record(import_zod3.z.string(), agentSchema).optional()
}).refine(
  (data) => {
    const inlineCount = data.testCases?.length ?? 0;
    const patternCount = data.include?.length ?? 0;
    return inlineCount > 0 || patternCount > 0;
  },
  {
    message: "Either testCases or include must be provided. Use testCases for inline TypeScript tests, or include for YAML file discovery.",
    path: ["testCases"]
  }
);
2921
+
2922
+ // src/cli/config/loader.ts
2923
/**
 * Error raised for configuration problems in the CLI config loader.
 * Carries a machine-readable `code` and an optional `context` payload.
 */
var ConfigError = class extends Error {
  /**
   * @param {string} message - Human-readable description.
   * @param {string} code - Machine-readable error code.
   * @param {object} [context] - Extra details about the failure.
   */
  constructor(message, code, context) {
    super(message);
    Object.assign(this, { code, context, name: "ConfigError" });
  }
};
2931
/**
 * Resolves eval files from glob patterns.
 *
 * Patterns come from `options.include`, falling back to `config.include`.
 *
 * @param {object} config - Eval config; only `include` is read.
 * @param {object} [options] - Optional `include`, `cwd` (defaults to
 *   `process.cwd()`) and `ignore` (defaults to `["**\/node_modules/**"]`).
 * @returns {Promise<string[]>} Matched paths, sorted; the glob is run with
 *   `absolute: true`.
 * @throws {ConfigError} `CONFIG_NO_INCLUDE_PATTERN` when no patterns exist.
 */
async function discoverEvalFiles(config, options = {}) {
  const patterns = options.include ?? config.include;
  const hasPatterns = Boolean(patterns) && patterns.length !== 0;
  if (!hasPatterns) {
    throw new ConfigError(
      `No include patterns specified.

Add an include field to your config:
include: ['evals/**/*.eval.yaml']

Or use the --include CLI option:
npx agent-eval --include "evals/**/*.eval.yaml"`,
      "CONFIG_NO_INCLUDE_PATTERN"
    );
  }
  const globOptions = {
    absolute: true,
    cwd: options.cwd ?? process.cwd(),
    ignore: options.ignore ?? ["**/node_modules/**"],
    onlyFiles: true,
    dot: false,
    followSymbolicLinks: false,
    unique: true,
    suppressErrors: false
  };
  const matches = await (0, import_fast_glob.default)(patterns, globOptions);
  return matches.sort();
}
2959
+
2960
+ // src/improvement-cycle/types.ts
2961
/** @returns {boolean} True when `condition.type` is "targetScore". */
function isTargetScoreCondition(condition) {
  const { type } = condition;
  return type === "targetScore";
}

/** @returns {boolean} True when `condition.type` is "maxRounds". */
function isMaxRoundsCondition(condition) {
  const { type } = condition;
  return type === "maxRounds";
}

/** @returns {boolean} True when `condition.type` is "noImprovement". */
function isNoImprovementCondition(condition) {
  const { type } = condition;
  return type === "noImprovement";
}

/** @returns {boolean} True when `condition.type` is "maxCost". */
function isMaxCostCondition(condition) {
  const { type } = condition;
  return type === "maxCost";
}

/** @returns {boolean} True when `condition.type` is "custom". */
function isCustomCycleCondition(condition) {
  const { type } = condition;
  return type === "custom";
}

/** @returns {boolean} True only when `result.terminated` is exactly `true`. */
function isCycleTerminated(result) {
  const { terminated } = result;
  return terminated === true;
}
2979
+
2980
+ // src/improvement-cycle/conditions.ts
2981
/**
 * Creates a "targetScore" termination condition.
 *
 * @param {number} threshold - Finite score in the inclusive range 0..100.
 * @returns {{type: "targetScore", threshold: number}} The condition.
 * @throws {EvalError} INVALID_CONFIG when the threshold is not finite or is
 *   out of range.
 */
function targetScore(threshold) {
  let problem = null;
  if (!Number.isFinite(threshold)) {
    problem = "threshold must be a finite number";
  } else if (threshold < 0 || threshold > 100) {
    problem = "threshold must be between 0 and 100";
  }
  if (problem !== null) {
    throw new EvalError(problem, {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { threshold }
    });
  }
  return { type: "targetScore", threshold };
}
2996
/**
 * Creates a "maxRounds" termination condition.
 *
 * @param {number} count - Positive integer round limit.
 * @returns {{type: "maxRounds", count: number}} The condition.
 * @throws {EvalError} INVALID_CONFIG when `count` is not an integer >= 1.
 */
function maxRounds(count) {
  const isValid = Number.isInteger(count) && count >= 1;
  if (isValid) {
    return { type: "maxRounds", count };
  }
  throw new EvalError("count must be a positive integer", {
    code: "INVALID_CONFIG" /* INVALID_CONFIG */,
    context: { count }
  });
}
3005
/**
 * Creates a "noImprovement" termination condition.
 *
 * @param {number} consecutiveRounds - Positive integer number of rounds.
 * @param {number} [minDelta] - Optional non-negative finite delta; the key is
 *   omitted from the result when not provided.
 * @returns {object} `{ type: "noImprovement", consecutiveRounds[, minDelta] }`.
 * @throws {EvalError} INVALID_CONFIG on invalid arguments.
 */
function noImprovement(consecutiveRounds, minDelta) {
  const roundsValid = Number.isInteger(consecutiveRounds) && consecutiveRounds >= 1;
  if (!roundsValid) {
    throw new EvalError("consecutiveRounds must be a positive integer", {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { consecutiveRounds }
    });
  }
  const hasDelta = minDelta !== void 0;
  if (hasDelta && (!Number.isFinite(minDelta) || minDelta < 0)) {
    throw new EvalError("minDelta must be a non-negative finite number", {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { minDelta }
    });
  }
  const condition = { type: "noImprovement", consecutiveRounds };
  if (hasDelta) {
    condition.minDelta = minDelta;
  }
  return condition;
}
3024
/**
 * Creates a "maxCost" termination condition.
 *
 * @param {number} maxUSD - Positive finite budget in USD.
 * @returns {{type: "maxCost", maxUSD: number}} The condition.
 * @throws {EvalError} INVALID_CONFIG when `maxUSD` is not finite or is <= 0.
 */
function maxCost(maxUSD) {
  const isValid = Number.isFinite(maxUSD) && maxUSD > 0;
  if (isValid) {
    return { type: "maxCost", maxUSD };
  }
  throw new EvalError("maxUSD must be a positive finite number", {
    code: "INVALID_CONFIG" /* INVALID_CONFIG */,
    context: { maxUSD }
  });
}
3033
/**
 * Creates a "custom" termination condition around a user-supplied check.
 *
 * @param {Function} check - Predicate invoked with the cycle context.
 * @param {string} [description] - Optional label; the key is omitted from
 *   the result when not provided.
 * @returns {object} `{ type: "custom", check[, description] }`.
 */
function customCondition(check, description) {
  const condition = { type: "custom", check };
  if (description !== void 0) {
    condition.description = description;
  }
  return condition;
}
3040
/**
 * Combines cycle conditions into one custom condition whose check is built
 * by `createAndCheck`.
 *
 * With no conditions the resulting check always returns false.
 *
 * @param {...object} conditions - Conditions to combine.
 * @returns {object} A `custom` condition with a composite description.
 */
function and2(...conditions) {
  const isEmpty = conditions.length === 0;
  return {
    type: "custom",
    check: isEmpty ? () => false : createAndCheck(conditions, checkCycleCondition),
    description: formatCompositeDescription("and", isEmpty ? [] : conditions)
  };
}
3054
/**
 * Combines cycle conditions into one custom condition whose check is built
 * by `createOrCheck`.
 *
 * With no conditions the resulting check always returns false.
 *
 * @param {...object} conditions - Conditions to combine.
 * @returns {object} A `custom` condition with a composite description.
 */
function or2(...conditions) {
  const isEmpty = conditions.length === 0;
  return {
    type: "custom",
    check: isEmpty ? () => false : createOrCheck(conditions, checkCycleCondition),
    description: formatCompositeDescription("or", isEmpty ? [] : conditions)
  };
}
3068
/**
 * Wraps a cycle condition in a custom condition whose check is built by
 * `createNotCheck`.
 *
 * @param {object} condition - Condition to wrap.
 * @returns {object} A `custom` condition described as `not(<type>)`.
 */
function not2(condition) {
  const check = createNotCheck(condition, checkCycleCondition);
  const description = `not(${condition.type})`;
  return { type: "custom", check, description };
}
3075
/**
 * Evaluates a "targetScore" condition against the latest score.
 *
 * @param {{threshold: number}} condition - The condition.
 * @param {{latestScore: number}} ctx - Cycle context.
 * @returns {object} Terminated (with `matchedCondition`) when
 *   `latestScore >= threshold`; otherwise a not-terminated result.
 */
function checkTargetScore(condition, ctx) {
  const reached = ctx.latestScore >= condition.threshold;
  if (!reached) {
    return {
      terminated: false,
      reason: `Score ${ctx.latestScore} below target ${condition.threshold}`
    };
  }
  return {
    terminated: true,
    matchedCondition: condition,
    reason: `Target score ${condition.threshold} reached (current: ${ctx.latestScore})`
  };
}
3088
/**
 * Evaluates a "maxRounds" condition against the current round number.
 *
 * @param {{count: number}} condition - The condition.
 * @param {{currentRound: number}} ctx - Cycle context.
 * @returns {object} Terminated (with `matchedCondition`) when
 *   `currentRound >= count`; otherwise a progress message.
 */
function checkMaxRounds(condition, ctx) {
  const limitReached = ctx.currentRound >= condition.count;
  if (!limitReached) {
    return {
      terminated: false,
      reason: `Round ${ctx.currentRound} of ${condition.count}`
    };
  }
  return {
    terminated: true,
    matchedCondition: condition,
    reason: `Maximum rounds reached (${condition.count})`
  };
}
3101
/**
 * Evaluates a "noImprovement" condition.
 *
 * Walks `ctx.history` from the newest round backwards and counts rounds
 * whose `scoreDelta` is at most `minDelta` (default 0). Counting stops at
 * the first round that improved or whose delta is `null`.
 *
 * @param {{consecutiveRounds: number, minDelta?: number}} condition
 * @param {{history: Array<{scoreDelta: (number|null)}>}} ctx - Cycle context.
 * @returns {object} Terminated when the streak reaches `consecutiveRounds`.
 */
function checkNoImprovement(condition, ctx) {
  const { consecutiveRounds, minDelta = 0 } = condition;
  const { history } = ctx;

  let streak = 0;
  let index = history.length - 1;
  while (index >= 0) {
    const delta = history[index].scoreDelta;
    if (delta === null || !(delta <= minDelta)) {
      break;
    }
    streak += 1;
    index -= 1;
  }

  if (streak < consecutiveRounds) {
    const roundWord = streak === 1 ? "round" : "rounds";
    return {
      terminated: false,
      reason: `${streak} ${roundWord} without improvement (need ${consecutiveRounds})`
    };
  }
  const plural = streak === 1 ? "" : "s";
  return {
    terminated: true,
    matchedCondition: condition,
    reason: `No improvement for ${streak} consecutive round${plural}`
  };
}
3127
/**
 * Evaluates a "maxCost" condition against the accumulated cost.
 *
 * @param {{maxUSD: number}} condition - The condition.
 * @param {{totalCost: number}} ctx - Cycle context.
 * @returns {object} Terminated (with `matchedCondition`) when
 *   `totalCost >= maxUSD`; amounts in the reason use two decimals.
 */
function checkMaxCost(condition, ctx) {
  const overBudget = ctx.totalCost >= condition.maxUSD;
  if (!overBudget) {
    return {
      terminated: false,
      reason: `Cost $${ctx.totalCost.toFixed(2)} under limit $${condition.maxUSD.toFixed(2)}`
    };
  }
  return {
    terminated: true,
    matchedCondition: condition,
    reason: `Cost limit exceeded ($${ctx.totalCost.toFixed(2)} >= $${condition.maxUSD.toFixed(2)})`
  };
}
3140
/**
 * Evaluates a "custom" condition by awaiting its `check(ctx)`.
 *
 * A check that throws or rejects does not terminate the cycle; the failure
 * message is reported in `reason` instead.
 *
 * @param {{check: Function, description?: string}} condition
 * @param {object} ctx - Cycle context passed to the check.
 * @returns {Promise<object>} Termination result.
 */
async function checkCustomCondition(condition, ctx) {
  const label = condition.description ?? "Custom condition";

  let shouldTerminate;
  try {
    shouldTerminate = await condition.check(ctx);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return {
      terminated: false,
      reason: `${label} check failed: ${message}`
    };
  }

  return shouldTerminate
    ? { terminated: true, matchedCondition: condition, reason: `${label} met` }
    : { terminated: false, reason: `${label} not met` };
}
3163
/**
 * Dispatches a cycle condition to the checker that matches its `type`.
 *
 * @param {object} condition - A termination condition.
 * @param {object} context - Cycle context forwarded to the checker.
 * @returns {Promise<object>} The checker's termination result.
 * @throws {EvalError} UNKNOWN_ERROR (as a rejection) for an unrecognized type.
 */
async function checkCycleCondition(condition, context) {
  switch (condition.type) {
    case "targetScore":
      return checkTargetScore(condition, context);
    case "maxRounds":
      return checkMaxRounds(condition, context);
    case "noImprovement":
      return checkNoImprovement(condition, context);
    case "maxCost":
      return checkMaxCost(condition, context);
    case "custom":
      return checkCustomCondition(condition, context);
    default: {
      const unknownCondition = condition;
      throw new EvalError(`Unknown condition type: ${JSON.stringify(unknownCondition)}`, {
        code: "UNKNOWN_ERROR" /* UNKNOWN_ERROR */,
        context: { condition: unknownCondition }
      });
    }
  }
}
3185
/**
 * Checks termination conditions one after another, in order, and returns the
 * first result that reports termination.
 *
 * @param {Array<object>} conditions - Conditions to evaluate.
 * @param {object} context - Cycle context forwarded to each check.
 * @returns {Promise<object>} The first terminating result, or a
 *   not-terminated result when none match or the list is empty.
 */
async function checkCycleTermination(conditions, context) {
  if (conditions.length === 0) {
    return { terminated: false, reason: "No termination conditions specified" };
  }
  for (let i = 0; i < conditions.length; i += 1) {
    const outcome = await checkCycleCondition(conditions[i], context);
    if (outcome.terminated) {
      return outcome;
    }
  }
  return { terminated: false, reason: "No termination conditions met" };
}
3203
+
3204
+ // src/improvement-cycle/runner.ts
3205
+ var import_core7 = require("@agtlantis/core");
3206
+
3207
+ // src/improvement-cycle/history.ts
3208
+ var import_node_crypto = __toESM(require("crypto"), 1);
3209
+ var import_node_fs7 = require("fs");
3210
+ var import_promises2 = require("fs/promises");
3211
+ var import_node_path4 = require("path");
3212
+ var import_core6 = require("@agtlantis/core");
3213
// Default storage backend for history files, backed by the Node.js fs APIs.
// Reads and writes use UTF-8; `exists` is the synchronous `existsSync`.
var defaultHistoryStorage = {
  readFile(filePath) {
    return (0, import_promises2.readFile)(filePath, "utf-8");
  },
  writeFile(filePath, content) {
    return (0, import_promises2.writeFile)(filePath, content, "utf-8");
  },
  exists: import_node_fs7.existsSync,
  mkdir(dirPath, options) {
    return (0, import_promises2.mkdir)(dirPath, options);
  }
};
3219
/**
 * Tells whether a prompt carries a string `userTemplate`.
 *
 * @param {object} prompt - Prompt-like object.
 * @returns {boolean} True when `prompt.userTemplate` is a string.
 */
function hasUserTemplate(prompt) {
  const { userTemplate } = prompt;
  return typeof userTemplate === "string";
}
3222
/**
 * Converts a prompt into a plain serializable form.
 *
 * `renderUserPrompt` is dropped; properties other than `id`, `version`,
 * `system` and `userTemplate` are collected under `customFields`, which is
 * omitted when there are none.
 *
 * @param {object} prompt - Prompt with a string `userTemplate`.
 * @returns {object} `{ id, version, system, userTemplate[, customFields] }`.
 * @throws {EvalError} PROMPT_INVALID_FORMAT when `userTemplate` is missing.
 */
function serializePrompt(prompt) {
  if (!hasUserTemplate(prompt)) {
    throw new EvalError("Cannot serialize prompt: userTemplate field is required", {
      code: "PROMPT_INVALID_FORMAT" /* PROMPT_INVALID_FORMAT */,
      context: { promptId: prompt.id }
    });
  }
  // `renderUserPrompt` is pulled out only so it stays out of `extras`.
  const { id, version, system, userTemplate, renderUserPrompt, ...extras } = prompt;
  const serialized = { id, version, system, userTemplate };
  if (Object.keys(extras).length > 0) {
    serialized.customFields = extras;
  }
  return serialized;
}
3240
/**
 * Verifies that a reconstructed prompt has the required shape.
 *
 * @param {object} obj - Candidate prompt.
 * @param {string} promptId - Id reported in error context.
 * @returns {void}
 * @throws {EvalError} PROMPT_INVALID_FORMAT when `id`, `version`, `system`
 *   or `userTemplate` is not a string (first offender is reported), or when
 *   `renderUserPrompt` is not a function.
 */
function validateDeserializedPrompt(obj, promptId) {
  const badField = ["id", "version", "system", "userTemplate"].find(
    (field) => typeof obj[field] !== "string"
  );
  if (badField !== void 0) {
    throw new EvalError(`Invalid deserialized prompt: ${badField} must be a string`, {
      code: "PROMPT_INVALID_FORMAT" /* PROMPT_INVALID_FORMAT */,
      context: { promptId, field: badField, actual: typeof obj[badField] }
    });
  }
  const renderType = typeof obj.renderUserPrompt;
  if (renderType !== "function") {
    throw new EvalError("Invalid deserialized prompt: renderUserPrompt must be a function", {
      code: "PROMPT_INVALID_FORMAT" /* PROMPT_INVALID_FORMAT */,
      context: { promptId, actual: renderType }
    });
  }
}
3257
/**
 * Rebuilds a usable prompt from its serialized form.
 *
 * `renderUserPrompt` is recreated by compiling `userTemplate`. Custom fields
 * are spread first so the core fields always take precedence.
 *
 * @param {object} serialized - `{ id, version, system, userTemplate[, customFields] }`.
 * @returns {object} Prompt including a compiled `renderUserPrompt`.
 * @throws {EvalError} TEMPLATE_COMPILE_ERROR when compilation fails, or
 *   whatever `validateDeserializedPrompt` throws for an invalid result.
 */
function deserializePrompt(serialized) {
  const { id, version, system, userTemplate, customFields } = serialized;

  const compile = () => {
    try {
      return (0, import_core6.compileTemplate)(userTemplate, id);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new EvalError(`Failed to compile userTemplate: ${message}`, {
        code: "TEMPLATE_COMPILE_ERROR" /* TEMPLATE_COMPILE_ERROR */,
        context: { promptId: id, userTemplate }
      });
    }
  };

  const renderUserPrompt = compile();
  const prompt = Object.assign({}, customFields, {
    id,
    version,
    system,
    userTemplate,
    renderUserPrompt
  });
  validateDeserializedPrompt(prompt, id);
  return prompt;
}
3280
/**
 * Flattens a round result into the compact record stored in history.
 *
 * Only the summary numbers of the report are kept, and `completedAt` is
 * converted to an ISO-8601 string.
 *
 * @param {object} result - Round result with `report.summary` and a Date
 *   `completedAt`.
 * @returns {object} Serializable round record.
 */
function serializeRoundResult(result) {
  const { avgScore, passed, failed, totalTests } = result.report.summary;
  const {
    round,
    completedAt,
    suggestionsGenerated,
    suggestionsApproved,
    promptSnapshot,
    promptVersionAfter,
    cost,
    scoreDelta
  } = result;
  return {
    round,
    completedAt: completedAt.toISOString(),
    avgScore,
    passed,
    failed,
    totalTests,
    suggestionsGenerated,
    suggestionsApproved,
    promptSnapshot,
    promptVersionAfter,
    cost,
    scoreDelta
  };
}
3297
/**
 * Validates the top-level shape of a parsed history file.
 *
 * Beyond checking that required fields are present, this verifies the two
 * fields the session code depends on structurally: `rounds` must be an array
 * (it is iterated and spread) and `totalCost` must be a finite number (it is
 * used in arithmetic). Without these checks a malformed file would pass
 * validation and fail later with an unrelated TypeError or a corrupted total.
 *
 * @param {unknown} data - Parsed JSON content.
 * @returns {void}
 * @throws {EvalError} SCHEMA_VALIDATION_ERROR when `data` is not an object,
 *   the schema version is not "1.1.0", a required field is missing, `rounds`
 *   is not an array, or `totalCost` is not a finite number.
 */
function validateHistorySchema(data) {
  if (typeof data !== "object" || data === null) {
    throw new EvalError("Invalid history: not an object", {
      code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */
    });
  }
  const h = data;
  if (h.schemaVersion !== "1.1.0") {
    throw new EvalError(`Unsupported schema version: ${String(h.schemaVersion)}`, {
      code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */,
      context: { schemaVersion: h.schemaVersion }
    });
  }
  const requiredFields = [
    "sessionId",
    "startedAt",
    "initialPrompt",
    "currentPrompt",
    "rounds",
    "totalCost"
  ];
  for (const field of requiredFields) {
    if (!(field in h)) {
      throw new EvalError(`Invalid history: missing field "${field}"`, {
        code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */,
        context: { missingField: field }
      });
    }
  }
  if (!Array.isArray(h.rounds)) {
    throw new EvalError('Invalid history: field "rounds" must be an array', {
      code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */,
      context: { field: "rounds", actual: h.rounds === null ? "null" : typeof h.rounds }
    });
  }
  if (typeof h.totalCost !== "number" || !Number.isFinite(h.totalCost)) {
    throw new EvalError('Invalid history: field "totalCost" must be a finite number', {
      code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */,
      context: { field: "totalCost", actual: typeof h.totalCost }
    });
  }
}
3327
/**
 * In-memory improvement session with optional persistence.
 *
 * Holds the history of an improvement cycle, appends rounds, marks the
 * session complete and, when a `path` is configured, writes the history
 * through `saveHistory`. Saves are serialized through an internal promise
 * chain so writes never overlap.
 */
var ImprovementSessionImpl = class {
  _history;
  _isUpdating = false;
  _savePromise = Promise.resolve();
  config;
  /**
   * @param {object} history - Initial history record.
   * @param {object} [config] - Optional `path`, `autoSave`, `storage`,
   *   `onAutoSaveError`.
   */
  constructor(history, config = {}) {
    this._history = history;
    // The default is applied after the spread so that an explicit
    // `autoSave: undefined` still normalizes to `false`.
    this.config = {
      ...config,
      autoSave: config.autoSave ?? false
    };
  }
  /** @returns {string} The session id stored in the history. */
  get sessionId() {
    return this._history.sessionId;
  }
  /** @returns {object} The current history record. */
  get history() {
    return this._history;
  }
  /** @returns {boolean} True when a save path is configured. */
  get canSave() {
    return this.config.path !== void 0;
  }
  /**
   * Appends a round, updates the current prompt and adds the round's cost to
   * the running total. Triggers a background save when auto-save is enabled.
   *
   * @param {object} roundResult - Completed round (see `serializeRoundResult`).
   * @param {object} updatedPrompt - Serialized prompt to record as current.
   * @throws {EvalError} CONCURRENT_MODIFICATION when re-entered during an
   *   update; INVALID_CONFIG when the session is already completed.
   */
  addRound(roundResult, updatedPrompt) {
    if (this._isUpdating) {
      throw new EvalError("Session is being updated", {
        code: "CONCURRENT_MODIFICATION" /* CONCURRENT_MODIFICATION */,
        context: { sessionId: this.sessionId }
      });
    }
    if (this._history.completedAt) {
      throw new EvalError("Cannot add round to completed session", {
        code: "INVALID_CONFIG" /* INVALID_CONFIG */,
        context: { sessionId: this.sessionId }
      });
    }
    this._isUpdating = true;
    try {
      const serializedRound = serializeRoundResult(roundResult);
      this._history = {
        ...this._history,
        currentPrompt: updatedPrompt,
        rounds: [...this._history.rounds, serializedRound],
        totalCost: this._history.totalCost + roundResult.cost.total
      };
      if (this.config.autoSave && this.canSave) {
        // Fire-and-forget: failures go to the auto-save error handler.
        this.save().catch((err) => this.handleAutoSaveError(err));
      }
    } finally {
      this._isUpdating = false;
    }
  }
  /**
   * Marks the session as completed with the given reason and timestamp.
   *
   * @param {string} terminationReason - Why the cycle ended.
   */
  complete(terminationReason) {
    this._history = {
      ...this._history,
      completedAt: (/* @__PURE__ */ new Date()).toISOString(),
      terminationReason
    };
    if (this.config.autoSave && this.canSave) {
      this.save().catch((err) => this.handleAutoSaveError(err));
    }
  }
  /**
   * Reports an auto-save failure through `config.onAutoSaveError`, or logs
   * it with `console.error` when no handler is configured.
   *
   * @param {unknown} error - The failure; non-Error values are wrapped.
   */
  handleAutoSaveError(error) {
    const err = error instanceof Error ? error : new Error(String(error));
    if (this.config.onAutoSaveError) {
      this.config.onAutoSaveError(err);
    } else {
      console.error("Auto-save failed:", err);
    }
  }
  /**
   * Persists the history, queued behind any save already in flight.
   *
   * A failure of an earlier save does not prevent this one from running:
   * that failure was already delivered to its own caller, so it is dropped
   * from the chain here. The history written is the one current when the
   * queued save actually starts.
   *
   * @returns {Promise<void>} Settles with the outcome of this save.
   * @throws {EvalError} INVALID_CONFIG when no path is configured.
   */
  async save() {
    if (!this.config.path) {
      throw new EvalError("Cannot save: no path configured", {
        code: "INVALID_CONFIG" /* INVALID_CONFIG */,
        context: { sessionId: this.sessionId }
      });
    }
    const queued = this._savePromise
      .catch(() => {
      })
      .then(() => saveHistory(this._history, this.config.path, this.config.storage));
    this._savePromise = queued;
    return queued;
  }
  /**
   * Waits for the most recently queued save.
   *
   * @returns {Promise<void>} Settles with the outcome of the latest save.
   */
  async flush() {
    return this._savePromise;
  }
};
3411
/**
 * Starts a new improvement session for a prompt.
 *
 * The serialized prompt is recorded as both the initial and the current
 * prompt; the session gets a fresh random UUID and start timestamp.
 *
 * @param {object} initialPrompt - Prompt accepted by `serializePrompt`.
 * @param {object} [config] - Session configuration.
 * @returns {ImprovementSessionImpl} The new session.
 */
function createSession(initialPrompt, config) {
  const snapshot = serializePrompt(initialPrompt);
  const sessionId = import_node_crypto.default.randomUUID();
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
  return new ImprovementSessionImpl(
    {
      schemaVersion: "1.1.0",
      sessionId,
      startedAt,
      initialPrompt: snapshot,
      currentPrompt: snapshot,
      rounds: [],
      totalCost: 0
    },
    config
  );
}
3424
/**
 * Reopens a saved session from disk.
 *
 * The completion markers are reset to `undefined` so new rounds can be
 * added, and the file path becomes the session's save path.
 *
 * @param {string} path3 - Location of the history file.
 * @param {object} [config] - Session configuration; `storage` is also used
 *   for loading.
 * @returns {Promise<ImprovementSessionImpl>} The resumed session.
 */
async function resumeSession(path3, config) {
  const loaded = await loadHistory(path3, config?.storage);
  const sessionConfig = { ...config, path: path3 };
  const reopened = Object.assign({}, loaded, {
    completedAt: void 0,
    terminationReason: void 0
  });
  return new ImprovementSessionImpl(reopened, sessionConfig);
}
3433
/**
 * Writes a history record as pretty-printed JSON.
 *
 * The parent directory is created (recursively) when it is a real directory
 * component, i.e. not empty, "." or "/", and does not exist yet.
 *
 * @param {object} history - History record to persist.
 * @param {string} path3 - Destination file path.
 * @param {object} [storage] - Storage backend; defaults to
 *   `defaultHistoryStorage`.
 * @returns {Promise<void>}
 * @throws {EvalError} Existing EvalErrors are rethrown; anything else is
 *   wrapped as FILE_WRITE_ERROR.
 */
async function saveHistory(history, path3, storage = defaultHistoryStorage) {
  try {
    const parentDir = (0, import_node_path4.dirname)(path3);
    const isRealDir = Boolean(parentDir) && parentDir !== "." && parentDir !== "/";
    if (isRealDir && !storage.exists(parentDir)) {
      await storage.mkdir(parentDir, { recursive: true });
    }
    const payload = JSON.stringify(history, null, 2);
    await storage.writeFile(path3, payload);
  } catch (error) {
    if (error instanceof EvalError) {
      throw error;
    }
    throw EvalError.from(error, "FILE_WRITE_ERROR" /* FILE_WRITE_ERROR */, { path: path3 });
  }
}
3445
/**
 * Reads, parses and validates a history file.
 *
 * @param {string} path3 - Location of the history file.
 * @param {object} [storage] - Storage backend; defaults to
 *   `defaultHistoryStorage`.
 * @returns {Promise<object>} The parsed history.
 * @throws {EvalError} FILE_READ_ERROR when the file is missing or reading or
 *   parsing fails; EvalErrors raised by validation are rethrown unchanged.
 */
async function loadHistory(path3, storage = defaultHistoryStorage) {
  try {
    const fileExists = storage.exists(path3);
    if (!fileExists) {
      throw new EvalError(`History file not found: ${path3}`, {
        code: "FILE_READ_ERROR" /* FILE_READ_ERROR */,
        context: { path: path3 }
      });
    }
    const raw = await storage.readFile(path3);
    const parsed = JSON.parse(raw);
    validateHistorySchema(parsed);
    return parsed;
  } catch (error) {
    if (!(error instanceof EvalError)) {
      throw EvalError.from(error, "FILE_READ_ERROR" /* FILE_READ_ERROR */, { path: path3 });
    }
    throw error;
  }
}
3462
+
3463
+ // src/improvement-cycle/runner.ts
3464
/**
 * Creates the mutable state for an improvement cycle.
 *
 * When resuming, the round counter, previous scores and total cost are
 * seeded from the existing session's history; `completedRounds` always
 * starts empty.
 *
 * @param {object} initialPrompt - Prompt the cycle starts with.
 * @param {object} [existingSession] - Session being resumed, if any.
 * @returns {object} Fresh cycle state.
 */
function initializeCycleState(initialPrompt, existingSession) {
  const state = {
    currentPrompt: initialPrompt,
    currentRound: 0,
    previousScores: [],
    totalCost: 0,
    completedRounds: []
  };
  if (existingSession) {
    state.currentRound = existingSession.history.rounds.length;
    state.previousScores = existingSession.history.rounds.map((r) => r.avgScore);
    state.totalCost = existingSession.history.totalCost;
  }
  return state;
}
3474
/**
 * Computes the change in score relative to the most recent previous round.
 *
 * @param {number} currentScore - Score of the round just evaluated.
 * @param {number[]} previousScores - Earlier scores, oldest first.
 * @returns {number|null} `currentScore` minus the last previous score, or
 *   `null` when there is no previous score.
 */
function calculateScoreDelta(currentScore, previousScores) {
  const count = previousScores.length;
  return count === 0 ? null : currentScore - previousScores[count - 1];
}
3481
/**
 * Builds the context object handed to termination-condition checks.
 *
 * `previousScores` is copied; `history` is the state's `completedRounds`
 * array itself (not a copy).
 *
 * @param {object} state - Current cycle state.
 * @param {number} currentScore - Score of the round just evaluated.
 * @returns {object} Cycle context.
 */
function buildCycleContext(state, currentScore) {
  const { currentRound, previousScores, totalCost, completedRounds } = state;
  return {
    currentRound,
    latestScore: currentScore,
    previousScores: previousScores.slice(),
    totalCost,
    history: completedRounds
  };
}
3490
/**
 * Assembles the result record for the round that has just been evaluated.
 *
 * `suggestionsApproved` starts empty; it is replaced once a decision about
 * the suggestions has been made.
 *
 * @param {object} state - Cycle state (`currentRound`, `currentPrompt`).
 * @param {object} report - Evaluation report for the round.
 * @param {object} improveResult - Improver output (`suggestions`).
 * @param {object} cost - Cost breakdown for the round.
 * @param {number|null} scoreDelta - Change versus the previous round.
 * @param {object} promptSnapshot - Serialized prompt used in the round.
 * @returns {object} Round result stamped with the current time.
 */
function createRoundResult(state, report, improveResult, cost, scoreDelta, promptSnapshot) {
  const completedAt = /* @__PURE__ */ new Date();
  const { suggestions } = improveResult;
  return {
    round: state.currentRound,
    report,
    completedAt,
    suggestionsGenerated: suggestions,
    suggestionsApproved: [],
    promptSnapshot,
    promptVersionAfter: state.currentPrompt.version,
    cost,
    scoreDelta
  };
}
3504
/**
 * Finalizes the cycle after a stop decision or a met termination condition.
 *
 * Records the last round in the session, completes the session, waits for
 * any pending save, and builds the final cycle result.
 *
 * @param {object} state - Cycle state.
 * @param {object} session - Improvement session.
 * @param {object} roundResult - Result of the final round.
 * @param {object} promptSnapshot - Serialized prompt recorded as current.
 * @param {boolean} terminatedByCondition - Whether a condition ended the cycle.
 * @param {string} conditionReason - Reason reported by that condition.
 * @returns {Promise<object>} `{ rounds, finalPrompt, terminationReason,
 *   totalCost, history }`.
 */
async function handleStopDecision(state, session, roundResult, promptSnapshot, terminatedByCondition, conditionReason) {
  let reason = "User requested stop";
  if (terminatedByCondition) {
    reason = conditionReason;
  }

  session.addRound(roundResult, promptSnapshot);
  session.complete(reason);
  await session.flush();
  state.completedRounds.push(roundResult);

  const rounds = state.completedRounds;
  const finalPrompt = deserializePrompt(session.history.currentPrompt);
  return {
    rounds,
    finalPrompt,
    terminationReason: reason,
    totalCost: state.totalCost,
    history: session.history
  };
}
3518
/**
 * Restores the cycle state to the prompt used in an earlier round.
 *
 * Round numbers are 1-based and index into `state.completedRounds`. The
 * prompt is rebuilt from that round's snapshot and `previousScores` is
 * truncated to the scores that precede the target round.
 *
 * @param {object} state - Cycle state; `currentPrompt` and `previousScores`
 *   are updated in place.
 * @param {number} rollbackToRound - 1-based number of the round to restore.
 * @returns {void}
 * @throws {Error} When `rollbackToRound` is not an integer or does not refer
 *   to a completed round.
 */
function handleRollbackDecision(state, rollbackToRound) {
  const targetRoundIndex = rollbackToRound - 1;
  // The integer check rejects values such as 1.5 that pass the bounds test
  // but would index `undefined` and fail with an opaque TypeError.
  const isValidIndex = Number.isInteger(targetRoundIndex) && targetRoundIndex >= 0 && targetRoundIndex < state.completedRounds.length;
  if (!isValidIndex) {
    throw new Error(`Cannot rollback to round ${rollbackToRound}: round not found`);
  }
  const targetRound = state.completedRounds[targetRoundIndex];
  state.currentPrompt = deserializePrompt(targetRound.promptSnapshot);
  state.previousScores = state.previousScores.slice(0, targetRoundIndex);
}
3527
/**
 * Applies approved suggestions and records the round before continuing.
 *
 * When at least one suggestion is approved, the current prompt is replaced
 * by the result of `applyPromptSuggestions` and the round records the new
 * prompt version. The round is then stored in the session together with a
 * snapshot of the (possibly updated) prompt.
 *
 * @param {object} state - Cycle state; `currentPrompt` and
 *   `completedRounds` are updated in place.
 * @param {object} session - Improvement session.
 * @param {object} roundResult - Result of the round just evaluated.
 * @param {Array<object>} approvedSuggestions - Suggestions to apply.
 * @param {*} versionBump - Forwarded as the `bumpVersion` option.
 * @returns {object} Copy of `roundResult` with approval details filled in.
 */
function handleContinueDecision(state, session, roundResult, approvedSuggestions, versionBump) {
  const recorded = Object.assign({}, roundResult, {
    suggestionsApproved: approvedSuggestions
  });

  const hasApprovals = approvedSuggestions.length > 0;
  if (hasApprovals) {
    const { prompt: nextPrompt } = applyPromptSuggestions(state.currentPrompt, approvedSuggestions, {
      bumpVersion: versionBump
    });
    state.currentPrompt = nextPrompt;
    recorded.promptVersionAfter = state.currentPrompt.version;
  }

  session.addRound(recorded, serializePrompt(state.currentPrompt));
  state.completedRounds.push(recorded);
  return recorded;
}
3544
/**
 * Run one evaluation round: build the agent for the current prompt, run the
 * eval suite over the test cases, ask the improver (if any) for suggestions,
 * and compute the round's cost.
 *
 * @param config - Cycle configuration; `createAgent`, `judge`, `improver`,
 *   `testCases` and `options` are read.
 * @param state - Cycle state; only `currentPrompt` is read.
 * @param pricingConfig - Forwarded to `calculateRoundCost`.
 * @returns `{ report, improveResult, cost }`. Without an improver,
 *   `improveResult` is `{ suggestions: [] }`.
 */
async function executeRound(config, state, pricingConfig) {
  const {
    createAgent,
    judge,
    improver,
    testCases: cases,
    options: roundOptions = {}
  } = config;

  const suite = createEvalSuite({
    agent: createAgent(state.currentPrompt),
    judge,
    agentDescription: roundOptions.agentDescription
  });
  const report = await suite.run(cases, roundOptions.runOptions);

  let improveResult = { suggestions: [] };
  if (improver) {
    improveResult = await improver.improve(state.currentPrompt, report.results);
  }

  return {
    report,
    improveResult,
    cost: calculateRoundCost(report, improveResult, pricingConfig)
  };
}
3557
/**
 * Guess the provider key for an improver model from the model name.
 *
 * Recognised prefixes:
 * - `claude-`                          -> "anthropic"
 * - `gpt-`, or `o` followed by a digit
 *   (for example `o1`, `o3`, `o4-mini`) -> "openai"
 * - `gemini-`                          -> "google"
 *
 * A missing/empty name or an unrecognised name falls back to "anthropic".
 *
 * @param model - Model name; may be undefined or empty.
 * @returns "anthropic", "openai" or "google".
 */
function detectProviderForImprover(model) {
  if (!model) return "anthropic";
  if (model.startsWith("claude-")) return "anthropic";
  // Match the whole o-series by shape instead of listing individual generations.
  if (model.startsWith("gpt-") || /^o\d/.test(model)) return "openai";
  if (model.startsWith("gemini-")) return "google";
  return "anthropic";
}
3564
/**
 * Copy the three token counters out of a usage record, dropping any other
 * properties it carries.
 *
 * @param usage - Object with `inputTokens`, `outputTokens` and `totalTokens`.
 * @returns A new object holding only those three properties.
 */
function toLanguageModelUsage2(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  return { inputTokens, outputTokens, totalTokens };
}
3571
/**
 * Compute what the improver call cost, based on the token usage reported in
 * its result metadata.
 *
 * @param improveResult - Improver output; `metadata.tokenUsage` and
 *   `metadata.model` are read when present.
 * @param pricingConfig - Optional pricing configuration; the entry of
 *   `providerPricing` for the detected provider is forwarded when available.
 * @returns The `total` reported by `calculateCostFromUsage`, or 0 when no
 *   token usage was reported.
 */
function calculateImproverCost(improveResult, pricingConfig) {
  const tokenUsage = improveResult.metadata?.tokenUsage;
  if (!tokenUsage) {
    return 0;
  }

  // With no model name, "unknown" is used and provider detection falls back to its default.
  const modelName = improveResult.metadata?.model ?? "unknown";
  const providerKey = detectProviderForImprover(modelName);
  const pricingForProvider = pricingConfig?.providerPricing?.[providerKey];

  const { total } = (0, import_core7.calculateCostFromUsage)(
    toLanguageModelUsage2(tokenUsage),
    modelName,
    providerKey,
    pricingForProvider
  );
  return total;
}
3585
/**
 * Combine the evaluation cost of a report with the improver cost into a
 * per-round cost breakdown.
 *
 * Report costs are computed only when a pricing configuration is supplied;
 * otherwise agent and judge costs count as 0. The improver cost is computed
 * in both cases.
 *
 * @param report - Evaluation report for the round.
 * @param improveResult - Improver output for the round.
 * @param pricingConfig - Optional pricing configuration.
 * @returns `{ agent, judge, improver, total }`.
 */
function calculateRoundCost(report, improveResult, pricingConfig) {
  let reportCosts = { total: 0, byComponent: { agent: 0, judge: 0 } };
  if (pricingConfig) {
    reportCosts = calculateReportCosts(report, pricingConfig);
  }
  const improver = calculateImproverCost(improveResult, pricingConfig);

  const { byComponent, total: reportTotal } = reportCosts;
  return {
    agent: byComponent.agent ?? 0,
    judge: byComponent.judge ?? 0,
    improver,
    total: reportTotal + improver
  };
}
3595
/**
 * Async generator that drives the improvement loop one round at a time.
 *
 * Each iteration evaluates the current prompt, then yields
 * `{ roundResult, pendingSuggestions, terminationCheck, context }` and waits
 * for a decision passed in through `next(decision)`:
 * - no decision, or `action === "stop"`: the round is recorded, the session is
 *   completed and the generator returns the final cycle result;
 * - `action === "rollback"` with `rollbackToRound` set: state is rolled back
 *   and the next round starts without recording the current one;
 * - anything else: treated as "continue" using `decision.approvedSuggestions`
 *   (defaulting to an empty list).
 *
 * If anything throws inside the loop, the session is completed with an
 * `Error: <message>` reason and the error is rethrown.
 *
 * @param config - Cycle configuration; `initialPrompt`, `terminateWhen` and
 *   `options` (`pricingConfig`, `versionBump`, `history`, `session`) are read
 *   here, and the whole object is forwarded to `executeRound`.
 */
async function* runImprovementCycle(config) {
  const { initialPrompt, terminateWhen = [], options = {} } = config;
  const {
    pricingConfig,
    versionBump = "patch",
    history: historyConfig,
    session: existingSession
  } = options;

  // Reuse the caller's session when one is given; otherwise start a fresh one.
  let session = existingSession;
  if (session === null || session === undefined) {
    const storageOptions = historyConfig
      ? { path: historyConfig.path, autoSave: historyConfig.autoSave }
      : undefined;
    session = createSession(initialPrompt, storageOptions);
  }
  const state = initializeCycleState(initialPrompt, existingSession);

  try {
    for (;;) {
      state.currentRound += 1;

      const { report, improveResult, cost } = await executeRound(config, state, pricingConfig);
      state.totalCost += cost.total;

      const currentScore = report.summary.avgScore;
      const scoreDelta = calculateScoreDelta(currentScore, state.previousScores);
      const promptSnapshot = serializePrompt(state.currentPrompt);
      const roundResult = createRoundResult(
        state,
        report,
        improveResult,
        cost,
        scoreDelta,
        promptSnapshot
      );

      // The context is built before this round's score joins the history.
      const context = buildCycleContext(state, currentScore);
      state.previousScores.push(currentScore);

      const terminationCheck = await checkCycleTermination(terminateWhen, context);
      const pendingSuggestions = improveResult.suggestions.map((suggestion) => {
        return { ...suggestion, approved: false };
      });

      const decision = yield { roundResult, pendingSuggestions, terminationCheck, context };

      if (!decision || decision.action === "stop") {
        // `return await` keeps a rejection from the stop handler inside this try block.
        return await handleStopDecision(
          state,
          session,
          roundResult,
          promptSnapshot,
          terminationCheck.terminated,
          terminationCheck.reason
        );
      } else if (decision.action === "rollback" && decision.rollbackToRound !== undefined) {
        handleRollbackDecision(state, decision.rollbackToRound);
      } else {
        handleContinueDecision(
          state,
          session,
          roundResult,
          decision.approvedSuggestions ?? [],
          versionBump
        );
      }
    }
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    session.complete(`Error: ${errorMessage}`);
    throw error;
  }
}
3654
/**
 * Run the improvement cycle without human input.
 *
 * After every round the cycle is stopped if its termination check reports
 * `terminated`; otherwise every pending suggestion is approved and the cycle
 * continues.
 *
 * @param config - Configuration forwarded unchanged to `runImprovementCycle`.
 * @returns The value the cycle generator returns when it finishes.
 */
async function runImprovementCycleAuto(config) {
  const cycle = runImprovementCycle(config);

  let step = await cycle.next();
  while (!step.done) {
    const { terminationCheck, pendingSuggestions } = step.value;
    const decision = terminationCheck.terminated
      ? { action: "stop" }
      : {
          action: "continue",
          approvedSuggestions: pendingSuggestions.map((suggestion) => ({
            ...suggestion,
            approved: true
          }))
        };
    step = await cycle.next(decision);
  }
  return step.value;
}
3673
+
3674
+ // src/core/test-case-collection.ts
3675
/**
 * Immutable, chainable wrapper around a list of test cases.
 *
 * The constructor copies the input into a frozen array, and every selection
 * method returns a new collection (or `this` for `all()`), so an existing
 * collection is never modified.
 */
var TestCaseCollection = class _TestCaseCollection {
  /** Frozen array holding the test cases of this collection. */
  cases;
  /**
   * @param cases - Iterable of test cases; copied, so later changes to the
   *   caller's array do not affect the collection.
   */
  constructor(cases) {
    this.cases = Object.freeze([...cases]);
  }
  // ============================================================================
  // Static Factories
  // ============================================================================
  /**
   * Create a collection from an array of test cases.
   */
  static from(cases) {
    return new _TestCaseCollection(cases);
  }
  /**
   * Create an empty collection.
   */
  static empty() {
    return new _TestCaseCollection([]);
  }
  // ============================================================================
  // Properties
  // ============================================================================
  /**
   * Number of test cases in the collection.
   */
  get length() {
    return this.cases.length;
  }
  /**
   * Whether the collection is empty.
   */
  get isEmpty() {
    return this.cases.length === 0;
  }
  // ============================================================================
  // Selection Methods (return new TestCaseCollection - chainable)
  // ============================================================================
  /**
   * Returns all test cases.
   * Returns `this` since the collection is immutable (frozen array).
   * Useful as explicit starting point in chains.
   */
  all() {
    return this;
  }
  /**
   * Returns the first N test cases (default: 1).
   * Useful for cost-controlled testing during development.
   */
  minimal(count = 1) {
    return this.first(count);
  }
  /**
   * Returns the first N test cases.
   * A count of zero or less yields an empty collection.
   */
  first(count) {
    if (count <= 0) {
      return _TestCaseCollection.empty();
    }
    return new _TestCaseCollection([...this.cases.slice(0, count)]);
  }
  /**
   * Returns the last N test cases (default: 1).
   * Preserves original order (earlier cases first).
   * A count of zero or less yields an empty collection.
   */
  last(count = 1) {
    if (count <= 0) {
      return _TestCaseCollection.empty();
    }
    const startIndex = Math.max(0, this.cases.length - count);
    return new _TestCaseCollection([...this.cases.slice(startIndex)]);
  }
  /**
   * Returns N random test cases.
   *
   * The indices of all cases are shuffled and the first N are taken, so a
   * case is never selected twice. Asking for more cases than exist returns
   * all of them in shuffled order.
   *
   * @param count - Number of cases to select
   * @param options - Optional seed for reproducibility
   *
   * @example
   * ```typescript
   * // Different each time
   * collection.random(5)
   *
   * // Same result with same seed
   * collection.random(5, { seed: 42 })
   * ```
   */
  random(count, options) {
    if (count <= 0 || this.cases.length === 0) {
      return _TestCaseCollection.empty();
    }
    const actualCount = Math.min(count, this.cases.length);
    const indices = [...Array(this.cases.length).keys()];
    const rng = options?.seed !== void 0 ? createSeededRng(options.seed) : Math.random;
    // Shuffle by walking from the end and swapping each slot with a random earlier-or-equal one.
    for (let i = indices.length - 1; i > 0; i--) {
      const j = Math.floor(rng() * (i + 1));
      [indices[i], indices[j]] = [indices[j], indices[i]];
    }
    const selected = indices.slice(0, actualCount).map((i) => this.cases[i]);
    return new _TestCaseCollection([...selected]);
  }
  /**
   * Filter test cases by predicate.
   */
  filter(predicate) {
    return new _TestCaseCollection([...this.cases.filter(predicate)]);
  }
  /**
   * Find test case by ID.
   * Returns collection with single case or empty collection.
   */
  byId(id) {
    const found = this.cases.find((tc) => tc.id === id);
    return found ? new _TestCaseCollection([found]) : _TestCaseCollection.empty();
  }
  /**
   * Find test cases by multiple IDs.
   * Preserves order of provided IDs (first occurrence).
   * Skips non-existent IDs. Duplicate IDs in input are deduplicated.
   * When several cases share an ID, the first one in the collection wins.
   * Cases without an ID (null/undefined) are never matched; any other ID
   * value, including an empty string or 0, is matched like in `byId`.
   *
   * @example
   * ```typescript
   * collection.byIds(['a', 'b', 'a']) // returns [case-a, case-b] (deduplicated)
   * collection.byIds(['b', 'a'])      // returns [case-b, case-a] (order preserved)
   * ```
   */
  byIds(ids) {
    // A Set iterates in insertion order, so it both deduplicates the
    // requested IDs and remembers the order they were first asked for.
    const requestedIds = new Set(ids);
    const idToCase = /* @__PURE__ */ new Map();
    for (const tc of this.cases) {
      if (tc.id != null && requestedIds.has(tc.id) && !idToCase.has(tc.id)) {
        idToCase.set(tc.id, tc);
      }
    }
    const result = [];
    for (const id of requestedIds) {
      if (idToCase.has(id)) {
        result.push(idToCase.get(id));
      }
    }
    return new _TestCaseCollection(result);
  }
  // ============================================================================
  // Access Methods
  // ============================================================================
  /**
   * Get test case by ID.
   * Returns undefined if not found.
   */
  get(id) {
    return this.cases.find((tc) => tc.id === id);
  }
  /**
   * Get test case by index.
   * Supports negative indices (e.g., -1 for last item).
   * Returns undefined if index is out of bounds.
   */
  at(index) {
    const normalizedIndex = index < 0 ? this.cases.length + index : index;
    if (normalizedIndex < 0 || normalizedIndex >= this.cases.length) {
      return void 0;
    }
    return this.cases[normalizedIndex];
  }
  // ============================================================================
  // Conversion Methods
  // ============================================================================
  /**
   * Convert to array.
   * Returns a mutable copy of the internal array.
   */
  toArray() {
    return [...this.cases];
  }
  // ============================================================================
  // Iterator Support
  // ============================================================================
  /**
   * Iterator support for for...of loops and spread operator.
   */
  [Symbol.iterator]() {
    return this.cases[Symbol.iterator]();
  }
};
3856
// Running counter behind the auto-generated `test-N` identifiers.
let autoIdCounter = 0;

/**
 * Wrap an input in a test-case object.
 *
 * @param input - Payload of the test case.
 * @param id - Optional identifier. When it is null or undefined, the next
 *   `test-N` identifier is generated; any other value is used as given.
 * @returns `{ id, input }`.
 */
function testCase(input, id) {
  if (id !== null && id !== undefined) {
    return { id, input };
  }
  autoIdCounter += 1;
  return { id: `test-${autoIdCounter}`, input };
}
3863
/**
 * Turn a list of inputs into test cases with positional identifiers.
 *
 * @param inputs - Array of inputs, one per test case.
 * @param prefix - Identifier prefix (default "case"); IDs are `<prefix>-<index>`
 *   with a zero-based index.
 * @returns A new array of `{ id, input }` objects in the same order.
 */
function testCases(inputs, prefix = "case") {
  const toIndexedCase = (input, index) => {
    const id = `${prefix}-${index}`;
    return { id, input };
  };
  return inputs.map(toIndexedCase);
}
3869
/**
 * Create a deterministic pseudo-random generator (mulberry32) from a seed.
 *
 * @param seed - Initial state; the same seed always yields the same sequence.
 * @returns A function that returns the next number in [0, 1) on every call.
 */
function createSeededRng(seed) {
  const STATE_INCREMENT = 0x6d2b79f5;
  const UINT32_RANGE = 0x100000000;
  let current = seed;

  return () => {
    // Advance the state, wrapping to a signed 32-bit integer.
    current = (current + STATE_INCREMENT) | 0;
    const mixed = Math.imul(current ^ (current >>> 15), current | 1);
    const scrambled = mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61));
    // Reinterpret as unsigned 32-bit and scale into [0, 1).
    const unsigned = (scrambled ^ (scrambled >>> 14)) >>> 0;
    return unsigned / UINT32_RANGE;
  };
}
3878
+ // Annotate the CommonJS export names for ESM import in node:
3879
+ 0 && (module.exports = {
3880
+ ANTHROPIC_PRICING,
3881
+ CompositeReporter,
3882
+ ConsoleReporter,
3883
+ DEFAULT_PRICING_CONFIG,
3884
+ EvalError,
3885
+ EvalErrorCode,
3886
+ GOOGLE_PRICING,
3887
+ JsonReporter,
3888
+ MarkdownReporter,
3889
+ MockProvider,
3890
+ OPENAI_PRICING,
3891
+ TestCaseCollection,
3892
+ accuracy,
3893
+ addCostsToResults,
3894
+ afterTurns,
3895
+ aggregateIterationResults,
3896
+ aiUser,
3897
+ and,
3898
+ applyPromptSuggestions,
3899
+ bumpVersion,
3900
+ calculateAvgPassRate,
3901
+ calculateAvgStdDev,
3902
+ calculateCostFromUsage,
3903
+ calculateIterationStats,
3904
+ calculateMultiTurnIterationStats,
3905
+ calculateReportCosts,
3906
+ calculateResultCost,
3907
+ checkCondition,
3908
+ checkCycleCondition,
3909
+ checkCycleTermination,
3910
+ checkTermination,
3911
+ compareReports,
3912
+ compileTemplate,
3913
+ consistency,
3914
+ createCompositeReporter,
3915
+ createConsoleReporter,
3916
+ createDefaultReporter,
3917
+ createEvalSuite,
3918
+ createFilePromptRepository,
3919
+ createImprover,
3920
+ createJsonReporter,
3921
+ createJudge,
3922
+ createMarkdownReporter,
3923
+ createMockAgent,
3924
+ createMockImprover,
3925
+ createMockJudge,
3926
+ createReportRunner,
3927
+ createSession,
3928
+ customCondition,
3929
+ cycleAnd,
3930
+ cycleNot,
3931
+ cycleOr,
3932
+ cycleToMarkdown,
3933
+ defaultHistoryStorage,
3934
+ defineConfig,
3935
+ deserializePrompt,
3936
+ discoverEvalFiles,
3937
+ executeMultiTurnTestCase,
3938
+ executeTestCase,
3939
+ fieldEquals,
3940
+ fieldIsSet,
3941
+ getFieldValue,
3942
+ getFileSourceDisplayInfo,
3943
+ getFileSourcesDisplayInfo,
3944
+ inferMediaType,
3945
+ isCustomCondition,
3946
+ isCustomCycleCondition,
3947
+ isCycleTerminated,
3948
+ isFieldSetCondition,
3949
+ isFieldValueCondition,
3950
+ isFileSource,
3951
+ isFileSourceBase64,
3952
+ isFileSourceData,
3953
+ isFileSourcePath,
3954
+ isFileSourceUrl,
3955
+ isIteratedResult,
3956
+ isMaxCostCondition,
3957
+ isMaxRoundsCondition,
3958
+ isMaxTurnsCondition,
3959
+ isMultiTurnResult,
3960
+ isMultiTurnTestCase,
3961
+ isNoImprovementCondition,
3962
+ isSingleTurnResult,
3963
+ isTargetScoreCondition,
3964
+ isTerminated,
3965
+ loadHistory,
3966
+ logCycle,
3967
+ maxCost,
3968
+ maxRounds,
3969
+ mock,
3970
+ naturalLanguage,
3971
+ noImprovement,
3972
+ not,
3973
+ or,
3974
+ relevance,
3975
+ reportToMarkdown,
3976
+ resolveFileSource,
3977
+ resolveFileSourcesInInput,
3978
+ resumeSession,
3979
+ runImprovementCycle,
3980
+ runImprovementCycleAuto,
3981
+ runWithConcurrency,
3982
+ saveCycleJson,
3983
+ saveCycleMarkdown,
3984
+ saveHistory,
3985
+ saveReportMarkdown,
3986
+ scanForFileSources,
3987
+ schema,
3988
+ selectRepresentativeResult,
3989
+ serializePrompt,
3990
+ suggestionDiff,
3991
+ suggestionPreview,
3992
+ suggestionSummary,
3993
+ targetScore,
3994
+ testCase,
3995
+ testCases,
3996
+ toEvalAgent
3997
+ });
3998
+ //# sourceMappingURL=index.cjs.map